Performance problem when loading a RandomForestClassifier

Problem description

I trained a RandomForestClassifier for hate speech detection following this GitHub repository: https://github.com/aman-saha/hate-speech-detection

I used the features from char_bigram_features.csv, word_bigram_features.csv and tfidf_features.csv, which got me to around 92% accuracy. Now I want to use the model to make predictions on unseen text data. To do that, I am saving and restoring the vocabularies of the CountVectorizers as well as the model itself.
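
For context, the saving step at training time looked roughly like the sketch below (cv_char, cv_word, cv_unigram and clf are placeholder names for the fitted CountVectorizers and the fitted RandomForestClassifier; the file names are the ones I load later):

# Training-time saving step (sketch; variable names are placeholders)
import pickle
import joblib

pickle.dump(cv_char.vocabulary_, open(r"./charbigrams_feature.pkl", "wb"))
pickle.dump(cv_word.vocabulary_, open(r"./wordbigrams_feature.pkl", "wb"))
pickle.dump(cv_unigram.vocabulary_, open(r"./tfidf_feature.pkl", "wb"))
joblib.dump(clf, "./random_forest.joblib")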

import re
import pickle

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

test = pd.DataFrame({'index': 0, 'text': ['I hate you'], 'class': None})
test['text'] = test['text'].str.lower()
test['text'] = [' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", line).split())
                for line in test['text']]

def get_char_bigrams(test):
    # Restore the character-bigram vocabulary saved at training time
    loaded_vector = CountVectorizer(vocabulary=pickle.load(open(r"./charbigrams_feature.pkl", "rb")),
                                    analyzer='char', stop_words='english',
                                    min_df=.002, max_df=.8, ngram_range=(2, 2))
    cv_char_mat = loaded_vector.transform(test['text'])
    bigrams = pd.DataFrame(cv_char_mat.todense(), index=test['index'],
                           columns=loaded_vector.get_feature_names())
    bigrams = bigrams.add_prefix('char_bigrams:')
    return bigrams

def get_word_bigrams(test):
    # Restore the word-bigram vocabulary saved at training time
    loaded_vector = CountVectorizer(vocabulary=pickle.load(open(r"./wordbigrams_feature.pkl", "rb")),
                                    stop_words='english', min_df=.002, max_df=.8,
                                    ngram_range=(2, 2))
    cv_mat = loaded_vector.transform(test['text'])
    bigrams = pd.DataFrame(cv_mat.todense(), index=test['index'],
                           columns=loaded_vector.get_feature_names())
    bigrams = bigrams.add_prefix('word_bigrams:')
    return bigrams

def get_tfidf_features(test):
    # Restore the unigram vocabulary saved at training time
    loaded_vector = CountVectorizer(vocabulary=pickle.load(open(r"./tfidf_feature.pkl", "rb")),
                                    stop_words='english', min_df=.002, max_df=.8,
                                    ngram_range=(1, 1))
    cv_mat = loaded_vector.transform(test['text'])
    # Fit a fresh TfidfTransformer on the prediction-time counts
    transformer = TfidfTransformer()
    transformed_weights = transformer.fit_transform(cv_mat)
    weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': loaded_vector.get_feature_names(), 'weight': weights})
    weights_df.sort_values(by='weight', ascending=False).head(80)  # result not used further
    transformed_weights.toarray()  # result not used further

    tf_idf = pd.DataFrame(transformed_weights.todense(), index=test['index'],
                          columns=loaded_vector.get_feature_names())
    tf_idf = tf_idf.add_prefix('tfidf:')

    return tf_idf

char_bigrams = get_char_bigrams(test)
word_bigrams = get_word_bigrams(test)
tfidf_sparse_matrix = get_tfidf_features(test)

df_list = [test, char_bigrams, word_bigrams, tfidf_sparse_matrix]
input = df_list[0]
for df in df_list[1:]:
    input = input.merge(df, on='index')
input.columns.values
input = input.iloc[:, 3:]  # keep only the feature columns, drop index/text/class
model = joblib.load("./random_forest.joblib")
print(model.predict(input))
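
A quick check of whether the model is genuinely confident in class 0, or whether the input simply has the wrong shape, is to print a few diagnostics right after the predict call (a sketch; n_features_in_ may not exist on older scikit-learn versions, hence the getattr):

# Sanity checks after predicting
print(input.shape)                              # prediction-time feature count
print(getattr(model, "n_features_in_", None))   # feature count the forest was trained on, if available
print(model.predict_proba(input))               # class probabilities instead of the hard label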

It always returns zero. I compared the training-time and prediction-time features and found that the new feature DataFrame is missing the columns ['char_bigrams:.\r\n', 'char_bigrams:s\r\n'], yet no error is raised. Could this be the cause of the poor performance?
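
For reference, the column comparison can be done roughly as sketched below; it assumes the training-time feature matrix is still available on disk (the file name train_features.csv is only a placeholder):

# Compare prediction-time columns against the training-time feature columns.
# "./train_features.csv" stands in for wherever the training feature matrix was saved.
train_columns = pd.read_csv("./train_features.csv", nrows=0).columns

missing = set(train_columns) - set(input.columns)  # seen in training, absent at prediction time
extra = set(input.columns) - set(train_columns)    # produced now, never seen in training
print(missing, extra)

# Re-align the prediction-time frame to the exact training column order, filling gaps with zeros.
aligned = input.reindex(columns=train_columns, fill_value=0)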

Thanks in advance.

Tags: python, scikit-learn, random-forest
