NLP Machine-Learning Text Classification: Source Code + Dataset
Resource description:
Build a text classification model based on the Logistic Regression algorithm. The complete pipeline covers data preprocessing, feature engineering, classifier construction, hyperparameter search, and model evaluation and persistence.
# encoding:utf-8
import pandas as pd
import random
import time
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import re
import jieba
## Data preprocessing: cleaning, tokenization, stop-word removal ##
def preprocess_text(content, words):
    print('Starting data preprocessing...')
    stopwords = pd.read_csv("C:/Users/xxx/Documents/NLP/data/Stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf8')
    stopwords = stopwords['stopword'].values
    for line in range(len(content)):
        try:
            content[line] = content[line].lower()
            # Remove mentions (e.g. @zhangsan) and e-mail-like tokens; remove URLs;
            # remove hashtags (e.g. #Amazing); remove an apostrophe plus the
            # following characters (e.g. he's); remove digits
            content[line] = re.sub(r"\S*@\S+|https*\S+|#\S+|'\w+|\d+", " ", content[line])
            content[line] = re.sub(r'[’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’!\[\\\]^_`{|}~]+', ' ', content[line])  # remove punctuation and special characters
            content[line] = re.sub(r'\s{2,}', " ", content[line])  # collapse runs of 2+ spaces
            content[line] = content[line].strip()  # strip leading/trailing spaces
            segs = jieba.lcut(content[line])
            segs = filter(lambda x: x != " ", segs)
            segs = filter(lambda x: x not in stopwords, segs)
            segs = list(segs)
            Allsegs = " ".join(segs)
            words.append(Allsegs)
        except Exception:
            print("Something went wrong with: " + content[line])
            continue
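# Illustrative run of the cleaning pass above (hypothetical input, assuming the
# stop-word file exists at the path given):
#   demo = ["He's reading https://example.com #Amazing news with @zhangsan, 2023!!"]
#   demo_words = []
#   preprocess_text(demo, demo_words)
#   # Lower-casing, regex cleaning, jieba tokenization and stop-word filtering
#   # leave demo_words holding something like ['reading news'].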
## Text vectorization ##
def Vectorize(sentences):
    ## Shuffle the data set to get a more reliable train/test distribution ##
    random.shuffle(sentences)
    ## Split into training and test sets with scikit-learn ##
    x, y = zip(*sentences)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)
    ## TF-IDF vectorization; the fitted vectorizer doubles as the saved feature set ##
    vec = TfidfVectorizer(analyzer='word', max_df=0.6, ngram_range=(1, 3), max_features=30000)
    vec.fit(x_train)
    print('(training samples, features) =', vec.transform(x_train).shape)
    print('test samples:', len(x_test))
    return vec, x_train, y_train, x_test, y_test
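# For reference: with analyzer='word' and ngram_range=(1, 3), the vocabulary holds
# unigrams, bigrams and trigrams of the space-joined tokens, capped at the 30,000
# most frequent terms (max_features); terms appearing in over 60% of documents
# (max_df=0.6) are discarded. E.g. the cut text "天气 很 好" can contribute
# features such as '天气', '天气 很' and '天气 很 好'.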
## Randomized grid search for the best hyperparameters ##
def ChoosePara(vec, x_train, y_train):
    pipe_rf = LogisticRegression()
    param_grid = {'C': [1, 10, 100, 1e3],
                  'solver': ['newton-cg', 'lbfgs', 'sag'],
                  'multi_class': ['ovr', 'multinomial'],
                  'max_iter': [100, 600, 1000]}
    # estimator: the learning algorithm; n_jobs: parallelism; cv: number of CV folds; scoring: model selection metric
    gs = RandomizedSearchCV(estimator=pipe_rf, param_distributions=param_grid, scoring='accuracy', cv=10, n_jobs=1)
    gs = gs.fit(vec.transform(x_train), y_train)
    print('Best score:', gs.best_score_)
    print('Best param:', gs.best_params_)
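# Note: RandomizedSearchCV samples n_iter candidates (10 by default) from
# param_grid instead of exhaustively trying all 4*3*2*3 = 72 combinations;
# each sampled candidate is scored by 10-fold cross-validation (cv=10).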
# Build the logistic-regression classifier
def BuildModel(vec, x_train, y_train, x_test, y_test, Content_Words):
    print('Training the classifier...')
    begin = time.perf_counter()
    # C: inverse of regularization strength; penalty: regularization type; solver: optimization algorithm; multi_class: one-vs-rest
    Classifier = LogisticRegression(C=100, penalty='l2', solver='newton-cg', multi_class='ovr', max_iter=100).fit(vec.transform(x_train), y_train)
    y_pred = Classifier.predict(vec.transform(x_test))
    print("\nLogistic regression evaluation metrics:\n", classification_report(y_test, y_pred))
    acc = Classifier.score(vec.transform(x_test), y_test)
    end_time = time.perf_counter()
    run_time = end_time - begin
    print('Accuracy:', acc, '| training/evaluation time:', run_time, 's')
    # Highest predicted class probability for each test document
    y_pred_proba = Classifier.predict_proba(vec.transform(x_test)).max(axis=1)
    # Confusion matrix: rows are true labels, columns are predicted labels
    TestResult = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=Classifier.classes_), columns=Classifier.classes_, index=Classifier.classes_)
    print('\n', TestResult)
    ## Export the test-set predictions
    # Store the Content_Words tuples in a dict keyed on the cut words (third element)
    dict_a = {tup[2]: tup for tup in Content_Words}
    # Align the first element (ImgName) of Content_Words with the order of x_test
    TestID = [dict_a[b][0] for b in x_test if b in dict_a]
    # Align the second element (raw content) of Content_Words with the order of x_test
    TestContent = [dict_a[b][1] for b in x_test if b in dict_a]
    TestDF = pd.DataFrame({'ImgName': TestID})
    TestDF['Content'] = TestContent
    TestDF['CutWords'] = x_test
    TestDF['label'] = y_test
    TestDF['Test_pred'] = y_pred
    TestDF['Probability'] = y_pred_proba
    TestDF.to_excel('C:/Users/xxx/Desktop/TestSetResults.xlsx', index=False)
    print('Test-set results exported successfully')
    return Classifier
if __name__ == '__main__':
    ## Load the data ##
    df = pd.read_excel("C:/Users/xxx/Desktop/Data.xlsx")
    df = df.dropna()  # drop rows with missing values
    ## Convert the columns to lists ##
    content = df.content.values.tolist()
    label = df.label.values.tolist()
    words = []
    ## Run the preprocessing step and save the tokenized corpus ##
    preprocess_text(content, words)
    df['CutWords'] = words
    df.to_excel("C:/Users/xxx/Desktop/Data-clear.xlsx", index=False)
    ## Reload the saved cut words so the corpus need not be re-tokenized on every run ##
    Cut_df = pd.read_excel("C:/Users/xxx/Desktop/Data-clear.xlsx")
    Cut_df = Cut_df.dropna()
    ImgName = Cut_df.ImgName.values.tolist()
    cut_content = Cut_df.content.values.tolist()
    cut_words = Cut_df.CutWords.values.tolist()
    label = Cut_df.label.values.tolist()
    print(pd.Series(label).value_counts())
    sentences = list(zip(cut_words, label))
    Content_Words = list(zip(ImgName, cut_content, cut_words))
    ## Vectorize the corpus ##
    vec, x_train, y_train, x_test, y_test = Vectorize(sentences)
    ## Search for the best hyperparameters (uncomment to run) ##
    # ChoosePara(vec, x_train, y_train)
    ## Train and evaluate the model ##
    Classifier = BuildModel(vec, x_train, y_train, x_test, y_test, Content_Words)
    ## Save the fitted vectorizer and the model ##
    joblib.dump(vec, 'C:/Users/xxx/desktop/Features.m')
    print('Feature set saved successfully')
    joblib.dump(Classifier, 'C:/Users/xxx/desktop/Model.m')
    print('Model saved successfully!')
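To reuse the saved artifacts without retraining, both files can be reloaded with joblib. A minimal inference sketch, assuming the artifact paths used above and an input string that has already been preprocessed the same way as the training data (cleaned, cut with jieba, space-joined):

# Minimal inference sketch (paths assumed from the script above)
import joblib

vec = joblib.load('C:/Users/xxx/desktop/Features.m')
Classifier = joblib.load('C:/Users/xxx/desktop/Model.m')

new_text = "天气 很 好"  # hypothetical, already-tokenized input
X = vec.transform([new_text])
print('label:', Classifier.predict(X)[0])
print('confidence:', Classifier.predict_proba(X).max(axis=1)[0])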