# 第15章 电商产品评论数据情感分析 (Chapter 15: Sentiment analysis of e-commerce product review data)
"""
Created on Thu Nov 9 15:12:30 2017
@author: lu
"""
import jieba
import pandas as pd
from gensim import corpora, models
"""
由于每个阶段的数据文件存在依赖关系,所以这里输出保存在了data/目录下
programmer_1-->提取数据
programmer_2-->数据去重
programmer_3-->利用正则去除一些数据
programmer_4-->使用jieba分词
programmer_5-->分词之后的语义分析,LDA模型分析正面负面情感
"""
def programmer_1():
    """Extract the review text of a single brand from the summary CSV.

    Reads ``data/huizong.csv``, keeps the rows whose ``品牌`` column equals
    ``美的`` and writes only their ``评论`` column to ``data/meidi_jd.txt``
    (no index, no header row).
    """
    source_path = "data/huizong.csv"
    target_path = "data/meidi_jd.txt"

    reviews = pd.read_csv(source_path, encoding="utf-8")
    # Row mask for the brand of interest; the column list keeps the result a
    # one-column DataFrame rather than a Series.
    is_meidi = reviews[u"品牌"] == u"美的"
    meidi_comments = reviews.loc[is_meidi, [u"评论"]]
    meidi_comments.to_csv(target_path, index=False, header=False, encoding="utf8")
def programmer_2():
    """Remove duplicate reviews and report how many were dropped.

    Reads ``data/meidi_jd.txt`` (no header row), keeps the unique values of
    its first column in first-seen order, writes them to
    ``data/meidi_jd_process_1.txt`` and prints the number of removed rows.
    """
    source_path = "data/meidi_jd.txt"
    target_path = "data/meidi_jd_process_1.txt"

    reviews = pd.read_csv(source_path, encoding="utf8", header=None)
    distinct = pd.DataFrame(reviews[0].unique())
    distinct.to_csv(target_path, index=False, header=False, encoding="utf8")

    removed = len(reviews) - len(distinct)
    print(u"删除了%s条评论" % removed)
def programmer_3():
    """Strip the leading prefix from the negative/positive result files.

    Each input file is read without a header row. In the first column, every
    match of the regular expression ``.*?\\d+?\\t `` (lazily anything, then a
    digit, a tab and a space) is removed, and the cleaned column is written
    to the matching output file without index or header.

    NOTE(review): the prefix is presumably a score emitted by the upstream
    sentiment tool -- confirm against the tool's output format.
    """
    # Raw string so ``\d`` and ``\t`` reach the regex engine unchanged instead
    # of relying on Python passing through an invalid ``\d`` escape.
    prefix_pattern = r".*?\d+?\t "

    sources = (
        u"data/meidi_jd_process_end_负面情感结果.txt",
        u"data/meidi_jd_process_end_正面情感结果.txt",
    )
    targets = ("data/meidi_jd_neg.txt", "data/meidi_jd_pos.txt")

    # Read and clean everything before writing so a failure on either input
    # leaves no partial output behind.
    frames = [pd.read_csv(path, encoding="utf8", header=None) for path in sources]
    cleaned = [
        # regex=True is required: since pandas 2.0 ``str.replace`` treats the
        # pattern as a literal string by default, which would strip nothing.
        pd.DataFrame(frame[0].str.replace(prefix_pattern, "", regex=True))
        for frame in frames
    ]
    for frame, path in zip(cleaned, targets):
        frame.to_csv(path, index=False, header=False, encoding="utf8")
def programmer_4():
    """Segment the negative and positive review files with jieba.

    Each input file is read without a header row; every value of its first
    column is passed through ``jieba.cut`` and the resulting tokens are joined
    with single spaces. The segmented text is written to the matching
    ``*_cut.txt`` file without index or header.
    """
    sources = ("data/meidi_jd_neg.txt", "data/meidi_jd_pos.txt")
    targets = ("data/meidi_jd_neg_cut.txt", "data/meidi_jd_pos_cut.txt")

    # Same phase order as before: read both, segment both, then write both.
    frames = [pd.read_csv(path, encoding="utf8", header=None) for path in sources]
    segmented = [
        frame[0].apply(lambda text: " ".join(jieba.cut(text))) for frame in frames
    ]
    for series, path in zip(segmented, targets):
        series.to_csv(path, index=False, header=False, encoding="utf8")
def programmer_5():
    """Fit and print 3-topic LDA models for the negative and positive reviews.

    Reads the space-separated, segmented review files and the stopword list,
    drops stopwords (plus ``" "`` and ``""``) from every review, then for the
    negative and the positive set in turn builds a gensim dictionary and
    bag-of-words corpus, trains an ``LdaModel`` and prints each topic.
    """
    negfile = "data/meidi_jd_neg_cut.txt"
    posfile = "data/meidi_jd_pos_cut.txt"
    stoplist = "data/stoplist.txt"
    num_topics = 3

    neg = pd.read_csv(negfile, encoding="utf8", header=None)
    pos = pd.read_csv(posfile, encoding="utf8", header=None)

    # The CSV default separator is the half-width comma, which is itself an
    # entry of the stopword list and would break parsing. Work around it by
    # using a separator that never occurs ("tipdm"); engine="python" is given
    # explicitly to avoid the parser-fallback warning.
    stop = pd.read_csv(
        stoplist, encoding="utf8", header=None, sep="tipdm", engine="python"
    )
    # A set gives O(1) membership tests; the former list made stopword
    # filtering scan the whole stoplist for every token.
    stopwords = {" ", ""} | set(stop[0])

    # Tokenise both sets before training so bad input fails before any output.
    neg_tokens = _remove_stopwords(neg[0], stopwords)
    pos_tokens = _remove_stopwords(pos[0], stopwords)

    _print_lda_topics(neg_tokens, num_topics)
    _print_lda_topics(pos_tokens, num_topics)


def _remove_stopwords(texts, stopwords):
    """Split each text on single spaces and drop tokens found in *stopwords*."""
    return texts.apply(
        lambda text: [tok for tok in text.split(" ") if tok not in stopwords]
    )


def _print_lda_topics(token_lists, num_topics):
    """Train an LDA model on *token_lists* and print each of its topics."""
    dictionary = corpora.Dictionary(token_lists)
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
    lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    for topic_id in range(num_topics):
        print(lda.print_topic(topic_id))
if __name__ == "__main__":
    # No pipeline stage runs automatically when the file is executed as a
    # script; the guard body is intentionally a no-op.
    pass