[python]LDA模型使用流程及代码

目录

数据预处理

去除停用词

构建LDA模型

可视化——pyLDAvis

 主题个数确认

困惑度计算

一致性得分


数据预处理

该步骤可自行处理,用excel也好,用python也罢,只要将待分析文本处理为csv或txt存储格式即可。注意:一条文本占一行

例如感想.txt:

我喜欢吃汉堡

小明喜欢吃螺蛳粉

螺蛳粉外卖好贵

以上句子来源于吃完一个汉堡还想再点碗螺蛳粉,但外卖好贵从而选择放弃的我

去除停用词

import re
import jieba as jb
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

# 对句子进行分词
def seg_sentence(sentence):
    sentence = re.sub(u'[0-9\.]+', u'', sentence)
    #jb.add_word('词汇')		# 这里是加入自定义的词来补充jieba词典
    sentence_seged = jb.cut(sentence.strip())
    stopwords = stopwordslist('自己搜来的停用词表.txt')  # 这里加载停用词的路径
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords and word.__len__()>1:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr


inputs = open('感想.txt', 'r', encoding='utf-8')

outputs = open('感想分词.txt', 'w',encoding='utf-8')
for line in inputs:
    line_seg = seg_sentence(line)  # 这里的返回值是字符串
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()

该步骤生成感想分词.txt:

我 喜欢 吃 汉堡

小明 喜欢 吃 螺蛳粉

螺蛳粉 外卖 好贵

句子 来源于 吃完 一个 汉堡  再点碗 螺蛳粉 外卖 好贵  选择 放弃 

构建LDA模型

假设主题个数设为4个(num_topics的参数)

import codecs
from gensim import corpora
from gensim.models import LdaModel
from gensim.corpora import Dictionary


train = []

fp = codecs.open('感想分词.txt','r',encoding='utf8')
for line in fp:
    if line != '':
        line = line.split()
        train.append([w for w in line])

dictionary = corpora.Dictionary(train)

corpus = [dictionary.doc2bow(text) for text in train]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, passes=100)
# num_topics:主题数目
# passes:训练伦次
# num_words:每个主题下输出的term的数目

for topic in lda.print_topics(num_words = 20):
    termNumber = topic[0]
    print(topic[0], ':', sep='')
    listOfTerms = topic[1].split('+')
    for term in listOfTerms:
        listItems = term.split('*')
        print('  ', listItems[1], '(', listItems[0], ')', sep='')

可视化——pyLDAvis

import pyLDAvis.gensim_models

'''插入之前的代码片段'''
import codecs
from gensim import corpora
from gensim.models import LdaModel
from gensim.corpora import Dictionary


train = []

fp = codecs.open('感想分词.txt','r',encoding='utf8')
for line in fp:
    if line != '':
        line = line.split()
        train.append([w for w in line])

dictionary = corpora.Dictionary(train)

corpus = [dictionary.doc2bow(text) for text in train]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, passes=100)
# num_topics:主题数目
# passes:训练伦次
# num_words:每个主题下输出的term的数目

for topic in lda.print_topics(num_words = 20):
    termNumber = topic[0]
    print(topic[0], ':', sep='')
    listOfTerms = topic[1].split('+')
    for term in listOfTerms:
        listItems = term.split('*')
        print('  ', listItems[1], '(', listItems[0], ')', sep='')
        
d=pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)

'''
lda: 计算好的话题模型

corpus: 文档词频矩阵

dictionary: 词语空间
'''

#pyLDAvis.show(d)		#展示在浏览器
# pyLDAvis.displace(d) #展示在notebook的output cell中
pyLDAvis.save_html(d, 'lda_pass4.html')

这样就会生成看起来很炫酷的图啦(只是示例):

 主题个数确认

计算不同参数下结果的 Perlexity(困惑度)和 Coherence score(一致性评分),选择困惑度最低且一致性评分最高的参数值作为最终参数设定。

困惑度计算

import gensim
from gensim import corpora, models
import matplotlib.pyplot as plt
import matplotlib
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
 
 
 
 # 准备数据
PATH = "感想分词.txt"  #已经进行了分词的文档(如何分词前面的文章有介绍)
 
 
file_object2=open(PATH,encoding = 'utf-8',errors = 'ignore').read().split('\n')  
data_set=[] #建立存储分词的列表
for i in range(len(file_object2)):
    result=[]
    seg_list = file_object2[i].split()  #读取没一行文本
    for w in seg_list :#读取每一行分词
        result.append(w)
    data_set.append(result)
print(data_set)  #输出所有分词列表
 
 
dictionary = corpora.Dictionary(data_set)  # 构建 document-term matrix
corpus = [dictionary.doc2bow(text) for text in data_set]
Lda = gensim.models.ldamodel.LdaModel  # 创建LDA对象
 
#计算困惑度
def perplexity(num_topics):
    ldamodel = Lda(corpus, num_topics=num_topics, id2word = dictionary, passes=50)  #passes为迭代次数,次数越多越精准
    print(ldamodel.print_topics(num_topics=num_topics, num_words=20))  #num_words为每个主题下的词语数量
    print(ldamodel.log_perplexity(corpus))
    return ldamodel.log_perplexity(corpus)
 
# 绘制困惑度折线图
x = range(1,20)  #主题范围数量
y = [perplexity(i) for i in x]
plt.plot(x, y)
plt.xlabel('主题数目')
plt.ylabel('困惑度大小')
plt.rcParams['font.sans-serif']=['SimHei']
matplotlib.rcParams['axes.unicode_minus']=False
plt.title('主题-困惑度变化情况')
plt.show()

 

一致性得分

import gensim
from gensim import corpora, models
import matplotlib.pyplot as plt
import matplotlib
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
 
 
 
 # 准备数据
PATH = "感想分词.txt"  #已经进行了分词的文档(如何分词前面的文章有介绍)
 
 
file_object2=open(PATH,encoding = 'utf-8',errors = 'ignore').read().split('\n')  
data_set=[] #建立存储分词的列表
for i in range(len(file_object2)):
    result=[]
    seg_list = file_object2[i].split()  #读取没一行文本
    for w in seg_list :#读取每一行分词
        result.append(w)
    data_set.append(result)
print(data_set)  #输出所有分词列表
 
 
dictionary = corpora.Dictionary(data_set)  # 构建 document-term matrix
corpus = [dictionary.doc2bow(text) for text in data_set]
Lda = gensim.models.ldamodel.LdaModel  # 创建LDA对象
 

def coherence(num_topics):
    ldamodel = Lda(corpus, num_topics=num_topics, id2word = dictionary, passes=50)  #passes为迭代次数,次数越多越精准
    coherence_model_lda = CoherenceModel(model=ldamodel, texts=data_set, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    return coherence_lda
 
# 绘制困惑度折线图
x = range(1,20)  #主题范围数量
y = [coherence(i) for i in x]
plt.plot(x,y)
plt.xlabel('主题数目')
plt.ylabel('coherence大小')
plt.rcParams['font.sans-serif']=['SimHei']
matplotlib.rcParams['axes.unicode_minus']=False
plt.title('主题-coherence变化情况')
plt.show()

 结语

整个流程就大致是这样啦!有问题欢迎一起交流!!

 

来源:alwaysluc

物联沃分享整理
物联沃-IOTWORD物联网 » [python]LDA模型使用流程及代码

发表评论