使用Python进行红楼梦文本分析的详细指南

1.  获取小说文本  读取文件

# 获取小说文本
# 读取文件
fn = open("prepare\\红楼梦_曹雪芹.txt", encoding="utf-8")
string_data = fn.read()  # 读出整个文件
fn.close()  # 关闭文件

2.对文本进行处理 

# 文本预处理
pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"')  # 定义正则表达式匹配模式
txt = re.sub(pattern, '', string_data)  # 将符合模式的字符去除
print('预处理完毕')


# 停词文档
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
stopwords = stopwordslist('D:\\Python studybag\\prepare\\tingyong.txt')
excludes = {'之','其','方','即','因','仍','故','尚','乃','呀','吗','咧','罢','咧','啊','罢','了','么',
            '或', ' 亦', '于', ' 皆', '的', '着', '一' , '不', '把', '让','向', '往', '是' , '在', '别',
            '好', '可', '便', '就',' 但','越','再','更', '比','很','偏',
            '那里','如今','一个','我们','你们','起来','姑娘','这里','二人','说道',
            '知道','如何','今日','什么','于是','还有','出来','他们','众人','奶奶',
            '自己','一面','太太','只见','怎么','两个','没有','不是','不知','这个',
            '听见','这样','进来','告诉','东西','咱们','就是','如此','回来','大家',
            '只是','老爷','只得','丫头','这些','不敢','出去','所以','不过','姐姐',
            '的话','不好','鸳鸯','一时','过来','不能','心里','银子','答应','几个'} # 排除的词汇

3. 词频

# 通过键值对的形式存储词语及其出现的次数
counts1 = {}  # 存放词性词频
counts2 = {}  # 存放人物词频
# # 生成词频词性文件
def getWordTimes1():
    cutFinal = pseg.cut(txt)

    for w in cutFinal:
        if w.word in stopwords or w.word == None:
            continue
        else:
            real_word = w.word + '_' + w.flag
        counts1[real_word] = counts1.get(real_word, 0) + 1


getWordTimes1()

items1 = list(counts1.items())
# 进行降序排列 根据词语出现的次数进行从大到小排序
items1.sort(key=lambda x: x[1], reverse=True)


# 导出数据
# 分词生成人物词频(写入文档)
def wordFreq1(filepath, topn1):
    with codecs.open(filepath, "w", "utf-8") as f:
        for i in range(topn1):
            word, count = items1[i]
            f.write("{}:{}\n".format(word, count))


# 生成词频文件
wordFreq1("output\\红楼梦词频词性.txt", 300)

# 将txt文本里的数据转换为字典形式
fr1 = open('output\\红楼梦词频词性.txt', 'r', encoding='utf-8')
dic1 = {}
keys1 = []  # 用来存储读取的顺序
for line in fr1:
    # 去空白,并用split()方法返回列表
    v1 = line.strip().split(':')
    dic1[v1[0]] = v1[1]
    keys1.append(v1[0])
fr1.close()

list_name1 = list(dic1.keys())  # 人名
list_name_times1 = list(dic1.values())  # 提取字典里的数据作为绘图数据
def create_wordproperties():
    bar1 = Bar()
    bar1.add_xaxis(list_name1[0:keshihuaTop])
    bar1.add_yaxis("词语出现次数", list_name_times1)
    bar1.set_global_opts(title_opts=opts.TitleOpts(title="词频词性可视化图", subtitle="词频词性top10"),
                         xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 45}))
    bar1.set_series_opts(label_opts=opts.LabelOpts(position="top"))

    # 生成 html 文件
    bar1.render("\\output\\红楼梦词频词性可视化图.html")

 

4. 对人名进行分析

# 得到 分词和出现次数
def getWordTimes2():
    # 分词,返回词性
    poss = pseg.cut(txt)
    for w in poss:
        if w.flag != 'nr' or len(w.word) < 2 or w.word in excludes:
            continue  # 当分词长度小于2或该词词性不为nr(人名)时认为该词不为人名
        elif w.word == '宝哥哥' or w.word == '宝玉曰' or w.word == '宝二爷' or w.word == '绛洞花主' \
                or w.word == '怡红公子' or w.word == '宝兄弟' or w.word == '混世魔王' or w.word == '宝玉':
            real_word = '贾宝玉'
        elif w.word == '黛玉' or w.word == '颦儿' or w.word == '潇湘妃子' or w.word == '林姑娘' or \
                w.word == '林妹妹' or w.word == '黛玉曰' or w.word == '颦颦':
            real_word = '林黛玉'
        elif w.word == '宝钗' or w.word == '宝钗曰' or w.word == '宝丫头' or w.word == '宝姐姐' or w.word == '薛大姑娘':
            real_word = '林宝钗'
        elif w.word == '熙凤' or w.word == '熙凤曰' or w.word == '琏二奶奶' or w.word == '凤辣子' or w.word == '凤哥儿' \
                or w.word == '凤丫头' or w.word == '凤姐' or w.word == '凤姐儿' or w.word == '琏二嫂子':
            real_word = '王熙凤'
        elif w.word == '贾母' or w.word == '贾母曰' or w.word == '史太君' or w.word == '老祖宗' or w.word == '老太太' \
                or w.word == '老神仙':
            real_word = '贾母'
        elif w.word == '湘云' or w.word == '湘云曰' or w.word == '枕霞旧友' or w.word == '史大姑娘' or w.word == '云妹妹':
            real_word = '史湘云'
        elif w.word == '姨妈' or w.word == '姨妈曰' or w.word == '薛夫人' or w.word == '薛王氏' or w.word == '姨太太':
            real_word = '贾迎春'
        elif w.word == '探春' or w.word == '探春曰' or w.word == '玫瑰花' or w.word == '蕉下客':
            real_word = '贾探春'
        elif w.word == '贾珍' or w.word == '贾珍曰' or w.word == '珍老爷' or w.word == '大爷' or w.word == '大哥哥' :
            real_word = '贾珍'
        elif w.word == '贾琏' or w.word == '贾琏曰' or w.word == '琏二爷' or w.word == '二爷':
            real_word = '贾琏'
        elif w.word == '袭人' or w.word == '袭人曰' or w.word == '蕊珠' or w.word == '花珍珠':
            real_word = '袭人'
        elif w.word == '平儿' or w.word == '平儿曰' or w.word == '小平' or w.word == '平姑娘' or w.word == '平姐姐':
            real_word = '平儿'  # 把相同意思的名字归为一个人

        else:
            real_word = w.word
        counts2[real_word] = counts2.get(real_word, 0) + 1


getWordTimes2()
items2 = list(counts2.items())
# 进行降序排列 根据词语出现的次数进行从大到小排序
items2.sort(key=lambda x: x[1], reverse=True)


# 导出数据
# 分词生成人物词频(写入文档)
def wordFreq2(filepath, topn):
    with codecs.open(filepath, "w", "utf-8") as f:
        for i in range(topn):
            word, count = items2[i]
            f.write("{}:{}\n".format(word, count))


# 生成词频文件
wordFreq2("D:\\Python studybag\\output\\红楼梦词频_人名.txt", 300)

# 将txt文本里的数据转换为字典形式
fr = open('D:\\Python studybag\\output\\红楼梦词频_人名.txt', 'r', encoding='utf-8')
dic = {}
keys = []  # 用来存储读取的顺序
for line in fr:
    # 去空白,并用split()方法返回列表
    v = line.strip().split(':')
    dic[v[0]] = v[1]
    keys.append(v[0])
fr.close()
# 输出前几个的键值对
print("人物出现次数TOP", mainTop)
print(list(dic.items())[:mainTop])

#  绘图
# 人名列表 (用于人物关系图,pyecharts人物出场次数图)
list_name = list(dic.keys())  # 人名
list_name_times = list(dic.values())  # 提取字典里的数据作为绘图数据


# 可视化人物出场次数
def creat_people_view():
    bar = Bar()
    bar.add_xaxis(list_name[0:keshihuaTop])
    bar.add_yaxis("人物出场次数", list_name_times)
    bar.set_global_opts(title_opts=opts.TitleOpts(title="人物出场次数可视化图", subtitle="红楼梦TOP10"),
                        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 45}))
    bar.set_series_opts(label_opts=opts.LabelOpts(position="top"))
    # bar.render_notebook()  # 在 notebook 中展示
    # make_snapshot(snapshot, bar.render(), "bar.png")
    # 生成 html 文件
    bar.render("D:\\Python studybag\\output\\红楼梦人物出场次数可视化图.html")

 5.词云

 

# 使用pyecharts 的方法生成词云
def creat_wordcloud_pyecharts():
    wordsAndTimes = list(dic.items())
    (
        WordCloud()
        .add(series_name="人物次数", data_pair=wordsAndTimes,
             word_size_range=[20, 100], textstyle_opts=opts.TextStyleOpts(font_family="cursive"), )
        .set_global_opts(title_opts=opts.TitleOpts(title="红楼梦词云"))
        .render("D:\\Python studybag\\output\\红楼梦词云_人名.html")
    )


# 颜色生成
colorNum = len(list_name[0:peopleTop])


# print('颜色数',colorNum)
def randomcolor():
    colorArr = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']
    color = ""
    for i in range(6):
        color += colorArr[random.randint(0, 14)]
    return "#" + color


def color_list():
    colorList = []
    for i in range(colorNum):
        colorList.append(randomcolor())
    return colorList

 

物联沃分享整理
物联沃-IOTWORD物联网 » 使用Python进行红楼梦文本分析的详细指南

发表评论