基于开源的微信个人号接口 Python 库 itchat,获取微信好友信息,并对好友的省份、性别和个性签名做数据分析。
效果:
直接上代码:新建三个空文本文件 stopwords.txt、newdit.txt、unionWords.txt,下载字体 simhei.ttf(或删除代码中指定字体的部分),即可直接运行。
完整代码(wxfriends.py)如下:
# wxfriends.py 2018-07-09
#
# Fetch the logged-in user's WeChat friends through itchat, save them to CSV,
# chart the sex/province distributions and build a word cloud from the
# friends' signatures.
import sys
from collections import Counter
from os import path

import itchat
import jieba
import jieba.posseg as pseg
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

# NOTE: the original `from scipy.misc import imread` was dropped on purpose:
# the function was removed in SciPy 1.2 and the image it loaded (timg.png)
# was never passed to WordCloud, so it only made the script crash.

# Let matplotlib render Chinese labels and the minus sign.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Translation table that maps every code point outside the Basic
# Multilingual Plane to U+FFFD (REPLACEMENT CHARACTER).
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

# Display labels for the numeric 'Sex' field; any other value falls back to
# _SEX_UNKNOWN.
_SEX_LABELS = {1: '男', 2: '女'}
_SEX_UNKNOWN = '雌雄同体'

# Part-of-speech flags kept by segmentWords.
_KEPT_FLAGS = ('n', 'eng')
# Tokens always dropped by segmentWords (presumably leftovers of HTML markup
# embedded in signatures -- TODO confirm).
_MARKUP_WORDS = ('span', 'class')


def getFriends():
    """Fetch the friend list from itchat.

    Returns:
        A list of dicts with the keys 'NickName', 'Sex', 'Province', 'City'
        and 'Signature', one dict per entry returned by itchat.
    """
    flists = []
    for friend in itchat.get_friends(update=True):
        flists.append({
            'NickName': friend['NickName'].translate(non_bmp_map),
            'Sex': _SEX_LABELS.get(friend['Sex'], _SEX_UNKNOWN),
            # An empty province is reported as "unknown".
            'Province': friend['Province'] or '未知',
            'City': friend['City'],
            'Signature': friend['Signature'],
        })
    return flists


def saveCSV(lists):
    """Build a DataFrame from the friend dicts and save it as wxfriends.csv.

    Writing the file is best effort: a failure is printed and the DataFrame
    is still returned.
    """
    df = pd.DataFrame(lists)
    try:
        df.to_csv("wxfriends.csv", index=True, encoding='gb18030')
    except (OSError, UnicodeError) as ret:
        print(ret)
    return df


def anysys(df):
    """Summarise the friend DataFrame.

    Returns:
        A tuple ``(df_sex, df_province, df_signature)``: the counts per sex,
        the counts of the 15 most frequent provinces, and the signature
        column as a one-column DataFrame.
    """
    df_sex = pd.DataFrame(df['Sex'].value_counts())
    df_province = pd.DataFrame(df['Province'].value_counts()[:15])
    df_signature = pd.DataFrame(df['Signature'])
    return df_sex, df_province, df_signature


def draw_chart(df_list, x_feature):
    """Draw a bar chart of a counts DataFrame and save it.

    Args:
        df_list: DataFrame whose index holds the bar labels and whose values
            hold the bar heights.
        x_feature: Legend label; also used as the output file name.
    """
    try:
        x = list(df_list.index)
        # Flatten the 2-D values array into a plain list of heights.
        y = [value for row in df_list.values for value in row]
        plt.bar(x, y, label=x_feature)
        plt.legend()
        plt.savefig(x_feature)
    except Exception as exc:
        # Best effort: a failed chart must not abort the rest of the run,
        # but the cause is reported instead of being swallowed.
        print("绘图失败", exc)
    finally:
        # Always release the figure so a failure cannot leak into the next
        # chart.
        plt.close()


def getSignList(signature):
    """Flatten the signature DataFrame into a list of strings.

    Characters outside the Basic Multilingual Plane are replaced with
    U+FFFD; non-string cells (e.g. NaN) are skipped.
    """
    return [
        text.translate(non_bmp_map)
        for row in signature.values
        for text in row
        if isinstance(text, str)
    ]


def _read_lines(filename):
    """Return the stripped, non-empty lines of a UTF-8 text file.

    A missing file is treated as empty. 'utf-8-sig' strips a leading BOM so
    the first entry of the file is not corrupted.
    """
    if not path.exists(filename):
        return []
    with open(filename, encoding='utf-8-sig') as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line]


def segmentWords(txtlist):
    """Segment texts with jieba and return the kept words.

    Optional resource files in the working directory:
        stopwords.txt  -- one stop word per line.
        newdit.txt     -- jieba user dictionary with custom words.
        unionWords.txt -- one merge rule per line, ``canonical*alias*...``;
                          every alias is replaced by the canonical word.

    Args:
        txtlist: Iterable of strings to segment.

    Returns:
        A list of words whose part-of-speech flag is 'n' or 'eng', excluding
        stop words and the tokens 'span' and 'class'.
    """
    stop_words = set(_read_lines('stopwords.txt'))
    if path.exists("newdit.txt"):
        jieba.load_userdict("newdit.txt")

    newslist = []
    for subject in txtlist:
        if not subject.strip():
            continue
        for word, flag in pseg.cut(subject):
            # All three conditions must hold. The original mixed `and`/`or`
            # without parentheses, which let 'eng' stop words through.
            if (flag in _KEPT_FLAGS
                    and word not in stop_words
                    and word not in _MARKUP_WORDS):
                newslist.append(word)

    # Merge the configured similar words, one rule (line) at a time.
    for line in _read_lines('unionWords.txt'):
        canonical, *aliases = (part.strip() for part in line.split("*"))
        alias_set = {alias for alias in aliases if alias}
        if alias_set:
            newslist = [
                canonical if word in alias_set else word
                for word in newslist
            ]
    return newslist


def countWords(newslist, top_n=100):
    """Print the most frequent words as ``word:count`` lines.

    Args:
        newslist: List of words.
        top_n: Maximum number of words to print; fewer are printed when the
            list has fewer distinct words.
    """
    for word, count in Counter(newslist).most_common(top_n):
        print("{}:{}".format(word, count))


def drawPlant(newslist):
    """Render a word cloud of the words, save it as wordcloud.jpg and show it."""
    content = ' '.join(newslist)
    if not content.strip():
        # WordCloud.generate raises ValueError when there is nothing to plot.
        print("没有可用于生成词云的词语")
        return
    wordcloud = WordCloud(
        font_path='simhei.ttf',
        background_color="white",
        width=1300,
        height=620,
        max_words=200,
    ).generate(content)
    # Display the generated image.
    plt.imshow(wordcloud)
    plt.axis("off")
    wordcloud.to_file('wordcloud.jpg')
    plt.show()


def main():
    """Log in to WeChat and run the whole analysis."""
    # Pass hotReload=True to avoid scanning the QR code again after the
    # first login.
    itchat.auto_login()
    flists = getFriends()
    fdf = saveCSV(flists)
    df_sex, df_province, df_signature = anysys(fdf)
    draw_chart(df_sex, "性别")
    draw_chart(df_province, "省份")
    wordList = segmentWords(getSignList(df_signature))
    countWords(wordList)
    drawPlant(wordList)


if __name__ == "__main__":
    main()
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/zenobia119/article/details/80990970