今天上课学了索引排序与搜索,课后用 Python 实现了一遍,觉得有点意思,就跟大家分享一波。
代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
"""Build a small inverted index over Sina News headlines and search it interactively."""
import re


def News_Spider():
    """Download the Sina News front page and return its headlines.

    Returns:
        dict: ``{'d1': headline, 'd2': headline, ...}`` numbered from 1 in the
        order the headlines appear in the page source.
    """
    # Imported lazily: only the download step needs this third-party package,
    # so the indexing and search functions stay importable without it.
    import requests

    url = 'https://news.sina.com.cn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get() is `params`, so a positional dict ends up in the query
    # string and the User-Agent header is never sent.
    # NOTE(review): verify=False disables TLS certificate verification; it is
    # kept from the original script, but confirm it is really needed.
    # The timeout keeps a stalled connection from hanging the program forever.
    response = requests.get(url, headers=headers, verify=False, timeout=10)
    response.encoding = 'utf-8'
    html = response.text

    # Capture the text of every link that opens in a new tab.
    anchor_texts = re.findall(r'target="_blank">(.*?)</a>', html)

    # Drop anchors that still contain markup, very short texts, and the
    # links that mention the client app.
    headlines = [
        text for text in anchor_texts
        if '<' not in text and len(text) > 6 and '客户端' not in text
    ]

    return {
        'd{}'.format(number): headline
        for number, headline in enumerate(headlines, start=1)
    }


def change_set(docu_set=None):
    """Return the set of distinct terms found in all headlines.

    Args:
        docu_set: Mapping of document id to headline. When omitted, the
            headlines are downloaded with ``News_Spider()``.

    Returns:
        set: Every whitespace-separated term that occurs in any headline.
    """
    if docu_set is None:
        docu_set = News_Spider()

    # NOTE: str.split() only splits on whitespace, so a headline that contains
    # no spaces is kept as one single term.
    terms = set()
    for headline in docu_set.values():
        terms.update(headline.split())
    return terms


def reverse_index(docu_set=None):
    """Build, print and return the inverted index of the headlines.

    Args:
        docu_set: Mapping of document id to headline. When omitted, the
            headlines are downloaded with ``News_Spider()``.

    Returns:
        dict: Maps each term to the list of document ids whose headline
        contains it; ids follow the iteration order of ``docu_set`` and each
        id appears at most once per term.
    """
    if docu_set is None:
        docu_set = News_Spider()

    invert_index = {}
    for doc_id, headline in docu_set.items():
        # dict.fromkeys() removes repeated terms within one headline while
        # keeping their order, so a document id is listed once per term.
        for term in dict.fromkeys(headline.split()):
            invert_index.setdefault(term, []).append(doc_id)

    print(invert_index)
    return invert_index


def search_news(keyword, docu_set, invert_index):
    """Return the headlines that have an indexed term containing ``keyword``.

    Args:
        keyword: Text to look for; it matches a term when it is a substring
            of that term (so an empty keyword matches every term).
        docu_set: Mapping of document id to headline.
        invert_index: Mapping of term to list of document ids.

    Returns:
        list: Matching headlines, one entry per document id, in the order the
        ids are first reached while walking ``invert_index``.
    """
    found = {}
    for term, doc_ids in invert_index.items():
        if keyword not in term:
            continue
        for doc_id in doc_ids:
            if doc_id in docu_set:
                # setdefault keeps the first hit, so a document reached
                # through several terms is reported only once.
                found.setdefault(doc_id, docu_set[doc_id])
    return list(found.values())


def Select(docu_set=None, invert_index=None):
    """Prompt for search terms until the user enters '不查了'.

    For each query every matching headline is printed once, followed by the
    closing notice. Results are computed per query, so earlier queries never
    leak into later output.

    Args:
        docu_set: Mapping of document id to headline. When omitted, the
            headlines are downloaded with ``News_Spider()``.
        invert_index: Mapping of term to list of document ids. When omitted,
            it is built from ``docu_set`` with ``reverse_index()``.
    """
    if docu_set is None:
        docu_set = News_Spider()
    if invert_index is None:
        # Build the index from the same snapshot that is searched.
        invert_index = reverse_index(docu_set)

    while True:
        Find = input('请输入查找内容:')
        if Find == '不查了':
            break
        for headline in search_news(Find, docu_set, invert_index):
            print(headline)
        # The original printed this notice whenever at least one indexed
        # term did not match the query; it now always closes the result list.
        print('很抱歉,没有找到更多内容!!')


def main_function():
    """Download the headlines once, index them and start the search prompt."""
    docu_set = News_Spider()
    invert_index = reverse_index(docu_set)
    Select(docu_set, invert_index)


if __name__ == '__main__':
    main_function()
运行结果如下图:
关于代码的解释,我写在注释中了。
到此,这篇关于 Python 实现简单的索引排序与搜索功能的文章就介绍到这了。更多相关 Python 实现索引排序和搜索的内容,请搜索服务器之家以前的文章或继续浏览下面的相关文章,希望大家以后多多支持服务器之家!
原文链接:https://blog.csdn.net/weixin_43408020/article/details/115661315