如下所示:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
# -*- coding:utf-8 -*-
"""Collect de-duplicated user ids from a tab-separated log of parenting sites.

Each input row is split on tabs.  The value at index 2 selects a site, and
that site's regular expression(s) pull a user id out of the value at index
3 or 4.  Every distinct id is written once, in first-seen order, one per
line.
"""
from datetime import datetime
import re

# Paths used when Main() is called without arguments.
DEFAULT_SOURCE_PATH = '/data/u_lx_data/fudan/muying/muying_11yue_all.txt'
DEFAULT_TARGET_PATH = '/data/u_lx_data/fudan/muying/python/uid_regular_get.txt'

_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

# Per-site extraction rules, keyed by the value found at index 2 of a row.
# Each entry is (message printed after handling a row of that site, rules);
# each rule is (field index to inspect, substring the field must contain or
# None for "no precondition", compiled pattern whose group 1 is the uid).
# The patterns keep their greedy ".*" prefix, so when a field contains
# several candidates the LAST one is the one captured.
_SITE_RULES = {
    # Babytree
    'babytree.com': ("宝宝树已完成", (
        (3, None, re.compile(r'.*NL=u%02(u\d+)', re.I)),
    )),
    # Youbaobao
    'youzibuy.com': ("柚宝宝已完成", (
        (4, "yunqi.youzibuy.com/tae_top_notify",
         re.compile(r'.*myuid=(\d+)', re.I)),
    )),
    # Mmbang
    'mmbang.com': ("妈妈帮已完成", (
        (3, None, re.compile(r'.*uid=(\d+)', re.I)),
    )),
    # Mama.cn: two independent endpoints; both are checked for every row.
    'mama.cn': ("妈妈网已完成", (
        (4, "mapi.mama.cn/feed/users/show",
         re.compile(r'.*friend_uid=(\d+)', re.I)),
        (4, "mamaquan/mmq_thread",
         re.compile(r'.*uid=(\d+)', re.I)),
    )),
    # ci123 (Yuer)
    'ci123.com': ("育儿网已完成", (
        (3, None, re.compile(r'.*ci123js=([a-zA-Z]+\d+)', re.I)),
    )),
}


def _extract_uids(fields, rules):
    """Yield the uid captured by each rule that applies to one split row."""
    for index, marker, pattern in rules:
        if index >= len(fields):
            # Row is too short to carry the field this rule needs.
            continue
        text = fields[index]
        if marker is not None and marker not in text:
            continue
        match = pattern.search(text)
        if match:
            yield match.group(1)


def Main(source_path=DEFAULT_SOURCE_PATH, target_path=DEFAULT_TARGET_PATH):
    """Write every distinct uid found in ``source_path`` to ``target_path``.

    Args:
        source_path: Tab-separated input file, read line by line.
        target_path: Output file; truncated, then filled with one uid per
            line in the order the ids are first encountered.

    Rows with fewer than three fields, rows of an unknown site, and rows
    missing the field a rule needs are skipped rather than raising
    ``IndexError``.  Progress messages and start/end timestamps are
    printed to stdout.
    """
    seen = set()  # uids already written, for de-duplication

    print("开始。。。。。")
    print(datetime.now().strftime(_TIME_FORMAT))

    with open(target_path, 'w') as f_write:
        with open(source_path, 'r') as f_source:
            for raw_line in f_source:
                # Strip only the line terminator: stripping all whitespace
                # would also remove leading/trailing tabs and shift the
                # column indices whenever the first or last field is empty.
                fields = raw_line.rstrip("\r\n").split("\t")
                if len(fields) < 3:
                    continue

                site = _SITE_RULES.get(fields[2])
                if site is None:
                    continue
                done_message, rules = site

                for uid in _extract_uids(fields, rules):
                    if uid not in seen:
                        seen.add(uid)
                        f_write.write(uid + "\n")

                # NOTE(review): the published listing lost its indentation;
                # this message is placed at the site-branch level, so it is
                # printed for every row of that site -- confirm intent.
                print(done_message)

    print("完成。。。。。")
    print(datetime.now().strftime(_TIME_FORMAT))


if __name__ == "__main__":
    Main()
以上这篇关于 Python 读写文件去重、正则表达式(re)以及 set 用法的详解,就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/IBoyMan/article/details/79401596