本文通过实例讲述了如何用Python实现大数据风格的操作系统日志分析功能。分享给大家供大家参考，具体如下：
一 代码
1、大文件切分
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
"""Split a large text file into numbered chunk files of a fixed line count."""
import itertools
import os
import os.path
import time


def FileSplit(sourceFile, targetFolder, linesPerFile=1000):
    """Split ``sourceFile`` into consecutive chunk files in ``targetFolder``.

    Chunks are named ``<stem><n>.txt`` where ``<stem>`` is the source file
    name without directory or extension and ``<n>`` counts up from 1. Every
    chunk holds ``linesPerFile`` lines except possibly the last one. Lines
    are copied verbatim, including their line terminators.

    Args:
        sourceFile: Path of the text file to split.
        targetFolder: Directory that receives the chunks; created if missing.
        linesPerFile: Maximum number of lines per chunk (default 1000).

    Returns:
        None. If ``sourceFile`` is not an existing file, a message is printed
        and nothing is written.

    Raises:
        ValueError: If ``linesPerFile`` is smaller than 1.
    """
    if not os.path.isfile(sourceFile):
        print(sourceFile, ' does not exist.')
        return
    if linesPerFile < 1:
        raise ValueError('linesPerFile must be at least 1')
    os.makedirs(targetFolder, exist_ok=True)

    # Use only the bare file name: joining a path that still carries its
    # directory (or is absolute) would place chunks outside targetFolder.
    stem = os.path.splitext(os.path.basename(sourceFile))[0]

    with open(sourceFile, 'r') as srcFile:
        for fileNum in itertools.count(1):
            # islice resumes where the previous chunk stopped and never
            # strips a line, so blank lines and newlines are preserved.
            chunk = list(itertools.islice(srcFile, linesPerFile))
            if not chunk:
                break
            desFile = os.path.join(targetFolder, stem + str(fileNum) + '.txt')
            # 'w' rather than 'a+': a re-run must replace old chunks instead
            # of appending the same lines a second time.
            with open(desFile, 'w') as f:
                f.writelines(chunk)


if __name__ == '__main__':
    sourceFile = 'test.txt'
    targetFolder = 'test'
    FileSplit(sourceFile, targetFolder)
2、Mapper代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
"""Mapper: count, for one chunk file, how many lines carry each date."""
import collections
import os
import re
import threading
import time


def Map(sourceFile):
    """Count dated lines in ``sourceFile`` and write the tally next to it.

    Each line is searched for a date of the form ``M/D/YYYY`` (one- or
    two-digit month and day, four-digit year). Only the first date on a line
    is counted. The tally is written to ``<sourceFile without extension>_map.txt``
    as ``date:count`` lines, in the order the dates were first seen.

    Args:
        sourceFile: Path of the text file to scan.

    Returns:
        None. If ``sourceFile`` is not an existing file, a message is printed
        and no output file is written.
    """
    if not os.path.isfile(sourceFile):
        print(sourceFile, ' does not exist.')
        return

    pattern = re.compile(r'[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}')
    result = collections.Counter()
    with open(sourceFile, 'r') as srcFile:
        for dataLine in srcFile:
            match = pattern.search(dataLine)
            if match:
                result[match.group()] += 1

    desFile = os.path.splitext(sourceFile)[0] + '_map.txt'
    # 'w' rather than 'a+': re-running the mapper must replace the previous
    # tally, otherwise the reducer would add the same counts twice.
    with open(desFile, 'w') as fp:
        fp.writelines(k + ':' + str(v) + '\n' for k, v in result.items())


if __name__ == '__main__':
    desFolder = 'test'
    # Skip '_map.txt' outputs left by an earlier run, and anything that is
    # not a regular file, so only chunk files are mapped.
    files = [
        os.path.join(desFolder, name)
        for name in sorted(os.listdir(desFolder))
        if not name.endswith('_map.txt')
        and os.path.isfile(os.path.join(desFolder, name))
    ]
    # Without threads this would simply be: for path in files: Map(path)
    # With threads: one worker per chunk file.
    threads = [threading.Thread(target=Map, args=(path,)) for path in files]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
3、Reducer代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
"""Reducer: merge the per-chunk ``*_map.txt`` tallies into one result file."""
import os


def Reduce(sourceFolder, targetFile):
    """Sum the ``key:count`` lines of every ``*_map.txt`` file in a folder.

    Args:
        sourceFolder: Directory holding the mapper outputs. Only files whose
            name ends with ``_map.txt`` are read.
        targetFile: Path of the file that receives the merged totals, one
            ``key:total`` line per key. It is overwritten if it exists.

    Returns:
        None. If ``sourceFolder`` is not a directory, a message is printed
        and nothing is written.

    Raises:
        ValueError: If a non-blank line has no ``:`` separator or its count
            is not an integer.
    """
    if not os.path.isdir(sourceFolder):
        print(sourceFolder, ' does not exist.')
        return

    # Deal only with the mapped files. The suffix must have no leading
    # space, otherwise no mapper output ever matches. Sorting keeps the
    # output order independent of the directory listing order.
    allFiles = [
        os.path.join(sourceFolder, name)
        for name in sorted(os.listdir(sourceFolder))
        if name.endswith('_map.txt')
    ]

    result = {}
    for path in allFiles:
        with open(path, 'r') as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                # The count is whatever follows the last colon.
                key, sep, count = line.rpartition(':')
                if not sep:
                    raise ValueError('Malformed line in ' + path + ': ' + line)
                result[key] = result.get(key, 0) + int(count)

    with open(targetFile, 'w') as fp:
        fp.writelines(k + ':' + str(v) + '\n' for k, v in result.items())


if __name__ == '__main__':
    Reduce('test', os.path.join('test', 'result.txt'))
二 运行结果
依次运行上面3个程序，得到最终结果（每行格式为“日期:对应的日志行数”）：
07/10/2013:4634
07/16/2013:51
08/15/2013:3958
07/11/2013:1
10/09/2013:733
12/11/2013:564
02/12/2014:4102
05/14/2014:737
希望本文所述对大家Python程序设计有所帮助。
原文链接:https://blog.csdn.net/chengqiuming/article/details/78601136