A beginner-level crawler: it scrapes only book titles, descriptions, and download links, and stores them in a database.
Database utility class: DBUtil.py
import pymysql

class DBUtils(object):

    def connDB(self):  # connect to the database
        conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                               passwd='b6f3g2', db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):  # update or insert
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):  # delete; not used in this demo
        sta = 0
        for eachID in IDs.split(' '):
            sta += cur.execute("delete from students where Id=%d" % (int(eachID)))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):  # query
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):  # close the connection and release resources
        cur.close()
        conn.close()

if __name__ == '__main__':
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
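One caveat with DBUtils as written: exeUpdate expects a fully formatted SQL string, so a quote character inside scraped text can break the statement or inject SQL. Below is a minimal sketch of a parameterized alternative, assuming the same connection setup; exe_update_safe is a hypothetical helper, not part of the repo.

from DBUtil import DBUtils

# Hypothetical helper: same flow as exeUpdate, but values are bound via
# pymysql's %s placeholders so the driver handles escaping.
def exe_update_safe(conn, cur, sql, args):
    affected = cur.execute(sql, args)
    conn.commit()
    return affected

if __name__ == '__main__':
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    exe_update_safe(conn, cur,
                    "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s)",
                    ("Some Book", "http://example.com/a.html", "a description with 'quotes'"))
    dbUtil.connClose(conn, cur)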
Book persistence operations: bookOpe.py
import logging

from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo

logging.basicConfig(level=logging.INFO)

class BookOperator(object):

    def __addBook(self, book):
        logging.info("add book:%s" % book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        insertBookSql = ("insert into book (bookName,bookUrl,bookInfo) values ('%s','%s','%s');"
                         % (book.bookName, book.downLoadUrl, book.mainInfo))
        dbUtil.exeUpdate(conn, cur, insertBookSql)
        dbUtil.connClose(conn, cur)

    def __selectLastBookId(self):
        logging.info("selectLastBookId")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):
        logging.info("add bookId:%s" % bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        for downLoadInfo in downLoadInfos:
            insertBookDownLoadInfo = ("insert into book_down_url (bookId,downName,downUrl) values ('%s','%s','%s');"
                                      % (bookId, downLoadInfo.downName, downLoadInfo.downUrl))
            dbUtil.exeUpdate(conn, cur, insertBookDownLoadInfo)
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):
        logging.info("add bookInfo:%s" % book.bookName)
        self.__addBook(book)
        bookId = self.__selectLastBookId()
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)

if __name__ == '__main__':
    bookope = BookOperator()
    book = Book("aaa", "yang", "cccc")
    book.addDownLoadUrl(DownLoadInfo("aaa.html", "book"))
    bookope.addBookInfo(book)
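__selectLastBookId fetches the highest id after the insert, which works for a single crawler but can pick up another process's row under concurrent writes. A hedged alternative (not what the repo does) is to read the auto-increment id directly from the cursor:

from DBUtil import DBUtils

# Sketch: cur.lastrowid holds the AUTO_INCREMENT id MySQL assigned to the
# last insert on this cursor, so no second query is needed.
dbUtil = DBUtils()
conn, cur = dbUtil.connDB()
cur.execute("insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s)",
            ("demo", "demo.html", "demo info"))
conn.commit()
bookId = cur.lastrowid
dbUtil.connClose(conn, cur)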
Book info classes: bookInfo.py
class Book(object):
    # book information

    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo
        self.downLoadUrl = downLoadUrl
        self.bookName = bookName
        self.downLoadInfos = []

    def addDownLoadUrl(self, downloadInfo):
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        print("bookName :%s" % (self.bookName))

class DownLoadInfo(object):
    # download information

    def __init__(self, downUrl, downName):
        self.downUrl = downUrl
        self.downName = downName

    def print_down_info(self):
        print("downLoad %s - %s" % (self.downUrl, self.downName))
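Both classes are plain data holders; a quick usage sketch (all values are made up):

from bookInfo import Book, DownLoadInfo

book = Book("a short description", "list152_1.html", "Example Book")
book.addDownLoadUrl(DownLoadInfo("http://example.com/dl1", "mirror 1"))
book.print_book_info()        # prints: bookName :Example Book
for info in book.downLoadInfos:
    info.print_down_info()    # prints: downLoad http://example.com/dl1 - mirror 1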
51job page-parsing file: FiveOneJobFetch.py
import logging

import requests
from bs4 import BeautifulSoup

from bookInfo import Book
from bookInfo import DownLoadInfo

class PageFetch(object):

    host = "http://www.zzvips.com/"  # site root (a scheme is required for requests to fetch it)
    category = "books/"              # category path appended to the domain

    def __init__(self, pageUrl):
        self.pageUrl = pageUrl  # the specific page requested
        self.url = PageFetch.host + PageFetch.category + pageUrl  # full URL

    def __getPageContent(self):
        req = requests.get(self.url)
        if req.status_code == 200:
            req.encoding = "gb2312"
            return req.text
        else:
            return ""

    @staticmethod
    def getPageContent(url):
        req = requests.get(url)
        if req.status_code == 200:
            req.encoding = "gb2312"
            return req.text
        else:
            return ""

    def __getMaxPageNumAndUrl(self):
        # find the last page; paging URLs look like list45_2.html, where 2 is the page number
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                print("data")  # debug output: dump the pager element
                print(ul)
                maxPageNum = ul.select("strong")[0].text
                alink = ul.select("a")
                if alink[-1]['href'] == "#":
                    maxLink = alink[1]['href']
                else:
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        # build a paging URL of the form list45_2.html
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

    def getBookPageList(self):
        # collect the URL of every list page in this category
        shortPageList = []
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        for i in range(int(maxPageNum)):
            shortPageList.append(self.host + self.category + self.__formatPage(i))
        return shortPageList

    @staticmethod
    def getDownloadPage(url):
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):
        logging.info("fetching book info, url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        mainInfo = soup.select("#soft-intro")[0].text.replace("截图:", "").replace("'", "")
        title = soup.select("dl dt h1")[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                downLoadInfo = DownLoadInfo(li.select("a")[0]['href'], li.select("a")[0].text)
                book.addDownLoadUrl(downLoadInfo)
        return book

if __name__ == '__main__':
    p = PageFetch("list152_1.html")
    shortPageList = p.getBookPageList()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    print("================ summary ===============================")
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for d in book.downLoadInfos:
            print("%s - %s" % (d.downUrl, d.downName))
    # p = PageFetch("list977_1.html")
    # p = p.getMaxPageNumAndUrl()
    # print(p)
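getPageContent returns an empty string on any non-200 response, and a stalled connection would hang the whole crawl since no timeout is set. Below is a hedged variant with a timeout and simple exponential backoff; the retry and timeout values are illustrative assumptions, not from the original code.

import time
import requests

# Sketch: same contract as getPageContent (returns "" on failure), but
# with a per-request timeout and a few retries.
def get_page_content_with_retry(url, retries=3, timeout=10):
    for attempt in range(retries):
        try:
            req = requests.get(url, timeout=timeout)
            if req.status_code == 200:
                req.encoding = "gb2312"
                return req.text
        except requests.RequestException:
            time.sleep(2 ** attempt)  # back off before the next attempt
    return ""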
Entry script 51Job.py: put all of the files above in the same folder and run this file.
from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
    p = PageFetch(url)
    shortPageList = p.getBookPageList()
    bookOperator = BookOperator()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        bookOperator.addBookInfo(book)
    print("scrape finished: " + url)

if __name__ == '__main__':
    urls = ["list152_35.html", "list300_2.html", "list476_6.html", "list977_2.html",
            "list572_5.html", "list509_2.html", "list481_1.html", "list576_1.html",
            "list482_1.html", "list483_1.html", "list484_1.html"]
    for url in urls:
        main(url)
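main() hits the site once per list page and once per book page with no pause, and the same download page can appear in several lists. A sketch of the same pipeline with deduplication and a delay between requests follows; the 1-second delay is an assumption, not from the original.

import time

from FiveOneJobFetch import PageFetch
from bookOpe import BookOperator

def main_polite(url, delay=1.0):
    bookOperator = BookOperator()
    seen = set()
    for page in PageFetch(url).getBookPageList():
        for downLoadPage in PageFetch.getDownloadPage(page):
            if downLoadPage in seen:
                continue  # skip download pages already stored
            seen.add(downLoadPage)
            bookOperator.addBookInfo(PageFetch.getBookInfo(downLoadPage))
            time.sleep(delay)  # crude rate limiting between book pages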
Database tables: the book info table and the download link table.
CREATE TABLE `book` (
    `id` INT(11) NOT NULL AUTO_INCREMENT,
    `bookName` VARCHAR(200) NULL DEFAULT NULL,
    `bookUrl` VARCHAR(500) NULL DEFAULT NULL,
    `bookInfo` TEXT NULL,
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
CREATE TABLE `book_down_url` (
    `id` INT(11) NOT NULL AUTO_INCREMENT,
    `bookId` INT(11) NOT NULL DEFAULT '0',
    `downName` VARCHAR(200) NOT NULL DEFAULT '0',
    `downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;
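book_down_url rows point back to book through bookId, so each book and its download links can be read back with a join; here is a sketch using the same DBUtils class:

from DBUtil import DBUtils

# Sketch: list every stored book together with its download links.
dbUtil = DBUtils()
conn, cur = dbUtil.connDB()
cur.execute("select b.bookName, u.downName, u.downUrl "
            "from book b join book_down_url u on u.bookId = b.id "
            "order by b.id")
for bookName, downName, downUrl in cur.fetchall():
    print("%s: %s - %s" % (bookName, downName, downUrl))
dbUtil.connClose(conn, cur)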
Git repository: https://git.oschina.net/yangsj/BookFetch/tree/master