Introduction
I wrote this small crawler to scrape internship postings from the campus forum (CC98). It is built mainly on the Requests library.
Source Code
URLs.py
Its job is to take an initial URL (containing a page parameter) and build the list of URLs from the current page number up to pageNum.
import re

def getURLs(url, attr, pageNum=1):
    """Build the list of page URLs from the current page up to pageNum."""
    all_links = []
    try:
        # Read the current page number out of the query string.
        now_page_number = int(re.search(attr + r'=(\d+)', url).group(1))
        for i in range(now_page_number, pageNum + 1):
            # Substitute each page number into the original URL.
            new_url = re.sub(attr + r'=\d+', attr + '=%s' % i, url)
            all_links.append(new_url)
        return all_links
    except TypeError:
        print "arguments TypeError: attr should be a string."
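As a quick sanity check, here is a minimal usage sketch using the same board URL as the crawler below (assuming URLs.py is importable from the current path):

from URLs import getURLs  # assumption: module is on the import path

links = getURLs('http://www.cc98.org/list.asp?boardid=459&page=1&action=', 'page', 3)
for link in links:
    print link
# http://www.cc98.org/list.asp?boardid=459&page=1&action=
# http://www.cc98.org/list.asp?boardid=459&page=2&action=
# http://www.cc98.org/list.asp?boardid=459&page=3&action=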
uni_2_native.py
Chinese text on the pages scraped from the forum comes back as numeric character references of the form &#XXXX;, so the fetched content has to be decoded before it can be used.
import sys
import re

reload(sys)
sys.setdefaultencoding('utf-8')

def get_native(raw):
    """Replace every numeric character reference (&#NNNN;) with its character."""
    tostring = raw
    while True:
        obj = re.search(r'&#(\d+);', tostring, flags=re.S)
        if obj is None:
            break
        entity, code = obj.group(0), obj.group(1)
        # Plain string replacement, so the matched text is never
        # re-interpreted as a regex pattern.
        tostring = tostring.replace(entity, unichr(int(code)))
    return tostring
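For example, the reference &#20013; is code point 20013, the character 中. A minimal check (Python 2, like the rest of the code):

from uni_2_native import get_native  # assumption: module is on the import path

print get_native('&#20013;&#25991; internship')  # -> 中文 internship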
Saving to the database: saveInfo.py (note: despite the class name saveSqlite, the records are actually written to MySQL via MySQLdb)
# -*- coding: utf-8 -*-
import MySQLdb

# Note: despite the name, this class writes to MySQL, not SQLite.
class saveSqlite():
    def __init__(self):
        self.infoList = []

    def saveSingle(self, author=None, title=None, date=None, url=None, reply=0, view=0):
        if author is None or title is None or date is None or url is None:
            print "No info saved!"
        else:
            singleDict = {}
            singleDict['author'] = author
            singleDict['title'] = title
            singleDict['date'] = date
            singleDict['url'] = url
            singleDict['reply'] = reply
            singleDict['view'] = view
            self.infoList.append(singleDict)

    def toMySQL(self):
        # Fill in your own MySQL password and database name.
        conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                               port=3306, db='db_name', charset='utf8')
        cursor = conn.cursor()
        # sql = "select * from info"
        # n = cursor.execute(sql)
        # for row in cursor.fetchall():
        #     for r in row:
        #         print r
        #     print '\n'
        # Clear the old records, then insert the fresh crawl in one batch.
        sql = "delete from info"
        cursor.execute(sql)
        conn.commit()
        sql = "insert into info(title,author,url,date,reply,view) values (%s,%s,%s,%s,%s,%s)"
        params = []
        for each in self.infoList:
            params.append((each['title'], each['author'], each['url'],
                           each['date'], each['reply'], each['view']))
        cursor.executemany(sql, params)
        conn.commit()
        cursor.close()
        conn.close()

    def show(self):
        for each in self.infoList:
            print "author: " + each['author']
            print "title: " + each['title']
            print "date: " + each['date']
            print "url: " + each['url']
            print "reply: " + str(each['reply'])
            print "view: " + str(each['view'])
            print '\n'

if __name__ == '__main__':
    save = saveSqlite()
    save.saveSingle('网', 'aaa', '2008-10-10 10:10:10', 'www.baidu.com', 1, 1)
    # save.show()
    save.toMySQL()
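toMySQL() assumes an info table already exists in db_name; the original post never shows its schema. A minimal setup sketch matching the six columns used by the INSERT might look like this (the column names come from the INSERT statement, but every type here is an assumption):

# One-off setup sketch: create the assumed `info` table.
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                       port=3306, db='db_name', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    create table if not exists info (
        title  varchar(255),
        author varchar(64),
        url    varchar(255),
        date   varchar(32),   -- stored as text, e.g. '2008-10-10 10:10:10'
        reply  int,
        view   int
    )
""")
conn.commit()
cursor.close()
conn.close()

Note the delete-then-insert pattern in toMySQL(): each run replaces the whole table with the latest crawl instead of appending, so repeated runs never accumulate duplicate rows.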
Main crawler code
import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# Forge a request header for the target site: copy these values from a
# real browser session (the Cookie is what keeps the crawler logged in).
headers = {'Accept': '', 'Accept-Encoding': '', 'Accept-Language': '',
           'Connection': '', 'Cookie': '', 'Host': '', 'Referer': '',
           'Upgrade-Insecure-Requests': '', 'User-Agent': ''}

url = 'http://www.cc98.org/list.asp?boardid=459&page=1&action='
cc98 = 'http://www.cc98.org/'

print "get information from cc98..."
urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.saveSqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    # Decode the &#XXXX; character references before parsing.
    html = uni_2_native.get_native(r.text)
    selector = etree.HTML(html)
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')
    for each in content_tr_list:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            continue
        # Each xpath() call returns a one-element list here; iterating is
        # a workaround, as indexing did not work for the original author.
        for each_href in href:
            link = cc98 + each_href
        # The @title attribute packs title, author and date into one string.
        title_author_time = each.xpath('./td[2]/a/@title')
        for info in title_author_time:
            info_split = info.split('\n')
            title = info_split[0][1:len(info_split[0]) - 1]
            author = info_split[1][3:]
            date = info_split[2][3:]
        # The fourth cell holds "replies/views".
        hot = each.xpath('./td[4]/text()')
        for hot_num in hot:
            reply_view = hot_num.strip().split('/')
            reply, view = reply_view[0], reply_view[1]
        savetools.saveSingle(author=author, title=title, date=date,
                             url=link, reply=reply, view=view)

print "All got! Now saving to Database..."
# savetools.show()
savetools.toMySQL()
print "ALL CLEAR! Have Fun!"
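The slicing of info_split above relies on the @title attribute packing three newline-separated lines: a bracketed title, then author and date lines that each start with a three-character label. The exact labels are not shown in the original post; the sample below is a hypothetical illustration of the assumed format:

# -*- coding: utf-8 -*-
# Hypothetical @title value; the 作者/时间 labels are guesses.
info = u'[Summer internship posting]\n作者:someone\n时间:2017-03-05 12:00:00'
parts = info.split('\n')
print parts[0][1:len(parts[0]) - 1]  # Summer internship posting
print parts[1][3:]                   # someone
print parts[2][3:]                   # 2017-03-05 12:00:00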
That is all for this article. I hope it helps with your studies, and thank you for supporting 服务器之家.
Original article: https://blog.csdn.net/qq_22187919/article/details/60466283