Python爬虫包BeautifulSoup实例（三）_Python

一步一步构建一个爬虫实例，抓取糗事百科的段子

先不用beautifulsoup包来进行解析

第一步，访问网址并抓取源码

									# -*- coding: utf-8 -*-

									# @Author: HaonanWu

									# @Date:  2016-12-22 16:16:08

									# @Last Modified by:  HaonanWu

									# @Last Modified time: 2016-12-22 20:17:13

									import urllib

									import urllib2

									import re

									import os

									if __name__ == '__main__':

									  # 访问网址并抓取源码

									  url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'

									  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'

									  headers = {'User-Agent':user_agent}

									  try:

									    request = urllib2.Request(url = url, headers = headers)

									    response = urllib2.urlopen(request)

									    content = response.read()

									  except urllib2.HTTPError as e:

									    print e

									    exit()

									  except urllib2.URLError as e:

									    print e

									    exit()

									  print content.decode('utf-8')

第二步，利用正则表达式提取信息

首先观察源码，找到所需内容所在的位置以及可以用来识别它的特征
然后用正则表达式去匹配并提取
注意正则表达式中的 . 默认是不能匹配 \n 的，所以需要设置匹配模式（re.S，即 re.DOTALL），让 . 也能匹配换行符。

				?

									# -*- coding: utf-8 -*-

									# @Author: HaonanWu

									# @Date:  2016-12-22 16:16:08

									# @Last Modified by:  HaonanWu

									# @Last Modified time: 2016-12-22 20:17:13

									import urllib

									import urllib2

									import re

									import os

									if __name__ == '__main__':

									  # 访问网址并抓取源码

									  url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'

									  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'

									  headers = {'User-Agent':user_agent}

									  try:

									    request = urllib2.Request(url = url, headers = headers)

									    response = urllib2.urlopen(request)

									    content = response.read()

									  except urllib2.HTTPError as e:

									    print e

									    exit()

									  except urllib2.URLError as e:

									    print e

									    exit()

									  regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

									  items = re.findall(regex, content)

									  # 提取数据

									  # 注意换行符，设置 . 能够匹配换行符

									  for item in items:

									    print item

第三步，修正数据并保存到文件中

				?

									# -*- coding: utf-8 -*-

									# @Author: HaonanWu

									# @Date:  2016-12-22 16:16:08

									# @Last Modified by:  HaonanWu

									# @Last Modified time: 2016-12-22 21:41:32

									import urllib

									import urllib2

									import re

									import os

									if __name__ == '__main__':

									  # 访问网址并抓取源码

									  url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'

									  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'

									  headers = {'User-Agent':user_agent}

									  try:

									    request = urllib2.Request(url = url, headers = headers)

									    response = urllib2.urlopen(request)

									    content = response.read()

									  except urllib2.HTTPError as e:

									    print e

									    exit()

									  except urllib2.URLError as e:

									    print e

									    exit()

									  regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

									  items = re.findall(regex, content)

									  # 提取数据

									  # 注意换行符，设置 . 能够匹配换行符

									  path = './qiubai'

									  if not os.path.exists(path):

									    os.makedirs(path)

									  count = 1

									  for item in items:

									    #整理数据，去掉\n,将<br/>换成\n

									    item = item.replace('\n', '').replace('<br/>', '\n')

									    filepath = path + '/' + str(count) + '.txt'

									    f = open(filepath, 'w')

									    f.write(item)

									    f.close()

									    count += 1

第四步，将多个页面下的内容都抓取下来

				?

									# -*- coding: utf-8 -*-

									# @Author: HaonanWu

									# @Date:  2016-12-22 16:16:08

									# @Last Modified by:  HaonanWu

									# @Last Modified time: 2016-12-22 20:17:13

									import urllib

									import urllib2

									import re

									import os

									if __name__ == '__main__':

									  # 访问网址并抓取源码

									  path = './qiubai'

									  if not os.path.exists(path):

									    os.makedirs(path)

									  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'

									  headers = {'User-Agent':user_agent}

									  regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

									  count = 1

									  for cnt in range(1, 35):

									    print '第' + str(cnt) + '轮'

									    url = 'http://www.qiushibaike.com/textnew/page/' + str(cnt) + '/?s=4941357'

									    try:

									      request = urllib2.Request(url = url, headers = headers)

									      response = urllib2.urlopen(request)

									      content = response.read()

									    except urllib2.HTTPError as e:

									      print e

									      exit()

									    except urllib2.URLError as e:

									      print e

									      exit()

									    # print content

									    # 提取数据

									    # 注意换行符，设置 . 能够匹配换行符

									    items = re.findall(regex, content)

									    # 保存信息

									    for item in items:

									      #  print item

									      #整理数据，去掉\n,将<br/>换成\n

									      item = item.replace('\n', '').replace('<br/>', '\n')

									      filepath = path + '/' + str(count) + '.txt'

									      f = open(filepath, 'w')

									      f.write(item)

									      f.close()

									      count += 1

									  print '完成'

使用BeautifulSoup对源码进行解析

				?

									# -*- coding: utf-8 -*-

									# @Author: HaonanWu

									# @Date:  2016-12-22 16:16:08

									# @Last Modified by:  HaonanWu

									# @Last Modified time: 2016-12-22 21:34:02

									import urllib

									import urllib2

									import re

									import os

									from bs4 import BeautifulSoup

									if __name__ == '__main__':

									  url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'

									  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'

									  headers = {'User-Agent':user_agent}

									  request = urllib2.Request(url = url, headers = headers)

									  response = urllib2.urlopen(request)

									  # print response.read()

									  soup_packetpage = BeautifulSoup(response, 'lxml')

									  items = soup_packetpage.find_all("div", class_="content")

									  for item in items:

									    try:

									      content = item.span.string

									    except AttributeError as e:

									      print e

									      exit()

									    if content:

									      print content + "\n"

这是用BeautifulSoup去抓取书本以及其价格的代码
可以通过对比，了解 bs4 是如何读取标签以及标签中的内容的
（因为我自己也没有学到这一部分，目前只能依葫芦画瓢地写）

				?

									# -*- coding: utf-8 -*-

									# @Author: HaonanWu

									# @Date:  2016-12-22 20:37:38

									# @Last Modified by:  HaonanWu

									# @Last Modified time: 2016-12-22 21:27:30

									import urllib2

									import urllib

									import re 

									from bs4 import BeautifulSoup 

									url = "https://www.packtpub.com/all"

									try:

									  html = urllib2.urlopen(url) 

									except urllib2.HTTPError as e:

									  print e

									  exit()

									soup_packtpage = BeautifulSoup(html, 'lxml') 

									all_book_title = soup_packtpage.find_all("div", class_="book-block-title") 

									price_regexp = re.compile(u"\s+\$\s\d+\.\d+") 

									for book_title in all_book_title: 

									  try:

									    print "Book's name is " + book_title.string.strip()

									  except AttributeError as e:

									    print e

									    exit()

									  book_price = book_title.find_next(text=price_regexp) 

									  try:

									    print "Book's price is "+ book_price.strip()

									  except AttributeError as e:

									    print e

									    exit()

									  print ""