Goal: crawl Tencent's recruitment listings. For each job we want the job title, the detail-page link, the job category, the number of openings, the work location, and the publish date.
1. Create the Scrapy project
scrapy startproject Tencent
After the command runs, a Tencent folder is created with the structure shown below.
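This is the standard layout scrapy startproject generates (newer Scrapy versions also add a middlewares.py):

Tencent/
    scrapy.cfg            # deployment / project configuration file
    Tencent/              # the project's Python package
        __init__.py
        items.py          # item definitions (step 2)
        pipelines.py      # item pipelines (step 4)
        settings.py       # project settings (step 5)
        spiders/          # spider code lives here (step 3)
            __init__.py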
2. Write the item file, defining one field for each piece of data to crawl
# -*- coding: utf-8 -*-
import scrapy


class TencentItem(scrapy.Item):
    # Job title
    positionname = scrapy.Field()
    # Detail-page link
    positionlink = scrapy.Field()
    # Job category
    positionType = scrapy.Field()
    # Number of openings
    peopleNum = scrapy.Field()
    # Work location
    workLocation = scrapy.Field()
    # Publish date
    publishTime = scrapy.Field()
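Once its fields are declared, a TencentItem behaves much like a dictionary; assigning a key that was not declared as a Field raises a KeyError. A quick illustration with made-up values, run from the project root:

from Tencent.items import TencentItem

item = TencentItem()
item['positionname'] = "22989-Backend Engineer"   # hypothetical value
item['workLocation'] = "Shenzhen"                 # hypothetical value
print(dict(item))   # {'positionname': '22989-Backend Engineer', 'workLocation': 'Shenzhen'}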
3. Write the spider file
Enter the Tencent directory and create a basic spider with:
# tencentPosition is the spider name, tencent.com is the crawl scope
scrapy genspider tencentPosition "tencent.com"
The command creates a tencentPosition.py file in the spiders folder; now edit it as follows:
# -*- coding: utf-8 -*-
import scrapy

from Tencent.items import TencentItem


class TencentpositionSpider(scrapy.Spider):
    """Crawl Tencent's recruitment listings."""

    # Spider name
    name = "tencentPosition"
    # Crawl scope
    allowed_domains = ["tencent.com"]

    url = "http://hr.tencent.com/position.php?&start="
    offset = 0
    # Start URL
    start_urls = [url + str(offset)]

    def parse(self, response):
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            # Initialize an item object
            item = TencentItem()
            # Job title
            item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
            # Detail-page link
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
            # Job category
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            # Number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            # Work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            # Publish date
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item

        # After each page is processed, request the next one:
        # self.offset grows by 10, is appended to the base url, and the
        # new response is handled by this same parse() callback
        if self.offset < 1680:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
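Before running the full spider, it can help to sanity-check the XPath expressions in Scrapy's interactive shell. A rough sketch of that workflow (the hr.tencent.com page has changed since the original post, so the exact results are not guaranteed):

# From the project directory, open the page in Scrapy's shell:
#     scrapy shell "http://hr.tencent.com/position.php?&start=0"
# then try the selectors against the live response object:
rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
print(len(rows))                                    # 10 rows per page if the markup is unchanged
print(rows[0].xpath("./td[1]/a/text()").extract())  # first job title on the page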
4. Write the pipelines file
# -*- coding: utf-8 -*-
import json


class TencentPipeline(object):
    """Save item data to a JSON file."""

    def __init__(self):
        self.filename = open("tencent.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        self.filename.close()
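Note that this pipeline writes a series of comma-terminated objects rather than a strict JSON document. If you prefer one object per line (JSON Lines), a small hypothetical variant could look like this (TencentJsonLinesPipeline and tencent.jsonl are names introduced here, not part of the original project):

import json


class TencentJsonLinesPipeline(object):
    """Write each item as one JSON object per line (JSON Lines)."""

    def open_spider(self, spider):
        self.file = open("tencent.jsonl", "w", encoding="utf-8")

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()

To use it instead of TencentPipeline, point ITEM_PIPELINES at it in settings.py.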
5. Configure the settings file (key settings)
# Set the default request headers
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
}

# Enable the item pipeline
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
}
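Two more settings are often worth adjusting for a crawl like this, although they are not part of the original post: whether to obey robots.txt (recent Scrapy versions obey it by default) and how fast to send requests.

# Optional additions (not in the original post):
ROBOTSTXT_OBEY = False   # skip robots.txt checks
DOWNLOAD_DELAY = 1       # wait 1 second between requests to be gentle on the server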
Finally, run the spider with:
# tencentPosition is the spider name
scrapy crawl tencentPosition
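As an alternative to the custom pipeline, Scrapy's built-in feed export can write the items directly, choosing the format from the file extension:

# Alternative: skip the pipeline and let the feed export write the file
scrapy crawl tencentPosition -o tencent.json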
Rewriting the spider with the CrawlSpider class
# Create the project
scrapy startproject TencentSpider

# Inside the project directory, create a crawl-template spider
scrapy genspider -t crawl tencent tencent.com

The items file and the other files stay the same; only the spider file needs rewriting:

# -*- coding: utf-8 -*-
import scrapy
# Import the CrawlSpider and Rule classes
from scrapy.spiders import CrawlSpider, Rule
# Import the link-extractor class, used to pull out links that match a rule
from scrapy.linkextractors import LinkExtractor

from TencentSpider.items import TencentItem


class TencentSpider(CrawlSpider):
    name = "tencent"
    allowed_domains = ["hr.tencent.com"]
    start_urls = ["http://hr.tencent.com/position.php?&start=0#a"]

    # Extraction rule for links in each response; returns the list of matching links
    pagelink = LinkExtractor(allow=(r"start=\d+"))

    rules = [
        # Request every extracted link, keep following new pages,
        # and handle each response with the named callback
        Rule(pagelink, callback="parseTencent", follow=True)
    ]

    # The callback
    def parseTencent(self, response):
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentItem()
            # Job title
            item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
            # Detail-page link
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
            # Job category
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            # Number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            # Work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            # Publish date
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item
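The CrawlSpider version is run the same way, using its own spider name:

# tencent is the spider name defined in the CrawlSpider version
scrapy crawl tencent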
Summary
That covers this worked example of the Python crawling framework Scrapy. I hope it helps; if you have any questions, leave a comment and I'll get back to you as soon as I can.
Original article: https://www.cnblogs.com/xinyangsdut/p/7628770.html