网站首页 文章专栏 scrapy爬虫标准流程
一般我们一个小的爬虫项目,比如爬取一些文章等,直接用python的requests库,配合re模块就可以很快的完成。但是对于一些大规模的爬取,我们需要实现多线程、异步io,数据库连接等操作,自己从头写起会有些麻烦。这时可以用scrapy这个爬虫框架。
Scrapy使用了Twisted作为框架,Twisted有些特殊的地方是它是事件驱动的,并且比较适合异步的代码。对于会阻塞线程的操作包含访问文件、数据库或者Web、产生新的进程并需要处理新进程的输出(如运行shell命令)、执行系统层次操作的代码(如等待系统队列),Twisted提供了允许执行上面的操作但不会阻塞代码执行的方法。
scrapy startproject bing_search
命令执行后,会创建一个bing_search文件夹.
scrapy genspider example example.com
执行命令后会在spiders文件夹中创建一个example.py的文件。
# tencentPosition为爬虫名 scrapy crawl tencentPosition
items.py
负责数据模型的建立,类似于实体类。
items.py里存放的是我们要爬取数据的字段信息,代码如下: 我们分别要爬取的信息包括:文章标题,文件发布时间,文章url地址,url_object_id是我们会对地址进行md5加密,front_image_url 是文章下图片的url地址,front_image_path图片的存放路径
class JoBoleArticleItem(scrapy.Item): title = scrapy.Field() create_date = scrapy.Field() url = scrapy.Field() url_object_id = scrapy.Field() front_image_url = scrapy.Field() front_image_path = scrapy.Field() praise_nums = scrapy.Field() fav_nums = scrapy.Field() comment_nums = scrapy.Field() tag = scrapy.Field() content = scrapy.Field()
pipelines.py
负责对spider返回数据的处理。
pipeline主要是对spiders中爬虫的返回的数据的处理,这里我们可以让写入到数据库,也可以让写入到文件等等。 下面代码中主要包括的写入到json文件以及写入到数据库,包括异步插入到数据库,还有图片的处理,这里我们可以定义各种我们需要的pipeline,当然这里我们不同的pipeline是有一定的顺序的,需要的设置是在settings配置文件中,如下,后面的数字表示的是优先级,数字越小优先级越高。
# -*- coding: utf-8 -*- import json class TencentPipeline(object): """ 功能:保存item数据 """ def __init__(self): self.filename = open("tencent.json", "w") def process_item(self, item, spider): text = json.dumps(dict(item), ensure_ascii = False) + ",\n" self.filename.write(text.encode("utf-8")) return item def close_spider(self, spider): self.filename.close()
middlewares.py
settings.py
# 设置请求头部,添加url DEFAULT_REQUEST_HEADERS = { "User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;", 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' } # 设置item——pipelines ITEM_PIPELINES = { 'tencent.pipelines.TencentPipeline': 300, }
spiders/
负责存放继承自scrapy的爬虫类。
基础爬虫类
# -*- coding: utf-8 -*- import scrapy from tencent.items import TencentItem class TencentpositionSpider(scrapy.Spider): """ 功能:爬取腾讯社招信息 """ # 爬虫名 name = "tencentPosition" # 爬虫作用范围 allowed_domains = ["tencent.com"] url = "http://hr.tencent.com/position.php?&start=" offset = 0 # 起始url start_urls = [url + str(offset)] def parse(self, response): for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"): # 初始化模型对象 item = TencentItem() # 职位名称 item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0] # 详情连接 item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0] # 职位类别 item['positionType'] = each.xpath("./td[2]/text()").extract()[0] # 招聘人数 item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0] # 工作地点 item['workLocation'] = each.xpath("./td[4]/text()").extract()[0] # 发布时间 item['publishTime'] = each.xpath("./td[5]/text()").extract()[0] yield item if self.offset < 1680: self.offset += 10 # 每次处理完一页的数据之后,重新发送下一页页面请求 # self.offset自增10,同时拼接为新的url,并调用回调函数self.parse处理Response yield scrapy.Request(self.url + str(self.offset), callback = self.parse)
crawl 爬虫类
# -*- coding:utf-8 -*- import scrapy # 导入CrawlSpider类和Rule from scrapy.spiders import CrawlSpider, Rule # 导入链接规则匹配类,用来提取符合规则的连接 from scrapy.linkextractors import LinkExtractor from TencentSpider.items import TencentItem class TencentSpider(CrawlSpider): name = "tencent" allow_domains = ["hr.tencent.com"] start_urls = ["http://hr.tencent.com/position.php?&start=0#a"] # Response里链接的提取规则,返回的符合匹配规则的链接匹配对象的列表 pagelink = LinkExtractor(allow=("start=\d+")) rules = [ # 获取这个列表里的链接,依次发送请求,并且继续跟进,调用指定回调函数处理 Rule(pagelink, callback = "parseTencent", follow = True) ] # 指定的回调函数 def parseTencent(self, response): for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"): item = TencentItem() # 职位名称 item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0] # 详情连接 item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0] # 职位类别 item['positionType'] = each.xpath("./td[2]/text()").extract()[0] # 招聘人数 item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0] # 工作地点 item['workLocation'] = each.xpath("./td[4]/text()").extract()[0] # 发布时间 item['publishTime'] = each.xpath("./td[5]/text()").extract()[0] yield item
scrapy.cfg
scrapy基础配置
一些其他的爬虫pipeline,可能有用,比如说写入数据库等
class JobbolespiderPipeline(object): def process_item(self, item, spider): return item class JsonWithEncodingPipeline(object): ''' 返回json数据到文件 ''' def __init__(self): self.file = codecs.open("article.json",'w',encoding="utf-8") def process_item(self, item, spider): lines = json.dumps(dict(item),ensure_ascii=False) + "\n" self.file.write(lines) return item def spider_closed(self,spider): self.file.close() class MysqlPipeline(object): ''' 插入mysql数据库 ''' def __init__(self): self.conn =pymysql.connect(host='192.168.1.19',port=3306,user='root',passwd='123456',db='article_spider',use_unicode=True, charset="utf8") self.cursor = self.conn.cursor() def process_item(self,item,spider): insert_sql = ''' insert into jobbole_article(title,create_date,url,url_object_id,front_image_url,front_image_path,comment_nums,fav_nums,praise_nums,tag,content) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ''' self.cursor.execute(insert_sql,(item["title"],item["create_date"],item["url"],item["url_object_id"],item["front_image_url"],item["front_image_path"],item["comment_nums"],item["fav_nums"],item["praise_nums"],item["tag"],item["content"])) self.conn.commit() class MysqlTwistedPipline(object): ''' 采用异步的方式插入数据 ''' def __init__(self,dbpool): self.dbpool = dbpool @classmethod def from_settings(cls,settings): dbparms = dict( host = settings["MYSQL_HOST"], port = settings["MYSQL_PORT"], user = settings["MYSQL_USER"], passwd = settings["MYSQL_PASSWD"], db = settings["MYSQL_DB"], use_unicode = True, charset="utf8", ) dbpool = adbapi.ConnectionPool("pymysql",**dbparms) return cls(dbpool) def process_item(self,item,spider): ''' 使用twisted将mysql插入变成异步 :param item: :param spider: :return: ''' query = self.dbpool.runInteraction(self.do_insert,item) query.addErrback(self.handle_error) def handle_error(self,failure): #处理异步插入的异常 print(failure) def do_insert(self,cursor,item): #具体插入数据 insert_sql = ''' insert into jobbole_article(title,create_date,url,url_object_id,front_image_url,front_image_path,comment_nums,fav_nums,praise_nums,tag,content) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ''' cursor.execute(insert_sql,(item["title"],item["create_date"],item["url"],item["url_object_id"],item["front_image_url"],item["front_image_path"],item["comment_nums"],item["fav_nums"],item["praise_nums"],item["tag"],item["content"])) class ArticleImagePipeline(ImagesPipeline): ''' 对图片的处理 ''' def item_completed(self, results, item, info): for ok ,value in results: if ok: image_file_path = value["path"] item['front_image_path'] = image_file_path else: item['front_image_path'] = "" return item