Scrapy: Scraping 糗事百科

waterr 2021-02-27 17:42

1. Change into the desktop directory

cd C:\Users\Mr_wa\Desktop

2. Create a new project

scrapy startproject qsbk

3. Create a new spider

cd qsbk

scrapy genspider qsbk_spider qiushibaike.com 
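
At this point Scrapy's standard project template has been generated; the layout should look like this:

qsbk/
├── scrapy.cfg
└── qsbk/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── qsbk_spider.py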

4. Edit settings.py

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
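
Optionally, the crawl can be throttled so the site is not hit too hard. DOWNLOAD_DELAY and CONCURRENT_REQUESTS_PER_DOMAIN are standard Scrapy settings; the values below are only suggestions, not something the rest of this walkthrough depends on:

# optional politeness settings (values are arbitrary suggestions)
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 8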

5. Edit qsbk_spider.py (set start_urls)

import scrapy


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        pass
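
A quick orientation: name is what scrapy crawl uses to find the spider; allowed_domains filters out offsite requests; each URL in start_urls is fetched at startup, and its response is passed to parse, the default callback.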

6. Create a runner script start.py in the project root

from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'qsbk_spider'])
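
Running start.py (e.g. from the IDE's run button) is equivalent to typing scrapy crawl qsbk_spider in the project root; cmdline.execute simply hands the argument list to Scrapy's command line.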

7. Test the spider (check the type of response)

import scrapy


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        print("===================")
        print(type(response))
        print("===================")

8. Output when running start.py

===================
<class 'scrapy.http.response.html.HtmlResponse'>
===================

9. Parse the data with XPath

Extract each post's author and joke text:


import scrapy


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        # content_list : SelectorList
        content_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        # content : Selector
        for content in content_list:
            author = content.xpath(".//h2/text()").get().strip()
            text = content.xpath(".//div[@class='content']//text()").getall()
            text = "".join(text).strip()
            duanzi = {'author': author, 'content': text}
            print(duanzi)
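
Note the difference between the two extraction methods: .get() returns the first match as a string (or None when nothing matches), while .getall() returns a list of every match. Expressions like these are easiest to develop interactively in Scrapy's shell:

scrapy shell https://www.qiushibaike.com/text/page/1/

>>> posts = response.xpath("//div[@class='col1 old-style-col1']/div")
>>> posts[0].xpath(".//h2/text()").get()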

10. Store the data

Define the saved data's type in items.py, instead of the plain dict used above (duanzi = {'author': author, 'content': text}):

import scrapy


class QsbkItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    # named "content" to match QsbkItem(author=..., content=...) and item["content"] below
    content = scrapy.Field()
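
A QsbkItem behaves like a dict, but only declared fields may be set, so a typo fails immediately instead of silently creating a new key; a quick illustration (the values here are made up):

item = QsbkItem(author='someone', content='a joke')
item['author']          # dict-style access works
item['body'] = 'oops'   # raises KeyError: undeclared field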

qsbk_spider.py

import scrapy
from ..items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        # content_list : SelectorList
        content_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        # content : selector
        for content in content_list:
            author = content.xpath(".//h2/text()").get().strip()
            text = content.xpath(".//div[@class='content']//text()").getall()
            text = "".join(text).strip()
            item = QsbkItem(author=author, content=text)
            yield item

Uncomment ITEM_PIPELINES in settings.py:

ITEM_PIPELINES = {
   'qsbk.pipelines.QsbkPipeline': 300,
}
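
The value 300 is the pipeline's order; by convention it falls in the 0-1000 range, and lower numbers run first when several pipelines are enabled.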

Save the data in pipelines.py:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json 

class QsbkPipeline:

    def process_item(self, item, spider):
        # append one JSON object per line (JSON Lines), so the file stays parseable
        with open('qsbk.json', 'a', encoding='utf-8') as f:
            f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
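
Reopening the file for every single item works, but a common refinement keeps it open for the whole crawl via Scrapy's standard open_spider/close_spider pipeline hooks; a minimal sketch:

import json

class QsbkPipeline:

    def open_spider(self, spider):
        # called once when the crawl starts
        self.f = open('qsbk.json', 'w', encoding='utf-8')

    def close_spider(self, spider):
        # called once when the crawl ends
        self.f.close()

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item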

The items are now saved to qsbk.json, one JSON object per line.

11. Crawl multiple pages and save as CSV

qsbk_spider.py

import scrapy
from ..items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        # content_list : SelectorList
        content_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        # content : selector
        for content in content_list:
            author = content.xpath(".//h2/text()").get().strip()
            text = content.xpath(".//div[@class='content']//text()").getall()
            text = "".join(text).strip()
            item = QsbkItem(author=author, content=text)

            yield item

        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        
        if next_url:
            next_url = 'https://www.qiushibaike.com' + next_url
            yield scrapy.Request(next_url, callback=self.parse)
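
String concatenation works here because the site emits root-relative hrefs, but response.urljoin is the more robust, idiomatic alternative, since it resolves relative and absolute links alike:

next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
if next_url:
    # urljoin resolves the href against the URL of the current page
    yield scrapy.Request(response.urljoin(next_url), callback=self.parse)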

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import csv 

class QsbkPipeline:
    def process_item(self, item, spider):
        with open('qsbk.csv', 'a', encoding='utf-8', newline="") as f:
            writer = csv.writer(f)
            writer.writerow([item["author"], item["content"]])
        return item
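
For a simple dump like this, a custom pipeline is not strictly required: Scrapy's built-in feed exports write the yielded items directly, inferring the format from the file extension:

scrapy crawl qsbk_spider -o qsbk.csv
scrapy crawl qsbk_spider -o qsbk.json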
