Web Scraping with Python - restarting the spider before it closes

Problem description

I'm scraping some text from the web with a Python spider. I have to type a word, press Enter, and the spider starts. I'd like to know whether there is a way to make the spider restart once it finishes, before it closes: for example, I pass it a word, the spider scrapes the information, and when it's done it goes back to the beginning and asks me for another word, and so on.

Here is my code:

spider.py:

import scrapy
from verbos.items import MercadoItem


class MercadoSpider(scrapy.Spider):
    name = 'verbos'
    allowed_domains = ['www.pt.bab.la']
    print("Ingrese un verbo:")
    variable = input()
    start_urls = ['https://pt.bab.la/verbo/portugues/' + variable]

    eu_xpaths = (
        '//*[@id="conjFull"]/div[2]/div/div[2]/div[1]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[4]/div[1]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[3]/div[1]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[5]/div[1]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[6]/div[1]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[8]/div[1]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[3]/div[1]/div[2]/text()',
        '//*[@class="content-column"]/div[6]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
        '//*[@class="content-column"]/div[5]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
    )
    tu_xpaths = (
        '//*[@id="conjFull"]/div[2]/div/div[2]/div[2]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[4]/div[2]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[3]/div[2]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[5]/div[2]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[6]/div[2]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[8]/div[2]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[2]/div[2]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[3]/div[2]/div[2]/text()',
        '//*[@class="content-column"]/div[6]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
        '//*[@class="content-column"]/div[5]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
    )
    voce_xpaths = (
        '//*[@id="conjFull"]/div[2]/div/div[2]/div[3]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[4]/div[3]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[3]/div[3]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[5]/div[3]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[6]/div[3]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[8]/div[3]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[2]/div[3]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[3]/div[3]/div[2]/text()',
        '//*[@class="content-column"]/div[6]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
        '//*[@class="content-column"]/div[5]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
    )
    nos_xpaths = (
        '//*[@id="conjFull"]/div[2]/div/div[2]/div[4]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[4]/div[4]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[3]/div[4]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[5]/div[4]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[6]/div[4]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[8]/div[4]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[2]/div[4]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[3]/div[4]/div[2]/text()',
        '//*[@class="content-column"]/div[6]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
        '//*[@class="content-column"]/div[5]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
    )
    voces_xpaths = (
        '//*[@id="conjFull"]/div[2]/div/div[2]/div[6]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[4]/div[6]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[3]/div[6]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[5]/div[6]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[6]/div[6]/div[2]/text()',
        '//*[@id="conjFull"]/div[2]/div/div[8]/div[6]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[2]/div[6]/div[2]/text()',
        '//*[@class="content-column"]/div[3]/div[1]/div[1]/div[3]/div[6]/div[2]/text()',
        '//*[@class="content-column"]/div[6]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
        '//*[@class="content-column"]/div[5]/div[1]/div[1]/div[2]/div[1]/div[2]/text()',
    )
    campos = ("eu", "tu", "voce", "nos", "voces")


    def parse(self, response):
        response.selector.remove_namespaces()

        # Walk the five xpath tuples in parallel; each pass yields one item
        # holding the conjugated forms for eu/tu/voce/nos/voces.
        for xpaths in zip(self.eu_xpaths, self.tu_xpaths, self.voce_xpaths,
                          self.nos_xpaths, self.voces_xpaths):
            data = [response.xpath(xpath).getall() for xpath in xpaths]
            yield {campo: valor for campo, valor in zip(self.campos, data)}
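A side note on `variable = input()`: it sits in the class body, so it runs exactly once, at import time, which is why the spider can only ask for one word per process. Scrapy's built-in way to pass a per-run value is a spider argument (`-a key=value`); a minimal sketch of that variant (the `verbo` argument name is my choice, not from the original code):

# Sketch: read the verb as a spider argument instead of input().
# Run with: scrapy crawl verbos -a verbo=falar
class MercadoSpider(scrapy.Spider):
    name = 'verbos'

    def __init__(self, verbo='', *args, **kwargs):
        # Scrapy forwards -a key=value pairs to __init__ as keyword arguments.
        super().__init__(*args, **kwargs)
        self.start_urls = ['https://pt.bab.la/verbo/portugues/' + verbo]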

settings.py:

BOT_NAME = 'verbos'

SPIDER_MODULES = ['verbos.spiders']
NEWSPIDER_MODULE = 'verbos.spiders'

ITEM_PIPELINES = {'verbos.pipelines.MercadoPipeline': 200}

FEED_FORMAT = "csv"
FEED_URI = 'verbos_items.csv'
FEED_EXPORT_ENCODING='cp1252'

ROBOTSTXT_OBEY = False
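
As an aside, `FEED_FORMAT`/`FEED_URI` still work in older releases but were superseded by the single `FEEDS` setting in Scrapy 2.1; an equivalent configuration, assuming a recent Scrapy version, would be:

# Modern equivalent of FEED_FORMAT / FEED_URI / FEED_EXPORT_ENCODING (Scrapy 2.1+).
FEEDS = {
    'verbos_items.csv': {
        'format': 'csv',
        'encoding': 'cp1252',
    },
}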

Thanks!

Tags: python, csv, scrapy

Solution
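
A Scrapy spider cannot simply jump back to the start inside one process: the crawl runs on Twisted's reactor, and once the reactor has stopped it cannot be started again (you get `ReactorNotRestartable`). A common workaround is to keep the input loop outside Scrapy and launch each crawl in a fresh subprocess. A minimal sketch, assuming the spider has been changed to take the word as a `-a verbo=...` argument as outlined above (the script name and prompt text are placeholders):

# run_verbos.py - ask for a word, crawl, and repeat until the input is empty.
import subprocess

while True:
    verbo = input("Ingrese un verbo (empty to quit): ")
    if not verbo:
        break
    # Each run gets its own process, so the Twisted reactor starts fresh
    # every time and the restart problem never arises.
    subprocess.run(["scrapy", "crawl", "verbos", "-a", "verbo=" + verbo])

Depending on the Scrapy version, successive runs may append to or overwrite verbos_items.csv, so it is worth checking the feed file between runs.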

