python - Scrapy 中的 ItemLoader
问题描述
我似乎无法让 ItemLoader 正常工作。Scrapy 日志中没有任何错误,但就是没有提取到任何内容。任何想法都会有所帮助!
import scrapy
from medium.items import MediumItem
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider
import logging
from scrapy.utils.log import configure_logging
class DataSpider(CrawlSpider):
    """Spider from the question: load article metadata from a Medium tag archive page.

    For every article container found on the response, an ``ItemLoader`` is
    filled from CSS/XPath selectors and the resulting ``MediumItem`` is yielded.
    """

    custom_settings = {
        'LOG_FILE': 'my_log.log',
        'LOG_LEVEL': 'ERROR'}
    # Executed once, when the class body is evaluated: attaches a stream
    # handler to the root logger (in addition to the LOG_FILE setting above).
    logging.getLogger().addHandler(logging.StreamHandler())
    name = 'data'
    allowed_domains = ['medium.com', 'towardsdatascience.com']
    start_urls = ['https://medium.com/tag/python/archive/02/01']
    # NOTE: with 302 listed here, a redirect response is handed to parse()
    # as-is instead of being followed to its target page.
    handle_httpstatus_list = [302]

    def parse(self, response):
        """Yield one loaded item per article block that contains the expected button link."""
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:
            # Only articles that expose this button link (non-empty href) are loaded.
            if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                l = ItemLoader(item=MediumItem(), selector=article)
                # Keep only the first extracted value for every field.
                l.default_output_processor = scrapy.loader.processors.TakeFirst()
                l.add_css('Title', 'div > h3::text')
                l.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
                l.add_css('Read', 'span::attr(title)')
                l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"]/text()')
                l.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
                l.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
                # 'Page' must be a declared field on MediumItem for this to load.
                l.add_value('Page', response.url)
                yield l.load_item()
项目文件是
import scrapy
from scrapy.item import Item, Field
class MediumItem(Item):
    """Item from the question: one scraped Medium article.

    NOTE: no ``Page`` field is declared here, although the spider calls
    ``add_value('Page', ...)``; a field must be declared before it can be loaded.
    """

    Title = scrapy.Field()
    Name = scrapy.Field()
    Date = scrapy.Field()
    Read = scrapy.Field()
    Publication = scrapy.Field()
    Claps = scrapy.Field()
    Responses = scrapy.Field()
解决方案
一开始我遇到了两个问题:
1. Item 中缺少 `Page` 字段,需要添加:
Page = scrapy.Field()
2. 页面 https://medium.com/tag/python/archive/02/01 会被重定向到 https://medium.com/tag/python/archive,但 `handle_httpstatus_list = [302]` 阻止了这次重定向。
删除 `handle_httpstatus_list` 之后,我就能从第一页获取到数据了。
结果 (csv)
Claps,Date,Name,Page,Publication,Read,Responses,Title
81K,,Daniel van Flymen,https://medium.com/tag/python/archive,,9 min read,383 responses,Learn Blockchains by Building One
25K,,Jonny Fox,https://medium.com/tag/python/archive,,6 min read,63 responses,Regex tutorial — A quick cheatsheet by examples
9.6K,,Susan Li,https://medium.com/tag/python/archive,,9 min read,112 responses,"Building A Logistic Regression in Python, Step by Step"
5.8K,,Adi Bronshtein,https://medium.com/tag/python/archive,,9 min read,46 responses,Train/Test Split and Cross Validation in Python
7.8K,,Will Koehrsen,https://medium.com/tag/python/archive,,21 min read,42 responses,Random Forest in Python
7.2K,,Ted Petrou,https://medium.com/tag/python/archive,,24 min read,34 responses,Selecting Subsets of Data in Pandas: Part 1
11.1K,,Milo Spencer-Harper,https://medium.com/tag/python/archive,,6 min read,86 responses,How to build a simple neural network in 9 lines of Python code
5.2K,,Michael Galarnyk,https://medium.com/tag/python/archive,,8 min read,27 responses,PCA using Python (scikit-learn)
64K,,TK,https://medium.com/tag/python/archive,,11 min read,148 responses,Learning Python: From Zero to Hero
6.9K,,Susan Li,https://medium.com/tag/python/archive,,9 min read,75 responses,An End-to-End Project on Time Series Analysis and Forecasting with Python
我使用的代码 - 全部在一个文件中,无需创建项目
import scrapy
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider
import logging
from scrapy.utils.log import configure_logging
class MediumItem(scrapy.Item):
    """Container for the fields scraped from one Medium article."""

    Title = scrapy.Field()
    Name = scrapy.Field()
    # Declared, but no selector in this file fills it.
    Date = scrapy.Field()
    Read = scrapy.Field()
    Publication = scrapy.Field()
    Claps = scrapy.Field()
    Responses = scrapy.Field()
    # URL of the response the article was scraped from.
    Page = scrapy.Field()
class DataSpider(CrawlSpider):
    """Scrape article metadata from a Medium tag archive page.

    For every article container found on the response, an ``ItemLoader`` is
    filled from CSS/XPath selectors and the resulting ``MediumItem`` is yielded.
    """

    custom_settings = {
        'LOG_FILE': 'my_log.log',
        'LOG_LEVEL': 'ERROR'}
    # Executed once, when the class body is evaluated: attaches a stream
    # handler to the root logger (in addition to the LOG_FILE setting above).
    logging.getLogger().addHandler(logging.StreamHandler())
    name = 'data'
    allowed_domains = ['medium.com', 'towardsdatascience.com']
    start_urls = ['https://medium.com/tag/python/archive/02/01']
    # Do NOT set handle_httpstatus_list = [302] here: the start URL answers
    # with a redirect, and listing 302 would hand that redirect response to
    # parse() instead of letting Scrapy follow it.

    def parse(self, response):
        """Yield one loaded ``MediumItem`` per article block that contains the expected button link.

        NOTE(review): this overrides ``parse`` on a ``CrawlSpider``; no
        ``rules`` are defined in this class, so rule handling is not involved.
        """
        print('url:', response.url)
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:
            # Skip articles that do not expose this button link (empty/missing href).
            if not article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                continue
            loader = ItemLoader(item=MediumItem(), selector=article)
            # Keep only the first extracted value for every field.
            loader.default_output_processor = scrapy.loader.processors.TakeFirst()
            loader.add_css('Title', 'div > h3::text')
            loader.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
            loader.add_css('Read', 'span::attr(title)')
            # "link--darken" and "link--accent" are two separate class tokens;
            # the space between them is required for the exact @class match.
            loader.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"]/text()')
            loader.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
            loader.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
            loader.add_value('Page', response.url)
            yield loader.load_item()
from scrapy.crawler import CrawlerProcess

# Settings for a stand-alone run (no Scrapy project needed): scraped items
# are exported to a feed file. 'FEED_FORMAT' may be csv, json or xml.
crawl_settings = {
    'USER_AGENT': 'Mozilla/5.0',
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
}

process = CrawlerProcess(crawl_settings)
process.crawl(DataSpider)
# Starts the crawl that was scheduled above.
process.start()
推荐阅读
- javascript - setState 循环遍历一组道具 - React ComponentDidUpdate
- r - 我怎样才能接受...的第一个输入?
- swift - ObjectMapper 和具有唯一键的数组
- c# - 即使指定了 JWT 身份验证,也没有指定身份验证方案错误
- javascript - 在 CSS 和/或 javascript 中更改 SVG 颜色(带有对象标签),不会做任何事情
- flutter - CustomPainter 和 CustomClipper 为同一路径产生不同的输出
- google-apps-script - 仅充当 Google 客户端的可发布插件的 OAuth 要求
- jquery - 如何比较jquery中的以下值?
- javascript - Express.js - 允许无限的子路由,但至少有一个
- r - R:lmer()错误消息:未使用的参数(family =“binomial”)