python - Scrapy Project - 如何使用 - 带有暂停和恢复开关的“项目和管道”抓取、抓取嵌套关系数据并将其导出到 CSV?
问题描述
我正在尝试获取几种类型的“嵌套数据”,清理它们并将它们保存到单独的 CSV 文件中。
I want all the data to be saved to separate CSV files, as per the separate items/pipelines.
and want to give the output file name, from the URL param variable "CatUrlParam" that was used in the main "Google.py" spider file (The category number).
Also want to use pause and resume functionality for the same.
Also please suggest good coding practices, like how do I clean HTML data in items or pipelines sections.
先感谢您。
Google.py
import re
from urllib.parse import urlsplit

import scrapy

# Import every item class the spider instantiates. The original only
# imported the first two, so GooglebotItemThree/Four/Five raised NameError
# as soon as their callbacks ran. (Make sure these classes are defined in
# items.py — see the items section below.)
from GoogleBot.GoogleBot.items import (
    GooglebotItem,
    GooglebotItemTwo,
    GooglebotItemThree,
    GooglebotItemFour,
    GooglebotItemFive,
)
class GoogleSpider(scrapy.Spider):
    """Crawl Demo.com category listings and fan out to company, certificate,
    profile, and sub-category pages, yielding one item type per page kind.

    ``CatUrlParam`` is the 1-based index of the top-level category row to
    scrape; the pipelines also use it to name their output CSV files.
    """

    name = 'Google'
    start_urls = ['http://Demo.com/index.php']
    CatUrlParam = 1  # 1-based category row selector (also the CSV name prefix)

    # ------------------------------------------------------------------ #
    # helpers                                                            #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _clean_cells(raw_cells, label, maxsplit=3, semicolons_to_slash=False):
        """Clean a list of raw HTML table cells and split each into fields.

        For each cell: optionally turn ';' into '/', turn <br>/<BR> into
        ', ', strip all remaining tags / whitespace escapes / quotes and the
        given ``label`` (e.g. 'Phone:'), then split on ', ' at most
        ``maxsplit`` times. Returns a list of lists of strings.
        """
        cleaned = []
        for cell in raw_cells:
            if semicolons_to_slash:
                cell = re.sub(r';', '/', cell)
            # <br> must become ', ' BEFORE the generic tag-stripper runs,
            # otherwise the field separators would be lost.
            cell = re.sub(r'(<BR>)|(<br>)', ', ', cell)
            cell = re.sub(r"(<[^>]+>)|(-->)|(\r)|(\n)|(\t)|(\')|(" + label + ")", '', cell)
            cleaned.append(re.split(', ', cell, maxsplit=maxsplit))
        return cleaned

    @staticmethod
    def _cell_or(value, fallback):
        """Return ``value`` unless it is the empty string, else ``fallback``."""
        return value if value != '' else fallback

    # ------------------------------------------------------------------ #
    # callbacks                                                          #
    # ------------------------------------------------------------------ #

    def parse(self, response):
        """Parse the main category listing; keep only the chosen row."""
        lo, hi = self.CatUrlParam - 1, self.CatUrlParam  # slice = selected row only
        item = GooglebotItem()
        item["Main_Page_Items_Cat_No"] = response.css('strong::text').getall()[lo:hi]
        item["Main_Page_Items_Cat_Name"] = response.css(
            '.textb tr+ tr td:nth-child(2)::text').getall()[lo:hi]
        item["MainCatWise_Comps_Page_Url_List"] = [
            response.urljoin(href)
            for href in response.css('#contant-contant td:nth-child(3) a::attr(href)').getall()
        ][lo:hi]
        item["SubCatWise_Indvidual_Page_Url_List"] = [
            response.urljoin(href)
            for href in response.css('#contant-contant td:nth-child(4) a::attr(href)').getall()
        ][lo:hi]
        item["Main_Page_Resps_Url"] = [response.url]
        yield item
        for url in item["MainCatWise_Comps_Page_Url_List"]:
            yield scrapy.Request(url, self.parse_getCompDetailsLink)
        for url in item["SubCatWise_Indvidual_Page_Url_List"]:
            yield scrapy.Request(url, self.parse_getSubCatPageLink)

    def parse_getCompDetailsLink(self, response):
        """Parse a company-listing page and follow certificate/profile links."""
        itemTwo = GooglebotItemTwo()
        itemTwo["Cmp_Name_List"] = [
            re.sub("(\\xa0)", "", name)  # drop non-breaking spaces
            for name in response.css('form tr+ tr td:nth-child(1)::text').getall()
        ]
        itemTwo["Cmp_Products_Names_List"] = [
            re.sub(r';', '', name)
            for name in response.css('form tr+ tr td:nth-child(2)::text').getall()
        ]
        # Identity comprehension in the original — the selector result is
        # already the wanted list of strings.
        itemTwo["Cmp_Products_Codes_List"] = response.css(
            'form tr+ tr td:nth-child(3)::text').getall()
        itemTwo["Cmp_Cert_Url_List"] = [
            response.urljoin(href)
            for href in response.css('#contant-contant td:nth-child(4) a::attr(href)').getall()
        ]
        # Rebuild each profile URL on this page's scheme/host, dropping the
        # first 4 path characters and the last 2 query characters
        # (site-specific quirk of how the profile links are encoded).
        base = urlsplit(response.request.url)  # hoisted: same for every link
        prof_urls = []
        for href in response.css('#contant-contant td:nth-child(5) a::attr(href)').getall():
            parts = urlsplit(response.urljoin(href))
            prof_urls.append(base[0] + '://' + base[1] + '/' + parts[2][4:] + '?' + parts[3][:-2])
        itemTwo["Cmp_Prof_Url_List"] = prof_urls
        itemTwo["Cmps_Page_Resps_Url"] = [response.url]
        yield itemTwo
        for url in itemTwo["Cmp_Cert_Url_List"]:
            yield scrapy.Request(url, self.parse_getCompCertPageLinkList)
        for url in itemTwo["Cmp_Prof_Url_List"]:
            yield scrapy.Request(url, self.parse_getCompProfPageLinkLIst)

    def parse_getCompCertPageLinkList(self, response):
        """Parse one membership-certificate page into GooglebotItemThree.

        Each contact field lives in the same block of <td> cells; the cells
        are cleaned once per label (each label strips its own prefix, which
        shifts the split fields) and then indexed [row][column].
        """
        itemThree = GooglebotItemThree()
        itemThree["Cmp_Cert_Name"] = response.css('font font:nth-child(1) b::text').get()

        contact_tds = response.css('#divbody tr+ tr td+ td').getall()  # fetched once
        phone = self._clean_cells(contact_tds, 'Phone:', semicolons_to_slash=True)
        fax = self._clean_cells(contact_tds, 'Fax:')
        email = self._clean_cells(contact_tds, 'Email:')
        web = self._clean_cells(contact_tds, 'Web:')
        status = self._clean_cells(contact_tds, 'Status:')
        rcmc = self._clean_cells(contact_tds, 'RCMC No:')
        itemThree["Cmp_Cert_Phone"] = [self._cell_or(phone[0][0], 'No PhoneNo in Cert.')]
        itemThree["Cmp_Cert_Fax"] = [self._cell_or(fax[0][1], 'No FaxNo in Cert.')]
        itemThree["Cmp_Cert_Email"] = [self._cell_or(email[0][2], 'No EmailID in Cert.')]
        itemThree["Cmp_Cert_Web"] = [self._cell_or(web[0][3], 'No WebAdd in Cert.')]
        itemThree["Cmp_Cert_Status"] = [self._cell_or(status[1][0], 'No Status in Cert.')]
        itemThree["Cmp_Cert_RCMCNo"] = [self._cell_or(rcmc[2][0], 'No RCMCNo in Cert.')]

        # Address = first two <font> text lines of the address cell.
        addr_lines = [
            re.sub(r'(\r)|(\n)|(\t)', '', line)
            for line in response.css('#divbody tr:nth-child(2) td:nth-child(1) font::text').getall()
        ]
        address = addr_lines[0] + ', ' + addr_lines[1]
        itemThree["Cmp_Cert_Address"] = [self._cell_or(address, 'No Address in Cert.')]

        validity = [
            re.sub(r"(<[^>]+>)|(-->)|(\r)|(\n)|(\t)|(\')"
                   r"|(is Registered FIEO Member. Current Membership Valid Upto: )", '', f)
            for f in response.css('br+ font').getall()
        ]
        itemThree["Cmp_Cert_Valid_UpTo"] = [
            self._cell_or(validity[0], 'No Valid UpTo Data in Cert.')]

        # Contact row: split on bare ',' (not ', '), at most twice.
        contact_fonts = [
            re.split(',', re.sub(r"(<[^>]+>)|(-->)|(\r)|(\n)|(\t)|(\')|(Contact:)|(Head:)", '',
                                 re.sub(r'(<BR>)|(<br>)', ', ', f)), maxsplit=2)
            for f in response.css('#divbody tr:nth-child(3) font').getall()
        ]
        itemThree["Cmp_Cert_Contact_Person"] = [
            self._cell_or(contact_fonts[0][0], 'No Contact Person Data in Cert.')]
        itemThree["Cmp_Cert_Contact_Head"] = [
            self._cell_or(contact_fonts[0][1], 'No Contact Head in Cert.')]

        constitution = response.css('#divbody td td td:nth-child(1) font::text').getall()
        itemThree["Cmp_Cert_Constitution"] = [
            self._cell_or(constitution[0], 'No Constitution in Cert.')]

        category_tds = [
            re.sub(r"(<[^>]+>)|(-->)|(\r)|(\n)|(\t)|(\')|(Category:)", '', td)
            for td in response.xpath(
                '//*[(@id = "divbody")]'
                '//tr[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//td').getall()
        ]
        itemThree["Cmp_Cert_Catagory"] = [
            self._cell_or(category_tds[0], 'No Catagory Data in Cert.')]

        products = [re.sub(r';', ', ', t)
                    for t in response.css('tr:nth-child(6) font::text').getall()]
        itemThree["Cmp_Cert_Products_And_Services"] = [
            self._cell_or(products[1], 'No Products and Services in Cert.')]

        # The original wrapped this in `x if (x != '' or x != None) else ...`,
        # a guard that is always True (and the filter below already drops
        # empty strings) — the dead branch is removed, behavior unchanged.
        itemThree["Cmp_Cert_Products_And_Services_Details"] = [
            re.sub(r"(\r)|(\n)", "", t)
            for t in response.css('#explanation1::text').getall()
            if re.sub(r"(\r)|(\n)", "", t) != ''
        ]
        itemThree["Cmps_Cert_Resps_Url"] = response.url
        yield itemThree

    def parse_getCompProfPageLinkLIst(self, response):
        """Parse one company-profile page into GooglebotItemFour."""
        itemFour = GooglebotItemFour()
        itemFour["Cmp_Prof_Name"] = response.css('.textb .textb td.textb strong::text').get()
        products = []
        for cell in response.css('.textb .textb tr+ tr td:nth-child(1)').getall():
            text = re.sub(r'(<[^>]+>)', '', cell)  # strip tags once, reuse for the guard
            products.append(re.sub(r';', ', ', text) if text != ''
                            else ' No/Err Profile Products Data')
        itemFour["Cmp_Prof_Products_List"] = products
        codes = []
        for cell in response.css('.textb .textb tr+ tr td:nth-child(2)').getall():
            # redundant str() wrapper dropped: re.sub already returns str
            text = re.sub(r'(<[^>]+>)', '', re.sub(r';', ', ', cell))
            codes.append(text if text != '' else ' No/Err Profile Products ITC HS Code Data')
        itemFour["Cmp_Prof_Code_List"] = codes
        itemFour["Cmps_Prof_Resps_Url"] = response.url
        yield itemFour

    def parse_getSubCatPageLink(self, response):
        """Parse a sub-category page; recurse into sub-chapter pages and
        follow company-listing links."""
        itemFive = GooglebotItemFive()
        itemFive["SubCat_Code_Number"] = response.css('strong::text').getall()
        itemFive["SubCat_Name"] = [
            re.sub(r"(<[^>]+>)|(-->)|(\r)|(\n)|(\t)|(\')", '', cell)
            for cell in response.css('tr+ tr td:nth-child(2)').getall()
        ]
        itemFive["SubCat_Comps_Page_Url_List"] = [
            response.urljoin(href) if href != '' else ' No SubCat_Comps_Page_Url_List Data'
            for href in response.css('#contant-contant td:nth-child(3) a::attr(href)').getall()
        ]
        sub_chapter_links = []
        for cell in response.css('tr+ tr td:nth-child(4)').getall():
            hrefs = ', '.join(re.findall(r'(?:href=")([^"]+)', cell))
            sub_chapter_links.append(
                response.urljoin(hrefs) if hrefs != ''
                else ' No SubCats_SubChapter_Comps_Page_Url_List Data')
        itemFive["SubCats_SubChapter_Comps_Page_Url_List"] = sub_chapter_links
        itemFive["SubCats_Cmps_Page_Resps_Url"] = response.url
        yield itemFive
        for url in itemFive['SubCats_SubChapter_Comps_Page_Url_List']:
            if ' No SubCats_SubChapter_Comps_Page_Url_List Data' not in url:
                yield scrapy.Request(url, self.parse_getSubCatPageLink)
        for url in itemFive['SubCat_Comps_Page_Url_List']:
            # Guard added: the original passed the placeholder string to
            # scrapy.Request, which raises ValueError (not a valid URL).
            if ' No SubCat_Comps_Page_Url_List Data' not in url:
                yield scrapy.Request(url, self.parse_getCompDetailsLink)
items.py
import scrapy
class GooglebotItem(scrapy.Item):
    """One row scraped from the main category listing page (parse())."""

    Main_Page_Items_Cat_No = scrapy.Field()              # category number(s), sliced to CatUrlParam
    Main_Page_Items_Cat_Name = scrapy.Field()            # category display name(s)
    MainCatWise_Comps_Page_Url_List = scrapy.Field()     # absolute URLs to company-listing pages
    SubCatWise_Indvidual_Page_Url_List = scrapy.Field()  # absolute URLs to sub-category pages
    Main_Page_Resps_Url = scrapy.Field()                 # URL of the response this row came from
class GooglebotItemTwo(scrapy.Item):
    """One row scraped from a company-listing page (parse_getCompDetailsLink)."""

    Cmp_Name_List = scrapy.Field()            # company names (non-breaking spaces stripped)
    Cmp_Products_Names_List = scrapy.Field()  # product names (semicolons removed)
    Cmp_Products_Codes_List = scrapy.Field()  # product code strings
    Cmp_Cert_Url_List = scrapy.Field()        # absolute certificate-page URLs
    Cmp_Prof_Url_List = scrapy.Field()        # rebuilt profile-page URLs
    Cmps_Page_Resps_Url = scrapy.Field()      # URL of the response this row came from
编辑 - pipelines.py
from itemadapter import ItemAdapter
import pandas as pd

from .items import GooglebotItem, GooglebotItemTwo
from .spiders.Google import GoogleSpider
class GooglebotPipeline:
    """Append GooglebotItem rows (main category page data) to a CSV file
    named after the spider's ``CatUrlParam`` category number."""

    def process_item(self, item, spider):
        # This pipeline only owns GooglebotItem; every other item type is
        # passed through untouched so the next pipeline can handle it.
        # (Processing every item blindly is what raised
        # KeyError: 'Main_Page_Items_Cat_No' for GooglebotItemTwo items.)
        if not isinstance(item, GooglebotItem):
            return item
        rows = {
            "Main_Page_Items_Cat_No": item["Main_Page_Items_Cat_No"],
            "Main_Page_Items_Cat_Name": item["Main_Page_Items_Cat_Name"],
            "MainCatWise_Comps_Page_Url_List": item["MainCatWise_Comps_Page_Url_List"],
            "SubCatWise_Indvidual_Page_Url_List": item["SubCatWise_Indvidual_Page_Url_List"],
            "Main_Page_Resps_Url": item["Main_Page_Resps_Url"],
        }
        # Read the category number from the running spider instance instead
        # of instantiating a fresh GoogleSpider at import time.
        cat = spider.CatUrlParam
        pd.DataFrame(rows).to_csv(
            f"{cat} - 1 AllMainPageLinks - {cat}.csv",
            mode='a', index=False, header=False,
        )
        # A pipeline must return the item (or raise DropItem). Returning
        # None is what caused "TypeError: 'NoneType' object is not
        # subscriptable" in the next pipeline.
        return item
class GooglebotPipelineTwo:
    """Append GooglebotItemTwo rows (company-listing data) to a CSV file
    named after the spider's ``CatUrlParam`` category number."""

    def process_item(self, item, spider):
        # Handle only GooglebotItemTwo; pass every other item type through
        # unchanged for the other pipelines.
        if not isinstance(item, GooglebotItemTwo):
            return item
        rows = {
            "Cmp_Name_List": item["Cmp_Name_List"],
            "Cmp_Products_Names_List": item["Cmp_Products_Names_List"],
            "Cmp_Products_Codes_List": item["Cmp_Products_Codes_List"],
            "Cmp_Cert_Url_List": item["Cmp_Cert_Url_List"],
            "Cmp_Prof_Url_List": item["Cmp_Prof_Url_List"],
            "Cmps_Page_Resps_Url": item["Cmps_Page_Resps_Url"],
        }
        # The original read self.CatUrlParam, an attribute this class never
        # defined (AttributeError). The spider passed to process_item is the
        # right place to get it from.
        cat = spider.CatUrlParam
        pd.DataFrame(rows).to_csv(
            f"{cat} - 2 AllCompsLinks - {cat}.csv",
            mode='a', index=False, header=False,
        )
        # Pipelines must return the item so downstream pipelines receive it.
        return item
settings.py
import random

BOT_NAME = 'GoogleBot'

SPIDER_MODULES = ['GoogleBot.spiders']
NEWSPIDER_MODULE = 'GoogleBot.spiders'

# Pool of user-agent strings; one is picked at random when this settings
# module is imported, i.e. once per crawl process (not per request).
MY_CUST_UAL = [
    'Googlebot/2.1 (+http://www.googlebot.com/bot.html)',
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36',
    'Googlebot/2.1 (+http://www.google.com/bot.html)',
    'Googlebot',
]
USER_AGENT = random.choice(MY_CUST_UAL)

ROBOTSTXT_OBEY = True

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Distinct priorities give the pipelines a well-defined execution order;
# the original registered both with 300, which leaves the order unspecified.
ITEM_PIPELINES = {
    'GoogleBot.pipelines.GooglebotPipeline': 300,
    'GoogleBot.pipelines.GooglebotPipelineTwo': 400,
}

# Pause/resume support: persist the scheduler queue and dupefilter state.
# Stop the crawl with a single Ctrl-C and re-run the same crawl command to
# resume from where it left off. Use a different JOBDIR per crawl "job".
JOBDIR = 'crawls/GoogleBot-run1'
更新 - 运行时产生的错误
2021-06-27 21:34:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://Demo.com/robots.txt> (referer: None)
2021-06-27 21:34:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://Demo.com/Offrings.php> (referer: None)
2021-06-27 21:34:53 [scrapy.core.scraper] ERROR: Error processing {'MainCatWise_Comps_Page_Url_List': ['http://Demo.com/search.php?stype=Like&searchStringProducts=01'],
'Main_Page_Items_Cat_Name': ['CAT ONE'],
'Main_Page_Items_Cat_No': ['01'],
'Main_Page_Resps_Url': ['http://Demo.com/Offrings.php'],
'SubCatWise_Indvidual_Page_Url_List': ['http://Demo.com/BrowseBuyersbyCatagory.php?groupID=01']}
Traceback (most recent call last):
File "c:\program files\python38\lib\site-packages\twisted\internet\defer.py", line 662, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "c:\program files\python38\lib\site-packages\scrapy\utils\defer.py", line 150, in f
return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
File "E:\SFT\EDU\PROJ\EXP 30\New folder\GoogleBot\GoogleBot\pipelines.py", line 36, in process_item
"Cmp_Name_List": item["Cmp_Name_List"],
TypeError: 'NoneType' object is not subscriptable
2021-06-27 21:34:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://Demo.com/search.php?stype=Like&searchStringProducts=01> (referer: http://Demo.com/Offrings.php)
2021-06-27 21:34:54 [scrapy.core.scraper] ERROR: Error processing
{'Cmp_Cert_Url_List': ['http://Demo.com/certificateview.php?num=1', 'http://Demo.com/certificateview.php?num=2', ... TOTAL URLS 23, 'http://Demo.com/certificateview.php?num=23'],
'Cmp_Name_List': ['COMPANY 1', 'COMPANY 2', ... TOTAL NAMES 23, 'COMPANY 23'],
'Cmp_Products_Codes_List': ['NUM 1', 'NUM 2', ... TOTAL NUMBERS 23, 'NUM 23'],
'Cmp_Products_Names_List': ['1', '2', ... TOTAL ITEMS 23, '23'],
'Cmp_Prof_Url_List': ['http://Demo.com/prof.php?id=1', 'http://Demo.com/prof.php?id=2', ... TOTAL ITEMS 23, 'http://Demo.com/prof.php?id=23'],
'Cmps_Page_Resps_Url': ['http://Demo.com/search.php?stype=Like&searchStringProducts=01']
}
Traceback (most recent call last):
File "c:\program files\python38\lib\site-packages\twisted\internet\defer.py", line 662, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "c:\program files\python38\lib\site-packages\scrapy\utils\defer.py", line 150, in f
return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
File "E:\SFT\EDU\PROJ\EXP 30\New folder\GoogleBot\GoogleBot\pipelines.py", line 11, in process_item
"Main_Page_Items_Cat_No": item["Main_Page_Items_Cat_No"],
File "c:\program files\python38\lib\site-packages\scrapy\item.py", line 94, in __getitem__
return self._values[key]
KeyError: 'Main_Page_Items_Cat_No'
2021-06-27 21:34:54 [scrapy.core.engine] INFO: Closing spider (finished)
2021-06-27 21:34:54 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1149,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'downloader/response_bytes': 41513,
'downloader/response_count': 3,
'downloader/response_status_count/200': 3,
'elapsed_time_seconds': 3.079236,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 6, 27, 16, 4, 54, 960780),
'httpcompression/response_bytes': 204541,
'httpcompression/response_count': 3,
'log_count/DEBUG': 3,
'log_count/ERROR': 2,
'log_count/INFO': 10,
'request_depth_max': 1,
'response_received_count': 3,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2021, 6, 27, 16, 4, 51, 881544)}
2021-06-27 21:34:54 [scrapy.core.engine] INFO: Spider closed (finished)
编辑摘要
编辑 - “Pipelines.py”和更新 - “产生错误” - 然而 2 个主要错误 - 1)TypeError:'NoneType' 对象不可下标,2)KeyError:'Main_Page_Items_Cat_No'
解决方案
FileName1 = open(f"{AllMainPageLinks}.csv", "w")
您将字典及其所有内容放入您正在创建的文件名中,而不是带有路径的普通文件名。
当您的操作系统尝试打开名为 {'Main_Page_Items_Cat_No': ['“Very productive use of my time. I attend these events to help keep up with technological advances and the state of the practice. I often get information at this event to pass to my team to keep them informed on solutions that we may want to incorporate in our environment.”'], 'Main_Page_Items_Cat_Name': [], 'MainCatWise_Comps_Page_Url_List': [], 'SubCatWise_Indvidual_Page_Url_List': [], 'Main_Page_Resps_Url': ['https://events.idg.com/index.php']}.csv 的文件时，它会不知所措——整个字典被当成了文件名。
推荐阅读
- javascript - 仅为文本区域编辑器(角度)中下拉菜单中出现的单词设置不同的颜色
- javascript - 并行 rxJs Observables,无需等待全部完成
- php - 如何解析xml输出
- c# - 如何对 Json 进行去串化并绑定到变量/属性 c#
- python - 如何使用我拥有的以下数据绘制时间序列图
- swift - 找出与物体前锚点的距离
- r - 如何在不在 r 中创建标题的情况下进行转置?
- javascript - 使用 javascript 或 typescript 检查每个值是否包含在数组中
- php - 哪种数据类型适用于 mysql 中的这种格式 (2008-02-04T10:20:00.000+00:00)?
- web - 主机操作系统作为 Web 应用程序