python - 将抓取的数据存储到 MySQL
问题描述
我写了一个 Scrapy 爬虫来抓取数据,现在想把结果存储到 MySQL 中,并为此编写了一个管道(pipeline)脚本。但它不起作用,我也不清楚问题出在哪里。爬虫本身运行正常,所以我怀疑是无法正确写入 MySQL。有人能指出具体是什么问题吗?
Scrapy 机器人代码是
from..items import NdtvItem
class NdtvReviewSpider(scrapy.Spider):
    """Crawl NDTV movie-review listing pages 1..15 and yield one NdtvItem
    per story (title, intro text, link).

    Fixes over the original:
    - ``.get()`` returns ``str`` or ``None`` — never ``[]`` — so the old
      ``== []`` checks could never fire and ``None`` leaked to the pipeline.
      Missing fields are now normalised to ``''``.
    - The old dangling ``else`` bound only to the last ``if``, so every item
      was yielded regardless of the missing-field counter; now an item is
      yielded only when at least one field was actually extracted.
    - Removed the unused ``items = {}`` local and the pointless ``+ ''``.
    """

    name = 'ndtv_review'
    page_number = 2  # next listing page to request; page 1 is in start_urls
    start_urls = ['https://www.ndtv.com/movie-reviews/page-1']

    def parse(self, response):
        # Count the story entries on this listing page.
        count = len(response.xpath(
            '//*[@id="ins_storylist"]/ul/li[{}]/div[2]/div[1]/a/i/text()'.format("*")
        ).getall())

        for i in range(1, count + 1):
            outputs = NdtvItem()
            outputs['page_title'] = response.xpath(
                '//*[@id="ins_storylist"]/ul/li[{}]/div[2]/div[1]/a/i/text()'.format(i)).get()
            outputs['review_content'] = response.xpath(
                "//*[@id='ins_storylist']/ul/li[{}]/div[2]/div[3]/text()".format(i)).get()
            outputs['review_link'] = response.xpath(
                "//*[@id='ins_storylist']/ul/li[{}]/div[2]/div[1]/a/@href".format(i)).get()

            # Normalise missing fields (None) to '' and count them.
            missing = 0
            for field in ('page_title', 'review_content', 'review_link'):
                if outputs[field] is None:
                    outputs[field] = ''
                    missing += 1

            # Skip entries where nothing at all was extracted.
            if missing < 3:
                yield outputs

        # Follow pagination up to page 15 (page_number is a class attribute,
        # shared across callbacks on purpose).
        if NdtvReviewSpider.page_number <= 15:
            next_page = ('https://www.ndtv.com/movie-reviews/page-'
                         + str(NdtvReviewSpider.page_number))
            NdtvReviewSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)
管道脚本是
# 在此处定义您的项目管道 # # 不要忘记将管道添加到 ITEM_PIPELINES 设置 # 见:https://docs.scrapy.org/en/latest/topics/item-pipeline.html 导入 mysql.connector 类 ReviewBotPipeline(对象): def __init__(self): self.create_connection() self.create_table() def create_connection(self): self.conn = mysql.connector.connect( 主机='本地主机', 用户 = 'root', passwd = 'xxxxxx', 数据库='审查' ) self.curr = self.conn.cursor() 定义创建表(自我): self.curr.execute(""" DROP TABLE IF EXISTS review_tb """) self.curr.execute(""" 创建表 review_tb( page_title 文本, review_content 文本, review_link 文本 ) """) def process_item(自我,输出,蜘蛛): self.store_db(输出) 返回输出 def store_db(自我,输出): self.curr.execute(""" 插入 review_tb 值(%s,%s,%s)""",( 输出['page_title'][0], 输出['review_content'][0], 输出['review_link'][0] )) self.conn.commit()
这就是我得到的错误。
File "/Users/divyanshu/env/lib/python3.7/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/utils/defer.py", line 154, in f
return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
File "/Users/divyanshu/review_bot/review_bot/pipelines.py", line 33, in process_item
self.store_db(outputs)
File "/Users/divyanshu/review_bot/review_bot/pipelines.py", line 40, in store_db
outputs['review_link']
File "/Users/divyanshu/env/lib/python3.7/site-packages/mysql/connector/cursor_cext.py", line 248, in execute
prepared = self._cnx.prepare_for_mysql(params)
File "/Users/divyanshu/env/lib/python3.7/site-packages/mysql/connector/connection_cext.py", line 626, in prepare_for_mysql
result = self._cmysql.convert_to_mysql(*params)
_mysql_connector.MySQLInterfaceError: Python type list cannot be converted
解决方案
另一种思路:下面这套用于提取和存储数据的代码更简单、更易读。
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils
from simplified_scrapy.core.mysql_objstore import MysqlObjStore
from simplified_html.request_render import RequestRender
class ReviewBotPipeline(MysqlObjStore):
    """Persists extracted review rows into MySQL via simplified_scrapy's
    MysqlObjStore base class."""

    def saveObj(self, data):
        """Insert one review record; roll back and report on insert failure."""
        connection = None
        cursor = None
        try:
            connection = self.connect()
            cursor = connection.cursor()
            try:
                row = (data['page_title'],
                       data['review_content'],
                       data['review_link'])
                cursor.execute(
                    "insert into review_tb(page_title,review_content,review_link) values(%s,%s,%s)",
                    row)
                return connection.commit()
            except Exception as insert_err:
                # Insert failed: undo the partial transaction, keep crawling.
                connection.rollback()
                print(insert_err)
        except Exception as connect_err:
            # Connection/cursor acquisition failed.
            print(connect_err)
        finally:
            if cursor:
                cursor.close()
            if connection:
                connection.close()
class MySpider(Spider):
    """Crawls NDTV movie-review listing pages with a headless browser and
    hands each extracted review to the MySQL-backed object store."""

    name = 'ndtv_review'
    start_urls = [{
        'url': 'https://www.ndtv.com/movie-reviews/page-1',
        'requestMethod': 'render'  # The download mode is 'render'
    }]  # Entry page
    concurrencyPer1s = 1
    refresh_urls = True
    obj_store = ReviewBotPipeline(
        name, {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'pwd': 'root',
            'dbName': 'review',
            'tbName': 'review_tb'
        })
    # Download page using headless browser. start
    # pyppeteer needs to be installed
    req = RequestRender({
        'executablePath':
        '/Applications/chrome.app/Contents/MacOS/Google Chrome'
    })

    def renderUrl(self, url, callback):
        # Hand the URL to the render engine; results arrive via _callback.
        self.req.get(url['url'], self._callback, extr_data=(url, callback))

    def _callback(self, html, url, extr_data):
        original_url, user_callback = extr_data
        user_callback(html, original_url, self)
    # Download page using headless browser. end

    def extract(self, url, html, models, modelNames):
        """Pull title/intro/link from each story entry; return pagination
        anchors as the next URLs to crawl."""
        doc = SimplifiedDoc(html)
        rows = []
        for entry in doc.selects('div#ins_storylist>li'):
            anchor = entry.select('div.nstory_header>a')
            if not anchor:
                continue  # not a story entry
            rows.append({
                'page_title': anchor.text,
                'review_content': entry.select('div.nstory_intro').text,
                'review_link': anchor.href,
            })
        page_links = doc.selects('div.new_pagination>a')
        return {"Urls": page_links, "Data": rows}
SimplifiedMain.startThread(MySpider()) # Start download
这里有更多例子:https://github.com/yiyedata/simplified-scrapy-demo/tree/master/spider_examples
推荐阅读
- pytest - 沙盒中的 Bazel 和 py_test - 有什么方法可以定义输出?
- python-3.x - 如果在 Pandas 中读取 csv 文件时发生错误标记数据错误,如何跳过文件
- javascript - 如何在javascript中本地保存树视图节点
- javascript - 尝试通过使用javascript单击图像来设置图像的不透明度,但不断出现错误
- angular - 谷歌地图地方请求只返回一个地方
- amazon-web-services - 使用 cloudFormation 将角色附加到现有 EC2 实例
- angular - 从对象数组的 ng-For 循环内动态地将纬度和经度传递给角度谷歌地图
- python - 如果 x,y 在坐标框内
- magento - 使用 XAMPP 安装 Magento 2.3 时出错。无法打开本地主机/演示网站
- javascript - 如何使用 getJSON 读取和显示 json 二维数组?