Storing scraped data into MySQL

Problem description

I wrote a Scrapy bot to scrape data, and now I want to store the results in MySQL. I wrote a pipeline script for this, but it does not work and I cannot tell what the problem is. Can anyone point out what exactly is wrong? The spider code itself runs perfectly, so I think it simply cannot connect to MySQL.

The Scrapy spider code is:

import scrapy

from ..items import NdtvItem

class NdtvReviewSpider(scrapy.Spider):
    name = 'ndtv_review'
    page_number = 2
    start_urls = ['https://www.ndtv.com/movie-reviews/page-1'
    ]

    def parse(self, response):

        items = {}

        i = 1
        # li[*] matches every list item, so this counts the reviews on the page
        count = response.xpath('//*[@id="ins_storylist"]/ul/li[{}]/div[2]/div[1]/a/i/text()'.format("*")).getall()
        count = len(count)
        while i<=count:
            outputs = NdtvItem()
            outputs['page_title'] = response.xpath('//*[@id="ins_storylist"]/ul/li[{}]/div[2]/div[1]/a/i/text()'.format(i)).get()
            outputs['review_content'] = response.xpath("//*[@id='ins_storylist']/ul/li[{}]/div[2]/div[3]/text()".format(i)).get()
            outputs['review_link'] = response.xpath("//*[@id='ins_storylist']/ul/li[{}]/div[2]/div[1]/a/@href".format(i)).get()
            i+=1
            fl = 0
            if outputs['page_title'] == []:
                outputs['page_title'] = ''
                fl+=1
            if outputs['review_content'] == []:
                outputs['review_content'] = ''
                fl+=1
            if outputs['review_link'] == []:
                outputs['review_link'] = ''
                fl += 1
            else:
                yield outputs
                #if outputs['review_content'] != []:
                 #   if 'Review:' in outputs['review_content'].split(" ") or 'review:' in outputs['review_content'].split(" ") :
                  #      yield outputs


        # Follow the next listing page (pages 2 through 15)
        next_page = 'https://www.ndtv.com/movie-reviews/page-' + str(NdtvReviewSpider.page_number)
        if NdtvReviewSpider.page_number <= 15:
            NdtvReviewSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)
        
        pass
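
For reference, the spider imports NdtvItem from the project's items module with the three fields used above. A minimal items.py sketch (the asker's actual items.py is not shown, so the layout here is an assumption) would be:

import scrapy

class NdtvItem(scrapy.Item):
    # Fields filled in by NdtvReviewSpider.parse
    page_title = scrapy.Field()
    review_content = scrapy.Field()
    review_link = scrapy.Field()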

The pipeline script is:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import mysql.connector


class ReviewBotPipeline(object):

    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        # Connect to the local MySQL server and keep a cursor around
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='xxxxxx',
            database='review'
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        # Recreate the table on every run
        self.curr.execute("""DROP TABLE IF EXISTS review_tb""")
        self.curr.execute("""CREATE TABLE review_tb(
            page_title text,
            review_content text,
            review_link text
            )""")

    def process_item(self, outputs, spider):
        self.store_db(outputs)
        return outputs

    def store_db(self, outputs):
        self.curr.execute("""INSERT INTO review_tb VALUES (%s,%s,%s)""", (
            outputs['page_title'][0],
            outputs['review_content'][0],
            outputs['review_link'][0]
        ))
        self.conn.commit()
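
For completeness, the pipeline only runs if it is enabled in the project settings. A minimal settings.py entry, assuming the project module is named review_bot as the traceback paths below suggest, would look like:

# settings.py: enable the pipeline so Scrapy calls process_item for every item.
# The dotted path 'review_bot.pipelines.ReviewBotPipeline' is inferred from the
# traceback (review_bot/pipelines.py); adjust it to your actual project name.
ITEM_PIPELINES = {
    'review_bot.pipelines.ReviewBotPipeline': 300,
}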



This is the error I get:

  File "/Users/divyanshu/env/lib/python3.7/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/utils/defer.py", line 154, in f
    return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
  File "/Users/divyanshu/review_bot/review_bot/pipelines.py", line 33, in process_item
    self.store_db(outputs)
  File "/Users/divyanshu/review_bot/review_bot/pipelines.py", line 40, in store_db
    outputs['review_link']
  File "/Users/divyanshu/env/lib/python3.7/site-packages/mysql/connector/cursor_cext.py", line 248, in execute
    prepared = self._cnx.prepare_for_mysql(params)
  File "/Users/divyanshu/env/lib/python3.7/site-packages/mysql/connector/connection_cext.py", line 626, in prepare_for_mysql
    result = self._cmysql.convert_to_mysql(*params)
_mysql_connector.MySQLInterfaceError: Python type list cannot be converted
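
The last line of the traceback indicates that one of the values handed to cursor.execute() is still a Python list, which mysql.connector cannot bind as a query parameter. A defensive rewrite of store_db inside ReviewBotPipeline, with an illustrative _to_text helper (not part of the original code) that flattens lists and maps None to an empty string, could look like:

    def _to_text(self, value):
        # Illustrative helper: join list values, map None to '', pass strings through
        if value is None:
            return ''
        if isinstance(value, list):
            return ' '.join(str(v) for v in value)
        return str(value)

    def store_db(self, outputs):
        self.curr.execute("""INSERT INTO review_tb VALUES (%s,%s,%s)""", (
            self._to_text(outputs.get('page_title')),
            self._to_text(outputs.get('review_content')),
            self._to_text(outputs.get('review_link'))
        ))
        self.conn.commit()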

Tags: python, mysql, python-3.x, scrapy, web-crawler

Solution


Here is another approach. The code for extracting and storing the data is simpler and more readable.

from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils
from simplified_scrapy.core.mysql_objstore import MysqlObjStore
from simplified_html.request_render import RequestRender

class ReviewBotPipeline(MysqlObjStore):
    def saveObj(self, data):
        conn = None
        cur = None
        try:
            conn = self.connect()
            cur = conn.cursor()
            try:
                cur.execute("insert into review_tb(page_title,review_content,review_link) values(%s,%s,%s)",
                            (data['page_title'], data['review_content'],
                             data['review_link']))
                return conn.commit()
            except Exception as err:
                conn.rollback()
                print(err)
        except Exception as err:
            print(err)
        finally:
            if (cur): cur.close()
            if (conn): conn.close()

class MySpider(Spider):
    name = 'ndtv_review'
    start_urls = [{
        'url': 'https://www.ndtv.com/movie-reviews/page-1',
        'requestMethod': 'render' # The download mode is 'render'
    }]  # Entry page
    concurrencyPer1s = 1
    refresh_urls = True
    obj_store = ReviewBotPipeline(
        name, {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'pwd': 'root',
            'dbName': 'review',
            'tbName': 'review_tb'
        })

    # Download page using headless browser. start
    # pyppeteer needs to be installed
    req = RequestRender({
        'executablePath':
        '/Applications/chrome.app/Contents/MacOS/Google Chrome'
    })

    def renderUrl(self, url, callback):
        self.req.get(url['url'], self._callback, extr_data=(url, callback))

    def _callback(self, html, url, extr_data):
        extr_data[1](html, extr_data[0], self)
    # Download page using headless browser. end

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        datas = []
        lis = doc.selects('div#ins_storylist>li')
        for li in lis:
            outputs = {}
            a = li.select('div.nstory_header>a')
            if not a: continue
            outputs['page_title'] = a.text
            outputs['review_content'] = li.select('div.nstory_intro').text
            outputs['review_link'] = a.href
            datas.append(outputs)
        lstA = doc.selects('div.new_pagination>a')
        return {"Urls": lstA, "Data": datas}


SimplifiedMain.startThread(MySpider())  # Start download

There are more examples here: https://github.com/yiyedata/simplified-scrapy-demo/tree/master/spider_examples
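
If the review database or the review_tb table does not already exist on the MySQL server, it can be created once up front. A one-off setup sketch using mysql.connector, with the credentials taken from the MysqlObjStore config above (adjust them to your environment), could be:

import mysql.connector

# One-off setup: create the database and table the solution writes into.
conn = mysql.connector.connect(host='127.0.0.1', port=3306, user='root', passwd='root')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS review")
cur.execute("""CREATE TABLE IF NOT EXISTS review.review_tb(
    page_title TEXT,
    review_content TEXT,
    review_link TEXT
)""")
conn.commit()
cur.close()
conn.close()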

