首页 > 解决方案 > 如何使用 spider.CrawlerRunner() 或 CrawlerProcess() 在 django 视图中调用scrapy spider

问题描述


在访问 Stack Overflow 以及其他 git 等中的所有链接后,我无法解决我的错误。基本上,我想通过提供特定的主题标签来废弃 instagram,即用户将从 Django UI 中提供一个主题标签,而 Django 函数将把该主题标签发送给scrapy spider,以便使用

scrapy.spider.spider.CrawlProcess()


希望得到帮助!


以下是我的代码:

django/models.py

from django.db import models

class InstaAccountDetails(models.Model):
    website = models.URLField()
    post_id = models.PositiveIntegerField() 
    shortcode = models.CharField(max_length=200)
    caption = models.CharField(max_length=500)
    display_url = models.ImageField()
    loc_id = models.PositiveIntegerField()
    loc_name = models.CharField(max_length=200)
    loc_lat = models.IntegerField()
    loc_lon = models.IntegerField()
    owner_id = models.PositiveIntegerField()
    owner_name = models.CharField(max_length=200)
    taken_at_timestamp = models.DateField()

    def __str__(self):
        return self.owner_name

django/views.py

from django.shortcuts import render, redirect
from django.http import JsonResponse
from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy.utils.project import get_project_settings
from instagram_scrapy.instagram_scrapy.spiders.instaCrawler import InstagramSpider

def home(request):
    return render(request, 'instagram_link_app/homePage.html')

def get_insta_details(request):
    field_value = request.GET.get('value_holder')
    print("request value ::::::::::::::: ",field_value)
    try:
        if field_value:
            return JsonResponse({"msg:" "Hii"})
            process = CrawlerProcess(get_project_settings())

            process.crawl(InstagramSpider(hashtag=field_value))
            process.start()  #the script will block here until the crawling is finished
            print(process)
            data['process'] = process
            return JsonResponse(data)
        else:
            return JsonResponse({'message': 'None'})
    except Exception as e:
        return JsonResponse({'message': str(e)})

刮擦/settings.py

import os
import sys
import django
BOT_NAME = 'instagram_scrapy'
SPIDER_MODULES = ['instagram_scrapy.spiders']
NEWSPIDER_MODULE = 'instagram_scrapy.spiders'
ITEM_PIPELINES = {
   'instagram_scrapy.pipelines.InstagramScrapyPipeline': 800,
}
sys.path.append(os.path.join(os.path.dirname("..")))
os.environ['DJANGO_SETTINGS_MODULE'] = 'instagram_link.settings'
django.setup()

scrapy/myspiders.py

import scrapy
import json
import time
import os.path
from instagram_scrapy.instagram_scrapy.items import InstaAccountItem
class InstagramSpider(scrapy.Spider):
    name = "instaCrawler"  # Name of the Spider, required value

    def init__(self, *args, **kwargs):
        try:
            print("kwargs:::::::::", kwargs)
            self.hashtag = kwargs.get('hashtag','myinterior')
            print(":::::::", self.hashtag)
            self.start_urls = ["https://www.instagram.com/explore/tags/" + self.hashtag + "/?__a=1"]
            print("start_urls:::::::", self.start_urls)
            self.date = time.strftime("%d-%m-%Y_%H")
            self.checkpoint_path = './scraped/%s/%s/.checkpoint' % (self.name, self.hashtag)
            print("checkpoint_path:::::::", self.checkpoint_path)
            self.custom_settings = {
                'FEED_URI': './scraped/%s/%s/%s' % (self.name, self.hashtag, self.date),
            }
            print("self.custom_settings:::::::::::::::::::::: ",self.custom_settings)
            # super(InstagramSpider, self).__init__(*args, **kwargs)
        except Exception as e:
            print("Error:::::::",str(e))
     def parse(self, response):
        return self.parse_htag(response)

scrapy/piplines.py

from scrapy.utils.serialize import ScrapyJSONEncoder

_encoder = ScrapyJSONEncoder()


class InstagramScrapyPipeline(object):
    def process_item(self, item, spider):
        item.save()
        return item

刮擦/items.py

import scrapy
from scrapy.item import Field
from scrapy_djangoitem import DjangoItem
from instagram_link_app.models import InstaAccountDetails

class InstaAccountItem(DjangoItem):
    django_model = InstaAccountDetails

我的错误

在 django/admin 页面中

OperationalError at /admin/instagram_link_app/instaaccountdetails/

no such column: instagram_link_app_instaaccountdetails.website

输入主题标签后出错

Internal Server Error: /get_insta_details/
Traceback (most recent call last):
  File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/lib/python3.6/site-packages/django/core/handlers/exception.py", line 35, in inner
    response = get_response(request)
  File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/lib/python3.6/site-packages/django/core/handlers/base.py", line 128, in _get_response
    response = self.process_exception_by_middleware(e, request)
  File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/lib/python3.6/site-packages/django/core/handlers/base.py", line 126, in _get_response
    response = wrapped_callback(request, *callback_args, **callback_kwargs)
  File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/instagram_link/instagram_link_app/views.py", line 76, in get_insta_details
    return JsonResponse({"msg:" "Hii"})
  File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/lib/python3.6/site-packages/django/http/response.py", line 503, in __init__
    'In order to allow non-dict objects to be serialized set the '
TypeError: In order to allow non-dict objects to be serialized set the safe parameter to False.

标签: pythondjangopython-3.xscrapydjango-2.0

解决方案


推荐阅读