python - 如何使用 spider.CrawlerRunner() 或 CrawlerProcess() 在 django 视图中调用scrapy spider
问题描述
在访问 Stack Overflow 以及其他 git 等中的所有链接后,我无法解决我的错误。基本上,我想通过提供特定的主题标签来废弃 instagram,即用户将从 Django UI 中提供一个主题标签,而 Django 函数将把该主题标签发送给scrapy spider,以便使用
scrapy.spider.spider.CrawlProcess()
希望得到帮助!
以下是我的代码:
django/models.py
from django.db import models
class InstaAccountDetails(models.Model):
website = models.URLField()
post_id = models.PositiveIntegerField()
shortcode = models.CharField(max_length=200)
caption = models.CharField(max_length=500)
display_url = models.ImageField()
loc_id = models.PositiveIntegerField()
loc_name = models.CharField(max_length=200)
loc_lat = models.IntegerField()
loc_lon = models.IntegerField()
owner_id = models.PositiveIntegerField()
owner_name = models.CharField(max_length=200)
taken_at_timestamp = models.DateField()
def __str__(self):
return self.owner_name
django/views.py
from django.shortcuts import render, redirect
from django.http import JsonResponse
from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy.utils.project import get_project_settings
from instagram_scrapy.instagram_scrapy.spiders.instaCrawler import InstagramSpider
def home(request):
return render(request, 'instagram_link_app/homePage.html')
def get_insta_details(request):
field_value = request.GET.get('value_holder')
print("request value ::::::::::::::: ",field_value)
try:
if field_value:
return JsonResponse({"msg:" "Hii"})
process = CrawlerProcess(get_project_settings())
process.crawl(InstagramSpider(hashtag=field_value))
process.start() #the script will block here until the crawling is finished
print(process)
data['process'] = process
return JsonResponse(data)
else:
return JsonResponse({'message': 'None'})
except Exception as e:
return JsonResponse({'message': str(e)})
刮擦/settings.py
import os
import sys
import django
BOT_NAME = 'instagram_scrapy'
SPIDER_MODULES = ['instagram_scrapy.spiders']
NEWSPIDER_MODULE = 'instagram_scrapy.spiders'
ITEM_PIPELINES = {
'instagram_scrapy.pipelines.InstagramScrapyPipeline': 800,
}
sys.path.append(os.path.join(os.path.dirname("..")))
os.environ['DJANGO_SETTINGS_MODULE'] = 'instagram_link.settings'
django.setup()
scrapy/myspiders.py
import scrapy
import json
import time
import os.path
from instagram_scrapy.instagram_scrapy.items import InstaAccountItem
class InstagramSpider(scrapy.Spider):
name = "instaCrawler" # Name of the Spider, required value
def init__(self, *args, **kwargs):
try:
print("kwargs:::::::::", kwargs)
self.hashtag = kwargs.get('hashtag','myinterior')
print(":::::::", self.hashtag)
self.start_urls = ["https://www.instagram.com/explore/tags/" + self.hashtag + "/?__a=1"]
print("start_urls:::::::", self.start_urls)
self.date = time.strftime("%d-%m-%Y_%H")
self.checkpoint_path = './scraped/%s/%s/.checkpoint' % (self.name, self.hashtag)
print("checkpoint_path:::::::", self.checkpoint_path)
self.custom_settings = {
'FEED_URI': './scraped/%s/%s/%s' % (self.name, self.hashtag, self.date),
}
print("self.custom_settings:::::::::::::::::::::: ",self.custom_settings)
# super(InstagramSpider, self).__init__(*args, **kwargs)
except Exception as e:
print("Error:::::::",str(e))
def parse(self, response):
return self.parse_htag(response)
scrapy/piplines.py
from scrapy.utils.serialize import ScrapyJSONEncoder
_encoder = ScrapyJSONEncoder()
class InstagramScrapyPipeline(object):
def process_item(self, item, spider):
item.save()
return item
刮擦/items.py
import scrapy
from scrapy.item import Field
from scrapy_djangoitem import DjangoItem
from instagram_link_app.models import InstaAccountDetails
class InstaAccountItem(DjangoItem):
django_model = InstaAccountDetails
我的错误
在 django/admin 页面中
OperationalError at /admin/instagram_link_app/instaaccountdetails/
no such column: instagram_link_app_instaaccountdetails.website
输入主题标签后出错
Internal Server Error: /get_insta_details/
Traceback (most recent call last):
File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/lib/python3.6/site-packages/django/core/handlers/exception.py", line 35, in inner
response = get_response(request)
File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/lib/python3.6/site-packages/django/core/handlers/base.py", line 128, in _get_response
response = self.process_exception_by_middleware(e, request)
File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/lib/python3.6/site-packages/django/core/handlers/base.py", line 126, in _get_response
response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/instagram_link/instagram_link_app/views.py", line 76, in get_insta_details
return JsonResponse({"msg:" "Hii"})
File "/home/chandniabhatia/Documents/Chandni/Django/Django_Project_POCs/scraper_instagram/lib/python3.6/site-packages/django/http/response.py", line 503, in __init__
'In order to allow non-dict objects to be serialized set the '
TypeError: In order to allow non-dict objects to be serialized set the safe parameter to False.
解决方案
推荐阅读
- keras - Keras 中的多序列 RNN/LSTM
- openssl - 如何修复 evp_cipher_ctx 和 std::pair
::第二个不完整的类型错误? - bash - 错误:使用带有 bash for 循环的 BLAST 时位置参数过多 (1)
- javascript - 在数组结束后重新启动 for 循环
- c# - 如何访问保存在服务器和本地的文件?
- php - PHP脚本后如何正确显示HTML页面?
- django - django rest框架ListAPIView没有返回值
- javascript - 类型错误:migrationCreator 不是函数
- tomcat - nginx + tomcat 反向代理麻烦
- python - 安装 ggplot 后 matplotlib 出现问题