python - How to use scrapy to get to the next chapter on fanfiction.net?
问题描述
On fanfiction.net, this is the HTML code to get the chapters of a story:
<select id="chap_select" title="Chapter Navigation" name="chapter" onchange="self.location = '/s/13109220/'+ this.options[this.selectedIndex].value + '/Son-of-the-Hunt';">
<option value="1" selected="">1. Chapter 1</option>
<option value="2">2. Camp</option>
<option value="3">3. Chapter 3</option>
</select>
What I want is to use this to go to the next chapter and keep downloading the text content. The normal way of doing this would be to call self.fanfiction()
recursively, but that does not work because of the self.storyNum += 1
line.
import scrapy, docx, time
import subprocess as sp
class FanfictionDownloader(scrapy.Spider):
    """Spider that downloads fanfiction.net (and, eventually, wattpad) stories
    into .docx files.

    URLs are read interactively from stdin in start_requests(); each
    fanfiction.net chapter page is parsed by fanfiction(), wattpad pages by
    the (currently unimplemented) wattpad() callback.
    """

    name = "fanfiction"
    # Counter of stories processed so far; incremented once per fanfiction() call.
    storyNum = 0
    # When True, the next block of text is added with a page break instead of
    # a plain paragraph.
    nextPage = False
    # Story URLs entered by the user (filled in start_requests).
    urls = []
    # Text of the chapter chunk currently being written to the document.
    docText = ''
    # Story title scraped from the profile header of the last parsed page.
    title = ''

    def start_requests(self):
        """Prompt for a ', '-separated list of URLs and schedule one request each.

        NOTE(review): the validity checks index self.urls[self.storyNum],
        and storyNum is still 0 on every iteration of this loop — presumably
        `url` itself was meant here; confirm against the author's intent.
        """
        sp.call('cls', shell=True)  # clear the console (Windows-specific)
        self.urls = list(str(input("Enter url seperated by a comma and space (, ): ")).split(', '))
        for url in self.urls:
            if self.urls[self.storyNum].startswith('https://www.fanfiction.net/s/'):
                yield scrapy.Request(url=url, callback=self.fanfiction)
            elif self.urls[self.storyNum].startswith('https://www.wattpad.com/'):
                yield scrapy.Request(url=url, callback=self.wattpad)
            else:
                # Unknown host: report, pause so the message is readable,
                # then abort the whole process.
                print('Not a valid link, ending downloader.')
                time.sleep(5)
                quit()
        sp.call('cls', shell=True)

    def fanfiction(self, response):
        """Parse one fanfiction.net chapter page and write its text to a .docx.

        Yields one item per story paragraph. The document is re-saved after
        every paragraph, so a partial file exists even if the crawl dies.
        """
        self.storyNum += 1
        doc = docx.Document()
        chapters = ''
        totalChapters = 0
        currentChapter = 1
        i = 0  # paragraph counter, used only for the yielded item key
        for para in response.css('div#storytext > p'):
            # Collect text nodes up to two levels deep inside the paragraph
            # (covers plain text plus <em>/<strong> style nesting).
            text = (para.xpath('text() | */text() | */*/text()').getall())
            # NOTE(review): the metadata below is identical for every
            # paragraph of the page, so re-scraping it inside the loop is
            # redundant but harmless.
            self.title = (response.xpath('//*[@id="profile_top"]/b/text()').get())
            storyId = ((response.xpath('//*[@id="profile_top"]/span[4]/text()[5]').get()).replace(' - id: ', ''))
            chapters = (response.xpath('//*[@id="chap_select"]/option/text()').getall())
            # The chapter <select> appears twice on the page (top and
            # bottom), so only the first half of the option texts is kept.
            totalChapters = len(chapters[0:int(len(chapters) / 2)])
            finalText = [
                # Paragraph text with a few common escapes normalised.
                [x.replace('\u00ef', 'ï').replace('\u2013', '–').replace('\u2026', '...') for x in text],
                ['Story %s: %s' % (self.storyNum, self.urls[self.storyNum - 1])],
                ['Title: %s' % (self.title)],
                ['Story ID: %s' % (storyId)],
                ['Total Chapters: %s' % (totalChapters)],
                ['Chapter Names: %s' % [chapters[0:int(len(chapters) / 2)]]],
            ]
            # Join multi-fragment paragraphs into one string.
            if len(finalText[0][0]) > 1:
                self.docText = (''.join(finalText[0][0:]))
            else:
                self.docText = finalText[0][0]
            if self.nextPage == False:
                doc.add_paragraph(self.docText)
            else:
                # NOTE(review): docx's add_page_break() takes no text
                # argument — this call would raise TypeError if reached.
                doc.add_page_break(self.docText)
                self.nextPage = False
            doc.add_paragraph()  # blank line between story paragraphs
            sp.call('cls', shell=True)
            # Saved every iteration; path is one directory above the cwd.
            doc.save('./../%s.docx' % (self.title))
            i += 1
            yield {'line ' + str(i): finalText}
        sp.call('cls', shell=True)

    def wattpad(self, response):
        # Placeholder: wattpad support not implemented yet.
        pass
解决方案
你真的需要为故事编号保留一个计数器吗?
我认为只要找到一个页面,您就可以生成下一页,例如:
if response.xpath('//button[text()="Next >"]'):
next_link = response.xpath('//button[text()="Next >"]')[0].attrib['onclick'].replace('self.location=', '').replace("'", '')
yield response.follow('https://www.fanfiction.net' + next_link, self.fanfiction)
如评论中所述,您应该使用项目管道来关心将项目“存储”在文档中。
这里有一些东西可以给你一个想法,这对我有用,你必须适应你的用例:
import docx
import scrapy
class StoryPipeline:
    """Item pipeline that collects scraped story items into one .docx file.

    Expects two item shapes from the spider:
      * a metadata item containing a 'title' key (recorded verbatim and used
        as the output filename), and
      * chapter items containing a 'paragraphs' list of text fragments.
    """

    def open_spider(self, spider):
        # One document per crawl. The default title guards close_spider()
        # against crawls that never emit a metadata item (the original code
        # raised AttributeError in that case).
        self.doc = docx.Document()
        self.title = 'untitled'

    def process_item(self, item, spider):
        if 'title' in item:
            # Metadata item: remember the title for the output filename and
            # record the raw metadata in the document.
            self.title = item['title']
            self.doc.add_paragraph(str(item))
        else:
            self.doc.add_paragraph('\n\n'.join(item['paragraphs']))
        # Scrapy's pipeline contract: process_item must return the item (or
        # raise DropItem) so later pipelines still receive it; the original
        # implicitly returned None.
        return item

    def close_spider(self, spider):
        self.doc.save('./%s.docx' % (self.title))
class FanfictionDownloader(scrapy.Spider):
    """Spider that walks a fanfiction.net story chapter by chapter.

    Emits one metadata item (title, id, chapter list) from the first page,
    then one {'paragraphs': [...]} item per chapter, following the
    "Next >" button until it disappears. Storage is delegated to
    StoryPipeline via ITEM_PIPELINES.
    """

    name = "fanfiction.net"
    custom_settings = {
        "ITEM_PIPELINES": {
            "myspider.StoryPipeline": 300,
        }
    }

    def start_requests(self):
        # Hard-coded entry point: chapter 1 of the story to download.
        first_chapter = 'https://www.fanfiction.net/s/11734723/1/This-Past-Storm'
        yield scrapy.Request(url=first_chapter, callback=self.parse)

    def parse(self, response):
        """First page only: emit story metadata, then the chapter text."""
        story_title = response.xpath('//*[@id="profile_top"]/b/text()').get()
        raw_id = response.xpath('//*[@id="profile_top"]/span[4]/text()[5]').get()
        # The chapter <select> appears twice on the page; take the first.
        chapter_names = response.xpath('(//select[@id="chap_select"])[1]/option/text()').getall()
        yield {
            'title': story_title,
            'storyId': raw_id.replace(' - id: ', ''),
            'chapters': chapter_names,
            'totalChapters': len(chapter_names),
        }
        yield from self._parse_paragraphs(response)

    def parse_next(self, response):
        """Subsequent pages: chapter text only, no metadata."""
        yield from self._parse_paragraphs(response)

    def _parse_paragraphs(self, response):
        """Yield this chapter's text, then a request for the next chapter."""
        yield {'paragraphs': response.xpath('//div[@id="storytext"]//text()').getall()}
        # The "Next >" button navigates via an inline onclick handler;
        # strip the JS wrapper to recover the relative chapter URL.
        onclick = response.xpath('(//button[text()="Next >"])[1]/@onclick').get()
        if onclick:
            target = onclick.replace('self.location=', '').replace("'", '')
            yield scrapy.Request(url=response.urljoin(target), callback=self.parse_next)
推荐阅读
- javascript - 为什么这个`do`-`while`循环在结束后重复最后一个值?
- scala - Spark - 更改数据集中属于长尾的记录的值
- python - 如何使用 Python Pandas 将微秒转换为人类可读的日期和时间?
- java - Spring + Spring Security 请求仅接受内容类型 x-www-form-urlencoded
- azure - 连接到由混合连接管理器服务的混合连接
- python - 模块无法安装在 Django 虚拟环境中
- opencv - 为什么按该顺序应用滚动俯仰和偏航?
- python - 如何在 Django App 中通用地设置记录器类
- php - 如何通过 PHP 使用现有的 Drupal 7 用户数据库
- google-apps-script - GAS 代码限制从谷歌工作表生成的 pdf 仅限于具有数据的行