How to use scrapy to get to the next chapter on fanfiction.net?

Problem description

On fanfiction.net, this is the HTML code to get the chapters of a story:

<select id="chap_select" title="Chapter Navigation" name="chapter" onchange="self.location = '/s/13109220/'+ this.options[this.selectedIndex].value + '/Son-of-the-Hunt';">
  <option value="1" selected="">1. Chapter 1</option>
  <option value="2">2. Camp</option>
  <option value="3">3. Chapter 3</option>
</select>

What I want is to use this to go to the next chapter and keep downloading the text content, but the obvious way of doing it, calling self.fanfiction() recursively, would not work because of the self.storyNum += 1 line.
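For reference, the select element alone is enough to enumerate every chapter URL; a minimal sketch inside a Scrapy callback (the URL pattern is taken from the onchange handler above):

# Sketch: build each chapter URL from the option values of chap_select.
# The '/s/<story-id>/<chapter>/<slug>' pattern comes from the onchange handler.
base = 'https://www.fanfiction.net/s/13109220/%s/Son-of-the-Hunt'
for value in response.xpath('//select[@id="chap_select"]/option/@value').getall():
    chapter_url = base % value

Here is my current spider: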

import scrapy, docx, time
import subprocess as sp

class FanfictionDownloader(scrapy.Spider):
    name = "fanfiction"
    storyNum = 0
    nextPage = False
    urls = []
    docText = ''
    title = ''

    def start_requests(self):
        sp.call('cls', shell=True)
        self.urls = input("Enter urls separated by a comma and space (, ): ").split(', ')
        for url in self.urls:
            if url.startswith('https://www.fanfiction.net/s/'):
                yield scrapy.Request(url=url, callback=self.fanfiction)
            elif url.startswith('https://www.wattpad.com/'):
                yield scrapy.Request(url=url, callback=self.wattpad)
            else:
                print('Not a valid link, ending downloader.')
                time.sleep(5)
                sp.call('cls', shell=True)
                quit()

    def fanfiction(self, response):
        self.storyNum += 1
        doc = docx.Document()
        chapters = ''
        totalChapters = 0
        currentChapter = 1
        i = 0
        for para in response.css('div#storytext > p'):
            text = (para.xpath('text() | */text() | */*/text()').getall())
            self.title = (response.xpath('//*[@id="profile_top"]/b/text()').get())
            storyId = ((response.xpath('//*[@id="profile_top"]/span[4]/text()[5]').get()).replace(' - id: ', ''))
            chapters = (response.xpath('//*[@id="chap_select"]/option/text()').getall())
            totalChapters = len(chapters[0:int(len(chapters) / 2)])
            finalText = [
                [x.replace('\u00ef', 'ï').replace('\u2013', '–').replace('\u2026', '...') for x in text],
                ['Story %s: %s' % (self.storyNum,  self.urls[self.storyNum - 1])],
                ['Title: %s' % (self.title)],
                ['Story ID: %s' % (storyId)],
                ['Total Chapters: %s' % (totalChapters)],
                ['Chapter Names: %s' % [chapters[0:int(len(chapters) / 2)]]],
            ]
            if len(finalText[0][0]) > 1:
                self.docText = (''.join(finalText[0][0:]))
            else:
                self.docText = finalText[0][0]
            if self.nextPage == False:
                doc.add_paragraph(self.docText)
            else:
                doc.add_page_break()  # add_page_break() takes no text argument
                doc.add_paragraph(self.docText)
                self.nextPage = False
            sp.call('cls', shell=True)
            doc.save('./../%s.docx' % (self.title))
            i += 1
            yield {'line ' + str(i): finalText}
            sp.call('cls', shell=True)

    def wattpad(self, response):
        pass

Tags: python, web-scraping, scrapy

Solution


Do you really need to keep a counter for the story number?

I think you can simply yield the next page as soon as you have parsed one, for example:

next_button = response.xpath('//button[text()="Next >"]')
if next_button:
    next_link = next_button[0].attrib['onclick'].replace('self.location=', '').replace("'", '')
    yield response.follow('https://www.fanfiction.net' + next_link, self.fanfiction)
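If you do still need per-story state such as a counter, a spider attribute is fragile because every request shares it; one alternative is to pass the state along with each request via cb_kwargs. A minimal sketch (assuming Scrapy 1.7+; the spider and the names below are illustrative only, not your actual spider):

import scrapy

class CounterFreeSpider(scrapy.Spider):
    # Hypothetical spider demonstrating cb_kwargs.
    name = 'counter-free'

    def start_requests(self):
        urls = ['https://www.fanfiction.net/s/13109220/1/Son-of-the-Hunt']
        for story_num, url in enumerate(urls, start=1):
            # The counter travels with the request instead of living on the spider.
            yield scrapy.Request(url=url, callback=self.parse_story,
                                 cb_kwargs={'story_num': story_num})

    def parse_story(self, response, story_num):
        # story_num is injected from cb_kwargs as a keyword argument.
        yield {'story_num': story_num, 'url': response.url}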

As mentioned in the comments, you should use an item pipeline to take care of "storing" the items in a document.

Here is something to give you an idea; it worked for me, but you will have to adapt it to your use case:

import docx
import scrapy

class StoryPipeline:

    def open_spider(self, spider):
        self.doc = docx.Document()
        self.title = 'story'  # fallback in case no metadata item is ever received

    def process_item(self, item, spider):
        if 'title' in item:
            # Metadata item: remember the title and write it as a header paragraph.
            self.title = item['title']
            self.doc.add_paragraph(str(item))
        else:
            # Chapter item: append the paragraphs of text.
            self.doc.add_paragraph('\n\n'.join(item['paragraphs']))
        return item  # pipelines must return the item for further processing

    def close_spider(self, spider):
        self.doc.save('./%s.docx' % self.title)


class FanfictionDownloader(scrapy.Spider):

    name = "fanfiction.net"

    custom_settings = {
        "ITEM_PIPELINES": {
            "myspider.StoryPipeline": 300,
        }
    }

    def start_requests(self):
        start_url = 'https://www.fanfiction.net/s/11734723/1/This-Past-Storm'
        yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        # First page: emit one metadata item before the chapter text.
        title = response.xpath('//*[@id="profile_top"]/b/text()').get()
        storyId = response.xpath('//*[@id="profile_top"]/span[4]/text()[5]').get().replace(' - id: ', '')
        chapters = response.xpath('(//select[@id="chap_select"])[1]/option/text()').getall()

        yield {
            'title': title,
            'storyId': storyId,
            'chapters': chapters,
            'totalChapters': len(chapters),
        }

        for x in self._parse_paragraphs(response):
            yield x

    def parse_next(self, response):
        # Subsequent chapters: chapter text only, no repeated metadata.
        for x in self._parse_paragraphs(response):
            yield x

    def _parse_paragraphs(self, response):
        # Yield this chapter's text, then follow the "Next >" button if present.
        paragraphs = response.xpath('//div[@id="storytext"]//text()').getall()

        yield {'paragraphs': paragraphs}

        next_button = response.xpath('(//button[text()="Next >"])[1]/@onclick').get()
        if next_button:
            # The onclick value looks like: self.location='/s/<id>/<chapter>/<slug>'
            next_url = next_button.replace('self.location=', '').replace("'", '')
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_next)
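
Assuming the spider and the pipeline live together in a single file called myspider.py (which is what the "myspider.StoryPipeline" path in ITEM_PIPELINES implies), you can run it without a full Scrapy project:

scrapy runspider myspider.py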
