I want to download images after scraping them from multiple pages

Problem description

I want to download images after scraping them from multiple pages. However, I can't download all of the images, because they are overwritten inside the for loop.

Below is my code. What is wrong with it?

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq

for page in range(2, 4):
    baseUrl = 'https://onepiecetreasurecruise.fr/Artwork/index.php?page=index'
    plusUrl = baseUrl + str(page)
    html = urlopen(plusUrl).read()
    soup = BeautifulSoup(html, 'html.parser')
    img = soup.find_all(class_='card-img-top')
    listimg = []
    for i in img:
        listimg.append(i['src'])
    n = 1
    for index, img_link in enumerate(listimg):
        img_data = rq.get(img_link).content
        with open('./onepiece/' + str(index+1) + '.png', 'wb+') as f:
            f.write(img_data)
            n += 1

Tags: image, for-loop, beautifulsoup, download, web-crawler

Solution
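
The images get overwritten because the filename counter restarts on every page: index + 1 begins at 1 again for each value of page, so the files saved from page 3 replace the ones saved from page 2 (the n variable is incremented but never used in the filename). A minimal fix, assuming the same URL scheme and selectors as the posted code and that the src attributes are absolute URLs, is to share one counter across all pages:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq
import os

baseUrl = 'https://onepiecetreasurecruise.fr/Artwork/index.php?page=index'
os.makedirs('./onepiece', exist_ok=True)  # make sure the target directory exists
n = 1  # one counter shared by every page, so filenames never collide

for page in range(2, 4):
    html = urlopen(baseUrl + str(page)).read()
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(class_='card-img-top'):
        img_data = rq.get(tag['src']).content
        with open('./onepiece/' + str(n) + '.png', 'wb') as f:
            f.write(img_data)
        n += 1  # keep counting across pages instead of restarting

Alternatively, keep the per-page index and put the page number in the filename, e.g. str(page) + '-' + str(index + 1) + '.png'.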


Here is another way to download all of the images.

from simplified_scrapy import Spider, SimplifiedDoc, utils, SimplifiedMain


class ImageSpider(Spider):
    name = 'onepiecetreasurecruise'
    start_urls = ['https://onepiecetreasurecruise.fr/Artwork/index.php?page=index']

    # refresh_urls = True
    concurrencyPer1s = 0.5 # limit the download speed (requests per second)
    imgPath = 'images/'
    def __init__(self):
        Spider.__init__(self, self.name)  # necessary
        utils.createDir(self.imgPath) # create image dir

    def afterResponse(self, response, url, error=None, extra=None):
        try:
            # save image responses to disk; flag is truthy when the save succeeded
            flag = utils.saveResponseAsFile(response, self.imgPath, 'image')
            if flag: return None  # saved as an image, nothing left to parse
        except Exception as err:
            print(err)
        # fall back to the default handling for ordinary pages
        return Spider.afterResponse(self, response, url, error, extra)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        # queue the image links found in the card-text paragraphs
        urls = doc.body.getElements('p', value='card-text').a
        if urls:
            for u in urls:
                u['header'] = {'Referer': url['url']}  # fetch each image with the page as Referer
            self.saveUrl(urls)
        # queue the next page: 'Suivant' is the French 'Next' link
        u = doc.body.getElementByText('Suivant', tag='a')
        if u:
            u['href'] = utils.absoluteUrl(url.url, u.href)
            self.saveUrl(u)
        return True


SimplifiedMain.startThread(ImageSpider()) # start download
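
As written, afterResponse runs for every response the spider downloads and saves image responses into images/ via utils.saveResponseAsFile; when the save succeeds it returns None so the response is not parsed further. extract then queues both the image links found in the card-text paragraphs and the 'Suivant' (next page) link, so the spider walks the remaining pages on its own.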
