python - Python WebScraping 在没有完成且没有给出错误的情况下关闭
问题描述
我正在制作一个简单的 WebScraping 来下载网站某些冠军项目的图像,我放了一个带有 5 个字符的“for”,它只执行其中的 2 个,然后关闭而不给出任何错误!
import bs4 as bs
import sys,os
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebEnginePage.__init__(self)
self.html = ''
print("#1 __init__")
self.loadFinished.connect(self._on_load_finished)
self.load(QUrl(url))
self.app.exec_()
def _on_load_finished(self):
self.html = self.toHtml(self.Callable)
print('#2 On Load finished')
def Callable(self, html_str):
print("#3 Callable\n")
self.html = html_str
self.app.quit()
def already_exist(image_name):
for _, _, folder in os.walk('Images'):
if image_name in folder:
return False
else:
return True
def ImageDownload(url):
image_name = url.split("/")
try:
if already_exist(image_name[-1]):
full_path = "Images/" + image_name[-1]
urllib.request.urlretrieve(url, full_path)
print("Download %s" % image_name)
else:
print("Image already Downloaded >: %s" % image_name[-1])
except:
print("Error Download")
def main():
champions = ['Amumu','Akali','Zed','Nunu'] #champions
for champ in champions:
try:
print("\nDownloading Images >: %s"% champ)
data = Page('https://www.probuilds.net/champions/details/%s' % champ.strip())
soup = bs.BeautifulSoup(data.html, 'html.parser')
items = soup.find_all('div',{'class':'items'})
for photos in items:
images = photos.find_all('img')
for image in images:
ImageDownload(image['src'])
except:
print("Shi...")
main()
我没有收到任何错误,但程序只执行了 2 次,这是问题所在,有人帮助我!!!
解决方案
看起来QWebEnginePage没有正确关闭,建议重用而不是创建另一个QWebEnginePage,因此使用旧答案作为基础,我实现了以下解决方案:
import os
import sys
import bs4 as bs
import urllib.request
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebPage(QtWebEngineWidgets.QWebEnginePage):
def __init__(self):
super(WebPage, self).__init__()
self.loadFinished.connect(self.handleLoadFinished)
def start(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QtCore.QUrl(url))
return True
def processCurrentPage(self, html):
self.process(self.url(), html)
if not self.fetchNext():
QtWidgets.qApp.quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def process(self, url, html):
print('loaded: [%d chars] %s' % (len(html), url.toString()))
class ScrapePage(WebPage):
def __init__(self):
super(ScrapePage, self).__init__()
self.results = set()
def process(self, url, html):
soup = bs.BeautifulSoup(html, 'html.parser')
items = soup.find_all('div',{'class':'items'})
for photos in items:
images = photos.find_all('img')
for image in images:
self.results.add(image['src'])
def already_exist(image_name):
for _, _, folder in os.walk('Images'):
if image_name in folder:
return False
else:
return True
def ImageDownload(url):
image_name = url.split("/")
try:
if already_exist(image_name[-1]):
full_path = "Images/" + image_name[-1]
urllib.request.urlretrieve(url, full_path)
print("Download %s" % image_name)
else:
print("Image already Downloaded >: %s" % image_name[-1])
except:
print("Error Download")
if __name__ == '__main__':
app = QtWidgets.QApplication(sys.argv)
webpage = ScrapePage()
champions = ['Amumu','Akali','Zed','Nunu']
base_url = 'https://www.probuilds.net/champions/details/'
urls = []
for champ in champions:
url = QtCore.QUrl(base_url).resolved(QtCore.QUrl(champ))
urls.append(url)
webpage.start(urls)
app.exec_()
for url in webpage.results:
ImageDownload(url)
推荐阅读
- python - Pandas-python:数据帧在特定字符串后拆分为新行
- python - Debian 10:Selenium firefox_driver 与 Firefox ESR 不兼容
- angular - 引用组件类型时的循环依赖
- oracle - 如何通过过滤分区中的值然后子分区来编写选择plSQL脚本
- go - 如何将错误作为值而不是引用返回
- python - 午夜的 strftime 格式为“24”而不是“00”
- authentication - 你能 Windows Auth 到 Basic Auth 吗?
- javascript - 这个条件函数有什么问题?
- python - 类中的变量需要帮助
- javascript - 从特定的第 3 层嵌套 JSON 制表符属性中过滤整个表