python-3.x - 在 Python 中使用多线程请求图像时出现 OSError
问题描述
我正在编写一个程序,从某个网站抓取游戏的相关数据,包括文字信息和图片。由于我想下载该网站上所有游戏的数据,而在 1Mbps 的网络下用单线程抓取会非常慢,所以我决定按游戏名称的首字母为每个字母各启动一个线程(该网站可以按首字母筛选游戏)。然而,在负责下载某个游戏对应图片的函数里,一旦同时运行多个线程,执行到某个时刻(或早或晚)就会抛出错误;随后在处理该错误的 except 块内又会抛出另一个异常,如此反复……这会立即导致线程结束。但实际情况是,当我最后只剩下少数线程可以依靠时,……
问题: 如何解决这个问题,为什么会这样?
推测: 我认为,当多个线程同时执行到 download_image 函数(问题应该就出在这个函数里)中的 requests.get 那一行时,可能因为同时发出多个请求而失败……这只是我目前的猜测。
我真的不知道如何解决这个问题,话虽如此,我将不胜感激任何帮助,在此先感谢。
我已删去所有与上述问题无关的函数。线程在程序末尾创建,每个线程的目标函数都是 get_all_games_from_letter。
代码
from bs4 import BeautifulSoup
from string import ascii_lowercase
from datetime import date
from vandal_constants import *
from PIL import Image
from requests.exceptions import ConnectionError
from exceptions import NoTitleException
from validator_collection import url as url_check
from rawgpy import RAWG
from io import BytesIO
import traceback
import requests
import threading
import sqlite3
import concurrent.futures
### GLOBALS #####
# Set to True by download_image once it moves on to the RAWG fallback, and
# reset to False by get_all_games_from_letter before each game is processed.
# NOTE(review): this single module-level flag is shared by every worker
# thread — confirm that is intended.
FROM_RAWG = False
# Cached INSERT statement; prepare_game_record fills it in on first use.
INSERT_SQL = ''
# CONSTANTS ########
# RAWG API client used to look games up by title.
# NOTE(review): the argument is presumably a user-agent string — confirm
# against the rawgpy documentation.
rawg = RAWG('A Collector')
#################
def download_image(tag=None, game=None, rawg_game=None):
    """Download a cover image and wrap its bytes as a ``sqlite3.Binary``.

    When ``tag`` is given, the image at ``tag['data-src']`` is fetched
    directly (``None`` if that attribute is empty).  Otherwise, when ``game``
    is given, the image referenced by the element matching
    ``IMG_TAG_SELECTOR`` is tried first; if that does not produce a result,
    the global ``FROM_RAWG`` flag is set and ``rawg_game.background_image``
    is tried instead.

    Returns the image blob, or ``None`` when nothing could be downloaded.
    """
    global FROM_RAWG

    if tag:
        url = tag['data-src']
        if not url:
            return None
        return sqlite3.Binary(requests.get(url).content)

    if not game:
        return None

    img_tag = game.select_one(IMG_TAG_SELECTOR)
    if img_tag and img_tag.get('data-src', None):
        try:
            if url_check(img_tag['data-src']):
                response = requests.get(img_tag['data-src'])
                return sqlite3.Binary(response.content)
            print(f"{img_tag['data-src']} is NOT a valid url")
        except ConnectionError:
            try:
                print('Error While downloading from "Vandal.elespannol.com" website:')
                traceback.print_exc()
            except Exception:
                print('Another Exception Ocurred')
                traceback.print_exc()
        except OSError:
            print('Error en el Handshake parece')
            traceback.print_exc()

    # Getting here means the site image was not returned above.
    FROM_RAWG = True
    if not rawg_game or not getattr(rawg_game, 'background_image', None):
        return None

    try:
        print('Continue to download from RAWG')
        return sqlite3.Binary(requests.get(rawg_game.background_image).content)
    except ConnectionError:
        print('Error While downloading from RAWG:')
        traceback.print_exc()
    return None
def prepare_game_record(game, db_games_set):
    """Build the database row for one scraped game.

    Args:
        game: parsed HTML element describing a single game.
        db_games_set: titles that are already stored in the database.

    Returns:
        A tuple with the values for nombre, descripcion, genero, fondo and
        year (in that order), or ``None`` when the title is already present
        in ``db_games_set``.

    Raises:
        NoTitleException: when no title can be extracted from ``game``.
    """
    global INSERT_SQL

    title = getattr(game.select_one(TITLE_TAG_SELECTOR), 'text', None)
    if not title:
        raise NoTitleException()
    if title in db_games_set:
        print(f'Already Have {title} in database')
        return None

    description = game.select_one(DESCRIPTION_TAG_SELECTOR)

    rawg_game = None
    try:
        rawg_game = rawg.search(title)[0]
    except Exception:
        # Best effort: the record is still built without RAWG metadata.
        print('No rawg')
        traceback.print_exc()

    # Same evaluation order as before: description, genres, then image.
    if description:
        description_text = description.text
    elif rawg_game:
        description_text = rawg_game.description
    else:
        description_text = ''

    raw_genres = game.select_one(GENRES_TAG_SELECTOR).contents[1].strip().split(' / ')
    genres = translate_genres(raw_genres) or ''

    # Only images that did not come from the RAWG fallback are resized.
    image = download_image(game=game, rawg_game=rawg_game)
    if image and not FROM_RAWG:
        image = resize_image(image)

    game_data = {
        'nombre': title,
        'descripcion': description_text,
        'genero': genres,
        'fondo': image,
        'year': None,
    }
    if not INSERT_SQL:
        INSERT_SQL = construct_sql_insert(**game_data)

    # The attribute can exist and still be None/empty; hasattr() alone let
    # date.fromisoformat(None) raise TypeError and discard the whole game.
    released = getattr(rawg_game, 'released', None)
    if released:
        game_data['year'] = date.fromisoformat(released).year

    return tuple(game_data.values())
def get_all_games_from_letter(letter):
    """Scrape every results page for one initial letter and store new games.

    Titles already in the ``juegos`` table for this letter are loaded first so
    they can be skipped.  Pages are then fetched in a loop; each page's games
    are turned into rows by ``prepare_game_record`` and inserted with
    ``INSERT_SQL``.  The loop stops on a redirect status code or when a page
    with already-seen text is returned.

    Args:
        letter: the initial letter used in the listing URL and in the
            database lookup.
    """
    global FROM_RAWG
    # NOTE(review): paging starts at 36 and advances by 1 — confirm this
    # matches the site's paging scheme.
    counter = 36
    hashes_set = set()
    with sqlite3.connect('/media/l0new0lf/LocalStorage/data.db') as connection:
        cursor = connection.cursor()
        # Bound parameter instead of an f-string with a double-quoted literal.
        cursor.execute(
            'SELECT nombre FROM juegos WHERE nombre LIKE ?',
            (f'{letter.upper()}%',),
        )
        db_games_set = {row[0] for row in cursor}

        while True:
            try:
                prepared_games = []
                page_url = f'https://vandal.elespanol.com/juegos/13/pc/letra/{letter}/inicio/{counter}'
                rq = requests.get(page_url)
                if rq:
                    print(f'Request GET: from {page_url} Got Workable HTML !')
                else:
                    print(f'Request GET: from {page_url} Not Working !!, getting next page!')
                    # NOTE(review): counter is not advanced here, so the same
                    # page is requested again — confirm whether a skip was meant.
                    continue

                # NOTE(review): requests follows redirects by default, so
                # these codes may never be observed here — confirm.
                if rq.status_code in (301, 302, 303, 304):
                    print(f'No more games in letter {letter}\n**REDIRECTING TO **')
                    break

                counter += 1
                soup = BeautifulSoup(rq.content, 'lxml')
                main_table = soup.select_one(GAME_SEARCH_RESULTS_TABLE_SELECTOR)

                # A page whose text was already seen ends the letter.
                page_hash = hash(main_table.get_text())
                if page_hash in hashes_set:
                    print('Repeated page ! I\'m done with this letter.')
                    break
                hashes_set.add(page_hash)

                game_tables = main_table.find_all(
                    'table', {'class': GAME_TABLES_CLASS})
                print('entering game_tables loop')
                for game in game_tables:
                    FROM_RAWG = False
                    try:
                        game_record = prepare_game_record(game, db_games_set)
                    except NoTitleException:
                        print('There is no title for this game, DISCARDING!')
                        continue
                    except Exception:
                        print('Unknown ERROR in prepare_games_record function')
                        traceback.print_exc()
                        continue
                    if not game_record:
                        continue
                    prepared_games.append(game_record)
                    print('Game successfully prepared !')

                if prepared_games:
                    print('Thread, Writing to Database')
                    try:
                        cursor.executemany(INSERT_SQL, prepared_games)
                        connection.commit()
                    except Exception as err:
                        print(err)
                print('done')
            except Exception as err:
                # Catch-all so that one bad page does not end the worker.
                print('TRULY UNEXPECTED EXCEPTION')
                print(err)
                traceback.print_exc()
                continue
# Running a single letter sequentially, e.g. get_all_games_from_letter('c'),
# gives no trouble at all; the errors only appear with several threads.
with concurrent.futures.ThreadPoolExecutor(len(ascii_lowercase)) as executor:
    # Keep the futures: an exception raised by a worker is stored on its
    # future and is lost silently if the future is discarded.
    futures = {
        executor.submit(get_all_games_from_letter, letter): letter
        for letter in ascii_lowercase
    }
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f'Worker for letter {futures[future]!r} failed: {error!r}')
错误堆栈跟踪:
注意:这只是部分错误,但其余部分完全相同。
Game successfully prepared !
Error While downloading from "Vandal.elespannol.com" website:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 996, in _validate_conn
conn.connect()
File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 366, in connect
self.sock = ssl_wrap_socket(
File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 370, in ssl_wrap_socket
return context.wrap_socket(sock, server_hostname=server_hostname)
File "/usr/lib/python3.8/ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "/usr/lib/python3.8/ssl.py", line 1040, in _create
self.do_handshake()
File "/usr/lib/python3.8/ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
OSError: [Errno 0] Error
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/usr/lib/python3/dist-packages/urllib3/util/retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/lib/python3/dist-packages/six.py", line 702, in reraise
raise value.with_traceback(tb)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 996, in _validate_conn
conn.connect()
File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 366, in connect
self.sock = ssl_wrap_socket(
File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 370, in ssl_wrap_socket
return context.wrap_socket(sock, server_hostname=server_hostname)
File "/usr/lib/python3.8/ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "/usr/lib/python3.8/ssl.py", line 1040, in _create
self.do_handshake()
File "/usr/lib/python3.8/ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
urllib3.exceptions.ProtocolError: ('Connection aborted.', OSError(0, 'Error'))
解决方案
为了解决这个问题,只需添加一个全局锁:每个线程在请求下载图像之前必须先获得这把锁;如果锁已被其他线程持有,就要等待它被释放。也就是说,任意时刻所有线程中只能有一个线程在下载图像。
#######GLOBALS####
# Module-level lock shared by all worker threads; download_image holds it
# around each image request so that only one such request runs at a time.
lock = threading.Lock() #Add this to globals variables
##################
def download_image(tag=None, game=None, rawg_game=None):
    """Download a cover image and wrap its bytes as a ``sqlite3.Binary``.

    When ``tag`` is given, the image at ``tag['data-src']`` is fetched
    directly (``None`` if that attribute is empty).  Otherwise, when ``game``
    is given, the image referenced by the element matching
    ``IMG_TAG_SELECTOR`` is tried first; if that does not produce a result,
    the global ``FROM_RAWG`` flag is set and ``rawg_game.background_image``
    is tried instead.  Both of those requests run while holding the
    module-level ``lock`` so that only one thread downloads at a time.

    Returns the image blob, or ``None`` when nothing could be downloaded.
    """
    if tag:
        return sqlite3.Binary(requests.get(url).content) if (url := tag['data-src']) else None
    elif game:
        global FROM_RAWG
        img_tag = game.select_one(IMG_TAG_SELECTOR)
        if img_tag and img_tag.get('data-src', None):
            try:
                if url_check(img_tag['data-src']):
                    # ``with`` releases the lock even when the request raises.
                    # A bare acquire()/release() pair would leave the lock
                    # held on failure and block every other thread forever.
                    with lock:
                        return sqlite3.Binary(requests.get(img_tag['data-src']).content)
                print(f"{img_tag['data-src']} is NOT a valid url")
            except ConnectionError:
                try:
                    print('Error While downloading from "Vandal.elespannol.com" website:')
                    traceback.print_exc()
                except Exception:
                    print('Another Exception Ocurred')
                    traceback.print_exc()
            except OSError:
                print('Error en el Handshake parece')
                traceback.print_exc()
        # Getting here means the site image was not returned above.
        FROM_RAWG = True
        if rawg_game and getattr(rawg_game, 'background_image', None):
            try:
                print('Continue to download from RAWG')
                # Same reasoning as above: never leave the lock held on error.
                with lock:
                    return sqlite3.Binary(requests.get(rawg_game.background_image).content)
            except ConnectionError:
                print('Error While downloading from RAWG:')
                traceback.print_exc()
    return None
这样就解决了,在多个线程中下载图像不再出错……不过,我其实仍然不清楚为什么必须保证同一时刻所有线程中只有一个 requests.get 在执行;我原以为操作系统会通过队列之类的机制来处理这种情况。
推荐阅读
- java - 如何使用构造函数 java 启动 String 类
- html - Node.JS 编辑用户按钮转到同一用户
- apache-spark - 如何在rstudio中设置spark,spark_connect()不能工作
- php - 我想查询 ci_sessions 表来检查有多少用户在线
- java - 使用 Java 供应商/函数在静态方法中传递方法
- javascript - CSS下拉响应子菜单
- google-cloud-platform - BigQuery“此查询在运行时将处理 0 B”
- json - JSON.parse ReferenceError:未定义解析
- python - Python中的字符串问题
- python - int() 与 .astype('int') Python