首页 > 解决方案 > 在 Python 中使用多线程请求图像时出现 OSError

问题描述

我正在编写一个程序,从某个网站抓取游戏的相关信息,其中也包括图像。由于我想下载该网站上所有游戏的信息,而用单线程加上 1 Mbps 的网络连接会非常缓慢,所以我决定为游戏名称的每个首字母各生成一个线程(网站支持按首字母筛选游戏)。问题出在下载某个游戏对应图像的函数里:只要同时运行多个线程,执行到某个时刻(迟早)就会引发错误;随后在处理该错误的 except 块中又会引发另一个异常,如此反复……这很快导致线程纷纷结束。但实际情况是,当只剩下一个线程在运行时,它就能正常工作,不再出现任何问题。

问题: 如何解决这个问题,为什么会这样?

推测:我认为,当多个线程同时执行到 download_image 函数(问题应该就出在这个函数里)中的 requests.get 那一行时,可能会因为并发请求过多而失败……这只是我目前的理解。

我真的不知道该如何解决这个问题,因此非常感谢任何帮助,在此先行致谢。

我已删去所有与上述问题无关的函数。线程在程序末尾创建,每个线程的目标函数都是 get_all_games_from_letter。

代码

from bs4 import BeautifulSoup
from string import ascii_lowercase
from datetime import date
from vandal_constants import *
from PIL import Image
from requests.exceptions import ConnectionError
from exceptions import NoTitleException
from validator_collection import url as url_check
from rawgpy import RAWG
from io import BytesIO
import traceback
import requests
import threading
import sqlite3
import concurrent.futures

###  GLOBALS   #####
# Set to True by download_image() once the image from the primary site has not
# been returned, reset to False before each game in get_all_games_from_letter(),
# and read by prepare_game_record() to decide whether to resize the image.
# NOTE(review): this flag is module-wide, so every worker thread shares it.
FROM_RAWG = False
# INSERT statement text; empty until prepare_game_record() builds it once
# through construct_sql_insert().
INSERT_SQL = ''
# CONSTANTS ########

# RAWG client used by prepare_game_record() to search games by title.
# NOTE(review): 'A Collector' is presumably the user-agent string -- confirm against rawgpy.
rawg = RAWG('A Collector')
#################
def download_image(tag=None, game=None, rawg_game=None):
    """Fetch a cover image and return it as a ``sqlite3.Binary`` blob.

    With ``tag`` given, the image at ``tag['data-src']`` is downloaded
    (``None`` when that value is empty). Otherwise, with ``game`` given, the
    image referenced by the element matching ``IMG_TAG_SELECTOR`` is tried
    first and ``rawg_game.background_image`` second; the module-level
    ``FROM_RAWG`` flag is set to True as soon as the first source has not
    produced a result.

    Returns ``None`` when nothing could be downloaded.
    """
    global FROM_RAWG

    if tag:
        direct_url = tag['data-src']
        if not direct_url:
            return None
        return sqlite3.Binary(requests.get(direct_url).content)

    if not game:
        return None

    cover = game.select_one(IMG_TAG_SELECTOR)
    if cover and cover.get('data-src', None):
        try:
            if url_check(cover['data-src']):
                response = requests.get(cover['data-src'])
                return sqlite3.Binary(response.content)
            print(f"{cover['data-src']} is NOT a valid url")
        except ConnectionError:
            try:
                print('Error While downloading from "Vandal.elespannol.com" website:')
                traceback.print_exc()
            except Exception:
                print('Another Exception Ocurred')
                traceback.print_exc()
        except OSError:
            print('Error en el Handshake parece')
            traceback.print_exc()

    # The primary source gave nothing usable; whatever is returned from here
    # on comes from RAWG (or is None).
    FROM_RAWG = True
    if not (rawg_game and getattr(rawg_game, 'background_image', None)):
        return None

    try:
        print('Continue to download from RAWG')
        return sqlite3.Binary(requests.get(rawg_game.background_image).content)
    except ConnectionError:
        print('Error While downloading from RAWG:')
        traceback.print_exc()

    return None
def prepare_game_record(game, db_games_set):
    """Build the database row for one scraped game.

    Args:
        game: Parsed HTML element for a single game entry; it is queried with
            ``select_one`` using the ``*_TAG_SELECTOR`` constants.
        db_games_set: Titles already stored; a game whose title is in it is
            skipped.

    Returns:
        A tuple with the values of ``nombre``, ``descripcion``, ``genero``,
        ``fondo`` and ``year`` (in that order), or ``None`` when the title is
        already in ``db_games_set``.

    Raises:
        NoTitleException: If the entry has no title text.
    """
    global INSERT_SQL

    title = getattr(game.select_one(TITLE_TAG_SELECTOR), 'text', None)
    if not title:
        raise NoTitleException()

    if title in db_games_set:
        print(f'Already Have {title} in database')
        return None

    description = game.select_one(DESCRIPTION_TAG_SELECTOR)

    # RAWG is only a secondary source: a failed lookup is reported and the
    # record is built without it.
    rawg_game = None
    try:
        rawg_game = rawg.search(title)[0]
    except Exception:
        print('No rawg')
        traceback.print_exc()

    if description:
        description_text = description.text
    elif rawg_game:
        description_text = rawg_game.description
    else:
        description_text = ''

    genres = translate_genres(
        game.select_one(GENRES_TAG_SELECTOR).contents[1].strip().split(' / '))

    img = download_image(game=game, rawg_game=rawg_game)
    # Only an image that did not come from RAWG is resized. FROM_RAWG is the
    # module-level flag download_image() sets.
    # NOTE(review): the flag is shared by all worker threads -- confirm this
    # read cannot observe another thread's value.
    background = resize_image(img) if img and not FROM_RAWG else img

    game_data = {
        'nombre': title,
        'descripcion': description_text,
        'genero': genres if genres else '',
        'fondo': background,
        'year': None,
    }

    if not INSERT_SQL:
        INSERT_SQL = construct_sql_insert(**game_data)

    # `released` may be missing, None or not an ISO date; in those cases the
    # row is kept and `year` stays None instead of the whole game being lost
    # to an exception.
    released = getattr(rawg_game, 'released', None)
    if released:
        try:
            game_data['year'] = date.fromisoformat(released).year
        except (TypeError, ValueError):
            print(f'Unparseable release date {released!r} for {title}')

    return tuple(game_data.values())

def get_all_games_from_letter(letter):
    """Scrape the PC games listed under ``letter`` and insert the new ones.

    Listing pages are requested one after another. Every game that is not
    already in the ``juegos`` table is turned into a row by
    ``prepare_game_record`` and the rows of one page are written with a single
    ``executemany`` call. The loop ends when a page answers with a 301-304
    status or when the text of a page has been seen before.

    Args:
        letter: Initial letter used in the listing URL and, upper-cased, to
            preload the titles already stored in the database.
    """
    global FROM_RAWG
    # NOTE(review): the start value 36 is kept from the original; its meaning
    # is not visible in this file.
    counter = 36
    seen_page_hashes = set()

    with sqlite3.connect('/media/l0new0lf/LocalStorage/data.db') as connection:
        cursor = connection.cursor()
        # Bind the LIKE pattern as a parameter rather than formatting it into
        # the SQL text.
        cursor.execute(
            'SELECT nombre FROM juegos WHERE nombre LIKE ?',
            (f'{letter.upper()}%',),
        )
        db_games_set = {row[0] for row in cursor}

        while True:
            try:
                prepared_games = []
                page_url = f'https://vandal.elespanol.com/juegos/13/pc/letra/{letter}/inicio/{counter}'
                rq = requests.get(page_url)

                # A requests Response is truthy when its status is not an error.
                if rq:
                    print('Request GET: from ' + page_url + ' Got Workable HTML !')
                else:
                    print('Request GET: from ' + page_url + ' Not Working !!, getting next page!')
                    # Advance before continuing; without this the same failing
                    # URL would be requested again forever.
                    counter += 1
                    continue

                # NOTE(review): requests follows redirects for GET by default,
                # so confirm this branch is reachable as written.
                if rq.status_code in (301, 302, 303, 304):
                    print(f'No more games in letter {letter}\n**REDIRECTING TO **')
                    break

                counter += 1

                soup = BeautifulSoup(rq.content, 'lxml')
                main_table = soup.select_one(GAME_SEARCH_RESULTS_TABLE_SELECTOR)

                # A page whose text was already processed means the listing
                # has started repeating, so the letter is finished.
                page_hash = hash(main_table.get_text())
                if page_hash in seen_page_hashes:
                    print('Repeated page ! I\'m done with this letter.')
                    break
                seen_page_hashes.add(page_hash)

                game_tables = main_table.find_all(
                    'table', {'class': GAME_TABLES_CLASS})

                print('entering game_tables loop')

                for game in game_tables:
                    # Reset the flag download_image() sets for the previous game.
                    FROM_RAWG = False
                    try:
                        game_record = prepare_game_record(game, db_games_set)
                    except NoTitleException:
                        print('There is no title for this game, DISCARDING!')
                        continue
                    except Exception:
                        print('Unknown ERROR in prepare_games_record function')
                        traceback.print_exc()
                        continue

                    if not game_record:
                        continue
                    prepared_games.append(game_record)
                    print('Game successfully prepared !')

                if prepared_games:
                    print('Thread, Writing to Database')
                    try:
                        cursor.executemany(INSERT_SQL, prepared_games)
                        connection.commit()
                    except Exception as err:
                        print(err)

                    print('done')

            except Exception as err:
                # Best effort: report the failure and keep the worker alive.
                print('TRULY UNEXPECTED EXCEPTION')
                print(err)
                traceback.print_exc()
                continue
# Running a single letter on one thread, e.g. get_all_games_from_letter('c'),
# shows no errors; the failures only appear with the thread pool below.
# One worker thread per starting letter.
with concurrent.futures.ThreadPoolExecutor(len(ascii_lowercase)) as executor:
    futures = {
        executor.submit(get_all_games_from_letter, letter): letter
        for letter in ascii_lowercase
    }
    # submit() stores a worker's exception on its future; report it here so a
    # crashed letter does not go unnoticed.
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f'Worker for letter {futures[future]!r} failed: {error!r}')

错误堆栈跟踪:

注意:这里只贴出了部分错误输出,其余部分与此完全相同。

Game successfully prepared !
Error While downloading from "Vandal.elespannol.com" website:
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
    httplib_response = self._make_request(
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 376, in _make_request
    self._validate_conn(conn)
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 996, in _validate_conn
    conn.connect()
  File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 366, in connect
    self.sock = ssl_wrap_socket(
  File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 370, in ssl_wrap_socket
    return context.wrap_socket(sock, server_hostname=server_hostname)
  File "/usr/lib/python3.8/ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
  File "/usr/lib/python3.8/ssl.py", line 1040, in _create
    self.do_handshake()
  File "/usr/lib/python3.8/ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
OSError: [Errno 0] Error

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/requests/adapters.py", line 439, in send
    resp = conn.urlopen(
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 719, in urlopen
    retries = retries.increment(
  File "/usr/lib/python3/dist-packages/urllib3/util/retry.py", line 400, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "/usr/lib/python3/dist-packages/six.py", line 702, in reraise
    raise value.with_traceback(tb)
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
    httplib_response = self._make_request(
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 376, in _make_request
    self._validate_conn(conn)
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 996, in _validate_conn
    conn.connect()
  File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 366, in connect
    self.sock = ssl_wrap_socket(
  File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 370, in ssl_wrap_socket
    return context.wrap_socket(sock, server_hostname=server_hostname)
  File "/usr/lib/python3.8/ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
  File "/usr/lib/python3.8/ssl.py", line 1040, in _create
    self.do_handshake()
  File "/usr/lib/python3.8/ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
urllib3.exceptions.ProtocolError: ('Connection aborted.', OSError(0, 'Error'))

标签: python-3.x, multithreading, python-requests

解决方案


要解决这个问题,只需添加一个全局锁:每个线程在请求下载图像之前,必须先获取这个锁;如果锁已被其他线程持有,就要等待对方释放。也就是说,在所有线程中,同一时刻只允许一个线程下载图像。

#######GLOBALS####
# Module-level lock that download_image() holds around the image requests in
# its `game` branch, so only one thread performs such a download at a time.
lock = threading.Lock()  # add this to the module's global variables
##################

def _fetch_image_blob(url):
    """Download ``url`` while holding the global ``lock`` and return a blob.

    The ``with`` block releases the lock even when ``requests.get`` raises, so
    a failed download cannot leave every other thread blocked on ``lock``.
    """
    with lock:
        return sqlite3.Binary(requests.get(url).content)


def download_image(tag=None, game=None, rawg_game=None):
    """Fetch a cover image, one download at a time across all threads.

    With ``tag`` given, the image at ``tag['data-src']`` is downloaded
    (``None`` when that value is empty). Otherwise, with ``game`` given, the
    image referenced by the element matching ``IMG_TAG_SELECTOR`` is tried
    first and ``rawg_game.background_image`` second; the module-level
    ``FROM_RAWG`` flag is set to True as soon as the first source has not
    produced a result.

    Returns:
        The image as a ``sqlite3.Binary`` blob, or ``None`` when nothing could
        be downloaded.
    """
    global FROM_RAWG

    if tag:
        url = tag['data-src']
        return _fetch_image_blob(url) if url else None

    if not game:
        return None

    img_tag = game.select_one(IMG_TAG_SELECTOR)
    if img_tag and img_tag.get('data-src', None):
        try:
            if url_check(img_tag['data-src']):
                return _fetch_image_blob(img_tag['data-src'])
            print(f"{img_tag['data-src']} is NOT a valid url")
        except ConnectionError:
            print('Error While downloading from "Vandal.elespannol.com" website:')
            traceback.print_exc()
        except OSError:
            print('Error en el Handshake parece')
            traceback.print_exc()

    # The primary source gave nothing usable; whatever is returned from here
    # on comes from RAWG (or is None).
    FROM_RAWG = True
    if rawg_game and getattr(rawg_game, 'background_image', None):
        try:
            print('Continue to download from RAWG')
            return _fetch_image_blob(rawg_game.background_image)
        except ConnectionError:
            print('Error While downloading from RAWG:')
            traceback.print_exc()

    return None

这样就解决了,在多个线程中下载图像不再出问题……不过……我其实并不清楚为什么必须保证所有线程中同一时刻只有一个 requests.get 在执行;我原以为操作系统会通过队列之类的机制来处理这种情况。


推荐阅读