Python proxy scraping: Max retries exceeded with url (only on Mac, Windows works fine)

Problem description

BIG EDIT: I found out that I only get this error when running on a Mac. On Windows it works fine and never throws the exception.

I want to fetch information from Discogs for personal/educational use.

I am building a crawler in Python (using the requests library). To speed the whole process up I am crawling through a list of free proxies. Here I found some free proxies that looked reliable, and I have put them into my code.
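
For reference, a minimal sketch of a single proxied request with requests (the proxy address below is a placeholder, not an entry from the list that follows); requests picks the proxy whose key matches the scheme of the requested URL:

import requests
from fake_useragent import UserAgent

proxy_ip = '127.0.0.1:8080'  # placeholder proxy address

# The proxy mapping is keyed by URL scheme; an https:// target uses the 'https' entry.
proxies = {'http': 'http://' + proxy_ip, 'https': 'http://' + proxy_ip}
headers = {'user-agent': UserAgent().random}

r = requests.get('https://www.discogs.com', headers=headers, proxies=proxies, timeout=7)
print(r.status_code)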

Starting from this page, I want to visit all master pages, iterating over the pagination pages and over every master page listed on them. This example spans 2 pages (37 masters in total).

I created threads that execute actions taken from an action buffer; for every action type a thread knows how to carry it out.
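
In condensed form the pattern is roughly the following (a simplified sketch of what the full listings below implement; handle_master_list and handle_master_page are placeholder handlers standing in for the real processing methods):

import queue
import threading

TYPE_MASTER_LIST = 1
TYPE_MASTER_PAGE = 2

actions = queue.Queue()  # the shared action buffer


def handle_master_list(url):  # placeholder handler
    print('master list:', url)


def handle_master_page(url):  # placeholder handler
    print('master page:', url)


def worker():
    while True:
        action_data = actions.get()  # blocks until an action is available
        if action_data['action'] == TYPE_MASTER_LIST:
            handle_master_list(action_data['url'])
        elif action_data['action'] == TYPE_MASTER_PAGE:
            handle_master_page(action_data['url'])
        actions.task_done()


threading.Thread(target=worker, daemon=True).start()
actions.put({'url': 'https://www.discogs.com/search/', 'action': TYPE_MASTER_LIST})
actions.join()  # wait until the queued action has been processed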

Helper

import queue
import random
from fake_useragent import UserAgent

proxy_list = ["45.55.27.88:8080", "162.243.107.120:3128", "67.205.146.29:3128", "104.236.238.10:3128",
              "138.197.222.35:3128", "198.199.120.102:3128", "162.243.99.57:8080", "138.68.173.29:3128",
              "162.243.107.43:3128", "162.243.107.43:8080", "162.243.108.129:3128", "162.243.108.161:3128",
              "162.243.108.161:8080", "162.243.78.25:3128", "67.205.146.29:8080", "67.205.174.209:3128",
              "138.68.165.154:3128", "138.68.169.77:3128", "138.197.58.55:3128", "138.68.169.8:8080",
              "207.154.231.212:3128", "138.68.169.8:3128", "138.68.161.60:3128", "212.47.252.91:8118",
              "206.246.82.2:443", "202.166.117.46:8080", "185.93.3.70:8080", "192.117.146.110:80", "151.80.58.175:80",
              "139.162.235.163:31028", "103.25.138.233:8080", "163.172.173.187:3000", "113.253.113.90:80",
              "113.255.76.120:80", "159.8.114.37:25", "159.8.114.37:8123", "51.255.198.111:9999", "37.59.32.112:1080",
              "178.33.9.96:1080", "178.33.9.97:1080", "178.33.9.100:1080", "151.106.31.195:1080",
              "134.119.205.248:1080", "134.119.205.252:1080", "134.119.205.253:1080", "37.187.149.234:1080",
              "94.177.237.184:80", "178.33.9.101:1080", "134.119.184.69:1080", "134.119.184.70:1080",
              "134.119.184.75:1080", "134.119.184.87:1080", "134.119.184.94:1080", "94.177.237.184:8080",
              "134.119.205.243:1080", "88.190.203.36:80", "37.59.35.174:1080", "79.142.202.109:8080",
              "5.196.205.139:3128", "37.59.203.129:1080", "37.59.203.133:1080", "37.59.203.135:1080",
              "178.33.9.99:1080", "178.33.9.103:1080", "138.68.169.77:3128", "162.243.107.43:8080", "45.55.27.15:3128",
              "104.155.75.187:8080", "142.93.51.159:80", "213.148.240.2:80", "80.211.181.37:80", "66.70.170.147:80",
              "54.39.98.138:80", "204.48.22.246:80", "80.211.48.120:80", "142.93.182.13:80", "142.93.251.113:80",
              "66.70.173.54:80", "142.93.49.169:80", "192.99.226.30:80", "80.211.180.201:80", "213.136.87.65:80",
              "220.90.147.137:80", "68.185.57.66:80", "68.188.59.198:80", "50.203.239.19:80", "50.234.147.30:80",
              "148.251.238.35:80", "98.142.36.181:80", "128.140.225.41:80", "50.203.239.21:80", "50.203.239.31:80",
              "50.203.239.22:80", "75.150.88.59:80", "71.13.131.142:80", "27.255.91.146:80", "104.196.241.137:80",
              "94.177.237.184:3128", "134.119.205.244:1080", "37.59.203.132:1080", "178.128.176.221:8080",
              "142.93.250.239:80", "89.233.175.210:41258", "37.59.203.128:1080", "139.59.53.106:8080",
              "37.187.149.129:1080", "84.115.252.221:8080", "217.23.13.52:1080", "185.2.82.23:1080",
              "139.59.99.63:8080", "139.59.99.97:3128", "139.59.99.97:8080", "139.59.99.63:3128", "138.68.161.157:8080",
              "138.68.161.14:8080", "138.68.161.157:3128", "204.48.22.246:8080", "5.2.137.13:3128",
              "142.93.250.239:8080", "194.85.169.208:3128", "139.59.101.223:8080", "108.61.186.207:8080",
              "217.61.125.74:8080", "91.89.53.235:8080", "80.211.48.120:3128", "142.93.49.169:3128",
              "138.68.120.201:8080", "95.85.36.236:3128", "142.93.182.13:8080", "223.16.229.241:8080",
              "142.93.58.158:8080", "142.93.247.178:3128", "217.23.10.12:1080", "217.61.125.74:3128",
              "142.93.58.158:3128", "142.93.51.159:3128", "139.59.59.63:8080", "138.197.139.135:3128",
              "139.59.64.9:8080", "212.237.15.108:3128", "139.59.99.113:3128", "188.226.141.61:8080",
              "66.70.170.147:8080", "66.70.173.54:3128", "54.39.98.138:8799", "163.47.11.113:3128",
              "139.59.101.223:3128", "138.197.157.60:3128", "138.197.157.66:3128", "207.154.231.211:3128",
              "178.62.193.19:3128", "188.226.141.216:3128", "138.197.204.55:3128", "138.197.204.55:8080",
              "139.59.109.156:3128", "138.197.157.45:8080", "138.197.157.44:8080", "207.154.231.209:3128",
              "188.226.141.211:3128", "138.197.157.45:3128", "138.197.157.68:3128", "46.5.252.70:3128",
              "139.59.99.101:3128", "188.166.216.210:3128", "138.197.157.32:3128", "207.154.231.216:3128",
              "138.68.161.60:8080", "178.62.193.19:8080", "188.226.141.127:3128", "138.197.222.35:8080",
              "188.226.141.217:3128", "138.197.145.103:3128", "138.197.157.32:8080", "138.197.157.60:8080",
              "146.185.168.235:3128", "207.154.231.210:3128", "162.243.107.45:8080", "188.226.141.219:3128",
              "88.198.24.108:3128", "138.68.230.88:3128", "45.55.27.88:3128", "139.59.99.119:3128",
              "138.197.157.68:8080", "192.241.150.188:3128", "138.68.161.14:3128", "138.68.173.29:8080",
              "162.243.175.141:3128", "138.197.157.44:3128", "138.68.169.77:8080", "46.4.96.137:3128",
              "138.68.235.8:8080", "139.59.99.234:3128"]

random.shuffle(proxy_list)


class RequestHelper:
    proxies = None

    def __init__(self):
        self.proxies = self._get_proxies()

    def _get_proxies(self):
        temp = queue.Queue()
        for proxy in proxy_list:
            temp.put(proxy)
        return temp

    def put(self, proxy):
        self.proxies.put(proxy)

    def get_data(self):
        # Take the next proxy from the queue (blocks if the queue is ever exhausted).
        ip = self.proxies.get()
        # Proxy mapping keyed by URL scheme, plus a random User-Agent header.
        proxy = {'http': 'http://' + ip}
        user_agent = {'user-agent': UserAgent().random}
        return {'proxy': proxy, 'user-agent': user_agent, 'ip': ip}

Action queue

import queue


class ActionQueue:

    actions = None

    # action = {'url': URL, 'action': TYPE_MASTER_LIST|TYPE_MASTER_PAGE|TYPE_RELEASE_PAGE }

    def __init__(self):
        self.actions = queue.Queue()

    def get_next(self):
        try:
            return self.actions.get_nowait()
        except queue.Empty:
            return None

    def put(self, action):
        self.actions.put(action)

Worker (thread)

import requests
import threading
from bs4 import BeautifulSoup
from time import sleep

BASE_URL = 'https://www.discogs.com'

TYPE_MASTER_LIST = 1
TYPE_MASTER_PAGE = 2
TYPE_RELEASE_PAGE = 3


class Worker(threading.Thread):
    THREAD_ID = 0

    MASTERS_DONE = 0

    def __init__(self, action_queue, request_helper):
        super(Worker, self).__init__()
        self.action_queue = action_queue
        self.request_helper = request_helper
        Worker.THREAD_ID += 1
        self.id = Worker.THREAD_ID
        self.success = 0
        self.setDaemon(True)
        pass

    def run(self):

        print('>[{tid}] is live.'.format(tid=self.id))
        request_data = self.request_helper.get_data()
        action_data = self.action_queue.get_next()

        while True:

            if action_data is None:
                sleep(5)
                action_data = self.action_queue.get_next()
                continue

            url = action_data['url']
            action = action_data['action']

            # switch to a new proxy/IP after 10 successful requests
            if self.success == 10:
                self.success = 0
                request_data = self.request_helper.get_data()

            try:

                print('> [{id}] requests ({url}) with ({ip})'.format(id=self.id, url=url, ip=request_data['ip']))
                r = requests.get(url=url, headers=request_data['user-agent'], cookies={}, proxies=request_data['proxy'],
                                 timeout=7)

                # success
                if r.status_code == 200:
                    self.success += 1
                    soup = BeautifulSoup(r.text, 'lxml')
                    if action == TYPE_MASTER_LIST:
                        self._process_master_list(url, soup)
                    if action == TYPE_MASTER_PAGE:
                        self._process_master_page(url, soup)
                    print('> [{id}] finished - sleeping 3s.'.format(id=self.id))
                    sleep(3)
                    action_data = self.action_queue.get_next()
                # too many requests
                elif r.status_code == 429:
                    print('> [{id}] 429 fail - return action to queue - sleeping 5s.'.format(id=self.id))
                    sleep(5)
                else:
                    print('> [{id}] Random ERROR: {error_code}'.format(id=self.id, error_code=r.status_code))
                    sleep(5)

            except requests.exceptions.ConnectTimeout as e:
                print('> [{id}] == ConnectTimeout == [{ex}] - return action to queue - sleeping 10s.'.format(id=self.id,
                                                                                                             ex=str(e)))
                request_data = self.request_helper.get_data()
                sleep(10)

            except Exception as e:
                print('> [{id}] - random fail [{ex}].'.format(id=self.id, ex=str(e)))
                sleep(10)

            continue

    def _process_master_list(self, url, soup):
        print('> [{id}] - (1) processing {url}.'.format(id=self.id, url=url))

        master_page_urls = [BASE_URL + url['href'] for url in soup.select('.cards > .card > h4 > a')]
        for url in master_page_urls:
            self.action_queue.put({'url': url, 'action': TYPE_MASTER_PAGE})

        print('> [{id}] - added {cnt} master pages.'.format(id=self.id, cnt=str(len(master_page_urls))))

        link = soup.select_one('.pagination_next')
        if link is not None:
            master_list_url = BASE_URL + link['href']
            self.action_queue.put({'url': master_list_url, 'action': TYPE_MASTER_LIST})
            print('> [{id}] - added 1 master pages list.'.format(id=self.id))

    def _process_master_page(self, url, soup):
        print('> [{id}] - (2) processing {url}.'.format(id=self.id, url=url))
        Worker.MASTERS_DONE += 1
        print(' >>>>>>>> ' + str(Worker.MASTERS_DONE))

Main

from worker import Worker
from actions import ActionQueue
from helper import RequestHelper
import time

BASE_URL = 'https://www.discogs.com'

TYPE_MASTER_LIST = 1
TYPE_MASTER_PAGE = 2
TYPE_RELEASE_PAGE = 3


def main():
    actions = ActionQueue()
    request_helper = RequestHelper()

    actions.put({
        'url': 'https://www.discogs.com/search/?limit=25&genre_exact=Hip+Hop&type=master&page=1&country_exact=Serbia',
        'action': TYPE_MASTER_LIST
    })

    workers = []
    for i in range(10):
        workers.append(Worker(actions, request_helper))

    for worker in workers:
        worker.start()

    # Keep the main (non-daemon) thread alive; the workers are daemon threads
    # and would be killed as soon as main() returned.
    while True:
        continue


if __name__ == "__main__":
    main()

The code runs correctly for a short while, after which every thread starts reporting Max retries exceeded with url for the requested URL, even after switching to a different proxy.

[10] == ConnectTimeout == [HTTPSConnectionPool(host='www.discogs.com', port=443): Max retries exceeded with url: /Mar%C4%8Delo-Filteri-Deca-I-Sunce/master/640300 (Caused by ConnectTimeoutError(, 'Connection to www.discogs.com timed out. (connect timeout=7)'))] - return action to queue - sleeping 10s.

Every thread ends up hitting the same exception.
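
For what it's worth, the "Max retries exceeded with url" wording comes from urllib3: requests performs no retries by default, so a single failed connection attempt already produces this message. If transport-level retries are wanted, they can be configured on a Session via an HTTPAdapter (a sketch only, not something the code above does):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry failed connection attempts a few times with a growing back-off.
retry = Retry(total=3, connect=3, backoff_factor=1)
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

# The session is then used like requests.get(), proxies and headers included:
# r = session.get('https://www.discogs.com', proxies=proxies, headers=headers, timeout=7)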

The full execution log can be found here. I am running Python 3.6 on macOS.
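
One thing worth ruling out before blaming the proxies themselves: requests only routes an https:// URL through a proxy if the proxies mapping contains an 'https' key, so with an 'http'-only mapping the HTTPS requests to Discogs will typically go out directly (unless an environment proxy is set). A quick check of which IP the requests actually leave from, assuming httpbin.org is reachable and using a placeholder proxy address:

import requests

proxy_ip = '127.0.0.1:8080'  # placeholder; substitute a proxy from the list

proxies_http_only = {'http': 'http://' + proxy_ip}
proxies_both = {'http': 'http://' + proxy_ip, 'https': 'http://' + proxy_ip}

for label, proxies in [('http key only', proxies_http_only), ('http + https keys', proxies_both)]:
    try:
        # httpbin.org/ip echoes the IP address the request arrived from
        r = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=7)
        print(label, '->', r.json()['origin'])
    except requests.exceptions.RequestException as e:
        print(label, '-> failed:', e)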

Tags: python, web-scraping, python-requests, web-crawler

Solution

