Script to check a domain's img

Problem description

I need some help debugging code I have that checks domains for a logo. I ran a sample test on 1 million domains (a.com, a1.com, a2.com, ...) and only got output for 700k of them (300k domains are missing). This is my current code:

from scrapy.selector import Selector
import requests
from urllib.parse import urljoin
import os
import time
import concurrent.futures
FOUND_IMAGES = 'found.txt'
FOUND_DOMAINS = 'found_domains.txt'
SKIPPED = 'skipped.txt'
BASE_SIXTY = 'base64.txt'
NOT_FOUND_DOMAINS = 'input.txt'
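# XPath expressions tried in order to locate a favicon/logo reference in the fetched page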
XPATHS = ['//meta[contains(@content,"favico")]/@content', '//*[@type="image/x-icon"]/@href', 
          '//link[@rel="icon" and @type="image/png"]/@href']


def write_to_file(lst, filename):
    with open(filename, 'w') as f:
        for i in lst:
            f.write(f'{i}\n')


def read_file(fn):
    if os.path.isfile(fn):
        with open(fn, 'r') as f:
            temp = f.read().split('\n')
        return set(temp)
    else:
        return set()


def download_all_sites(sites):
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        executor.map(get_image, sites)


headers = {
    'User-Agent': 'Firefox/82.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}
urls = read_file(NOT_FOUND_DOMAINS)
skipped_domains = read_file(SKIPPED)
base64 = read_file(BASE_SIXTY)
found = read_file(FOUND_IMAGES)
found_domains = read_file(FOUND_DOMAINS)
print(f"Initial length: {len(urls)}")
urls = urls - skipped_domains
urls = urls - base64
urls = urls - found_domains
if '' in urls:
    urls.remove('')
print(f"Final length: {len(urls)}")


def get_image(*urls2):
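    # fetch http://www.<domain>, try each XPATHS expression for a favicon/logo
    # and record the domain (or image URL) in one of the shared sets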
    for domain in urls2:
        img = ''
        url = f"http://www.{domain}"
        try:
            res = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
        except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout, requests.exceptions.TooManyRedirects):
            skipped_domains.add(domain)
            continue
        response = Selector(res)
        for x in XPATHS:
            img = response.xpath(x).extract_first()
            if not img or ("data:image" in img and len(img) < 50):
                continue
            else:
                break
        if not img:
            skipped_domains.add(domain)
            continue
        if "base64" in img:
            base64.add(domain)
            continue
        else:
            if not img.startswith('http'):
                image_url = urljoin(url, img)
            found.add(image_url)
            found_domains.add(domain)


print("\n\n\nStart of log")
start_time = time.time()
download_all_sites(urls)
duration = time.time() - start_time
print(f"Downloaded {len(urls)} in {duration} seconds")
write_to_file(found, FOUND_IMAGES)
write_to_file(base64, BASE_SIXTY)
write_to_file(skipped_domains, SKIPPED)
write_to_file(found_domains, FOUND_DOMAINS)
print("Completed")

Ideally, if I run it twice on the same input, the second run should have no URLs left to loop over. I have double-checked my get_image function and it should not skip any URL (i.e. every domain ends up in its respective set).

I use concurrent.futures.ThreadPoolExecutor(max_workers=50) to speed up the process, and I believe it should not affect how elements are added to the sets.

# these 3 lines should give 0 on second run of same input
urls = urls - skipped_domains
urls = urls - base64
urls = urls - found_domains
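
Note that Executor.map only re-raises a worker's exception when the corresponding result is retrieved from the iterator it returns; since download_all_sites never consumes that iterator, any exception escaping get_image is dropped silently and the affected domain lands in none of the sets, which would break this expectation. A minimal standalone sketch of the behaviour, using a hypothetical worker() rather than the code above:

# Standalone sketch (hypothetical worker(), not part of the script above) showing
# that Executor.map() stores a worker's exception in its Future and only re-raises
# it when the result is retrieved from the returned iterator.
import concurrent.futures


def worker(n):
    if n == 3:
        raise ValueError("boom")
    return n


with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(worker, range(5))        # iterator discarded: the ValueError vanishes silently

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    try:
        for r in executor.map(worker, range(5)):   # consuming the results re-raises it here
            print(r)
    except ValueError as err:
        print(f"surfaced: {err!r}")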

Tags: python, python-requests

Solution
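
Two plausible causes for the missing domains, judging from the code above: exceptions outside the three handled requests classes escape get_image and are dropped by the unconsumed executor.map iterator; and when img already starts with "http", image_url is never assigned, so found.add(image_url) raises UnboundLocalError, which is swallowed the same way. The sketch below shows one way get_image and download_all_sites could be adjusted; it is an illustration under those assumptions, not a confirmed fix, and it relies on the module-level names from the script above (headers, XPATHS, found, found_domains, base64, skipped_domains).

# Sketch of a possible adjustment; assumes the module-level names defined in the
# script above (headers, XPATHS, found, found_domains, base64, skipped_domains).
import concurrent.futures
from urllib.parse import urljoin

import requests
from scrapy.selector import Selector


def get_image(domain):
    url = f"http://www.{domain}"
    try:
        res = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
    except requests.exceptions.RequestException:
        # catch the whole requests exception hierarchy instead of three specific
        # classes, so that no domain can fall through without being recorded
        skipped_domains.add(domain)
        return
    response = Selector(res)
    img = ''
    for x in XPATHS:
        img = response.xpath(x).extract_first()
        if img and not ("data:image" in img and len(img) < 50):
            break
    if not img:
        skipped_domains.add(domain)
        return
    if "base64" in img:
        base64.add(domain)
        return
    # always assign image_url; the original only did so when img was relative,
    # so an absolute img raised UnboundLocalError on found.add(image_url)
    image_url = img if img.startswith('http') else urljoin(url, img)
    found.add(image_url)
    found_domains.add(domain)


def download_all_sites(sites):
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        futures = {executor.submit(get_image, site): site for site in sites}
        for future in concurrent.futures.as_completed(futures):
            if future.exception() is not None:
                # anything unexpected is logged and the domain is still recorded
                skipped_domains.add(futures[future])
                print(f"{futures[future]}: {future.exception()!r}")

With submit() and as_completed() (or simply by iterating the iterator that executor.map returns), a failing domain is recorded in skipped_domains instead of vanishing, so a second run over the same input should have little or nothing left to process.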

