首页 > 解决方案 > Python Multiprocessing.dummy Sockets 下载图像

问题描述

我正在使用 Python 套接字从网站下载图像。我对使用 requests 或 urllib 库不感兴趣。我想用线程加速这个过程。我以前使用过 multiprocessing.dummy 库,它通常对我有用。在这种情况下,这是非常不可预测的。当它应该下载 53 张图片时(参见下面的响应标题),它通常会下载 38 到 44 张。我计算了响应头消息的数量,其中有 53 个。这似乎表明我应该收到 53 张图片。然而,我的代码每次保存的图像少于 53 张。谁能发现我的线程在哪里出错?我在没有使用线程的情况下下载了所有图像,并且效果很好。这就是让我相信这是我的线程实现的问题。

PS D:\Documents\School\RIT\Classes\Summer 2018\CSEC 380\Homework\3\Script> python .\hw3-script.py
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xAcharya.jpg.pagespeed.ic.dQLJ0KfusA.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png


MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xAbuaitah.jpg.pagespeed.ic.PFwk87Pcno.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png


MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/xCSEC.png.pagespeed.ic.Ep0KUkS94M.png HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png


MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xJake,P20Brown.jpg.pagespeed.ic.KvGLjuuU03.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png

<...many more MESSAGE SENT and RESPONSE HEADERS...>

RESPONSE HEADERS
HTTP/1.1 200 OK
Date: Sun, 12 Aug 2018 15:19:44 GMT
Server: Apache
Link: <http://www.rit.edu/gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/CSEC.png>; rel="canonical"
X-Content-Type-Options: nosniff
Accept-Ranges: bytes
Expires: Mon, 12 Aug 2019 12:51:06 GMT
Cache-Control: max-age=31536000
Etag: W/"0"
Last-Modified: Sun, 12 Aug 2018 12:51:06 GMT
X-Original-Content-Length: 13647
Content-Length: 10131
Connection: close
Content-Type: image/png

RESPONSE HEADERS
HTTP/1.1 200 OK
Date: Sun, 12 Aug 2018 15:19:44 GMT
Server: Apache
Link: <http://www.rit.edu/gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/footer-logo.png>; rel="canonical"
X-Content-Type-Options: nosniff
Accept-Ranges: bytes
Expires: Mon, 12 Aug 2019 14:09:08 GMT
Cache-Control: max-age=31536000
Etag: W/"0"
Last-Modified: Sun, 12 Aug 2018 14:09:08 GMT
X-Original-Content-Length: 19921
Content-Length: 16125
Connection: close
Content-Type: image/png

Number of image urls: 53
Number of files downloaded: 42
Time elapsed: 0:00:04.019662

我的代码:

import sys
import socket
import re
import os
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime


from bs4 import BeautifulSoup


class MySocket:

    def __init__(self, sock=None):
        if sock is None:
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        else:
            self.sock = sock

    def connect(self, host, port):
        self.sock.connect((host, port))

    def myclose(self):
        self.sock.close()

    def mysend(self, msg, debug=False):
        if debug:
            print("MESSAGE SENT")
            print(msg.decode())
        self.sock.sendall(msg)

    def myreceive(self, debug=False):
        received = b''
        buffer = 1
        while True:
            part = self.sock.recv(buffer)
            received += part
            if part == b'':
                break
        if debug:
            print("Received...")
            print(received)
        return received


def get_image_urls(html):
    """
    Gets all urls corresponding to images in given html data
    :param data: html page to parse
    :return: list of image urls
    """
    soup = BeautifulSoup(html, "html.parser")
    img_urls = []
    for image in soup.find_all('img'):
        string = str(image)
        split = string.split("src=\"")
        if split[1][0] == '/':
            find_url = split[1].split("\"")
            url = find_url[0]
            img_urls.append(url)
            # img_urls.append("www.rit.edu" + url)
    return img_urls


def download_image(img_url):
    """
    Download images with the given socket and list of urls
    :param img_url: url corresponding to an image
    :return: None
    """
    image_socket = MySocket()
    image_socket.connect("www.rit.edu", 80)
    message = "GET " + img_url + " HTTP/1.0\r\n" \
              "Host: www.rit.edu\r\n" \
              "Accept: image/jpg, image/png\r\n\r\n"

    image_socket.mysend(message.encode(), debug=True)
    reply = image_socket.myreceive()
    headers = reply.split(b'\r\n\r\n')[0]

    print("RESPONSE HEADERS")
    print(headers.decode())
    print()

    image = reply[len(headers)+4:]
    img_name = str(len(os.listdir(".\\act1step2images"))) + img_url[-4:]
    f = open(os.path.join(".\\act1step2images", img_name), 'wb')
    f.write(image)
    f.close()


def download_images(image_urls, directory):
    if not os.path.exists(directory):
        os.mkdir(directory)

    pool = ThreadPool(100)
    pool.map(download_image, image_urls)
    pool.close()
    pool.join()


def main():
    start_time = datetime.now()

    host = "www.rit.edu"
    port = 80
    message = "GET /gccis/computingsecurity/people HTTP/1.0\r\n" \
              "Host: www.rit.edu\r\n" \
              "Accept: */*\r\n\r\n"
    part2_socket = MySocket()
    part2_socket.connect(host, port)
    part2_socket.mysend(message.encode())
    html = part2_socket.myreceive().decode()
    part2_socket.myclose()
    image_urls = get_image_urls(html)
    download_images(image_urls, ".\\act1step2images")
    print("Number of image urls:", len(image_urls))
    print("Number of files downloaded:", str(len(os.listdir(".\\act1step2images"))))

    print("Time elapsed:", datetime.now() - start_time)

main()

标签: pythonmultithreadingsocketshttppython-multithreading

解决方案


推荐阅读