python - Python Multiprocessing.dummy Sockets 下载图像
问题描述
我正在使用 Python 套接字从网站下载图像。我对使用 requests 或 urllib 库不感兴趣。我想用线程加速这个过程。我以前使用过 multiprocessing.dummy 库,它通常对我有用。在这种情况下,这是非常不可预测的。当它应该下载 53 张图片时(参见下面的响应标题),它通常会下载 38 到 44 张。我计算了响应头消息的数量,其中有 53 个。这似乎表明我应该收到 53 张图片。然而,我的代码每次保存的图像少于 53 张。谁能发现我的线程在哪里出错?我在没有使用线程的情况下下载了所有图像,并且效果很好。这就是让我相信这是我的线程实现的问题。
PS D:\Documents\School\RIT\Classes\Summer 2018\CSEC 380\Homework\3\Script> python .\hw3-script.py
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xAcharya.jpg.pagespeed.ic.dQLJ0KfusA.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xAbuaitah.jpg.pagespeed.ic.PFwk87Pcno.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/xCSEC.png.pagespeed.ic.Ep0KUkS94M.png HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xJake,P20Brown.jpg.pagespeed.ic.KvGLjuuU03.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png
<...many more MESSAGE SENT and RESPONSE HEADERS...>
RESPONSE HEADERS
HTTP/1.1 200 OK
Date: Sun, 12 Aug 2018 15:19:44 GMT
Server: Apache
Link: <http://www.rit.edu/gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/CSEC.png>; rel="canonical"
X-Content-Type-Options: nosniff
Accept-Ranges: bytes
Expires: Mon, 12 Aug 2019 12:51:06 GMT
Cache-Control: max-age=31536000
Etag: W/"0"
Last-Modified: Sun, 12 Aug 2018 12:51:06 GMT
X-Original-Content-Length: 13647
Content-Length: 10131
Connection: close
Content-Type: image/png
RESPONSE HEADERS
HTTP/1.1 200 OK
Date: Sun, 12 Aug 2018 15:19:44 GMT
Server: Apache
Link: <http://www.rit.edu/gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/footer-logo.png>; rel="canonical"
X-Content-Type-Options: nosniff
Accept-Ranges: bytes
Expires: Mon, 12 Aug 2019 14:09:08 GMT
Cache-Control: max-age=31536000
Etag: W/"0"
Last-Modified: Sun, 12 Aug 2018 14:09:08 GMT
X-Original-Content-Length: 19921
Content-Length: 16125
Connection: close
Content-Type: image/png
Number of image urls: 53
Number of files downloaded: 42
Time elapsed: 0:00:04.019662
我的代码:
import sys
import socket
import re
import os
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
from bs4 import BeautifulSoup
class MySocket:
def __init__(self, sock=None):
if sock is None:
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
else:
self.sock = sock
def connect(self, host, port):
self.sock.connect((host, port))
def myclose(self):
self.sock.close()
def mysend(self, msg, debug=False):
if debug:
print("MESSAGE SENT")
print(msg.decode())
self.sock.sendall(msg)
def myreceive(self, debug=False):
received = b''
buffer = 1
while True:
part = self.sock.recv(buffer)
received += part
if part == b'':
break
if debug:
print("Received...")
print(received)
return received
def get_image_urls(html):
"""
Gets all urls corresponding to images in given html data
:param data: html page to parse
:return: list of image urls
"""
soup = BeautifulSoup(html, "html.parser")
img_urls = []
for image in soup.find_all('img'):
string = str(image)
split = string.split("src=\"")
if split[1][0] == '/':
find_url = split[1].split("\"")
url = find_url[0]
img_urls.append(url)
# img_urls.append("www.rit.edu" + url)
return img_urls
def download_image(img_url):
"""
Download images with the given socket and list of urls
:param img_url: url corresponding to an image
:return: None
"""
image_socket = MySocket()
image_socket.connect("www.rit.edu", 80)
message = "GET " + img_url + " HTTP/1.0\r\n" \
"Host: www.rit.edu\r\n" \
"Accept: image/jpg, image/png\r\n\r\n"
image_socket.mysend(message.encode(), debug=True)
reply = image_socket.myreceive()
headers = reply.split(b'\r\n\r\n')[0]
print("RESPONSE HEADERS")
print(headers.decode())
print()
image = reply[len(headers)+4:]
img_name = str(len(os.listdir(".\\act1step2images"))) + img_url[-4:]
f = open(os.path.join(".\\act1step2images", img_name), 'wb')
f.write(image)
f.close()
def download_images(image_urls, directory):
if not os.path.exists(directory):
os.mkdir(directory)
pool = ThreadPool(100)
pool.map(download_image, image_urls)
pool.close()
pool.join()
def main():
start_time = datetime.now()
host = "www.rit.edu"
port = 80
message = "GET /gccis/computingsecurity/people HTTP/1.0\r\n" \
"Host: www.rit.edu\r\n" \
"Accept: */*\r\n\r\n"
part2_socket = MySocket()
part2_socket.connect(host, port)
part2_socket.mysend(message.encode())
html = part2_socket.myreceive().decode()
part2_socket.myclose()
image_urls = get_image_urls(html)
download_images(image_urls, ".\\act1step2images")
print("Number of image urls:", len(image_urls))
print("Number of files downloaded:", str(len(os.listdir(".\\act1step2images"))))
print("Time elapsed:", datetime.now() - start_time)
main()
解决方案
推荐阅读
- angular - 导出数组和导出模块有什么区别?
- java - 在具有多个参数类型约束的 Kotlin 中调用具有泛型类型的方法
- python - 如何使用 Python 跳过 n 行二进制标准输入?
- .net - 如何创建以特定顺序工作的流畅界面
- python - 将 4D 数组写入 3D 矢量化 2 维
- node.js - 如何使用纱线正确安装 expo-cli?
- spring-data-mongodb - Spring MongoTemplate 不是正在进行的事务的一部分
- php - 无法通过 PHP 但使用 Postman 接收标头信息
- ansible - 如何在 jinja2 中转义双花括号?
- java - 当我尝试将我的应用程序连接到 webServices 时失败