python - How to use multithreading when web scraping with requests
Problem Description
Below is the code I use to scrape data and save it as a pickle file. For links that return an error, the code simply records the url and appends it to another list so that I can scrape them again later.
import json
import pickle
import time

import requests

data = []
failed_url = []
error = []
start = time.asctime(time.localtime(time.time()))

# list_url is a list containing all urls
for path in list_url:
    try:
        headers = {'Content-Type': 'application/json'}
        filters = [dict(source_location="petitions_browse")]
        params = dict(q=json.dumps(dict(filters=filters)))
        result = requests.get(url=path, params=params)
        time.sleep(5)
        data_dict = result.json()
        data.append({
            'Data_dict': data_dict,
            'id': data_dict['id'],
            'title': data_dict['title'],
            'text': data_dict['description'],
            'Topic': data_dict['topic'],
            'Topic_alt': data_dict['tags'][0]['name'],
            'Created_date': data_dict['created_at'].split("T")[0],
            'Created_time': data_dict['created_at'].split("T")[1],
        })
        time.sleep(5)
    except Exception as e:
        print(e)
        error.append(e)
        failed_url.append(path)
        continue
    # checkpoint every 50 successfully scraped urls
    if len(data) % 50 == 0:
        pickle.dump(data, open("data_checkpoint" + ".pkl", "wb"))

pickle.dump(data, open("data" + ".pkl", "wb"))
pickle.dump(failed_url, open("failed_url" + ".pkl", "wb"))

end = time.asctime(time.localtime(time.time()))
print('start time:', start, 'end time:', end)
print('Failed Links:', failed_url)
I adapted it for multiprocessing with the following code:
from multiprocessing import Pool

def get_data(url):
    try:
        headers = {'Content-Type': 'application/json'}
        filters = [dict(source_location="petitions_browse")]
        params = dict(q=json.dumps(dict(filters=filters)))
        result = requests.get(url=url, params=params)
        data_dict = result.json()
        return {
            'Data_dict': data_dict,
            'id': data_dict['id'],
            'title': data_dict['title'],
            'text': data_dict['description'],
            'Topic': data_dict['topic'],
            'Topic_alt': data_dict['tags'][0]['name'],
            'Created_date': data_dict['created_at'].split("T")[0],
            'Created_time': data_dict['created_at'].split("T")[1],
        }
        # time.sleep(5)
    except Exception as e:
        print(e)
        try:
            # data_dict may be undefined if the request itself failed
            return {'Data_dict': data_dict}
        except Exception as c:
            return ("failed_url", url)
        # continue

with Pool(10) as pool:
    result = pool.map(get_data, url_list)
print(result)

failed_url, data = [], []
for res in result:
    if isinstance(res, tuple):
        failed_url.append(res[1])
    else:
        data.append(res)

pickle.dump(data, open("data" + ".pkl", "wb"))
pickle.dump(failed_url, open("failed_url_test" + ".pkl", "wb"))
This adaptation does not seem to work. Even for a small number of links it keeps running for a very long time, so something appears to be wrong, since for the same number of links the code without multithreading is faster.
Solution
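Because this workload is dominated by waiting on HTTP responses, a thread pool is usually a better fit than multiprocessing.Pool: threads share memory, start quickly, and requests releases the GIL while waiting on the network. Below is a minimal sketch using the standard-library concurrent.futures.ThreadPoolExecutor. The function name fetch_petition, the urls argument, and the 30-second timeout are illustrative assumptions; the filter/params handling and the returned fields mirror the question's code.

import json
import pickle

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_petition(url):
    # Same request as in the question; a timeout keeps a stuck request
    # from blocking a worker indefinitely.
    filters = [dict(source_location="petitions_browse")]
    params = dict(q=json.dumps(dict(filters=filters)))
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    data_dict = resp.json()
    return {
        'Data_dict': data_dict,
        'id': data_dict['id'],
        'title': data_dict['title'],
        'text': data_dict['description'],
        'Topic': data_dict['topic'],
        'Topic_alt': data_dict['tags'][0]['name'],
        'Created_date': data_dict['created_at'].split("T")[0],
        'Created_time': data_dict['created_at'].split("T")[1],
    }

def scrape_all(urls, workers=10):
    data, failed_url = [], []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # Submit every url and map each future back to its url.
        futures = {pool.submit(fetch_petition, u): u for u in urls}
        for fut in as_completed(futures):
            url = futures[fut]
            try:
                data.append(fut.result())
            except Exception as e:
                print(url, e)
                failed_url.append(url)
    pickle.dump(data, open("data.pkl", "wb"))
    pickle.dump(failed_url, open("failed_url.pkl", "wb"))
    return data, failed_url

Handling exceptions in the caller via as_completed records failures per url without aborting the whole pool, which replaces the tuple-based error signalling in the question. Note also that the sequential version slept 5 seconds between requests; with 10 concurrent workers and no sleep, the request rate to the server is much higher, which may itself trigger errors or rate limiting.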