首页 > 技术文章 > 崔庆才——Ajax今日头条多进程爬虫

Knight66666 2020-03-30 11:47 原文

# 从AJAX入手解决主页面无数据问题
import requests
from hashlib import md5
import os
from config import *
from requests.exceptions import RequestException
from urllib.parse import urlencode
import json
from bs4 import BeautifulSoup
from multiprocessing import Pool
import re
import pymongo
# Module-level MongoDB connection; MONGO_URL / MONGO_DB come from config.py.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# NOTE: the Mongo server must be running and the target database selected
# beforehand, otherwise the connection/insert below will fail.
def save_to_mongo(ret_dict):
    """Insert one result document into MongoDB.

    :param ret_dict: dict with at least a "title" key (built by parse_page_detail)
    :return: True if the insert succeeded, False otherwise
    """
    # Collection.insert() is deprecated since pymongo 3.0 and removed in 4.0;
    # insert_one() is the supported API. InsertOneResult is truthy on success.
    if db[MONGO_TABLE].insert_one(ret_dict):
        print("插入数据到数据库成功", ret_dict["title"])
        return True
    return False
def get_page_index(keyword, offset, headers, cookie):
    """Fetch one page of Toutiao search results via the Ajax endpoint.

    :param keyword: search term
    :param offset: paging offset (multiples of 20)
    :param headers: request headers dict
    :param cookie: cookies dict
    :return: raw JSON response text on HTTP 200, otherwise None
    """
    # Query parameters mirror the browser's Ajax request; timestamp and
    # _signature are captured values and may expire.
    params = {
        'aid': "24",
        'app_name': "web_search",
        'offset': offset,
        'format': "json",
        'keyword': keyword,
        'autoload': "true",
        'count': "20",
        'en_qc': "1",
        'cur_tab': "1",
        'from': 'search_tab',
        'pd': "synthesis",
        'timestamp': "1585525783382",
        '_signature': "MqqdBAAgEBC1BxnpKjcMhjKr3BAAGwyzftELDyc2Vi7Ug4gGwX7WlzBBtoBfhTP9rT-Eha5MhBFoxSsOVuYXGF4F1L2sGmX9A07QT2rsGhAXHp38jFF3LG2nRBQu9o52X09"
    }
    try:
        # urlencode turns the dict into a query string
        response = requests.get(
            "https://www.toutiao.com/api/search/content/?" + urlencode(params),
            headers=headers,
            cookies=cookie,
        )
    except RequestException:
        print("Wrong!请求索引失败")
        return None
    if response.status_code != 200:
        return None
    return response.text
# 拿url
def parse_page_index(html):
    """Yield article URLs from the search-results JSON.

    :param html: JSON text returned by get_page_index, or None on failure
    :yields: each "article_url" found in the "data" array

    Fix: get_page_index returns None on request failure, and the original
    passed that straight to json.loads, raising TypeError. Guard first so a
    failed index request simply yields nothing.
    """
    if not html:
        return
    data = json.loads(html)
    if data and "data" in data:
        for item in data["data"]:
            # Not every result item is an article; skip those without a URL.
            if "article_url" in item:
                yield item.get("article_url")
# 拿组图细节
def get_page_detail(url, headers, cookie):
    """GET an article page and return its decoded HTML.

    :param url: article URL from parse_page_index
    :param headers: request headers dict
    :param cookie: cookies dict
    :return: decoded page body on HTTP 200, otherwise None
    """
    try:
        resp = requests.get(url, headers=headers, cookies=cookie)
    except RequestException:
        print("get函数出错")
        return None
    # Decode bytes to text only on success; non-200 yields None.
    return resp.content.decode() if resp.status_code == 200 else None

# 下载图片
def download(url, headers, cookie):
    """Download one image and hand its bytes to saveimg.

    :param url: image URL
    :param headers: request headers dict
    :param cookie: cookies dict
    :return: always None (download is a side-effect-only helper)
    """
    print("正在下载图片", url)
    try:
        resp = requests.get(url, headers=headers, cookies=cookie)
    except RequestException:
        print("请求出错")
        return None
    if resp.status_code == 200:
        saveimg(resp.content)
    return None
# 保存图片
def saveimg(content):
    """Write image bytes to the working directory, named by MD5 digest.

    :param content: raw image bytes

    Using the MD5 of the content as the filename deduplicates images: the
    same bytes always map to the same path, and an existing file is skipped.
    """
    digest = md5(content).hexdigest()
    file_path = "{0}/{1}.{2}".format(os.getcwd(), digest, "jpg")
    if os.path.exists(file_path):
        return
    with open(file_path, "wb") as fp:
        fp.write(content)
def parse_page_detail(html, url, headers, cookie):
    """Extract the gallery images from an article page and download them.

    :param html: article page HTML
    :param url: the article's URL (echoed back in the result)
    :param headers: request headers dict, forwarded to download()
    :param cookie: cookies dict, forwarded to download()
    :return: {'title', 'url', 'images'} when a gallery is found, else None

    Fixes: the regex is now a raw string (``\\(`` in a plain string is an
    invalid escape sequence and warns on modern Python) and the ``.`` in
    ``JSON.parse`` is escaped so it only matches a literal dot.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_nodes = soup.select('title')
    title = title_nodes[0].get_text() if title_nodes else ''

    # The gallery data is embedded in the page as JSON.parse("...").
    images_pattern = re.compile(r'gallery: JSON\.parse\("(.*)"\)', re.S)
    match = images_pattern.search(html)
    if not match:
        return None

    # Strip the escaping layer: drop backslashes, then restore the
    # forward slashes that were encoded as u002F.
    raw = match.group(1).replace("\\", "").replace("u002F", "/")
    data = json.loads(raw)
    if not data or 'sub_images' not in data:
        return None

    images = [item.get('url') for item in data.get('sub_images')]
    for image in images:
        download(image, headers, cookie)
    return {
        'title': title,
        'url': url,
        'images': images,
    }

def main(offset):
    """Crawl one search-results page: list articles, parse, store.

    :param offset: paging offset for the index request
    """
    # NOTE(review): placeholder UA/cookie values — fill in real ones before running.
    headers = {'user-agent': 'xxx'}
    cookie = {'cookie': 'xxx'}
    index_html = get_page_index("街拍", offset, headers, cookie)
    for article_url in parse_page_index(index_html):
        detail_html = get_page_detail(article_url, headers, cookie)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, article_url, headers, cookie)
        if result:
            print(result)
            save_to_mongo(result)
if __name__ == "__main__":
    # One offset per results page, 20 items each; GROUP_START/GROUP_END
    # come from config.py.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Fix: the original never closed the pool, leaking worker processes;
        # close() + join() ensures children exit cleanly.
        pool.close()
        pool.join()

推荐阅读