首页 > 技术文章 > 70行代码爬取 查字典_笑话(多线程)

Ly-233 2020-02-19 15:37 原文

逻辑流程

首先是一些主要参数, 其中包括需要爬取的站点链接, 请求用的 headers, 从 Queue 取数据的超时时间, 以及抓取内容页面的线程数量

# Site root; relative article links and image paths are appended to it.
index = 'https://www.chazidian.com'
# List-page URL template; the two `{}` placeholders are filled via str.format.
list_page = '%s/xiaohua{}/{}' % index
# Request headers sent with every HTTP call (browser-like User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.87 Safari/537.36'
    ),
}
# Seconds a worker waits on the queue before it gives up.
timeout = 1
# Number of content-page worker threads to start.
thread_quantity = 5

get_url:  解析列表页中的url, 并传入Queue队列中

def get_url(queue):
    """Crawl the joke list pages and enqueue every article URL found.

    Requests list pages 1 through 75 (built from ``list_page``), selects the
    ``div.arctcot h3 a`` anchors on each page and puts ``index`` plus the
    anchor's ``href`` onto *queue*.

    Args:
        queue: Queue that receives the article URLs.
    """
    for page in range(1, 75 + 1):
        response = requests.get(list_page.format('', page), headers=headers)
        anchors = PyQuery(response.text)('div.arctcot h3 a')
        for anchor in anchors.items():
            href = anchor.attr('href')
            # attr() yields None for an anchor without href; concatenating it
            # would raise TypeError and terminate the producer thread.
            if href:
                queue.put(index + href)

get_content:  从Queue队列中获取url, 解析内容页面, 并打印出来. 这里需要用while循环不断地获取页面, 并使用try/except捕获队列的超时异常(Empty)

def get_content(queue):
    """Consume article URLs from *queue*, scrape each page and print the joke.

    Loops until ``queue.get`` raises ``Empty`` after ``timeout`` seconds, then
    prints a separator and a completion message and returns. For every URL the
    ``div.arctcot`` section is parsed; a relative image source (prefixed with
    ``index``) is printed when an image is present, otherwise the text content
    is printed unless it is contained in the title.

    Args:
        queue: Queue supplying the article URLs to fetch.
    """
    while True:
        # Keep the try minimal: only the queue read signals termination.
        try:
            url = queue.get(timeout=timeout)
        except Empty:
            print('-' * 100)
            print('抓取完毕')
            return

        if 'https://' not in url:
            continue

        # A single failed request must not kill the whole worker thread.
        try:
            response = requests.get(url, headers=headers)
        except requests.RequestException as exc:
            print('抓取失败:', url, exc)
            continue

        doc = PyQuery(response.text)('div.arctcot')
        title = doc('a').text()
        content = doc('div.article_detail').text()
        img = doc('div.article_detail img').attr('src')

        if title and img:
            # Only sources without 'http://' are treated as site-relative.
            # NOTE(review): absolute 'https://' sources are not detected by
            # this check and would be prefixed too -- confirm intent.
            if 'http://' not in img:
                print(url)
                print({title: index + img})
        elif title and content:
            if content not in title:
                print(url)
                print({title: content})

__main__:  进行线程开启和运行时间的计算

if __name__ == '__main__':
    # Entry point: one producer thread fills the queue with article URLs
    # while `thread_quantity` consumer threads scrape them concurrently.
    start = time.time()
    queue_ = Queue(maxsize=1000)
    list_ = threading.Thread(target=get_url, args=(queue_,))
    list_.start()

    workers = [
        threading.Thread(target=get_content, args=(queue_,))
        for _ in range(thread_quantity)
    ]
    # Start every worker before joining any of them; joining inside the
    # start loop would run the workers one after another.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    list_.join()
    end = time.time()
    # Subtract the idle wait the workers spend before hitting the queue timeout.
    print('用时: ', end - start - timeout)

下面是所有代码

from queue import Queue, Empty
import threading
import requests
from pyquery import PyQuery
import time

# Site root; relative article links and image paths are appended to it.
index = 'https://www.chazidian.com'
# List-page URL template; the two `{}` placeholders are filled via str.format.
list_page = '%s/xiaohua{}/{}' % index
# Request headers sent with every HTTP call (browser-like User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.87 Safari/537.36'
    ),
}
# Seconds a worker waits on the queue before it gives up.
timeout = 1
# Number of content-page worker threads to start.
thread_quantity = 5


def get_url(queue):
    """Crawl the joke list pages and enqueue every article URL found.

    Requests list pages 1 through 75 (built from ``list_page``), selects the
    ``div.arctcot h3 a`` anchors on each page and puts ``index`` plus the
    anchor's ``href`` onto *queue*.

    Args:
        queue: Queue that receives the article URLs.
    """
    for page in range(1, 75 + 1):
        response = requests.get(list_page.format('', page), headers=headers)
        anchors = PyQuery(response.text)('div.arctcot h3 a')
        for anchor in anchors.items():
            href = anchor.attr('href')
            # attr() yields None for an anchor without href; concatenating it
            # would raise TypeError and terminate the producer thread.
            if href:
                queue.put(index + href)


def get_content(queue):
    """Consume article URLs from *queue*, scrape each page and print the joke.

    Loops until ``queue.get`` raises ``Empty`` after ``timeout`` seconds, then
    prints a separator and a completion message and returns. For every URL the
    ``div.arctcot`` section is parsed; a relative image source (prefixed with
    ``index``) is printed when an image is present, otherwise the text content
    is printed unless it is contained in the title.

    Args:
        queue: Queue supplying the article URLs to fetch.
    """
    while True:
        # Keep the try minimal: only the queue read signals termination.
        try:
            url = queue.get(timeout=timeout)
        except Empty:
            print('-' * 100)
            print('抓取完毕')
            return

        if 'https://' not in url:
            continue

        # A single failed request must not kill the whole worker thread.
        try:
            response = requests.get(url, headers=headers)
        except requests.RequestException as exc:
            print('抓取失败:', url, exc)
            continue

        doc = PyQuery(response.text)('div.arctcot')
        title = doc('a').text()
        content = doc('div.article_detail').text()
        img = doc('div.article_detail img').attr('src')

        if title and img:
            # Only sources without 'http://' are treated as site-relative.
            # NOTE(review): absolute 'https://' sources are not detected by
            # this check and would be prefixed too -- confirm intent.
            if 'http://' not in img:
                print(url)
                print({title: index + img})
        elif title and content:
            if content not in title:
                print(url)
                print({title: content})


if __name__ == '__main__':
    # Entry point: one producer thread fills the queue with article URLs
    # while `thread_quantity` consumer threads scrape them concurrently.
    start = time.time()
    queue_ = Queue(maxsize=1000)
    list_ = threading.Thread(target=get_url, args=(queue_,))
    list_.start()

    workers = [
        threading.Thread(target=get_content, args=(queue_,))
        for _ in range(thread_quantity)
    ]
    # Start every worker before joining any of them; joining inside the
    # start loop would run the workers one after another.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    list_.join()
    end = time.time()
    # Subtract the idle wait the workers spend before hitting the queue timeout.
    print('用时: ', end - start - timeout)
所有代码

 

推荐阅读