How to make an async scraping program based on the aiohttp lib run faster?

Problem description

I am using async and aiohttp to scrape images from web pages, but when the program runs, the scraping is nowhere near as fast as I expected.

Is there anything in the code I can improve? I use await many times inside the for loops. Is that the right way to handle this?

import asyncio
import os

import aiofiles
import aiohttp
from parsel import Selector

# HEADERS, SIMPLE_HEADERS, PROXY_STR and START_URL are defined elsewhere in my project.

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url,
                               headers=HEADERS,
                               proxy=PROXY_STR,
                               ) as response:
            text = await response.text()
            resp = Selector(text=text)
            nodes = resp.xpath('//div[@class="kl1-2"]')
            for node in nodes:
                next_url = node.xpath('.//div[@class="kl1-2a2"]/a/@href').extract_first()
                title = node.xpath('.//div[@class="kl1-2a2"]/a/@title').extract_first()
                await detail(session=session, next_url=next_url, title=title)
                print('next page')


async def detail(**kwargs):
    session = kwargs['session']
    next_url = kwargs['next_url']
    title = kwargs['title']
    print(next_url)
    print(title)
    async with session.get(
            url=next_url,
            headers=HEADERS,
            proxy=PROXY_STR,
    ) as response:
        text = await response.text()
        resp = Selector(text=text)
        nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract()
        nodes = list(set(nodes))
        for img in nodes:
            await download_img(session=session, url=img, title=title)
            print('next image')


async def download_img(**kwargs):
    url = kwargs['url']
    title = kwargs['title']
    save_file = f'{title}-{os.path.basename(url)}'  # assumed save path; save_file was undefined in the original snippet

    try:
        conn = aiohttp.TCPConnector(ssl=False)  # avoid SSL certificate errors
        async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
            async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
                if 200 <= response.status < 300:
                    async with aiofiles.open(save_file, 'wb') as f:
                        await f.write(await response.read())

    except Exception:
        return


async def main():
    total_page = 3640
    for page in range(0, total_page, 35):
        url = START_URL.format(page=page)
        await fetch(url)
        await asyncio.sleep(0)
        print(f'downloading page {page}-')


loop = asyncio.get_event_loop()
loop.run_until_complete(main())

Tags: python, python-asyncio, aiohttp

Solution
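
The code is slow because nothing actually runs concurrently. main awaits fetch one listing page at a time, fetch awaits detail one link at a time, and detail awaits download_img one image at a time, so every request blocks until the previous one has finished; the program is effectively a synchronous scraper written in async syntax. On top of that, download_img opens a brand-new ClientSession (and with it a new connection pool) for every single image, which is expensive; the aiohttp documentation recommends creating one session and reusing it for all requests.

The fix is to build the coroutines first and run them together with asyncio.gather, while capping the number of in-flight requests with an asyncio.Semaphore so the target site and the proxy are not overwhelmed. Below is a minimal sketch of the same pipeline restructured that way. It assumes HEADERS, SIMPLE_HEADERS, PROXY_STR and START_URL are the constants from the question; MAX_CONCURRENCY and the save_file naming scheme are illustrative additions, not part of the original code.

import asyncio
import os

import aiofiles
import aiohttp
from parsel import Selector

MAX_CONCURRENCY = 20  # illustrative cap; tune it for the target site and proxy

async def fetch(session, sem, url):
    # Parse one listing page, then crawl all of its detail pages concurrently.
    async with session.get(url, headers=HEADERS, proxy=PROXY_STR) as response:
        resp = Selector(text=await response.text())
    tasks = []
    for node in resp.xpath('//div[@class="kl1-2"]'):
        next_url = node.xpath('.//div[@class="kl1-2a2"]/a/@href').extract_first()
        title = node.xpath('.//div[@class="kl1-2a2"]/a/@title').extract_first()
        tasks.append(detail(session, sem, next_url, title))
    await asyncio.gather(*tasks)

async def detail(session, sem, next_url, title):
    # Parse one detail page, then download all of its images concurrently.
    async with session.get(next_url, headers=HEADERS, proxy=PROXY_STR) as response:
        resp = Selector(text=await response.text())
    imgs = set(resp.xpath('//div[@class="kl2-1"]//img/@src').extract())
    await asyncio.gather(*(download_img(session, sem, img, title) for img in imgs))

async def download_img(session, sem, url, title):
    save_file = f'{title}-{os.path.basename(url)}'  # illustrative naming scheme
    async with sem:  # cap the number of simultaneous downloads
        try:
            async with session.get(url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
                if 200 <= response.status < 300:
                    async with aiofiles.open(save_file, 'wb') as f:
                        await f.write(await response.read())
        except aiohttp.ClientError:
            return

async def main():
    total_page = 3640
    sem = asyncio.Semaphore(MAX_CONCURRENCY)  # created inside the running loop
    conn = aiohttp.TCPConnector(ssl=False, limit=MAX_CONCURRENCY)
    # One session, reused by every request in the crawl.
    async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
        await asyncio.gather(*(fetch(session, sem, START_URL.format(page=page))
                               for page in range(0, total_page, 35)))

asyncio.run(main())

With this structure the event loop can keep many requests in flight at once instead of exactly one; the semaphore and the connector's limit parameter are what bound the concurrency, not the awaits inside the loops. If the site rate-limits aggressively, you can also wrap the page and detail fetches in the same semaphore, or gather the listing pages in smaller batches.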

