首页 > 解决方案 > 使用 Puppeteer 和 Node.js 缩短网页抓取作业的执行时间

问题描述

我编写了一个作业脚本,用于定期抓取网页并将部分信息保存到 MongoDB 数据库中。我希望尽可能提高性能,目前能够做到每 10 秒执行一次脚本。不过,我想进一步缩短这个间隔,如果可能的话,控制在 1 到 10 秒之间。问题是,当我缩短间隔时,代码会抛出以下警告,并且部分执行会堆积起来、迟迟无法完成:

(node:9472) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 exit listeners added. Use emitter.setMaxListeners() to increase limit

有没有办法改进代码?

const $ = require('cheerio');
const MarketModel = require('./models/marketModel');
const mongoose = require('mongoose');
const puppeteer = require('puppeteer');
var schedule = require('node-schedule');
const {
    Cluster
} = require('puppeteer-cluster');


// Database connection: open the default mongoose connection to the local
// "Tradheo" database and log any error the connection emits.
const DATABASE_URI = 'mongodb://localhost:27017/Tradheo';
const CONNECTION_OPTIONS = { useNewUrlParser: true };

mongoose.connect(DATABASE_URI, CONNECTION_OPTIONS);

mongoose.connection.on('error', function logConnectionError(error) {
    console.log(error);
});

// Make mongoose use the native Promise implementation.
mongoose.Promise = global.Promise;

/**
 * Extracts one company record per row of the quotes table found in `html`.
 *
 * Cells 2-8 of each row are read as text (last, high, low, change,
 * changePerCent, volume, time). The company name is taken from the i-th
 * name link in the whole document, exactly as the original per-row lookup did.
 *
 * @param {string} html - Full HTML content of a market page.
 * @returns {Array<Object>} One record per table row, with `purchase` and
 *     `sale` initialised to false.
 */
const parseCompanies = (html) => {
    // Select the name links once; re-running this selector against the raw
    // HTML string for every row would re-parse the document each time.
    const nameLinks = $("td[class='bold left noWrap elp plusIconTd'] > a", html);
    const companies = [];

    $("table[class='genTbl closedTbl crossRatesTbl elpTbl elp30'] > tbody > tr", html).each((i, elem) => {
        const cells = $("td", elem);
        companies.push({
            name: nameLinks.eq(i).html(),
            last: cells.eq(2).text(),
            high: cells.eq(3).text(),
            low: cells.eq(4).text(),
            change: cells.eq(5).text(),
            changePerCent: cells.eq(6).text(),
            volume: cells.eq(7).text(),
            time: cells.eq(8).text(),
            purchase: false,
            sale: false
        });
    });

    return companies;
};

/**
 * Scrapes the Spanish and German market pages with a two-page puppeteer
 * cluster and, once both pages have been parsed, stores the result as a
 * single MarketModel document.
 *
 * @returns {Promise<void>} Resolves after the cluster has been closed and the
 *     save (if any) has finished. Save errors are logged, not thrown.
 */
const getMarketData = async () => {
    console.log("Web scraping to get market data...")

    const markets = []
    const marketSpain = {
        country: 'Spain',
        name: 'IBEX 35',
        companies: []
    }
    const marketGermany = {
        country: 'Germany',
        name: 'DAX',
        companies: []
    }

    // NOTE(review): a new cluster (and browser) is launched on every call;
    // overlapping calls presumably account for the accumulating "exit"
    // listeners reported by MaxListenersExceededWarning - confirm.
    const cluster = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_PAGE,
        maxConcurrency: 2,
    });

    try {
        await cluster.task(async ({
            page,
            data: url
        }) => {
            // Interception has to be enabled BEFORE navigating, otherwise the
            // page has already requested its sub-resources.
            await page.setRequestInterception(true);
            page.on('request', request => {
                if (request.resourceType() === 'document') {
                    request.continue();
                } else {
                    request.abort();
                }
            });

            // The URL is the first argument of goto; options come second.
            await page.goto(url, {
                waitUntil: 'domcontentloaded'
            });

            const html = await page.content();
            // Any URL other than the Spanish one is treated as the German page.
            const market = url === 'https://uk.investing.com/equities/spain' ?
                marketSpain :
                marketGermany;
            console.log(`${market.country} data page content loaded`);
            market.companies.push(...parseCompanies(html));
            markets.push(market);
        });

        // NOTE(review): url1 and url2 are not defined in the visible source;
        // they are expected to be declared elsewhere in the file.
        cluster.queue(url1);
        cluster.queue(url2);

        await cluster.idle();
    } finally {
        // Always release the browser, even if queueing or idling failed.
        await cluster.close();
    }

    // Persist only when both pages were scraped, after all tasks have finished.
    if (markets.length === 2) {
        try {
            await MarketModel.create({
                markets,
            });
            console.log("Done!")
        } catch (error) {
            console.log(error);
        }
    }
}


// True while a scrape started by the scheduler has not finished yet.
let isScraping = false;

/**
 * Scheduled scraping job. In node-schedule's six-field cron format the
 * expression fires every 10 seconds during hours 8-17, Monday to Friday.
 * A tick is skipped when it falls outside the exchange's opening hours or
 * when the previous scrape is still running, so runs cannot pile up.
 */
const j = schedule.scheduleJob('*/10 * 8-17 * * 1-5', async function () {
    const now = new Date();
    const minutesOfDay = now.getHours() * 60 + now.getMinutes();
    //Checks that time is between 8:30 - 17:35 (schedule of the stock exchange)
    const isMarketOpen = minutesOfDay >= 8 * 60 + 30 && minutesOfDay <= 17 * 60 + 35;
    if (!isMarketOpen || isScraping) {
        return;
    }

    isScraping = true;
    try {
        await getMarketData();
    } catch (error) {
        // Log instead of leaving an unhandled promise rejection.
        console.log(error);
    } finally {
        isScraping = false;
    }
});

更新:我做了一些改进,例如将 waitUntil 选项设置为 "domcontentloaded",并启用请求拦截,以避免等待图片以及 HTML 文档之外的其他任何资源加载。但是,这似乎仍不足以达到目标。

标签: node.js, performance, web-scraping, puppeteer, cheerio

解决方案


推荐阅读