首页 > 解决方案 > 如何使用 apify sdk 使 puppeteer 无头运行?

问题描述

我正在尝试使用 Apify SDK 抓取页面的内容，下面的代码可以正常工作。但是，如何像 puppeteer.launch({headless: true}) 那样，强制 Apify SDK 以无头模式运行？

供您参考的代码:

/**
 * Crawls pages reachable from the start URL with an Apify `PuppeteerCrawler`
 * and collects, for every handled page, its URL, its title and the length of
 * the string returned by `page.content()`.
 *
 * Side effect: sets `process.env.APIFY_LOCAL_STORAGE_DIR` to a per-run
 * directory derived from `number`.
 *
 * @param number - Run identifier; passed to `Apify.openRequestQueue` and used
 *   as the suffix of the local storage directory.
 * @returns {Promise<{links: string[], title: string[], content: number[]}>}
 *   Parallel arrays: index `i` of each array describes the same page.
 */
async function scrape(number) {
  const output = { links: [], title: [], content: [] };
  const startUrl = "https://somepage/";
  process.env.APIFY_LOCAL_STORAGE_DIR = `/someappfolder/apify_storage/run_${number}`;

  const requestQueue = await Apify.openRequestQueue(number);
  await requestQueue.addRequest({ url: startUrl });
  const pseudoUrls = [new Apify.PseudoUrl(startUrl + "[.*]")];

  const crawler = new Apify.PuppeteerCrawler({
    requestQueue,
    handlePageFunction: async ({ request, page }) => {
      // Query the page once; the values are reused for everything below.
      const [title, html] = await Promise.all([page.title(), page.content()]);
      const save = { url: request.url, title, content: html.length };

      // Record all three fields without an `await` in between. Handlers run
      // concurrently, so interleaved pushes would let the arrays drift out of
      // alignment (links[i] belonging to a different page than title[i]).
      output.links.push(save.url);
      output.title.push(save.title);
      output.content.push(save.content);
      //sendToAirtable(save);

      console.log(`URL: ${request.url}`);
      await Apify.utils.enqueueLinks({
        page,
        selector: 'a',
        pseudoUrls,
        requestQueue,
      });
    },
    maxRequestsPerCrawl: 10,
    maxConcurrency: 10,
    minConcurrency: 2,
  });

  await crawler.run();
  return output;
}

标签: javascript, node.js, express, sdk, apify

解决方案


在与 requestQueue 相同的层级上添加 launchPuppeteerOptions: { headless: true }（参见 https://sdk.apify.com/docs/typedefs/launch-puppeteer-options#docsNav）。


推荐阅读