首页 > 解决方案 > 网络实时抓取流聊天(puppeteer.js)

问题描述

我想通过网页抓取的方式,实时获取直播流中的聊天内容。

我尝试在 puppeteer 的 .then() 回调函数中加入一个 while 循环,但似乎并不起作用,某些写法甚至会让整个程序无法运行。

我能够完成第一次抓取,但无论哪种写法,程序都会直接结束,而不会继续执行我实现的 while 循环。

没有while循环的工作代码

const puppeteer = require ('puppeteer');

// One-shot scrape: launch a browser, read the chat comments currently rendered
// on the stream page, print them, and shut the browser down again.
puppeteer
  .launch ()
  .then (async browser => {
    // try/finally guarantees the browser process is closed even when
    // navigation or scraping fails; otherwise Chromium would be left running.
    try {
      // open a new page and navigate to the live stream
      const page = await browser.newPage ();
      await page.goto ('https://www.younow.com/Ken_Nara24');
      await page.waitForSelector ('body');

      // This callback is executed inside the page, so it may only use
      // browser globals and must return serializable data.
      const getComments = await page.evaluate (() => {
        // innerText of the first descendant matching `selector`,
        // or '' when the comment has no such element.
        const textOf = (root, selector) => {
          const node = root.querySelector (selector);
          return node === null ? '' : node.innerText;
        };

        const scrapeItems = [];
        document.body.querySelectorAll ('.comment').forEach (item => {
          scrapeItems.push ({
            commentAuthor: textOf (item, 'div.user-card__header.mini-profile-launcher'),
            commentContent: textOf (item, 'div.user-card__body.ng-star-inserted'),
          });
        });

        return {
          "userComments": scrapeItems,
        };
      });

      // output the scraped data
      console.log (getComments);
    } finally {
      // close the browser on success and on failure alike
      await browser.close ();
    }
  })
  // report launch, navigation, scraping, or shutdown errors
  .catch (function (err) {
    console.error (err);
  });

所有让这段逻辑循环执行的尝试都失败了。我找不到任何文档或以往的问题/示例来明确说明这类事情能否实现、该如何实现。我自己也做过一些尝试,但没有一种写法能够正常运行。

我是不是遗漏了什么重要的东西?我只是想监听一个网页,然后每 3-5 秒重新抓取一次。

标签: javascript, node.js, web-scraping, puppeteer

解决方案


如果您仍然需要帮助,可以尝试这种方式。

const puppeteer = require("puppeteer");
/* Guard flag: true while a scrape is in progress. scraper() checks it on entry
   and resets it to false once the run has finished. */
let pageScraping = false;

/**
 * Scrapes the chat comments currently shown on the stream page, logs them,
 * and schedules the next scrape 5 seconds after this one finishes.
 *
 * Each run launches its own headless browser and always closes it again.
 * The module-level `pageScraping` flag is used as a guard: if a run is already
 * in progress the call returns immediately without doing anything.
 * Errors are logged (message only) and never propagate to the caller, so one
 * failed run does not stop the polling.
 *
 * @returns {Promise<void>} resolves once the run is done and the next one is scheduled
 */
const scraper = async () => {
  if (pageScraping === true) return; /* a scrape is already running */
  pageScraping = true;

  const pageUrl = 'https://www.younow.com/Ken_Nara24';
  let browser;

  try {
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    /* wait for chat to be visible */
    await page.waitForSelector('.chat', { visible: true, timeout: 60000 });

    /* This callback runs inside the page: only browser globals are available
       and the return value must be serializable. */
    const getComments = await page.evaluate(() => {
      /* innerText of the first descendant matching `selector`, or '' when the
         comment has no such element, so that one incomplete comment does not
         abort the whole scrape. */
      const textOf = (root, selector) => {
        const node = root.querySelector(selector);
        return node === null ? '' : node.innerText;
      };

      const scrapeComments = [];
      document.querySelectorAll('.comment').forEach(comment => {
        scrapeComments.push({
          'commentAuthor': textOf(comment, 'div[class="user-card__header mini-profile-launcher"]'),
          'commentContent': textOf(comment, 'div[class="user-card__body ng-star-inserted"]'),
        });
      });

      return { 'userComments': scrapeComments };
    });

    console.log(getComments); /* log comments */
  } catch (err) {
    console.log(err.message);
  } finally {
    try {
      if (browser) { /* only close a browser that was actually launched */
        await browser.close();
        console.log('closing browser');
      }
    } catch (err) {
      console.log(err.message);
    } finally {
      /* Always release the guard and re-arm the timer, even if closing failed;
         otherwise the flag would stay true and polling would stop for good.
         setTimeout returns a timer handle, not a promise, so it is not awaited. */
      pageScraping = false;
      setTimeout(scraper, 5000); /* wait 5 seconds before re-scraping */
    }
  }
};

/* Start the polling loop: the first scrape runs after 5 seconds, and scraper
   re-arms its own timer each time it finishes. */
setTimeout(scraper, 5000);

推荐阅读