首页 > 解决方案 > 为什么我会在 alpine/docker 上获得僵尸傀儡进程?

问题描述

这是我的 puppeteer 控制器的全部内容:

import { Readability } from '@mozilla/readability';
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
const summarize = require('summarize');
const keyword_extractor = require('keyword-extractor');
const amex = require('../../csv/AMEX.json');
const nasdaq = require('../../csv/NASDAQ.json');
const nyse = require('../../csv/NYSE.json');
const cryptotickers = require('../../csv/cryptos.json');

puppeteer.use(StealthPlugin());

class Reader {
  constructor() {
    this.browser = null;
  }

  async getLink(link) {
    this.browser = await puppeteer.launch({
      devtools: false,
      headless: true,
      // product: 'firefox',
      executablePath: '/usr/bin/chromium-browser',
      args: [
        '--proxy-server=' + process.env.PROXY_HOST,
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--single-process',
        '--disable-setuid-sandbox',
        '--no-zygote',
        '--shm-size=4gb',
        '--disable-infobars',
        '--ignore-certifcate-errors',
        '--ignore-certifcate-errors-spki-list',
        // '--user-agent="Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"'
      ],
    });

    const { htm, title } = await this.spa(link);

    if (!htm) {
      await this.browser.close();
      return;
    }

    const text = txt(htm, link);
    const data = Object.assign({}, text);
    const parts = new URL(link);

    if (!data.title) {
      data.title = title;
    }

    data.summary = summary(data.content, data.title);
    data.tickers = tickers(data.content, data.textContent);
    data.cryptos = cryptos(data.content, data.textContent);
    data.meta = getMeta(htm);

    if (!data.title && data.meta.title) {
      data.title = data.meta.title;
    }

    data.url = link;
    data.htm = htm;
    data.host = parts.host;
    data.text = data.textContent;
    delete data.textContent;

    console.log('data fetched: ' + link);
    await this.browser.close();
    // await this.browser.disconnect();

    return data;
  }

  async spa(url) {
    let htm;
    let title;

    try {
      let page = await this.browser.newPage();

      await page.setRequestInterception(true);
      page.on('request', (req) => {
        if (
          req.resourceType() === 'stylesheet' ||
          req.resourceType() === 'font' ||
          req.resourceType() == 'image'
        ) {
          req.abort();
        } else {
          req.continue();
        }
      });

      await page.authenticate({
        username: process.env.PROXY_USER,
        password: process.env.PROXY_PASS,
      });

      await page.setViewport({ width: 800, height: 600 });
      // await page.goto(url, { waitUntil: 'networkidle2' });
      await page.goto(url, { waitUntil: 'domcontentloaded' });
      await this.autoScroll(page);
      await page.evaluate(() => window.scrollTo(0, 50));
      htm = await page.content();
      title = await page.evaluate(() => document.title);

      if (htm.indexOf('<title') === -1) {
        htm = await page.evaluate(() => document.documentElement.outerHTML);
      }

      console.log(title, 'title');
    } catch (err) {
      console.error(err, url);
    }

    return { htm, title };
  }

  async autoScroll(page) {
    await page.evaluate(async () => {
      new Promise((resolve, reject) => {
        try {
          const maxScroll = Number.MAX_SAFE_INTEGER;
          let lastScroll = 0;
          const interval = setInterval(() => {
            window.scrollBy(0, document.body.offsetHeight);
            const { scrollTop } = document.documentElement;
            if (scrollTop === maxScroll || scrollTop === lastScroll) {
              clearInterval(interval);
              resolve();
            } else {
              lastScroll = scrollTop;
            }
          }, 1000);
        } catch (error) {
          reject(error);
        }
      }).catch((error) => {
        console.error(error); // add catch here
      });
    });

    // await page.evaluate(async () => {
    //     await new Promise((resolve, reject) => {
    //         let totalHeight = 0;
    //         let distance = 300;
    //         let timer = setInterval(() => {
    //             const scrollHeight = document.body.scrollHeight;
    //             window.scrollBy(0, distance);
    //             totalHeight += distance;

    //             if(totalHeight >= scrollHeight){
    //                 clearInterval(timer);
    //                 resolve();
    //             }
    //         }, 100);
    //     });
    // });
  }
} // end Class Reader

async function summarization2(text) {
  let res;
  let data;

  console.log(text, process.env.DEEPAI_KEY);

  try {
    const body = new FormData();
    body.append('text', text);

    res = await fetch(`https://api.deepai.org/api/summarization`, {
      method: 'POST',
      body,
      headers: {
        'api-key': process.env.DEEPAI_KEY,
      },
    });

    data = await res.json();
  } catch (err) {
    console.error(err);
  }

  return data;
}
async function sentiment(text) {
  return await deepai.callStandardApi('sentiment-analysis', { text });
}

async function summarization(text) {
  return await deepai.callStandardApi('summarization', { text }).catch(console.error);
}

function summary(text, title) {
  if (!text) return {};
  const summary = summarize(`${title} - ${text}`);

  summary.topics = keyword_extractor
    .extract(`${title} - ${text}`, {
      language: 'english',
      remove_digits: true,
      return_changed_case: true,
      remove_duplicates: false,
    })
    .map(process);

  const counts = summary.topics.reduce(
    (acc, value) => ({
      ...acc,
      [value]: (acc[value] || 0) + 1,
    }),
    {},
  );

  let topics = [];

  for (let topic in counts) {
    topics.push({ topic, count: counts[topic] });
  }

  topics = topics.filter((t) => t.topic);
  topics = topics.sort((a, b) => {
    return b.count - a.count;
  });

  topics = topics.slice(0, 10);
  topics = topics.map((topic) => topic.topic);
  summary.topics = topics;

  function process(topic) {
    topic = topic.toLowerCase().trim();
    topic = topic.replace(/[\W_]+/g, '');
    topic = topic.replace(/\s+/g, '-');

    return topic;
  }

  console.log('summary: ', summary);

  return summary;
}

function tickers(htm, text) {
  if (!text) return {};

  const tickers = [];

  function findTicker(ticker, exchange) {
    let name = ticker.Name;

    if (name && name.indexOf('Twitter') === -1 && name.indexOf('Facebook') === -1) {
      name = name.replace(/,? ?Inc\.?/gi, '').replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    }

    const regex = new RegExp(`\\b${name}\\b`, 'gi');

    if (text.match(regex)) {
      console.log(name);
      console.log(regex.toString());
      tickers.push({ name: ticker.Name, symbol: ticker.Symbol, exchange });
    }
  }

  amex.forEach((ticker) => {
    findTicker(ticker, 'amex');
  });
  nasdaq.forEach((ticker) => {
    findTicker(ticker, 'nasdaq');
  });
  nyse.forEach((ticker) => {
    findTicker(ticker, 'nyse');
  });

  console.log(tickers);

  return tickers;
}

function cryptos(htm, text) {
  if (!text) return {};

  const tickers = [];

  function findTicker(ticker) {
    const name = ticker.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const regex = new RegExp(`\\b${name}\\b`, 'g');

    if (text.match(regex)) {
      console.log(name);
      console.log(regex.toString());
      tickers.push({ name: ticker.name, symbol: ticker.symbol });
    }
  }

  cryptotickers.forEach(findTicker);

  console.log(tickers);

  return tickers;
}

function getMeta(htm) {
  const doc = new JSDOM(htm);
  const meta = {};
  const thumb =
    doc.window.document.querySelector('meta[property="og:image"]') ||
    doc.window.document.querySelector('meta[name="twitter:image"]');
  const title = doc.window.document.title;

  meta.title = title;
  meta.thumb = thumb && thumb.getAttribute('content');

  return meta;
}

function txt(htm, link) {
  const url = new URL(link);
  const doc = new JSDOM(htm);
  doc.window.document
    .querySelectorAll('img')
    .forEach(
      (el) =>
        (el.src =
          el.src.indexOf('http') === 0 || el.src.indexOf('//') === 0
            ? el.src.indexOf('http://')
              ? el.src.replace('http:', '')
              : el.str
            : '//' + url.host + el.src),
    );
  doc.window.document
    .querySelectorAll('a[href]')
    .forEach(
      (el) =>
        (el.href =
          el.href && el.href.indexOf('/') === 0
            ? url.protocol + '//' + url.host + el.href
            : el.href),
    );
  const reader = new Readability(doc.window.document);
  return reader.parse();
}

export default Reader;

由于某些原因,几天后 docker 容器有太多 puppeteer 进程,因为在获取 url 时浏览器无法正确退出。

最终容器资源不足,整个应用程序冻结且无法访问。

标签: node.jsdockerpuppeteeralpine

解决方案


在 docker 中使用 Puppeteer 时我遇到了同样的问题。解决方案是在 docker 中实现哑初始化。那么应该以Dockerfile某种方式看起来像这样(我假设您正在开发一个节点项目,因此我们npm start在最后调用:

RUN apt-get install dumb-init // ... plus your other packages
... your remaining docker things
ENTRYPOINT ["/usr/bin/dumb-init", "--"]
CMD [ "npm", "start" ]

推荐阅读