node.js - 为什么我会在 alpine/docker 上获得僵尸傀儡进程?
问题描述
这是我的 puppeteer 控制器的全部内容:
import { Readability } from '@mozilla/readability';
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
const summarize = require('summarize');
const keyword_extractor = require('keyword-extractor');
const amex = require('../../csv/AMEX.json');
const nasdaq = require('../../csv/NASDAQ.json');
const nyse = require('../../csv/NYSE.json');
const cryptotickers = require('../../csv/cryptos.json');
puppeteer.use(StealthPlugin());
class Reader {
constructor() {
this.browser = null;
}
async getLink(link) {
this.browser = await puppeteer.launch({
devtools: false,
headless: true,
// product: 'firefox',
executablePath: '/usr/bin/chromium-browser',
args: [
'--proxy-server=' + process.env.PROXY_HOST,
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--single-process',
'--disable-setuid-sandbox',
'--no-zygote',
'--shm-size=4gb',
'--disable-infobars',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
// '--user-agent="Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"'
],
});
const { htm, title } = await this.spa(link);
if (!htm) {
await this.browser.close();
return;
}
const text = txt(htm, link);
const data = Object.assign({}, text);
const parts = new URL(link);
if (!data.title) {
data.title = title;
}
data.summary = summary(data.content, data.title);
data.tickers = tickers(data.content, data.textContent);
data.cryptos = cryptos(data.content, data.textContent);
data.meta = getMeta(htm);
if (!data.title && data.meta.title) {
data.title = data.meta.title;
}
data.url = link;
data.htm = htm;
data.host = parts.host;
data.text = data.textContent;
delete data.textContent;
console.log('data fetched: ' + link);
await this.browser.close();
// await this.browser.disconnect();
return data;
}
async spa(url) {
let htm;
let title;
try {
let page = await this.browser.newPage();
await page.setRequestInterception(true);
page.on('request', (req) => {
if (
req.resourceType() === 'stylesheet' ||
req.resourceType() === 'font' ||
req.resourceType() == 'image'
) {
req.abort();
} else {
req.continue();
}
});
await page.authenticate({
username: process.env.PROXY_USER,
password: process.env.PROXY_PASS,
});
await page.setViewport({ width: 800, height: 600 });
// await page.goto(url, { waitUntil: 'networkidle2' });
await page.goto(url, { waitUntil: 'domcontentloaded' });
await this.autoScroll(page);
await page.evaluate(() => window.scrollTo(0, 50));
htm = await page.content();
title = await page.evaluate(() => document.title);
if (htm.indexOf('<title') === -1) {
htm = await page.evaluate(() => document.documentElement.outerHTML);
}
console.log(title, 'title');
} catch (err) {
console.error(err, url);
}
return { htm, title };
}
async autoScroll(page) {
await page.evaluate(async () => {
new Promise((resolve, reject) => {
try {
const maxScroll = Number.MAX_SAFE_INTEGER;
let lastScroll = 0;
const interval = setInterval(() => {
window.scrollBy(0, document.body.offsetHeight);
const { scrollTop } = document.documentElement;
if (scrollTop === maxScroll || scrollTop === lastScroll) {
clearInterval(interval);
resolve();
} else {
lastScroll = scrollTop;
}
}, 1000);
} catch (error) {
reject(error);
}
}).catch((error) => {
console.error(error); // add catch here
});
});
// await page.evaluate(async () => {
// await new Promise((resolve, reject) => {
// let totalHeight = 0;
// let distance = 300;
// let timer = setInterval(() => {
// const scrollHeight = document.body.scrollHeight;
// window.scrollBy(0, distance);
// totalHeight += distance;
// if(totalHeight >= scrollHeight){
// clearInterval(timer);
// resolve();
// }
// }, 100);
// });
// });
}
} // end Class Reader
async function summarization2(text) {
let res;
let data;
console.log(text, process.env.DEEPAI_KEY);
try {
const body = new FormData();
body.append('text', text);
res = await fetch(`https://api.deepai.org/api/summarization`, {
method: 'POST',
body,
headers: {
'api-key': process.env.DEEPAI_KEY,
},
});
data = await res.json();
} catch (err) {
console.error(err);
}
return data;
}
async function sentiment(text) {
return await deepai.callStandardApi('sentiment-analysis', { text });
}
async function summarization(text) {
return await deepai.callStandardApi('summarization', { text }).catch(console.error);
}
function summary(text, title) {
if (!text) return {};
const summary = summarize(`${title} - ${text}`);
summary.topics = keyword_extractor
.extract(`${title} - ${text}`, {
language: 'english',
remove_digits: true,
return_changed_case: true,
remove_duplicates: false,
})
.map(process);
const counts = summary.topics.reduce(
(acc, value) => ({
...acc,
[value]: (acc[value] || 0) + 1,
}),
{},
);
let topics = [];
for (let topic in counts) {
topics.push({ topic, count: counts[topic] });
}
topics = topics.filter((t) => t.topic);
topics = topics.sort((a, b) => {
return b.count - a.count;
});
topics = topics.slice(0, 10);
topics = topics.map((topic) => topic.topic);
summary.topics = topics;
function process(topic) {
topic = topic.toLowerCase().trim();
topic = topic.replace(/[\W_]+/g, '');
topic = topic.replace(/\s+/g, '-');
return topic;
}
console.log('summary: ', summary);
return summary;
}
function tickers(htm, text) {
if (!text) return {};
const tickers = [];
function findTicker(ticker, exchange) {
let name = ticker.Name;
if (name && name.indexOf('Twitter') === -1 && name.indexOf('Facebook') === -1) {
name = name.replace(/,? ?Inc\.?/gi, '').replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
const regex = new RegExp(`\\b${name}\\b`, 'gi');
if (text.match(regex)) {
console.log(name);
console.log(regex.toString());
tickers.push({ name: ticker.Name, symbol: ticker.Symbol, exchange });
}
}
amex.forEach((ticker) => {
findTicker(ticker, 'amex');
});
nasdaq.forEach((ticker) => {
findTicker(ticker, 'nasdaq');
});
nyse.forEach((ticker) => {
findTicker(ticker, 'nyse');
});
console.log(tickers);
return tickers;
}
function cryptos(htm, text) {
if (!text) return {};
const tickers = [];
function findTicker(ticker) {
const name = ticker.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const regex = new RegExp(`\\b${name}\\b`, 'g');
if (text.match(regex)) {
console.log(name);
console.log(regex.toString());
tickers.push({ name: ticker.name, symbol: ticker.symbol });
}
}
cryptotickers.forEach(findTicker);
console.log(tickers);
return tickers;
}
function getMeta(htm) {
const doc = new JSDOM(htm);
const meta = {};
const thumb =
doc.window.document.querySelector('meta[property="og:image"]') ||
doc.window.document.querySelector('meta[name="twitter:image"]');
const title = doc.window.document.title;
meta.title = title;
meta.thumb = thumb && thumb.getAttribute('content');
return meta;
}
function txt(htm, link) {
const url = new URL(link);
const doc = new JSDOM(htm);
doc.window.document
.querySelectorAll('img')
.forEach(
(el) =>
(el.src =
el.src.indexOf('http') === 0 || el.src.indexOf('//') === 0
? el.src.indexOf('http://')
? el.src.replace('http:', '')
: el.str
: '//' + url.host + el.src),
);
doc.window.document
.querySelectorAll('a[href]')
.forEach(
(el) =>
(el.href =
el.href && el.href.indexOf('/') === 0
? url.protocol + '//' + url.host + el.href
: el.href),
);
const reader = new Readability(doc.window.document);
return reader.parse();
}
export default Reader;
由于某些原因,几天后 docker 容器有太多 puppeteer 进程,因为在获取 url 时浏览器无法正确退出。
最终容器资源不足,整个应用程序冻结且无法访问。
解决方案
在 docker 中使用 Puppeteer 时我遇到了同样的问题。解决方案是在 docker 中实现哑初始化。那么应该以Dockerfile
某种方式看起来像这样(我假设您正在开发一个节点项目,因此我们npm start
在最后调用:
RUN apt-get install dumb-init // ... plus your other packages
... your remaining docker things
ENTRYPOINT ["/usr/bin/dumb-init", "--"]
CMD [ "npm", "start" ]
推荐阅读
- html - Flexbox Hero 不会居中
- android - 非周期性后台作业长时间运行
- python-3.x - Python while循环通过程序不起作用
- mysql - 将 MySQL 远程 URL 连接到 Spring Boot 应用程序
- mongodb - Meteor 中对象的发布计数
- libreoffice-calc - 在单元格中添加公式和字符串,libre calc 2017
- archive - 7zip 不将文件发送到存档位置
- spring-boot - spring boot、jax-rs、jersey中无法识别多个URL查询参数
- python - 反向搜索excel行并附加正整数
- ruby-on-rails - Rails docker应用程序在一段时间后停止工作