node.js - UnhandledPromiseRejectionWarning: Error: Request is already handled!(请求已被处理)
问题描述
我有一个 Node.js 应用,最初是作为 API 使用的,它按计划用 puppeteer 从网站上抓取数据。为了检查当前是否有待处理的任务,我写了一个函数去调用模型查询。它看起来可以工作,我也能拿到数据,但从抓取第二篇文章开始,之后的每一篇都会出现这个错误:UnhandledPromiseRejectionWarning: Error: Request is already handled!
随后还会出现:UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch().
它似乎还占用了大量的 CPU 和内存。所以我的问题是:我的代码中是否有阻塞的地方,或者有什么可以改进的地方?
这是我的 server.js
/**
 * Look up the next pending article and, if one exists, start a crawl.
 *
 * @param {Function} callback - Called with no arguments once the lookup has
 *   completed, whether or not a crawl was started.
 */
function mydbqueryarticle(callback) {
  News.findNullArticle(function (err, article) {
    try {
      if (err) {
        console.error(err);
      } else if (article != null) {
        console.log('Crawling');
        Crawl.article(err);
      } else {
        console.log('No article to crawl');
      }
    } finally {
      // Signal completion only after the lookup has returned, so the caller
      // does not schedule the next poll while this one is still in flight.
      callback();
    }
  });
}
/**
 * Polling step: after a 10-second delay, run mydbqueryarticle and hand it
 * this same function as its callback so the cycle can be re-armed.
 */
function wait10sec() {
  const pollDelayMs = 10000;
  const runLookup = () => mydbqueryarticle(wait10sec);
  setTimeout(runLookup, pollDelayMs);
}

// Arm the first polling cycle.
wait10sec();
这是我的新闻模型
/**
 * Fetch the article row with article_status = 0 that has the earliest
 * created_at value.
 *
 * @param {Function} result - Node-style callback `(err, article)`.
 *   On success `err` is null and `article` is the first returned row
 *   (undefined when the query returns no rows). On query failure `err` is the
 *   query error and `article` is null.
 */
News.findNullArticle = function (result) {
  dbConn.query(
    "SELECT id, source, keyword, title, link from article where article_status = 0 ORDER BY created_at ASC LIMIT 1",
    function (err, res) {
      if (err) {
        console.log("error: ", err);
        // Report the failure in the error slot; passing it as the second
        // argument would make callers treat the error object as an article.
        result(err, null);
        return;
      }
      console.log('article : ', res);
      result(null, res[0]);
    }
  );
};
这是我的爬虫(crawl)控制器
function crawl_article(news_id, news_link, all_page_tag, body_article, article_date, article_el, article_tag_el) {
try {
(async () => {
// Add stealth plugin and use defaults (all tricks to hide puppeteer usage)
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin())
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
puppeteer.launch({
headless: true,
executablePath: '/opt/google/chrome/google-chrome',
args: ['--autoplay-policy=user-gesture-required',
'--disable-background-networking',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
'--disable-breakpad',
'--disable-client-side-phishing-detection',
'--disable-component-update',
'--disable-default-apps',
'--disable-dev-shm-usage',
'--disable-domain-reliability',
'--disable-extensions',
'--disable-features=AudioServiceOutOfProcess',
'--disable-hang-monitor',
'--disable-ipc-flooding-protection',
'--disable-notifications',
'--disable-offer-store-unmasked-wallet-cards',
'--disable-popup-blocking',
'--disable-print-preview',
'--disable-prompt-on-repost',
'--disable-renderer-backgrounding',
'--disable-setuid-sandbox',
'--disable-speech-api',
'--disable-sync',
'--hide-scrollbars',
'--ignore-gpu-blacklist',
'--metrics-recording-only',
'--mute-audio',
'--no-default-browser-check',
'--no-first-run',
'--no-pings',
'--no-sandbox',
'--no-zygote',
'--password-store=basic',
'--use-gl=swiftshader',
'--use-mock-keychain']
}).then(async browser => {
console.log(news_link);
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
await page.goto(news_link+all_page_tag);
await page.waitForSelector('body');
await autoScroll(page);
//await page.waitForTimeout(2500);
let current_time = moment().format('YYYY-MM-DD HH:mm:ss');
var rposts = await page.evaluate((body_article, article_date, article_el, article_tag_el) => {
let get_article_date, get_article, get_tag;
let posts = document.body.querySelectorAll(body_article);
postItems = [];
posts.forEach((item) => {
try{
if(item.querySelector(article_el)) {
// get_article = item.querySelector(article_el).innerText;
get_article = item.querySelector(article_el).innerHTML;
// get_article = item.querySelector('p').innerText;
} else {
get_article = item.querySelector('p').innerText;
}
if(item.querySelector(article_tag_el))
get_tag = item.querySelector(article_tag_el).innerText.replace(/\n/g, ",");
if(item.querySelector(article_date))
get_article_date = item.querySelector(article_date).innerText;
postItems.push({get_article_date, get_article, get_tag});
} catch(e) {}
});
return postItems;
}, body_article, article_date, article_el, article_tag_el);
//console.log(rposts[0].get_article);
console.log(rposts);
if(rposts.length > 0) {
News.updateArticle(news_id, rposts, function(err, news) {
if (err) console.log(err);
else console.log({error:false,message:"Article id " +news_id+" updated successfully!",data:rposts});
});
} else {
News.updateErrorArticle(news_id, rposts, function(err, news) {
if (err) console.log(err);
else console.log({error:false,message:"Article id " +news_id+" updated successfully!",data:rposts});
});
}
await browser.close();
}).catch(function(error) {
console.error(error);
});
})();
} catch (err) {
console.error(err);
}
}
exports.article = (err) => {
News.findNullArticle(function(err, result) {
// if (err) res.send(err);
if(result) {
let article_el, article_tag_el
Sources.findBySource(result.source, function(err, dt_source) {
if(dt_source) {
let crawl = crawl_article(result.id, result.link, dt_source.all_page_tag, dt_source.body_article, dt_source.article_date,dt_source.article_el, dt_source.article_tag_el);
}
// res.json({error:false, article:result, source:dt_source});
})
} else {
// res.send({ error:true, response: "No News return from query" });
}
});
};
解决方案
我解决了这个问题:改用 puppeteer-cluster(puppeteer 集群)来运行抓取任务。
推荐阅读
- python - while vc.is_playing() 循环永远不会退出
- elasticsearch - 如何从kibana向远程服务器发送请求?
- mongodb - MongoDB - mongodump - 回复 listIndexes 时出错无法解析目录条目
- mysql - 数据透视表的问题,使用braintree附加不起作用
- javascript - 测试中未涵盖的回调函数 - 如何模拟?
- c - 如何声明和定义具有多个可变长度数组的结构?
- vue.js - 如何使 li 项目在 Vue 的侧边栏中处于活动状态
- c# - 无法在 Visual Studio 中运行所有测试。TestCaseFilter 缺少运算符“|”的格式不正确 或者 '&'
- javascript - 重置并设置 mousemove 超时以在不活动后执行某些操作
- python - Python kerberos 委托