javascript - 没有“下一页”按钮且网址不变时的分页
问题描述
我正在尝试使用 Puppeteer 和 Node.js 抓取 https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1
爬虫可以正常运行，但只能获取第一页的数据，因为我不知道如何处理分页。问题在于 URL 不会随页码变化，而且页面上没有“下一页”按钮。
在这样的限制条件下，应该如何实现分页？
以下是我的完整代码:
// Search-results page listing the job postings to scrape.
const url = "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1";

// Accumulates one record per scraped job description page.
const ventureLoopResults = [];

// Shared Puppeteer browser handle; assigned in main() before any scraping starts.
let browser;
/**
 * Logs in to VentureLoop, opens the given search-results page and collects the
 * absolute URL of each job posting linked from it.
 *
 * Uses the module-level `browser` (a launched Puppeteer browser) and `cheerio`,
 * both of which must already be in scope. The page opened here is always
 * closed before the function returns.
 *
 * @param {string} url - Address of the search-results page to scrape.
 * @returns {Promise<string[]>} Absolute job URLs. When the login or the scrape
 *   fails, the error is logged and an empty array is returned so callers can
 *   still iterate over the result.
 */
async function scrapeJobsInIndexPage(url) {
  let page;
  try {
    page = await browser.newPage();
    await page.goto("https://www.ventureloop.com/ventureloop/login.php", {
      waitUntil: "networkidle0",
    });
    await page.click("#close-cookies", { delay: 200 });

    // NOTE(review): credentials are hard-coded in source; move them to
    // environment variables or a secrets store and rotate this password.
    await page.type("[name='email_1']", "natan.chapman@gmail.com", {
      delay: 200,
    });
    await page.type("[name='pass']", "Aw8rbJ!9bXt*dpb", { delay: 200 });

    // Start waiting for the navigation BEFORE clicking submit; otherwise the
    // navigation may finish before the wait is registered and the wait only
    // ends when it times out.
    await Promise.all([
      page.waitForNavigation(),
      page.click("#formContainer > form > div:nth-child(5) > input", {
        delay: 200,
      }),
    ]);

    await page.goto(url, { waitUntil: "networkidle0" });
    const html = await page.evaluate(() => document.body.innerHTML);
    // cheerio.load is synchronous, so no await is needed.
    const $ = cheerio.load(html);
    // The hrefs are prefixed with the site base to form absolute URLs.
    const jobs = $(".tsize a:even")
      .map(
        (i, element) =>
          "https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
      )
      .get();
    console.log(jobs);
    return jobs;
  } catch (erro) {
    console.error(erro);
    return [];
  } finally {
    // Release the tab on success and failure; `page` is undefined only when
    // newPage() itself failed.
    if (page !== undefined) {
      await page.close().catch((closeError) => console.error(closeError));
    }
  }
}
/**
 * Scrapes a single job description page, follows its "apply" link to learn the
 * final application URL, stores the record and appends it to the module-level
 * `ventureLoopResults` array.
 *
 * Uses `cheerio`, `testVentureLoopDB`, `GlassdoorDB` and `ventureLoopResults`,
 * which must already be in scope.
 *
 * @param {string} url - Address of the job description page.
 * @param {object} page - Puppeteer page reused for every description; it is
 *   left on the apply-link destination when the function returns.
 * @returns {Promise<Array|undefined>} The shared `ventureLoopResults` array on
 *   success; `undefined` when anything fails (the error is logged).
 */
async function scrapeDescriptionPage(url, page) {
  let jobText;
  try {
    await page.goto(url, { waitUntil: "networkidle0" });
    const html = await page.evaluate(() => document.body.innerHTML);
    // cheerio.load is synchronous, so no await is needed.
    const $ = cheerio.load(html);
    // Kept outside the record: only used for diagnostics in the catch below.
    jobText = $("#formContainer").text();
    const companyImage = await page.$eval(".cs-media img", (img) => img.src);
    const location = $(".location.mid").text();
    const jobPosition = $(".cs-post-title h2").text();
    const companyName = $(".cs-post-title h3").text();
    const applyLinkRedirect = $(".ltp-btn").attr("href");
    const jobDescription = $(
      "#formContainer > form > div > div > div.company-detail > div:nth-child(3)"
    ).html();
    const datePosted = $(
      "#formContainer > form > div > div > div.company-detail > ul > li:nth-child(3) > span"
    ).text();

    // Follow the redirect so the stored link is the final destination.
    await page.goto(applyLinkRedirect, { waitUntil: "networkidle0" });
    const applyLink = page.url();

    const ventureLoopResult = new testVentureLoopDB({
      url,
      jobPosition,
      companyName,
      applyLink,
      jobDescription,
      companyImage,
      datePosted,
      location,
    });
    ventureLoopResults.push(ventureLoopResult);
    console.log(ventureLoopResults);
    // Await the save so a failed write is reported by the catch below instead
    // of becoming an unhandled rejection.
    await ventureLoopResult.save();

    // NOTE(review): the same record is saved a second time through
    // GlassdoorDB; this looks like a leftover from another scraper, so
    // confirm which model is intended.
    const listingModel = new GlassdoorDB(ventureLoopResult);
    await listingModel.save();

    return ventureLoopResults;
  } catch (err) {
    console.log(jobText);
    console.log(url);
    console.log(err);
  }
}
/**
 * Entry point: connects to MongoDB, launches the browser, collects the job
 * URLs from the search-results page and scrapes each description in turn.
 *
 * Assigns the module-level `browser` and always closes it again, even when a
 * scrape throws, so the process does not leave a browser running.
 *
 * @returns {Promise<void>} Resolves when every job has been processed; rejects
 *   with the first error raised while scraping.
 */
async function main() {
  await connectToMongoDb();
  browser = await puppeteer.launch({ headless: false });
  try {
    const descriptionPage = await browser.newPage();
    const jobs = await scrapeJobsInIndexPage(
      "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1"
    );
    // NOTE(review): iteration starts at index 1, so the first collected link
    // is never scraped. Kept as in the original; confirm whether skipping it
    // is intentional.
    // `?? []` tolerates an undefined result from a failed index scrape.
    for (const jobUrl of (jobs ?? []).slice(1)) {
      const result = await scrapeDescriptionPage(jobUrl, descriptionPage);
      console.log(result);
    }
  } finally {
    await browser.close();
  }
}

// Report a failed run instead of leaving the promise unhandled.
main().catch((err) => {
  console.error(err);
});
解决方案
可以尝试如下写法：
import puppeteer from 'puppeteer';
// Walks through every results page of the VentureLoop job search. The URL does
// not change between pages, so progress is tracked through the pager's
// "current page" label and navigation is done by clicking the link that
// follows the highlighted page number.
const browser = await puppeteer.launch({ headless: false, defaultViewport: null });

try {
  const totalPagesSelector = '.pag_txt_tot';
  const currentPageSelector = '.pag_txt_current';

  // Reuse the first tab the browser reports.
  const openPages = await browser.pages();
  const page = openPages[0];
  await page.goto('https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1');

  await page.waitForSelector(totalPagesSelector);
  const totalPages = await page.$eval(totalPagesSelector, (label) => Number(label.innerText));

  let pageNumber = 1;
  while (pageNumber <= totalPages) {
    // Block until the pager label shows the page we expect to be on.
    await page.waitForFunction(
      (selector, expected) => {
        const label = document.querySelector(selector);
        return label?.innerText === String(expected);
      },
      {},
      currentPageSelector,
      pageNumber,
    );

    // Sample the first table cell as a stand-in for the real extraction.
    const data = await page.evaluate(
      () => document.querySelector('#news_tbl tr td')?.innerText,
    );
    console.log(`${pageNumber}: ${data}`);

    // Click the link inside the element right after the highlighted page
    // number; nothing is clicked when there is no following element.
    await page.evaluate(() => {
      const following = document.querySelector('span.current').nextElementSibling;
      if (following != null) {
        following.querySelector('a').click();
      }
    });

    pageNumber += 1;
  }
} catch (err) {
  console.error(err);
} finally {
  await browser.close();
}
推荐阅读
- sockets - 在后台运行服务器,而应用程序执行其他操作
- python-3.5 - (Napalm 自动化)在使用“with”测试打开文件/cisco 开关时使用 try except 块后出现缩进错误
- php - 如何从 mysql 获取数据并显示在首页?
- c# - 图像不使用 Razor 渲染
- lua - 尝试通过“pgsql”DSN 连接到 PostgreSQL 数据库在 FreeSWITCH 1.8 上返回“ERR [无法打开数据库文件]”
- nginx - 为什么在本地机器和容器中的任何地方都找不到 nginx 根路径?
- javascript - 立即输入并更改 div innerhtml
- pandas - 将第一行数据转换为字典并删除 nan 值
- database - 如何使用 JSON 格式将表发布到 DolphinDB 中的 MQTT 服务器
- java - 对于实现多个接口的具体类,该类的客户端如何遵循依赖倒置?