首页 > 解决方案 > 从一系列网址中抓取 - puppeteer

问题描述

我有一个 URL 列表……来自：http://books.toscrape.com



// BUG FIX: JavaScript is case-sensitive — `Let` is not a keyword, so the
// original `Let objArray = ...` is a SyntaxError. Use `const` (the list is
// never reassigned).
const objArray = [
    { "Url": "books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html" },
    { "Url": "books.toscrape.com/catalogue/tipping-the-velvet_999/index.html" },
    { "Url": "books.toscrape.com/catalogue/soumission_998/index.html" }
];

如您所见，所有链接的页面结构都类似，可以用同样的方式抓取。

I want to scrape the Titles, Prices And Stock Availability from above links.

我还尝试像这样遍历所有 URL:


// NOTE(review): this is the asker's non-working attempt. Concrete problems:
//  - `url` is never defined in this scope — it should be objArray[i].Url
//    (and needs an "http://" prefix).
//  - The cheerio selectors below are bare identifiers, not strings, which is
//    a SyntaxError; they must be quoted, e.g. $('div[class="..."] ...').
//  - Product_details is re-created and files.json is overwritten on every
//    iteration, so at most one record would ever be kept.
//  - The `try` has no catch/finally, the browser is never closed, and the
//    async IIFE / for-loop are never closed — the snippet is incomplete.
//  - Launching one browser per URL inside the loop is also very wasteful.
for (var i = 0; i < objArray.length; ++i) {
    (async() => {
        let browser;
        try {
            browser = await puppeteer.launch({
                headless: false,
            });
            const page = await browser.newPage();
            await page.goto(url);

            const content = await page.content();
            const $ = cheerio.load(content);

            const Product_details = []

            const instock = $(div[class="col-sm-6 product_main"] p[class="instockavailability"]).text();
            const title = $(div[class="col-sm-6 product_main"] ).text();
            const price = $(div[class="col-sm-6 product_main"] p[price_color]).text()

            Product_details.push({
                Stock: instock,
                Title: title,
                Price: price,
            });
            fs.writeFileSync("files.json", JSON.stringify(Product_details), "utf8")
            console.log(Product_details)

        }

现在我上面的代码无法运行……我想获取产品的详细信息，例如：标题、价格。

标签: puppeteer

解决方案


您可以将每个页面逻辑分成一个函数并尝试如下操作:

// Crawl every catalogue page of books.toscrape.com by following the
// "next" pagination link, then write all collected products to Details.json.
(async () => {
    let browser;
    try {
        browser = await puppeteer.launch({
            headless: false,
        });
        const page = await browser.newPage();
        const url = "http://books.toscrape.com/";

        const Product_details = [];

        await page.goto(url);
        // BUG FIX: getData() pushes rows directly into Product_details and
        // returns undefined — the original also pushed that undefined return
        // value, polluting the output array with `undefined` entries.
        await getData(page, Product_details);

        // Keep clicking the pagination "next" link until it disappears.
        while (await page.$('li[class="next"] a')) {
            await Promise.all([
                page.waitForNavigation(),
                page.click('li[class="next"] a'),
            ]);
            await getData(page, Product_details);
        }

        fs.writeFileSync("Details.json", JSON.stringify(Product_details), "utf8");
    } catch (e) {
        console.log('Error-> ', e);
    } finally {
        // BUG FIX: the original only closed the browser on error, leaking it
        // on the success path. Guard against launch() itself having failed.
        if (browser) await browser.close();
    }
})();

/**
 * Scrape title and price for every product card on the current catalogue
 * page and append one record per product to `details` (mutated in place).
 *
 * @param {object} page - puppeteer Page already navigated to a catalogue URL
 * @param {Array<{Title: string, Price: string}>} details - accumulator array
 * @returns {Promise<void>}
 */
async function getData(page, details) {
    console.log(page.url());

    const html = await page.content();
    const $ = cheerio.load(html);

    // BUG FIX: the original selector `li[col-xs-6 col-sm-4 col-md-3 col-lg-3]`
    // is an attribute-presence selector for attributes literally named
    // "col-xs-6" etc., which no element has — it matched nothing. Select by
    // the class attribute instead.
    const products = $('li[class="col-xs-6 col-sm-4 col-md-3 col-lg-3"]');

    products.each(function () {
        details.push({
            Title: $(this).find('h3').text(),
            Price: $(this).find('p[class="price_color"]').text(),
        });
    });
}

UPD:对上一版问题的回答:

// The three product pages to scrape; the "http://" scheme is added when
// the page is actually fetched.
const objArray = [
  'a-light-in-the-attic_1000',
  'tipping-the-velvet_999',
  'soumission_998',
].map((slug) => ({ Url: `books.toscrape.com/catalogue/${slug}/index.html` }));

// Visit each URL in objArray, scrape stock/title/price from the product
// page, and write the collected records to files.json.
(async () => {
    let browser;
    try {
        const Product_details = [];

        // IMPROVEMENT: launch ONE browser and reuse a single page for all
        // URLs — the original launched and closed a full browser per URL.
        browser = await puppeteer.launch({
            headless: false,
        });
        const page = await browser.newPage();

        for (const { Url } of objArray) {
            await page.goto(`http://${Url}`);
            const content = await page.content();
            const $ = cheerio.load(content);

            // NOTE(review): on books.toscrape.com the availability element
            // looks like class "instock availability" (two classes), not
            // "instockavailability" — confirm against the live markup.
            const instock = $('div[class="col-sm-6 product_main"] p[class="instock availability"]').text().trim();
            const title = $('div[class="col-sm-6 product_main"] h1').text().trim();
            // BUG FIX: the original wrote `.trim` without parentheses, so
            // `price` held the trim function itself, which JSON.stringify
            // silently drops from the output.
            const price = $('div[class="col-sm-6 product_main"] p[class="price_color"]').text().trim();

            Product_details.push({
                Stock: instock,
                Title: title,
                Price: price,
            });
        }

        console.log(Product_details);
        fs.writeFileSync('files.json', JSON.stringify(Product_details), 'utf8');
    } catch (e) {
        console.log('Error-> ', e);
    } finally {
        // Always release the browser, on success as well as on error.
        if (browser) await browser.close();
    }
})();

推荐阅读