import { CheerioCrawler, createCheerioRouter } from 'crawlee'; import fs from 'fs'; export const router = createCheerioRouter(); router.addDefaultHandler(async ({ $ }) => { //Scrape product links from search results page const productLinks = $('h2 a').map((_, el) => 'https://www.amazon.com' + $(el).attr('href')).get(); console.log(`Found ${productLinks.length} product links`); //Add each product link to array for (const link of productLinks) { router.addHandler(async ({ $ }) => { const productInfo = {}; productInfo.storeName = 'Amazon'; productInfo.productTitle = $('span.a-size-large.product-title-word-break').text().trim(); productInfo.productDescription = $('div.a-row.a-size-base.a-color-secondary').text().trim(); productInfo.salePrice = $('span.a-offscreen').text().trim(); productInfo.originalPrice = $('span.a-price.a-text-price').text().trim(); productInfo.reviewScore = $('span.a-icon-alt').text().trim(); productInfo.shippingInfo = $('div.a-row.a-size-base.a-color-secondary.s-align-children-center').text().trim(); //Write product info to JSON file if (productInfoList.length > 0) { const rawData = JSON.stringify(productInfo, null, 2); fs.appendFile('rawData.json', rawData, (err) => { if (err) throw err; console.log(`Product info written to rawData.json for ${link}`); }); } }) //router.queue.addRequest({ url: link }); const amazon = new CheerioCrawler({ // Start the crawler right away and ensure there will always be 5 concurrent requests ran at any time minConcurrency: 1, //Ensure the crawler doesn't exceed 15 concurrent requests ran at any time maxConcurrency: 10, //but also ensure the crawler never exceeds 400 requests per minute maxRequestsPerMinute: 400, //Define route for crawler to run on requestHandler: router }); await amazon.run(link); console.log('running link') } });
import { CheerioCrawler } from 'crawlee'; import { router } from './routes.js'; const searchKeywords = 'computers'; // Replace with desired search keywords const searchUrl = `https://www.amazon.com/s?k=${searchKeywords}`; const startUrls = [searchUrl]; const crawler = new CheerioCrawler({ // Start the crawler right away and ensure there will always be 5 concurrent requests ran at any time minConcurrency: 5, // Ensure the crawler doesn't exceed 15 concurrent requests ran at any time maxConcurrency: 15, // ...but also ensure the crawler never exceeds 250 requests per minute maxRequestsPerMinute: 250, // Define router to run crawl requestHandler: router }); await crawler.run(startUrls);
INFO CheerioCrawler: Starting the crawl Found 31 product links WARN CheerioCrawler: Reclaiming failed request back to the list or queue. Expected `requests` to be of type `array` but received type `string` {"id":"b1h8C8G7WjcTMKd","url":"https://www.amazon.com/s?k=computers","retryCount":1} Found 35 product links WARN CheerioCrawler: Reclaiming failed request back to the list or queue. Expected `requests` to be of type `array` but received type `string` {"id":"b1h8C8G7WjcTMKd","url":"https://www.amazon.com/s?k=computers","retryCount":2} Found 35 product links WARN CheerioCrawler: Reclaiming failed request back to the list or queue. Expected `requests` to be of type `array` but received type `string` {"id":"b1h8C8G7WjcTMKd","url":"https://www.amazon.com/s?k=computers","retryCount":3} Found 27 product links ERROR CheerioCrawler: Request failed and reached maximum retries. ArgumentError: Expected `requests` to be of type `array` but received type `string` at ow (C:\Users\haris\OneDrive\Documents\GitHub\crawleeScraper\my-crawler\node_modules\ow\dist\index.js:33:28) at CheerioCrawler.addRequests (C:\Users\haris\OneDrive\Documents\GitHub\crawleeScraper\my-crawler\node_modules\@crawlee\basic\internals\basic-crawler.js:493:26) at CheerioCrawler.run (C:\Users\haris\OneDrive\Documents\GitHub\crawleeScraper\my-crawler\node_modules\@crawlee\basic\internals\basic-crawler.js:421:24) at process.processTicksAndRejections (node:internal/process/task_queues:95:5) at async file:///C:/Users/haris/OneDrive/Documents/GitHub/crawleeScraper/my-crawler/src/routes.js:45:5 {"id":"b1h8C8G7WjcTMKd","url":"https://www.amazon.com/s?k=computers","method":"GET","uniqueKey":"https://www.amazon.com/s?k=computers"} INFO CheerioCrawler: All requests from the queue have been processed, the crawler will shut down.
crawler.addRequests()
https://crawlee.dev/api/next/cheerio-crawler/class/CheerioCrawler#addRequests