routes.js:import { CheerioCrawler, createCheerioRouter } from 'crawlee';
import fs from 'fs';
export const router = createCheerioRouter();
const linkArray = [];
// Default handler for search-results pages: collects all product links on the
// page, then crawls each product page with a nested crawler to scrape details.
// Fixes vs. original: broken string/template literals repaired, the required
// label-less registration uses addDefaultHandler, the undefined
// `productInfoList` check is replaced, and run() now receives an array.
router.addDefaultHandler(async ({ $ }) => {
  // Build absolute product URLs from the relative hrefs in the result list.
  const productLinks = $('h2 a')
    .map((_, el) => `https://www.amazon.com${$(el).attr('href')}`)
    .get();
  console.log(`Found ${productLinks.length} product links`);

  // Crawl each product page sequentially (one nested crawler per link).
  for (const link of productLinks) {
    const productRouter = createCheerioRouter();
    productRouter.addDefaultHandler(async ({ $ }) => {
      // Extract the product fields of interest from the detail page.
      const productInfo = {
        storeName: 'Amazon',
        productTitle: $('span.a-size-large.product-title-word-break').text().trim(),
        productDescription: $('div.a-row.a-size-base.a-color-secondary').text().trim(),
        salePrice: $('span.a-offscreen').text().trim(),
        originalPrice: $('span.a-price.a-text-price').text().trim(),
        reviewScore: $('span.a-icon-alt').text().trim(),
        shippingInfo: $('div.a-row.a-size-base.a-color-secondary.s-align-children-center').text().trim(),
      };

      // Persist only pages where a title was actually scraped.
      // (Original tested an undefined `productInfoList` — ReferenceError.)
      if (productInfo.productTitle) {
        const rawData = JSON.stringify(productInfo, null, 2);
        // NOTE(review): appending pretty-printed objects does not produce a
        // single valid JSON document — consider NDJSON (one object per line).
        fs.appendFile('rawData.json', rawData, (err) => {
          if (err) throw err;
          console.log(`Product info written to rawData.json for ${link}`);
        });
      }
    });

    const amazon = new CheerioCrawler({
      // Keep at least 1 request in flight at any time...
      minConcurrency: 1,
      // ...but never exceed 10 concurrent requests...
      maxConcurrency: 10,
      // ...and never exceed 400 requests per minute.
      maxRequestsPerMinute: 400,
      // Route every request on this crawler through the product-page handler.
      requestHandler: productRouter,
    });

    // run() expects an array of request sources; original passed a bare string.
    await amazon.run([link]);
    console.log('running link');
  }
});