```js
requestHandler: [router, router2]
```
```js
import { CheerioCrawler, ProxyConfiguration, AutoscaledPool, SessionPool } from 'crawlee';
import { combinedRouter } from './combinedRouter.js';

const searchKeywords = 'hydroflasks'; // Replace with desired search keywords

const startUrls = [
    { url: `https://www.amazon.com/s?k=${searchKeywords}`, label: 'AMAZON' },
    { url: `https://www.ebay.com/sch/i.html?_nkw=${searchKeywords}`, label: 'EBAY' },
];

const crawler = new CheerioCrawler({
    useSessionPool: true,
    sessionPoolOptions: { maxPoolSize: 100 },
    // Set to true if you want the crawler to save cookies per session,
    // and set the cookie header on requests automatically (default is true).
    persistCookiesPerSession: true,
    // Start the crawler right away and ensure there are always 20 concurrent requests running at any time
    minConcurrency: 20,
    // Ensure the crawler never exceeds 40 concurrent requests at any time
    maxConcurrency: 40,
    // ...but also ensure the crawler never exceeds 250 requests per minute
    maxRequestsPerMinute: 250,
    // Define the router that handles the crawl
    requestHandler: combinedRouter,
});

export { crawler };

await crawler.run(startUrls);
```
```js
import { router as amazonRouter } from './amazon.js';
import { router2 as ebayRouter } from './ebay.js';

export const combinedRouter = (request, crawler) => {
    amazonRouter(request, crawler);
    ebayRouter(request, crawler);
};
```
```js
import { createCheerioRouter } from 'crawlee';
import fs from 'fs';
import { crawler } from './main.js';

export const router = createCheerioRouter();

router.addHandler('AMAZON', async ({ $, crawler }) => {
    console.log('starting link scrape');
    // Scrape product links from the search results page
    const productLinks = $('h2 a')
        .map((_, el) => 'https://www.amazon.com' + $(el).attr('href'))
        .get();
    console.log(`Found ${productLinks.length} product links for Amazon`);
    console.log(productLinks);
    // Add each product link to the request queue
    for (const link of productLinks) {
        const result = await crawler.addRequests([link]);
        await result.waitForAllRequestsToBeAdded;
    }
});

router.addDefaultHandler(async ({ $, request, crawler }) => {
    const productInfo = {};
    productInfo.link = request.url;
    productInfo.storeName = 'Amazon';
    productInfo.productTitle = $('span#productTitle').text().trim();
    productInfo.productDescription = $('div#productDescription').text().trim();
    productInfo.salePrice = $('span#priceblock_ourprice').text().trim();
    productInfo.originalPrice = $('span.priceBlockStrikePriceString').text().trim();
    productInfo.reviewScore = $('span#acrPopover').attr('title');
    productInfo.shippingInfo = $('div#ourprice_shippingmessage').text().trim();
    // Write product info to the JSON file
    if (Object.keys(productInfo).length > 0) {
        const rawData = JSON.stringify(productInfo, null, 2);
        fs.appendFile('rawData.json', rawData, (err) => {
            if (err) throw err;
        });
    }
    console.log('Product info written to rawData.json for Amazon');
});
```
```js
import { createCheerioRouter } from 'crawlee';
import fs from 'fs';
import { crawler } from './main.js';

export const router2 = createCheerioRouter();

router2.addHandler('EBAY', async ({ $, crawler }) => {
    console.log('starting link scrape');
    // Scrape product links from the search results page
    const productLinks = $('a.item__info-link').map((_, el) => $(el).attr('href')).get();
    console.log(`Found ${productLinks.length} product links for eBay`);
    // Add each product link to the request queue
    for (const link of productLinks) {
        const result = await crawler.addRequests([link]);
        await result.waitForAllRequestsToBeAdded;
    }
});

router2.addDefaultHandler(async ({ $, request, crawler }) => {
    const productInfo = {};
    productInfo.link = request.url;
    productInfo.storeName = 'eBay';
    productInfo.productTitle = $('h3.s-item__title').text().trim();
    productInfo.productDescription = $('div.a-section.a-spacing-small.span.a-size-base-plus').text().trim();
    productInfo.salePrice = $('span.s-item__price').text().trim();
    productInfo.originalPrice = $('span.s-item__price--original').text().trim();
    productInfo.reviewScore = $('div.s-item__reviews').text().trim();
    productInfo.shippingInfo = $('span.s-item__shipping').text().trim();
    // Write product info to the JSON file
    if (Object.keys(productInfo).length > 0) {
        const rawData = JSON.stringify(productInfo, null, 2);
        fs.appendFile('rawData.json', rawData, (err) => {
            if (err) throw err;
        });
    }
});
```
"link": "https://www.amazon.com/Hydro-Flask-Standard-Flex-RAIN/dp/B08WWLPYKC/ref=sr_1_8?keywords=hydroflasks&qid=1684641714&sr=8-8", "storeName": "Amazon", "productTitle": "Hydro Flask Standard Mouth Bottle with Flex Cap", "productDescription": "The Standard Mouth bottle is ideal for sipping, while still accommodating ice cubes. Featuring the insulated Flex Cap, designed for ultimate portability and comfort.", "salePrice": "", "originalPrice": "", "reviewScore": "4.8 out of 5 stars", "shippingInfo": ""
"link": "https://www.amazon.com/Hydro-Flask-Standard-Flex-RAIN/dp/B08WWLPYKC/ref=sr_1_8?keywords=hydroflasks&qid=1684641714&sr=8-8", "storeName": "eBay", "productTitle": "", "productDescription": "", "salePrice": "", "originalPrice": "", "reviewScore": "", "shippingInfo": ""
You should add a label when enqueuing the product links. Right now you have

`const result = await crawler.addRequests([link]);`

so it would be something like

`const result = await crawler.addRequests([{ url: link, label: 'AMAZON_PRODUCT' }]);`

and the same for eBay. You're experiencing some unexpected behavior because this combination of different routers does not really make much sense for the crawler...
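A sketch of how that could look with a single router. This is only illustrative: the file name `routes.js` and the `AMAZON_PRODUCT` / `EBAY_PRODUCT` label names are made up, the selectors are just reused from your snippets above, and `Dataset.pushData` is used in place of the `fs.appendFile` calls.

```js
// routes.js – one router for both sites; every request is dispatched by its label.
import { createCheerioRouter, Dataset } from 'crawlee';

export const router = createCheerioRouter();

// Search results pages (labelled in startUrls).
router.addHandler('AMAZON', async ({ $, crawler }) => {
    const links = $('h2 a')
        .map((_, el) => 'https://www.amazon.com' + $(el).attr('href'))
        .get();
    // Label the detail pages so only the Amazon product handler processes them.
    await crawler.addRequests(links.map((url) => ({ url, label: 'AMAZON_PRODUCT' })));
});

router.addHandler('EBAY', async ({ $, crawler }) => {
    const links = $('a.item__info-link').map((_, el) => $(el).attr('href')).get();
    await crawler.addRequests(links.map((url) => ({ url, label: 'EBAY_PRODUCT' })));
});

// Product detail pages.
router.addHandler('AMAZON_PRODUCT', async ({ $, request }) => {
    await Dataset.pushData({
        link: request.url,
        storeName: 'Amazon',
        productTitle: $('span#productTitle').text().trim(),
        reviewScore: $('span#acrPopover').attr('title'),
    });
});

router.addHandler('EBAY_PRODUCT', async ({ $, request }) => {
    await Dataset.pushData({
        link: request.url,
        storeName: 'eBay',
        productTitle: $('h3.s-item__title').text().trim(),
        salePrice: $('span.s-item__price').text().trim(),
    });
});
```

With this layout `main.js` passes `requestHandler: router`, the start URLs keep the `AMAZON` / `EBAY` labels exactly as you already have them, and neither `combinedRouter.js` nor the separate `router2` file is needed.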