/**
 * KOTN_DETAIL handler: collects product links from the current page, stores
 * the de-duplicated list in the dataset, enqueues every link on the `kotnPw`
 * crawler under the `KOTN_PRODUCT` label, and starts `kotnPw` after the first
 * batch of links has been queued.
 *
 * Links are resolved with the WHATWG `URL` parser against the kotn.com origin,
 * so both relative (`/products/x`) and absolute (`https://www.kotn.com/products/x`)
 * hrefs are accepted. Links pointing at other hosts are ignored.
 */
kotnRouter.addHandler('KOTN_DETAIL', async ({ log, page, parseWithCheerio }) => {
    log.info(`Scraping product URLs`);

    // Scroll before parsing: anchors added while scrolling are only visible
    // to Cheerio if the snapshot is taken afterwards.
    await infiniteScroll(page, {
        maxScrollHeight: 0,
    });

    const $ = await parseWithCheerio();
    const productUrls = new Set<string>();

    $('a').each((_, el) => {
        const href = $(el).attr('href');
        if (!href) {
            return;
        }

        let resolved: URL;
        try {
            resolved = new URL(href, 'https://www.kotn.com');
        } catch {
            // Malformed href: nothing to enqueue.
            return;
        }

        const isWebLink = resolved.protocol === 'https:' || resolved.protocol === 'http:';
        const isKotnHost = resolved.hostname === 'kotn.com' || resolved.hostname.endsWith('.kotn.com');
        if (isWebLink && isKotnHost && resolved.href.includes('/products')) {
            productUrls.add(resolved.href);
        }
    });

    // Push unique URLs to the dataset
    const uniqueProductUrls = [...productUrls];
    await Dataset.pushData({
        urls: uniqueProductUrls,
    });

    // One batched call instead of one addRequests call per link.
    await kotnPw.addRequests(uniqueProductUrls.map((url) => ({ url, label: 'KOTN_PRODUCT' })));
    linksCount += uniqueProductUrls.length;

    console.log(uniqueProductUrls);
    console.log(`Total product links scraped so far: ${linksCount}`);

    // Run the kotnPw crawler once, right after the first product requests were
    // pushed: only then does the running total equal the current batch size.
    if (linksCount === uniqueProductUrls.length) {
        await kotnPw.run();
    }
});
// Start `crawler` with `startUrls` and wait for the run to finish.
await crawler.run(startUrls)
ERROR CheerioCrawler:AutoscaledPool: isTaskReadyFunction failed [Error: ENOENT: no such file or directory, open 'C:\Users\haris\OneDrive\Documents\GitHub\periodicScraper01\pscrape\storage\request_queues\default\1Rk4szfVGlTLik4.json'] { errno: -4058, code: 'ENOENT', syscall: 'open', path: 'C:\\Users\\haris\\OneDrive\\Documents\\GitHub\\periodicScraper01\\pscrape\\storage\\request_queues\\default\\1Rk4szfVGlTLik4.json' }
// Text of the displayed price element (`span.price-value`), trimmed.
let salePrice = await page.$eval('span.price-value', (el) => el.textContent?.trim() || '');

// Text of the `span.price-ns` element, or '' when the page has none.
// `$$eval` is used because `$eval` rejects when no element matches, which
// would throw before the early return below could run.
let newTag = await page.$$eval('span.price-ns', (els) => els[0]?.textContent?.trim() || '');

// Without a `span.price-ns` value there is nothing further to do here.
if (!newTag) {
    return;
}

// When present, the `span.price-ns` text is used as the original price.
let originalPrice = newTag;
/**
 * KOTN_DETAIL handler: reads every anchor href from the live page, keeps the
 * ones that point at kotn.com product pages, stores the de-duplicated list in
 * the dataset and enqueues each link on `kotnCrawler` under the
 * `KOTN_PRODUCT` label.
 *
 * Hrefs are resolved with the WHATWG `URL` parser against the kotn.com origin,
 * so relative and absolute product links are both accepted. Links to other
 * hosts are ignored.
 */
kotnRouter.addHandler('KOTN_DETAIL', async ({ page, log }) => {
    log.info('Scraping product URLs');

    // NOTE(review): this reloads the URL the page is already on; presumably the
    // crawler has navigated before calling the handler — confirm it is needed.
    await page.goto(page.url(), { waitUntil: 'domcontentloaded' });

    const hrefs = await page.$$eval('a', (elements) =>
        elements.map((el) => el.getAttribute('href'))
    );

    const productUrls = new Set<string>();
    for (const href of hrefs) {
        if (!href) {
            continue;
        }

        let resolved: URL;
        try {
            resolved = new URL(href, 'https://www.kotn.com');
        } catch {
            // Malformed href: nothing to enqueue.
            continue;
        }

        const isWebLink = resolved.protocol === 'https:' || resolved.protocol === 'http:';
        const isKotnHost = resolved.hostname === 'kotn.com' || resolved.hostname.endsWith('.kotn.com');
        if (isWebLink && isKotnHost && resolved.href.includes('/products')) {
            productUrls.add(resolved.href);
        }
    }

    // Push unique URLs to the dataset
    const uniqueProductUrls = [...productUrls];
    await Dataset.pushData({
        urls: uniqueProductUrls,
    });

    // One batched call instead of one addRequests call per link.
    await kotnCrawler.addRequests(
        uniqueProductUrls.map((url) => ({ url, label: 'KOTN_PRODUCT' }))
    );
    linksCount += uniqueProductUrls.length;

    console.log(uniqueProductUrls);
    console.log(`Total product links scraped so far: ${linksCount}`);
});
// Supplies both `router` and `router2` as the `requestHandler` option.
// NOTE(review): Crawlee documents `requestHandler` as a single function —
// confirm an array is accepted, or register all routes on one router.
requestHandler: [router, router2]
// Supplies both `router` and `router2` as the `requestHandler` option.
// NOTE(review): Crawlee documents `requestHandler` as a single function —
// confirm an array is accepted, or register all routes on one router.
requestHandler: [router, router2]