The crawler keeps returning HTTP 403 even when using a rotating proxy pool.
Source code:
import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee';
import proxy from './proxy_config.js';
// PlaywrightCrawler crawls the web using a headless browser controlled by the Playwright library.
// Proxy settings handed to the crawler.
// The URL is assembled through the WHATWG URL API instead of plain string
// interpolation: the `username`/`password` setters percent-encode reserved
// characters (such as '@', ':' or '/'), which would otherwise yield a
// malformed or mis-parsed proxy URL.
const proxyUrl = new URL(`http://${proxy.host}:${proxy.port}`);
proxyUrl.username = proxy.username;
proxyUrl.password = proxy.password;

// NOTE: only a single proxy URL is listed here, so there is nothing for this
// configuration to rotate between; any IP rotation has to happen behind that
// one endpoint.
const proxyConfiguration = new ProxyConfiguration({
  proxyUrls: [proxyUrl.href],
});
// Processes a single crawled page: logs its title, stores the result and
// queues the links found on the page.
const handlePage = async (context) => {
  const { request, page, enqueueLinks, pushData, log } = context;
  const title = await page.title();
  const url = request.loadedUrl;
  log.info(`Title of ${url} is '${title}'`);
  // Results are saved as JSON to the `./storage/datasets/default` directory.
  await pushData({ title, url });
  // Add the links extracted from the current page to the crawling queue.
  await enqueueLinks();
};

// Crawler instance: routes traffic through `proxyConfiguration` and hands
// every crawled page to `handlePage`.
const crawler = new PlaywrightCrawler({
  // Remove this limit to scrape the full website.
  maxRequestsPerCrawl: 20,
  // Add `headless: false` here to see the browser window.
  proxyConfiguration,
  requestHandler: handlePage,
});
// Seed the queue with the first URL and start the crawl.
const startUrls = ['https://nopecha.com/demo/cloudflare'];
await crawler.run(startUrls);

// Export the whole dataset to a single file, `./result.csv`.
await crawler.exportData('./result.csv');

// The collected data can also be used directly; here it is printed as a table.
const { items } = await crawler.getData();
console.table(items);