// Crawl a site with a headless Playwright browser (via crawlee) behind an
// authenticated HTTP proxy, logging each page title, saving results to the
// default dataset, and exporting everything to ./result.csv.
import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee';
import proxy from './proxy_config.js';

// Build the proxy URL. Credentials are URL-encoded so reserved characters
// (e.g. '@', ':', '/') in the username or password cannot corrupt the
// authority component of the URL.
const proxyUrl = `http://${encodeURIComponent(proxy.username)}:${encodeURIComponent(proxy.password)}@${proxy.host}:${proxy.port}`;
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: [proxyUrl] });

// PlaywrightCrawler crawls the web using a headless browser controlled by
// the Playwright library.
const crawler = new PlaywrightCrawler({
  proxyConfiguration,
  // Use the requestHandler to process each of the crawled pages.
  async requestHandler({ request, page, enqueueLinks, pushData, log }) {
    const title = await page.title();
    log.info(`Title of ${request.loadedUrl} is '${title}'`);

    // Save results as JSON to the `./storage/datasets/default` directory.
    await pushData({ title, url: request.loadedUrl });

    // Extract links from the current page and add them to the crawling queue.
    await enqueueLinks();
  },
  // Uncomment this option to see the browser window.
  // headless: false,

  // Comment this option out to scrape the full website.
  maxRequestsPerCrawl: 20,
});

// Add the first URL to the queue and start the crawl.
await crawler.run(['https://nopecha.com/demo/cloudflare']);

// Export the whole dataset to a single file in `./result.csv`.
await crawler.exportData('./result.csv');

// Or work with the data directly.
const data = await crawler.getData();
console.table(data.items);