Hi, I'm looking to introduce website crawling into an existing workflow that doesn't suit batch processing, i.e. I want to scrape each website, get its result, and then do some further processing downstream. I do have this working with the code attached below; however, I imagine there's a better way to achieve this, given that I'll be processing up to 500 websites concurrently, and my main concern is memory allocation.
import { PlaywrightCrawler } from "crawlee";

// AddressSet is our own helper class (defined elsewhere) for collecting and deduplicating results.
export async function crawlWebsiteForAddresses(url: string) {
  // Matches UK postcodes, e.g. "SW1A 1AA".
  const ukPostcodeRegex = /\b([A-Z]{1,2}[0-9][A-Z0-9]?)\s?([0-9][A-Z]{2})\b/;
  const addressSet = new AddressSet();

  const crawler = new PlaywrightCrawler({
    requestHandler: async ({ request, page, enqueueLinks, log }) => {
      const content = await page.content();
      const postcodeMatch = content.match(ukPostcodeRegex);
      if (postcodeMatch) {
        const postcode = postcodeMatch[0];
        log.info(`UK postcode found on ${request.url}: ${postcode}`);
        // Locators are always truthy, so check that a matching element actually exists.
        const addressElement = page.locator(`text=${postcode}`).first();
        if ((await addressElement.count()) > 0) {
          const parentTextContent = await addressElement.evaluate((el) => el.parentElement?.textContent ?? "");
          log.info(`Address found for postcode ${postcode}: ${parentTextContent}`);
          addressSet.add({ postcode, addressText: parentTextContent });
        }
      }
      await enqueueLinks();
    },
    maxRequestsPerCrawl: 500,
  });

  await crawler.run([url]);
  await crawler.teardown();
  return addressSet;
}
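For context, this is roughly how I'm planning to drive it downstream. It's only a minimal sketch: the import path, CONCURRENCY_LIMIT, and processAddresses are placeholders for my real pipeline, and the chunking is just a simple way to cap how many PlaywrightCrawler instances (and browser processes) exist at once.

// Sketch only: crawl many sites without launching all 500 crawls at the same time.
// The import path, CONCURRENCY_LIMIT, and processAddresses are placeholders, not real pipeline code.
import { crawlWebsiteForAddresses } from "./crawlWebsiteForAddresses";

const CONCURRENCY_LIMIT = 10; // how many crawls run simultaneously

async function processAddresses(url: string, addresses: unknown) {
  // Placeholder for the real downstream processing.
  console.log(url, addresses);
}

export async function crawlAllSites(urls: string[]) {
  // Process the URL list in fixed-size chunks so at most CONCURRENCY_LIMIT
  // crawls (and their browsers) are alive at any point in time.
  for (let i = 0; i < urls.length; i += CONCURRENCY_LIMIT) {
    const chunk = urls.slice(i, i + CONCURRENCY_LIMIT);
    await Promise.all(
      chunk.map(async (url) => {
        const addresses = await crawlWebsiteForAddresses(url);
        await processAddresses(url, addresses);
      }),
    );
  }
}

Is there a more idiomatic way to do this per-site crawl-and-return pattern, or a better way to keep memory under control than capping concurrency like this?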