Apify and Crawlee Official Forum


Scrape/crawl transactional rather than batch

Hi, I'm looking to introduce website crawling into an existing workflow that doesn't suit batch processing, i.e. I want to scrape each website, get the result, and do some further processing downstream. I do have this working with the code attached; however, I imagine there's a better way to achieve this, given I'll be processing up to 500 websites concurrently, and my concern is memory allocation.

TypeScript
import { PlaywrightCrawler } from "crawlee";

export async function crawlWebsiteForAddresses(url: string) {
  // Matches UK postcodes such as "SW1A 1AA" (outward code + inward code).
  const ukPostcodeRegex = /\b([A-Z]{1,2}[0-9][A-Z0-9]?)\s?([0-9][A-Z]{2})\b/;
  const addressSet = new AddressSet();

  const crawler = new PlaywrightCrawler({
    requestHandler: async ({ request, page, enqueueLinks, log }) => {
      const content = await page.content();

      const postcodeMatch = content.match(ukPostcodeRegex);
      if (postcodeMatch) {
        const postcode = postcodeMatch[0];
        log.info(`UK postcode found on ${request.url}: ${postcode}`);
        const addressElement = page.locator(`text=${postcode}`).first();

        // locator() always returns a Locator object (truthy even when nothing
        // matches), so check that the element actually exists before evaluating.
        if ((await addressElement.count()) > 0) {
          const parentTextContent = await addressElement.evaluate(
            (el) => el.parentElement?.textContent ?? "",
          );
          log.info(`Address found for postcode ${postcode}: ${parentTextContent}`);
          addressSet.add({ postcode, addressText: parentTextContent });
        }
      }

      await enqueueLinks();
    },
    maxRequestsPerCrawl: 500,
  });

  await crawler.run([url]);
  await crawler.teardown();
  return addressSet;
}
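
For the 500-site fan-out, a bounded worker pool keeps memory use predictable: instead of launching all crawls at once, only a fixed number of crawler (and therefore browser) instances are alive at any time, while each finished crawl's result can flow straight into your downstream processing. Below is a minimal sketch; the pool size of 5 and the runPool helper are illustrative assumptions, not part of Crawlee's API.

TypeScript
// Minimal sketch of a bounded worker pool, reusing crawlWebsiteForAddresses
// from above. The concurrency of 5 and the helper name are assumptions.
async function runPool<T, R>(
  items: T[],
  worker: (item: T) => Promise<R>,
  concurrency: number,
): Promise<R[]> {
  const results: R[] = new Array(items.length);
  let next = 0;

  // Each "lane" claims the next unprocessed item until the list is exhausted,
  // so at most `concurrency` crawls run at once. The claim (next++) is safe
  // because there is no await between the check and the increment.
  const lanes = Array.from({ length: concurrency }, async () => {
    while (next < items.length) {
      const index = next++;
      results[index] = await worker(items[index]);
    }
  });

  await Promise.all(lanes);
  return results;
}

// Usage (in an ES module): crawl many sites, but never more than 5 at a time.
const websites: string[] = [/* ...up to 500 URLs... */];
const allAddresses = await runPool(websites, crawlWebsiteForAddresses, 5);

Note that each crawler also self-limits internally: Crawlee's autoscaled pool (and the maxConcurrency crawler option) throttles requests when memory gets tight. The pool above only caps how many crawler instances exist at once, which is where most of the memory goes when running Playwright.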