INFO CheerioCrawler: Starting the crawler.
INFO CheerioCrawler: Crawler reached the maxRequestsPerCrawl limit of 50 requests and will shut down soon. Requests that are in progress will be allowed to finish.
INFO CheerioCrawler: Earlier, the crawler reached the maxRequestsPerCrawl limit of 50 requests and all requests that were in progress at that time have now finished. In total, the crawler processed 60 requests and will shut down.
INFO CheerioCrawler: Final request statistics:
import {
  CheerioCrawler,
  createCheerioRouter,
  type ProxyConfiguration,
} from "crawlee";
import { createDbClient } from "../../db/CreateDbClient.ts";

// Base class for HTML scrapers. Subclasses register route handlers on the
// shared router and push parsed rows into `jobs` for a batched upsert.
export abstract class HTMLScraper {
  protected router: ReturnType<typeof createCheerioRouter>;
  protected crawler: CheerioCrawler;
  protected jobs: Job[] = [];
  public startUrls: string[] = [];

  constructor(options: ScraperOptions) {
    this.router = createCheerioRouter();
    this.crawler = new CheerioCrawler({
      proxyConfiguration: options.proxyConfiguration,
      requestHandler: this.router,
      maxRequestsPerCrawl: options.maxRequestsPerCrawl,
      maxRequestRetries: 2,
      maxRequestsPerMinute: 60,
    });
    this.startUrls = options.startUrls;
  }

  public async run(): Promise<void> {
    await this.crawler.run(this.startUrls);
  }

  // Subclasses wire up their route handlers and site-specific parsing here.
  abstract initialize(): void;
  abstract handleDetailPage(): void;
  abstract DateToTimestamp(date: string): string;

  // Upsert the collected jobs into the "Jobs" table, skipping rows whose
  // content hash already exists.
  async saveJobs(): Promise<void> {
    const db = createDbClient();
    try {
      const { error } = await db.from("Jobs").upsert(this.jobs, {
        onConflict: "hash",
        ignoreDuplicates: true,
      });
      if (error) {
        throw error;
      }
    } catch (error) {
      console.error(error);
    }
  }
}

export interface ScraperOptions {
  startUrls: string[];
  proxyConfiguration: ProxyConfiguration;
  maxRequestsPerCrawl: number;
}

export interface Job {
  title: string;
  url: string;
  hash: string;
  source: string;
  company: string | null;
  location: string | null;
  salary: string | null;
  posted_at: string;
  ends_at: string | null;
}
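Since HTMLScraper is abstract, a concrete subclass has to supply the route handlers and date parsing. The following is a minimal sketch of what that could look like: the board URL (https://example.com/jobs), the CSS selectors (a.job-link, .company, .location, .posted), the proxy URL, and the SHA-256-of-URL hash scheme are all hypothetical stand-ins, not part of the original code.

import { ProxyConfiguration } from "crawlee";
import { createHash } from "node:crypto";

class ExampleBoardScraper extends HTMLScraper {
  initialize(): void {
    // Default handler runs on the listing page and enqueues detail pages.
    this.router.addDefaultHandler(async ({ enqueueLinks }) => {
      await enqueueLinks({
        selector: "a.job-link", // hypothetical selector for job links
        label: "DETAIL",
      });
    });
    this.handleDetailPage();
  }

  handleDetailPage(): void {
    this.router.addHandler("DETAIL", async ({ request, $ }) => {
      this.jobs.push({
        title: $("h1").text().trim(),
        url: request.loadedUrl ?? request.url,
        // Hypothetical dedup key: hash of the request URL.
        hash: createHash("sha256").update(request.url).digest("hex"),
        source: "example-board",
        company: $(".company").text().trim() || null,
        location: $(".location").text().trim() || null,
        salary: null,
        posted_at: this.DateToTimestamp($(".posted").text().trim()),
        ends_at: null,
      });
    });
  }

  DateToTimestamp(date: string): string {
    // Assumes the site prints dates that Date can parse directly.
    return new Date(date).toISOString();
  }
}

const scraper = new ExampleBoardScraper({
  startUrls: ["https://example.com/jobs"],
  proxyConfiguration: new ProxyConfiguration({
    proxyUrls: ["http://proxy.example.com:8000"], // hypothetical proxy
  }),
  maxRequestsPerCrawl: 50,
});
scraper.initialize();
await scraper.run();
await scraper.saveJobs();

Note that initialize() must run before run(), since the crawler's requestHandler is the router and a request with no matching handler would fail.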