My scraper uses BullMQ, which retrieves jobs (URLs) from the job queue and runs them with CheerioCrawler.
Is there any way to initialize the crawler once and keep reusing it? I assume this would also consume fewer resources and improve performance. (I put a rough sketch of what I mean after my current code below.)
If there are any best practices that I have not implemented, I would love to hear about them.
// worker.ts
import { Worker } from 'bullmq';
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';
import Redis from 'ioredis';
import { router } from './router';
import dotenv from 'dotenv';
dotenv.config();
console.log("REDIS_URL_JOB_QUEUE", process.env.REDIS_URL_JOB_QUEUE)
const connection = new Redis(process.env.REDIS_URL_JOB_QUEUE || '', {
maxRetriesPerRequest: null
}); // Connect to a local Redis instance
const proxy = process.env?.PROXY_URL || '';
console.log('proxy', proxy)
const proxyConfiguration = new ProxyConfiguration({
  proxyUrls: [proxy],
});
const crawler = new CheerioCrawler({
  proxyConfiguration,
  requestHandler: router,
});
const scraperWorker = new Worker(
  'scraper',
  async (job) => {
    const url: string = job.data.url;
    try {
      // await crawler.addRequests([url]);
      await crawler.run([
        {
          label: 'PRODUCT',
          url,
        },
      ]);
      // If everything went well, return a result
      return { result: 'success' };
    } catch (error) {
      // If something went wrong, throw an error
      console.error(`Scrape of ${url} failed with error ${error.message}`);
      throw error;
    }
  },
  {
    connection,
    limiter: {
      max: 2,         // max number of jobs to process...
      duration: 5000, // ...per 5,000 ms window
    },
  },
);
scraperWorker.on('completed', (job, result) => {
  console.log(`Job ${job.id} completed with result ${result.result}`);
});
scraperWorker.on('failed', (job, err) => {
  if (!job) return console.log('Job not found');
  console.log(`Job ${job.id} failed with error ${err.message}`);
});
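
For context, this is roughly the direction I was imagining for initializing the crawler once: start a single long-lived CheerioCrawler with keepAlive: true and push URLs into it with crawler.addRequests() from the BullMQ handler. This is only an untested sketch based on my reading of the Crawlee docs, and I'm not sure keepAlive and addRequests are meant to be combined this way (proxy setup omitted for brevity).

// worker-keepalive-sketch.ts (untested sketch, not my working code)
import { Worker } from 'bullmq';
import { CheerioCrawler } from 'crawlee';
import Redis from 'ioredis';
import { router } from './router';

const connection = new Redis(process.env.REDIS_URL_JOB_QUEUE || '', {
  maxRetriesPerRequest: null,
});

// Create the crawler once. Assumption: keepAlive keeps run() from resolving
// when the request queue drains, so it can keep accepting new requests.
const crawler = new CheerioCrawler({
  requestHandler: router,
  keepAlive: true,
});

// Start the crawl loop once in the background (intentionally not awaited).
void crawler.run();

const scraperWorker = new Worker(
  'scraper',
  async (job) => {
    const url: string = job.data.url;
    // Feed the already-running crawler instead of calling crawler.run() per job.
    await crawler.addRequests([{ url, label: 'PRODUCT' }]);
    return { result: 'queued' };
  },
  { connection },
);

One thing I'm unsure about with this approach is that the BullMQ job would complete as soon as the request is queued rather than when the page is actually scraped, so results and failures would have to be reported back per request in some other way.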