Note that RequestList can be used together with RequestQueue by the same crawler. In such cases, each request from RequestList is enqueued into RequestQueue first and then consumed from the latter. This is necessary to avoid the same URL being processed more than once (from the list first and then possibly from the queue). In practical terms, such a combination can be useful when there is a large number of initial URLs, but more URLs would be added dynamically by the crawler.
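For illustration, a minimal sketch of that combination, assuming Crawlee's JavaScript API; the list name, URLs, and crawler class here are just placeholders:

```js
import { CheerioCrawler, RequestList, RequestQueue } from 'crawlee';

// Static start URLs come from the list; dynamically discovered URLs go to the queue.
const requestList = await RequestList.open('start-urls', [
    'https://example.com/category-1',
    'https://example.com/category-2',
]);
const requestQueue = await RequestQueue.open();

const crawler = new CheerioCrawler({
    requestList,
    requestQueue,
    async requestHandler({ request, enqueueLinks }) {
        console.log(`Processing ${request.url}`);
        // Links found at runtime are added to the queue and consumed from there.
        await enqueueLinks();
    },
});

await crawler.run();
```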
Is `addRequest` the only thing that your express.js endpoint does? How many requests are you adding at once this way?

```js
app.post('/scrape', async (req, res) => {
    try {
        var startUrls = [];
        const AsinData = req.body.AsinList;
        if (typeof AsinData === 'undefined') {
            return res.status(400).json({ error: 'AsinList is undefined' });
        }
        if (AsinData.length === 0) {
            return res.status(400).json({ error: 'AsinList is empty' });
        } else {
            const regex = /^[A-Z0-9]{10}$/;
            const isValid = AsinData.every((item) => regex.test(item));
            if (isValid) {
                console.log("All items in the list meet the criteria.");
            } else {
                return res.status(400).json({ error: 'All ASINS should match the patterns e.g. B0BM4ZPNV1' });
            }
        }
        const queue = await RequestQueue.open("test");
        console.log("Asin data", AsinData);
        await AsinData.forEach((ASIN) => {
            var url_per_asin = {
                url: `${BASE_URL}/gp/ajax/asin=${ASIN}`,
                userData: { label: 'test', keyword: ASIN },
                uniqueKey: uuidv4()
            };
            queue.addRequests(url_per_asin);
            startUrls.push(url_per_asin);
        });
        return res.send("Fetch started..");
    } catch (error) {
        // Handle any errors that occur
        console.error(error);
        res.status(500).send('Internal server error');
    }
});
```
You call `queue.addRequests` inside `await AsinData.forEach`. I would just accumulate the requests into a list and then do a single `queue.addRequests` call after the `.forEach`. Not sure why `startUrls` is there and what its purpose is. You may also use `console.time` and `console.timeEnd` (with unique labels for each request) to investigate what is causing the long times: https://developer.mozilla.org/en-US/docs/Web/API/console/timeEnd
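A rough sketch of that suggestion applied to the endpoint above, assuming the same `BASE_URL`, `uuidv4`, `AsinData`, and `queue` from the original snippet:

```js
// Build all requests first, then enqueue them in one batch call.
const requests = AsinData.map((ASIN) => ({
    url: `${BASE_URL}/gp/ajax/asin=${ASIN}`,
    userData: { label: 'test', keyword: ASIN },
    uniqueKey: uuidv4(),
}));

console.time('addRequests');        // start a timer with a unique label
await queue.addRequests(requests);  // one awaited call instead of one per ASIN
console.timeEnd('addRequests');     // log how long the batch insert took
```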