async (crawlingContext, gotoOptions) => { const { page, request, crawler } = crawlingContext const queue = await crawler.getRequestQueue() const crawler_dto = request.userData.crawler_dto if (!request.url.endsWith('.pdf')) { gotoOptions.waitUntil = 'networkidle2' gotoOptions.timeout = 20000 await page.setBypassCSP(true) await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8', }) await page.setViewport({ width: 1440, height: 900 }) } page.on('response', async (page_response) => { if (page_response.headers()['content-type'] === 'application/pdf') { gotoOptions.timeout = 1 } }) },
router.addDefaultHandler
doesn't get called for 20 seconds... or at all? The request times out (since it's an iframe-type PDF).
import { NonRetryableError } from 'crawlee'; preNavigationHooks: [ async ({ page }) => { page.on('response', async (page_response) => { if (page_response.headers()['content-type'] === 'application/pdf') { throw new NonRetryableError('PDFs are not supported'); } }); }, ]
page.waitForResponse
networkidle2
in requestHandler. There is no way to stop the page navigation in the middle.