// Attach the plugin produced by stealthPlugin() to the chromium launcher that
// the crawler is configured with below.
chromium.use(stealthPlugin());

// Dispatch each request to a handler chosen by its label.
const router = createPlaywrightRouter();
router.addHandler(requestLabels.SPIDER, spiderDiscoveryHandlerFactory(container));
router.addHandler(requestLabels.ARTICLE, articleHandlerFactory(container));

// Storage is kept in a directory named after the message being processed.
const crawlStorage = new MemoryStorage({
  localDataDirectory: `./storage/${message.messageId}`,
  writeMetadata: true,
  persistStorage: true,
});

const config = new Configuration({
  storageClient: crawlStorage,
  persistStateIntervalMillis: 5000,
  persistStorage: true,
  purgeOnStart: false,
  headless: false,
});

// Only a positive maxRequests is forwarded; otherwise the option is left
// undefined (presumably "no limit" in Crawlee — confirm against the docs).
const requestCap =
  body.config.maxRequests > 0 ? body.config.maxRequests : undefined;

const crawler = new PlaywrightCrawler(
  {
    launchContext: { launcher: chromium },
    requestHandler: router,
    // The first argument (the crawling context) is intentionally unused; only
    // the error's name and message are logged.
    errorHandler: (_context, error) => {
      logger.error(`${error.name}\n${error.message}`);
    },
    maxRequestsPerCrawl: requestCap,
    useSessionPool: true,
    persistCookiesPerSession: true,
  },
  config,
);
import { PlaywrightCrawler, sleep } from 'crawlee';
import path from 'path';

// Debug crawler: follows "serve-file" links and, when the first response of a
// navigation carries a Content-Disposition header, saves the resulting
// browser download to ./storage/downloads instead of treating it as a failure.
const crawler = new PlaywrightCrawler({
  headless: false, // Run in headful mode for debugging

  /** Logs the session's cookies for the current URL, then enqueues matching file links. */
  async requestHandler({ request, enqueueLinks, session }) {
    console.log(session?.getCookies(request.url));
    await enqueueLinks({ globs: ['https://dca-global.org/serve-file/**'] });
  },

  /**
   * Requests flagged as attachments by the pre-navigation hook are not real
   * failures, so they are marked as non-retryable.
   */
  async errorHandler({ request }) {
    if (request.userData['attachment']) {
      console.log('download not error');
      request.noRetry = true;
    }
  },

  preNavigationHooks: [
    /**
     * Inspects the first response seen on the page. If it belongs to the
     * requested URL and has a Content-Disposition header, the request is
     * flagged as an attachment and the download is written to disk.
     */
    async (crawlingContext) => {
      const { page, request } = crawlingContext;

      page.once('response', async (resp) => {
        // This listener's promise is never awaited by anyone, so every
        // failure has to be handled here; otherwise a download timeout or a
        // write error would surface as an unhandled promise rejection.
        try {
          const disposition = await resp.headerValue('content-disposition');
          if (!disposition || request.url !== resp.request().url()) {
            return;
          }

          request.userData['attachment'] = true;

          // NOTE(review): the 'download' event could in principle fire while
          // the header lookup above is still pending — confirm this ordering
          // is reliable for the target site.
          const download = await page.waitForEvent('download');

          // The suggested name is derived from server-controlled data; keep
          // only its base name so it cannot escape the downloads directory.
          const fileName = path.basename(download.suggestedFilename());
          await download.saveAs(path.join('./storage/downloads', fileName));
        } catch (err: unknown) {
          console.error(`failed to save download for ${request.url}`, err);
        }
      });
    },
  ],

  sessionPoolOptions: {
    maxPoolSize: 1,
  },
});

const startUrls = ['https://dca-global.org/file/view/12756/interact-case-study-cedaci'];
await crawler.addRequests(startUrls);
await crawler.run();
/** Result produced when a navigation turns out to be a file download. */
type downloadPromise = {
  /** Concatenated contents read from the download's stream. */
  data: Buffer
  /** File name reported by the download's suggestedFilename(). */
  suggestedName: string
}

  /** Enqueues every link that matches the serve-file glob. */
  async requestHandler({ enqueueLinks }) {
    await enqueueLinks({ globs: ['https://dca-global.org/serve-file/**'] });
  },

  /**
   * Distinguishes a genuine navigation error from a navigation that "failed"
   * only because the URL served a file download. Awaits the promise stored by
   * the pre-navigation hook and logs which of the two cases occurred.
   */
  async errorHandler({ request }) {
    if (!request.userData['download']) {
      return;
    }

    console.log('checking for download');
    try {
      const download = (await request.userData['download']) as downloadPromise | undefined;
      if (download) {
        console.log('it was a file download not an error');
      } else {
        // Previously this branch had no `else` and therefore ran on every
        // path, including after a successful download.
        console.log('no download, was actually an error');
      }
    } catch (err) {
      console.log('download failed');
      console.error(err);
    }
  },

  preNavigationHooks: [
    /**
     * Starts watching the page before navigation and stores, under
     * userData['download'], a promise that resolves to the downloaded file
     * (or undefined when the first response is not an attachment for the
     * requested URL) and rejects when capturing the download fails.
     */
    async (crawlingContext) => {
      const { page, request } = crawlingContext;

      const captureDownload = async (): Promise<downloadPromise | undefined> => {
        const response = await page.waitForEvent('response');
        const disposition = await response.headerValue('content-disposition');
        if (!disposition || request.url !== response.request().url()) {
          return undefined;
        }

        const download = await page.waitForEvent('download');
        const stream = await download.createReadStream();

        // Only the callback-based stream is wrapped in a Promise constructor.
        return new Promise<downloadPromise>((resolve, reject) => {
          const chunks: Buffer[] = [];

          const timer = setTimeout(() => {
            // Stop reading; nobody will consume the remaining data.
            stream.destroy();
            reject(new Error('download not complete after 15 seconds'));
          }, 15000);

          stream.on('data', (chunk: Buffer) => {
            chunks.push(chunk);
          });
          stream.on('error', (err: Error) => {
            // Fail immediately instead of waiting for the timeout.
            clearTimeout(timer);
            reject(err);
          });
          stream.on('end', () => {
            // Clear the timer so it does not linger after success.
            clearTimeout(timer);
            resolve({
              data: Buffer.concat(chunks),
              suggestedName: download.suggestedFilename(),
            });
          });
        });
      };

      const pending = captureDownload();
      // errorHandler only awaits this promise when the request fails. Mark
      // rejections as handled so a failure on an otherwise successful request
      // does not become an unhandled rejection. The stored promise itself
      // still rejects for errorHandler.
      pending.catch(() => undefined);

      // NOTE(review): userData is normally expected to hold serializable
      // data; a Promise only survives in memory — confirm this is acceptable
      // if the request is persisted or retried.
      request.userData['download'] = pending;
    },
  ],