// Save every image response the page receives into the downloads/ directory.
// Each file is named after this.request.userData plus the matched extension,
// and `counter` (declared outside this snippet) is incremented per saved file.
this.page.on('response', async (response) => {
  // Match against the URL path only, so that a query string or fragment
  // (e.g. "logo.png?v=3") does not defeat the end-of-string anchor.
  let pathname;
  try {
    ({ pathname } = new URL(response.url()));
  } catch {
    // Not parseable as a URL: fall back to the raw string.
    pathname = response.url();
  }

  const matches = /.*\.(jpg|png|svg|gif)$/.exec(pathname);
  console.log(matches);
  if (!matches) {
    return;
  }

  const extension = matches[1];
  let buffer;
  try {
    buffer = await response.buffer();
  } catch (err) {
    // Reading the body can reject (e.g. the browser no longer holds it).
    // Report and skip instead of leaving an unhandled rejection in the handler.
    console.error(`Could not read response body of ${response.url()}:`, err);
    return;
  }

  // No encoding argument: it is ignored when the data is already a Buffer.
  fs.writeFileSync(`downloads/${this.request.userData}.${extension}`, buffer);
  counter += 1;
});
drop()
on it, it simply fails with: Request queue with id: 7ae80a2d-3b06-4a8f-929d-4fbfc5947e81 does not exist.
// Save every image response the page receives into the downloads/ directory.
// Each file is named after this.request.userData plus the matched extension,
// and `counter` (declared outside this snippet) is incremented per saved file.
this.page.on('response', async (response) => {
  // Match against the URL path only, so that a query string or fragment
  // (e.g. "logo.png?v=3") does not defeat the end-of-string anchor.
  let pathname;
  try {
    ({ pathname } = new URL(response.url()));
  } catch {
    // Not parseable as a URL: fall back to the raw string.
    pathname = response.url();
  }

  const matches = /.*\.(jpg|png|svg|gif)$/.exec(pathname);
  console.log(matches);
  if (!matches) {
    return;
  }

  const extension = matches[1];
  let buffer;
  try {
    buffer = await response.buffer();
  } catch (err) {
    // Reading the body can reject (e.g. the browser no longer holds it).
    // Report and skip instead of leaving an unhandled rejection in the handler.
    console.error(`Could not read response body of ${response.url()}:`, err);
    return;
  }

  // No encoding argument: it is ignored when the data is already a Buffer.
  fs.writeFileSync(`downloads/${this.request.userData}.${extension}`, buffer);
  counter += 1;
});
drop()
on it, it simply fails with: Request queue with id: 7ae80a2d-3b06-4a8f-929d-4fbfc5947e81 does not exist.
// Enable request interception, then let each request through or cancel it.
await page.setRequestInterception(true);
page.on('request', async (request) => {
  // `ok` is not defined in this snippet; it is expected to come from the
  // surrounding code and decides whether the request may proceed.
  if (ok) {
    await request.continue();
  } else {
    await request.abort();
  }
}); // closes the page.on(...) call, which the original left unterminated
// Enable request interception, then let each request through or cancel it.
await page.setRequestInterception(true);
page.on('request', async (request) => {
  // `ok` is not defined in this snippet; it is expected to come from the
  // surrounding code and decides whether the request may proceed.
  if (ok) {
    await request.continue();
  } else {
    await request.abort();
  }
}); // closes the page.on(...) call, which the original left unterminated
import { ProxyConfiguration } from 'crawlee';
import { SMART_PROXY_DATACENTER_IPS } from '../utils/proxies.js';
import ApplicationRouter from './ApplicationRouter.js';

/**
 * Router that visits ifconfig.co a few times and logs the text of the
 * `#output` element, using the proxy list from SMART_PROXY_DATACENTER_IPS.
 *
 * NOTE(review): when and how these methods are invoked is decided by
 * ApplicationRouter, which is not visible here.
 */
export default class TestProxies extends ApplicationRouter {
  /** Sets the version, name prefix and dataset name used by this router. */
  async setup() {
    this.version = 1;
    this.prefix = 'TestProxies';
    this.datasetName = `${this.prefix}_dataset_V${this.version}`;
  }

  /**
   * @returns {Promise<Array<{url: string, label: string}>>} The start requests.
   *   The URLs differ only in the `a` query parameter.
   */
  async getInitialPages() {
    return [
      { url: "https://ifconfig.co/?a=1", label: "page" },
      { url: "https://ifconfig.co/?a=2", label: "page" },
      { url: "https://ifconfig.co/?a=3", label: "page" },
      { url: "https://ifconfig.co/?a=4", label: "page" },
    ];
  }

  /** @returns {string} Name of the request queue, derived from the prefix. */
  getRequestQueueName() {
    return `${this.prefix}_queue`;
  }

  /** @returns {string} Root URL of the site being visited. */
  getPageRoot() {
    return 'https://ifconfig.co';
  }

  // This is the entry
  /**
   * Reads the text of `#output`, logs it as the proxy IP, then waits.
   * `text`, `debug` and `sleep` are inherited helpers not shown here;
   * presumably `sleep(4000)` pauses for 4000 ms - TODO confirm.
   */
  async visitPage() {
    const ip = await this.text({ css: "#output" });
    this.debug("Proxy IP is", ip);
    await this.sleep(4000);
  }

  /**
   * @returns {Promise<object>} Crawler options: retry and concurrency limits,
   *   session pool settings and the proxy configuration.
   */
  async getCrawlerOptions() {
    return {
      maxRequestRetries: 3,
      maxConcurrency: 2,
      useSessionPool: true,
      sessionPoolOptions: {
        maxPoolSize: 25,
        sessionOptions: {
          maxUsageCount: 150,
          // Sessions expire after 23 minutes.
          maxAgeSecs: 23*60, // IPs rotate after 30 minutes
        },
        persistStateKeyValueStoreId: `${this.prefix}_V${this.version}_sessions`,
        persistStateKey: `${this.prefix}_V${this.version}_my-session-pool`,
      },
      proxyConfiguration: new ProxyConfiguration({ proxyUrls: SMART_PROXY_DATACENTER_IPS }),
    };
  }
}
import { ProxyConfiguration } from 'crawlee';
import { SMART_PROXY_DATACENTER_IPS } from '../utils/proxies.js';
import ApplicationRouter from './ApplicationRouter.js';

/**
 * Router that visits ifconfig.co a few times and logs the text of the
 * `#output` element, using the proxy list from SMART_PROXY_DATACENTER_IPS.
 *
 * NOTE(review): when and how these methods are invoked is decided by
 * ApplicationRouter, which is not visible here.
 */
export default class TestProxies extends ApplicationRouter {
  /** Sets the version, name prefix and dataset name used by this router. */
  async setup() {
    this.version = 1;
    this.prefix = 'TestProxies';
    this.datasetName = `${this.prefix}_dataset_V${this.version}`;
  }

  /**
   * @returns {Promise<Array<{url: string, label: string}>>} The start requests.
   *   The URLs differ only in the `a` query parameter.
   */
  async getInitialPages() {
    return [
      { url: "https://ifconfig.co/?a=1", label: "page" },
      { url: "https://ifconfig.co/?a=2", label: "page" },
      { url: "https://ifconfig.co/?a=3", label: "page" },
      { url: "https://ifconfig.co/?a=4", label: "page" },
    ];
  }

  /** @returns {string} Name of the request queue, derived from the prefix. */
  getRequestQueueName() {
    return `${this.prefix}_queue`;
  }

  /** @returns {string} Root URL of the site being visited. */
  getPageRoot() {
    return 'https://ifconfig.co';
  }

  // This is the entry
  /**
   * Reads the text of `#output`, logs it as the proxy IP, then waits.
   * `text`, `debug` and `sleep` are inherited helpers not shown here;
   * presumably `sleep(4000)` pauses for 4000 ms - TODO confirm.
   */
  async visitPage() {
    const ip = await this.text({ css: "#output" });
    this.debug("Proxy IP is", ip);
    await this.sleep(4000);
  }

  /**
   * @returns {Promise<object>} Crawler options: retry and concurrency limits,
   *   session pool settings and the proxy configuration.
   */
  async getCrawlerOptions() {
    return {
      maxRequestRetries: 3,
      maxConcurrency: 2,
      useSessionPool: true,
      sessionPoolOptions: {
        maxPoolSize: 25,
        sessionOptions: {
          maxUsageCount: 150,
          // Sessions expire after 23 minutes.
          maxAgeSecs: 23*60, // IPs rotate after 30 minutes
        },
        persistStateKeyValueStoreId: `${this.prefix}_V${this.version}_sessions`,
        persistStateKey: `${this.prefix}_V${this.version}_my-session-pool`,
      },
      proxyConfiguration: new ProxyConfiguration({ proxyUrls: SMART_PROXY_DATACENTER_IPS }),
    };
  }
}
apt-get install
apt-get install