Apify and Crawlee Official Forum

Members
Adi
Offline, last seen last month
Joined August 30, 2024
I am trying to upload a screenshot to AWS while also using a proxy server. Will the upload process consume any bandwidth of the proxy servers?
3 comments
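A minimal sketch, assuming the screenshot is taken with Puppeteer and uploaded with the AWS SDK v3: the --proxy-server flag only routes the browser's own traffic, while an upload made from the Node.js process connects to S3 directly, so it should not consume proxy bandwidth. Bucket name, region and key below are placeholders.
Plain Text
const { S3Client, PutObjectCommand } = require('@aws-sdk/client-s3');

async function uploadScreenshot(page) {
    // The screenshot is captured through the browser (and therefore through the proxy)...
    const buffer = await page.screenshot({ fullPage: true });

    // ...but the upload itself is a direct connection from Node.js to S3.
    const s3 = new S3Client({ region: 'us-east-1' }); // placeholder region
    await s3.send(new PutObjectCommand({
        Bucket: 'my-bucket',                          // placeholder bucket
        Key: `screenshots/${Date.now()}.png`,
        Body: buffer,
        ContentType: 'image/png',
    }));
}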
I am trying to run Puppeteer with the proxy-chain package, but I am getting this error message:
Plain Text
"errorType": "Error",
  "errorMessage": "Protocol error (Target.createTarget): Target closed.",

Code:
Plain Text
const chromium = require('chrome-aws-lambda');
const { addExtra } = require("puppeteer-extra");
const puppeteerExtra = addExtra(chromium.puppeteer);
const proxyChain = require('proxy-chain');

const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteerExtra.use(StealthPlugin());

exports.handler = async (event, context, callback) => {
    let finalResult = [];
    const url = ``;
    let browser;
    const oldProxyUrl = ''; // --> bright data proxy
    const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);

    console.log("newProxyUrl", newProxyUrl)

    try {
        browser = await puppeteerExtra.launch({
            args: ['--no-sandbox', '--disable-setuid-sandbox', `--proxy-server=${newProxyUrl}`],
            defaultViewport: chromium.defaultViewport,
            executablePath: await chromium.executablePath,
            headless: chromium.headless
        });

        const page = await browser.newPage();

        await page.goto(url);

        finalResult = await extractElements(page);

    } catch (error) {
        return callback(error);
    } finally {
        // Close the browser (if it launched) and the local proxy-chain server.
        if (browser) await browser.close();
        await proxyChain.closeAnonymizedProxy(newProxyUrl, true);
    }

    return callback(null, finalResult);
};
1 comment
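A minimal sketch of the launch call from the code above, assuming chrome-aws-lambda: on Lambda it usually helps to keep the package's default flags (chromium.args) and only append the proxy flag, rather than replacing the defaults entirely.
Plain Text
// Sketch only: reuse chrome-aws-lambda's default args and append the proxy flag.
browser = await puppeteerExtra.launch({
    args: [
        ...chromium.args,
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${newProxyUrl}`,
    ],
    defaultViewport: chromium.defaultViewport,
    executablePath: await chromium.executablePath,
    headless: chromium.headless,
});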
For PuppeteerCrawler, should I use the header-generator package https://www.npmjs.com/package/header-generator for adding a user agent
https://crawlee.dev/api/browser-crawler/interface/BrowserLaunchContext#userAgent, or does it handle this automatically based on the fingerprint options?
2 comments
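For comparison, a minimal sketch of using header-generator directly; with PuppeteerCrawler's fingerprinting enabled, the browser pool already injects a matching user agent, so this manual step should normally not be needed. The option values below are just examples.
Plain Text
const { HeaderGenerator } = require('header-generator');

// Generate a consistent set of headers for a desktop Chrome on Windows.
const headerGenerator = new HeaderGenerator({
    browsers: [{ name: 'chrome', minVersion: 100 }],
    devices: ['desktop'],
    operatingSystems: ['windows'],
    locales: ['en-US', 'en'],
});

const headers = headerGenerator.getHeaders();
// The generated user agent could be passed to launchContext.userAgent if needed.
console.log(headers['user-agent']);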
I am running this code on a Linux VPS:
Plain Text
const puppeteer = require('puppeteer');
const proxyChain = require('proxy-chain');

(async () => {
    const oldProxyUrl = 'http://auto:apify_proxy_XXXXXXX@proxy.apify.com:8000';
    const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);

    console.log(newProxyUrl);

    const browser = await puppeteer.launch({
        args: [`--proxy-server=${newProxyUrl}`],
    });

    // Do your magic here...
    const page = await browser.newPage();
    await page.goto('https://www.example.com');

    // Clean up so the script can exit: close the browser and the local proxy server.
    await browser.close();
    await proxyChain.closeAnonymizedProxy(newProxyUrl, true);
})();
But I am getting this error: Error: net::ERR_TUNNEL_CONNECTION_FAILED at https://www.google.com/
4 comments
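One way to narrow this down, as a sketch: ERR_TUNNEL_CONNECTION_FAILED generally means the upstream proxy rejected the CONNECT (wrong password, wrong proxy group, or a blocked target), so it is worth verifying the Apify proxy URL outside the browser first. The import style below assumes https-proxy-agent v7, and the test URL is just one example of an HTTPS endpoint to hit through the proxy.
Plain Text
const https = require('https');
const { HttpsProxyAgent } = require('https-proxy-agent'); // v7-style named export

// Same proxy URL as the one passed to proxy-chain (password redacted as in the post).
const agent = new HttpsProxyAgent('http://auto:apify_proxy_XXXXXXX@proxy.apify.com:8000');

https.get('https://api.apify.com/v2/browser-info', { agent }, (res) => {
    console.log('Proxy tunnel works, status:', res.statusCode);
    res.resume();
}).on('error', (err) => {
    console.error('Proxy check failed:', err.message);
});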
Can I make use of the proxy-chain npm package with AWS Lambda?
https://www.npmjs.com/package/proxy-chain
2 comments
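A minimal sketch of what that could look like: proxy-chain only starts a small forwarding server on 127.0.0.1 inside the same process, so nothing in principle prevents it from running in a Lambda handler, as long as the local server is closed before the handler returns. The upstream proxy URL below is a placeholder.
Plain Text
const proxyChain = require('proxy-chain');

exports.handler = async () => {
    // Start the local forwarding proxy inside the Lambda process.
    const newProxyUrl = await proxyChain.anonymizeProxy('http://user:pass@upstream-proxy:8000');
    try {
        // Launch the browser here with `--proxy-server=${newProxyUrl}`.
        return { proxy: newProxyUrl };
    } finally {
        // Shut the local server down so the Lambda can exit cleanly.
        await proxyChain.closeAnonymizedProxy(newProxyUrl, true);
    }
};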
Hi, I have observed that Google search results for mobile devices are different from desktop results. In that case, can I somehow use a mobile proxy to get mobile-view results from Google search? Does this package https://www.npmjs.com/package/proxy-chain support mobile proxies?
1 comment
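As a sketch: proxy-chain just forwards whatever upstream proxy URL it is given (mobile or not), and the mobile vs. desktop layout of Google results is driven mainly by the user agent and viewport, so emulating a phone in Puppeteer is usually the relevant part. The device name below is just an example.
Plain Text
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();

    // Older Puppeteer versions expose this list as `puppeteer.devices`;
    // newer ones call it `KnownDevices`.
    const iPhone = puppeteer.devices['iPhone X'];
    await page.emulate(iPhone); // sets mobile user agent + viewport

    await page.goto('https://www.google.com/search?q=example');
    await page.screenshot({ path: 'mobile-serp.png' });
    await browser.close();
})();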
I am using the Google Search Results Scraper, but can I start from page 10 or page 100 if I have already scraped the first 10 or 100 pages? The total search results are 200k. How can I extract results in batches, i.e. extract data every 100 pages, or do I need to extract all 200k results in one shot?
3 comments
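A minimal sketch of one batching approach, assuming the run is driven through apify-client: Google search URLs accept a start offset, so the work can be split into batches of page URLs instead of one 200k-result run. The input field names (queries, resultsPerPage) are assumptions here; check the actor's input schema before relying on them.
Plain Text
const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });
const RESULTS_PER_PAGE = 100;

// Build one batch of search-page URLs, e.g. pages 11..20 => start=1000..1900.
function buildBatch(firstPage, pagesPerBatch, query) {
    const urls = [];
    for (let p = firstPage; p < firstPage + pagesPerBatch; p++) {
        const start = (p - 1) * RESULTS_PER_PAGE;
        urls.push(`https://www.google.com/search?q=${encodeURIComponent(query)}&num=${RESULTS_PER_PAGE}&start=${start}`);
    }
    return urls.join('\n');
}

(async () => {
    // Assumed input fields: `queries` (one query/URL per line) and `resultsPerPage`.
    const run = await client.actor('apify/google-search-scraper').call({
        queries: buildBatch(11, 10, 'example query'),
        resultsPerPage: RESULTS_PER_PAGE,
    });
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.log(`Batch finished with ${items.length} items`);
})();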
I am trying to run Crawlee on AWS Lambda, but I am getting this error message: Reclaiming failed request back to the list or queue. Protocol error (Target.setAutoAttach): Target closed.
Chromium version: 109
Node version: 16
Code:
Plain Text
// Assumed imports, based on the chromium.executablePath() call and "Chromium version: 109":
const { PuppeteerCrawler, log } = require('crawlee');
const puppeteer = require('puppeteer-core');
const chromium = require('@sparticuz/chromium');

exports.handler = async (event, context, callback) => {
    const finalResult = [];
    const url = ``;

    try {
        const crawler = new PuppeteerCrawler({
            launchContext: {
                useIncognitoPages: true,
                launchOptions: {
                    executablePath: await chromium.executablePath(),
                    args: ['--no-sandbox', '--disable-setuid-sandbox']
                },
                launcher: puppeteer
            },
            useSessionPool: true,
            requestHandlerTimeoutSecs: 60, 
            browserPoolOptions: {
                useFingerprints: true,
                fingerprintOptions: {
                    fingerprintGeneratorOptions: {
                        browsers: ['chrome'],
                        operatingSystems: ['windows'],
                        devices: ['desktop'],
                        locales: ['en-US', 'en']
                    },
                },
            },
            headless: true,

            async requestHandler({ request, page, enqueueLinks }) {
                log.info(`Processing ${request.url}...`);

            },

            // This function is called if the page processing failed more than maxRequestRetries+1 times.
            failedRequestHandler({ request }) {
                log.error(`Request ${request.url} failed too many times.`);
            },
        });

        // Run the crawler and wait for it to finish.
        await crawler.run([url]);
        log.info('Crawler finished.');

    } catch (error) {
        return callback(error);
    } finally {

    }
    return callback(null, finalResult);
};
6 comments
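A minimal sketch of a Lambda-friendly setup, assuming @sparticuz/chromium (based on "Chromium version: 109"): the two things that commonly matter are keeping the package's default launch flags instead of replacing them, and keeping Crawlee's storage out of Lambda's read-only filesystem.
Plain Text
const { PuppeteerCrawler, Configuration, log } = require('crawlee');
const puppeteer = require('puppeteer-core');
const chromium = require('@sparticuz/chromium');

exports.handler = async () => {
    // Keep request queue / dataset state in memory; Lambda's filesystem is
    // read-only outside /tmp, which breaks Crawlee's default disk storage.
    const config = new Configuration({ persistStorage: false });

    const crawler = new PuppeteerCrawler({
        launchContext: {
            launcher: puppeteer,
            launchOptions: {
                executablePath: await chromium.executablePath(),
                // Keep the package's default flags and only append extras.
                args: [...chromium.args, '--no-sandbox', '--disable-setuid-sandbox'],
                headless: chromium.headless,
            },
        },
        async requestHandler({ request }) {
            log.info(`Processing ${request.url}...`);
        },
    }, config);

    await crawler.run(['https://example.com']);
    return 'done';
};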