```js
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";

const startUrls = ["http://quotes.toscrape.com/js/"];

const crawler = new PlaywrightCrawler({
    requestHandler: async ({ page, parseWithCheerio }) => {
        // Wait for the JS-rendered quotes to appear before parsing.
        await page.waitForSelector("div.quote span.text", { timeout: 60000 });
        const $ = await parseWithCheerio();
        const quotes = $("div.quote span.text");
        quotes.each((_, element) => {
            console.log($(element).text());
        });
    },
});

await crawler.run(startUrls);
```
```js
const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: ["url-to-proxy-port-im-using"],
});

// ...and then add it to the crawler:
const crawler = new PlaywrightCrawler({
    proxyConfiguration,
    ...
```
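Put together, the wiring looks like this (a minimal sketch; the proxy URL is a placeholder and the handler just reuses the quotes example above):

```js
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";

// Placeholder URL; substitute the real proxy endpoint and credentials.
const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: ["http://user:pass@proxy.example.com:8000"],
});

const crawler = new PlaywrightCrawler({
    proxyConfiguration,
    requestHandler: async ({ page, parseWithCheerio }) => {
        await page.waitForSelector("div.quote span.text", { timeout: 60000 });
        const $ = await parseWithCheerio();
        $("div.quote span.text").each((_, el) => console.log($(el).text()));
    },
});

await crawler.run(["http://quotes.toscrape.com/js/"]);
```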
`enqueueLinks()`
`.enqueueLinks()`
`.pushData()`
Otherwise, if it's another page, repeat from 3.
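For what it's worth, that enqueue-and-push flow maps onto Crawlee roughly like this (a minimal sketch; the selectors and the `DETAIL` label are placeholders, not from the original code):

```js
import { PlaywrightCrawler, Dataset } from "crawlee";

const crawler = new PlaywrightCrawler({
    maxRequestsPerCrawl: 1000,
    requestHandler: async ({ request, page, enqueueLinks }) => {
        if (request.label === "DETAIL") {
            // Terminal page: extract and store the data.
            await Dataset.pushData({
                url: request.url,
                title: await page.title(),
            });
        } else {
            // Listing page: enqueue detail links, then the next page, and repeat.
            await enqueueLinks({ selector: "a.detail", label: "DETAIL" });
            await enqueueLinks({ selector: "a.next" });
        }
    },
});

await crawler.run(["https://example.com/"]);
```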
Setting `maxRequestsPerCrawl: 1000` didn't help; the run still finishes with `"requestsFinished": 119`. A number that doesn't make sense at all: less than the number of actually enqueued links, but a lot more than the number of actually processed pages.

```
apify secrets:add aPrivateKey '-----BEGIN PRIVATE KEY-----_base64content_-----END PRIVATE KEY-----'
```

Thanks for the help.
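For reference, once a secret is added it can be referenced from the Actor's environment variables using the `@` prefix; a sketch of the relevant part of `.actor/actor.json` (the environment variable name here is my own choice):

```json
{
    "actorSpecification": 1,
    "name": "my-actor",
    "version": "0.1",
    "environmentVariables": {
        "A_PRIVATE_KEY": "@aPrivateKey"
    }
}
```

The value is then available in the Actor's code as `process.env.A_PRIVATE_KEY`.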
```json
{
    "service": "AutoscaledPool",
    "time": "2024-10-30T16:42:17.049Z",
    "id": "cae4950d568a4b8bac375ffa5a40333c",
    "jobId": "9afee408-42bf-4194-b17c-9864db707e5c",
    "currentConcurrency": "4",
    "desiredConcurrency": "5",
    "systemStatus": "{\"isSystemIdle\":true,\"memInfo\":{\"isOverloaded\":false,\"limitRatio\":0.2,\"actualRatio\":0},\"eventLoopInfo\":{\"isOverloaded\":false,\"limitRatio\":0.6,\"actualRatio\":0},\"cpuInfo\":{\"isOverloaded\":false,\"limitRatio\":0.4,\"actualRatio\":0},\"clientInfo\":{\"isOverloaded\":false,\"limitRatio\":0.3,\"actualRatio\":0}}"
}
```
{ "rejection": "true", "date": "Wed Oct 30 2024 16:42:38 GMT+0000 (Coordinated Universal Time)", "process": "{\"pid\":1,\"uid\":997,\"gid\":997,\"cwd\":\"/home/myuser\",\"execPath\":\"/usr/local/bin/node\",\"version\":\"v22.9.0\",\"argv\":[\"/usr/local/bin/node\",\"/home/myuser/FIDO-Scraper-Discovery\"],\"memoryUsage\":{\"rss\":337043456,\"heapTotal\":204886016,\"heapUsed\":168177928,\"external\":30148440,\"arrayBuffers\":14949780}}", "os": "{\"loadavg\":[3.08,3.38,3.68],\"uptime\":312222.44}", "stack": "response.headerValue: Target page, context or browser has been closed\n at Page.<anonymous> (/home/myuser/FIDO-Scraper-Discovery/dist/articleImagesPreNavHook.js:15:60)" }
```python
import asyncio
import json

from crawlee import Request
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        response = context.http_response.read().decode('utf-8')
        context.log.info(f'Response: {response}')  # To see the response in the logs.

    # Prepare a POST request to the form endpoint.
    request = Request.from_url(
        url='https://httpbin.org/post',
        method='POST',
        payload=json.dumps(
            {
                'custname': 'John Doe',
            }
        ).encode(),
    )

    # Run the crawler with the initial list of requests.
    await crawler.run([request])


if __name__ == '__main__':
    asyncio.run(main())
```
```python
@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
    # This is a fallback route which will handle the start URL.
    context.log.info(f'default_handler is processing {context.request.url}')
    await context.page.wait_for_selector('.collection-block-item')
    await context.enqueue_links(
        selector='.collection-block-item',
        label='CATEGORY',
    )
```
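Links labelled `CATEGORY` then need a matching handler on the same router; a minimal sketch of one (the `.product-item > a` selector and `DETAIL` label are assumptions following the same pattern):

```python
@router.handler('CATEGORY')
async def category_handler(context: PlaywrightCrawlingContext) -> None:
    # Handles every request enqueued above with label='CATEGORY'.
    context.log.info(f'category_handler is processing {context.request.url}')
    await context.page.wait_for_selector('.product-item > a')
    await context.enqueue_links(
        selector='.product-item > a',
        label='DETAIL',
    )
```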
```
ArgumentError: Expected property `maxRequestsPerCrawl` to be of type `number` but received type `string` in object `PlaywrightCrawlerOptions`
```
```js
maxRequestsPerCrawl: process.env.ACTOR_MAX_PAID_DATASET_ITEMS || input.limit,
```

```js
typeof crawler.options.maxRequestsPerCrawl // 'number'
```
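`process.env` values are always strings in Node.js, so whenever `ACTOR_MAX_PAID_DATASET_ITEMS` is set, the `||` expression hands the crawler a string, which is exactly what the `ArgumentError` complains about. Coercing it first should fix it (a sketch keeping the same fallback):

```js
// Env vars are always strings; Number(undefined) is NaN, which is falsy,
// so the input.limit fallback still applies when the variable is unset.
maxRequestsPerCrawl: Number(process.env.ACTOR_MAX_PAID_DATASET_ITEMS) || input.limit,
```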
```
Reclaiming failed request back to the list or queue. Redirected 10 times. Aborting.
```
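Redirect loops won't resolve themselves on retry, so they can at least be contained so they don't burn through retries; one option, sketched with Crawlee's standard `maxRequestRetries` and `failedRequestHandler` options:

```js
const crawler = new PlaywrightCrawler({
    maxRequestRetries: 1, // fail fast on requests that keep redirecting
    failedRequestHandler: async ({ request }, error) => {
        // Called once retries are exhausted: log and move on.
        console.warn(`Giving up on ${request.url}: ${error.message}`);
    },
    requestHandler: async ({ page }) => {
        // ...
    },
});
```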
```js
import { Dataset, PlaywrightCrawler, ProxyConfiguration } from "crawlee";

const crawler = new PlaywrightCrawler({
    // proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['...'] }),
    requestHandler: async (ctx) => {
        if (ctx.request.url.includes("url1")) {
            await url1Router(ctx);
        }
        if (ctx.request.url.includes("url2")) {
            await url2Router(ctx);
        }
        if (ctx.request.url.includes("url3")) {
            await url3Router(ctx);
        }
        await Dataset.exportToJSON("data.json");
    },
    // Comment this option to scrape the full website.
    // maxRequestsPerCrawl: 20,
});
```
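One detail worth flagging: `Dataset.exportToJSON("data.json")` runs inside the `requestHandler`, so the export is rewritten after every single request. Moving it after the crawl writes it once (a sketch; `startUrls` stands in for whatever start list is actually used):

```js
await crawler.run(startUrls);

// Export the accumulated dataset once, after the crawl has finished.
await Dataset.exportToJSON("data.json");
```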