I'm getting `Page.goto: Timeout 30000ms exceeded` with the following setup:

```python
from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
from crawlee.playwright_crawler import PlaywrightCrawler

user_plugin = PlaywrightBrowserPlugin(browser_options={"timeout": 60000})
browser_pool = BrowserPool(plugins=[user_plugin])
crawler = PlaywrightCrawler(browser_pool=browser_pool)
```
```
[crawlee.playwright_crawler._playwright_crawler] ERROR Request failed and reached maximum retries
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/site-packages/crawlee/basic_crawler/_context_pipeline.py", line 65, in __call__
    result = await middleware_instance.__anext__()
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/crawlee/playwright_crawler/_playwright_crawler.py", line 260, in _handle_blocked_request
    selector for selector in RETRY_CSS_SELECTORS if (await context.page.query_selector(selector))
                                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/playwright/async_api/_generated.py", line 8064, in query_selector
    await self._impl_obj.query_selector(selector=selector, strict=strict)
  File "/usr/local/lib/python3.12/site-packages/playwright/_impl/_page.py", line 414, in query_selector
    return await self._main_frame.query_selector(selector, strict)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/playwright/_impl/_frame.py", line 304, in query_selector
    await self._channel.send("querySelector", locals_to_params(locals()))
  File "/usr/local/lib/python3.12/site-packages/playwright/_impl/_connection.py", line 59, in send
    return await self._connection.wrap_api_call(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/playwright/_impl/_connection.py", line 520, in wrap_api_call
    raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None
playwright._impl._errors.Error: Page.query_selector: Execution context was destroyed, most likely because of a navigation
[crawlee._autoscaling.autoscaled_pool] INFO Waiting for remaining tasks to finish
[crawlee.playwright_crawler._playwright_crawler] INFO Error analysis: total_errors=3 unique_errors=1
```
The full Actor code:

```python
from apify import Actor
from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    async with Actor:
        # Retrieve the Actor input, and use default values if not provided.
        actor_input = await Actor.get_input() or {}
        start_urls = [
            url.get('url')
            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        ]
        proxy_configuration = ProxyConfiguration(proxy_urls=[
            'http://xxx:xxx@xxx:xxxx',
        ])

        # Exit if no start URLs are provided.
        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        user_plugin = PlaywrightBrowserPlugin(browser_options={"timeout": 60000})
        browser_pool = BrowserPool(plugins=[user_plugin])

        # Create a crawler.
        crawler = PlaywrightCrawler(
            max_requests_per_crawl=50,
            proxy_configuration=proxy_configuration,
            browser_pool=browser_pool,
        )

        # Define a request handler, which will be called for every request.
        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            Actor.log.info(f'Scraping {context.request.url}...')

        # Run the crawler with the starting requests.
        await crawler.run(start_urls)
```
Try setting `request_handler_timeout` higher, as its default value is 60 seconds. Maybe the problem occurs when there is an interaction with an element and the handler is closed by the timeout.

I tried that, raising both timeouts:

```python
user_plugin = PlaywrightBrowserPlugin(browser_options={"timeout": 600000, 'headless': False})
```

```python
request_handler_timeout=timedelta(minutes=100)
```
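For reference, `request_handler_timeout` is a keyword argument of the crawler's constructor and takes a `datetime.timedelta`; a minimal sketch of where it goes (the value is illustrative):

```python
from datetime import timedelta

from crawlee.playwright_crawler import PlaywrightCrawler

# The request handler timeout bounds how long a single handler
# invocation may run; it defaults to 60 seconds.
crawler = PlaywrightCrawler(
    request_handler_timeout=timedelta(minutes=100),
)
```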
Even with both timeouts raised, `Page.goto` still fails after 30 seconds:

```
    raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None
playwright._impl._errors.TimeoutError: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://apify.com/", waiting until "load"
```
It seems the `timeout` in `browser_options` only affects the opening of the browser, not the page navigation. I ended up setting the navigation timeout in a pre-navigation hook instead:

```python
@crawler.pre_navigation_hook
async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
    context.log.info(f'Navigating to {context.request.url} ...')
    context.page.set_default_navigation_timeout(60000)
```
Yes, a `pre_navigation_hook` is currently the only way available to change the navigation timeout.
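Putting the thread's pieces together, a minimal end-to-end sketch (assuming the crawlee Python API used in the snippets above; import paths may differ between crawlee versions, and the hook name, handler body, and timeout values are illustrative):

```python
import asyncio
from datetime import timedelta

from crawlee.playwright_crawler import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPreNavigationContext,
)


async def main() -> None:
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=50,
        # Bounds the total time one request handler invocation may run.
        request_handler_timeout=timedelta(minutes=5),
    )

    # browser_options={"timeout": ...} only applies to the browser launch,
    # so the navigation timeout has to be raised per page, before navigating.
    @crawler.pre_navigation_hook
    async def raise_navigation_timeout(context: PlaywrightPreNavigationContext) -> None:
        context.page.set_default_navigation_timeout(60_000)  # milliseconds

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Scraping {context.request.url}...')

    await crawler.run(['https://apify.com'])


if __name__ == '__main__':
    asyncio.run(main())
```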