```python
# Set the cookie for the session from a pre-navigation hook
# if it is not already present.
@crawler.pre_navigation_hook
async def hook1(context: HttpCrawlingContext) -> None:
    if context.request.label and 'basic' not in context.session.cookies:
        await context.send_request('https://httpbin.org/cookies/set/basic/100')
```

```python
# Alternatively, pass the session cookies to the next request through
# `user_data` and apply them in a pre-navigation hook.
@crawler.router.default_handler
async def handler_one(context: HttpCrawlingContext) -> None:
    session_cookie = context.session.cookies
    request = Request.from_url(
        url='https://httpbin.org/cookies/set/d/10',
        label='label_two',
        user_data={'session_cookie': session_cookie},
    )
    await context.add_requests([request])


@crawler.pre_navigation_hook
async def hook1(context: HttpCrawlingContext) -> None:
    if context.request.label:
        context.session.cookies.update(context.request.user_data['session_cookie'])
```

```python
# Or keep a single long-lived session so that all requests share its cookies.
from crawlee.sessions import SessionPool

crawler = HttpCrawler(
    session_pool=SessionPool(
        max_pool_size=1,
        create_session_settings={
            'max_usage_count': float('inf'),
        },
    )
)
```
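These fragments assume the usual script scaffolding around a `crawler` instance. As a point of reference, a minimal runnable skeleton for the single-session variant could look like the sketch below (the `main()` wrapper, handler, and target URL are illustrative, not part of the snippets above):

```python
import asyncio

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.sessions import SessionPool


async def main() -> None:
    # One session, reused indefinitely, so every request shares the same cookie jar.
    crawler = HttpCrawler(
        session_pool=SessionPool(
            max_pool_size=1,
            create_session_settings={'max_usage_count': float('inf')},
        )
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # httpbin echoes the cookies it received, which makes the sharing visible.
        context.log.info(f'Cookies: {context.http_response.read()}')

    await crawler.run([Request.from_url('https://httpbin.org/cookies')])


if __name__ == '__main__':
    asyncio.run(main())
```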
If you assign a `cookie` directly to a Request, it will overwrite any other cookies, so this approach works best when you want all requests to be made with the same cookie. With a `pre_navigation_hook` you have more control over what happens. If the `sessionid` cookie is responsible for this, you can cache it and pass it inside a `pre_navigation_hook` for all sessions that do not have a `sessionid`.
```python
import asyncio

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()
    _cache = {}

    @crawler.pre_navigation_hook
    async def hook(context: HttpCrawlingContext) -> None:
        # Reuse the cached sessionid for sessions that do not have it yet.
        if 'sessionid' not in context.session.cookies and 'sessionid' in _cache:
            context.session.cookies['sessionid'] = _cache['sessionid']

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}...')
        # Cache the sessionid the first time a session receives it.
        if 'sessionid' not in _cache and 'sessionid' in context.session.cookies:
            _cache['sessionid'] = context.session.cookies['sessionid']
        print(context.http_response.read())
        await context.add_requests([Request.from_url('https://httpbin.org/get')])

    await crawler.run([Request.from_url('https://httpbin.org/cookies/set/sessionid/1')])


if __name__ == '__main__':
    asyncio.run(main())
```
You can also use `use_state`, available since version 0.5.0:

```python
import asyncio

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.pre_navigation_hook
    async def hook(context: HttpCrawlingContext) -> None:
        _cache = await context.use_state()
        if 'sessionid' not in context.session.cookies and 'sessionid' in _cache:
            context.session.cookies['sessionid'] = _cache['sessionid']

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}...')
        _cache = await context.use_state()
        if 'sessionid' not in _cache and 'sessionid' in context.session.cookies:
            _cache['sessionid'] = context.session.cookies['sessionid']
        print(context.http_response.read())
        await context.add_requests([Request.from_url('https://httpbin.org/get')])

    await crawler.run([Request.from_url('https://httpbin.org/cookies/set/sessionid/1')])


if __name__ == '__main__':
    asyncio.run(main())
```
This way, the `sessionid` cookie will be in every session, and it doesn't matter when the session was created. The same approach works with `PlaywrightCrawler`:

```python
import asyncio

from crawlee import Request
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler()

    @crawler.pre_navigation_hook
    async def hook(context: PlaywrightCrawlingContext) -> None:
        _cache = await context.use_state()
        if 'sessionid' in _cache:
            await context.page.context.add_cookies([_cache['sessionid']])

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context_cookies = await context.page.context.cookies(context.request.url)
        _cache = await context.use_state()
        target_cookie = None
        for cookie in context_cookies:
            if cookie['name'] == 'sessionid':
                target_cookie = cookie
        if 'sessionid' not in _cache and target_cookie:
            _cache['sessionid'] = target_cookie
        print(await context.page.content())
        # Clear the cookies so the solution works even when the same
        # browser context is reused.
        await context.page.context.clear_cookies()
        await context.add_requests([Request.from_url('https://httpbin.org/get')])

    await crawler.run([Request.from_url('https://httpbin.org/cookies/set/sessionid/1')])


if __name__ == '__main__':
    asyncio.run(main())
```
I've been using `Camoufox` with `PlaywrightCrawler` for a while. `Chromium` probably won't work. If `Chromium` works for you, then yes, it is better than `Camoufox`, as it will use significantly fewer resources. You can also try Firefox for cases where `Chromium` does not work and `Camoufox` is excessive.
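For reference, `Camoufox` plugs into `PlaywrightCrawler` through a custom browser plugin. A minimal sketch, modeled on the Crawlee documentation example and assuming the `camoufox` package's `AsyncNewBrowser` helper (the plugin name and launch options here are illustrative):

```python
from camoufox import AsyncNewBrowser
from typing_extensions import override

from crawlee.browsers import BrowserPool, PlaywrightBrowserController, PlaywrightBrowserPlugin
from crawlee.crawlers import PlaywrightCrawler


class CamoufoxPlugin(PlaywrightBrowserPlugin):
    """Browser plugin that launches Camoufox instead of the stock Playwright browsers."""

    @override
    async def new_browser(self) -> PlaywrightBrowserController:
        if not self._playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')
        return PlaywrightBrowserController(
            browser=await AsyncNewBrowser(self._playwright, headless=True),
            # Camoufox generates its own fingerprints and headers,
            # so Crawlee's header generation is disabled here.
            header_generator=None,
        )


crawler = PlaywrightCrawler(browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]))
```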