@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
    """Fallback route that handles the start URL and enqueues category links."""
    # This is a fallback route which will handle the start URL.
    context.log.info(f'default_handler is processing {context.request.url}')
    # Wait until the category cards have rendered before collecting their links.
    await context.page.wait_for_selector('.collection-block-item')
    # Enqueue every matched link; the CATEGORY label routes them to a
    # dedicated handler elsewhere in the router.
    await context.enqueue_links(
        selector='.collection-block-item',
        label='CATEGORY',
    )
import asyncio

# Import the plugin from the public `crawlee.browsers` package rather than the
# private `_playwright_browser_plugin` module, which is an internal detail and
# may move between releases.
from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Run a crawler whose browser sends a static auth cookie on every request."""
    # Every page created by this plugin carries the extra HTTP headers.
    plugin = PlaywrightBrowserPlugin(
        page_options={"extra_http_headers": {"cookie": "auth=to_rule_over_everyone"}}
    )
    pool = BrowserPool(plugins=[plugin])

    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10,  # safety cap while experimenting
        browser_pool=pool,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Log the URL being processed and dump the rendered page content.
        context.log.info(f'Processing {context.request.url} ...')
        content = await context.page.content()
        print(content)

    await crawler.run(['https://httpbin.org/get'])


if __name__ == '__main__':
    asyncio.run(main())
import asyncio

from crawlee import Request

# Import the plugin from the public `crawlee.browsers` package rather than the
# private `_playwright_browser_plugin` module, which is an internal detail and
# may move between releases.
from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Demonstrate mutating a shared headers dict from inside a request handler."""
    # Shared mutable mapping; the plugin holds a reference to this exact dict.
    user_headers = {}
    user_plugin = PlaywrightBrowserPlugin(page_options={"extra_http_headers": user_headers})
    pool = BrowserPool(plugins=[user_plugin])

    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10,  # safety cap while experimenting
        browser_pool=pool,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        content = await context.page.content()
        print(content)
        # Mutate the shared dict mid-crawl; only pages created after this
        # point can pick up the new header.
        user_headers["cookie"] = "auth=to_rule_over_everyone"
        await context.add_requests([Request.from_url(
            "https://httpbin.org/get?page=2"
        )])

    await crawler.run(['https://httpbin.org/get'])


if __name__ == '__main__':
    asyncio.run(main())
# Create a browser pool with a Playwright browser plugin pool = BrowserPool( plugins=[ PlaywrightBrowserPlugin( browser_type='chromium', browser_options={'headless': False}, page_options={ 'extra_http_headers': { 'Custom-Header': 'Value' } } ) ] )
import asyncio

from crawlee.fingerprint_suite import HeaderGenerator
from crawlee._types import HttpHeaders
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


# Replacement for HeaderGenerator.get_common_headers that injects an auth cookie
# alongside the usual browser-like headers.
# NOTE(review): this monkey-patches a library class, affecting every
# HeaderGenerator instance process-wide; it may silently break if crawlee
# changes this method's name or signature — confirm against the installed version.
def get_common_headers(self):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'cookie': "auth=to_rule_over_everyone"
    }
    return HttpHeaders(headers)


# Apply the patch before any crawler is constructed so all generated
# fingerprints include the custom headers.
HeaderGenerator.get_common_headers = get_common_headers


async def main() -> None:
    """Run a crawler that relies on the patched header generator."""
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        content = await context.page.content()
        print(content)

    await crawler.run(['https://httpbin.org/get'])


if __name__ == '__main__':
    asyncio.run(main())
import asyncio

from crawlee.browsers import BrowserPool
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


class CustomBrowserPool(BrowserPool):
    """Browser pool that injects an auth cookie header into every new page."""

    async def new_page(self, *args, **kwargs):
        # Let the base pool create the page, then attach the cookie header
        # so all subsequent requests from this page carry it.
        crawlee_page = await super().new_page(*args, **kwargs)
        await crawlee_page.page.set_extra_http_headers({'cookie': "auth=to_rule_over_everyone"})
        return crawlee_page


async def main() -> None:
    """Crawl a single URL using the header-injecting pool."""
    header_pool = CustomBrowserPool()
    crawler = PlaywrightCrawler(
        browser_pool=header_pool,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        print(await context.page.content())

    await crawler.run(['https://httpbin.org/get'])


if __name__ == '__main__':
    asyncio.run(main())
# Browser pool wired with a single Playwright plugin.
pool = BrowserPool(
    plugins=[
        PlaywrightBrowserPlugin(
            # Which browser engine to launch.
            browser_type='chromium',
            # Launch options; headless=False shows the browser window.
            browser_options={'headless': False},
            # Headers sent with every request made by pages from this plugin.
            page_options={
                'extra_http_headers': {
                    'Custom-Header': 'Value'
                }
            }
        )
    ]
)
@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
    # pseudo-code: shows that mutating a plugin's page_options after the
    # pool exists only affects pages created afterwards.
    pool = BrowserPool()
    # Fixed: the original had an unterminated string literal ('New-Value
    # without a closing quote), which is a SyntaxError.
    pool.plugins[0].page_options['extra_http_headers'] = {
        'Custom-Header': 'New-Value'
    }
import asyncio

from crawlee import Request
from crawlee.browsers import BrowserPool

# Fixed: the module is `crawlee.playwright_crawler` (with underscore);
# `crawlee.playwrightcrawler` does not exist and raised ImportError.
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext

# Shared mutable mapping: the request handler adds headers here and the pool
# applies the current contents to every page created afterwards.
custom_headers = {}


class CustomBrowserPool(BrowserPool):
    """Browser pool that applies the current custom_headers to each new page."""

    # Fixed: the override must accept *args (the original `args, **kwargs`
    # added a required positional parameter and forwarded it as a single
    # tuple, breaking the base-class signature).
    async def new_page(self, *args, **kwargs):
        page = await super().new_page(*args, **kwargs)
        await page.page.set_extra_http_headers(custom_headers)
        return page


async def main() -> None:
    """Crawl with dynamically growing custom headers."""
    crawler = PlaywrightCrawler(
        browser_pool=CustomBrowserPool(),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        content = await context.page.content()
        # Headers set here take effect for pages created after this point.
        custom_headers['cookie'] = "auth=to_rule_over_everyone"
        print(content)
        await context.add_requests([Request.from_url(
            "https://httpbin.org/get?page=2"
        )])

    await crawler.run(['https://httpbin.org/get'])


# Fixed: the original guard was `if __name == '__main'` — both the dunder
# variable and the string were misspelled, so main() never ran.
if __name__ == '__main__':
    asyncio.run(main())
@crawler.pre_navigation_hook
async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
    """Apply the shared custom headers to the page before each navigation."""
    # NOTE(review): custom_headers is defined elsewhere in the file; assumes it
    # is a plain dict mapping header names to string values — confirm.
    await context.page.set_extra_http_headers(custom_headers)
    context.log.info(f'Navigating to {context.request.url} ...')