Apify and Crawlee Official Forum

How to store data in the same dict from different URLs?

I have a list of results, and I enqueue the link for each item. For each item I need to crawl its internal pages (tabs), extract the data from the tables, and add it all to the same dict. With the router and enqueue_links I can extract the data from all the pages, but I am not able to gather it into one dict per item. What is the best way to do this?

from crawlee.playwright_crawler import PlaywrightCrawlingContext
from crawlee.router import Router

router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'default_handler is processing {context.request.url}')
    await context.page.wait_for_selector('li.resultado-busqueda > a')

    # One request per search result.
    await context.enqueue_links(
        selector='li.resultado-busqueda > a',
        label='ITEM',
    )

    next_button = await context.page.query_selector('.paginar2 > ul > li > a')

    if next_button:
        await context.enqueue_links(
            selector='.paginar2 > ul > li > a',
            label='RESULTS',
        )


@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')

    await context.page.wait_for_selector('#tabs > ul > li > a')

    # One request per tab of the item.
    await context.enqueue_links(
        selector='#tabs > ul > li > a',
        label='TAB',
    )


@router.handler('TAB')
async def tab_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'tab_handler is processing {context.request.url}')

    tables = await context.page.query_selector_all('table')

    data = {}

    for table in tables:
        rows = await table.query_selector_all('tr')

        for row in rows:
            key = await row.query_selector('th')
            value = await row.query_selector('td')

            if key and value:
                key_text = await key.text_content()
                value_text = await value.text_content()
                data[key_text.strip()] = value_text.strip()

    context.log.info(data)

    # This pushes a separate record per tab rather than one per item.
    await context.push_data(data)
Hi @Mantisus, thank you for your response. I checked the thread, but I'm still trying to work out how to gather all the data for a single item into the same dataset record. This is my attempt:

@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')

    # Find all the links in the navlist.
    await context.page.wait_for_selector('#tabs > ul > li > a')

    label = await context.page.locator('#idBloqueDatos1 > table > tbody > tr > td').text_content()

    tabs = await context.page.query_selector_all('#tabs > ul > li > a')

    for tab in tabs:
        url = await tab.get_attribute('href')
        tab_name = await tab.text_content()

        # NB: 'label' here is text scraped from the page, not a router label.
        await context.add_requests([
            Request.from_url(
                url=url,
                user_data={'label': label, 'tab_name': tab_name}),
        ])
Yeah, I can see that won't work for you.
I would use some sort of external storage, such as a global dictionary or a class attribute, where your results get aggregated.

Here's a simple code sample that implements this approach:

import json

from crawlee import Request
from crawlee.http_crawler import HttpCrawlingContext
from crawlee.router import Router

router = Router[HttpCrawlingContext]()

# Shared in-memory storage where results are aggregated, keyed by item URL.
storage = {}


@router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'default_handler is processing {context.request.url}')

    url = "https://httpbin.org/get?a=item"

    storage[url] = {}

    await context.add_requests([
        Request.from_url(
            url=url,
            label='ITEM',
            user_data={"item_url": url}),
        ])


@router.handler('ITEM')
async def item_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    url = context.request.user_data["item_url"]
    tabs = [f"https://httpbin.org/get?tab{i}={i}" for i in range(11)]
    # Record how many tab pages must report back before the item is complete.
    storage[url]["all_tabs"] = len(tabs)
    storage[url]["processed_tabs"] = 0
    requests = [Request.from_url(
            url=tab,
            label='TAB',
            user_data={"item_url": url}) for tab in tabs]
    await context.add_requests(requests)


@router.handler('TAB')
async def tab_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'tab_handler is processing {context.request.url}')

    url = context.request.user_data["item_url"]

    data = json.loads(context.http_response.read())

    for key, value in data["args"].items():
        storage[url].update({key: value})

    # After the last tab is merged, push the aggregated item and clean up.
    storage[url]["processed_tabs"] += 1
    if storage[url]["processed_tabs"] == storage[url]["all_tabs"]:
        del storage[url]["processed_tabs"]
        del storage[url]["all_tabs"]
        await context.push_data(storage[url])
        del storage[url]
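
To run this end to end, the router just needs to be attached to a crawler. A minimal sketch, assuming the HttpCrawler class from the same Crawlee package (import paths vary slightly between releases) and reusing the httpbin start URL from the sample above:

import asyncio

from crawlee.http_crawler import HttpCrawler


async def main() -> None:
    # The router with the three handlers above acts as the request handler.
    crawler = HttpCrawler(request_handler=router)

    # The start URL goes to the default handler, which kicks off the
    # ITEM -> TAB chain; each fully aggregated item is pushed as one record.
    await crawler.run(['https://httpbin.org/get'])


if __name__ == '__main__':
    asyncio.run(main())

Since the handlers all run as asyncio tasks on a single thread, the read-modify-write on storage[url]["processed_tabs"] needs no extra locking, and the counter guarantees push_data fires exactly once per item, after the last tab has been merged in.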