Apify Discord Mirror

How to store data in the same dict from different URLs?

At a glance
The community member has a list of results and enqueues a link for each item. For each item, they need to crawl internal pages (tabs), extract the data from tables, and add it all to the same dictionary. They can extract the data from all the pages with a router and enqueue_links, but they are unable to gather all the data in the same dictionary for each item. The community members discuss potential solutions, including aggregating the results in external storage such as a global dictionary or a class-based store.
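In outline, the pattern the thread converges on looks like this (a minimal sketch with hypothetical names; the full working sample from the discussion appears further down):

storage: dict[str, dict] = {}


def open_item(item_url: str, expected_tabs: int) -> None:
    # Called once per item, as soon as the number of tab pages is known.
    storage[item_url] = {'expected': expected_tabs, 'done': 0, 'data': {}}


def on_tab_scraped(item_url: str, tab_data: dict) -> dict | None:
    # Merge one tab's data; return the complete record once the last tab lands.
    entry = storage[item_url]
    entry['data'].update(tab_data)
    entry['done'] += 1
    if entry['done'] == entry['expected']:
        return storage.pop(item_url)['data']
    return None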
I have a list of results where I enqueue the link for each item. For each item, I need to crawl internal pages (tabs), extract the data from tables, and add the data to the same dict. I can extract the data from all the pages with the router and enqueue_links, but I am not able to gather all the data in the same dict for each item. What is the best way to do it?
# Imports assume a recent version of crawlee for Python.
from crawlee.crawlers import PlaywrightCrawlingContext
from crawlee.router import Router

router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'default_handler is processing {context.request.url}')
    await context.page.wait_for_selector('li.resultado-busqueda > a')

    # Enqueue a request for each item in the result list.
    await context.enqueue_links(
        selector='li.resultado-busqueda > a',
        label='ITEM',
    )

    # Follow pagination if a next-page link is present.
    next_button = await context.page.query_selector('.paginar2 > ul > li > a')

    if next_button:
        await context.enqueue_links(
            selector='.paginar2 > ul > li > a',
            label='RESULTS',
        )


@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')

    await context.page.wait_for_selector('#tabs > ul > li > a')

    # Enqueue every tab of the item.
    await context.enqueue_links(
        selector='#tabs > ul > li > a',
        label='TAB',
    )


@router.handler('TAB')
async def tab_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'tab_handler is processing {context.request.url}')

    tables = await context.page.query_selector_all('table')

    data = {}

    # Each table row holds a <th> key and a <td> value.
    for table in tables:
        rows = await table.query_selector_all('tr')

        for row in rows:
            key = await row.query_selector('th')
            value = await row.query_selector('td')

            if key and value:
                key_text = await key.text_content()
                value_text = await value.text_content()
                data[key_text.strip()] = value_text.strip()

    context.log.info(data)

    # This pushes one dataset record per tab, not one per item.
    await context.push_data(data)
Hi @Mantisus, thank you for your response. I checked the thread, but I'm still trying to figure out how to gather all the data from the same item into the same dataset record. This is my attempt:

from crawlee import Request


@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')

    # Find all the tab links in the nav list.
    await context.page.wait_for_selector('#tabs > ul > li > a')

    label = await context.page.locator('#idBloqueDatos1 > table > tbody > tr > td').text_content()

    tabs = await context.page.query_selector_all('#tabs > ul > li > a')

    for tab in tabs:
        url = await tab.get_attribute('href')
        tab_name = await tab.text_content()

        await context.add_requests([
            Request.from_url(
                url=url,
                user_data={'label': label, 'tab_name': tab_name}),
        ])
Yeah, I can see that won't work for you.
I would use some sort of external storage, like a global dictionary or a class-based store, where your results can be aggregated.

Here's a simple code sample that implements this:

import json

# Imports assume a recent version of crawlee for Python.
from crawlee import Request
from crawlee.crawlers import HttpCrawlingContext
from crawlee.router import Router

router = Router[HttpCrawlingContext]()

storage = {}

@router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'default_handler is processing {context.request.url}')

    url = "https://httpbin.org/get?a=item"

    storage[url] = {}

    await context.add_requests([
        Request.from_url(
            url=url,
            label='ITEM',
            user_data={"item_url":url}),
        ])


@router.handler('ITEM')
async def item_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    url = context.request.user_data["item_url"]
    tabs = [f"https://httpbin.org/get?tab{i}={i}" for i in range(11)]
    storage[url]["all_tabs"] = len(tabs)
    storage[url]["processed_tabs"] = 0
    requests = [Request.from_url(
            url=tab,
            label='TAB',
            user_data={"item_url":url}) for tab in tabs]
    await context.add_requests(requests)

@router.handler('TAB')
async def tab_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'tab_handler is processing {context.request.url}')

    url = context.request.user_data["item_url"]

    data = json.loads(context.http_response.read())

    for key, value in data["args"].items():
        storage[url].update({
            key: value
        })

    storage[url]["processed_tabs"] += 1
    if storage[url]["processed_tabs"] == storage[url]["all_tabs"]:
        del storage[url]["processed_tabs"]
        del storage[url]["all_tabs"]
        await context.push_data(storage[url])
        del storage[url]
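For the class-storage variant mentioned above, a minimal sketch could look like the following. The ItemAggregator name and its methods are made up for illustration, and the asyncio.Lock is defensive, since all handlers share one event loop:

import asyncio


class ItemAggregator:
    """Aggregates partial results per item and reports when an item is complete."""

    def __init__(self) -> None:
        self._items: dict[str, dict] = {}
        self._lock = asyncio.Lock()

    async def open_item(self, item_url: str, expected_tabs: int) -> None:
        # Called from the ITEM handler once the number of tabs is known.
        async with self._lock:
            self._items[item_url] = {'expected': expected_tabs, 'done': 0, 'data': {}}

    async def add_partial(self, item_url: str, partial: dict) -> dict | None:
        # Called from the TAB handler; returns the merged record after
        # the last tab has arrived, otherwise None.
        async with self._lock:
            entry = self._items[item_url]
            entry['data'].update(partial)
            entry['done'] += 1
            if entry['done'] == entry['expected']:
                return self._items.pop(item_url)['data']
        return None


aggregator = ItemAggregator()

The TAB handler then reduces to record = await aggregator.add_partial(url, data) followed by if record is not None: await context.push_data(record), which keeps the bookkeeping counters out of the pushed data.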