
[Bug]: Failed to click on the specified button to load more content

Open · Hnam29 opened this issue 11 months ago · 2 comments

crawl4ai version

0.4.3bx

Expected Behavior

  • Able to click the button to load new articles
  • Able to click each article's link to open the article detail content

Current Behavior

  • Fails to click the button to load new articles
  • Fails to click each article's link to open the article detail content

Is this reproducible?

Yes

Inputs Causing the Bug

- URL: https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn

Steps to Reproduce


Code snippets

import asyncio
import json
from datetime import datetime
from urllib.parse import urljoin
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def extract():
    current_time = datetime.now().strftime('%Y%m%d%H%M')
    filename = f"{current_time}_data.json"
    base_url = 'https://cafef.vn'  

    # Define the schema for the main page extraction
    schema_main = {
        "name": "Article",
        "baseSelector": "#divEvents > ul > li",  
        "fields": [
            {"name": "TITLE", "selector": "a.docnhanhTitle", "type": "text"},
            {"name": "LINK", "selector": "a.docnhanhTitle", "type": "attribute", "attribute": "href"},
            {"name": "TIME", "selector": "span.timeTitle", "type": "text"},
        ],
    }

    # Define the schema for the article detail extraction
    schema_article = {
        "name": "ArticleDetail",
        "baseSelector": "body",
        "fields": [
            {"name": "PARAGRAPHS", "selector": "p", "type": "text"},
        ],
    }

    main_extraction_strategy = JsonCssExtractionStrategy(schema_main, verbose=True)
    article_extraction_strategy = JsonCssExtractionStrategy(schema_article, verbose=True)

    article_details = []
    session_id = "test"  

    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        # Step 1: Load the initial page with the main articles 
        config_initial = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=main_extraction_strategy,
            session_id=session_id
        )
        result_initial = await crawler.arun(
            url='https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn',
            config=config_initial,
        )

        # Step 2: Click the button to load more content
        js_click_next = [
            "const nextButton = document.querySelector('#spanNext'); nextButton && nextButton.click();"
        ]
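        # With the same session_id and js_only=True in the config below, crawl4ai
        # reuses the already-open page and only runs this JS; it does not navigate
        # to the URL again.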
        wait_for_condition = "css:#divEvents > ul > li"  
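        # Caveat: the li rows from page 1 already match this css: selector, so the
        # wait is likely satisfied immediately, before the AJAX pagination has
        # replaced the list with new rows.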
        config_click = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
            js_code=js_click_next,
            wait_for=wait_for_condition,
            js_only=True
        )
        result_click = await crawler.arun(
            url='https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn',
            config=config_click,
        )

        # Step 3: Re-run extraction on the new content loaded
        config_updated = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=main_extraction_strategy,
            session_id=session_id
        )
        result_updated = await crawler.arun(
            url='https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn',
            config=config_updated,
        )
        articles = json.loads(result_updated.extracted_content)

        # Process each article for details
        for index, article in enumerate(articles):
            title = article.get("TITLE")
            relative_link = article.get("LINK")
            # Convert relative URL to absolute using urljoin
            link = urljoin(base_url, relative_link)
            time = article.get("TIME")
            is_even = (index % 2 == 0)

            # JS snippet to wait until text of p tags are loaded on the detail page
            js_wait_for_content = """
            (async () => {
                let maxRetries = 10;
                let retries = 0;
                while (retries < maxRetries) {
                    if (document.querySelectorAll('p').length > 0) break;
                    await new Promise(resolve => setTimeout(resolve, 1000));
                    retries++;
                }
            })();
            """
            config_article = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=article_extraction_strategy,
                js_code=[js_wait_for_content],
                session_id=session_id
            )
            try:
                result_article = await crawler.arun(
                    url=link,
                    config=config_article,
                )
                if not result_article.extracted_content:
                    print(f"[ERROR] No content extracted from: {link}")
                    continue

                paragraphs = json.loads(result_article.extracted_content)
                full_content = " ".join([p["PARAGRAPHS"] for p in paragraphs if p["PARAGRAPHS"]])

                article_details.append({
                    "TITLE": title,
                    "LINK": link,
                    "TIME": time,
                    "CONTENT": full_content,
                    "ROW_TYPE": "even" if is_even else "odd",
                })

            except Exception as e:
                print(f"[ERROR] Failed to process link {link}: {e}")

        await asyncio.sleep(2)  

    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(article_details, json_file, ensure_ascii=False, indent=4)

    print(f"Data has been saved to {filename}")

asyncio.run(extract())

OS

macOS - m1

Python version

3.11

Browser

Chrome

Browser version

No response

Error logs & Screenshots (if applicable)

The output only contains the articles from the first page.
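
For reference, a minimal diagnostic sketch (not a confirmed fix) that isolates the click step: instead of waiting on a css: selector that page 1 already satisfies, it waits on a js: condition that only becomes true once the first row's text changes after #spanNext is clicked. The window.__firstRow marker is introduced purely for this check and is not part of crawl4ai; the selectors are taken from the snippet above.

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def check_next_click():
    url = 'https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn'
    session_id = "test"
    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        # Load page 1 in a persistent session
        await crawler.arun(
            url=url,
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS, session_id=session_id),
        )
        # Remember the first row's text, then click "Next"
        js_click_next = [
            "window.__firstRow = document.querySelector('#divEvents > ul > li')?.innerText;",
            "document.querySelector('#spanNext')?.click();",
        ]
        # Wait until the first row's text actually differs from what page 1 showed
        wait_changed = (
            "js:() => document.querySelector('#divEvents > ul > li')?.innerText"
            " !== window.__firstRow"
        )
        result = await crawler.arun(
            url=url,
            config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                session_id=session_id,
                js_code=js_click_next,
                wait_for=wait_changed,
                js_only=True,
            ),
        )
        print("click step succeeded:", result.success)

asyncio.run(check_next_click())

If the click is not taking effect, this wait should time out instead of silently re-extracting the page-1 rows, which should make the failure point easier to spot in the logs.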

Hnam29 · Feb 22, 2025