
[Bug]: Failed to click on the specified button to load more content

Open · Hnam29 opened this issue 11 months ago · 2 comments

crawl4ai version

0.4.3bx

Expected Behavior

  • Able to click the button to load new articles
  • Able to click each article's link to open the article detail content

Current Behavior

  • Fails to click the button to load new articles
  • Fails to click each article's link to open the article detail content

Is this reproducible?

Yes

Inputs Causing the Bug

- URL: https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn

Steps to Reproduce


Code snippets

import asyncio
import json
from datetime import datetime
from urllib.parse import urljoin
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def extract():
    current_time = datetime.now().strftime('%Y%m%d%H%M')
    filename = f"{current_time}_data.json"
    base_url = 'https://cafef.vn'  

    # Define the schema for the main page extraction
    schema_main = {
        "name": "Article",
        "baseSelector": "#divEvents > ul > li",  
        "fields": [
            {"name": "TITLE", "selector": "a.docnhanhTitle", "type": "text"},
            {"name": "LINK", "selector": "a.docnhanhTitle", "type": "attribute", "attribute": "href"},
            {"name": "TIME", "selector": "span.timeTitle", "type": "text"},
        ],
    }

    # Define the schema for the article detail extraction
    schema_article = {
        "name": "ArticleDetail",
        "baseSelector": "body",
        "fields": [
            {"name": "PARAGRAPHS", "selector": "p", "type": "text"},
        ],
    }

    main_extraction_strategy = JsonCssExtractionStrategy(schema_main, verbose=True)
    article_extraction_strategy = JsonCssExtractionStrategy(schema_article, verbose=True)

    article_details = []
    session_id = "test"  

    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        # Step 1: Load the initial page with the main articles 
        config_initial = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=main_extraction_strategy,
            session_id=session_id
        )
        result_initial = await crawler.arun(
            url='https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn',
            config=config_initial,
        )

        # Step 2: Click the button to load more content
        js_click_next = [
            "const nextButton = document.querySelector('#spanNext'); nextButton && nextButton.click();"
        ]
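        # With the same session_id and js_only=True in the config below, crawl4ai
        # reuses the already-open page and only runs this JS; it does not navigate
        # to the URL again.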
        wait_for_condition = "css:#divEvents > ul > li"  
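        # Caveat: the li rows from page 1 already match this css: selector, so the
        # wait is likely satisfied immediately, before the AJAX pagination has
        # replaced the list with new rows.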
        config_click = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
            js_code=js_click_next,
            wait_for=wait_for_condition,
            js_only=True
        )
        result_click = await crawler.arun(
            url='https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn',
            config=config_click,
        )

        # Step 3: Re-run extraction on the new content loaded
        config_updated = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=main_extraction_strategy,
            session_id=session_id
        )
        result_updated = await crawler.arun(
            url='https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn',
            config=config_updated,
        )
        articles = json.loads(result_updated.extracted_content)

        # Process each article for details
        for index, article in enumerate(articles):
            title = article.get("TITLE")
            relative_link = article.get("LINK")
            # Convert relative URL to absolute using urljoin
            link = urljoin(base_url, relative_link)
            time = article.get("TIME")
            is_even = (index % 2 == 0)

            # JS snippet to wait until text of p tags are loaded on the detail page
            js_wait_for_content = """
            (async () => {
                let maxRetries = 10;
                let retries = 0;
                while (retries < maxRetries) {
                    if (document.querySelectorAll('p').length > 0) break;
                    await new Promise(resolve => setTimeout(resolve, 1000));
                    retries++;
                }
            })();
            """
            config_article = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=article_extraction_strategy,
                js_code=[js_wait_for_content],
                session_id=session_id
            )
            try:
                result_article = await crawler.arun(
                    url=link,
                    config=config_article,
                )
                if not result_article.extracted_content:
                    print(f"[ERROR] No content extracted from: {link}")
                    continue

                paragraphs = json.loads(result_article.extracted_content)
                full_content = " ".join([p["PARAGRAPHS"] for p in paragraphs if p["PARAGRAPHS"]])

                article_details.append({
                    "TITLE": title,
                    "LINK": link,
                    "TIME": time,
                    "CONTENT": full_content,
                    "ROW_TYPE": "even" if is_even else "odd",
                })

            except Exception as e:
                print(f"[ERROR] Failed to process link {link}: {e}")

        await asyncio.sleep(2)  

    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(article_details, json_file, ensure_ascii=False, indent=4)

    print(f"Data has been saved to {filename}")

asyncio.run(extract())

OS

macOS - m1

Python version

3.11

Browser

Chrome

Browser version

No response

Error logs & Screenshots (if applicable)

The output only contains the articles from the first page.
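
For reference, a minimal diagnostic sketch (not a confirmed fix) that isolates the click step: instead of waiting on a css: selector that page 1 already satisfies, it waits on a js: condition that only becomes true once the first row's text changes after #spanNext is clicked. The window.__firstRow marker is introduced purely for this check and is not part of crawl4ai; the selectors are taken from the snippet above.

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def check_next_click():
    url = 'https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn'
    session_id = "test"
    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        # Load page 1 in a persistent session
        await crawler.arun(
            url=url,
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS, session_id=session_id),
        )
        # Remember the first row's text, then click "Next"
        js_click_next = [
            "window.__firstRow = document.querySelector('#divEvents > ul > li')?.innerText;",
            "document.querySelector('#spanNext')?.click();",
        ]
        # Wait until the first row's text actually differs from what page 1 showed
        wait_changed = (
            "js:() => document.querySelector('#divEvents > ul > li')?.innerText"
            " !== window.__firstRow"
        )
        result = await crawler.arun(
            url=url,
            config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                session_id=session_id,
                js_code=js_click_next,
                wait_for=wait_changed,
                js_only=True,
            ),
        )
        print("click step succeeded:", result.success)

asyncio.run(check_next_click())

If the click is not taking effect, this wait should time out instead of silently re-extracting the page-1 rows, which should make the failure point easier to spot in the logs.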

Hnam29 · Feb 22, 2025