crawl4ai
[Bug]: Failed to click on the specified button to load more content
crawl4ai version
0.4.3bx
Expected Behavior
- Able to click the button to load more articles
- Able to click each article's link to open the article detail content
Current Behavior
- Unable to click the button to load more articles
- Unable to click each article's link to open the article detail content
Is this reproducible?
Yes
Inputs Causing the Bug
- URL: https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn
Steps to Reproduce
Code snippets
import asyncio
import json
from datetime import datetime
from urllib.parse import urljoin

from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig


async def extract():
    current_time = datetime.now().strftime('%Y%m%d%H%M')
    filename = f"{current_time}_data.json"
    base_url = 'https://cafef.vn'

    # Define the schema for the main page extraction
    schema_main = {
        "name": "Article",
        "baseSelector": "#divEvents > ul > li",
        "fields": [
            {"name": "TITLE", "selector": "a.docnhanhTitle", "type": "text"},
            {"name": "LINK", "selector": "a.docnhanhTitle", "type": "attribute", "attribute": "href"},
            {"name": "TIME", "selector": "span.timeTitle", "type": "text"},
        ],
    }

    # Define the schema for the article detail extraction
    schema_article = {
        "name": "ArticleDetail",
        "baseSelector": "body",
        "fields": [
            {"name": "PARAGRAPHS", "selector": "p", "type": "text"},
        ],
    }

    main_extraction_strategy = JsonCssExtractionStrategy(schema_main, verbose=True)
    article_extraction_strategy = JsonCssExtractionStrategy(schema_article, verbose=True)

    article_details = []
    session_id = "test"

    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        # Step 1: Load the initial page with the main articles
        config_initial = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=main_extraction_strategy,
            session_id=session_id
        )
        result_initial = await crawler.arun(
            url='https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn',
            config=config_initial,
        )

        # Step 2: Click the button to load more content
        js_click_next = [
            "const nextButton = document.querySelector('#spanNext'); nextButton && nextButton.click();"
        ]
        wait_for_condition = "css:#divEvents > ul > li"
        config_click = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
            js_code=js_click_next,
            wait_for=wait_for_condition,
            js_only=True
        )
        result_click = await crawler.arun(
            url='https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn',
            config=config_click,
        )

        # Step 3: Re-run extraction on the new content loaded
        config_updated = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=main_extraction_strategy,
            session_id=session_id
        )
        result_updated = await crawler.arun(
            url='https://cafef.vn/du-lieu/tin-doanh-nghiep/vnd/event.chn',
            config=config_updated,
        )
        articles = json.loads(result_updated.extracted_content)

        # Process each article for details
        for index, article in enumerate(articles):
            title = article.get("TITLE")
            relative_link = article.get("LINK")
            # Convert relative URL to absolute using urljoin
            link = urljoin(base_url, relative_link)
            time = article.get("TIME")
            is_even = (index % 2 == 0)

            # JS snippet to wait until text of p tags are loaded on the detail page
            js_wait_for_content = """
            (async () => {
                let maxRetries = 10;
                let retries = 0;
                while (retries < maxRetries) {
                    if (document.querySelectorAll('p').length > 0) break;
                    await new Promise(resolve => setTimeout(resolve, 1000));
                    retries++;
                }
            })();
            """

            config_article = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=article_extraction_strategy,
                js_code=[js_wait_for_content],
                session_id=session_id
            )
            try:
                result_article = await crawler.arun(
                    url=link,
                    config=config_article,
                )
                if not result_article.extracted_content:
                    print(f"[ERROR] No content extracted from: {link}")
                    continue
                paragraphs = json.loads(result_article.extracted_content)
                full_content = " ".join([p["PARAGRAPHS"] for p in paragraphs if p["PARAGRAPHS"]])
                article_details.append({
                    "TITLE": title,
                    "LINK": link,
                    "TIME": time,
                    "CONTENT": full_content,
                    "ROW_TYPE": "even" if is_even else "odd",
                })
            except Exception as e:
                print(f"[ERROR] Failed to process link {link}: {e}")
            await asyncio.sleep(2)

    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(article_details, json_file, ensure_ascii=False, indent=4)
    print(f"Data has been saved to {filename}")


asyncio.run(extract())
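
Note on the detail-page wait: crawl4ai's wait_for parameter can express the same condition as the polling script above declaratively. A minimal sketch of an alternative config_article (untested, reusing the strategy and session names from the snippet above, and assuming the detail pages render the article body in <p> tags):

# Hypothetical variant of config_article: let the crawler wait for the first
# <p> element via a "css:" wait_for condition instead of the polling JS snippet.
# (Assumption: the detail pages render the article text in <p> tags.)
config_article = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    extraction_strategy=article_extraction_strategy,
    wait_for="css:p",
    session_id=session_id,
)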
OS
macOS - m1
Python version
3.11
Browser
Chrome
Browser version
No response
Error logs & Screenshots (if applicable)
The output only contains the articles from the first page; no new articles appear after the click step.
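
One factor worth checking (an assumption, not verified against the site): the Step 2 wait condition css:#divEvents > ul > li already matches the first-page list, so the crawl may return before the clicked content arrives. Below is a minimal sketch of a stricter "js:" wait condition that compares the item count against the first page; the selector and the append-vs-replace behaviour of the list are assumptions:

# Hypothetical check: wait until the list holds more items than the first page did.
# initial_count comes from the Step 1 extraction above; this assumes cafef.vn
# appends new <li> items rather than replacing the existing ones.
initial_count = len(json.loads(result_initial.extracted_content))
wait_for_growth = (
    "js:() => document.querySelectorAll('#divEvents > ul > li').length > "
    f"{initial_count}"
)
config_click = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    session_id=session_id,
    js_code=js_click_next,
    wait_for=wait_for_growth,
    js_only=True,
)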