import asyncio
from playwright.async_api import async_playwright
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional
import json
import argparse
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
import requests
import os
import urllib.parse
from pathlib import Path


class LanguageExtractor:
    """Extract visible text segments and their language context from a web page.

    Supports WCAG technique H58 verification: passages written in a language
    other than the page default should carry an explicit ``lang`` attribute.
    This class collects leaf-level text snippets together with any local
    ``lang`` attribute so a downstream checker can flag unmarked language
    changes.
    """

    def __init__(
        self,
        url: str,
        short_segments_length_threshold: int = 30,
        max_total_length: int = 15000,
    ):
        # URL of the page to inspect.
        self.url = url
        # Segments shorter than this are skipped — too short to be
        # meaningful for language detection.
        self.short_segments_length_threshold = short_segments_length_threshold
        # Cap on the total number of extracted text characters.
        self.max_total_length = max_total_length

    async def extract_content_with_lang_context(self) -> Dict:
        """Load the page and return text segments with their ``lang`` context.

        The verification is: read through all the text content on the page
        and identify any passages that are in a different language than the
        page default, then check whether those passages have a ``lang``
        attribute marking them correctly. If a language change exists in the
        text but no ``lang`` attribute is present, that is an H58 failure.

        Returns:
            On success, a dict with keys ``page_url``, ``main_page_lang``,
            ``extracted_segments`` (list of ``{"tag", "lang", "html"}`` dicts,
            where ``"html"`` holds the cleaned inner text), and
            ``total_char_count``. On failure, ``{"error": <message>}``.
        """
        async with async_playwright() as p:
            # Launch and manage the browser lifecycle; always closed in `finally`.
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            results = {
                "page_url": self.url,
                "main_page_lang": "not specified",
                "extracted_segments": [],
                "total_char_count": 0,
            }

            try:
                # Optimized wait: stop once the DOM is ready.
                await page.goto(self.url, timeout=50000, wait_until="domcontentloaded")

                # 1. Root language (global context) from the <html> tag.
                root_lang = await page.locator("html").get_attribute("lang") or "unknown"
                results["main_page_lang"] = root_lang

                # 2. Find leaf nodes containing text (the H58 logic): elements
                # with text but no element children yield the cleanest snippets.
                elements = await page.locator("//*[text() and not(*)]").all()

                # Running total of extracted text characters (text content
                # only — HTML tags are not counted).
                current_length = 0
                for element in elements:
                    if current_length >= self.max_total_length:
                        results["extracted_segments"].append(
                            "...[Truncated: Limit Reached]"
                        )
                        break
                    try:
                        # Skip non-content tags.
                        tag_name = await element.evaluate(
                            "el => el.tagName.toLowerCase()"
                        )
                        if tag_name in ["script", "style", "noscript", "html"]:
                            continue

                        # Local language context for this element, if any.
                        local_lang = await element.get_attribute("lang")

                        clean_text = (await element.inner_text()).strip()
                        if (
                            not clean_text
                            or len(clean_text) < self.short_segments_length_threshold
                        ):
                            # Skip very short text — unlikely to be meaningful
                            # for language detection.
                            continue

                        # Package the data: text plus its language metadata.
                        results["extracted_segments"].append(
                            {
                                "tag": tag_name,
                                "lang": local_lang if local_lang else "inherited",
                                "html": clean_text,
                            }
                        )
                        current_length += len(clean_text)
                    except Exception:
                        # Best-effort: silently skip individual element errors
                        # (e.g. detached nodes) to keep the loop moving.
                        continue

                # Total character count of extracted text content only.
                results["total_char_count"] = current_length
                return results
            except Exception as e:
                return {"error": str(e)}
            finally:
                await browser.close()