117 lines
4.8 KiB
Python
117 lines
4.8 KiB
Python
import asyncio
|
|
from playwright.async_api import async_playwright
|
|
from datetime import datetime, timezone
|
|
from urllib.parse import urljoin, urlparse
|
|
from typing import List, Dict, Optional
|
|
import json
|
|
import argparse
|
|
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
|
|
import requests
|
|
import os
|
|
import urllib.parse
|
|
from pathlib import Path
|
|
|
|
|
|
class LanguageExtractor:
    """Extract page text segments together with their language context (WCAG H58 audit).

    Loads a URL with Playwright, records the root ``<html>`` lang attribute, then
    collects the text of leaf elements along with any element-level ``lang``
    attribute so a downstream checker can spot passages in a different language
    that lack proper ``lang`` markup.
    """

    # Tags whose text content is never user-visible prose.
    _SKIP_TAGS = frozenset({"script", "style", "noscript", "html"})

    def __init__(
        self,
        url: str,
        short_segments_length_threshold: int = 30,
        max_total_length: int = 15000,
        goto_timeout_ms: int = 50000,
    ):
        """
        Args:
            url: Page to audit.
            short_segments_length_threshold: Minimum text length (characters)
                for a segment to be kept; shorter snippets are unlikely to be
                meaningful for language detection.
            max_total_length: Cap on the combined character count of extracted
                text (text content only; HTML tags are not counted).
            goto_timeout_ms: Navigation timeout (milliseconds) passed to
                ``page.goto``; defaults to the previous hard-coded 50000.
        """
        self.url = url
        self.short_segments_length_threshold = short_segments_length_threshold
        self.max_total_length = max_total_length
        self.goto_timeout_ms = goto_timeout_ms

    async def _extract_segment(self, element) -> Optional[Dict]:
        """Return a segment dict for *element*, or None if it should be skipped.

        Skips non-content tags and text shorter than the configured threshold.
        May raise if the element has become detached; the caller handles that.
        """
        tag_name = await element.evaluate("el => el.tagName.toLowerCase()")
        if tag_name in self._SKIP_TAGS:
            return None

        # Element-level language override, if the author supplied one.
        local_lang = await element.get_attribute("lang")

        clean_text = (await element.inner_text()).strip()
        # Skip very short text which is unlikely to be meaningful for
        # language detection.
        if not clean_text or len(clean_text) < self.short_segments_length_threshold:
            return None

        # Package the data: text + its specific language metadata.
        return {
            "tag": tag_name,
            "lang": local_lang if local_lang else "inherited",
            "html": clean_text,
        }

    async def extract_content_with_lang_context(self) -> Dict:
        """
        The verification is:
        Read through all the text content on the page and identify any passages
        that are in a different language than the page default.
        Then check whether those passages have a lang attribute marking them
        correctly as being in a different language.
        If a language change exists in the text but no lang attribute is
        present -> that's a failure of H58.

        Returns:
            A dict with ``page_url``, ``main_page_lang``, ``extracted_segments``
            and ``total_char_count`` on success, or ``{"error": ...}`` if the
            page could not be loaded/processed.
        """
        async with async_playwright() as p:
            # Launch and manage the browser lifecycle; closed in ``finally``.
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            results = {
                "page_url": self.url,
                "main_page_lang": "not specified",
                "extracted_segments": [],
                "total_char_count": 0,
            }

            try:
                # Optimized wait: stop once the DOM is ready; a full load
                # (images, subresources) is not needed for text extraction.
                await page.goto(
                    self.url,
                    timeout=self.goto_timeout_ms,
                    wait_until="domcontentloaded",
                )

                # 1. Root language (global context) from the <html> tag.
                root_lang = await page.locator("html").get_attribute("lang") or "unknown"
                results["main_page_lang"] = root_lang

                # 2. Leaf nodes containing text (the H58 logic): elements with
                # text but no child elements give the 'cleanest' snippets.
                elements = await page.locator("//*[text() and not(*)]").all()

                # Running total of extracted text, not counting HTML tags.
                current_length = 0

                for element in elements:
                    if current_length >= self.max_total_length:
                        results["extracted_segments"].append(
                            "...[Truncated: Limit Reached]"
                        )
                        break

                    try:
                        segment = await self._extract_segment(element)
                    except Exception:
                        # Best-effort: skip elements that fail (e.g. detached
                        # nodes) so one bad element does not abort the scan.
                        continue

                    if segment is None:
                        continue

                    results["extracted_segments"].append(segment)
                    current_length += len(segment["html"])

                results["total_char_count"] = current_length
                return results

            except Exception as e:
                # Page-level failure (navigation timeout, DNS, etc.).
                return {"error": str(e)}
            finally:
                await browser.close()