wcag_AI_validation/dependences/language_extractor.py

import asyncio
from playwright.async_api import async_playwright
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional
import json
import argparse
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
import requests
import os
import urllib.parse
from pathlib import Path


class LanguageExtractor:

    def __init__(
        self,
        url: str,
    ):

        self.url = url


    async def extract_languages(self, extract_context=True) -> Dict:

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                #await page.goto(self.url, timeout=50000, wait_until="load")
                #await page.wait_for_timeout(2000)
                await page.goto(self.url, timeout=50000, wait_until="domcontentloaded")# faster in this case, we just need the DOM to be loaded, not necessarily all the resources

                lang_only_elements = []
                lang_and_xml_lang_elements = []

                # Extract the lang attribute of the <html> tag
                html_tag = page.locator('html')
                html_tag_lang = await html_tag.get_attribute('lang')
                html_tag_xml_lang = await html_tag.get_attribute('xml:lang')

                if html_tag_lang and html_tag_xml_lang:
                    lang_and_xml_lang_elements.append(
                        f'<html lang="{html_tag_lang}" xml:lang="{html_tag_xml_lang}"></html>'
                    )
                elif html_tag_lang:
                    lang_only_elements.append(f'<html lang="{html_tag_lang}"></html>')

                # Find all elements with the lang attribute (excluding <html>)
                elements_with_lang = await page.locator('//*[@lang and not(self::html)]').all()

                for element in elements_with_lang:
                    outer_html = await element.evaluate('el => el.outerHTML')
                    xml_lang = await element.get_attribute('xml:lang')
                    if xml_lang:
                        lang_and_xml_lang_elements.append(outer_html)
                    else:
                        lang_only_elements.append(outer_html)

                return {
                    "lang_only": "; ".join(lang_only_elements),
                    "lang_and_xml": "; ".join(lang_and_xml_lang_elements)
                }

            except Exception as e:
                print(f"Error extracting languages: {e}")
                return {"error": str(e)}

            finally:
                await browser.close()


    """
    ## quella da nodejs
    from playwright.async_api import Page

    async def h58(page: Page):
        results = []

        try:
            print("Identifying the main language of the page...")
            # Identify the main language of the page
            main_lang = "The main language of the page is: not specified"
            try:
                # Playwright uses locator() or query_selector()
                html_element = page.locator('html')
                lang_attribute = await html_element.get_attribute('lang')
                if lang_attribute:
                    main_lang = f"The main language of the page is: {lang_attribute}"
            except Exception as e:
                print(f"Error identifying main language: {e}")

            print("Find all elements containing text")
            # Find all elements containing text that don't have children (leaf nodes)
            try:
                # Playwright handles XPaths directly through the locator API
                elements = await page.locator('//*[text() and not(*)]').all()
            except Exception as e:
                print(f"Error finding text elements: {e}")
                return results

            print("Create a string to collect the outer html of all the elements containing text...")
            all_outer_html = ""

            for element in elements:
                try:
                    # Get the tag name
                    tag_name = await element.evaluate("el => el.tagName.toLowerCase()")

                    # Skip <html>, <style> and <script> elements
                    if tag_name in ['html', 'style', 'script']:
                        continue

                    # Get the outerHTML
                    html_content = await element.evaluate("el => el.outerHTML")
                    all_outer_html += html_content

                    # Truncate at 15,000 characters to save tokens
                    if len(all_outer_html) > 15000:
                        all_outer_html = all_outer_html[:15000] + "(...continues)"
                        break # Stop processing once limit is reached to save time

                except Exception as e:
                    print(f"Error processing element: {e}")

            # You can append the final result to your results list here
            results.append({"main_lang": main_lang, "content": all_outer_html})

        except Exception as e:
            print(f"Unexpected error: {e}")

        return results
    """
    async def extract_content_with_lang_context(self) -> Dict:
        """
        The verification is:
        Read through all the text content on the page and identify any passages that are in a different language than the page default
        Then check whether those passages have a lang attribute marking them correctly as being in a different language.
        If a language change exists in the text but no lang attribute is present → that's a failure of H58"""

        async with async_playwright() as p:
            # Efficiently launch and manage the browser lifecycle
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            results = {
                "main_page_lang": "not specified",
                "extracted_segments": [],
                "total_char_count": 0
            }

            try:
                # Optimized wait: stop once the DOM is ready
                await page.goto(self.url, timeout=50000, wait_until="domcontentloaded")

                # 1. Get Root Language (Global Context)
                html_tag = page.locator('html')
                root_lang = await html_tag.get_attribute('lang') or "unknown"
                results["main_page_lang"] = root_lang

                # 2. Find Leaf Nodes containing text (The H58 Logic)
                # We target elements with text but no child elements to get the 'cleanest' snippets
                elements = await page.locator('//*[text() and not(*)]').all()

                current_length = 0
                max_length = 15000

                for element in elements:
                    if current_length >= max_length:
                        results["extracted_segments"].append("...[Truncated: Limit Reached]")
                        break

                    try:
                        # Skip non-content tags
                        tag_name = await element.evaluate("el => el.tagName.toLowerCase()")
                        if tag_name in ['script', 'style', 'noscript', 'html']:
                            continue

                        # Get local language context (The extract_languages logic)
                        local_lang = await element.get_attribute('lang')
                        #outer_html = await element.evaluate("el => el.outerHTML")
                        clean_text = await element.inner_text()
                        clean_text = clean_text.strip()
                        if not clean_text:
                            continue

                        # Package the data: Text + its specific language metadata
                        segment = {
                            "tag": tag_name,
                            "lang": local_lang if local_lang else "inherited",
                            "html": clean_text
                        }

                        results["extracted_segments"].append(segment)
                        current_length += len(clean_text)

                    except Exception as e:
                        # Silently skip individual element errors to keep the loop moving
                        continue

                results["total_char_count"] = current_length
                return results

            except Exception as e:
                return {"error": str(e)}
            finally:
                await browser.close()