# wcag_AI_validation/dependences/title_content_extractor.py

import asyncio
from playwright.async_api import async_playwright
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional
import json
import argparse
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
import requests
import os
import urllib.parse
from pathlib import Path


class PageTitleExtractor:
    """Extract a page's title and a short structural summary via Playwright.

    The summary is built from the page's ``<h1>`` and ``<main>`` elements,
    each rendered as ``<tag>text</tag>`` with its text truncated to
    ``threshold`` characters.
    """

    # JavaScript executed inside the page. It receives the truncation
    # threshold as a real argument (rather than having it spliced into the
    # source text), so the snippet stays constant and cannot be broken or
    # hijacked by an unexpected ``threshold`` value.
    #
    # Only 'h1' and 'main' are selected to keep the output concise for the
    # G88 evaluation consumed by the LLM; extend the selector (e.g. with
    # 'h2, h3') if more context is needed.
    _STRUCTURAL_TEXT_JS = r"""
    (threshold) => {
        const elements = document.querySelectorAll('h1, main');

        return Array.from(elements)
            .map(el => {
                const tag = el.tagName.toLowerCase();
                let text = el.innerText.replace(/\n/g, ' ').trim();

                if (text.length > threshold) {
                    text = text.substring(0, threshold) + '...';
                }

                return text ? `<${tag}>${text}</${tag}>` : null;
            })
            .filter(Boolean)
            .join(' ');
    }
    """

    def __init__(self, url: str, threshold: int = 200):
        """Store the target URL and the per-element truncation length.

        Args:
            url: Address of the page to load.
            threshold: Maximum number of characters kept from each
                extracted element's text before ``'...'`` is appended.
        """
        self.url = url
        self.threshold = threshold

    async def extract_page_title(self) -> Dict:
        """Load ``self.url`` in headless Chromium and collect title data.

        Returns:
            On success, a dict with the keys ``"page_url"``, ``"title"``
            (the document title, or the literal ``"<title></title>"`` when
            the page has none) and ``"structural_content"`` (the
            space-joined ``<tag>text</tag>`` fragments).
            On any failure while opening or reading the page, a dict with
            the single key ``"error"`` holding the exception message.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)

            try:
                # Page creation lives inside the try block so that a failure
                # here is reported like any other extraction error and the
                # browser is still closed by the finally clause.
                page = await browser.new_page()
                await page.goto(self.url, timeout=50000, wait_until="domcontentloaded")

                # An empty title is replaced by an explicit empty-tag marker.
                title = await page.title() or "<title></title>"

                # Headings and main content are gathered in one JS round trip.
                structural_text = await page.evaluate(
                    self._STRUCTURAL_TEXT_JS, self.threshold
                )

                return {
                    "page_url": self.url,
                    "title": title,
                    "structural_content": structural_text,
                }

            except Exception as e:
                # Best-effort contract: report the problem to the caller
                # through the returned dict instead of raising.
                print(f"Error extracting page title: {e}")
                return {"error": str(e)}

            finally:
                await browser.close()