wcag_AI_validation/dependences/title_content_extractor.py

74 lines
2.7 KiB
Python

import asyncio
from playwright.async_api import async_playwright
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional
import json
import argparse
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
import requests
import os
import urllib.parse
from pathlib import Path
class PageTitleExtractor:
    """Extract a page's title plus a short summary of its structural text.

    The page is loaded in headless Chromium through Playwright. The summary
    is built from the page's ``<h1>`` and ``<main>`` elements, each truncated
    to ``threshold`` characters.
    """

    # JavaScript function evaluated inside the page. The truncation threshold
    # is received as an argument (via ``page.evaluate(script, arg)``) instead
    # of being interpolated into the script source, so the script is a constant
    # and arbitrary values can never be injected as code.
    #
    # Only 'h1' and 'main' are selected to keep the output concise for the
    # G88 evaluation by the LLM; widen the selector (e.g. 'h1, h2, h3, main')
    # if more tags are needed.
    _STRUCTURAL_TEXT_SCRIPT = """
        (threshold) => {
            const elements = document.querySelectorAll('h1, main');
            return Array.from(elements)
                .map(el => {
                    const tag = el.tagName.toLowerCase();
                    let text = el.innerText.replace(/\\n/g, ' ').trim();
                    if (text.length > threshold) {
                        text = text.substring(0, threshold) + '...';
                    }
                    return text ? `<${tag}>${text}</${tag}>` : null;
                })
                .filter(Boolean)
                .join(' ');
        }
    """

    def __init__(self, url: str, threshold: int = 200) -> None:
        """Store the target URL and the per-element truncation length.

        Args:
            url: Address of the page to load.
            threshold: Maximum number of characters kept from each matched
                element's text before ``'...'`` is appended.
        """
        self.url = url
        self.threshold = threshold

    async def extract_page_title(self) -> Dict:
        """Load ``self.url`` and return its title and structural text.

        Returns:
            On success, a dict with the keys ``"page_url"``, ``"title"`` and
            ``"structural_content"``. An empty page title is reported as the
            literal string ``"<title></title>"``. ``"structural_content"`` is
            a space-joined sequence of ``<tag>text</tag>`` fragments for the
            non-empty ``h1`` / ``main`` elements.

            If navigation or extraction raises, the error is printed and a
            dict with the single key ``"error"`` is returned instead.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            try:
                # Waiting only for DOMContentLoaded avoids blocking on slow
                # sub-resources; the timeout value is in milliseconds.
                await page.goto(
                    self.url, timeout=50000, wait_until="domcontentloaded"
                )

                # Extract the page title, with an explicit placeholder so an
                # empty title stays visible downstream.
                title = await page.title()
                if not title:
                    title = "<title></title>"

                # Extract headings and main content in a single JS call,
                # handing the threshold over as a serialized argument.
                structural_text = await page.evaluate(
                    self._STRUCTURAL_TEXT_SCRIPT, self.threshold
                )

                return {
                    "page_url": self.url,
                    "title": title,
                    "structural_content": structural_text,
                }
            except Exception as e:
                # Best-effort boundary: report the failure to the caller as
                # data rather than propagating the exception.
                print(f"Error extracting page title: {e}")
                return {"error": str(e)}
            finally:
                await browser.close()