import argparse
import asyncio
import json
import os
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from playwright.async_api import async_playwright

from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder


class PageTitleExtractor:
    """Fetch a page with headless Chromium and collect its title and key text.

    Attributes:
        url: Address of the page to load.
        threshold: Maximum number of characters kept from each matched
            element's text; longer text is cut and suffixed with ``...``.
    """

    # JavaScript run inside the page. It receives ``threshold`` as an argument
    # (rather than having it interpolated into the source) so the script stays
    # a constant string and no Python value is ever spliced into JS code.
    # Only <h1> and <main> are collected to keep the output concise for the
    # G88 evaluation; widen the selector (e.g. 'h1, h2, h3, main') if more
    # structure is needed.
    _STRUCTURAL_TEXT_JS = r"""
    (threshold) => {
        const elements = document.querySelectorAll('h1, main');
        return Array.from(elements)
            .map(el => {
                const tag = el.tagName.toLowerCase();
                let text = el.innerText.replace(/\n/g, ' ').trim();
                if (text.length > threshold) {
                    text = text.substring(0, threshold) + '...';
                }
                return text ? `<${tag}>${text}` : null;
            })
            .filter(Boolean)
            .join(' ');
    }
    """

    def __init__(self, url: str, threshold: int = 200):
        """Store the target URL and the per-element text length limit."""
        self.url = url
        self.threshold = threshold

    async def extract_page_title(self) -> Dict:
        """Load ``self.url`` and return its title plus condensed h1/main text.

        Returns:
            On success, a dict with keys ``page_url``, ``title`` (empty string
            when the page has no title) and ``structural_content`` (each
            non-empty ``h1``/``main`` element rendered as ``<tag>text`` and
            joined with single spaces).
            If navigation or extraction raises, the error is printed and a
            dict ``{"error": <message>}`` is returned instead.

        Note:
            Failures while launching the browser or creating the page happen
            before the ``try`` block and therefore propagate to the caller.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                # Playwright timeouts are in milliseconds; wait only for the
                # DOM to be parsed, not for every sub-resource.
                await page.goto(
                    self.url, timeout=50000, wait_until="domcontentloaded"
                )

                # Normalise a missing/empty title to an empty string.
                title = await page.title() or ""

                # Extract headings and main content in a single JS call,
                # handing the threshold over as the evaluate() argument.
                structural_text = await page.evaluate(
                    self._STRUCTURAL_TEXT_JS, self.threshold
                )

                return {
                    "page_url": self.url,
                    "title": title,
                    "structural_content": structural_text,
                }

            except Exception as e:
                # Best-effort API: report the failure in the result instead of
                # raising, so callers can inspect the "error" key.
                print(f"Error extracting page title: {e}")
                return {"error": str(e)}

            finally:
                await browser.close()