74 lines
2.7 KiB
Python
74 lines
2.7 KiB
Python
import asyncio
|
|
from playwright.async_api import async_playwright
|
|
from datetime import datetime, timezone
|
|
from urllib.parse import urljoin, urlparse
|
|
from typing import List, Dict, Optional
|
|
import json
|
|
import argparse
|
|
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
|
|
import requests
|
|
import os
|
|
import urllib.parse
|
|
from pathlib import Path
|
|
|
|
|
|
class PageTitleExtractor:
    """Fetch a page's title and a short digest of its <h1>/<main> text.

    The page is opened in headless Chromium through Playwright. Each matched
    element's text is cut to ``threshold`` characters so the combined output
    stays concise.
    """

    # Navigation timeout handed to ``page.goto``, in milliseconds.
    _GOTO_TIMEOUT_MS = 50000

    # JavaScript evaluated inside the page. The truncation threshold arrives
    # as a real function argument (via ``page.evaluate(expression, arg)``)
    # instead of being spliced into the source text, so the script needs no
    # brace escaping and cannot be altered by an unexpected threshold value.
    # Only <h1> and <main> are collected, to keep the output concise for the
    # LLM in the G88 evaluation; widen the selector (e.g. 'h1, h2, h3, main')
    # if more tags are needed.
    _STRUCTURE_JS = r"""(threshold) => {
        const elements = document.querySelectorAll('h1, main');

        return Array.from(elements)
            .map(el => {
                const tag = el.tagName.toLowerCase();
                let text = el.innerText.replace(/\n/g, ' ').trim();

                if (text.length > threshold) {
                    text = text.substring(0, threshold) + '...';
                }

                return text ? `<${tag}>${text}</${tag}>` : null;
            })
            .filter(Boolean)
            .join(' ');
    }"""

    def __init__(self, url: str, threshold: int = 200):
        """Store the target URL and the per-element truncation length.

        Args:
            url: Address of the page to load.
            threshold: Maximum number of characters kept from each matched
                element before ``'...'`` is appended.
        """
        self.url = url
        self.threshold = threshold

    async def extract_page_title(self) -> Dict:
        """Load ``self.url`` and return its title and structural text.

        Returns:
            On success, a dict with the keys ``"page_url"``, ``"title"`` and
            ``"structural_content"``. An empty page title is reported as the
            placeholder ``"<title></title>"``. ``"structural_content"`` is a
            space-joined sequence of ``<tag>text</tag>`` fragments, one per
            non-empty <h1>/<main> element.

            If navigation or extraction raises, the error is printed and a
            dict with the single key ``"error"`` is returned instead.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                # Waiting only for DOMContentLoaded: nothing below depends on
                # later load events.
                await page.goto(
                    self.url,
                    timeout=self._GOTO_TIMEOUT_MS,
                    wait_until="domcontentloaded",
                )

                # Extract the page title, substituting an explicit empty
                # <title> element when the page does not provide one.
                title = await page.title()
                if not title:
                    title = "<title></title>"

                # Extract headings and main content in a single JS call.
                structural_text = await page.evaluate(
                    self._STRUCTURE_JS, self.threshold
                )

                return {
                    "page_url": self.url,
                    "title": title,
                    "structural_content": structural_text,
                }

            except Exception as e:
                # Best-effort contract: report the failure to the caller as
                # data rather than raising.
                print(f"Error extracting page title: {e}")
                return {"error": str(e)}

            finally:
                await browser.close()
|