117 lines
4.8 KiB
Python
117 lines
4.8 KiB
Python
import asyncio
|
|
from playwright.async_api import async_playwright
|
|
from datetime import datetime, timezone
|
|
from urllib.parse import urljoin, urlparse
|
|
from typing import List, Dict, Optional
|
|
import json
|
|
import argparse
|
|
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
|
|
import requests
|
|
import os
|
|
import urllib.parse
|
|
from pathlib import Path
|
|
|
|
|
|
class LanguageExtractor:
    """Extract page text segments together with their language context (WCAG H58 audit).

    Loads a URL with Playwright, records the root ``<html>`` lang attribute, then
    collects the text of leaf elements along with any element-level ``lang``
    attribute so a downstream checker can spot passages in a different language
    that lack proper ``lang`` markup.
    """

    # Tags whose text content is never user-visible prose.
    _SKIP_TAGS = frozenset({"script", "style", "noscript", "html"})

    def __init__(
        self,
        url: str,
        short_segments_length_threshold: int = 30,
        max_total_length: int = 15000,
        goto_timeout_ms: int = 50000,
    ):
        """
        Args:
            url: Page to audit.
            short_segments_length_threshold: Minimum text length (characters)
                for a segment to be kept; shorter snippets are unlikely to be
                meaningful for language detection.
            max_total_length: Cap on the combined character count of extracted
                text (text content only; HTML tags are not counted).
            goto_timeout_ms: Navigation timeout (milliseconds) passed to
                ``page.goto``; defaults to the previous hard-coded 50000.
        """
        self.url = url
        self.short_segments_length_threshold = short_segments_length_threshold
        self.max_total_length = max_total_length
        self.goto_timeout_ms = goto_timeout_ms

    async def _extract_segment(self, element) -> Optional[Dict]:
        """Return a segment dict for *element*, or None if it should be skipped.

        Skips non-content tags and text shorter than the configured threshold.
        May raise if the element has become detached; the caller handles that.
        """
        tag_name = await element.evaluate("el => el.tagName.toLowerCase()")
        if tag_name in self._SKIP_TAGS:
            return None

        # Element-level language override, if the author supplied one.
        local_lang = await element.get_attribute("lang")

        clean_text = (await element.inner_text()).strip()
        # Skip very short text which is unlikely to be meaningful for
        # language detection.
        if not clean_text or len(clean_text) < self.short_segments_length_threshold:
            return None

        # Package the data: text + its specific language metadata.
        return {
            "tag": tag_name,
            "lang": local_lang if local_lang else "inherited",
            "html": clean_text,
        }

    async def extract_content_with_lang_context(self) -> Dict:
        """
        The verification is:
        Read through all the text content on the page and identify any passages
        that are in a different language than the page default.
        Then check whether those passages have a lang attribute marking them
        correctly as being in a different language.
        If a language change exists in the text but no lang attribute is
        present -> that's a failure of H58.

        Returns:
            A dict with ``page_url``, ``main_page_lang``, ``extracted_segments``
            and ``total_char_count`` on success, or ``{"error": ...}`` if the
            page could not be loaded/processed.
        """
        async with async_playwright() as p:
            # Launch and manage the browser lifecycle; closed in ``finally``.
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            results = {
                "page_url": self.url,
                "main_page_lang": "not specified",
                "extracted_segments": [],
                "total_char_count": 0,
            }

            try:
                # Optimized wait: stop once the DOM is ready; a full load
                # (images, subresources) is not needed for text extraction.
                await page.goto(
                    self.url,
                    timeout=self.goto_timeout_ms,
                    wait_until="domcontentloaded",
                )

                # 1. Root language (global context) from the <html> tag.
                root_lang = await page.locator("html").get_attribute("lang") or "unknown"
                results["main_page_lang"] = root_lang

                # 2. Leaf nodes containing text (the H58 logic): elements with
                # text but no child elements give the 'cleanest' snippets.
                elements = await page.locator("//*[text() and not(*)]").all()

                # Running total of extracted text, not counting HTML tags.
                current_length = 0

                for element in elements:
                    if current_length >= self.max_total_length:
                        results["extracted_segments"].append(
                            "...[Truncated: Limit Reached]"
                        )
                        break

                    try:
                        segment = await self._extract_segment(element)
                    except Exception:
                        # Best-effort: skip elements that fail (e.g. detached
                        # nodes) so one bad element does not abort the scan.
                        continue

                    if segment is None:
                        continue

                    results["extracted_segments"].append(segment)
                    current_length += len(segment["html"])

                results["total_char_count"] = current_length
                return results

            except Exception as e:
                # Page-level failure (navigation timeout, DNS, etc.).
                return {"error": str(e)}
            finally:
                await browser.close()