import asyncio from playwright.async_api import async_playwright from datetime import datetime, timezone from urllib.parse import urljoin, urlparse from typing import List, Dict, Optional import json import argparse from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder import requests import os import urllib.parse from pathlib import Path class LanguageExtractor: def __init__( self, url: str, ): self.url = url async def extract_languages(self, extract_context=True) -> Dict: async with async_playwright() as p: browser = await p.chromium.launch(headless=True) page = await browser.new_page() try: #await page.goto(self.url, timeout=50000, wait_until="load") #await page.wait_for_timeout(2000) await page.goto(self.url, timeout=50000, wait_until="domcontentloaded")# faster in this case, we just need the DOM to be loaded, not necessarily all the resources lang_only_elements = [] lang_and_xml_lang_elements = [] # Extract the lang attribute of the tag html_tag = page.locator('html') html_tag_lang = await html_tag.get_attribute('lang') html_tag_xml_lang = await html_tag.get_attribute('xml:lang') if html_tag_lang and html_tag_xml_lang: lang_and_xml_lang_elements.append( f'' ) elif html_tag_lang: lang_only_elements.append(f'') # Find all elements with the lang attribute (excluding ) elements_with_lang = await page.locator('//*[@lang and not(self::html)]').all() for element in elements_with_lang: outer_html = await element.evaluate('el => el.outerHTML') xml_lang = await element.get_attribute('xml:lang') if xml_lang: lang_and_xml_lang_elements.append(outer_html) else: lang_only_elements.append(outer_html) return { "lang_only": "; ".join(lang_only_elements), "lang_and_xml": "; ".join(lang_and_xml_lang_elements) } except Exception as e: print(f"Error extracting languages: {e}") return {"error": str(e)} finally: await browser.close() """ ## quella da nodejs from playwright.async_api import Page async def h58(page: Page): results = [] try: print("Identifying the main language of the page...") # Identify the main language of the page main_lang = "The main language of the page is: not specified" try: # Playwright uses locator() or query_selector() html_element = page.locator('html') lang_attribute = await html_element.get_attribute('lang') if lang_attribute: main_lang = f"The main language of the page is: {lang_attribute}" except Exception as e: print(f"Error identifying main language: {e}") print("Find all elements containing text") # Find all elements containing text that don't have children (leaf nodes) try: # Playwright handles XPaths directly through the locator API elements = await page.locator('//*[text() and not(*)]').all() except Exception as e: print(f"Error finding text elements: {e}") return results print("Create a string to collect the outer html of all the elements containing text...") all_outer_html = "" for element in elements: try: # Get the tag name tag_name = await element.evaluate("el => el.tagName.toLowerCase()") # Skip ,