import argparse
import asyncio
import html
import json
import os
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from playwright.async_api import async_playwright

from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder


class LanguageExtractor:
    """Collect the elements of a web page that carry a ``lang`` attribute.

    The page is opened in headless Chromium through Playwright and the
    matching elements are reported as HTML snippets, split by whether they
    also carry an ``xml:lang`` attribute.
    """

    def __init__(
        self,
        url: str,
    ):
        """Remember the page to inspect.

        Args:
            url: Address handed unchanged to ``page.goto``.
        """
        self.url = url

    @staticmethod
    def _html_open_tag(lang: str, xml_lang: Optional[str] = None) -> str:
        """Rebuild the opening ``<html>`` tag from its language attributes.

        Only the attributes read by the caller are rendered; values are
        escaped so the snippet stays well-formed.
        """
        attributes = [f'lang="{html.escape(lang, quote=True)}"']
        if xml_lang:
            attributes.append(f'xml:lang="{html.escape(xml_lang, quote=True)}"')
        return f"<html {' '.join(attributes)}>"

    async def extract_languages(self, extract_context=True) -> Dict:
        """Return the language-tagged elements found at ``self.url``.

        Args:
            extract_context: Accepted for interface compatibility; this
                method does not read it.

        Returns:
            On success, a dict with two string values:

            * ``"lang_only"``: snippets of elements that have ``lang`` but
              no ``xml:lang``, joined with ``"; \\n"``.
            * ``"lang_and_xml"``: snippets of elements that have both
              attributes, joined with ``"; "``.

            The root element, when it has ``lang``, comes first as a rebuilt
            opening ``<html ...>`` tag; every other element is reported by
            its ``outerHTML``.

            If navigation or extraction raises, the error is printed and
            ``{"error": "<message>"}`` is returned instead.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                # "domcontentloaded" is enough here: only the DOM is read,
                # so there is no need to wait for every sub-resource.
                await page.goto(self.url, timeout=50000, wait_until="domcontentloaded")

                lang_only_elements = []
                lang_and_xml_lang_elements = []

                # Language attributes of the root <html> tag.
                html_tag = page.locator('html')
                html_tag_lang = await html_tag.get_attribute('lang')
                html_tag_xml_lang = await html_tag.get_attribute('xml:lang')

                # The root element is reported by its opening tag only,
                # not by its outerHTML like the other elements.
                if html_tag_lang and html_tag_xml_lang:
                    lang_and_xml_lang_elements.append(
                        self._html_open_tag(html_tag_lang, html_tag_xml_lang)
                    )
                elif html_tag_lang:
                    lang_only_elements.append(self._html_open_tag(html_tag_lang))

                # Every other element with a lang attribute (excluding
                # <html>), read in a single browser round trip.
                tagged_elements = await page.locator(
                    '//*[@lang and not(self::html)]'
                ).evaluate_all(
                    "els => els.map(el => [el.outerHTML, el.getAttribute('xml:lang')])"
                )

                for outer_html, xml_lang in tagged_elements:
                    if xml_lang:
                        lang_and_xml_lang_elements.append(outer_html)
                    else:
                        lang_only_elements.append(outer_html)

                return {
                    # This separator also carries a newline, unlike the
                    # one used for "lang_and_xml".
                    "lang_only": "; \n".join(lang_only_elements),
                    "lang_and_xml": "; ".join(lang_and_xml_lang_elements)
                }

            except Exception as e:
                # Best effort: report the failure to the caller as data.
                print(f"Error extracting languages: {e}")
                return {"error": str(e)}

            finally:
                await browser.close()


# ---------------------------------------------------------------------------
# Retired reference code (the version ported from Node.js), kept for context.
# NOTE(review): this block was already cut off at the final "# Skip ," line
# in the reviewed source; the remainder is not available here.
# ---------------------------------------------------------------------------
# from playwright.async_api import Page
#
# async def h58(page: Page):
#     results = []
#
#     try:
#         print("Identifying the main language of the page...")
#         # Identify the main language of the page
#         main_lang = "The main language of the page is: not specified"
#         try:
#             # Playwright uses locator() or query_selector()
#             html_element = page.locator('html')
#             lang_attribute = await html_element.get_attribute('lang')
#             if lang_attribute:
#                 main_lang = f"The main language of the page is: {lang_attribute}"
#         except Exception as e:
#             print(f"Error identifying main language: {e}")
#
#         print("Find all elements containing text")
#         # Find all elements containing text that don't have children (leaf nodes)
#         try:
#             # Playwright handles XPaths directly through the locator API
#             elements = await page.locator('//*[text() and not(*)]').all()
#         except Exception as e:
#             print(f"Error finding text elements: {e}")
#             return results
#
#         print("Create a string to collect the outer html of all the elements containing text...")
#         all_outer_html = ""
#
#         for element in elements:
#             try:
#                 # Get the tag name
#                 tag_name = await element.evaluate("el => el.tagName.toLowerCase()")
#
#                 # Skip ,