From 4f189ec32e2b195266813164eaa59ba875c2dc43 Mon Sep 17 00:00:00 2001
From: nicola leonardi
Date: Sat, 15 Nov 2025 19:11:09 +0100
Subject: [PATCH] image extraction and MLLM management

---
 README.md          |  11 +
 image_extractor.py | 518 +++++++++++++++++++++++++++++++++++++++++++++
 mllm_management.py |  70 ++++++
 requirements.txt   |   4 +
 utils.py           | 124 +++++++++++
 wcag_validator.py  | 151 +++++++++++++
 6 files changed, 878 insertions(+)
 create mode 100644 README.md
 create mode 100644 image_extractor.py
 create mode 100644 mllm_management.py
 create mode 100644 requirements.txt
 create mode 100644 utils.py
 create mode 100644 wcag_validator.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c5a2c65
--- /dev/null
+++ b/README.md
@@ -0,0 +1,11 @@
+# WCAG AI validator
+
+- Install the required dependencies (from the project root)
+
+```
+pip install -r requirements.txt
+```
+
+- Start the application
+
+python LLM_accessibility_validator/wcag_validator.py
\ No newline at end of file
diff --git a/image_extractor.py b/image_extractor.py
new file mode 100644
index 0000000..b71635f
--- /dev/null
+++ b/image_extractor.py
@@ -0,0 +1,518 @@
+import asyncio
+from playwright.async_api import async_playwright
+from datetime import datetime, timezone
+from urllib.parse import urljoin, urlparse
+from typing import List, Dict, Optional
+import json
+import argparse
+from utils import disclaim_bool_string, prepare_output_folder, create_folder
+import requests
+import os
+
+
+class ImageExtractor:
+    SUPPORTED_FORMATS = {"png", "jpeg", "jpg", "webp", "gif"}
+
+    def __init__(
+        self,
+        url: str,
+        context_levels: int = 5,
+        pixel_distance_threshold: int = 200,
+        number_of_images: int = 10,
+        save_images=True,
+        save_images_path="",
+    ):
+        """
+        Initialize the ImageExtractor.
+
+        Args:
+            url: The page URL to extract images from
+            context_levels: Number of parent/child levels to traverse for context (default=5)
+            pixel_distance_threshold: Maximum pixel distance for nearby text elements (default=200)
+            number_of_images: Maximum number of images to process (default=10)
+            save_images: Whether to download the extracted images to disk
+            save_images_path: Directory where downloaded images are saved
+        """
+        self.url = url
+        self.context_levels = context_levels
+        self.pixel_distance_threshold = pixel_distance_threshold
+        self.number_of_images = number_of_images
+        self.save_images = save_images
+        self.save_images_path = save_images_path
+
+    def _is_supported_format(self, img_url: str) -> bool:
+        """Check if the image URL has a supported format."""
+        parsed = urlparse(img_url.lower())
+        path = parsed.path
+
+        # Check file extension
+        for fmt in self.SUPPORTED_FORMATS:
+            if path.endswith(f".{fmt}"):
+                return True
+
+        # Also check query parameters (e.g., format=jpeg); note that this
+        # substring match is deliberately loose and can yield false positives
+        return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS)
+
+    async def _download_image(self, image_url, output_dir="images") -> None:
+        try:
+            # Derive the file name and extension from the URL path (query string excluded)
+            path = urlparse(image_url).path
+            ext = path.split(".")[-1]
+            image_name = path.split("/")[-1][0 : -len(ext) - 1]
+
+            if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
+                ext = "jpg"
+
+            # Download the image (requests is synchronous; acceptable for a small batch)
+            print("getting image:", image_url)
+            response = requests.get(image_url, timeout=10)
+            response.raise_for_status()
+
+            # Save the image
+            output_path = os.path.join(output_dir, f"{image_name}.{ext}")
+            with open(output_path, "wb") as f:
+                f.write(response.content)
+
+            print(f"Saved: {output_path}")
+
+        except Exception as e:
+            print(f"Error downloading {image_url}: {e}")
+
+    async def save_elaboration(self, images, output_dir) -> None:
+        with open(output_dir, "w", encoding="utf-8") as f:
+            json.dump(images, f, indent=2, ensure_ascii=False)
+        print(f"\nResults saved to {output_dir}")
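+
+    # Illustrative behaviour of the loose format check above (hypothetical URLs):
+    #   "https://example.com/logo.png"        -> supported (.png extension on the path)
+    #   "https://example.com/img?format=jpeg" -> supported (substring fallback)
+    #   "https://example.com/gifts/shop"      -> also matches, since "gif" appears as a substring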
+
+    async def _get_element_context(self, page, img_element) -> tuple[str, str, str]:
+        """
+        Extract textual context around an image element from text-containing tags.
+
+        Returns:
+            Tuple of (full_context, immediate_context, nearby_text) where:
+            - full_context: Text extracted with self.context_levels
+            - immediate_context: Text extracted with context_level=1
+            - nearby_text: Text within pixel_distance_threshold pixels of the image
+        """
+        try:
+            # JavaScript helper to check whether an element is visible.
+            # An element is excluded when:
+            #   - its visibility CSS property is 'hidden' or 'collapse'
+            #   - its display CSS property is 'none'
+            #   - its opacity CSS property is '0'
+            #   - its bounding box has zero width or height (collapsed element)
+            visibility_check = """
+                function isVisible(el) {
+                    if (!el) return false;
+
+                    const style = window.getComputedStyle(el);
+
+                    // Check visibility and display properties
+                    if (style.visibility === 'hidden' || style.visibility === 'collapse') return false;
+                    if (style.display === 'none') return false;
+                    if (style.opacity === '0') return false;
+
+                    // Check if element has dimensions
+                    const rect = el.getBoundingClientRect();
+                    if (rect.width === 0 || rect.height === 0) return false;
+
+                    return true;
+                }
+            """
+
+            # JavaScript function to extract text at a specific context level
+            def get_context_js(levels):
+                return f"""
+                (element) => {{
+                    {visibility_check}
+
+                    // Text-containing tags to extract
+                    /*const textTags = ['p', 'span', 'div', 'a', 'li', 'td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                                       'label', 'figcaption', 'caption', 'blockquote', 'pre', 'code', 'em', 'strong',
+                                       'b', 'i', 'u', 'small', 'mark', 'sub', 'sup', 'time', 'article', 'section'];*/
+
+                    const textTags = ['p', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a'];
+
+                    let textContent = [];
+
+                    // Traverse up the DOM tree
+                    let current = element;
+                    for (let i = 0; i < {levels} && current.parentElement; i++) {{
+                        current = current.parentElement;
+                    }}
+
+                    // Function to extract text from an element and its children
+                    function extractText(el, depth = 0) {{
+                        if (depth > {levels}) return;
+
+                        // Skip if element is not visible
+                        if (!isVisible(el)) return;
+
+                        // Get direct text content of text-containing elements
+                        if (textTags.includes(el.tagName.toLowerCase())) {{
+                            const text = el.textContent.trim();
+
+                            if (text && text.length > 0) {{
+                                textContent.push({{
+                                    tag: el.tagName.toLowerCase(),
+                                    text: text
+                                }});
+                            }}
+                        }}
+
+                        // Recursively process children
+                        for (let child of el.children) {{
+                            extractText(child, depth + 1);
+                        }}
+                    }}
+
+                    // Extract text from the context root
+                    extractText(current);
+
+                    // Format as readable, space-joined text (a '\\n\\n' join is a readable alternative)
+                    return textContent.map(item => `<${{item.tag}}>: ${{item.text}}`).join(' ');
+                }}
+                """
+
+            # JavaScript function to extract nearby text based on pixel distance
+            nearby_text_js = f"""
+            (element) => {{
+                {visibility_check}
+
+                /*const textTags = ['p', 'span', 'div', 'a', 'li', 'td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                                   'label', 'figcaption', 'caption', 'blockquote', 'pre', 'code', 'em', 'strong',
+                                   'b', 'i', 'u', 'small', 'mark', 'sub', 'sup', 'time'];*/
+                const textTags = ['p', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a'];
+
+                const threshold = {self.pixel_distance_threshold};
+                const imgRect = element.getBoundingClientRect();
+                const imgCenterX = imgRect.left + imgRect.width / 2;
+                const imgCenterY = imgRect.top + imgRect.height / 2;
+
+                // Calculate the distance between two rectangles
+                function getDistance(rect1, rect2) {{
+                    // Get centers
+                    const x1 = rect1.left + rect1.width / 2;
+                    const y1 = rect1.top + rect1.height / 2;
+                    const x2 = rect2.left + rect2.width / 2;
+                    const y2 = rect2.top + rect2.height / 2;
+
+                    // Euclidean distance between the centers; this could be refined to
+                    // use the nearest points of the two rectangles instead
+                    return Math.sqrt(Math.pow(x2 - x1, 2) + Math.pow(y2 - y1, 2));
+                }}
+
+                let nearbyElements = [];
+
+                // Find all text elements on the page
+                const allElements = document.querySelectorAll(textTags.join(','));
+
+                allElements.forEach(el => {{
+                    // Skip if element is not visible
+                    if (!isVisible(el)) return;
+
+                    const text = el.textContent.trim();
+                    if (!text || text.length === 0) return;
+
+                    // Skip if it's the image itself or contains the image
+                    if (el === element || el.contains(element)) return;
+
+                    const elRect = el.getBoundingClientRect();
+                    const distance = getDistance(imgRect, elRect);
+
+                    if (distance <= threshold) {{
+                        nearbyElements.push({{
+                            tag: el.tagName.toLowerCase(),
+                            text: text,
+                            distance: Math.round(distance)
+                        }});
+                    }}
+                }});
+
+                // Sort by distance
+                nearbyElements.sort((a, b) => a.distance - b.distance);
+
+                // Format output, space-joined (a '\\n\\n' join is a readable alternative)
+                return nearbyElements.map(item =>
+                    `<${{item.tag}}> [${{item.distance}}px]: ${{item.text}}`
+                ).join(' ');
+            }}
+            """
+
+            # Get full context with self.context_levels
+            full_context_js = get_context_js(self.context_levels)
+            full_context = await img_element.evaluate(full_context_js)
+            full_context = full_context if full_context else "No textual context found"
+
+            # Get immediate context with level=1
+            immediate_context_js = get_context_js(1)
+            immediate_context = await img_element.evaluate(immediate_context_js)
+            immediate_context = (
+                immediate_context if immediate_context else "No immediate context found"
+            )
+
+            # Get nearby text based on pixel distance
+            nearby_text = await img_element.evaluate(nearby_text_js)
+            nearby_text = nearby_text if nearby_text else "No nearby text found"
+
+            return full_context, immediate_context, nearby_text
+
+        except Exception as e:
+            error_msg = f"Error extracting context: {str(e)}"
+            return error_msg, error_msg, error_msg
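+
+    # For intuition: the nearby-text heuristic compares rectangle centers. With the
+    # default 200px threshold, an image at (left=0, top=0, w=100, h=100) and a caption
+    # at (0, 120, 100, 20) have centers (50, 50) and (50, 130), i.e. 80px apart, so the
+    # caption counts as nearby text. (Illustrative numbers, not taken from a real page.)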
+
+    async def _get_page_metadata(self, page) -> Dict[str, Optional[str]]:
+        """Extract page metadata including title, description, and keywords."""
+        metadata = {
+            "title": await page.title(),
+            "description": None,
+            "keywords": None,
+            "headings": [],
+        }
+
+        # Extract meta description
+        try:
+            description = await page.locator('meta[name="description"]').get_attribute(
+                "content"
+            )
+            metadata["description"] = description
+        except Exception:
+            pass
+
+        # Extract meta keywords
+        try:
+            keywords = await page.locator('meta[name="keywords"]').get_attribute(
+                "content"
+            )
+            metadata["keywords"] = keywords
+        except Exception:
+            pass
+
+        # Extract all headings (h1-h6)
+        for level in range(1, 7):
+            headings = await page.locator(f"h{level}").all_text_contents()
+            for heading in headings:
+                if heading.strip():
+                    metadata["headings"].append(
+                        {"level": level, "text": heading.strip()}
+                    )
+
+        return metadata
+
+    async def extract_images(self) -> List[Dict]:
+        """
+        Extract all images from the page with their metadata and context.
+
+        Returns:
+            List of dictionaries containing image information
+        """
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            page = await browser.new_page()
+
+            try:
+                # await page.goto(self.url, wait_until='networkidle')  # original
+
+                # --- alternative
+                await page.goto(self.url, wait_until="load")
+                # Give dynamically loaded content time to settle
+                await page.wait_for_timeout(2000)
+                # ---
+
+                # Get page metadata once
+                page_metadata = await self._get_page_metadata(page)
+
+                # Find all img elements
+                img_elements = await page.locator("img").all()
+
+                image_source_list = []  # avoid processing the same image URL twice
+                images_data = []
+
+                # Limit the list to the number_of_images set at init time
+                for img in img_elements[0 : self.number_of_images]:
+                    try:
+                        # Get image src
+                        src = await img.get_attribute("src")
+                        if not src:
+                            continue
+
+                        if src not in image_source_list:
+                            image_source_list.append(src)
+                        else:
+                            print("image src", src, "already processed. Skipped.")
+                            continue
+
+                        # Convert relative URLs to absolute
+                        img_url = urljoin(self.url, src)
+
+                        # Verify format
+                        if not self._is_supported_format(img_url):
+                            continue
+
+                        if disclaim_bool_string(self.save_images):
+                            print("save image:", img_url.split("/")[-1])
+                            await self._download_image(
+                                image_url=img_url, output_dir=self.save_images_path
+                            )
+
+                        # Get alt text
+                        alt_text = await img.get_attribute("alt") or ""
+
+                        # Get surrounding HTML context (full, immediate, and nearby)
+                        html_context, immediate_context, nearby_text = (
+                            await self._get_element_context(page, img)
+                        )
+
+                        # Compile image data
+                        image_info = {
+                            "url": img_url,
+                            "alt_text": alt_text,
+                            "html_context": html_context,
+                            "immediate_context": immediate_context,
+                            "nearby_text": nearby_text,
+                            "page_url": self.url,
+                            "page_title": page_metadata["title"],
+                            "page_description": page_metadata["description"],
+                            "page_keywords": page_metadata["keywords"],
+                            "page_headings": page_metadata["headings"],
+                        }
+
+                        images_data.append(image_info)
+
+                    except Exception as e:
+                        print(f"Error processing image: {str(e)}")
+                        continue
+
+                return images_data
+
+            finally:
+                await browser.close()
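+
+# NOTE: besides the CLI below, ImageExtractor can be driven directly from Python
+# (illustrative sketch):
+#   extractor = ImageExtractor("https://www.bbc.com", number_of_images=3, save_images=False)
+#   images = asyncio.run(extractor.extract_images())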
+
+
+async def main(args):
+
+    url = args.page_url
+    context_levels = args.context_levels
+    pixel_distance_threshold = args.pixel_distance_threshold
+    number_of_images = args.number_of_images
+    save_images = args.save_images
+
+    print(
+        "call ImageExtractor with -",
+        "page_url:",
+        url,
+        "context_levels:",
+        context_levels,
+        "pixel_distance_threshold:",
+        pixel_distance_threshold,
+        "number_of_images:",
+        number_of_images,
+        "save_images:",
+        save_images,
+    )
+
+    images_output_dir = ""  # default when images are not saved
+    if (
+        disclaim_bool_string(args.save_elaboration)
+        or disclaim_bool_string(args.save_images)
+    ):  # if there is something to save
+        url_path = url.replace(":", "").replace("//", "_").replace("/", "_")
+        now = datetime.now(timezone.utc)
+        now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
+        output_dir = prepare_output_folder(url_path, now_str)
+
+        if disclaim_bool_string(args.save_images):
+            images_output_dir = create_folder(
+                output_dir, directory_separator="/", next_path="images"
+            )
+            print("save images path:", images_output_dir)
+
+    # Create extractor
+    extractor = ImageExtractor(
+        url,
+        context_levels=context_levels,
+        pixel_distance_threshold=pixel_distance_threshold,
+        number_of_images=number_of_images,
+        save_images=save_images,
+        save_images_path=images_output_dir,
+    )
+
+    # Extract images
+    print(f"Extracting images from: {url}")
+    images = await extractor.extract_images()
+
+    print(f"\nFound {len(images)} supported images\n")
+
+    # Display results
+    for i, img in enumerate(images, 1):
+        print(f"Image {i}:")
+        print(f"  URL: {img['url']}")
+        print(f"  Alt text: {img['alt_text']}")
+        print(f"  Page title: {img['page_title']}")
+        print(f"  Full context length: {len(img['html_context'])} characters")
+        print(f"  Immediate context length: {len(img['immediate_context'])} characters")
+        print(f"  Nearby text length: {len(img['nearby_text'])} characters")
+        print(f"  Number of headings on page: {len(img['page_headings'])}")
+        print("-" * 80)
+
+    if disclaim_bool_string(args.save_elaboration):  # Optionally save to JSON
+        await extractor.save_elaboration(
+            images, output_dir=output_dir + "/extracted_images.json"
+        )
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--page_url",
+        type=str,
+        help=("URL of the page to analyze"),
+        default="https://www.bbc.com",
+    )
+
+    parser.add_argument(
+        "--context_levels",
+        type=int,
+        default=5,
+        help=("HTML context levels around the image"),
+    )
+    parser.add_argument(
+        "--pixel_distance_threshold",
+        type=int,
+        default=200,
+        help=("pixel distance threshold around the image"),
+    )
+    parser.add_argument(
+        "--number_of_images",
+        type=int,
+        default=10,
+        help=("max number of images to process"),
+    )
+
+    # string flags ("True"/"False") interpreted by disclaim_bool_string
+    parser.add_argument(
+        "--save_elaboration",
+        type=str,
+        default="True",
+        help=("If True save the elaborated info in a JSON file"),
+    )
+
+    parser.add_argument(
+        "--save_images",
+        type=str,
+        default="True",
+        help=("If True save the images"),
+    )
+
+    args = parser.parse_args()
+    asyncio.run(main(args))
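For reference, each record written to extracted_images.json follows the `image_info` dictionary built in `extract_images`. A sketch of one entry, shown as a Python literal with invented values:

```python
# one illustrative entry from extracted_images.json (all values are made up)
{
    "url": "https://www.bbc.com/img/example.jpg",
    "alt_text": "A presenter in the studio",
    "html_context": "<h2>: Headline ... <p>: Story summary ...",
    "immediate_context": "<span>: Photo credit",
    "nearby_text": "<p> [42px]: Story summary ...",
    "page_url": "https://www.bbc.com",
    "page_title": "BBC Home",
    "page_description": "News, sport and weather ...",
    "page_keywords": None,
    "page_headings": [{"level": 1, "text": "Welcome to the BBC"}],
}
```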
diff --git a/mllm_management.py b/mllm_management.py
new file mode 100644
index 0000000..18a3bf5
--- /dev/null
+++ b/mllm_management.py
@@ -0,0 +1,70 @@
+from utils import call_API_urlibrequest
+
+
+class MLLMManager:
+    def __init__(self, end_point, api_key):
+        self.end_point = end_point
+        self.api_key = api_key
+
+    def get_response(self, system_prompt, user_prompt):
+        payload = self.create_mllm_payload(system_prompt, user_prompt)
+        headers = [
+            ["Content-Type", "application/json"],
+            ["Authorization", f"Bearer {self.api_key}"]
+        ]
+        response = call_API_urlibrequest(url=self.end_point, headers=headers, data=payload)
+        return response
+
+    def create_mllm_payload(self, system_prompt, user_prompt):
+        payload = {
+            "messages": [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            "temperature": 0.7,
+            "top_p": 0.95,
+            "frequency_penalty": 0,
+            "presence_penalty": 0,
+            "max_tokens": 800,
+            "stop": None,
+        }
+        return payload
+
+    def get_alt_text_system_prompt(self):
+        system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate whether the alternative text for
+        images on webpages is appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
+        the same information as the image, and should be able to substitute for the non-text content. The text alternative should
+        be brief but as informative as possible.
+
+        Follow these instructions carefully:
+        1. You will be provided as input with the following:
+        - The image found on the webpage.
+        - The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
+        - The surrounding context of the image.
+        - The page title, the headings, and the content of the "keywords" and "description" tags, if found.
+
+        2. Determine the function and purpose of the image by analyzing these elements, taking the page context into account.
+        Check also whether the image is, or is associated with, a link or a button, and consider this in your judgement.
+        If the image contains text, use that as part of the context.
+
+        3. Provide a final assessment based on the following:
+        - 'success' if you can assess with 'sufficient certainty' that the alt-text is appropriate in relation to the image purpose,
+        - 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
+        - 'warning' if you cannot determine with 'sufficient certainty',
+        where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80.
+
+        4. Rate the original alt-text on a scale from 1 to 5, where 5 is the best score. Use an integer number only.
+
+        5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim. Your response should be in English.
+
+        6. Keep your response within 150 words.
+
+        7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
+
+        8. Here is the JSON format the results must have:
+        {"Original alt-text assessment": "*your original alt-text assessment*", "Assessment": "*your assessment*", "EvaluationResult": "*your response*", "New alt-text": "*new alt-text*"}"""
+        return system_prompt
+
+    def get_alt_text_user_prompt(self, altTextMessage, imageURL, HTMLcontext, pageText):
+        user_prompt = [
+            {"type": "text", "text": altTextMessage},
+            {"type": "image_url", "image_url": {"url": imageURL}},
+            {"type": "text", "text": HTMLcontext},
+            {"type": "text", "text": pageText},
+        ]
+        return user_prompt
\ No newline at end of file
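Because step 8 pins the model's reply to a fixed JSON schema, the response content can be parsed directly. A minimal sketch, with a fabricated reply used purely for illustration:

```python
import json

# fabricated reply following the schema requested in step 8 of the system prompt
raw = (
    '{"Original alt-text assessment": "2", "Assessment": "failure", '
    '"EvaluationResult": "The alt-text does not describe the chart.", '
    '"New alt-text": "Bar chart of 2024 quarterly revenue."}'
)

verdict = json.loads(raw)
if verdict["Assessment"] == "failure":
    print("Suggested replacement:", verdict["New alt-text"])
```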
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..36dc331
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+pandas==2.3.3
+playwright==1.56.0
+python-dotenv==1.2.1
+requests==2.32.5
\ No newline at end of file
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..e95f828
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,124 @@
+import json
+import time
+import urllib.request
+import urllib.parse
+import logging
+import os
+
+exception_msg = "Exception: %s"
+
+
+def call_API_urlibrequest(
+    data={},
+    verbose=False,
+    url="",
+    headers=[],
+    method="post",
+    base=2,  # backoff base in seconds
+    max_tries=3,
+):
+
+    if verbose:
+        logging.info("input_data:%s", data)
+
+    # Allow multiple attempts to call the API in case of downtime.
+    # After max_tries failed attempts, return the last captured response.
+    wait_seconds = [base**i for i in range(max_tries)]
+
+    for num_tries in range(max_tries):
+        try:
+            if method == "get":
+                # Encode the parameters and append them to the URL
+                query_string = urllib.parse.urlencode(data)
+                url_with_params = f"{url}?{query_string}"
+                request = urllib.request.Request(url_with_params, method="GET")
+                for ele in headers:
+                    request.add_header(ele[0], ele[1])
+
+            elif method == "post":
+                # Convert the dictionary to a JSON formatted string and encode it to bytes
+                data_to_send = json.dumps(data).encode("utf-8")
+                request = urllib.request.Request(url, data=data_to_send, method="POST")
+                for ele in headers:
+                    request.add_header(ele[0], ele[1])
+            else:
+                return {"error_message": "method_not_allowed"}
+
+            # Send the request and capture the response
+            with urllib.request.urlopen(request) as response:
+                # Read and decode the response
+                response_json = json.loads(response.read().decode("utf-8"))
+                logging.info("response_json:%s", response_json)
+                logging.info("response.status_code:%s", response.getcode())
+                return response_json
+
+        except Exception as e:
+            logging.error("error message:%s", e)
+            response_json = {"error": str(e)}
+
+            logging.info("num_tries:%s", num_tries)
+            # only sleep when another attempt will follow
+            if num_tries < max_tries - 1:
+                logging.info(
+                    "Waiting %s seconds before automatically trying again.",
+                    str(wait_seconds[num_tries]),
+                )
+                time.sleep(wait_seconds[num_tries])
+
+    logging.info(
+        "Tried %s times to make API call to get a valid response object", max_tries
+    )
+    logging.info("Returning last captured response")
+    return response_json
+
+
+def disclaim_bool_string(value):
+    """Interpret the string "True" (and only "True"), or an actual bool, as a boolean."""
+    if isinstance(value, str):
+        return value == "True"
+    if isinstance(value, bool):
+        return value
+    return False
+
+
+def prepare_output_folder(file, now_str):
+
+    output_dir = ""
+    try:
+        output_dir = create_folder(
+            root_path=os.getcwd(),
+            directory_separator="/",
+            next_path="outputs",
+        )
+        output_dir = create_folder(
+            root_path=output_dir,
+            directory_separator="/",
+            next_path=file + "_" + now_str,
+        )
+
+    except Exception as e:
+        logging.error("error prepare output folder:%s", e)
+
+    return output_dir
+
+
+def create_folder(root_path, directory_separator, next_path):
+    output_dir = root_path + directory_separator + next_path
+    try:
+        if not os.path.exists(output_dir):
+            os.mkdir(output_dir)
+
+    except Exception as e:
+        logging.error(exception_msg, e)
+        exit(1)
+    return output_dir
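A quick sketch of how the retry helper is meant to be called, and the backoff schedule it produces (the endpoint URL is a placeholder):

```python
from utils import call_API_urlibrequest

# with base=2 and max_tries=3, wait_seconds is [1, 2, 4];
# the first two entries are used between the three attempts
print([2**i for i in range(3)])  # [1, 2, 4]

resp = call_API_urlibrequest(
    url="https://api.example.com/v1/chat/completions",  # placeholder endpoint
    method="post",
    headers=[["Content-Type", "application/json"]],
    data={"messages": [{"role": "user", "content": "ping"}]},
)
print(resp)  # parsed JSON on success, {"error": "..."} after max_tries failures
```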
diff --git a/wcag_validator.py b/wcag_validator.py
new file mode 100644
index 0000000..c0d2871
--- /dev/null
+++ b/wcag_validator.py
@@ -0,0 +1,151 @@
+import sys
+import argparse
+import json
+import asyncio
+from utils import disclaim_bool_string, prepare_output_folder, create_folder
+from datetime import datetime, timezone
+from dotenv import load_dotenv, find_dotenv
+import os
+
+import warnings
+warnings.filterwarnings("ignore")
+
+exception_msg = "Exception: %s"
+
+from image_extractor import ImageExtractor
+from mllm_management import MLLMManager
+
+async def cli(sys_argv):
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--page_url",
+        type=str,
+        help=("URL of the page to analyze"),
+        default="https://www.bbc.com",
+    )
+
+    parser.add_argument(
+        "--context_levels",
+        type=int,
+        default=5,
+        help=("HTML context levels around the image"),
+    )
+    parser.add_argument(
+        "--pixel_distance_threshold",
+        type=int,
+        default=200,
+        help=("pixel distance threshold around the image"),
+    )
+    parser.add_argument(
+        "--number_of_images",
+        type=int,
+        default=10,
+        help=("max number of images to process"),
+    )
+
+    # string flags ("True"/"False") interpreted by disclaim_bool_string
+    parser.add_argument(
+        "--save_elaboration",
+        type=str,
+        default="True",
+        help=("If True save the elaborated info in a JSON file"),
+    )
+
+    parser.add_argument(
+        "--save_images",
+        type=str,
+        default="True",
+        help=("If True save the images"),
+    )
+
+    args = parser.parse_args()
+
+    print("wcag validator args:", args)
+
+    images_output_dir = ""  # default when images are not saved
+    if (
+        disclaim_bool_string(args.save_elaboration)
+        or disclaim_bool_string(args.save_images)
+    ):  # if there is something to save
+        url_path = args.page_url.replace(":", "").replace("//", "_").replace("/", "_")
+        now = datetime.now(timezone.utc)
+        now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
+        output_dir = prepare_output_folder(url_path, now_str)
+
+        if disclaim_bool_string(args.save_images):
+            images_output_dir = create_folder(
+                output_dir, directory_separator="/", next_path="images"
+            )
+            print("save images path:", images_output_dir)
+
+    ### Task #1: ---------- Image Extractor
+    # Create extractor
+    image_extractor = ImageExtractor(
+        args.page_url,
+        context_levels=args.context_levels,
+        pixel_distance_threshold=args.pixel_distance_threshold,
+        number_of_images=args.number_of_images,
+        save_images=args.save_images,
+        save_images_path=images_output_dir,
+    )
+    # Extract images
+    print(f"Extracting images from: {args.page_url}")
+    images = await image_extractor.extract_images()
+    if disclaim_bool_string(args.save_elaboration):  # Optionally save to JSON
+        await image_extractor.save_elaboration(
+            images, output_dir=output_dir + "/extracted_images.json"
+        )
+    # ---------------------------------------------
+
+    ### Task #2: ---------- MLLM management
+    env_path = find_dotenv(filename=".env")
+    _ = load_dotenv(env_path)  # read .env file
+
+    mllm_end_point = os.getenv("mllm_end_point")
+    mllm_api_key = os.getenv("mllm_api_key")
+    print("mllm_end_point:", mllm_end_point)
+
+    mllm_manager = MLLMManager(mllm_end_point, mllm_api_key)
+    print("mllm_manager.end_point:", mllm_manager.end_point)
+
+    alt_text_system_prompt = mllm_manager.get_alt_text_system_prompt()
+    print("alt_text_system_prompt:", alt_text_system_prompt)
+
+    mllm_responses = []
+    for img_info in images:
+        # explicitly flag empty alt-text, as promised in the system prompt
+        alt_text = "Here is the alt-text of the image: " + (
+            img_info["alt_text"] or "(empty or absent)"
+        )
+        image_URL = img_info["url"]
+        HTML_context = "Here is the surrounding HTML context of the element: " + img_info["html_context"]
+        page_text = "Here is the content of the page: Title of the page: " + str(img_info["page_title"])
+        page_text = page_text + ", content of the description tag: " + str(img_info["page_description"])
+        page_text = page_text + ", content of the keywords tag: " + str(img_info["page_keywords"])
+        # page headings are skipped to keep the prompt short
+
+        print("Processing image URL:", image_URL)
+        print("Alt-text:", alt_text)
+        print("HTML context:", HTML_context)
+        print("Page text:", page_text)
+
+        alt_text_user_prompt = mllm_manager.get_alt_text_user_prompt(
+            altTextMessage=alt_text,
+            imageURL=image_URL,
+            HTMLcontext=HTML_context,
+            pageText=page_text,
+        )
+        print("alt_text_user_prompt:", alt_text_user_prompt)
+
+        mllm_response = mllm_manager.get_response(
+            system_prompt=alt_text_system_prompt,
+            user_prompt=alt_text_user_prompt,
+        )
+        if "choices" not in mllm_response:  # error dict surfaced by call_API_urlibrequest
+            print("MLLM call failed:", mllm_response)
+            continue
+        mllm_responses.append(mllm_response['choices'][0]["message"]["content"])
+
+    if disclaim_bool_string(args.save_elaboration):  # Optionally save to JSON
+        with open(output_dir + "/mllm_responses.json", "w", encoding="utf-8") as f:
+            json.dump(mllm_responses, f, indent=2, ensure_ascii=False)
+
+
+if __name__ == '__main__':
+
+    asyncio.run(cli(sys.argv[1:]))
\ No newline at end of file
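Finally, the MLLM round trip in isolation; a minimal sketch assuming an OpenAI-style chat-completions endpoint configured in `.env` under the `mllm_end_point` and `mllm_api_key` keys read by wcag_validator.py (the image URL and texts are illustrative):

```python
import os
from dotenv import load_dotenv
from mllm_management import MLLMManager

load_dotenv()  # expects mllm_end_point and mllm_api_key in .env

manager = MLLMManager(os.getenv("mllm_end_point"), os.getenv("mllm_api_key"))
user_prompt = manager.get_alt_text_user_prompt(
    altTextMessage="Here is the alt-text of the image: A red bus",
    imageURL="https://example.com/bus.jpg",  # illustrative image URL
    HTMLcontext="Here is the surrounding HTML context of the element: <p>London transport</p>",
    pageText="Here is the content of the page: Title of the page: Transport news",
)
response = manager.get_response(manager.get_alt_text_system_prompt(), user_prompt)
print(response.get("choices", response))  # chat-completions reply, or the error dict
```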