# wcag_AI_validation/dependences/mllm_management.py

from dependences.utils import call_API_urlibrequest, encode_image_from_url
import json
import re
class MLLMManager:
    """Client wrapper around a (multimodal) LLM chat endpoint.

    Supports two wire formats: the OpenAI chat-completions format
    (``openai_model=True``) and an Ollama-style format (default).  Requests
    are sent through ``call_API_urlibrequest`` with a bearer-token header.
    On top of the generic request plumbing, the class provides helpers for
    WCAG accessibility evaluations: alt-text appropriateness (technique G94)
    and descriptive page titles (technique G88).
    """

    def __init__(self, end_point, api_key, model_id):
        # end_point: full URL of the chat API.
        # api_key:   bearer token placed in the Authorization header.
        # model_id:  model name, used only in the Ollama-style payload
        #            (the OpenAI-style payload carries no model field here;
        #            presumably the endpoint URL selects it — TODO confirm).
        self.end_point = end_point
        self.api_key = api_key
        self.model_id = model_id

    def get_response(
        self, system_prompt, user_prompt, openai_model=False, is_only_textual=False
    ):
        """Send one chat request and return the model's text answer.

        Args:
            system_prompt: system message string.
            user_prompt: platform-specific user content (see the
                ``get_*_user_prompt`` builders).
            openai_model: True for the OpenAI response/payload shape.
            is_only_textual: Ollama format only — omit the image part.

        Returns:
            The response text, or "" when the response does not have the
            expected structure.
        """
        payload = self.create_mllm_payload(
            system_prompt,
            user_prompt,
            openai_model=openai_model,
            is_only_textual=is_only_textual,
        )
        headers = [
            ["Content-Type", "application/json"],
            ["Authorization", f"Bearer {self.api_key}"],
        ]
        response = call_API_urlibrequest(
            url=self.end_point, headers=headers, data=payload
        )
        try:
            if openai_model:
                model_response = response["choices"][0]["message"]["content"]
            else:
                model_response = response["message"]["content"]
        except Exception as e:
            print("Error getting model response:", e)
            # Return an empty string (not a dict): downstream parsers
            # (parse_mllm_*_response) expect text and already treat "" as
            # "no response".
            model_response = ""
        return model_response

    def create_mllm_payload(
        self,
        system_prompt,
        user_prompt,
        openai_model=False,
        is_only_textual=False,
    ):
        """Build the request body for the selected platform.

        For the Ollama format, ``user_prompt`` must be a dict with key
        "user_prompt" and, unless ``is_only_textual``, "image_base64".
        """
        if openai_model:
            print("Creating OpenAI format payload")
            payload = {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                "temperature": 0.7,
                "top_p": 0.95,
                "frequency_penalty": 0,
                "presence_penalty": 0,
                "max_tokens": 800,
                "stop": None,
            }
        else:  # ollama format
            print("Creating alternative LLM format payload")
            # Sampling options shared by both Ollama variants.
            options = {
                # "seed": 123,
                "temperature": 0.7,
                "num_ctx": 8192,  # max input tokens
                "num_predict": 800,  # max output tokens
                "top_p": 0.95,
            }
            user_message = {"role": "user", "content": user_prompt["user_prompt"]}
            if not is_only_textual:
                # Ollama carries images as base64 strings on the user message.
                user_message["images"] = [user_prompt["image_base64"]]
            payload = {
                "model": self.model_id,
                "stream": False,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    user_message,
                ],
                "options": options,
            }
        return payload

    # --------alt text evaluation specific methods ---------
    def get_alt_text_system_prompt(self):
        """Return the system prompt for alt-text evaluation (WCAG G94)."""
        # https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
        system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
the same information as the original image content. As a result, it is possible to remove the image content and replace it with the text alternative and no functionality or information would be lost. This text alternative should not necessarily describe the image content.
It should serve the same purpose and convey the same information. This may sometimes result in a text alternative that looks like a description of the image content. But this would only be true if that was the best way to serve the same purpose.
If possible, the short text alternative should completely convey the purpose and information. If it is not possible to do this in a short phrase or sentence, then the short text alternative should provide a brief overview of the information.
The text alternative should be able to substitute for the image content. If the image content were removed from the page and substituted with the text, the page would still provide the same function and information. The text alternative would be brief but as informative as possible.
In deciding what text to include in the alternative, it is often a good idea to consider the following questions:
Why is this image content here?
What information is it presenting?
What purpose does it fulfill?
If I could not use the image content, what words would I use to convey the same function and/or information?
When image content contains words that are important to understanding the content, the alt text should include those words.
Decorative images dont add information to the content of a page. For example, the information provided by the image might already be given using adjacent text, or the image might be included to make the website more visually attractive.
In these cases, a null (empty) alt text should be provided (alt="") so that they can be ignored by assistive technologies, such as screen readers.
Follow these instructions carefully:
1. You will be provided as input with the following:
- The image found on the webpage.
- The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
- The surrounding context of the image.
- The page title, headings and the content of the “keywords” and “description” <meta> tag, if found.
2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function
of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button,
and consider this in your judgement. If the image contains text use that as part of the context.
3. Provide a final assessment judgment based on the following:
- 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose,
- 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
- 'warning' if you cannot determine with 'sufficient certainty'.
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only.
5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim.
6. Keep your response within 150 words.
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same natural language (e.g., English, Spanish, Italian) as the original alt-text.
8. Here is the JSON format the results must have:
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment judgment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
        return system_prompt

    def get_g88_system_prompt(self):
        """Return the system prompt for descriptive-title evaluation (WCAG G88)."""
        # https://www.w3.org/WAI/WCAG22/Techniques/general/G88 without examples
        system_prompt = """You are a web accessibility evaluation tool.
Your task is to determine if web pages have a descriptive title, according to WCAG guidelines.
The objective of this technique is to give each web page a descriptive title. Descriptive titles help users find content, orient themselves within it, and navigate through it. A descriptive title allows a user to easily identify what web page they are using and to tell when the web page has changed. The title can be used to identify the web page without requiring users to read or interpret page content. Users can more quickly identify the content they need when accurate, descriptive titles appear in site maps or lists of search results. When descriptive titles are used within link text, they help users navigate more precisely to the content they are interested in.
The title of each web page should:
- Identify the subject of the web page
- Make sense when read out of context, for example by a screen reader or in a site map or list of search results
- Be short
Follow these instructions carefully:
1. You will be provided with the following:
-The <title> content (if present, and if absent, acknowledge this in your evaluation).
-The main section and headings of the page as context.
2. Determine if the page title is descriptive, by comparing its semantic meaning with the partial context provided.
3. Provide a judgment based on the following:
- 'success' If you can determine with sufficient certainty that the page title is meaningful for the purpose and content of the page,
- 'failure' If you can determine with sufficient certainty that it is not meaningful,
- 'warning' if you cannot determine with 'sufficient certainty'.
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
4. Provide the assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only. Note: assessmnet and judgment should be consistent but their purpose is different.
5. Provide a brief reasoning for your judgment. Your response should be in English. Keep your response within 100 words.
6. Here is the JSON format the result must have:
{"Assessment" : "*your assessment*", "Judgment" : "*your judgment*", "EvaluationResult": "*your response*"}"""
        return system_prompt

    def get_alt_text_user_prompt(  # the user_prompt is specific to the platform used (openai, ollama)
        self, altTextMessage, imageURL, HTMLcontext, pageText, openai_model=True
    ):
        """Build the alt-text user prompt in the platform-specific shape.

        OpenAI: list of typed content parts with the image referenced by URL.
        Ollama: single concatenated text plus the image fetched and base64-encoded.
        """
        if openai_model:
            user_prompt = [
                {"type": "text", "text": altTextMessage},
                {"type": "image_url", "image_url": {"url": imageURL}},
                {"type": "text", "text": HTMLcontext},
                {"type": "text", "text": pageText},
            ]
        else:
            user_prompt = {
                "user_prompt": altTextMessage + " " + HTMLcontext + " " + pageText,
                "image_base64": encode_image_from_url(imageURL),
            }
        return user_prompt

    def get_standard_textual_user_prompt(  # the user_prompt is specific to the platform used (openai, ollama)
        self, texts, openai_model=True
    ):
        """Build a text-only user prompt from a list of text fragments."""
        if openai_model:
            user_prompt = [{"type": "text", "text": text} for text in texts]
        else:
            # NOTE: intentionally reproduces the original concatenation,
            # which starts with a leading space before the first fragment.
            user_prompt = {"user_prompt": "".join(" " + text for text in texts)}
        return user_prompt

    def make_alt_text_evaluation(
        self,
        images,
        openai_model=False,
    ):
        """Evaluate the alt-text of each image dict and collect the responses.

        Each ``images`` entry must provide: "alt_text", "url", "html_context",
        "page_title", "page_description", "page_keywords".

        Returns:
            list of {"image_url", "alt_text", "mllm_response"} dicts.
        """
        print("Using end_point:", self.end_point)
        alt_text_system_prompt = self.get_alt_text_system_prompt()
        mllm_responses = []
        for img_info in images:
            alt_text = "Here is the alt-text of the image: " + img_info["alt_text"]
            image_URL = img_info["url"]
            HTML_context = (
                "Here is the surrounding HTML context of the element: "
                + img_info["html_context"]
            )
            # str() conversion via f-string guards against None metadata.
            page_text = (
                "Here is the content of the page: Title of the page: "
                f"{img_info['page_title']}"
                ", content of the <meta name='description'> tag: "
                f"{img_info['page_description']}"
                ", content of the <meta name='keywords'> tag: "
                f"{img_info['page_keywords']}"
            )
            # skip headings
            print("Processing image URL:", image_URL)
            print("Alt-text:", alt_text)
            print("HTML context:", HTML_context)
            print("Page text:", page_text)
            alt_text_user_prompt = self.get_alt_text_user_prompt(
                altTextMessage=alt_text,
                imageURL=image_URL,
                HTMLcontext=HTML_context,
                pageText=page_text,
                openai_model=openai_model,
            )
            mllm_response = self.get_response(
                system_prompt=alt_text_system_prompt,
                user_prompt=alt_text_user_prompt,
                openai_model=openai_model,
            )
            report = {
                "image_url": image_URL,
                "alt_text": img_info["alt_text"],
                "mllm_response": mllm_response,
            }
            mllm_responses.append(report)
        return mllm_responses

    # --- end of alt text evaluation specific methods ---------
    def make_h58_evaluation(
        self,
        main_language,
        other_textual_elements,
        openai_model=False,
    ):
        """Placeholder for the H58 (language-of-parts) evaluation.

        Currently only logs its inputs and returns a single empty report;
        no model call is made yet.
        """
        print("Using end_point:", self.end_point)
        print(
            "make_h58_evaluation - main_language:",
            main_language,
            "other_textual_elements:",
            other_textual_elements,
        )
        mllm_responses = []
        report = {
            "mllm_response": "",
        }
        mllm_responses.append(report)
        return mllm_responses

    def make_g88_evaluation(
        self,
        title_content,
        openai_model=False,
    ):
        """Run the G88 descriptive-title evaluation.

        Args:
            title_content: dict with keys "title" and "structural_content".

        Returns:
            {"mllm_response": <model text>}.
        """
        system_prompt = self.get_g88_system_prompt()
        # BUGFIX: str() must wrap only the title; the original code did
        # str(title + ". ") which raises TypeError when the title is None.
        page_title = "The title of the page is: " + str(title_content["title"]) + ". "
        structural_content = (
            "Here is the content of the page (<main> tag, headings):"
            + str(title_content["structural_content"])
        )
        user_prompt = self.get_standard_textual_user_prompt(
            texts=[page_title, structural_content], openai_model=openai_model
        )
        mllm_response = self.get_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            openai_model=openai_model,
            is_only_textual=True,
        )
        report = {
            "mllm_response": mllm_response,
        }
        return report
### Other utility functions
def parse_mllm_alt_text_response(mllm_response):
    """
    Parse an MLLM response string and extract key attributes into a JSON object.
    from mllm response like:
    ```json\n{\n\"Original alt-text assessment\"... etc
    to a structured dictionary.
    Args:
        mllm_response (str): The raw MLLM response text containing JSON data
    Returns:
        dict: A dictionary with keys "original_alt_text_assessment",
        "assessment", "evaluation_result" and "new_alt_text"; all values are
        None when parsing fails or the response is empty.
    """
    # Single all-None failure result, shared by every error path.
    empty_result = {
        "original_alt_text_assessment": None,
        "assessment": None,
        "evaluation_result": None,
        "new_alt_text": None,
    }
    try:
        # Handle NaN or None values
        if mllm_response is None or mllm_response == "":
            return empty_result
        # Prefer JSON inside a fenced ```json ... ``` block; otherwise fall
        # back to the first brace-delimited span.  Branch on WHICH pattern
        # matched: the old code re-tested `"```json" in mllm_response`, so an
        # unterminated ```json fence made it call .group(1) on the fallback
        # match (which has no group 1) and raise IndexError.
        fenced = re.search(r"```json\s*(.*?)\s*```", mllm_response, re.DOTALL)
        if fenced:
            json_str = fenced.group(1)
        else:
            bare = re.search(r"\{.*\}", mllm_response, re.DOTALL)
            if not bare:
                return empty_result
            json_str = bare.group(0)
        # Parse the JSON string
        parsed_data = json.loads(json_str)
        # Create a structured output with the key attributes
        return {
            "original_alt_text_assessment": parsed_data.get(
                "Original alt-text assessment", ""
            ),
            "assessment": parsed_data.get("Assessment", ""),
            "evaluation_result": parsed_data.get("EvaluationResult", ""),
            "new_alt_text": parsed_data.get("New alt-text", ""),
        }
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return empty_result
    except Exception as e:
        print(f"Error parsing MLLM response: {e}")
        return empty_result
def parse_mllm_standard_response(mllm_response):
    """Parse a standard (G88-style) MLLM response into a structured dict.

    Args:
        mllm_response (str): The raw MLLM response text containing JSON data

    Returns:
        dict: A dictionary with keys "assessment", "judgment" and
        "evaluation_result"; all values are None when parsing fails or the
        response is empty.
    """
    # Single all-None failure result, shared by every error path.
    empty_result = {
        "assessment": None,
        "judgment": None,
        "evaluation_result": None,
    }
    try:
        # Handle NaN or None values
        if mllm_response is None or mllm_response == "":
            return empty_result
        # Prefer JSON inside a fenced ```json ... ``` block; otherwise fall
        # back to the first brace-delimited span.  Branch on WHICH pattern
        # matched: the old substring re-test made an unterminated ```json
        # fence call .group(1) on the group-less fallback match (IndexError).
        fenced = re.search(r"```json\s*(.*?)\s*```", mllm_response, re.DOTALL)
        if fenced:
            json_str = fenced.group(1)
        else:
            bare = re.search(r"\{.*\}", mllm_response, re.DOTALL)
            if not bare:
                return empty_result
            json_str = bare.group(0)
        print("Extracted JSON string from MLLM response:", json_str)
        # Parse the JSON string
        parsed_data = json.loads(json_str)
        # Create a structured output with the key attributes
        return {
            "assessment": parsed_data.get("Assessment", ""),
            "judgment": parsed_data.get("Judgment", ""),
            "evaluation_result": parsed_data.get("EvaluationResult", ""),
        }
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return empty_result
    except Exception as e:
        print(f"Error parsing MLLM response: {e}")
        return empty_result