# wcag_AI_validation/dependences/mllm_management.py

from dependences.utils import call_API_urlibrequest, encode_image_from_url
import json
import re

class MLLMManager:
    """Client for a multimodal LLM endpoint used to assess image alt-text.

    Builds request payloads in either an OpenAI-style chat format or an
    Ollama-style format, sends them through ``call_API_urlibrequest`` and
    extracts the text content of the model's reply.
    """

    def __init__(self, end_point, api_key, model_id):
        """Store the connection settings.

        Args:
            end_point: URL every request is sent to.
            api_key: Token sent as ``Authorization: Bearer <api_key>``.
            model_id: Model name placed in the Ollama-format payload.
        """
        self.end_point = end_point
        self.api_key = api_key
        self.model_id = model_id

    def get_response(self, system_prompt, user_prompt, openai_model=False):
        """Send one system/user prompt pair and return the reply content.

        Args:
            system_prompt: Text of the system message.
            user_prompt: User message in the shape produced by
                ``get_alt_text_user_prompt`` for the same ``openai_model``.
            openai_model: Selects both the payload format and where the
                reply content is looked up in the response.

        Returns:
            The ``content`` field of the model's message, or an empty dict
            when the response does not have the expected structure.
        """
        payload = self.create_mllm_payload(
            system_prompt, user_prompt, openai_model=openai_model
        )
        headers = [
            ["Content-Type", "application/json"],
            ["Authorization", f"Bearer {self.api_key}"],
        ]
        response = call_API_urlibrequest(
            url=self.end_point, headers=headers, data=payload
        )
        try:
            if openai_model:
                return response["choices"][0]["message"]["content"]
            return response["message"]["content"]
        except (KeyError, IndexError, TypeError) as e:
            # Missing keys, an empty "choices" list, or a response that is
            # not subscriptable (e.g. None) all mean "no usable answer".
            print("Error getting model response:", e)
            return {}

    def create_mllm_payload(
        self,
        system_prompt,
        user_prompt,
        openai_model=False,
    ):
        """Build the request body for the selected API format.

        Args:
            system_prompt: Text of the system message.
            user_prompt: For the OpenAI format, the value used as-is for the
                user message content. For the Ollama format, a mapping with
                the keys ``"user_prompt"`` and ``"image_base64"``.
            openai_model: ``True`` for the OpenAI format, ``False`` for the
                Ollama format.

        Returns:
            A dict ready to be sent as the request body.
        """
        if openai_model:
            print("Creating OpenAI format payload")
            return {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                "temperature": 0.7,
                "top_p": 0.95,
                "frequency_penalty": 0,
                "presence_penalty": 0,
                "max_tokens": 800,
                "stop": None,
            }

        # ollama format
        print("Creating alternative LLM format payload")
        return {
            "model": self.model_id,
            "stream": False,
            "messages": [
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": user_prompt["user_prompt"],
                    "images": [user_prompt["image_base64"]],
                },
            ],
            "options": {
                "seed": 123,
                "temperature": 0.7,
                "num_ctx": 8192,  # max input token
                "num_predict": 800,  # max output tokens
                "top_p": 0.95,
            },
        }

    def get_alt_text_system_prompt(self):
        """Return the fixed system prompt for the alt-text evaluation task."""

        # https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
        system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
                images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
                the same information as the original image content.  As a result, it is possible to remove the image content and replace it with the text alternative and no functionality or information would be lost. This text alternative should not necessarily describe the image content.
                It should serve the same purpose and convey the same information. This may sometimes result in a text alternative that looks like a description of the image content. But this would only be true if that was the best way to serve the same purpose.
                If possible, the short text alternative should completely convey the purpose and information. If it is not possible to do this in a short phrase or sentence, then the short text alternative should provide a brief overview of the information.
                The text alternative should be able to substitute for the image content. If the image content were removed from the page and substituted with the text, the page would still provide the same function and information. The text alternative would be brief but as informative as possible.
                In deciding what text to include in the alternative, it is often a good idea to consider the following questions:
                Why is this image content here?
                What information is it presenting?
                What purpose does it fulfill?
                If I could not use the image content, what words would I use to convey the same function and/or information?

                When image content contains words that are important to understanding the content, the alt text should include those words.

                Follow these instructions carefully:
                1. You will be provided as input with the following:
                - The image found on the webpage.
                - The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
                - The surrounding context of the image.
                - The page title, headings and the content of the “keywords” and “description” <meta> tag, if found.

                2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function
                of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button,
                and consider this in your judgement. If the image contains text use that as part of the context.

                3. Provide a final assessment based on the following:
                - 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose,
                - 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
                - 'warning' if you cannot determine with 'sufficient certainty'.
                where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80

                4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only.

                5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim. Your response should be in English.

                6. Keep your response within 150 words.

                7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same language as the original alt-text.

                8. Here is the JSON format the results must have:
                {"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""

        return system_prompt

    def get_alt_text_user_prompt(  # the user_prompt is specific to the platform used (openai, ollama)
        self, altTextMessage, imageURL, HTMLcontext, pageText, openai_model=True
    ):
        """Assemble the user message in the shape the target platform expects.

        Note that ``openai_model`` defaults to ``True`` here, unlike the other
        methods of this class, which default to ``False``.

        Args:
            altTextMessage: Sentence describing the image's alt-text.
            imageURL: URL of the image under evaluation.
            HTMLcontext: Sentence carrying the surrounding HTML.
            pageText: Sentence carrying the page-level information.
            openai_model: ``True`` for a list of typed content parts that
                reference the image by URL; ``False`` for a dict holding the
                joined text and the image returned by
                ``encode_image_from_url``.

        Returns:
            A list (OpenAI format) or a dict with the keys ``"user_prompt"``
            and ``"image_base64"`` (Ollama format).
        """
        if openai_model:
            return [
                {"type": "text", "text": altTextMessage},
                {"type": "image_url", "image_url": {"url": imageURL}},
                {"type": "text", "text": HTMLcontext},
                {"type": "text", "text": pageText},
            ]

        return {
            "user_prompt": altTextMessage + " " + HTMLcontext + " " + pageText,
            "image_base64": encode_image_from_url(imageURL),
        }

    @staticmethod
    def _alt_text_message(alt_text):
        """Return the prompt sentence that introduces the image's alt-text.

        The system prompt promises that an empty or absent alt-text is stated
        explicitly, so ``None`` and blank values get their own wording instead
        of a sentence that ends in nothing.
        """
        if alt_text is None:
            return "The alt-text of the image is absent."
        alt_text = str(alt_text)
        if not alt_text.strip():
            return "The alt-text of the image is empty."
        return "Here is the alt-text of the image: " + alt_text

    def make_alt_text_evaluation(
        self,
        images,
        openai_model=False,
    ):
        """Ask the model to evaluate the alt-text of every image in ``images``.

        Args:
            images: Iterable of dicts with the keys ``"alt_text"``, ``"url"``,
                ``"html_context"``, ``"page_title"``, ``"page_description"``
                and ``"page_keywords"``.
            openai_model: Payload/response format, forwarded to
                ``get_alt_text_user_prompt`` and ``get_response``.

        Returns:
            A list with one dict per image holding ``"image_url"``,
            ``"alt_text"`` (the original value) and ``"mllm_response"``.
        """
        print("Using end_point:", self.end_point)

        alt_text_system_prompt = self.get_alt_text_system_prompt()

        mllm_responses = []
        for img_info in images:
            alt_text = self._alt_text_message(img_info["alt_text"])
            image_URL = img_info["url"]
            # str() keeps a missing (None) context from raising TypeError,
            # matching how the page-level fields below are handled.
            HTML_context = (
                "Here is the surrounding HTML context of the element: "
                + str(img_info["html_context"])
            )
            page_text = (
                "Here is the content of the page: Title of the page: "
                + str(img_info["page_title"])
                + ", content of the <meta name='description'> tag: "
                + str(img_info["page_description"])
                + ", content of the <meta name='keywords'> tag: "
                + str(img_info["page_keywords"])
            )
            # skip headings

            print("Processing image URL:", image_URL)
            print("Alt-text:", alt_text)
            print("HTML context:", HTML_context)
            print("Page text:", page_text)

            alt_text_user_prompt = self.get_alt_text_user_prompt(
                altTextMessage=alt_text,
                imageURL=image_URL,
                HTMLcontext=HTML_context,
                pageText=page_text,
                openai_model=openai_model,
            )

            mllm_response = self.get_response(
                system_prompt=alt_text_system_prompt,
                user_prompt=alt_text_user_prompt,
                openai_model=openai_model,
            )

            mllm_responses.append(
                {
                    "image_url": image_URL,
                    "alt_text": img_info["alt_text"],
                    "mllm_response": mllm_response,
                }
            )
        return mllm_responses


def parse_mllm_alt_text_response(mllm_response):
    """
    Parse an MLLM response string and extract key attributes into a dictionary.

    from mllm response like:
    ```json\\n{\\n\\"Original alt-text assessment\\"... etc
    to a structured dictionary.

    The JSON is taken from a ```json ... ``` fenced block when one is present
    and properly closed; otherwise from the span between the first "{" and
    the last "}" of the text.

    Args:
        mllm_response (str): The raw MLLM response text containing JSON data.
            Any non-string or empty value is treated as "no response".

    Returns:
        dict: A dictionary with the keys "original_alt_text_assessment",
        "assessment", "evaluation_result" and "new_alt_text". When a JSON
        object was parsed, a key missing from it maps to "". When nothing
        could be parsed, every key maps to None.
    """
    empty_result = dict.fromkeys(
        (
            "original_alt_text_assessment",
            "assessment",
            "evaluation_result",
            "new_alt_text",
        )
    )

    # None, NaN, "" and other non-text values carry no JSON to extract.
    if not isinstance(mllm_response, str) or not mllm_response:
        return empty_result

    # Extract JSON content between ```json and ``` markers
    fenced_match = re.search(r'```json\s*(.*?)\s*```', mllm_response, re.DOTALL)
    if fenced_match:
        json_str = fenced_match.group(1)
    else:
        # Try to find JSON without markdown code blocks. The source is chosen
        # by which pattern matched, so an opening fence without a closing one
        # still falls back to the bare object instead of failing.
        bare_match = re.search(r'\{.*\}', mllm_response, re.DOTALL)
        if not bare_match:
            return empty_result
        json_str = bare_match.group(0)

    try:
        parsed_data = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return empty_result

    if not isinstance(parsed_data, dict):
        # Valid JSON, but not the object the prompt asks for.
        print(
            "Error parsing MLLM response: expected a JSON object, "
            f"got {type(parsed_data).__name__}"
        )
        return empty_result

    return {
        "original_alt_text_assessment": parsed_data.get("Original alt-text assessment", ""),
        "assessment": parsed_data.get("Assessment", ""),
        "evaluation_result": parsed_data.get("EvaluationResult", ""),
        "new_alt_text": parsed_data.get("New alt-text", ""),
    }