# wcag_AI_validation/dependences/mllm_management.py

from dependences.utils import call_API_urlibrequest, encode_image_from_url
import json
import re
class MLLMManager:
    """Client wrapper around a (multimodal) LLM chat endpoint.

    Supports two wire formats: the OpenAI chat-completions format
    (``openai_model=True``) and an Ollama-style format (default).  Requests
    are sent through ``call_API_urlibrequest`` with a bearer-token header.
    On top of the generic request plumbing, the class provides helpers for
    WCAG accessibility evaluations: alt-text appropriateness (technique G94)
    and descriptive page titles (technique G88).
    """

    def __init__(self, end_point, api_key, model_id):
        # end_point: full URL of the chat API.
        # api_key:   bearer token placed in the Authorization header.
        # model_id:  model name, used only in the Ollama-style payload
        #            (the OpenAI-style payload carries no model field here;
        #            presumably the endpoint URL selects it — TODO confirm).
        self.end_point = end_point
        self.api_key = api_key
        self.model_id = model_id

    def get_response(
        self, system_prompt, user_prompt, openai_model=False, is_only_textual=False
    ):
        """Send one chat request and return the model's text answer.

        Args:
            system_prompt: system message string.
            user_prompt: platform-specific user content (see the
                ``get_*_user_prompt`` builders).
            openai_model: True for the OpenAI response/payload shape.
            is_only_textual: Ollama format only — omit the image part.

        Returns:
            The response text, or "" when the response does not have the
            expected structure.
        """
        payload = self.create_mllm_payload(
            system_prompt,
            user_prompt,
            openai_model=openai_model,
            is_only_textual=is_only_textual,
        )
        headers = [
            ["Content-Type", "application/json"],
            ["Authorization", f"Bearer {self.api_key}"],
        ]
        response = call_API_urlibrequest(
            url=self.end_point, headers=headers, data=payload
        )
        try:
            if openai_model:
                model_response = response["choices"][0]["message"]["content"]
            else:
                model_response = response["message"]["content"]
        except Exception as e:
            print("Error getting model response:", e)
            # Return an empty string (not a dict): downstream parsers
            # (parse_mllm_*_response) expect text and already treat "" as
            # "no response".
            model_response = ""
        return model_response

    def create_mllm_payload(
        self,
        system_prompt,
        user_prompt,
        openai_model=False,
        is_only_textual=False,
    ):
        """Build the request body for the selected platform.

        For the Ollama format, ``user_prompt`` must be a dict with key
        "user_prompt" and, unless ``is_only_textual``, "image_base64".
        """
        if openai_model:
            print("Creating OpenAI format payload")
            payload = {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                "temperature": 0.7,
                "top_p": 0.95,
                "frequency_penalty": 0,
                "presence_penalty": 0,
                "max_tokens": 800,
                "stop": None,
            }
        else:  # ollama format
            print("Creating alternative LLM format payload")
            # Sampling options shared by both Ollama variants.
            options = {
                # "seed": 123,
                "temperature": 0.7,
                "num_ctx": 8192,  # max input tokens
                "num_predict": 800,  # max output tokens
                "top_p": 0.95,
            }
            user_message = {"role": "user", "content": user_prompt["user_prompt"]}
            if not is_only_textual:
                # Ollama carries images as base64 strings on the user message.
                user_message["images"] = [user_prompt["image_base64"]]
            payload = {
                "model": self.model_id,
                "stream": False,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    user_message,
                ],
                "options": options,
            }
        return payload

    # --------alt text evaluation specific methods ---------
    def get_alt_text_system_prompt(self):
        """Return the system prompt for alt-text evaluation (WCAG G94)."""
        # https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
        system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
the same information as the original image content. As a result, it is possible to remove the image content and replace it with the text alternative and no functionality or information would be lost. This text alternative should not necessarily describe the image content.
It should serve the same purpose and convey the same information. This may sometimes result in a text alternative that looks like a description of the image content. But this would only be true if that was the best way to serve the same purpose.
If possible, the short text alternative should completely convey the purpose and information. If it is not possible to do this in a short phrase or sentence, then the short text alternative should provide a brief overview of the information.
The text alternative should be able to substitute for the image content. If the image content were removed from the page and substituted with the text, the page would still provide the same function and information. The text alternative would be brief but as informative as possible.
In deciding what text to include in the alternative, it is often a good idea to consider the following questions:
Why is this image content here?
What information is it presenting?
What purpose does it fulfill?
If I could not use the image content, what words would I use to convey the same function and/or information?
When image content contains words that are important to understanding the content, the alt text should include those words.
Decorative images dont add information to the content of a page. For example, the information provided by the image might already be given using adjacent text, or the image might be included to make the website more visually attractive.
In these cases, a null (empty) alt text should be provided (alt="") so that they can be ignored by assistive technologies, such as screen readers.
Follow these instructions carefully:
1. You will be provided as input with the following:
- The image found on the webpage.
- The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
- The surrounding context of the image.
- The page title, headings and the content of the “keywords” and “description” <meta> tag, if found.
2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function
of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button,
and consider this in your judgement. If the image contains text use that as part of the context.
3. Provide a final assessment judgment based on the following:
- 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose,
- 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
- 'warning' if you cannot determine with 'sufficient certainty'.
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only.
5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim.
6. Keep your response within 150 words.
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same natural language (e.g., English, Spanish, Italian) as the original alt-text.
8. Here is the JSON format the results must have:
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment judgment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
        return system_prompt

    def get_g88_system_prompt(self):
        """Return the system prompt for descriptive-title evaluation (WCAG G88)."""
        # https://www.w3.org/WAI/WCAG22/Techniques/general/G88 without examples
        system_prompt = """You are a web accessibility evaluation tool.
Your task is to determine if web pages have a descriptive title, according to WCAG guidelines.
The objective of this technique is to give each web page a descriptive title. Descriptive titles help users find content, orient themselves within it, and navigate through it. A descriptive title allows a user to easily identify what web page they are using and to tell when the web page has changed. The title can be used to identify the web page without requiring users to read or interpret page content. Users can more quickly identify the content they need when accurate, descriptive titles appear in site maps or lists of search results. When descriptive titles are used within link text, they help users navigate more precisely to the content they are interested in.
The title of each web page should:
- Identify the subject of the web page
- Make sense when read out of context, for example by a screen reader or in a site map or list of search results
- Be short
Follow these instructions carefully:
1. You will be provided with the following:
-The <title> content (if present, and if absent, acknowledge this in your evaluation).
-The main section and headings of the page as context.
2. Determine if the page title is descriptive, by comparing its semantic meaning with the partial context provided.
3. Provide a judgment based on the following:
- 'success' If you can determine with sufficient certainty that the page title is meaningful for the purpose and content of the page,
- 'failure' If you can determine with sufficient certainty that it is not meaningful,
- 'warning' if you cannot determine with 'sufficient certainty'.
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
4. Provide the assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only. Note: assessmnet and judgment should be consistent but their purpose is different.
5. Provide a brief reasoning for your judgment. Your response should be in English. Keep your response within 100 words.
6. Here is the JSON format the result must have:
{"Assessment" : "*your assessment*", "Judgment" : "*your judgment*", "EvaluationResult": "*your response*"}"""
        return system_prompt

    def get_alt_text_user_prompt(  # the user_prompt is specific to the platform used (openai, ollama)
        self, altTextMessage, imageURL, HTMLcontext, pageText, openai_model=True
    ):
        """Build the alt-text user prompt in the platform-specific shape.

        OpenAI: list of typed content parts with the image referenced by URL.
        Ollama: single concatenated text plus the image fetched and base64-encoded.
        """
        if openai_model:
            user_prompt = [
                {"type": "text", "text": altTextMessage},
                {"type": "image_url", "image_url": {"url": imageURL}},
                {"type": "text", "text": HTMLcontext},
                {"type": "text", "text": pageText},
            ]
        else:
            user_prompt = {
                "user_prompt": altTextMessage + " " + HTMLcontext + " " + pageText,
                "image_base64": encode_image_from_url(imageURL),
            }
        return user_prompt

    def get_standard_textual_user_prompt(  # the user_prompt is specific to the platform used (openai, ollama)
        self, texts, openai_model=True
    ):
        """Build a text-only user prompt from a list of text fragments."""
        if openai_model:
            user_prompt = [{"type": "text", "text": text} for text in texts]
        else:
            # NOTE: intentionally reproduces the original concatenation,
            # which starts with a leading space before the first fragment.
            user_prompt = {"user_prompt": "".join(" " + text for text in texts)}
        return user_prompt

    def make_alt_text_evaluation(
        self,
        images,
        openai_model=False,
    ):
        """Evaluate the alt-text of each image dict and collect the responses.

        Each ``images`` entry must provide: "alt_text", "url", "html_context",
        "page_title", "page_description", "page_keywords".

        Returns:
            list of {"image_url", "alt_text", "mllm_response"} dicts.
        """
        print("Using end_point:", self.end_point)
        alt_text_system_prompt = self.get_alt_text_system_prompt()
        mllm_responses = []
        for img_info in images:
            alt_text = "Here is the alt-text of the image: " + img_info["alt_text"]
            image_URL = img_info["url"]
            HTML_context = (
                "Here is the surrounding HTML context of the element: "
                + img_info["html_context"]
            )
            # str() conversion via f-string guards against None metadata.
            page_text = (
                "Here is the content of the page: Title of the page: "
                f"{img_info['page_title']}"
                ", content of the <meta name='description'> tag: "
                f"{img_info['page_description']}"
                ", content of the <meta name='keywords'> tag: "
                f"{img_info['page_keywords']}"
            )
            # skip headings
            print("Processing image URL:", image_URL)
            print("Alt-text:", alt_text)
            print("HTML context:", HTML_context)
            print("Page text:", page_text)
            alt_text_user_prompt = self.get_alt_text_user_prompt(
                altTextMessage=alt_text,
                imageURL=image_URL,
                HTMLcontext=HTML_context,
                pageText=page_text,
                openai_model=openai_model,
            )
            mllm_response = self.get_response(
                system_prompt=alt_text_system_prompt,
                user_prompt=alt_text_user_prompt,
                openai_model=openai_model,
            )
            report = {
                "image_url": image_URL,
                "alt_text": img_info["alt_text"],
                "mllm_response": mllm_response,
            }
            mllm_responses.append(report)
        return mllm_responses

    # --- end of alt text evaluation specific methods ---------
    def make_h58_evaluation(
        self,
        main_language,
        other_textual_elements,
        openai_model=False,
    ):
        """Placeholder for the H58 (language-of-parts) evaluation.

        Currently only logs its inputs and returns a single empty report;
        no model call is made yet.
        """
        print("Using end_point:", self.end_point)
        print(
            "make_h58_evaluation - main_language:",
            main_language,
            "other_textual_elements:",
            other_textual_elements,
        )
        mllm_responses = []
        report = {
            "mllm_response": "",
        }
        mllm_responses.append(report)
        return mllm_responses

    def make_g88_evaluation(
        self,
        title_content,
        openai_model=False,
    ):
        """Run the G88 descriptive-title evaluation.

        Args:
            title_content: dict with keys "title" and "structural_content".

        Returns:
            {"mllm_response": <model text>}.
        """
        system_prompt = self.get_g88_system_prompt()
        # BUGFIX: str() must wrap only the title; the original code did
        # str(title + ". ") which raises TypeError when the title is None.
        page_title = "The title of the page is: " + str(title_content["title"]) + ". "
        structural_content = (
            "Here is the content of the page (<main> tag, headings):"
            + str(title_content["structural_content"])
        )
        user_prompt = self.get_standard_textual_user_prompt(
            texts=[page_title, structural_content], openai_model=openai_model
        )
        mllm_response = self.get_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            openai_model=openai_model,
            is_only_textual=True,
        )
        report = {
            "mllm_response": mllm_response,
        }
        return report
### Other utility functions
def parse_mllm_alt_text_response(mllm_response):
    """
    Parse an MLLM response string and extract key attributes into a JSON object.
    from mllm response like:
    ```json\n{\n\"Original alt-text assessment\"... etc
    to a structured dictionary.
    Args:
        mllm_response (str): The raw MLLM response text containing JSON data
    Returns:
        dict: A dictionary with keys "original_alt_text_assessment",
        "assessment", "evaluation_result" and "new_alt_text"; all values are
        None when parsing fails or the response is empty.
    """
    # Single all-None failure result, shared by every error path.
    empty_result = {
        "original_alt_text_assessment": None,
        "assessment": None,
        "evaluation_result": None,
        "new_alt_text": None,
    }
    try:
        # Handle NaN or None values
        if mllm_response is None or mllm_response == "":
            return empty_result
        # Prefer JSON inside a fenced ```json ... ``` block; otherwise fall
        # back to the first brace-delimited span.  Branch on WHICH pattern
        # matched: the old code re-tested `"```json" in mllm_response`, so an
        # unterminated ```json fence made it call .group(1) on the fallback
        # match (which has no group 1) and raise IndexError.
        fenced = re.search(r"```json\s*(.*?)\s*```", mllm_response, re.DOTALL)
        if fenced:
            json_str = fenced.group(1)
        else:
            bare = re.search(r"\{.*\}", mllm_response, re.DOTALL)
            if not bare:
                return empty_result
            json_str = bare.group(0)
        # Parse the JSON string
        parsed_data = json.loads(json_str)
        # Create a structured output with the key attributes
        return {
            "original_alt_text_assessment": parsed_data.get(
                "Original alt-text assessment", ""
            ),
            "assessment": parsed_data.get("Assessment", ""),
            "evaluation_result": parsed_data.get("EvaluationResult", ""),
            "new_alt_text": parsed_data.get("New alt-text", ""),
        }
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return empty_result
    except Exception as e:
        print(f"Error parsing MLLM response: {e}")
        return empty_result
def parse_mllm_standard_response(mllm_response):
    """Parse a standard (G88-style) MLLM response into a structured dict.

    Args:
        mllm_response (str): The raw MLLM response text containing JSON data

    Returns:
        dict: A dictionary with keys "assessment", "judgment" and
        "evaluation_result"; all values are None when parsing fails or the
        response is empty.
    """
    # Single all-None failure result, shared by every error path.
    empty_result = {
        "assessment": None,
        "judgment": None,
        "evaluation_result": None,
    }
    try:
        # Handle NaN or None values
        if mllm_response is None or mllm_response == "":
            return empty_result
        # Prefer JSON inside a fenced ```json ... ``` block; otherwise fall
        # back to the first brace-delimited span.  Branch on WHICH pattern
        # matched: the old substring re-test made an unterminated ```json
        # fence call .group(1) on the group-less fallback match (IndexError).
        fenced = re.search(r"```json\s*(.*?)\s*```", mllm_response, re.DOTALL)
        if fenced:
            json_str = fenced.group(1)
        else:
            bare = re.search(r"\{.*\}", mllm_response, re.DOTALL)
            if not bare:
                return empty_result
            json_str = bare.group(0)
        print("Extracted JSON string from MLLM response:", json_str)
        # Parse the JSON string
        parsed_data = json.loads(json_str)
        # Create a structured output with the key attributes
        return {
            "assessment": parsed_data.get("Assessment", ""),
            "judgment": parsed_data.get("Judgment", ""),
            "evaluation_result": parsed_data.get("EvaluationResult", ""),
        }
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return empty_result
    except Exception as e:
        print(f"Error parsing MLLM response: {e}")
        return empty_result