Analysis code for the 12-2025 exercise session
This commit is contained in: parent 107473cd4f, commit f4a99f358d
@@ -0,0 +1,546 @@
# build the db with only the image urls and a progressive image counter for each page_url

# to launch: python build_dataset_from_folder_no_llm_check.py --ref_path "C:\cartella_condivisa\MachineLearning\HIISlab\accessibility\notebook_miei\LLM_accessibility_validator\out" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token "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"

from datasets import Dataset, DatasetDict
import datasets
import json
from pathlib import Path
from PIL import Image
import hashlib
import urllib.parse
import argparse


'''
# Dataset metadata
_DESCRIPTION = """\
Dataset for image alt-text assessment and improvement using MLLM responses.
Contains images, original alt-texts, quality assessments, and improved versions.
"""

_CITATION = """\
@misc{alt_text_assessment,
    title={Alt-Text Assessment Dataset},
    year={2024}
}
"""


class AltTextDataset(datasets.GeneratorBasedBuilder):
    """Dataset for alt-text assessment with images and MLLM responses."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "image": datasets.Image(),
                "image_url": datasets.Value("string"),
                "alt_text": datasets.Value("string"),
                "original_alt_text_assessment": datasets.Value("string"),
                "assessment": datasets.Value("string"),
                "evaluation_result": datasets.Value("string"),
                "new_alt_text": datasets.Value("string"),
                #"source_folder": datasets.Value("string"),
            }),
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Define data splits."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "json_filepath": "data.json",
                    "images_dir": "images"
                },
            ),
        ]

    def _generate_examples(self, json_filepath, images_dir):
        """Generate examples from the JSON file and image directory."""
        with open(json_filepath, encoding="utf-8") as f:
            data = json.load(f)

        images_path = Path(images_dir)

        for idx, entry in enumerate(data):
            image_url = entry["image_url"]
            image_filename = url_to_filename(image_url)
            image_path = images_path / image_filename

            # Load image if it exists, otherwise None
            image = str(image_path) if image_path.exists() else None

            yield idx, {
                "image": image,
                "image_url": image_url,
                "alt_text": entry["alt_text"],
                "original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
                "assessment": entry["mllm_response"]["assessment"],
                "evaluation_result": entry["mllm_response"]["evaluation_result"],
                "new_alt_text": entry["mllm_response"]["new_alt_text"],
            }
'''

# ============================================================================
# SIMPLE USAGE FUNCTIONS
# ============================================================================


def url_to_filename(image_url):  # same save step as in the image_extractor dependency
    """
    Convert an image URL to a sanitized filename, following the image_extractor's exact logic.

    Args:
        image_url: The image URL

    Returns:
        Sanitized filename with extension
    """
    # Parse the URL to get the path without query parameters
    parsed_url = urllib.parse.urlparse(image_url)
    url_path = parsed_url.path

    # Get the filename from the path
    filename = url_path.split("/")[-1]
    print(f"Original filename: '{filename}'")

    # Split filename and extension
    if "." in filename:
        image_name, ext = filename.rsplit(".", 1)
        ext = ext.lower()
    else:
        image_name = filename
        ext = "jpg"

    # Validate extension
    if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
        ext = "jpg"

    # Sanitize image name (remove special characters, limit length)
    image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
    image_name = image_name[:50]  # Limit filename length

    # If name is empty after sanitization, create a hash-based name
    if not image_name:
        image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]

    return f"{image_name}.{ext}"
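

# Usage sketch (illustrative URL, not taken from the pipeline): the query string
# is dropped, disallowed characters are stripped, and the extension is
# lowercased and validated against the allow-list above:
# >>> url_to_filename("https://example.com/img/My Photo (1).JPG?size=large")
# 'MyPhoto1.jpg'
# A URL whose name is entirely non-alphanumeric falls back to an md5-based name.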


def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
    """
    Example of how to push a dataset to the Hugging Face Hub.
    You need to authenticate first!
    """
    from huggingface_hub import login

    print("\n=== Pushing Dataset to Hugging Face Hub ===")
    # Method 1: Login interactively (will prompt for token)
    # login()

    # Method 2: Login with token directly
    login(token=token)

    # Method 3: Set token as environment variable
    # export HF_TOKEN="hf_YourTokenHere"
    # Then login() will use it automatically

    # Load your dataset
    ds = load_dataset_from_disk(dataset_path)

    # Combine into a DatasetDict
    ds = DatasetDict(
        {
            "train": ds,
            # "test": test_dataset
        }
    )

    # Push to hub (creates the repo if it doesn't exist)
    ds.push_to_hub(  # Automatically converts to Parquet when uploading to the Hub
        repo_id,  # Replace with your username
        private=False,  # Set True for a private dataset
    )

    print("Dataset pushed successfully!")
    print(f"View at: https://huggingface.co/datasets/{repo_id}")


def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
    """
    Create a Hugging Face Dataset from the JSON files with local images.

    Args:
        json_filepath: Path to the JSON file with the MLLM assessment data
        json_filepath_images: Path to the JSON file with the extracted image data
        images_dir: Directory containing the images (default: "images")

    Returns:
        datasets.Dataset object with images loaded
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    with open(json_filepath_images, "r", encoding="utf-8") as f:
        data_images = json.load(f)

    images_path = Path(images_dir)

    # Flatten the nested structure and load images
    flattened_data = {
        #"image": [],
        "url": [],
        "alt_text": [],
        #"original_alt_text_assessment": [],
        #"assessment": [],
        #"evaluation_result": [],
        #"new_alt_text": [],
        "page_url": [],
        #"html_context": [],
        "image_id": []
    }

    count_entry = 0
    for entry in data_images:
        """
        if (
            entry["mllm_response"]["original_alt_text_assessment"] is None
        ):  # important! skip entries with no MLLM response. not usable data
            print(
                f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
            )
            #count_entry += 1
            #continue  # Skip entries with no MLLM response
            flattened_data["original_alt_text_assessment"].append(str(0))
            flattened_data["assessment"].append(0)
            flattened_data["evaluation_result"].append(0)
        else:
            flattened_data["original_alt_text_assessment"].append(
                str(entry["mllm_response"]["original_alt_text_assessment"])
            )
            flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
            flattened_data["evaluation_result"].append(
                entry["mllm_response"]["evaluation_result"]
            )"""

        image_url = entry["url"]
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename

        # Load image if it exists
        """
        if image_path.exists():
            img = Image.open(image_path)
            flattened_data["image"].append(img)
        else:
            print(f"Warning: Image not found: {image_path}")
            flattened_data["image"].append(None)
        """

        flattened_data["url"].append(image_url)
        flattened_data["alt_text"].append(entry["alt_text"])

        """flattened_data["original_alt_text_assessment"].append(
            str(entry["mllm_response"]["original_alt_text_assessment"])
        )
        flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
        flattened_data["evaluation_result"].append(
            entry["mllm_response"]["evaluation_result"]
        )
        flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"])"""

        flattened_data["page_url"].append(data_images[count_entry]["page_url"])
        #flattened_data["html_context"].append(data_images[count_entry]["html_context"])

        count_entry += 1
        flattened_data["image_id"].append(count_entry)

    print(f"Total valid entries loaded: {len(flattened_data['url'])}")
    return datasets.Dataset.from_dict(flattened_data)
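

# Input sketch (illustrative values): create_dataset_from_json reads the keys
# "url", "alt_text" and "page_url" from each entry of extracted_images.json, e.g.:
# [
#   {
#     "url": "https://example.com/img/photo.jpg",
#     "alt_text": "a photo",
#     "page_url": "https://example.com/page.html"
#   }
# ]
# The MLLM fields from mllm_alttext_assessments.json are currently commented out above.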


def create_dataset_from_folders(
    ref_path,
    json_filename="mllm_alttext_assessments.json",
    json_filename_images="extracted_images.json",
    images_dirname="images",
):
    """
    Create a merged dataset from multiple folders under ref_path.
    Each folder should contain the JSON files and an images subdirectory.

    Args:
        ref_path: Root path containing multiple folders
        json_filename: Name of the assessments JSON file in each folder (default: "mllm_alttext_assessments.json")
        json_filename_images: Name of the extracted-images JSON file in each folder (default: "extracted_images.json")
        images_dirname: Name of images subdirectory (default: "images")

    Returns:
        datasets.Dataset object with all entries merged
    """
    ref_path = Path(ref_path)
    all_datasets = []

    # Find all subdirectories containing the JSON files
    folders_processed = 0

    for folder in ref_path.iterdir():
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        json_path_images = folder / json_filename_images
        images_path = folder / images_dirname

        # Check if both JSON files and the images directory exist
        if not json_path.exists():
            print(f"Skipping {folder.name}: no {json_filename} found")
            continue

        if not json_path_images.exists():
            print(f"Skipping {folder.name}: no {json_filename_images} found")
            continue

        if not images_path.exists():
            print(f"Warning: {folder.name}: images directory not found")
            # continue
            # Continue anyway, images might be optional (from urls only)

        print(f"Processing folder: {folder.name}")

        try:
            # Create dataset for this folder
            ds = create_dataset_from_json(
                str(json_path), str(json_path_images), str(images_path)
            )
            all_datasets.append(ds)

            folders_processed += 1
            print(f"  -> Loaded {len(ds)} entries")
        except Exception as e:
            print(f"Error processing {folder.name}: {e}")
            continue

    if not all_datasets:
        raise ValueError(f"No valid folders found in {ref_path}")

    # Merge all datasets
    print(f"\n=== Merging {folders_processed} folders ===")
    merged_dataset = datasets.concatenate_datasets(all_datasets)
    print(f"Total entries: {len(merged_dataset)}")

    return merged_dataset


def verify_images(json_filepath, images_dir="images"):
    """
    Verify that all images referenced in the JSON exist in the images directory.

    Args:
        json_filepath: Path to JSON file
        images_dir: Directory containing images

    Returns:
        Dict with 'found', 'missing', and 'details' keys
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    images_path = Path(images_dir)

    found = []
    missing = []

    for entry in data:
        image_url = entry["image_url"]
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename
        print(
            "image_url:",
            image_url,
            "image_filename:",
            image_filename,
            "image_path:",
            image_path,
        )

        if image_path.exists():
            found.append(
                {"url": image_url, "filename": image_filename, "path": str(image_path)}
            )
        else:
            missing.append(
                {
                    "url": image_url,
                    "filename": image_filename,
                    "expected_path": str(image_path),
                }
            )

    return {
        "found": len(found),
        "missing": len(missing),
        "total": len(data),
        "details": {"found_images": found, "missing_images": missing},
    }


def verify_images_in_folders(
    ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
):
    """
    Verify images across all folders under ref_path.

    Args:
        ref_path: Root path containing multiple folders
        json_filename: Name of JSON file in each folder
        images_dirname: Name of images subdirectory

    Returns:
        Dict with aggregated verification results
    """
    ref_path = Path(ref_path)
    total_found = 0
    total_missing = 0
    total_entries = 0
    folder_results = {}

    for folder in ref_path.iterdir():
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        images_path = folder / images_dirname

        if not json_path.exists():
            continue

        print(f"Verifying folder: {folder.name}")

        try:
            verification = verify_images(str(json_path), str(images_path))
            folder_results[folder.name] = verification

            total_found += verification["found"]
            total_missing += verification["missing"]
            total_entries += verification["total"]

            print(f"  Found: {verification['found']}/{verification['total']}")

        except Exception as e:
            print(f"  Error: {e}")
            continue

    return {
        "found": total_found,
        "missing": total_missing,
        "total": total_entries,
        "folders": folder_results,
    }


def save_dataset(dataset, output_path):
    """Save dataset in Arrow format (includes images)."""
    dataset.save_to_disk(output_path)
    # print(f"Dataset saved to {output_path}")

    # Or save as JSON
    # dataset.to_json(f"{output_path}/data.json")

    # Or save as CSV
    # dataset.to_csv(f"{output_path}/data.csv")

    # Or save as Parquet
    # dataset.to_parquet(f"{output_path}/data.parquet")


def load_dataset_from_disk(dataset_path):
    """Load a previously saved dataset."""
    return datasets.load_from_disk(dataset_path)


# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--ref_path",
        type=str,
        help="Root path containing multiple folders",
        default="C:\\cartella_condivisa\\MachineLearning\\HIISlab\\accessibility\\notebook_miei\\LLM_accessibility_validator\\out",
    )

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        default=False,
        help="If set, push the merged dataset to the Hugging Face Hub",
    )
    parser.add_argument(
        "--token",
        type=str,
        help="Hugging Face authentication token",
        default="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",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        help="Hugging Face repository ID",
        default="nicolaleo/LLM-alt-text-assessment",
    )
    args = parser.parse_args()

    # Example 1: Verify images across all folders
    print("=== Verifying Images in All Folders ===")
    verification = verify_images_in_folders(args.ref_path)
    print("\n######## Verifier output ################################")
    print(f"Total Found: {verification['found']}/{verification['total']}")
    print(f"Total Missing: {verification['missing']}/{verification['total']}")
    print("########################################")

    # Show per-folder breakdown
    print("\n=== Per-Folder Breakdown ===")
    for folder_name, results in verification["folders"].items():
        print(f"{folder_name}: {results['found']}/{results['total']} images found")

    # Example 2: Create merged dataset from all folders
    print("\n=== Creating Merged Dataset ===")
    ds = create_dataset_from_folders(args.ref_path)
    print("\n######## Merged Dataset output ################################")
    print(f"Final dataset size: {len(ds)} entries")
    print("########################################")

    # Example 3: Analyze the merged dataset
    print("\n=== Dataset Analysis ===")
    print(ds)

    # Example 4: Save merged dataset
    print("\n=== Saving Merged Dataset ===")
    save_dataset(ds, "alt_text_merged_dataset_no_llm_check")

    # Example 5: Load dataset
    print("\n=== Loading Dataset ===")
    loaded_ds = load_dataset_from_disk("alt_text_merged_dataset_no_llm_check")
    print(f"Loaded {len(loaded_ds)} entries")

    if args.push_to_hub:
        # Push to the Hugging Face Hub (optional)
        push_to_hub_example(repo_id=args.repo_id, token=args.token)  # see push_to_hub_example above for details

@@ -0,0 +1,10 @@

# Folder structure

- [analisi_esercitazione_12_2025](analisi_esercitazione_12_2025) the first notebook; it includes the similarity calculations and some basic EDA analysis
- [analisi_esercitazione_12_2025_advanced](analisi_esercitazione_12_2025_advanced) the notebook contains the analysis of language switching
- [analisi_esercitazione_12_2025_embedding](analisi_esercitazione_12_2025_embedding) the notebook builds a classifier on the semantic representation of the text generated by the LLM
- [analisi_esercitazione_12_2025_classificatore](analisi_esercitazione_12_2025_classificatore) the notebook builds a classifier based on features related to user-LLM alt-text similarities, text readability, etc.
- [analisi_esercitazione_ricostruzione_associazioni](analisi_esercitazione_ricostruzione_associazioni) the notebook rebuilds the exercise dataset from the DB dumps of the UI and backend microservices
- [analisi_esercitazione_12_2025_models_comparisons](analisi_esercitazione_12_2025_models_comparisons) starting from the rebuilt exercise dataset, the notebook runs some tests switching the LLM model and/or the prompt

File diff suppressed because one or more lines are too long (6 files)

@@ -0,0 +1,105 @@
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import torch
from bert_score import score
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn
import re


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return text


def cosine_similarity(a, b):
    return np.dot(a, b) / (
        np.linalg.norm(a) * np.linalg.norm(b) + 1e-10
    )  # Use epsilon for numerical stability


def semantic_similarity(text1, text2):
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    inputs1 = tokenizer(
        text1, return_tensors="pt"
    )  # no preprocessing: the neural models are trained to handle natural text variations
    inputs2 = tokenizer(text2, return_tensors="pt")
    model.eval()
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    embedding1 = (
        outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
    )  # the average of all token embeddings as representation
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()

    return cosine_similarity(embedding1, embedding2)
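

# Note: semantic_similarity above reloads BERT on every call. A minimal caching
# sketch (an optional optimization, not part of the original pipeline; _get_bert
# is a hypothetical helper name), so repeated calls reuse the same weights:
from functools import lru_cache


@lru_cache(maxsize=1)
def _get_bert():
    # Load the tokenizer and model once; subsequent calls return the cached pair
    return (
        BertTokenizer.from_pretrained("bert-base-uncased"),
        BertModel.from_pretrained("bert-base-uncased"),
    )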


def semantic_similarity_sentence_transformer(text1, text2):
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0

    # Purpose-built for sentence embeddings
    model = SentenceTransformer(
        "all-MiniLM-L6-v2"
    )  # no preprocessing: the neural models are trained to handle natural text variations
    embeddings = model.encode(
        [text1, text2],
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )  # output_value="sentence_embedding" produces only one representation per sentence (the average of token embeddings)
    return cosine_similarity(embeddings[0], embeddings[1])


def extract_semantic_representation(text):
    # Handle empty strings explicitly
    if not text.strip():
        return 0.0

    # Purpose-built for sentence embeddings
    model = SentenceTransformer(
        "all-MiniLM-L6-v2"
    )  # no preprocessing: the neural models are trained to handle natural text variations
    embeddings = model.encode(
        [text],
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )  # output_value="sentence_embedding" produces only one representation per sentence (the average of token embeddings)
    return embeddings


def lexical_similarity(text1, text2):
    #vectorizer = TfidfVectorizer(stop_words=None, analyzer="char", ngram_range=(1, 3))
    vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1))
    text1 = preprocess_text(text1)  # only the lexical measure needs preprocessing
    text2 = preprocess_text(text2)
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    vec1 = tfidf_matrix.toarray()[0]
    vec2 = tfidf_matrix.toarray()[1]
    return cosine_similarity(vec1, vec2)


def bert_score_similarity(texts1, texts2, batch=False):
    P, R, F1 = score(  # no preprocessing: the neural models are trained to handle natural text variations
        texts1,
        texts2,
        lang="en",
        verbose=False,
        model_type="bert-base-uncased",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        batch_size=32,
    )
    return F1.tolist() if batch else F1.item()
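

# Usage sketch (illustrative strings; note bert_score_similarity expects lists of texts):
if __name__ == "__main__":
    a = "a dog running on the beach"
    b = "a puppy sprinting along the shore"
    print("lexical:", lexical_similarity(a, b))
    print("semantic (BERT mean-pooled):", semantic_similarity(a, b))
    print("semantic (sentence-transformer):", semantic_similarity_sentence_transformer(a, b))
    print("BERTScore F1:", bert_score_similarity([a], [b]))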

@@ -0,0 +1,166 @@

import json
import time
import urllib.request
import urllib.parse
import logging
import os
import requests
import base64
import re


def call_API_urlibrequest(
    data={},
    verbose=False,
    url="",
    headers=[],
    method="post",
    base=2,  # number of seconds to wait
    max_tries=3,
):
    if verbose:
        logging.info("input_data:%s", data)

    # Allow multiple attempts to call the API in case of downtime.
    # Return the last error response to the user after max_tries failed attempts.
    wait_seconds = [base**i for i in range(max_tries)]

    for num_tries in range(max_tries):
        try:
            if method == "get":
                # Encode the parameters and append them to the URL
                query_string = urllib.parse.urlencode(data)

                url_with_params = f"{url}?{query_string}"
                request = urllib.request.Request(url_with_params, method="GET")
                for ele in headers:
                    request.add_header(ele[0], ele[1])

            elif method == "post":
                # Convert the dictionary to a JSON formatted string and encode it to bytes
                data_to_send = json.dumps(data).encode("utf-8")

                request = urllib.request.Request(url, data=data_to_send, method="POST")
                for ele in headers:
                    request.add_header(ele[0], ele[1])
            else:
                return {"error_message": "method_not_allowed"}

            # Send the request and capture the response
            with urllib.request.urlopen(request) as response:
                # Read and decode the response
                response_json = json.loads(response.read().decode("utf-8"))
                logging.info("response_json:%s", response_json)

                logging.info("response.status_code:%s", response.getcode())
                return response_json

        except Exception as e:
            logging.error("error message:%s", e)
            response_json = {"error": e}

            logging.info("num_tries:%s", num_tries)
            logging.info(
                "Waiting %s seconds before automatically trying again.",
                str(wait_seconds[num_tries]),
            )
            time.sleep(wait_seconds[num_tries])

    logging.info(
        "Tried %s times to make API call to get a valid response object", max_tries
    )
    logging.info("Returning provided response")
    return response_json
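

# Usage sketch (hypothetical endpoint and header values, shown for illustration only):
# response = call_API_urlibrequest(
#     data={"model": "llava", "prompt": "Describe the image"},
#     url="https://api.example.com/v1/generate",
#     headers=[("Content-Type", "application/json"), ("Authorization", "Bearer <token>")],
#     method="post",
# )
# With the defaults (base=2, max_tries=3), failures back off 1s, 2s, 4s before
# the last error response is returned.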


def parse_mllm_alt_text_response(mllm_response):
    """
    Parse an MLLM response string and extract key attributes into a JSON object.

    Turns an MLLM response like:
        ```json\n{\n"Original alt-text assessment"... etc.
    into a structured dictionary.

    Args:
        mllm_response (str): The raw MLLM response text containing JSON data

    Returns:
        dict: A dictionary containing the extracted attributes, with None values if parsing fails
    """
    try:
        # Handle NaN or None values
        if mllm_response is None or mllm_response == "":
            return {
                "original_alt_text_assessment": None,
                "assessment": None,
                "evaluation_result": None,
                "new_alt_text": None
            }

        # Extract JSON content between ```json and ``` markers
        json_match = re.search(r'```json\s*(.*?)\s*```', mllm_response, re.DOTALL)

        if not json_match:
            # Try to find JSON without markdown code blocks
            json_match = re.search(r'\{.*\}', mllm_response, re.DOTALL)

        if not json_match:
            return {
                "original_alt_text_assessment": None,
                "assessment": None,
                "evaluation_result": None,
                "new_alt_text": None
            }

        json_str = json_match.group(1) if '```json' in mllm_response else json_match.group(0)

        # Parse the JSON string
        parsed_data = json.loads(json_str)

        # Create a structured output with the key attributes
        result = {
            "original_alt_text_assessment": parsed_data.get("Original alt-text assessment", ""),
            "assessment": parsed_data.get("Assessment", ""),
            "evaluation_result": parsed_data.get("EvaluationResult", ""),
            "new_alt_text": parsed_data.get("New alt-text", "")
        }

        return result

    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return {
            "original_alt_text_assessment": None,
            "assessment": None,
            "evaluation_result": None,
            "new_alt_text": None
        }
    except Exception as e:
        print(f"Error parsing MLLM response: {e}")
        return {
            "original_alt_text_assessment": None,
            "assessment": None,
            "evaluation_result": None,
            "new_alt_text": None
        }
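

# Usage sketch: a typical raw reply and the parsed result (field values are illustrative):
# raw = '```json\n{"Original alt-text assessment": "too generic", "Assessment": "fail",
#         "EvaluationResult": "0", "New alt-text": "A red bicycle leaning against a brick wall"}\n```'
# parse_mllm_alt_text_response(raw)
# -> {"original_alt_text_assessment": "too generic", "assessment": "fail",
#     "evaluation_result": "0", "new_alt_text": "A red bicycle leaning against a brick wall"}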


def encode_image_from_url(image_url):
    response = requests.get(image_url)
    return base64.b64encode(response.content).decode("utf-8")

@@ -0,0 +1,290 @@

import re
from collections import Counter

"""
For English texts:

    Flesch Reading Ease score
    Flesch-Kincaid Grade Level
    Gunning Fog Index

For Italian texts:

    Flesch Reading Ease (adapted for Italian with the Flesch-Vacca formula)
    Gulpease Index (specifically designed for Italian)
    Gunning Fog Index

Basic statistics for both:

    Sentence count
    Word count
    Syllable count
    Complex words (3+ syllables)
    Average words per sentence
    Average syllables per word
"""


class ReadabilityAnalyzer:
    """Analyze text readability for English and Italian"""

    def __init__(self, text, language='en'):
        self.text = text
        self.language = language.lower()
        self.sentences = self._count_sentences()
        self.words = self._count_words()
        self.syllables = self._count_syllables()
        self.complex_words = self._count_complex_words()
        self.characters = len(re.sub(r'\s', '', text))

    def _count_sentences(self):
        """Count sentences in text"""
        sentences = re.split(r'[.!?]+', self.text)
        return len([s for s in sentences if s.strip()])

    def _count_words(self):
        """Count words in text"""
        words = re.findall(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b', self.text)
        return len(words)

    def _count_syllables(self):
        """Count syllables in text (approximation for both languages)"""
        words = re.findall(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b', self.text.lower())
        total_syllables = 0

        for word in words:
            if self.language == 'it':
                syllables = self._count_syllables_italian(word)
            else:
                syllables = self._count_syllables_english(word)
            total_syllables += syllables

        return total_syllables

    def _count_syllables_english(self, word):
        """Count syllables in English word"""
        word = word.lower()
        vowels = 'aeiouy'
        syllables = 0
        previous_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                syllables += 1
            previous_was_vowel = is_vowel

        # Adjust for silent e
        if word.endswith('e'):
            syllables -= 1

        # Ensure at least 1 syllable
        if syllables == 0:
            syllables = 1

        return syllables
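    # Note on the heuristic above (added comment): each run of consecutive vowels
    # counts as one syllable (e.g. "beautiful" -> b-eau-t-i-f-u-l -> 3), and the
    # silent-e rule can undercount (e.g. "table" -> 1). It is an approximation,
    # not an exact syllabifier.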

    def _count_syllables_italian(self, word):
        """Count syllables in Italian word"""
        word = word.lower()
        vowels = 'aeiouàèéìòùáíóúý'
        syllables = 0
        previous_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                syllables += 1
            previous_was_vowel = is_vowel

        # Ensure at least 1 syllable
        if syllables == 0:
            syllables = 1

        return syllables

    def _count_complex_words(self):
        """Count words with 3+ syllables"""
        words = re.findall(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b', self.text.lower())
        complex_count = 0

        for word in words:
            if self.language == 'it':
                syllables = self._count_syllables_italian(word)
            else:
                syllables = self._count_syllables_english(word)

            if syllables >= 3:
                complex_count += 1

        return complex_count

    def flesch_reading_ease(self):
        """Calculate Flesch Reading Ease score"""
        if self.words == 0 or self.sentences == 0:
            return 0

        if self.language == 'it':
            # Flesch-Vacca formula for Italian
            score = 206.835 - 1.3 * (self.words / self.sentences) - 60.1 * (self.syllables / self.words)
        else:
            # Standard Flesch formula for English
            score = 206.835 - 1.015 * (self.words / self.sentences) - 84.6 * (self.syllables / self.words)

        return round(score, 2)

    def flesch_kincaid_grade(self):
        """Calculate Flesch-Kincaid Grade Level (primarily for English)"""
        if self.words == 0 or self.sentences == 0:
            return 0

        grade = 0.39 * (self.words / self.sentences) + 11.8 * (self.syllables / self.words) - 15.59
        return round(grade, 2)

    def gunning_fog_index(self):
        """Calculate Gunning Fog Index"""
        if self.words == 0 or self.sentences == 0:
            return 0

        fog = 0.4 * ((self.words / self.sentences) + 100 * (self.complex_words / self.words))
        return round(fog, 2)

    def gulpease_index(self):
        """Calculate Gulpease Index (for Italian)"""
        if self.words == 0:
            return 0

        gulpease = 89 - (self.characters / self.words * 10) + (self.sentences / self.words * 300)
        return round(gulpease, 2)
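    # Worked example for the Gulpease formula above (added comment): with 100
    # characters, 20 words and 2 sentences,
    # gulpease = 89 - (100/20 * 10) + (2/20 * 300) = 89 - 50 + 30 = 69,
    # i.e. readable at a middle-school level per the thresholds used below.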

    def get_all_scores(self):
        """Get all readability scores"""
        scores = {
            'basic_stats': {
                'sentences': self.sentences,
                'words': self.words,
                'syllables': self.syllables,
                'complex_words': self.complex_words,
                'characters': self.characters,
                'avg_words_per_sentence': round(self.words / self.sentences, 2) if self.sentences > 0 else 0,
                'avg_syllables_per_word': round(self.syllables / self.words, 2) if self.words > 0 else 0
            },
            'readability_scores': {}
        }

        # Add appropriate scores based on language
        if self.language == 'it':
            scores['readability_scores']['flesch_reading_ease_it'] = self.flesch_reading_ease()
            scores['readability_scores']['gulpease_index'] = self.gulpease_index()
            scores['readability_scores']['gunning_fog_index'] = self.gunning_fog_index()
        else:
            scores['readability_scores']['flesch_reading_ease'] = self.flesch_reading_ease()
            scores['readability_scores']['flesch_kincaid_grade'] = self.flesch_kincaid_grade()
            scores['readability_scores']['gunning_fog_index'] = self.gunning_fog_index()

        return scores

    def interpret_scores(self):
        """Provide interpretation of readability scores"""
        scores = self.get_all_scores()
        interpretation = []

        if self.language == 'it':
            # Flesch Reading Ease (Italian)
            fre = scores['readability_scores']['flesch_reading_ease_it']
            if fre >= 80:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Molto facile (Very easy)")
            elif fre >= 60:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Facile (Easy)")
            elif fre >= 50:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Abbastanza facile (Fairly easy)")
            elif fre >= 40:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Normale (Normal)")
            elif fre >= 30:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Abbastanza difficile (Fairly difficult)")
            else:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Difficile (Difficult)")

            # Gulpease Index
            gulpease = scores['readability_scores']['gulpease_index']
            if gulpease >= 80:
                interpretation.append(f"Gulpease Index: {gulpease} - Elementare (Elementary school)")
            elif gulpease >= 60:
                interpretation.append(f"Gulpease Index: {gulpease} - Media inferiore (Middle school)")
            elif gulpease >= 40:
                interpretation.append(f"Gulpease Index: {gulpease} - Media superiore (High school)")
            else:
                interpretation.append(f"Gulpease Index: {gulpease} - Universitario (University)")
        else:
            # Flesch Reading Ease (English)
            fre = scores['readability_scores']['flesch_reading_ease']
            if fre >= 90:
                interpretation.append(f"Flesch Reading Ease: {fre} - Very easy (5th grade)")
            elif fre >= 80:
                interpretation.append(f"Flesch Reading Ease: {fre} - Easy (6th grade)")
            elif fre >= 70:
                interpretation.append(f"Flesch Reading Ease: {fre} - Fairly easy (7th grade)")
            elif fre >= 60:
                interpretation.append(f"Flesch Reading Ease: {fre} - Standard (8th-9th grade)")
            elif fre >= 50:
                interpretation.append(f"Flesch Reading Ease: {fre} - Fairly difficult (10th-12th grade)")
            elif fre >= 30:
                interpretation.append(f"Flesch Reading Ease: {fre} - Difficult (College)")
            else:
                interpretation.append(f"Flesch Reading Ease: {fre} - Very difficult (College graduate)")

            # Flesch-Kincaid Grade
            fkg = scores['readability_scores']['flesch_kincaid_grade']
            interpretation.append(f"Flesch-Kincaid Grade: {fkg} (US grade level)")

        # Gunning Fog Index (both languages)
        fog = scores['readability_scores']['gunning_fog_index']
        interpretation.append(f"Gunning Fog Index: {fog} (years of education needed)")

        return '\n'.join(interpretation)


# Example usage
if __name__ == "__main__":
    # English example
    english_text = """
    The quick brown fox jumps over the lazy dog. This is a simple sentence.
    However, more complicated sentences with multisyllabic words can significantly
    increase the complexity of the text and make it harder to read.
    """

    print("=== ENGLISH TEXT ANALYSIS ===")
    analyzer_en = ReadabilityAnalyzer(english_text, language='en')
    scores_en = analyzer_en.get_all_scores()

    print("\nBasic Statistics:")
    for key, value in scores_en['basic_stats'].items():
        print(f"  {key}: {value}")

    print("\nReadability Scores:")
    for key, value in scores_en['readability_scores'].items():
        print(f"  {key}: {value}")

    print("\nInterpretation:")
    print(analyzer_en.interpret_scores())

    # Italian example
    italian_text = """
    La veloce volpe marrone salta sopra il cane pigro. Questa è una frase semplice.
    Tuttavia, frasi più complicate con parole polisillabiche possono aumentare
    significativamente la complessità del testo e renderlo più difficile da leggere.
    """

    print("\n\n=== ITALIAN TEXT ANALYSIS ===")
    analyzer_it = ReadabilityAnalyzer(italian_text, language='it')
    scores_it = analyzer_it.get_all_scores()

    print("\nBasic Statistics:")
    for key, value in scores_it['basic_stats'].items():
        print(f"  {key}: {value}")

    print("\nReadability Scores:")
    for key, value in scores_it['readability_scores'].items():
        print(f"  {key}: {value}")

    print("\nInterpretation:")
    print(analyzer_it.interpret_scores())