Analysis code for the 12-2025 exercise session
This commit is contained in: parent 107473cd4f, commit f4a99f358d
@@ -0,0 +1,546 @@
# build the db with only the image urls and a progressive image counter for each page_url

# to launch: python build_dataset_from_folder_no_llm_check.py --ref_path "C:\cartella_condivisa\MachineLearning\HIISlab\accessibility\notebook_miei\LLM_accessibility_validator\out" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token "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"

from datasets import Dataset, DatasetDict
import datasets
import json
from pathlib import Path
from PIL import Image
import hashlib
import urllib.parse
import argparse


'''
# Dataset metadata
_DESCRIPTION = """\
Dataset for image alt-text assessment and improvement using MLLM responses.
Contains images, original alt-texts, quality assessments, and improved versions.
"""

_CITATION = """\
@misc{alt_text_assessment,
    title={Alt-Text Assessment Dataset},
    year={2024}
}
"""


class AltTextDataset(datasets.GeneratorBasedBuilder):
    """Dataset for alt-text assessment with images and MLLM responses."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "image": datasets.Image(),
                "image_url": datasets.Value("string"),
                "alt_text": datasets.Value("string"),
                "original_alt_text_assessment": datasets.Value("string"),
                "assessment": datasets.Value("string"),
                "evaluation_result": datasets.Value("string"),
                "new_alt_text": datasets.Value("string"),
                #"source_folder": datasets.Value("string"),
            }),
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Define data splits."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "json_filepath": "data.json",
                    "images_dir": "images"
                },
            ),
        ]

    def _generate_examples(self, json_filepath, images_dir):
        """Generate examples from the JSON file and image directory."""
        with open(json_filepath, encoding="utf-8") as f:
            data = json.load(f)

        images_path = Path(images_dir)

        for idx, entry in enumerate(data):
            image_url = entry["image_url"]
            image_filename = url_to_filename(image_url)
            image_path = images_path / image_filename

            # Load image if it exists, otherwise None
            image = str(image_path) if image_path.exists() else None

            yield idx, {
                "image": image,
                "image_url": image_url,
                "alt_text": entry["alt_text"],
                "original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
                "assessment": entry["mllm_response"]["assessment"],
                "evaluation_result": entry["mllm_response"]["evaluation_result"],
                "new_alt_text": entry["mllm_response"]["new_alt_text"],
            }
'''

# ============================================================================
# SIMPLE USAGE FUNCTIONS
# ============================================================================


def url_to_filename(image_url):  # same save step as in the image_extractor dependency
    """
    Convert an image URL to a sanitized filename, following the image_extractor's exact logic.

    Args:
        image_url: The image URL

    Returns:
        Sanitized filename with extension
    """
    # Parse the URL to get the path without query parameters
    parsed_url = urllib.parse.urlparse(image_url)
    url_path = parsed_url.path

    # Get the filename from the path
    filename = url_path.split("/")[-1]
    print(f"Original filename: '{filename}'")

    # Split filename and extension
    if "." in filename:
        image_name, ext = filename.rsplit(".", 1)
        ext = ext.lower()
    else:
        image_name = filename
        ext = "jpg"

    # Validate extension
    if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
        ext = "jpg"

    # Sanitize image name (remove special characters, limit length)
    image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
    image_name = image_name[:50]  # Limit filename length

    # If name is empty after sanitization, create a hash-based name
    if not image_name:
        image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]

    return f"{image_name}.{ext}"
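

# Usage sketch (illustrative URL, not taken from the pipeline): the query string
# is dropped, disallowed characters are stripped, and the extension is
# lowercased and validated against the allow-list above:
# >>> url_to_filename("https://example.com/img/My Photo (1).JPG?size=large")
# 'MyPhoto1.jpg'
# A URL whose name is entirely non-alphanumeric falls back to an md5-based name.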


def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
    """
    Example of how to push a dataset to the Hugging Face Hub.
    You need to authenticate first!
    """
    from huggingface_hub import login

    print("\n=== Pushing Dataset to Hugging Face Hub ===")
    # Method 1: Login interactively (will prompt for token)
    # login()

    # Method 2: Login with token directly
    login(token=token)

    # Method 3: Set token as environment variable
    # export HF_TOKEN="hf_YourTokenHere"
    # Then login() will use it automatically

    # Load your dataset
    ds = load_dataset_from_disk(dataset_path)

    # Combine into a DatasetDict
    ds = DatasetDict(
        {
            "train": ds,
            # "test": test_dataset
        }
    )

    # Push to hub (creates the repo if it doesn't exist)
    ds.push_to_hub(  # Automatically converts to Parquet when uploading to the Hub
        repo_id,  # Replace with your username
        private=False,  # Set True for a private dataset
    )

    print("Dataset pushed successfully!")
    print(f"View at: https://huggingface.co/datasets/{repo_id}")


def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
    """
    Create a Hugging Face Dataset from the JSON files with local images.

    Args:
        json_filepath: Path to the JSON file with the MLLM assessment data
        json_filepath_images: Path to the JSON file with the extracted image data
        images_dir: Directory containing the images (default: "images")

    Returns:
        datasets.Dataset object with images loaded
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    with open(json_filepath_images, "r", encoding="utf-8") as f:
        data_images = json.load(f)

    images_path = Path(images_dir)

    # Flatten the nested structure and load images
    flattened_data = {
        #"image": [],
        "url": [],
        "alt_text": [],
        #"original_alt_text_assessment": [],
        #"assessment": [],
        #"evaluation_result": [],
        #"new_alt_text": [],
        "page_url": [],
        #"html_context": [],
        "image_id": []
    }

    count_entry = 0
    for entry in data_images:
        """
        if (
            entry["mllm_response"]["original_alt_text_assessment"] is None
        ):  # important! skip entries with no MLLM response. not usable data
            print(
                f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
            )
            #count_entry += 1
            #continue  # Skip entries with no MLLM response
            flattened_data["original_alt_text_assessment"].append(str(0))
            flattened_data["assessment"].append(0)
            flattened_data["evaluation_result"].append(0)
        else:
            flattened_data["original_alt_text_assessment"].append(
                str(entry["mllm_response"]["original_alt_text_assessment"])
            )
            flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
            flattened_data["evaluation_result"].append(
                entry["mllm_response"]["evaluation_result"]
            )"""

        image_url = entry["url"]
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename

        # Load image if it exists
        """
        if image_path.exists():
            img = Image.open(image_path)
            flattened_data["image"].append(img)
        else:
            print(f"Warning: Image not found: {image_path}")
            flattened_data["image"].append(None)
        """

        flattened_data["url"].append(image_url)
        flattened_data["alt_text"].append(entry["alt_text"])

        """flattened_data["original_alt_text_assessment"].append(
            str(entry["mllm_response"]["original_alt_text_assessment"])
        )
        flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
        flattened_data["evaluation_result"].append(
            entry["mllm_response"]["evaluation_result"]
        )
        flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"])"""

        flattened_data["page_url"].append(data_images[count_entry]["page_url"])
        #flattened_data["html_context"].append(data_images[count_entry]["html_context"])

        count_entry += 1
        flattened_data["image_id"].append(count_entry)

    print(f"Total valid entries loaded: {len(flattened_data['url'])}")
    return datasets.Dataset.from_dict(flattened_data)
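

# Input sketch (illustrative values): create_dataset_from_json reads the keys
# "url", "alt_text" and "page_url" from each entry of extracted_images.json, e.g.:
# [
#   {
#     "url": "https://example.com/img/photo.jpg",
#     "alt_text": "a photo",
#     "page_url": "https://example.com/page.html"
#   }
# ]
# The MLLM fields from mllm_alttext_assessments.json are currently commented out above.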


def create_dataset_from_folders(
    ref_path,
    json_filename="mllm_alttext_assessments.json",
    json_filename_images="extracted_images.json",
    images_dirname="images",
):
    """
    Create a merged dataset from multiple folders under ref_path.
    Each folder should contain the JSON files and an images subdirectory.

    Args:
        ref_path: Root path containing multiple folders
        json_filename: Name of the assessments JSON file in each folder (default: "mllm_alttext_assessments.json")
        json_filename_images: Name of the extracted-images JSON file in each folder (default: "extracted_images.json")
        images_dirname: Name of images subdirectory (default: "images")

    Returns:
        datasets.Dataset object with all entries merged
    """
    ref_path = Path(ref_path)
    all_datasets = []

    # Find all subdirectories containing the JSON files
    folders_processed = 0

    for folder in ref_path.iterdir():
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        json_path_images = folder / json_filename_images
        images_path = folder / images_dirname

        # Check if both JSON files and the images directory exist
        if not json_path.exists():
            print(f"Skipping {folder.name}: no {json_filename} found")
            continue

        if not json_path_images.exists():
            print(f"Skipping {folder.name}: no {json_filename_images} found")
            continue

        if not images_path.exists():
            print(f"Warning: {folder.name}: images directory not found")
            # continue
            # Continue anyway, images might be optional (from urls only)

        print(f"Processing folder: {folder.name}")

        try:
            # Create dataset for this folder
            ds = create_dataset_from_json(
                str(json_path), str(json_path_images), str(images_path)
            )
            all_datasets.append(ds)

            folders_processed += 1
            print(f"  -> Loaded {len(ds)} entries")
        except Exception as e:
            print(f"Error processing {folder.name}: {e}")
            continue

    if not all_datasets:
        raise ValueError(f"No valid folders found in {ref_path}")

    # Merge all datasets
    print(f"\n=== Merging {folders_processed} folders ===")
    merged_dataset = datasets.concatenate_datasets(all_datasets)
    print(f"Total entries: {len(merged_dataset)}")

    return merged_dataset


def verify_images(json_filepath, images_dir="images"):
    """
    Verify that all images referenced in the JSON exist in the images directory.

    Args:
        json_filepath: Path to JSON file
        images_dir: Directory containing images

    Returns:
        Dict with 'found', 'missing', and 'details' keys
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    images_path = Path(images_dir)

    found = []
    missing = []

    for entry in data:
        image_url = entry["image_url"]
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename
        print(
            "image_url:",
            image_url,
            "image_filename:",
            image_filename,
            "image_path:",
            image_path,
        )

        if image_path.exists():
            found.append(
                {"url": image_url, "filename": image_filename, "path": str(image_path)}
            )
        else:
            missing.append(
                {
                    "url": image_url,
                    "filename": image_filename,
                    "expected_path": str(image_path),
                }
            )

    return {
        "found": len(found),
        "missing": len(missing),
        "total": len(data),
        "details": {"found_images": found, "missing_images": missing},
    }


def verify_images_in_folders(
    ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
):
    """
    Verify images across all folders under ref_path.

    Args:
        ref_path: Root path containing multiple folders
        json_filename: Name of JSON file in each folder
        images_dirname: Name of images subdirectory

    Returns:
        Dict with aggregated verification results
    """
    ref_path = Path(ref_path)
    total_found = 0
    total_missing = 0
    total_entries = 0
    folder_results = {}

    for folder in ref_path.iterdir():
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        images_path = folder / images_dirname

        if not json_path.exists():
            continue

        print(f"Verifying folder: {folder.name}")

        try:
            verification = verify_images(str(json_path), str(images_path))
            folder_results[folder.name] = verification

            total_found += verification["found"]
            total_missing += verification["missing"]
            total_entries += verification["total"]

            print(f"  Found: {verification['found']}/{verification['total']}")

        except Exception as e:
            print(f"  Error: {e}")
            continue

    return {
        "found": total_found,
        "missing": total_missing,
        "total": total_entries,
        "folders": folder_results,
    }


def save_dataset(dataset, output_path):
    """Save dataset in Arrow format (includes images)."""
    dataset.save_to_disk(output_path)
    # print(f"Dataset saved to {output_path}")

    # Or save as JSON
    # dataset.to_json(f"{output_path}/data.json")

    # Or save as CSV
    # dataset.to_csv(f"{output_path}/data.csv")

    # Or save as Parquet
    # dataset.to_parquet(f"{output_path}/data.parquet")


def load_dataset_from_disk(dataset_path):
    """Load a previously saved dataset."""
    return datasets.load_from_disk(dataset_path)


# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--ref_path",
        type=str,
        help="Root path containing multiple folders",
        default="C:\\cartella_condivisa\\MachineLearning\\HIISlab\\accessibility\\notebook_miei\\LLM_accessibility_validator\\out",
    )

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        default=False,
        help="If set, push the merged dataset to the Hugging Face Hub",
    )
    parser.add_argument(
        "--token",
        type=str,
        help="Hugging Face authentication token",
        default="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",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        help="Hugging Face repository ID",
        default="nicolaleo/LLM-alt-text-assessment",
    )
    args = parser.parse_args()

    # Example 1: Verify images across all folders
    print("=== Verifying Images in All Folders ===")
    verification = verify_images_in_folders(args.ref_path)
    print("\n######## Verifier output ################################")
    print(f"Total Found: {verification['found']}/{verification['total']}")
    print(f"Total Missing: {verification['missing']}/{verification['total']}")
    print("########################################")

    # Show per-folder breakdown
    print("\n=== Per-Folder Breakdown ===")
    for folder_name, results in verification["folders"].items():
        print(f"{folder_name}: {results['found']}/{results['total']} images found")

    # Example 2: Create merged dataset from all folders
    print("\n=== Creating Merged Dataset ===")
    ds = create_dataset_from_folders(args.ref_path)
    print("\n######## Merged Dataset output ################################")
    print(f"Final dataset size: {len(ds)} entries")
    print("########################################")

    # Example 3: Analyze the merged dataset
    print("\n=== Dataset Analysis ===")
    print(ds)

    # Example 4: Save merged dataset
    print("\n=== Saving Merged Dataset ===")
    save_dataset(ds, "alt_text_merged_dataset_no_llm_check")

    # Example 5: Load dataset
    print("\n=== Loading Dataset ===")
    loaded_ds = load_dataset_from_disk("alt_text_merged_dataset_no_llm_check")
    print(f"Loaded {len(loaded_ds)} entries")

    if args.push_to_hub:
        # Push to the Hugging Face Hub (optional)
        push_to_hub_example(repo_id=args.repo_id, token=args.token)  # see push_to_hub_example above for details

@@ -0,0 +1,10 @@

# Folder structure

- [analisi_esercitazione_12_2025](analisi_esercitazione_12_2025) the first notebook; it includes the similarity calculations and some basic EDA analysis
- [analisi_esercitazione_12_2025_advanced](analisi_esercitazione_12_2025_advanced) the notebook contains the analysis of language switching
- [analisi_esercitazione_12_2025_embedding](analisi_esercitazione_12_2025_embedding) the notebook builds a classifier on the semantic representation of the text generated by the LLM
- [analisi_esercitazione_12_2025_classificatore](analisi_esercitazione_12_2025_classificatore) the notebook builds a classifier based on features related to user-LLM alt-text similarities, text readability, etc.
- [analisi_esercitazione_ricostruzione_associazioni](analisi_esercitazione_ricostruzione_associazioni) the notebook rebuilds the exercise dataset from the DB dumps of the UI and backend microservices
- [analisi_esercitazione_12_2025_models_comparisons](analisi_esercitazione_12_2025_models_comparisons) starting from the rebuilt exercise dataset, the notebook runs some tests switching the LLM model and/or the prompt

File diff suppressed because one or more lines are too long (6 files)

@@ -0,0 +1,105 @@
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import torch
from bert_score import score
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn
import re


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return text


def cosine_similarity(a, b):
    return np.dot(a, b) / (
        np.linalg.norm(a) * np.linalg.norm(b) + 1e-10
    )  # Use epsilon for numerical stability


def semantic_similarity(text1, text2):
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    inputs1 = tokenizer(
        text1, return_tensors="pt"
    )  # no preprocessing: the neural models are trained to handle natural text variations
    inputs2 = tokenizer(text2, return_tensors="pt")
    model.eval()
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    embedding1 = (
        outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
    )  # the average of all token embeddings as representation
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()

    return cosine_similarity(embedding1, embedding2)
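

# Note: semantic_similarity above reloads BERT on every call. A minimal caching
# sketch (an optional optimization, not part of the original pipeline; _get_bert
# is a hypothetical helper name), so repeated calls reuse the same weights:
from functools import lru_cache


@lru_cache(maxsize=1)
def _get_bert():
    # Load the tokenizer and model once; subsequent calls return the cached pair
    return (
        BertTokenizer.from_pretrained("bert-base-uncased"),
        BertModel.from_pretrained("bert-base-uncased"),
    )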


def semantic_similarity_sentence_transformer(text1, text2):
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0

    # Purpose-built for sentence embeddings
    model = SentenceTransformer(
        "all-MiniLM-L6-v2"
    )  # no preprocessing: the neural models are trained to handle natural text variations
    embeddings = model.encode(
        [text1, text2],
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )  # output_value="sentence_embedding" produces only one representation per sentence (the average of token embeddings)
    return cosine_similarity(embeddings[0], embeddings[1])


def extract_semantic_representation(text):
    # Handle empty strings explicitly
    if not text.strip():
        return 0.0

    # Purpose-built for sentence embeddings
    model = SentenceTransformer(
        "all-MiniLM-L6-v2"
    )  # no preprocessing: the neural models are trained to handle natural text variations
    embeddings = model.encode(
        [text],
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )  # output_value="sentence_embedding" produces only one representation per sentence (the average of token embeddings)
    return embeddings


def lexical_similarity(text1, text2):
    #vectorizer = TfidfVectorizer(stop_words=None, analyzer="char", ngram_range=(1, 3))
    vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1))
    text1 = preprocess_text(text1)  # only the lexical measure needs preprocessing
    text2 = preprocess_text(text2)
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    vec1 = tfidf_matrix.toarray()[0]
    vec2 = tfidf_matrix.toarray()[1]
    return cosine_similarity(vec1, vec2)


def bert_score_similarity(texts1, texts2, batch=False):
    P, R, F1 = score(  # no preprocessing: the neural models are trained to handle natural text variations
        texts1,
        texts2,
        lang="en",
        verbose=False,
        model_type="bert-base-uncased",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        batch_size=32,
    )
    return F1.tolist() if batch else F1.item()
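

# Usage sketch (illustrative strings; note bert_score_similarity expects lists of texts):
if __name__ == "__main__":
    a = "a dog running on the beach"
    b = "a puppy sprinting along the shore"
    print("lexical:", lexical_similarity(a, b))
    print("semantic (BERT mean-pooled):", semantic_similarity(a, b))
    print("semantic (sentence-transformer):", semantic_similarity_sentence_transformer(a, b))
    print("BERTScore F1:", bert_score_similarity([a], [b]))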

@@ -0,0 +1,166 @@

import json
import time
import urllib.request
import urllib.parse
import logging
import os
import requests
import base64
import re


def call_API_urlibrequest(
    data={},
    verbose=False,
    url="",
    headers=[],
    method="post",
    base=2,  # number of seconds to wait
    max_tries=3,
):
    if verbose:
        logging.info("input_data:%s", data)

    # Allow multiple attempts to call the API in case of downtime.
    # Return the last error response to the user after max_tries failed attempts.
    wait_seconds = [base**i for i in range(max_tries)]

    for num_tries in range(max_tries):
        try:
            if method == "get":
                # Encode the parameters and append them to the URL
                query_string = urllib.parse.urlencode(data)

                url_with_params = f"{url}?{query_string}"
                request = urllib.request.Request(url_with_params, method="GET")
                for ele in headers:
                    request.add_header(ele[0], ele[1])

            elif method == "post":
                # Convert the dictionary to a JSON formatted string and encode it to bytes
                data_to_send = json.dumps(data).encode("utf-8")

                request = urllib.request.Request(url, data=data_to_send, method="POST")
                for ele in headers:
                    request.add_header(ele[0], ele[1])
            else:
                return {"error_message": "method_not_allowed"}

            # Send the request and capture the response
            with urllib.request.urlopen(request) as response:
                # Read and decode the response
                response_json = json.loads(response.read().decode("utf-8"))
                logging.info("response_json:%s", response_json)

                logging.info("response.status_code:%s", response.getcode())
                return response_json

        except Exception as e:
            logging.error("error message:%s", e)
            response_json = {"error": e}

            logging.info("num_tries:%s", num_tries)
            logging.info(
                "Waiting %s seconds before automatically trying again.",
                str(wait_seconds[num_tries]),
            )
            time.sleep(wait_seconds[num_tries])

    logging.info(
        "Tried %s times to make API call to get a valid response object", max_tries
    )
    logging.info("Returning provided response")
    return response_json
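

# Usage sketch (hypothetical endpoint and header values, shown for illustration only):
# response = call_API_urlibrequest(
#     data={"model": "llava", "prompt": "Describe the image"},
#     url="https://api.example.com/v1/generate",
#     headers=[("Content-Type", "application/json"), ("Authorization", "Bearer <token>")],
#     method="post",
# )
# With the defaults (base=2, max_tries=3), failures back off 1s, 2s, 4s before
# the last error response is returned.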


def parse_mllm_alt_text_response(mllm_response):
    """
    Parse an MLLM response string and extract key attributes into a JSON object.

    Turns an MLLM response like:
        ```json\n{\n"Original alt-text assessment"... etc.
    into a structured dictionary.

    Args:
        mllm_response (str): The raw MLLM response text containing JSON data

    Returns:
        dict: A dictionary containing the extracted attributes, with None values if parsing fails
    """
    try:
        # Handle NaN or None values
        if mllm_response is None or mllm_response == "":
            return {
                "original_alt_text_assessment": None,
                "assessment": None,
                "evaluation_result": None,
                "new_alt_text": None
            }

        # Extract JSON content between ```json and ``` markers
        json_match = re.search(r'```json\s*(.*?)\s*```', mllm_response, re.DOTALL)

        if not json_match:
            # Try to find JSON without markdown code blocks
            json_match = re.search(r'\{.*\}', mllm_response, re.DOTALL)

        if not json_match:
            return {
                "original_alt_text_assessment": None,
                "assessment": None,
                "evaluation_result": None,
                "new_alt_text": None
            }

        json_str = json_match.group(1) if '```json' in mllm_response else json_match.group(0)

        # Parse the JSON string
        parsed_data = json.loads(json_str)

        # Create a structured output with the key attributes
        result = {
            "original_alt_text_assessment": parsed_data.get("Original alt-text assessment", ""),
            "assessment": parsed_data.get("Assessment", ""),
            "evaluation_result": parsed_data.get("EvaluationResult", ""),
            "new_alt_text": parsed_data.get("New alt-text", "")
        }

        return result

    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return {
            "original_alt_text_assessment": None,
            "assessment": None,
            "evaluation_result": None,
            "new_alt_text": None
        }
    except Exception as e:
        print(f"Error parsing MLLM response: {e}")
        return {
            "original_alt_text_assessment": None,
            "assessment": None,
            "evaluation_result": None,
            "new_alt_text": None
        }
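

# Usage sketch: a typical raw reply and the parsed result (field values are illustrative):
# raw = '```json\n{"Original alt-text assessment": "too generic", "Assessment": "fail",
#         "EvaluationResult": "0", "New alt-text": "A red bicycle leaning against a brick wall"}\n```'
# parse_mllm_alt_text_response(raw)
# -> {"original_alt_text_assessment": "too generic", "assessment": "fail",
#     "evaluation_result": "0", "new_alt_text": "A red bicycle leaning against a brick wall"}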


def encode_image_from_url(image_url):
    response = requests.get(image_url)
    return base64.b64encode(response.content).decode("utf-8")

@@ -0,0 +1,290 @@

import re
from collections import Counter

"""
For English texts:

    Flesch Reading Ease score
    Flesch-Kincaid Grade Level
    Gunning Fog Index

For Italian texts:

    Flesch Reading Ease (adapted for Italian with the Flesch-Vacca formula)
    Gulpease Index (specifically designed for Italian)
    Gunning Fog Index

Basic statistics for both:

    Sentence count
    Word count
    Syllable count
    Complex words (3+ syllables)
    Average words per sentence
    Average syllables per word
"""


class ReadabilityAnalyzer:
    """Analyze text readability for English and Italian"""

    def __init__(self, text, language='en'):
        self.text = text
        self.language = language.lower()
        self.sentences = self._count_sentences()
        self.words = self._count_words()
        self.syllables = self._count_syllables()
        self.complex_words = self._count_complex_words()
        self.characters = len(re.sub(r'\s', '', text))

    def _count_sentences(self):
        """Count sentences in text"""
        sentences = re.split(r'[.!?]+', self.text)
        return len([s for s in sentences if s.strip()])

    def _count_words(self):
        """Count words in text"""
        words = re.findall(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b', self.text)
        return len(words)

    def _count_syllables(self):
        """Count syllables in text (approximation for both languages)"""
        words = re.findall(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b', self.text.lower())
        total_syllables = 0

        for word in words:
            if self.language == 'it':
                syllables = self._count_syllables_italian(word)
            else:
                syllables = self._count_syllables_english(word)
            total_syllables += syllables

        return total_syllables

    def _count_syllables_english(self, word):
        """Count syllables in English word"""
        word = word.lower()
        vowels = 'aeiouy'
        syllables = 0
        previous_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                syllables += 1
            previous_was_vowel = is_vowel

        # Adjust for silent e
        if word.endswith('e'):
            syllables -= 1

        # Ensure at least 1 syllable
        if syllables == 0:
            syllables = 1

        return syllables
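    # Note on the heuristic above (added comment): each run of consecutive vowels
    # counts as one syllable (e.g. "beautiful" -> b-eau-t-i-f-u-l -> 3), and the
    # silent-e rule can undercount (e.g. "table" -> 1). It is an approximation,
    # not an exact syllabifier.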

    def _count_syllables_italian(self, word):
        """Count syllables in Italian word"""
        word = word.lower()
        vowels = 'aeiouàèéìòùáíóúý'
        syllables = 0
        previous_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                syllables += 1
            previous_was_vowel = is_vowel

        # Ensure at least 1 syllable
        if syllables == 0:
            syllables = 1

        return syllables

    def _count_complex_words(self):
        """Count words with 3+ syllables"""
        words = re.findall(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b', self.text.lower())
        complex_count = 0

        for word in words:
            if self.language == 'it':
                syllables = self._count_syllables_italian(word)
            else:
                syllables = self._count_syllables_english(word)

            if syllables >= 3:
                complex_count += 1

        return complex_count

    def flesch_reading_ease(self):
        """Calculate Flesch Reading Ease score"""
        if self.words == 0 or self.sentences == 0:
            return 0

        if self.language == 'it':
            # Flesch-Vacca formula for Italian
            score = 206.835 - 1.3 * (self.words / self.sentences) - 60.1 * (self.syllables / self.words)
        else:
            # Standard Flesch formula for English
            score = 206.835 - 1.015 * (self.words / self.sentences) - 84.6 * (self.syllables / self.words)

        return round(score, 2)

    def flesch_kincaid_grade(self):
        """Calculate Flesch-Kincaid Grade Level (primarily for English)"""
        if self.words == 0 or self.sentences == 0:
            return 0

        grade = 0.39 * (self.words / self.sentences) + 11.8 * (self.syllables / self.words) - 15.59
        return round(grade, 2)

    def gunning_fog_index(self):
        """Calculate Gunning Fog Index"""
        if self.words == 0 or self.sentences == 0:
            return 0

        fog = 0.4 * ((self.words / self.sentences) + 100 * (self.complex_words / self.words))
        return round(fog, 2)

    def gulpease_index(self):
        """Calculate Gulpease Index (for Italian)"""
        if self.words == 0:
            return 0

        gulpease = 89 - (self.characters / self.words * 10) + (self.sentences / self.words * 300)
        return round(gulpease, 2)
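    # Worked example for the Gulpease formula above (added comment): with 100
    # characters, 20 words and 2 sentences,
    # gulpease = 89 - (100/20 * 10) + (2/20 * 300) = 89 - 50 + 30 = 69,
    # i.e. readable at a middle-school level per the thresholds used below.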

    def get_all_scores(self):
        """Get all readability scores"""
        scores = {
            'basic_stats': {
                'sentences': self.sentences,
                'words': self.words,
                'syllables': self.syllables,
                'complex_words': self.complex_words,
                'characters': self.characters,
                'avg_words_per_sentence': round(self.words / self.sentences, 2) if self.sentences > 0 else 0,
                'avg_syllables_per_word': round(self.syllables / self.words, 2) if self.words > 0 else 0
            },
            'readability_scores': {}
        }

        # Add appropriate scores based on language
        if self.language == 'it':
            scores['readability_scores']['flesch_reading_ease_it'] = self.flesch_reading_ease()
            scores['readability_scores']['gulpease_index'] = self.gulpease_index()
            scores['readability_scores']['gunning_fog_index'] = self.gunning_fog_index()
        else:
            scores['readability_scores']['flesch_reading_ease'] = self.flesch_reading_ease()
            scores['readability_scores']['flesch_kincaid_grade'] = self.flesch_kincaid_grade()
            scores['readability_scores']['gunning_fog_index'] = self.gunning_fog_index()

        return scores

    def interpret_scores(self):
        """Provide interpretation of readability scores"""
        scores = self.get_all_scores()
        interpretation = []

        if self.language == 'it':
            # Flesch Reading Ease (Italian)
            fre = scores['readability_scores']['flesch_reading_ease_it']
            if fre >= 80:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Molto facile (Very easy)")
            elif fre >= 60:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Facile (Easy)")
            elif fre >= 50:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Abbastanza facile (Fairly easy)")
            elif fre >= 40:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Normale (Normal)")
            elif fre >= 30:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Abbastanza difficile (Fairly difficult)")
            else:
                interpretation.append(f"Flesch Reading Ease (IT): {fre} - Difficile (Difficult)")

            # Gulpease Index
            gulpease = scores['readability_scores']['gulpease_index']
            if gulpease >= 80:
                interpretation.append(f"Gulpease Index: {gulpease} - Elementare (Elementary school)")
            elif gulpease >= 60:
                interpretation.append(f"Gulpease Index: {gulpease} - Media inferiore (Middle school)")
            elif gulpease >= 40:
                interpretation.append(f"Gulpease Index: {gulpease} - Media superiore (High school)")
            else:
                interpretation.append(f"Gulpease Index: {gulpease} - Universitario (University)")
        else:
            # Flesch Reading Ease (English)
            fre = scores['readability_scores']['flesch_reading_ease']
            if fre >= 90:
                interpretation.append(f"Flesch Reading Ease: {fre} - Very easy (5th grade)")
            elif fre >= 80:
                interpretation.append(f"Flesch Reading Ease: {fre} - Easy (6th grade)")
            elif fre >= 70:
                interpretation.append(f"Flesch Reading Ease: {fre} - Fairly easy (7th grade)")
            elif fre >= 60:
                interpretation.append(f"Flesch Reading Ease: {fre} - Standard (8th-9th grade)")
            elif fre >= 50:
                interpretation.append(f"Flesch Reading Ease: {fre} - Fairly difficult (10th-12th grade)")
            elif fre >= 30:
                interpretation.append(f"Flesch Reading Ease: {fre} - Difficult (College)")
            else:
                interpretation.append(f"Flesch Reading Ease: {fre} - Very difficult (College graduate)")

            # Flesch-Kincaid Grade
            fkg = scores['readability_scores']['flesch_kincaid_grade']
            interpretation.append(f"Flesch-Kincaid Grade: {fkg} (US grade level)")

        # Gunning Fog Index (both languages)
        fog = scores['readability_scores']['gunning_fog_index']
        interpretation.append(f"Gunning Fog Index: {fog} (years of education needed)")

        return '\n'.join(interpretation)


# Example usage
if __name__ == "__main__":
    # English example
    english_text = """
    The quick brown fox jumps over the lazy dog. This is a simple sentence.
    However, more complicated sentences with multisyllabic words can significantly
    increase the complexity of the text and make it harder to read.
    """

    print("=== ENGLISH TEXT ANALYSIS ===")
    analyzer_en = ReadabilityAnalyzer(english_text, language='en')
    scores_en = analyzer_en.get_all_scores()

    print("\nBasic Statistics:")
    for key, value in scores_en['basic_stats'].items():
        print(f"  {key}: {value}")

    print("\nReadability Scores:")
    for key, value in scores_en['readability_scores'].items():
        print(f"  {key}: {value}")

    print("\nInterpretation:")
    print(analyzer_en.interpret_scores())

    # Italian example
    italian_text = """
    La veloce volpe marrone salta sopra il cane pigro. Questa è una frase semplice.
    Tuttavia, frasi più complicate con parole polisillabiche possono aumentare
    significativamente la complessità del testo e renderlo più difficile da leggere.
    """

    print("\n\n=== ITALIAN TEXT ANALYSIS ===")
    analyzer_it = ReadabilityAnalyzer(italian_text, language='it')
    scores_it = analyzer_it.get_all_scores()

    print("\nBasic Statistics:")
    for key, value in scores_it['basic_stats'].items():
        print(f"  {key}: {value}")

    print("\nReadability Scores:")
    for key, value in scores_it['readability_scores'].items():
        print(f"  {key}: {value}")

    print("\nInterpretation:")
    print(analyzer_it.interpret_scores())