analysis code for the 12-2025 exercise

Nicola Leonardi 2025-12-30 10:01:06 +01:00
parent 107473cd4f
commit f4a99f358d
12 changed files with 56878 additions and 7685 deletions

@@ -0,0 +1,546 @@
# build the db with only the image urls and a running image index for each page_url
# to launch: python build_dataset_from_folder_no_llm_check.py --ref_path "C:\cartella_condivisa\MachineLearning\HIISlab\accessibility\notebook_miei\LLM_accessibility_validator\out" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token "hf_YourTokenHere"
from datasets import Dataset, DatasetDict
import datasets
import json
from pathlib import Path
from PIL import Image
import hashlib
import urllib.parse
import argparse
'''
# Dataset metadata
_DESCRIPTION = """\
Dataset for image alt-text assessment and improvement using MLLM responses.
Contains images, original alt-texts, quality assessments, and improved versions.
"""
_CITATION = """\
@misc{alt_text_assessment,
title={Alt-Text Assessment Dataset},
year={2024}
}
"""
class AltTextDataset(datasets.GeneratorBasedBuilder):
"""Dataset for alt-text assessment with images and MLLM responses."""
VERSION = datasets.Version("1.0.0")
def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features({
"image": datasets.Image(),
"image_url": datasets.Value("string"),
"alt_text": datasets.Value("string"),
"original_alt_text_assessment": datasets.Value("string"),
"assessment": datasets.Value("string"),
"evaluation_result": datasets.Value("string"),
"new_alt_text": datasets.Value("string"),
#"source_folder": datasets.Value("string"),
}),
citation=_CITATION,
)
def _split_generators(self, dl_manager):
"""Define data splits."""
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"json_filepath": "data.json",
"images_dir": "images"
},
),
]
def _generate_examples(self, json_filepath, images_dir):
"""Generate examples from JSON file and image directory."""
with open(json_filepath, encoding="utf-8") as f:
data = json.load(f)
images_path = Path(images_dir)
for idx, entry in enumerate(data):
image_url = entry["image_url"]
image_filename = url_to_filename(image_url)
image_path = images_path / image_filename
# Load image if exists, otherwise None
image = str(image_path) if image_path.exists() else None
yield idx, {
"image": image,
"image_url": image_url,
"alt_text": entry["alt_text"],
"original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
"assessment": entry["mllm_response"]["assessment"],
"evaluation_result": entry["mllm_response"]["evaluation_result"],
"new_alt_text": entry["mllm_response"]["new_alt_text"],
}
'''
# ============================================================================
# SIMPLE USAGE FUNCTIONS
# ============================================================================
def url_to_filename(image_url): # mirrors the save step of the image_extractor dependency
"""
Convert an image URL to a sanitized filename, following the same logic as the image_extractor save step.
Args:
image_url: The image URL
Returns:
Sanitized filename with extension
"""
# Parse the URL to get the path without query parameters
parsed_url = urllib.parse.urlparse(image_url)
url_path = parsed_url.path
# Get the filename from the path
filename = url_path.split("/")[-1]
print(f"Original filename: '{filename}'")
# Split filename and extension
if "." in filename:
image_name, ext = filename.rsplit(".", 1)
ext = ext.lower()
else:
image_name = filename
ext = "jpg"
# Validate extension
if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
ext = "jpg"
# Sanitize image name (remove special characters, limit length)
image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
image_name = image_name[:50] # Limit filename length
# If name is empty after sanitization, create a hash-based name
if not image_name:
image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
return f"{image_name}.{ext}"
def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
"""
Example of how to push dataset to Hugging Face Hub.
You need to authenticate first!
"""
from huggingface_hub import login
print("\n=== Pushing Dataset to Hugging Face Hub ===")
# Method 1: Login interactively (will prompt for token)
# login()
# Method 2: Login with token directly
login(token=token)
# Method 3: Set token as environment variable
# export HF_TOKEN="hf_YourTokenHere"
# Then login() will use it automatically
# Load your dataset
ds = load_dataset_from_disk(dataset_path)
# Combine into DatasetDict
ds = DatasetDict(
{
"train": ds,
# #"test": test_dataset
}
)
# Push to hub (creates repo if it doesn't exist)
ds.push_to_hub( # Automatically converts to Parquet when uploading to Hub
repo_id, # Replace with your username
private=False, # Set True for private dataset
)
print("Dataset pushed successfully!")
print(f"View at: https://huggingface.co/datasets/{repo_id}")
def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
"""
Create a Hugging Face Dataset from JSON file with local images.
Args:
json_filepath: Path to the MLLM assessments JSON file
json_filepath_images: Path to the extracted-images JSON file (e.g. extracted_images.json)
images_dir: Directory containing the images (default: "images")
Returns:
datasets.Dataset object with images loaded
"""
with open(json_filepath, "r", encoding="utf-8") as f:
data = json.load(f)
with open(json_filepath_images, "r", encoding="utf-8") as f:
data_images = json.load(f)
images_path = Path(images_dir)
# Flatten the nested structure and load images
flattened_data = {
#"image": [],
"url": [],
"alt_text": [],
#"original_alt_text_assessment": [],
#"assessment": [],
#"evaluation_result": [],
#"new_alt_text": [],
"page_url": [],
#"html_context": [],
"image_id":[]
}
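# Columns actually populated in this no-LLM-check build (the commented-out fields above are skipped):
# "url", "alt_text", "page_url", and "image_id" (a 1-based running index within the folder).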
count_entry = 0
for entry in data_images:
"""
if (
entry["mllm_response"]["original_alt_text_assessment"] is None
): # important! skip entries with no MLLM response. not usable data
print(
f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
)
#count_entry += 1
#continue # Skip entries with no MLLM response
flattened_data["original_alt_text_assessment"].append(str(0))
flattened_data["assessment"].append(0)
flattened_data["evaluation_result"].append(0)
else:
flattened_data["original_alt_text_assessment"].append(
str(entry["mllm_response"]["original_alt_text_assessment"])
)
flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
flattened_data["evaluation_result"].append(
entry["mllm_response"]["evaluation_result"]
)"""
image_url = entry["url"]
image_filename = url_to_filename(image_url)
image_path = images_path / image_filename
# Load image if it exists
"""
if image_path.exists():
img = Image.open(image_path)
flattened_data["image"].append(img)
else:
print(f"Warning: Image not found: {image_path}")
flattened_data["image"].append(None)
"""
flattened_data["url"].append(image_url)
flattened_data["alt_text"].append(entry["alt_text"])
"""flattened_data["original_alt_text_assessment"].append(
str(entry["mllm_response"]["original_alt_text_assessment"])
)
flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
flattened_data["evaluation_result"].append(
entry["mllm_response"]["evaluation_result"]
)
flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"])"""
flattened_data["page_url"].append(data_images[count_entry]["page_url"])
#flattened_data["html_context"].append(data_images[count_entry]["html_context"])
count_entry += 1
flattened_data["image_id"].append(count_entry)
print(f"Total valid entries loaded: {len(flattened_data['url'])}")
return datasets.Dataset.from_dict(flattened_data)
def create_dataset_from_folders(
ref_path,
json_filename="mllm_alttext_assessments.json",
json_filename_images="extracted_images.json",
images_dirname="images",
):
"""
Create a merged dataset from multiple folders under ref_path.
Each folder should contain a JSON file and an images subdirectory.
Args:
ref_path: Root path containing multiple folders
json_filename: Name of the assessments JSON file in each folder (default: "mllm_alttext_assessments.json")
json_filename_images: Name of the extracted-images JSON file in each folder (default: "extracted_images.json")
images_dirname: Name of images subdirectory (default: "images")
Returns:
datasets.Dataset object with all entries merged
"""
ref_path = Path(ref_path)
all_datasets = []
# Find all subdirectories containing the JSON file
folders_processed = 0
for folder in ref_path.iterdir():
if not folder.is_dir():
continue
json_path = folder / json_filename
json_path_images = folder / json_filename_images
images_path = folder / images_dirname
# Check if both JSON and images directory exist
if not json_path.exists():
print(f"Skipping {folder.name}: no {json_filename} found")
continue
if not json_path_images.exists():
print(f"Skipping {folder.name}: no {json_filename_images} found")
continue
if not images_path.exists():
print(f"Warning: {folder.name}: images directory not found")
# continue
# Continue anyway, images might be optional (from urls only)
print(f"Processing folder: {folder.name}")
try:
# Create dataset for this folder
ds = create_dataset_from_json(
str(json_path), str(json_path_images), str(images_path)
)
all_datasets.append(ds)
folders_processed += 1
print(f" -> Loaded {len(ds)} entries")
except Exception as e:
print(f"Error processing {folder.name}: {e}")
continue
if not all_datasets:
raise ValueError(f"No valid folders found in {ref_path}")
# Merge all datasets
print(f"\n=== Merging {folders_processed} folders ===")
merged_dataset = datasets.concatenate_datasets(all_datasets)
print(f"Total entries: {len(merged_dataset)}")
return merged_dataset
def verify_images(json_filepath, images_dir="images"):
"""
Verify that all images referenced in JSON exist in the images directory.
Args:
json_filepath: Path to JSON file
images_dir: Directory containing images
Returns:
Dict with 'found', 'missing', and 'details' keys
"""
with open(json_filepath, "r", encoding="utf-8") as f:
data = json.load(f)
images_path = Path(images_dir)
found = []
missing = []
for entry in data:
image_url = entry["image_url"]
image_filename = url_to_filename(image_url)
image_path = images_path / image_filename
print(
"image_url:",
image_url,
"image_filename:",
image_filename,
"image_path:",
image_path,
)
if image_path.exists():
found.append(
{"url": image_url, "filename": image_filename, "path": str(image_path)}
)
else:
missing.append(
{
"url": image_url,
"filename": image_filename,
"expected_path": str(image_path),
}
)
return {
"found": len(found),
"missing": len(missing),
"total": len(data),
"details": {"found_images": found, "missing_images": missing},
}
def verify_images_in_folders(
ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
):
"""
Verify images across all folders under ref_path.
Args:
ref_path: Root path containing multiple folders
json_filename: Name of JSON file in each folder
images_dirname: Name of images subdirectory
Returns:
Dict with aggregated verification results
"""
ref_path = Path(ref_path)
total_found = 0
total_missing = 0
total_entries = 0
folder_results = {}
for folder in ref_path.iterdir():
if not folder.is_dir():
continue
json_path = folder / json_filename
images_path = folder / images_dirname
if not json_path.exists():
continue
print(f"Verifying folder: {folder.name}")
try:
verification = verify_images(str(json_path), str(images_path))
folder_results[folder.name] = verification
total_found += verification["found"]
total_missing += verification["missing"]
total_entries += verification["total"]
print(f" Found: {verification['found']}/{verification['total']}")
except Exception as e:
print(f" Error: {e}")
continue
return {
"found": total_found,
"missing": total_missing,
"total": total_entries,
"folders": folder_results,
}
def save_dataset(dataset, output_path):
"""Save dataset in Arrow format (includes images)."""
dataset.save_to_disk(output_path)
# print(f"Dataset saved to {output_path}")
# Or save as JSON
# dataset.to_json(f"{output_path}/data.json")
# Or save as CSV
# dataset.to_csv(f"{output_path}/data.csv")
# Or save as Parquet
# dataset.to_parquet(f"{output_path}/data.parquet")
def load_dataset_from_disk(dataset_path):
"""Load a previously saved dataset."""
return datasets.load_from_disk(dataset_path)
# ============================================================================
# EXAMPLE USAGE
# ============================================================================
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--ref_path",
type=str,
help=("Root path containing multiple folders"),
default="C:\\cartella_condivisa\\MachineLearning\\HIISlab\\accessibility\\notebook_miei\\LLM_accessibility_validator\\out",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
default=False,
help=("If True push the merged dataset to Hugging Face Hub"),
)
parser.add_argument(
"--token",
type=str,
help=("Hugging Face authentication token"),
default="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",
)
parser.add_argument(
"--repo_id",
type=str,
help=("Hugging Face repository ID"),
default="nicolaleo/LLM-alt-text-assessment",
)
args = parser.parse_args()
# Example 1: Verify images across all folders
print("=== Verifying Images in All Folders ===")
verification = verify_images_in_folders(args.ref_path)
print("\n######## Verifier output ################################")
print(f"Total Found: {verification['found']}/{verification['total']}")
print(f"Total Missing: {verification['missing']}/{verification['total']}")
print("########################################")
# Show per-folder breakdown
print("\n=== Per-Folder Breakdown ===")
for folder_name, results in verification["folders"].items():
print(f"{folder_name}: {results['found']}/{results['total']} images found")
# Example 2: Create merged dataset from all folders
print("\n=== Creating Merged Dataset ===")
ds = create_dataset_from_folders(args.ref_path)
print("\n######## Merged Dataset output ################################")
print(f"Final dataset size: {len(ds)} entries")
print("########################################")
# Example 3: Analyze the merged dataset
print("\n=== Dataset Analysis ===")
print(ds)
# Example 4: Save merged dataset
print("\n=== Saving Merged Dataset ===")
save_dataset(ds, "alt_text_merged_dataset_no_llm_check")
# Example 5: Load dataset
print("\n=== Loading Dataset ===")
loaded_ds = load_dataset_from_disk("alt_text_merged_dataset_no_llm_check")
print(f"Loaded {len(loaded_ds)} entries")
if args.push_to_hub:
# Push to Hugging Face Hub (optional)
push_to_hub_example(repo_id=args.repo_id, token=args.token) # see push_to_hub_example above for details


@@ -0,0 +1,10 @@
# Folder structure
- [analisi_esercitazione_12_2025](analisi_esercitazione_12_2025) first notebook, including the similarity calculations and some basic EDA
- [analisi_esercitazione_12_2025_advanced](analisi_esercitazione_12_2025_advanced) notebook containing the analysis of language switching
- [analisi_esercitazione_12_2025_embedding](analisi_esercitazione_12_2025_embedding) notebook that builds a classifier on the semantic representation of the text generated by the LLM
- [analisi_esercitazione_12_2025_classificatore](analisi_esercitazione_12_2025_classificatore) notebook that builds a classifier based on features related to user-LLM alt-text similarities, text readability, etc.
- [analisi_esercitazione_ricostruzione_associazioni](analisi_esercitazione_ricostruzione_associazioni) notebook that rebuilds the exercise dataset from the DB dumps of the UI and backend microservices
- [analisi_esercitazione_12_2025_models_comparisons](analisi_esercitazione_12_2025_models_comparisons) starting from the rebuilt exercise dataset, the notebook runs some tests switching the LLM model and/or the prompt
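
A minimal sketch of reading the merged dataset back, assuming it has already been pushed to the Hub by `build_dataset_from_folder_no_llm_check.py` and that the repository is public:

```python
# Load the merged alt-text dataset from the Hub (repo_id as used in the build script;
# adjust it if you pushed to a different repository).
from datasets import load_dataset

ds = load_dataset("nicolaleo/LLM-alt-text-assessment", split="train")
print(ds)                                # columns: url, alt_text, page_url, image_id
print(ds[0]["url"], ds[0]["alt_text"])   # inspect the first entry
```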


@@ -0,0 +1,105 @@
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import torch
from bert_score import score
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn
import re
def preprocess_text(text):
text = text.lower()
text = re.sub(r"[^\w\s]", "", text) # Remove punctuation
text = re.sub(r"\s+", " ", text).strip() # Normalize whitespace
return text
def cosine_similarity(a, b):
return np.dot(a, b) / (
np.linalg.norm(a) * np.linalg.norm(b) + 1e-10
) # Use epsilon for numerical stability
def semantic_similarity(text1, text2):
# Handle empty strings explicitly
if not text1.strip() or not text2.strip():
return 0.0
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
inputs1 = tokenizer(
text1, return_tensors="pt"
) # no preprocess: The neural models are trained to handle natural text variations
inputs2 = tokenizer(text2, return_tensors="pt")
model.eval()
with torch.no_grad():
outputs1 = model(**inputs1)
outputs2 = model(**inputs2)
embedding1 = (
outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
) # the average of all token embeddings as representation
embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()
return cosine_similarity(embedding1, embedding2)
def semantic_similarity_sentence_transformer(text1, text2):
# Handle empty strings explicitly
if not text1.strip() or not text2.strip():
return 0.0
# Purpose-built for sentence embeddings
model = SentenceTransformer(
"all-MiniLM-L6-v2"
) # no preprocess: The neural models are trained to handle natural text variations
embeddings = model.encode(
[text1, text2],
output_value="sentence_embedding",
device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) # params "sentence_embedding" to prodcuce only one representation per sentence (the average of token embeddings)
return cosine_similarity(embeddings[0], embeddings[1])
def extract_semantic_representation(text):
# Handle empty strings explicitly
if not text.strip():
return 0.0
# Purpose-built for sentence embeddings
model = SentenceTransformer(
"all-MiniLM-L6-v2"
) # no preprocess: The neural models are trained to handle natural text variations
embeddings = model.encode(
[text],
output_value="sentence_embedding",
device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) # params "sentence_embedding" to prodcuce only one representation per sentence (the average of token embeddings)
return embeddings
def lexical_similarity(text1, text2):
#vectorizer = TfidfVectorizer(stop_words=None, analyzer="char", ngram_range=(1, 3))
vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1))
text1 = preprocess_text(text1) # only lexical needs preprocessing
text2 = preprocess_text(text2)
tfidf_matrix = vectorizer.fit_transform([text1, text2])
vec1 = tfidf_matrix.toarray()[0]
vec2 = tfidf_matrix.toarray()[1]
return cosine_similarity(vec1, vec2)
def bert_score_similarity(texts1, texts2, batch=False):
P, R, F1 = (
score( # no preprocess: The neural models are trained to handle natural text variations
texts1,
texts2,
lang="en",
verbose=False,
model_type="bert-base-uncased",
device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
batch_size=32,
)
)
return F1.tolist() if batch else F1.item()
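# Minimal usage sketch (illustrative alt-text strings; assumes the HF models above can be downloaded):
# compare a user-written alt-text with an LLM-suggested one using the metrics defined here.
if __name__ == "__main__":
    user_alt = "A dog running on the beach"
    llm_alt = "A brown dog runs along the shoreline at sunset"
    print("lexical:", lexical_similarity(user_alt, llm_alt))
    print("sentence-transformer:", semantic_similarity_sentence_transformer(user_alt, llm_alt))
    print("BERTScore F1:", bert_score_similarity([user_alt], [llm_alt]))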


@@ -0,0 +1,166 @@
import json
import time
import urllib.request
import urllib.parse
import logging
import os
import requests
import base64
import re
def call_API_urlibrequest(
data={},
verbose=False,
url="",
headers=[],
method="post",
base=2, # number of seconds to wait
max_tries=3,
):
if verbose:
logging.info("input_data:%s", data)
# Allow multiple attempts to call the API in case of downtime.
# Return the last response/error to the caller after max_tries failed attempts.
wait_seconds = [base**i for i in range(max_tries)]
for num_tries in range(max_tries):
try:
if method == "get":
# Encode the parameters and append them to the URL
query_string = urllib.parse.urlencode(data)
url_with_params = f"{url}?{query_string}"
request = urllib.request.Request(url_with_params, method="GET")
for ele in headers:
request.add_header(ele[0], ele[1])
elif method == "post":
# Convert the dictionary to a JSON formatted string and encode it to bytes
data_to_send = json.dumps(data).encode("utf-8")
request = urllib.request.Request(url, data=data_to_send, method="POST")
for ele in headers:
request.add_header(ele[0], ele[1])
else:
return {"error_message": "method_not_allowed"}
# Send the request and capture the response
with urllib.request.urlopen(request) as response:
# Read and decode the response
response_json = json.loads(response.read().decode("utf-8"))
logging.info("response_json:%s", response_json)
logging.info("response.status_code:%s", response.getcode())
return response_json
except Exception as e:
logging.error("error message:%s", e)
response_json = {"error": e}
logging.info("num_tries:%s", num_tries)
logging.info(
"Waiting %s seconds before automatically trying again.",
str(wait_seconds[num_tries]),
)
time.sleep(wait_seconds[num_tries])
logging.info(
"Tried %s times to make API call to get a valid response object", max_tries
)
logging.info("Returning provided response")
return response_json
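# With the defaults (base=2, max_tries=3) the retry waits are [1, 2, 4] seconds.
# Illustrative call (hypothetical endpoint and payload):
#   call_API_urlibrequest(
#       data={"text": "a dog on the beach"},
#       url="https://api.example.com/assess",
#       headers=[("Content-Type", "application/json")],
#       method="post",
#   )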
def parse_mllm_alt_text_response(mllm_response):
"""
Parse an MLLM response string and extract key attributes into a JSON object.
from mllm response like:
```json\n{\n\"Original alt-text assessment\"... etc
to a structured dictionary.
Args:
mllm_response (str): The raw MLLM response text containing JSON data
Returns:
dict: A dictionary containing the extracted attributes, or None if parsing fails
"""
try:
# Handle NaN or None values
if mllm_response is None or mllm_response == "":
return {
"original_alt_text_assessment": None,
"assessment": None,
"evaluation_result": None,
"new_alt_text": None
}
# Extract JSON content between ```json and ``` markers
json_match = re.search(r'```json\s*(.*?)\s*```', mllm_response, re.DOTALL)
if not json_match:
# Try to find JSON without markdown code blocks
json_match = re.search(r'\{.*\}', mllm_response, re.DOTALL)
if not json_match:
return {
"original_alt_text_assessment": None,
"assessment": None,
"evaluation_result": None,
"new_alt_text": None
}
json_str = json_match.group(1) if '```json' in mllm_response else json_match.group(0)
# Parse the JSON string
parsed_data = json.loads(json_str)
# Create a structured output with the key attributes
result = {
"original_alt_text_assessment": parsed_data.get("Original alt-text assessment", ""),
"assessment": parsed_data.get("Assessment", ""),
"evaluation_result": parsed_data.get("EvaluationResult", ""),
"new_alt_text": parsed_data.get("New alt-text", "")
}
return result
except json.JSONDecodeError as e:
print(f"JSON parsing error: {e}")
return {
"original_alt_text_assessment": None,
"assessment": None,
"evaluation_result": None,
"new_alt_text": None
}
except Exception as e:
print(f"Error parsing MLLM response: {e}")
return {
"original_alt_text_assessment": None,
"assessment": None,
"evaluation_result": None,
"new_alt_text": None
}
def encode_image_from_url(image_url):
response = requests.get(image_url)
return base64.b64encode(response.content).decode("utf-8")
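# Minimal usage sketch for parse_mllm_alt_text_response (hypothetical raw MLLM response):
if __name__ == "__main__":
    raw = (
        '```json\n'
        '{"Original alt-text assessment": "Too generic",\n'
        ' "Assessment": "fail",\n'
        ' "EvaluationResult": "not descriptive",\n'
        ' "New alt-text": "A brown dog running on a sandy beach"}\n'
        '```'
    )
    print(parse_mllm_alt_text_response(raw))
    # -> {'original_alt_text_assessment': 'Too generic', 'assessment': 'fail',
    #     'evaluation_result': 'not descriptive', 'new_alt_text': 'A brown dog running on a sandy beach'}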


@@ -0,0 +1,290 @@
import re
from collections import Counter
"""
For English texts:
Flesch Reading Ease score
Flesch-Kincaid Grade Level
Gunning Fog Index
For Italian texts:
Flesch Reading Ease (adapted for Italian with Flesch-Vacca formula)
Gulpease Index (specifically designed for Italian)
Gunning Fog Index
Basic statistics for both:
Sentence count
Word count
Syllable count
Complex words (3+ syllables)
Average words per sentence
Average syllables per word
"""
class ReadabilityAnalyzer:
"""Analyze text readability for English and Italian"""
def __init__(self, text, language='en'):
self.text = text
self.language = language.lower()
self.sentences = self._count_sentences()
self.words = self._count_words()
self.syllables = self._count_syllables()
self.complex_words = self._count_complex_words()
self.characters = len(re.sub(r'\s', '', text))
def _count_sentences(self):
"""Count sentences in text"""
sentences = re.split(r'[.!?]+', self.text)
return len([s for s in sentences if s.strip()])
def _count_words(self):
"""Count words in text"""
words = re.findall(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b', self.text)
return len(words)
def _count_syllables(self):
"""Count syllables in text (approximation for both languages)"""
words = re.findall(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b', self.text.lower())
total_syllables = 0
for word in words:
if self.language == 'it':
syllables = self._count_syllables_italian(word)
else:
syllables = self._count_syllables_english(word)
total_syllables += syllables
return total_syllables
def _count_syllables_english(self, word):
"""Count syllables in English word"""
word = word.lower()
vowels = 'aeiouy'
syllables = 0
previous_was_vowel = False
for char in word:
is_vowel = char in vowels
if is_vowel and not previous_was_vowel:
syllables += 1
previous_was_vowel = is_vowel
# Adjust for silent e
if word.endswith('e'):
syllables -= 1
# Ensure at least 1 syllable
if syllables == 0:
syllables = 1
return syllables
def _count_syllables_italian(self, word):
"""Count syllables in Italian word"""
word = word.lower()
vowels = 'aeiouàèéìòùáíóúý'
syllables = 0
previous_was_vowel = False
for char in word:
is_vowel = char in vowels
if is_vowel and not previous_was_vowel:
syllables += 1
previous_was_vowel = is_vowel
# Ensure at least 1 syllable
if syllables == 0:
syllables = 1
return syllables
def _count_complex_words(self):
"""Count words with 3+ syllables"""
words = re.findall(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b', self.text.lower())
complex_count = 0
for word in words:
if self.language == 'it':
syllables = self._count_syllables_italian(word)
else:
syllables = self._count_syllables_english(word)
if syllables >= 3:
complex_count += 1
return complex_count
def flesch_reading_ease(self):
"""Calculate Flesch Reading Ease score"""
if self.words == 0 or self.sentences == 0:
return 0
if self.language == 'it':
# Flesch-Vacca formula for Italian
score = 206.835 - 1.3 * (self.words / self.sentences) - 60.1 * (self.syllables / self.words)
else:
# Standard Flesch formula for English
score = 206.835 - 1.015 * (self.words / self.sentences) - 84.6 * (self.syllables / self.words)
return round(score, 2)
def flesch_kincaid_grade(self):
"""Calculate Flesch-Kincaid Grade Level (primarily for English)"""
if self.words == 0 or self.sentences == 0:
return 0
grade = 0.39 * (self.words / self.sentences) + 11.8 * (self.syllables / self.words) - 15.59
return round(grade, 2)
def gunning_fog_index(self):
"""Calculate Gunning Fog Index"""
if self.words == 0 or self.sentences == 0:
return 0
fog = 0.4 * ((self.words / self.sentences) + 100 * (self.complex_words / self.words))
return round(fog, 2)
def gulpease_index(self):
"""Calculate Gulpease Index (for Italian)"""
if self.words == 0:
return 0
gulpease = 89 - (self.characters / self.words * 10) + (self.sentences / self.words * 300)
return round(gulpease, 2)
def get_all_scores(self):
"""Get all readability scores"""
scores = {
'basic_stats': {
'sentences': self.sentences,
'words': self.words,
'syllables': self.syllables,
'complex_words': self.complex_words,
'characters': self.characters,
'avg_words_per_sentence': round(self.words / self.sentences, 2) if self.sentences > 0 else 0,
'avg_syllables_per_word': round(self.syllables / self.words, 2) if self.words > 0 else 0
},
'readability_scores': {}
}
# Add appropriate scores based on language
if self.language == 'it':
scores['readability_scores']['flesch_reading_ease_it'] = self.flesch_reading_ease()
scores['readability_scores']['gulpease_index'] = self.gulpease_index()
scores['readability_scores']['gunning_fog_index'] = self.gunning_fog_index()
else:
scores['readability_scores']['flesch_reading_ease'] = self.flesch_reading_ease()
scores['readability_scores']['flesch_kincaid_grade'] = self.flesch_kincaid_grade()
scores['readability_scores']['gunning_fog_index'] = self.gunning_fog_index()
return scores
def interpret_scores(self):
"""Provide interpretation of readability scores"""
scores = self.get_all_scores()
interpretation = []
if self.language == 'it':
# Flesch Reading Ease (Italian)
fre = scores['readability_scores']['flesch_reading_ease_it']
if fre >= 80:
interpretation.append(f"Flesch Reading Ease (IT): {fre} - Molto facile (Very easy)")
elif fre >= 60:
interpretation.append(f"Flesch Reading Ease (IT): {fre} - Facile (Easy)")
elif fre >= 50:
interpretation.append(f"Flesch Reading Ease (IT): {fre} - Abbastanza facile (Fairly easy)")
elif fre >= 40:
interpretation.append(f"Flesch Reading Ease (IT): {fre} - Normale (Normal)")
elif fre >= 30:
interpretation.append(f"Flesch Reading Ease (IT): {fre} - Abbastanza difficile (Fairly difficult)")
else:
interpretation.append(f"Flesch Reading Ease (IT): {fre} - Difficile (Difficult)")
# Gulpease Index
gulpease = scores['readability_scores']['gulpease_index']
if gulpease >= 80:
interpretation.append(f"Gulpease Index: {gulpease} - Elementare (Elementary school)")
elif gulpease >= 60:
interpretation.append(f"Gulpease Index: {gulpease} - Media inferiore (Middle school)")
elif gulpease >= 40:
interpretation.append(f"Gulpease Index: {gulpease} - Media superiore (High school)")
else:
interpretation.append(f"Gulpease Index: {gulpease} - Universitario (University)")
else:
# Flesch Reading Ease (English)
fre = scores['readability_scores']['flesch_reading_ease']
if fre >= 90:
interpretation.append(f"Flesch Reading Ease: {fre} - Very easy (5th grade)")
elif fre >= 80:
interpretation.append(f"Flesch Reading Ease: {fre} - Easy (6th grade)")
elif fre >= 70:
interpretation.append(f"Flesch Reading Ease: {fre} - Fairly easy (7th grade)")
elif fre >= 60:
interpretation.append(f"Flesch Reading Ease: {fre} - Standard (8th-9th grade)")
elif fre >= 50:
interpretation.append(f"Flesch Reading Ease: {fre} - Fairly difficult (10th-12th grade)")
elif fre >= 30:
interpretation.append(f"Flesch Reading Ease: {fre} - Difficult (College)")
else:
interpretation.append(f"Flesch Reading Ease: {fre} - Very difficult (College graduate)")
# Flesch-Kincaid Grade
fkg = scores['readability_scores']['flesch_kincaid_grade']
interpretation.append(f"Flesch-Kincaid Grade: {fkg} (US grade level)")
# Gunning Fog Index (both languages)
fog = scores['readability_scores']['gunning_fog_index']
interpretation.append(f"Gunning Fog Index: {fog} (years of education needed)")
return '\n'.join(interpretation)
# Example usage
if __name__ == "__main__":
# English example
english_text = """
The quick brown fox jumps over the lazy dog. This is a simple sentence.
However, more complicated sentences with multisyllabic words can significantly
increase the complexity of the text and make it harder to read.
"""
print("=== ENGLISH TEXT ANALYSIS ===")
analyzer_en = ReadabilityAnalyzer(english_text, language='en')
scores_en = analyzer_en.get_all_scores()
print("\nBasic Statistics:")
for key, value in scores_en['basic_stats'].items():
print(f" {key}: {value}")
print("\nReadability Scores:")
for key, value in scores_en['readability_scores'].items():
print(f" {key}: {value}")
print("\nInterpretation:")
print(analyzer_en.interpret_scores())
# Italian example
italian_text = """
Il veloce cane marrone salta sopra il cane pigro. Questa è una frase semplice.
Tuttavia, frasi più complicate con parole polisillabiche possono aumentare
significativamente la complessità del testo e renderlo più difficile da leggere.
"""
print("\n\n=== ITALIAN TEXT ANALYSIS ===")
analyzer_it = ReadabilityAnalyzer(italian_text, language='it')
scores_it = analyzer_it.get_all_scores()
print("\nBasic Statistics:")
for key, value in scores_it['basic_stats'].items():
print(f" {key}: {value}")
print("\nReadability Scores:")
for key, value in scores_it['readability_scores'].items():
print(f" {key}: {value}")
print("\nInterpretation:")
print(analyzer_it.interpret_scores())