# wcag_AI_validation/scripts/esercitazione_12_2025/utils_text_complexity.py
#
# File listing metadata: 290 lines, 11 KiB, Python

import re
from collections import Counter
"""
Readability metrics for English and Italian text.

For English texts:
  - Flesch Reading Ease score
  - Flesch-Kincaid Grade Level
  - Gunning Fog Index

For Italian texts:
  - Flesch Reading Ease (adapted for Italian with the Flesch-Vacca formula)
  - Gulpease Index (specifically designed for Italian)
  - Gunning Fog Index

Basic statistics for both languages:
  - Sentence count
  - Word count
  - Syllable count
  - Complex words (3+ syllables)
  - Average words per sentence
  - Average syllables per word
"""
class ReadabilityAnalyzer:
    """Compute readability statistics and scores for English or Italian text.

    All counts are heuristic approximations: sentences are split on runs of
    ``.``, ``!`` and ``?``; syllables are counted as groups of consecutive
    vowels. The counts are computed once in ``__init__`` and stored as
    attributes; the score methods only combine those stored numbers.

    Attributes:
        text: The analysed text (``None`` is stored as ``''``).
        language: Lower-cased primary language subtag, e.g. ``'it'`` for
            ``'it'``, ``'IT'``, ``'it-IT'`` or ``'it_IT'``. Anything other
            than ``'it'`` is scored with the English rules.
        sentences: Number of non-blank sentence fragments.
        words: Number of alphabetic word tokens.
        syllables: Estimated total syllables over all words.
        complex_words: Number of words with 3 or more estimated syllables.
        characters: Number of non-whitespace characters (punctuation and
            digits included).
        letters: Number of letters inside word tokens (used by Gulpease).
    """

    # Accented vowels accepted by the word tokenizer (lower-case forms).
    _ACCENTED_VOWELS = 'àèéìòùáíóúýâêîôûäëïöü'
    # A word is a run of ASCII letters and the accented vowels above,
    # delimited by word boundaries. Compiled once and shared by every count.
    _WORD_RE = re.compile(
        r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b'
    )
    _SENTENCE_END_RE = re.compile(r'[.!?]+')
    # The vowel sets cover every accented vowel the tokenizer lets through,
    # so an accented vowel is never mistaken for a consonant.
    _ENGLISH_VOWELS = frozenset('aeiouy' + _ACCENTED_VOWELS)
    _ITALIAN_VOWELS = frozenset('aeiou' + _ACCENTED_VOWELS)
    # Minimum syllable count for a word to be considered "complex".
    _COMPLEX_WORD_SYLLABLES = 3

    # (lower bound, label) pairs, highest bound first; the first band whose
    # bound is <= score wins, otherwise the paired fallback label is used.
    _FRE_BANDS_IT = (
        (80, "Molto facile (Very easy)"),
        (60, "Facile (Easy)"),
        (50, "Abbastanza facile (Fairly easy)"),
        (40, "Normale (Normal)"),
        (30, "Abbastanza difficile (Fairly difficult)"),
    )
    _FRE_FALLBACK_IT = "Difficile (Difficult)"
    _GULPEASE_BANDS = (
        (80, "Elementare (Elementary school)"),
        (60, "Media inferiore (Middle school)"),
        (40, "Media superiore (High school)"),
    )
    _GULPEASE_FALLBACK = "Universitario (University)"
    _FRE_BANDS_EN = (
        (90, "Very easy (5th grade)"),
        (80, "Easy (6th grade)"),
        (70, "Fairly easy (7th grade)"),
        (60, "Standard (8th-9th grade)"),
        (50, "Fairly difficult (10th-12th grade)"),
        (30, "Difficult (College)"),
    )
    _FRE_FALLBACK_EN = "Very difficult (College graduate)"

    def __init__(self, text, language='en'):
        """Tokenize *text* once and store all basic counts.

        Args:
            text: Text to analyse. ``None`` is treated as an empty string.
            language: Language code or tag. Only the primary subtag is used
                (``'it-IT'`` -> ``'it'``); ``None`` or ``''`` means English.
        """
        self.text = '' if text is None else text
        self.language = self._normalize_language(language)
        self.sentences = self._count_sentences()

        # Tokenize and syllabify a single time; every word-level statistic
        # is derived from these two lists.
        tokens = self._WORD_RE.findall(self.text)
        per_word = [self._word_syllables(token) for token in tokens]

        self.words = len(tokens)
        self.syllables = sum(per_word)
        self.complex_words = sum(
            1 for count in per_word if count >= self._COMPLEX_WORD_SYLLABLES
        )
        self.characters = len(re.sub(r'\s', '', self.text))
        # Letters only (no punctuation/digits): the quantity Gulpease needs.
        self.letters = sum(len(token) for token in tokens)

    @staticmethod
    def _normalize_language(language):
        """Return the lower-cased primary subtag of a language code or tag."""
        tag = (language or 'en').strip().lower().replace('_', '-')
        return tag.split('-', 1)[0]

    @staticmethod
    def _band_label(score, bands, fallback):
        """Return the label of the first band whose lower bound is <= score."""
        for lower_bound, label in bands:
            if score >= lower_bound:
                return label
        return fallback

    def _word_syllables(self, word):
        """Estimate the syllables of one word using the active language rules."""
        if self.language == 'it':
            return self._count_syllables_italian(word)
        return self._count_syllables_english(word)

    def _syllables_per_word(self):
        """Return the estimated syllable count of every word in the text."""
        return [
            self._word_syllables(token)
            for token in self._WORD_RE.findall(self.text)
        ]

    def _count_sentences(self) -> int:
        """Count non-blank fragments between runs of ``.``, ``!`` and ``?``."""
        fragments = self._SENTENCE_END_RE.split(self.text)
        return sum(1 for fragment in fragments if fragment.strip())

    def _count_words(self) -> int:
        """Count alphabetic word tokens in the text."""
        return len(self._WORD_RE.findall(self.text))

    def _count_syllables(self) -> int:
        """Estimate the total number of syllables in the text."""
        return sum(self._syllables_per_word())

    def _count_syllables_english(self, word) -> int:
        """Estimate syllables in an English word (always at least 1).

        Each run of consecutive vowels (``y`` included) counts as one
        syllable. One syllable is removed for a silent final ``e``, i.e. a
        final ``e`` that directly follows a consonant ("make"), except in a
        consonant + ``le`` ending where the ``e`` is voiced ("apple").
        A final ``e`` that is part of a vowel group ("agree") is not silent.
        """
        word = word.lower()
        vowels = self._ENGLISH_VOWELS
        syllables = 0
        previous_was_vowel = False
        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                syllables += 1
            previous_was_vowel = is_vowel

        if len(word) >= 2 and word[-1] == 'e' and word[-2] not in vowels:
            voiced_le = (
                len(word) >= 3 and word[-2] == 'l' and word[-3] not in vowels
            )
            if not voiced_le:
                syllables -= 1

        # Words such as "the" would otherwise drop to zero.
        return max(syllables, 1)

    def _count_syllables_italian(self, word) -> int:
        """Estimate syllables in an Italian word (always at least 1).

        Each run of consecutive vowels counts as one syllable, so vowel
        sequences that are pronounced separately are under-counted.
        """
        vowels = self._ITALIAN_VOWELS
        syllables = 0
        previous_was_vowel = False
        for char in word.lower():
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                syllables += 1
            previous_was_vowel = is_vowel
        return max(syllables, 1)

    def _count_complex_words(self) -> int:
        """Count words with 3 or more estimated syllables."""
        return sum(
            1
            for count in self._syllables_per_word()
            if count >= self._COMPLEX_WORD_SYLLABLES
        )

    def flesch_reading_ease(self) -> float:
        """Return the Flesch Reading Ease score rounded to 2 decimals.

        Returns 0 when the text has no words or no sentences.
        """
        if self.words == 0 or self.sentences == 0:
            return 0
        words_per_sentence = self.words / self.sentences
        syllables_per_word = self.syllables / self.words
        if self.language == 'it':
            # Flesch-Vacca formula for Italian.
            # NOTE(review): the Franchina-Vacca adaptation is usually quoted
            # as 217 - 1.3*ASL - 60*ASW; the constants below are kept as they
            # were in this file - confirm their source.
            score = 206.835 - 1.3 * words_per_sentence - 60.1 * syllables_per_word
        else:
            # Standard Flesch formula for English.
            score = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
        return round(score, 2)

    def flesch_kincaid_grade(self) -> float:
        """Return the Flesch-Kincaid Grade Level (primarily for English).

        Returns 0 when the text has no words or no sentences.
        """
        if self.words == 0 or self.sentences == 0:
            return 0
        grade = (
            0.39 * (self.words / self.sentences)
            + 11.8 * (self.syllables / self.words)
            - 15.59
        )
        return round(grade, 2)

    def gunning_fog_index(self) -> float:
        """Return the Gunning Fog Index rounded to 2 decimals.

        Returns 0 when the text has no words or no sentences.
        """
        if self.words == 0 or self.sentences == 0:
            return 0
        fog = 0.4 * (
            (self.words / self.sentences)
            + 100 * (self.complex_words / self.words)
        )
        return round(fog, 2)

    def gulpease_index(self) -> float:
        """Return the Gulpease Index (for Italian) rounded to 2 decimals.

        Computed as ``89 - 10 * letters / words + 300 * sentences / words``.
        Only letters inside words are counted, so punctuation and digits do
        not lower the score. Returns 0 when the text has no words.
        """
        if self.words == 0:
            return 0
        gulpease = (
            89
            - (self.letters / self.words * 10)
            + (self.sentences / self.words * 300)
        )
        return round(gulpease, 2)

    def get_all_scores(self) -> dict:
        """Return basic statistics plus the scores relevant to the language.

        Returns:
            A dict with two keys: ``'basic_stats'`` (counts and averages) and
            ``'readability_scores'``. Italian text gets
            ``flesch_reading_ease_it``, ``gulpease_index`` and
            ``gunning_fog_index``; any other language gets
            ``flesch_reading_ease``, ``flesch_kincaid_grade`` and
            ``gunning_fog_index``.
        """
        if self.language == 'it':
            readability = {
                'flesch_reading_ease_it': self.flesch_reading_ease(),
                'gulpease_index': self.gulpease_index(),
                'gunning_fog_index': self.gunning_fog_index(),
            }
        else:
            readability = {
                'flesch_reading_ease': self.flesch_reading_ease(),
                'flesch_kincaid_grade': self.flesch_kincaid_grade(),
                'gunning_fog_index': self.gunning_fog_index(),
            }
        return {
            'basic_stats': {
                'sentences': self.sentences,
                'words': self.words,
                'syllables': self.syllables,
                'complex_words': self.complex_words,
                'characters': self.characters,
                'avg_words_per_sentence': round(self.words / self.sentences, 2) if self.sentences > 0 else 0,
                'avg_syllables_per_word': round(self.syllables / self.words, 2) if self.words > 0 else 0,
            },
            'readability_scores': readability,
        }

    def interpret_scores(self) -> str:
        """Return a newline-joined, human-readable reading of the scores."""
        readability = self.get_all_scores()['readability_scores']
        interpretation = []
        if self.language == 'it':
            # Flesch Reading Ease (Italian)
            fre = readability['flesch_reading_ease_it']
            label = self._band_label(fre, self._FRE_BANDS_IT, self._FRE_FALLBACK_IT)
            interpretation.append(f"Flesch Reading Ease (IT): {fre} - {label}")
            # Gulpease Index
            gulpease = readability['gulpease_index']
            label = self._band_label(
                gulpease, self._GULPEASE_BANDS, self._GULPEASE_FALLBACK
            )
            interpretation.append(f"Gulpease Index: {gulpease} - {label}")
        else:
            # Flesch Reading Ease (English)
            fre = readability['flesch_reading_ease']
            label = self._band_label(fre, self._FRE_BANDS_EN, self._FRE_FALLBACK_EN)
            interpretation.append(f"Flesch Reading Ease: {fre} - {label}")
            # Flesch-Kincaid Grade
            fkg = readability['flesch_kincaid_grade']
            interpretation.append(f"Flesch-Kincaid Grade: {fkg} (US grade level)")
        # Gunning Fog Index (both languages)
        fog = readability['gunning_fog_index']
        interpretation.append(f"Gunning Fog Index: {fog} (years of education needed)")
        return '\n'.join(interpretation)
# Example usage
def _print_report(heading, text, language):
    """Print basic statistics, scores and interpretation for one sample.

    Args:
        heading: Banner line printed before the report.
        text: Sample text to analyse.
        language: Language code passed to ``ReadabilityAnalyzer``.
    """
    print(heading)
    analyzer = ReadabilityAnalyzer(text, language=language)
    scores = analyzer.get_all_scores()
    print("\nBasic Statistics:")
    for key, value in scores['basic_stats'].items():
        print(f" {key}: {value}")
    print("\nReadability Scores:")
    for key, value in scores['readability_scores'].items():
        print(f" {key}: {value}")
    print("\nInterpretation:")
    print(analyzer.interpret_scores())


if __name__ == "__main__":
    # English example
    english_text = """
The quick brown fox jumps over the lazy dog. This is a simple sentence.
However, more complicated sentences with multisyllabic words can significantly
increase the complexity of the text and make it harder to read.
"""
    _print_report("=== ENGLISH TEXT ANALYSIS ===", english_text, 'en')

    # Italian example
    italian_text = """
Il veloce cane marrone salta sopra il cane pigro. Questa è una frase semplice.
Tuttavia, frasi più complicate con parole polisillabiche possono aumentare
significativamente la complessità del testo e renderlo più difficile da leggere.
"""
    _print_report("\n\n=== ITALIAN TEXT ANALYSIS ===", italian_text, 'it')