# wcag_AI_validation/scripts/esercitazione_12_2025/utils_text_complexity.py

import re
from collections import Counter

"""
Readability metrics computed by this module.

For English texts:

Flesch Reading Ease score
Flesch-Kincaid Grade Level
Gunning Fog Index

For Italian texts:

Flesch Reading Ease (adapted for Italian with Flesch-Vacca formula)
Gulpease Index (specifically designed for Italian)
Gunning Fog Index

Basic statistics for both:

Sentence count
Word count
Syllable count
Complex words (3+ syllables)
Average words per sentence
Average syllables per word
"""

class ReadabilityAnalyzer:
    """Compute readability statistics and indices for English or Italian text.

    Every count is taken in ``__init__`` and stored as an attribute:

    * ``sentences``     - non-empty fragments between runs of ``.``, ``!``, ``?``
    * ``words``         - tokens matched by ``_WORD_RE`` (ASCII letters plus a
                          fixed set of accented vowels)
    * ``syllables``     - heuristic syllable total over all words
    * ``complex_words`` - words with three or more syllables
    * ``characters``    - every non-whitespace character, punctuation included
    * ``letters``       - characters belonging to counted words only

    Any ``language`` other than ``'it'`` (compared case-insensitively) is
    handled with the English rules.
    """

    # A word is a maximal run of these letters delimited by word boundaries.
    _WORD_RE = re.compile(r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b')
    # One or more terminators in a row close a single sentence ("Really?!").
    _SENTENCE_END_RE = re.compile(r'[.!?]+')

    # Lower-case accented vowels accepted by _WORD_RE; shared by both languages
    # so that a letter the tokenizer accepts is never mistaken for a consonant.
    _ACCENTED_VOWELS = 'àèéìòùáíóúýâêîôûäëïöü'
    _VOWELS_IT = frozenset('aeiou' + _ACCENTED_VOWELS)
    _VOWELS_EN = frozenset('aeiouy' + _ACCENTED_VOWELS)

    # (minimum score, label) pairs, highest threshold first; the fallback label
    # applies when the score is below every threshold.
    _FRE_BANDS_IT = (
        (80, "Molto facile (Very easy)"),
        (60, "Facile (Easy)"),
        (50, "Abbastanza facile (Fairly easy)"),
        (40, "Normale (Normal)"),
        (30, "Abbastanza difficile (Fairly difficult)"),
    )
    _FRE_FALLBACK_IT = "Difficile (Difficult)"

    _GULPEASE_BANDS = (
        (80, "Elementare (Elementary school)"),
        (60, "Media inferiore (Middle school)"),
        (40, "Media superiore (High school)"),
    )
    _GULPEASE_FALLBACK = "Universitario (University)"

    _FRE_BANDS_EN = (
        (90, "Very easy (5th grade)"),
        (80, "Easy (6th grade)"),
        (70, "Fairly easy (7th grade)"),
        (60, "Standard (8th-9th grade)"),
        (50, "Fairly difficult (10th-12th grade)"),
        (30, "Difficult (College)"),
    )
    _FRE_FALLBACK_EN = "Very difficult (College graduate)"

    def __init__(self, text, language='en'):
        """Store *text* and pre-compute all counts used by the indices.

        Args:
            text: The text to analyse.
            language: ``'it'`` for Italian rules; anything else means English.
        """
        self.text = text
        self.language = language.lower()
        self.sentences = self._count_sentences()
        self.words = self._count_words()
        self.syllables = self._count_syllables()
        self.complex_words = self._count_complex_words()
        self.characters = len(re.sub(r'\s', '', text))
        # Gulpease is defined on letters, so punctuation must not be counted.
        self.letters = sum(len(word) for word in self._words())

    # ------------------------------------------------------------------
    # Tokenisation helpers
    # ------------------------------------------------------------------

    def _words(self):
        """Return the lower-cased words of ``self.text`` in reading order."""
        return self._WORD_RE.findall(self.text.lower())

    def _syllables_per_word(self):
        """Return one syllable count per word, using the language's rules."""
        if self.language == 'it':
            counter = self._count_syllables_italian
        else:
            counter = self._count_syllables_english
        return [counter(word) for word in self._words()]

    @staticmethod
    def _count_vowel_groups(word, vowels):
        """Count maximal runs of consecutive characters drawn from *vowels*."""
        groups = 0
        inside_group = False
        for char in word:
            is_vowel = char in vowels
            if is_vowel and not inside_group:
                groups += 1
            inside_group = is_vowel
        return groups

    @staticmethod
    def _band_label(score, bands, fallback):
        """Return the label of the first band whose threshold *score* reaches."""
        for threshold, label in bands:
            if score >= threshold:
                return label
        return fallback

    # ------------------------------------------------------------------
    # Counters
    # ------------------------------------------------------------------

    def _count_sentences(self):
        """Count non-blank fragments between sentence terminators."""
        fragments = self._SENTENCE_END_RE.split(self.text)
        return sum(1 for fragment in fragments if fragment.strip())

    def _count_words(self):
        """Count words in text."""
        return len(self._words())

    def _count_syllables(self):
        """Count syllables in text (approximation for both languages)."""
        return sum(self._syllables_per_word())

    def _count_syllables_english(self, word):
        """Estimate the syllables of an English word; always at least 1.

        Each run of vowels counts as one syllable. A final ``e`` is treated
        as silent only when it stands alone after a consonant ("make") and
        the word does not end in consonant + "le" ("table"), so words such as
        "agree" or "little" keep their last syllable.
        """
        word = word.lower()
        vowels = ReadabilityAnalyzer._VOWELS_EN
        count = ReadabilityAnalyzer._count_vowel_groups(word, vowels)

        # count > 1 guarantees at least three characters (two vowel runs
        # separated by a consonant), so word[-2] and word[-3] exist.
        if count > 1 and word.endswith('e') and word[-2] not in vowels:
            consonant_le = word[-2] == 'l' and word[-3] not in vowels
            if not consonant_le:
                count -= 1

        return max(count, 1)

    def _count_syllables_italian(self, word):
        """Estimate the syllables of an Italian word; always at least 1.

        Each run of vowels (accented ones included) counts as one syllable.
        """
        groups = ReadabilityAnalyzer._count_vowel_groups(
            word.lower(), ReadabilityAnalyzer._VOWELS_IT
        )
        return max(groups, 1)

    def _count_complex_words(self):
        """Count words with 3+ syllables."""
        return sum(1 for count in self._syllables_per_word() if count >= 3)

    # ------------------------------------------------------------------
    # Indices
    # ------------------------------------------------------------------

    def flesch_reading_ease(self):
        """Calculate the Flesch Reading Ease score (0 when there is no text).

        Returns:
            The score rounded to two decimals.
        """
        if self.words == 0 or self.sentences == 0:
            return 0

        words_per_sentence = self.words / self.sentences
        syllables_per_word = self.syllables / self.words

        if self.language == 'it':
            # Italian variant.
            # NOTE(review): coefficients kept exactly as found; confirm them
            # against the Flesch-Vacca reference this project follows.
            score = 206.835 - 1.3 * words_per_sentence - 60.1 * syllables_per_word
        else:
            # Standard Flesch formula for English
            score = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

        return round(score, 2)

    def flesch_kincaid_grade(self):
        """Calculate the Flesch-Kincaid Grade Level (primarily for English).

        Returns:
            The grade rounded to two decimals, or 0 when there is no text.
        """
        if self.words == 0 or self.sentences == 0:
            return 0

        grade = 0.39 * (self.words / self.sentences) + 11.8 * (self.syllables / self.words) - 15.59
        return round(grade, 2)

    def gunning_fog_index(self):
        """Calculate the Gunning Fog Index from sentence length and complex words.

        Returns:
            The index rounded to two decimals, or 0 when there is no text.
        """
        if self.words == 0 or self.sentences == 0:
            return 0

        fog = 0.4 * ((self.words / self.sentences) + 100 * (self.complex_words / self.words))
        return round(fog, 2)

    def gulpease_index(self):
        """Calculate the Gulpease Index (for Italian).

        Uses the number of letters in counted words, not every non-whitespace
        character, so punctuation does not lower the score.

        Returns:
            The index rounded to two decimals, or 0 when there are no words.
        """
        if self.words == 0:
            return 0

        gulpease = 89 - (self.letters / self.words * 10) + (self.sentences / self.words * 300)
        return round(gulpease, 2)

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def get_all_scores(self):
        """Return basic statistics plus the indices relevant to the language.

        Returns:
            A dict with two keys: ``'basic_stats'`` (raw counts and averages)
            and ``'readability_scores'`` (Flesch + Gulpease + Fog for Italian,
            Flesch + Flesch-Kincaid + Fog otherwise).
        """
        scores = {
            'basic_stats': {
                'sentences': self.sentences,
                'words': self.words,
                'syllables': self.syllables,
                'complex_words': self.complex_words,
                'characters': self.characters,
                'avg_words_per_sentence': round(self.words / self.sentences, 2) if self.sentences > 0 else 0,
                'avg_syllables_per_word': round(self.syllables / self.words, 2) if self.words > 0 else 0
            },
            'readability_scores': {}
        }

        readability = scores['readability_scores']
        if self.language == 'it':
            readability['flesch_reading_ease_it'] = self.flesch_reading_ease()
            readability['gulpease_index'] = self.gulpease_index()
            readability['gunning_fog_index'] = self.gunning_fog_index()
        else:
            readability['flesch_reading_ease'] = self.flesch_reading_ease()
            readability['flesch_kincaid_grade'] = self.flesch_kincaid_grade()
            readability['gunning_fog_index'] = self.gunning_fog_index()

        return scores

    def interpret_scores(self):
        """Return a newline-separated, human-readable reading of the scores."""
        results = self.get_all_scores()['readability_scores']
        interpretation = []

        if self.language == 'it':
            fre = results['flesch_reading_ease_it']
            label = self._band_label(fre, self._FRE_BANDS_IT, self._FRE_FALLBACK_IT)
            interpretation.append(f"Flesch Reading Ease (IT): {fre} - {label}")

            gulpease = results['gulpease_index']
            label = self._band_label(gulpease, self._GULPEASE_BANDS, self._GULPEASE_FALLBACK)
            interpretation.append(f"Gulpease Index: {gulpease} - {label}")
        else:
            fre = results['flesch_reading_ease']
            label = self._band_label(fre, self._FRE_BANDS_EN, self._FRE_FALLBACK_EN)
            interpretation.append(f"Flesch Reading Ease: {fre} - {label}")

            fkg = results['flesch_kincaid_grade']
            interpretation.append(f"Flesch-Kincaid Grade: {fkg} (US grade level)")

        # Gunning Fog Index (both languages)
        fog = results['gunning_fog_index']
        interpretation.append(f"Gunning Fog Index: {fog} (years of education needed)")

        return '\n'.join(interpretation)


# Example usage
if __name__ == "__main__":
    def _report(banner, sample, lang):
        """Print *banner*, then the statistics, scores and interpretation of *sample*."""
        print(banner)
        analyzer = ReadabilityAnalyzer(sample, language=lang)
        results = analyzer.get_all_scores()

        # Both result sections share the same "  key: value" layout.
        sections = (
            ("\nBasic Statistics:", 'basic_stats'),
            ("\nReadability Scores:", 'readability_scores'),
        )
        for heading, section_key in sections:
            print(heading)
            for label, number in results[section_key].items():
                print(f"  {label}: {number}")

        print("\nInterpretation:")
        print(analyzer.interpret_scores())

    # English sample
    sample_en = """
    The quick brown fox jumps over the lazy dog. This is a simple sentence.
    However, more complicated sentences with multisyllabic words can significantly
    increase the complexity of the text and make it harder to read.
    """

    # Italian sample
    sample_it = """
    Il veloce cane marrone salta sopra il cane pigro. Questa è una frase semplice.
    Tuttavia, frasi più complicate con parole polisillabiche possono aumentare
    significativamente la complessità del testo e renderlo più difficile da leggere.
    """

    _report("=== ENGLISH TEXT ANALYSIS ===", sample_en, 'en')
    _report("\n\n=== ITALIAN TEXT ANALYSIS ===", sample_it, 'it')