290 lines
11 KiB
Python
290 lines
11 KiB
Python
import re
|
|
from collections import Counter
|
|
|
|
"""
|
|
For English texts:
    - Flesch Reading Ease score
    - Flesch-Kincaid Grade Level
    - Gunning Fog Index

For Italian texts:
    - Flesch Reading Ease (adapted for Italian with the Flesch-Vacca formula)
    - Gulpease Index (specifically designed for Italian)
    - Gunning Fog Index

Basic statistics for both:
    - Sentence count
    - Word count
    - Syllable count
    - Complex words (3+ syllables)
    - Average words per sentence
    - Average syllables per word
|
|
"""
|
|
|
|
class ReadabilityAnalyzer:
    """Analyze text readability for English and Italian.

    All basic counts are computed once in the constructor and stored as
    attributes; the index methods only combine those stored counts.

    Attributes:
        text: The text passed to the constructor, unchanged.
        language: Lower-cased language code. ``'it'`` selects the Italian
            syllable counter and formulas; any other value is treated as
            English.
        sentences: Number of sentences found in ``text``.
        words: Number of word tokens found in ``text``.
        syllables: Estimated total number of syllables over all words.
        complex_words: Number of words estimated to have 3+ syllables.
        characters: Number of non-whitespace characters (punctuation and
            digits included).
        letters: Number of letters inside the counted words (used by the
            Gulpease Index).
    """

    # A word is a run of ASCII letters and the accented vowels listed here.
    _WORD_RE = re.compile(
        r'\b[a-zA-ZàèéìòùÀÈÉÌÒÙáíóúýÁÍÓÚÝâêîôûÂÊÎÔÛäëïöüÄËÏÖÜ]+\b'
    )
    # One or more consecutive terminators end a sentence.
    _SENTENCE_END_RE = re.compile(r'[.!?]+')

    _ENGLISH_VOWELS = frozenset('aeiouy')
    _ITALIAN_VOWELS = frozenset('aeiouàèéìòùáíóúý')

    # (lower bound, label) pairs, checked top to bottom; the final -inf entry
    # is the catch-all for scores below every explicit threshold.
    _FRE_BANDS_IT = (
        (80, "Molto facile (Very easy)"),
        (60, "Facile (Easy)"),
        (50, "Abbastanza facile (Fairly easy)"),
        (40, "Normale (Normal)"),
        (30, "Abbastanza difficile (Fairly difficult)"),
        (float('-inf'), "Difficile (Difficult)"),
    )
    _GULPEASE_BANDS = (
        (80, "Elementare (Elementary school)"),
        (60, "Media inferiore (Middle school)"),
        (40, "Media superiore (High school)"),
        (float('-inf'), "Universitario (University)"),
    )
    _FRE_BANDS_EN = (
        (90, "Very easy (5th grade)"),
        (80, "Easy (6th grade)"),
        (70, "Fairly easy (7th grade)"),
        (60, "Standard (8th-9th grade)"),
        (50, "Fairly difficult (10th-12th grade)"),
        (30, "Difficult (College)"),
        (float('-inf'), "Very difficult (College graduate)"),
    )

    def __init__(self, text, language='en'):
        """Store the text and pre-compute every basic count.

        Args:
            text: The text to analyze.
            language: ``'it'`` for Italian (case-insensitive); any other
                value is analyzed with the English rules.
        """
        self.text = text
        self.language = language.lower()
        self.sentences = self._count_sentences()
        self.words = self._count_words()
        self.syllables = self._count_syllables()
        self.complex_words = self._count_complex_words()
        self.characters = len(re.sub(r'\s', '', text))
        self.letters = self._count_letters()

    # ------------------------------------------------------------------
    # Tokenising helpers
    # ------------------------------------------------------------------

    def _find_words(self, lowercase=False):
        """Return the word tokens of the text, optionally lower-cased first."""
        source = self.text.lower() if lowercase else self.text
        return self._WORD_RE.findall(source)

    def _syllables_per_word(self):
        """Return the estimated syllable count of every word, in text order."""
        if self.language == 'it':
            count_one = self._count_syllables_italian
        else:
            count_one = self._count_syllables_english
        return [count_one(word) for word in self._find_words(lowercase=True)]

    @staticmethod
    def _count_vowel_groups(word, vowels):
        """Count maximal runs of consecutive vowels in ``word``."""
        groups = 0
        previous_was_vowel = False
        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                groups += 1
            previous_was_vowel = is_vowel
        return groups

    @staticmethod
    def _band_label(score, bands):
        """Return the label of the first band whose lower bound ``score`` meets."""
        return next(label for lower_bound, label in bands if score >= lower_bound)

    # ------------------------------------------------------------------
    # Basic counts
    # ------------------------------------------------------------------

    def _count_sentences(self):
        """Count sentences in text.

        The text is split on runs of ``.``, ``!`` and ``?``. A fragment only
        counts as a sentence when it contains at least one letter or digit,
        so stray punctuation such as a closing quote after ``"Stop!"`` does
        not inflate the count.
        """
        fragments = self._SENTENCE_END_RE.split(self.text)
        return sum(
            1 for fragment in fragments
            if any(char.isalnum() for char in fragment)
        )

    def _count_words(self):
        """Count words in text."""
        return len(self._find_words())

    def _count_letters(self):
        """Count the letters that belong to the counted words."""
        return sum(len(word) for word in self._find_words())

    def _count_syllables(self):
        """Count syllables in text (approximation for both languages)."""
        return sum(self._syllables_per_word())

    def _count_syllables_english(self, word):
        """Estimate the number of syllables in an English word.

        Each run of consecutive vowels (``aeiouy``) counts as one syllable.
        A final ``e`` is treated as silent, and one syllable is removed, only
        when it directly follows a consonant and the word does not end in
        consonant + ``le`` (so "make" -> 1, but "simple" and "coffee" -> 2).

        Args:
            word: A single word; case is ignored.

        Returns:
            The estimated syllable count, never less than 1.
        """
        word = word.lower()
        vowels = self._ENGLISH_VOWELS
        syllables = self._count_vowel_groups(word, vowels)

        # Silent-e adjustment. An "e" that is part of a vowel run ("tree",
        # "coffee") was already merged into that run, so it is not removed.
        if len(word) >= 2 and word.endswith('e') and word[-2] not in vowels:
            # Consonant + "le" ("table", "simple") is a spoken syllable.
            ends_in_consonant_le = (
                len(word) >= 3 and word[-2] == 'l' and word[-3] not in vowels
            )
            if not ends_in_consonant_le:
                syllables -= 1

        # Ensure at least 1 syllable
        return max(syllables, 1)

    def _count_syllables_italian(self, word):
        """Estimate the number of syllables in an Italian word.

        Each run of consecutive vowels (accented vowels included) counts as
        one syllable.

        Args:
            word: A single word; case is ignored.

        Returns:
            The estimated syllable count, never less than 1.
        """
        syllables = self._count_vowel_groups(word.lower(), self._ITALIAN_VOWELS)
        # Ensure at least 1 syllable
        return max(syllables, 1)

    def _count_complex_words(self):
        """Count words with 3+ syllables."""
        return sum(1 for count in self._syllables_per_word() if count >= 3)

    # ------------------------------------------------------------------
    # Readability indices
    # ------------------------------------------------------------------

    def flesch_reading_ease(self):
        """Calculate Flesch Reading Ease score.

        Uses the Italian coefficients when ``language`` is ``'it'`` and the
        standard English coefficients otherwise.

        Returns:
            The score rounded to 2 decimals, or 0 when the text has no
            words or no sentences.
        """
        if self.words == 0 or self.sentences == 0:
            return 0

        words_per_sentence = self.words / self.sentences
        syllables_per_word = self.syllables / self.words

        if self.language == 'it':
            # Flesch-Vacca formula for Italian
            score = 206.835 - 1.3 * words_per_sentence - 60.1 * syllables_per_word
        else:
            # Standard Flesch formula for English
            score = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

        return round(score, 2)

    def flesch_kincaid_grade(self):
        """Calculate Flesch-Kincaid Grade Level (primarily for English).

        Returns:
            The grade rounded to 2 decimals, or 0 when the text has no
            words or no sentences.
        """
        if self.words == 0 or self.sentences == 0:
            return 0

        grade = (
            0.39 * (self.words / self.sentences)
            + 11.8 * (self.syllables / self.words)
            - 15.59
        )
        return round(grade, 2)

    def gunning_fog_index(self):
        """Calculate Gunning Fog Index.

        Returns:
            The index rounded to 2 decimals, or 0 when the text has no
            words or no sentences.
        """
        if self.words == 0 or self.sentences == 0:
            return 0

        fog = 0.4 * (
            (self.words / self.sentences)
            + 100 * (self.complex_words / self.words)
        )
        return round(fog, 2)

    def gulpease_index(self):
        """Calculate Gulpease Index (for Italian).

        Computed as ``89 - 10 * letters / words + 300 * sentences / words``.
        The letter count excludes punctuation and digits, which is why
        ``letters`` is used here rather than ``characters``.

        Returns:
            The index rounded to 2 decimals, or 0 when the text has no words.
        """
        if self.words == 0:
            return 0

        gulpease = (
            89
            - (self.letters / self.words * 10)
            + (self.sentences / self.words * 300)
        )
        return round(gulpease, 2)

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def get_all_scores(self):
        """Get all readability scores.

        Returns:
            A dict with two keys: ``'basic_stats'`` (the stored counts plus
            the two averages, each 0 when its denominator is 0) and
            ``'readability_scores'`` (the indices that apply to the
            configured language).
        """
        scores = {
            'basic_stats': {
                'sentences': self.sentences,
                'words': self.words,
                'syllables': self.syllables,
                'complex_words': self.complex_words,
                'characters': self.characters,
                'letters': self.letters,
                'avg_words_per_sentence': round(self.words / self.sentences, 2) if self.sentences > 0 else 0,
                'avg_syllables_per_word': round(self.syllables / self.words, 2) if self.words > 0 else 0,
            },
            'readability_scores': {},
        }

        # Add appropriate scores based on language
        readability = scores['readability_scores']
        if self.language == 'it':
            readability['flesch_reading_ease_it'] = self.flesch_reading_ease()
            readability['gulpease_index'] = self.gulpease_index()
            readability['gunning_fog_index'] = self.gunning_fog_index()
        else:
            readability['flesch_reading_ease'] = self.flesch_reading_ease()
            readability['flesch_kincaid_grade'] = self.flesch_kincaid_grade()
            readability['gunning_fog_index'] = self.gunning_fog_index()

        return scores

    def interpret_scores(self):
        """Provide interpretation of readability scores.

        Returns:
            A newline-joined string with one line per index reported for the
            configured language; the Gunning Fog line is always last.
        """
        readability = self.get_all_scores()['readability_scores']
        interpretation = []

        if self.language == 'it':
            # Flesch Reading Ease (Italian)
            fre = readability['flesch_reading_ease_it']
            label = self._band_label(fre, self._FRE_BANDS_IT)
            interpretation.append(f"Flesch Reading Ease (IT): {fre} - {label}")

            # Gulpease Index
            gulpease = readability['gulpease_index']
            label = self._band_label(gulpease, self._GULPEASE_BANDS)
            interpretation.append(f"Gulpease Index: {gulpease} - {label}")
        else:
            # Flesch Reading Ease (English)
            fre = readability['flesch_reading_ease']
            label = self._band_label(fre, self._FRE_BANDS_EN)
            interpretation.append(f"Flesch Reading Ease: {fre} - {label}")

            # Flesch-Kincaid Grade
            fkg = readability['flesch_kincaid_grade']
            interpretation.append(f"Flesch-Kincaid Grade: {fkg} (US grade level)")

        # Gunning Fog Index (both languages)
        fog = readability['gunning_fog_index']
        interpretation.append(f"Gunning Fog Index: {fog} (years of education needed)")

        return '\n'.join(interpretation)
|
|
|
|
|
|
# Example usage: analyze one English and one Italian sample and print, for
# each, the basic statistics, the readability scores and their interpretation.
if __name__ == "__main__":
    english_text = """
    The quick brown fox jumps over the lazy dog. This is a simple sentence.
    However, more complicated sentences with multisyllabic words can significantly
    increase the complexity of the text and make it harder to read.
    """

    italian_text = """
    Il veloce cane marrone salta sopra il cane pigro. Questa è una frase semplice.
    Tuttavia, frasi più complicate con parole polisillabiche possono aumentare
    significativamente la complessità del testo e renderlo più difficile da leggere.
    """

    # (banner, sample text, language code) for each report, in print order.
    samples = (
        ("=== ENGLISH TEXT ANALYSIS ===", english_text, 'en'),
        ("\n\n=== ITALIAN TEXT ANALYSIS ===", italian_text, 'it'),
    )
    # (heading, key into the scores dict) for each printed section.
    sections = (
        ("\nBasic Statistics:", 'basic_stats'),
        ("\nReadability Scores:", 'readability_scores'),
    )

    for banner, sample_text, language_code in samples:
        print(banner)
        analyzer = ReadabilityAnalyzer(sample_text, language=language_code)
        report = analyzer.get_all_scores()

        for heading, section_key in sections:
            print(heading)
            for key, value in report[section_key].items():
                print(f" {key}: {value}")

        print("\nInterpretation:")
        print(analyzer.interpret_scores())