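"""Text-similarity utilities: lexical (TF-IDF), semantic (BERT mean pooling and
Sentence-Transformers), and BERTScore-based metrics."""
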
import re

import numpy as np
import torch
from bert_score import score
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertModel, BertTokenizer


def preprocess_text(text):
    """Lowercase, strip punctuation, and collapse whitespace."""
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return text


def cosine_similarity(a, b):
    """Cosine similarity between two 1-D vectors."""
    return np.dot(a, b) / (
        np.linalg.norm(a) * np.linalg.norm(b) + 1e-10
    )  # Epsilon in the denominator for numerical stability


def semantic_similarity(text1, text2):
    """Cosine similarity of mean-pooled BERT token embeddings."""
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    # No preprocessing: the neural models are trained to handle natural text variations.
    # Truncate to the model's maximum input length to avoid errors on long texts.
    inputs1 = tokenizer(text1, return_tensors="pt", truncation=True)
    inputs2 = tokenizer(text2, return_tensors="pt", truncation=True)

    model.eval()
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    # Use the average of all token embeddings as the sentence representation
    embedding1 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()

    return cosine_similarity(embedding1, embedding2)


def semantic_similarity_sentence_transformer(text1, text2):
    """Cosine similarity of Sentence-Transformers sentence embeddings."""
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0

    # Purpose-built for sentence embeddings; no preprocessing, since the neural
    # models are trained to handle natural text variations
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(
        [text1, text2],
        # "sentence_embedding" produces one representation per sentence
        # (the average of the token embeddings)
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )
    return cosine_similarity(embeddings[0], embeddings[1])


def extract_semantic_representation(text):
    """Return the Sentence-Transformers embedding for a single text."""
    # Handle empty strings explicitly
    if not text.strip():
        return 0.0

    # Purpose-built for sentence embeddings; no preprocessing, since the neural
    # models are trained to handle natural text variations
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(
        [text],
        # "sentence_embedding" produces one representation per sentence
        # (the average of the token embeddings)
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )
    return embeddings


def lexical_similarity(text1, text2):
    """TF-IDF cosine similarity over word unigrams."""
    # vectorizer = TfidfVectorizer(stop_words=None, analyzer="char", ngram_range=(1, 3))
    vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1))
    text1 = preprocess_text(text1)  # Only the lexical metric needs preprocessing
    text2 = preprocess_text(text2)
    # Guard against empty input, which would leave TfidfVectorizer with an empty vocabulary
    if not text1 or not text2:
        return 0.0
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    vec1 = tfidf_matrix.toarray()[0]
    vec2 = tfidf_matrix.toarray()[1]
    return cosine_similarity(vec1, vec2)


def bert_score_similarity(texts1, texts2, batch=False):
    """BERTScore F1 between parallel lists of candidate and reference texts."""
    # No preprocessing: the neural models are trained to handle natural text variations
    P, R, F1 = score(
        texts1,
        texts2,
        lang="en",
        verbose=False,
        model_type="bert-base-uncased",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        batch_size=32,
    )
    return F1.tolist() if batch else F1.item()
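

# Illustrative usage sketch only; the sample sentences below are assumptions,
# not part of the original project. All models are downloaded on first run.
if __name__ == "__main__":
    s1 = "The cat sat on the mat."
    s2 = "A cat was sitting on the mat."

    print("Lexical (TF-IDF):", lexical_similarity(s1, s2))
    print("Semantic (BERT mean pooling):", semantic_similarity(s1, s2))
    print(
        "Semantic (Sentence-Transformers):",
        semantic_similarity_sentence_transformer(s1, s2),
    )
    print("BERTScore F1:", bert_score_similarity([s1], [s2]))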