75 KiB
75 KiB
1) Semantic, sparse (SPLADE), BERTScore, and lexical similarity between LLM outputs on the new_alt_text proposal¶
In [1]:
import re

import numpy as np
import torch
from transformers import BertTokenizer, BertModel
def cosine_similarity(a, b):
    """Cosine similarity between two 1-D vectors (dot product over norm product)."""
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return numerator / denominator
# NOTE: works better than pooled sentence embeddings here, because the tokenizer
# skips special tokens ([CLS]/[SEP]) and we match at token level.
_BERT_CACHE = {}

def _load_bert():
    """Load and cache the BERT tokenizer/model so repeated calls are cheap.

    The original implementation re-downloaded/re-instantiated both on every
    call, which dominated runtime.
    """
    if "model" not in _BERT_CACHE:
        _BERT_CACHE["tokenizer"] = BertTokenizer.from_pretrained("bert-base-uncased")
        model = BertModel.from_pretrained("bert-base-uncased")
        model.eval()
        _BERT_CACHE["model"] = model
    return _BERT_CACHE["tokenizer"], _BERT_CACHE["model"]

def bert_score(reference, candidate, return_similarity_matrix=False):
    """Compute a simple BERTScore-style similarity between two strings.

    Each candidate token embedding is matched to its most similar reference
    token embedding (cosine similarity); the score is the mean of those maxima.

    Args:
        reference (str): Reference text.
        candidate (str): Candidate text.
        return_similarity_matrix (bool): If True, also return the full
            (candidate_tokens x reference_tokens) cosine-similarity matrix.

    Returns:
        float, or (float, np.ndarray) when return_similarity_matrix is True.
    """
    tokenizer, model = _load_bert()
    # add_special_tokens=False avoids encoding the special [CLS]/[SEP] tokens
    ref_tokens = tokenizer(reference, return_tensors="pt", add_special_tokens=False)
    can_tokens = tokenizer(candidate, return_tensors="pt", add_special_tokens=False)
    with torch.no_grad():
        ref_embeddings = model(**ref_tokens).last_hidden_state[0]
        can_embeddings = model(**can_tokens).last_hidden_state[0]
    # Vectorized pairwise cosine similarity: L2-normalize rows, then one
    # matrix product replaces the original O(n*m) Python double loop.
    ref = ref_embeddings.numpy()
    can = can_embeddings.numpy()
    ref_unit = ref / np.linalg.norm(ref, axis=1, keepdims=True)
    can_unit = can / np.linalg.norm(can, axis=1, keepdims=True)
    cosine_similarities = can_unit @ ref_unit.T
    # Greedy alignment: best reference match per candidate token, then average.
    max_similarities = cosine_similarities.max(axis=1)
    bertscore = max_similarities.mean()
    if return_similarity_matrix:
        return bertscore, cosine_similarities
    return bertscore
In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
def preprocess_text(text):
    """Lowercase ``text`` and remove punctuation.

    Bug fix: this function used ``re.sub`` but ``re`` was never imported
    anywhere in the notebook, so the first call raised NameError.

    Args:
        text (str): Raw input string.

    Returns:
        str: Lowercased text with every non-word, non-whitespace char removed.
    """
    # Lowercase the text
    text = text.lower()
    # Remove punctuation (anything that is not \w or \s)
    text = re.sub(r'[^\w\s]', '', text)
    return text
In [3]:
# Quick sanity check: two unrelated words should score low; also inspect the
# full token-level similarity matrix and its shape.
score, matrix = bert_score("dog", "zebra", return_similarity_matrix=True)
print(score)
print(matrix)
print(matrix.shape)
In [4]:
import pandas as pd
In [5]:
# Load the gemma3-4b alt-text assessments.
# Fixed: the path used a normal string with unescaped backslashes ("\c", "\M",
# ...), which Python only tolerates as deprecated invalid escapes — a raw
# string makes the Windows path unambiguous.
df_mllm_response = pd.read_json(
    r"C:\cartella_condivisa\MachineLearning\HIISlab\accessibility\notebook_miei\LLM_accessibility_validator\outputs\https_www.bbc.com_gemma3-4b_2025_11_23-09_45_08\mllm_alttext_assessments.json"
)
df_mllm_response
Out[5]:
In [6]:
# Peek at the raw MLLM response of the first row.
df_mllm_response['mllm_response'].iloc[0]
Out[6]:
In [7]:
def apply_parser_to_dataframe(df, column_name='mllm_response'):
    """Expand a column of already-parsed MLLM responses into separate columns.

    Each entry of ``column_name`` is expected to be a dict of parsed
    attributes; the dicts are spread into one column per key and appended to
    the original dataframe. (The upstream parsing step is skipped because the
    responses arrive pre-parsed.)

    Args:
        df (pd.DataFrame): The input dataframe.
        column_name (str): Name of the column containing MLLM responses.

    Returns:
        pd.DataFrame: ``df`` with one additional column per parsed attribute.
    """
    expanded = pd.DataFrame(df[column_name].tolist())
    return pd.concat([df, expanded], axis=1)
In [8]:
# Expand the parsed gemma3-4b responses into one column per attribute.
df_mllm_response_parsed=apply_parser_to_dataframe(df_mllm_response)
df_mllm_response_parsed
Out[8]:
In [9]:
# Load the gpt-4o alt-text assessments for the same page.
# Fixed: raw string instead of a normal string with unescaped backslashes
# (deprecated invalid escapes such as "\c", "\M").
df_mllm_response_1 = pd.read_json(
    r"C:\cartella_condivisa\MachineLearning\HIISlab\accessibility\notebook_miei\LLM_accessibility_validator\outputs\https_www.bbc.com_gpt-4o_2025_11_23-09_47_49\mllm_alttext_assessments.json"
)
df_mllm_response_1
Out[9]:
In [10]:
# Expand the parsed gpt-4o responses into one column per attribute.
df_mllm_response_parsed_1=apply_parser_to_dataframe(df_mllm_response_1)
df_mllm_response_parsed_1
Out[10]:
In [12]:
# Pair the first proposed alt text from each model: gemma3-4b as reference,
# gpt-4o as candidate, for the similarity comparisons below.
reference=df_mllm_response_parsed["new_alt_text"].iloc[0]
candidate=df_mllm_response_parsed_1["new_alt_text"].iloc[0]
reference,candidate
Out[12]:
semantic similarity (bertscore)¶
In [13]:
# Token-level semantic similarity between the two proposals (BERTScore-style).
bertscore =bert_score(reference, candidate,return_similarity_matrix=False)
bertscore
Out[13]:
lexical similarity¶
In [14]:
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn

# Lexical (surface-form) similarity via character 1-3-gram TF-IDF vectors.
texts = [reference, candidate]
vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(texts)
# Pairwise cosine similarity between the two TF-IDF vectors.
cos_sim_matrix = cosine_similarity_sklearn(tfidf_matrix)
cos_sim_matrix
Out[14]:
In [14]:
# (n_texts, n_char_ngrams): vocabulary size of the fitted 1-3 char n-grams.
tfidf_matrix.shape
Out[14]:
In [15]:
# Same lexical similarity recomputed with our own cosine on the densified
# TF-IDF row vectors (should match cos_sim_matrix[0, 1]).
cos_sim_lexical = cosine_similarity(tfidf_matrix[0].toarray().flatten(),tfidf_matrix[1].toarray().flatten())
cos_sim_lexical
Out[15]:
semantic similarity embeddings¶
In [15]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode([reference, candidate],output_value="sentence_embedding")# "sentence_embedding" extracts a single pooled embedding (mean of all word embeddings)
# "token_embeddings" would instead extract one embedding per word/token
embeddings.shape
Out[15]:
In [32]:
# Check whether the model uses a default prompt_name to differentiate tasks,
# as more advanced models such as Gemma do.
print(model.default_prompt_name )
In [16]:
# Full pairwise cosine-similarity matrix between the two sentence embeddings.
cos_sim_matrix_emb = cosine_similarity_sklearn(embeddings)
cos_sim_matrix_emb
Out[16]:
In [17]:
# Same similarity via our own cosine on the two embedding vectors
# (should match cos_sim_matrix_emb[0, 1]).
cos_sim_emb = cosine_similarity(embeddings[0],embeddings[1])
cos_sim_emb
Out[17]:
In [18]:
# USING MODEL SIMILARITY METHOD
# model.similarity applies the model's configured similarity function;
# similarity_fn_name shows which one it is.
similarities = model.similarity(embeddings, embeddings)
similarities,model.similarity_fn_name
Out[18]:
sparse encoder¶
In [20]:
from sentence_transformers import SparseEncoder
# 1. Load a pretrained SparseEncoder model
model = SparseEncoder("naver/splade-cocondenser-ensembledistil")
model.similarity_fn_name = "cosine" # set similarity function to cosine (default is dot product)
# The sentences to encode
sentences = [reference, candidate]
# 2. Calculate sparse embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [#, 30522] - sparse representation with vocabulary-size dimensions
# 3. Calculate the embedding similarities (cosine, as configured above)
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# prints a 2x2 similarity tensor
# 4. Check sparsity statistics
stats = SparseEncoder.sparsity(embeddings)
print(f"Sparsity: {stats['sparsity_ratio']:.2%}") # Typically >99% zeros
print(f"Avg non-zero dimensions per embedding: {stats['active_dims']:.2f}")
2) Correlation analysis between LLM assessment response pairs¶
In [21]:
import numpy as np
import matplotlib.pyplot as plt
In [22]:
# Original-alt-text assessment scores from the two models, as plain int lists
# (list1 = gemma3-4b, list2 = gpt-4o), row-aligned for correlation analysis.
list1=df_mllm_response_parsed["original_alt_text_assessment"].astype(int).tolist()
list2=df_mllm_response_parsed_1["original_alt_text_assessment"].astype(int).tolist()
list1,list2
Out[22]:
In [23]:
# Scatter of the two models' scores with a least-squares (degree-1) trend line.
plt.scatter(list1, list2)
plt.plot(
    np.unique(list1),
    np.poly1d(np.polyfit(list1, list2, 1))(np.unique(list1))
)
plt.xlabel("gemma4b")
plt.ylabel("gpt-4o")  # fixed typo: label previously read "gtp-4o"
plt.show()
Correlation Coefficients¶
In [24]:
import pandas as pd
from scipy.stats import spearmanr, kendalltau

# Three correlation measures between the two models' assessment scores:
# Pearson (linear), Spearman and Kendall tau (rank-based).
pearson = np.corrcoef(list1, list2)[0, 1]
spearman, _spearman_p = spearmanr(list1, list2)
kendall, _kendall_p = kendalltau(list1, list2)

correlation_table = pd.DataFrame(
    {
        "Pearson": [pearson],
        "Spearman": [spearman],
        "Kendall Tau": [kendall],
    },
    index=['Results'],
)
correlation_table
Out[24]: