1) semantic, sparse, bertscore, lexical similarity between LLM on new_alt_text proposal¶

In [1]:

import torch
import numpy as np
from transformers import BertTokenizer, BertModel

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


#funziona meglio che sentence_embeddings perchè il tokenizer skippa token speciali
def bert_score(reference, candidate, return_similarity_matrix=False):
    
    # Load the BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")
    
    # Tokenize the input text
    ref_tokens = tokenizer(reference, return_tensors="pt", add_special_tokens=False)#add_special_tokens=False mi evita codifica dei caratteri speciali
    can_tokens = tokenizer(candidate, return_tensors="pt", add_special_tokens=False)
    print("ref_tokens:",ref_tokens)
    print("can_tokens:",can_tokens)

    # Get the BERT embeddings
    model.eval()
    with torch.no_grad():
        ref_outputs = model(**ref_tokens)
        ref_embeddings = ref_outputs.last_hidden_state[0]

        can_outputs = model(**can_tokens)
        can_embeddings = can_outputs.last_hidden_state[0]
        #print("can_embeddings:",can_embeddings,can_embeddings.shape)
        
    # Compute cosine similarities
    cosine_similarities = np.zeros((can_embeddings.shape[0], ref_embeddings.shape[0]))
    for i, c in enumerate(can_embeddings):
        for j, r in enumerate(ref_embeddings):
            cosine_similarities[i, j] = cosine_similarity(c, r)
    

    # Align cosine similarities
    max_similarities = cosine_similarities.max(axis=1)

    # Average similarity scores
    bertscore = max_similarities.mean()

    if return_similarity_matrix:
        return bertscore, cosine_similarities
    else:
        return bertscore

c:\Users\nicola\anaconda3\envs\accessibility\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

In [2]:

from sklearn.feature_extraction.text import TfidfVectorizer
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [3]:

#just a test
bertscore, sim_matrix=bert_score("dog", "zebra",return_similarity_matrix=True)
print(bertscore)
print(sim_matrix)
print(sim_matrix.shape)

ref_tokens: {'input_ids': tensor([[3899]]), 'token_type_ids': tensor([[0]]), 'attention_mask': tensor([[1]])}
can_tokens: {'input_ids': tensor([[29145]]), 'token_type_ids': tensor([[0]]), 'attention_mask': tensor([[1]])}
0.4610232412815094
[[0.46102324]]
(1, 1)

C:\Users\nicola\AppData\Local\Temp\ipykernel_20916\1344219625.py:6: DeprecationWarning: __array__ implementation doesn't accept a copy keyword, so passing copy=False failed. __array__ must implement 'dtype' and 'copy' keyword arguments. To learn more, see the migration guide https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [4]:

import pandas as pd

In [5]:

df_mllm_response=pd.read_json("C:\cartella_condivisa\MachineLearning\HIISlab\\accessibility\\notebook_miei\LLM_accessibility_validator\outputs\https_www.bbc.com_gemma3-4b_2025_11_23-09_45_08\mllm_alttext_assessments.json")
df_mllm_response

Out[5]:

	image_url	alt_text	mllm_response
0	https://ichef.bbci.co.uk/news/480/cpsprodpb/fd...	A shot through a window of a wasteland in Pokr...	{'original_alt_text_assessment': '4', 'assessm...
1	https://ichef.bbci.co.uk/news/480/cpsprodpb/72...	A man in a Caracas market speaks while holding...	{'original_alt_text_assessment': '4', 'assessm...
2	https://ichef.bbci.co.uk/ace/standard/480/cpsp...	A man stands in a pile of rubble with a damage...	{'original_alt_text_assessment': '4', 'assessm...
3	https://ichef.bbci.co.uk/images/ic/480x270/p0m...	A collage showing a man in a suit with a newsp...	{'original_alt_text_assessment': '4', 'assessm...
4	https://ichef.bbci.co.uk/news/480/cpsprodpb/2b...	A composite image of Rupert Grint as an adult,...	{'original_alt_text_assessment': '3', 'assessm...
5	https://ichef.bbci.co.uk/images/ic/480x270/p0m...	The Global Story, The Global Story, Is there a...	{'original_alt_text_assessment': '3', 'assessm...
6	https://ichef.bbci.co.uk/news/1024/cpsprodpb/3...	The pink facade of the Saudades art deco-style...	{'original_alt_text_assessment': '2', 'assessm...
7	https://ichef.bbci.co.uk/news/480/cpsprodpb/66...	Lando Norris speaking to Max Verstappen after ...	{'original_alt_text_assessment': '4', 'assessm...
8	https://ichef.bbci.co.uk/news/480/cpsprodpb/10...	The Northern Lights snake across the curvature...	{'original_alt_text_assessment': '4', 'assessm...
9	https://ichef.bbci.co.uk/news/480/cpsprodpb/e1...	A woman cries as she holds her injured child, ...	{'original_alt_text_assessment': '4', 'assessm...

In [6]:

df_mllm_response.iloc[0]['mllm_response']

Out[6]:

{'original_alt_text_assessment': '4',
 'assessment': 'failure',
 'evaluation_result': "The alt-text accurately describes the image content but lacks context. While 'A shot through a window of a wasteland in Pokrovsk' is technically correct, it doesn't convey the significance of the image within the article. The image depicts the devastation caused by the conflict in Pokrovsk, highlighting the ongoing war and the challenges faced by Ukrainian troops. The alt-text should reflect this broader context. It’s a descriptive caption, not an informative alt-text.",
 'new_alt_text': 'Devastation in Pokrovsk, Ukraine - a city under siege during the ongoing conflict.'}

In [7]:

def apply_parser_to_dataframe(df, column_name='mllm_response'):
    """
    Apply the MLLM response parser to a dataframe column and expand results into separate columns.
    
    Args:
        df (pd.DataFrame): The input dataframe
        column_name (str): Name of the column containing MLLM responses
        
    Returns:
        pd.DataFrame: Dataframe with additional columns for parsed attributes
    """
    # Apply the parsing function to each row
    #parsed_results = df[column_name].apply(parse_mllm_alt_text_response) #skipped because already parsed
    parsed_results = df[column_name]
    
    # Convert the list of dictionaries to a DataFrame
    parsed_df = pd.DataFrame(parsed_results.tolist())
    
    # Concatenate with the original dataframe
    result_df = pd.concat([df, parsed_df], axis=1)
    
    return result_df

In [8]:

df_mllm_response_parsed=apply_parser_to_dataframe(df_mllm_response)
df_mllm_response_parsed

Out[8]:

	image_url	alt_text	mllm_response	original_alt_text_assessment	assessment	evaluation_result	new_alt_text
0	https://ichef.bbci.co.uk/news/480/cpsprodpb/fd...	A shot through a window of a wasteland in Pokr...	{'original_alt_text_assessment': '4', 'assessm...	4	failure	The alt-text accurately describes the image co...	Devastation in Pokrovsk, Ukraine - a city unde...
1	https://ichef.bbci.co.uk/news/480/cpsprodpb/72...	A man in a Caracas market speaks while holding...	{'original_alt_text_assessment': '4', 'assessm...	4	failure	The alt-text accurately describes the image co...	Man in Caracas market speaks about rising food...
2	https://ichef.bbci.co.uk/ace/standard/480/cpsp...	A man stands in a pile of rubble with a damage...	{'original_alt_text_assessment': '4', 'assessm...	4	failure	The alt-text describes the scene accurately bu...	Ukraine: Man surveys damage after missile stri...
3	https://ichef.bbci.co.uk/images/ic/480x270/p0m...	A collage showing a man in a suit with a newsp...	{'original_alt_text_assessment': '4', 'assessm...	4	failure	The alt-text is not appropriate. The image dep...	Satirical depiction of information overload an...
4	https://ichef.bbci.co.uk/news/480/cpsprodpb/2b...	A composite image of Rupert Grint as an adult,...	{'original_alt_text_assessment': '3', 'assessm...	3	failure	The alt-text ‘A composite image of Rupert Grin...	Rupert Grint: Ron Weasley – adult and child
5	https://ichef.bbci.co.uk/images/ic/480x270/p0m...	The Global Story, The Global Story, Is there a...	{'original_alt_text_assessment': '3', 'assessm...	3	failure	The alt-text is overly verbose and doesn't acc...	Protest demonstration.
6	https://ichef.bbci.co.uk/news/1024/cpsprodpb/3...	The pink facade of the Saudades art deco-style...	{'original_alt_text_assessment': '2', 'assessm...	2	failure	The alt-text is inappropriate. The image depic...	Art Deco building facade in Mumbai, India.
7	https://ichef.bbci.co.uk/news/480/cpsprodpb/66...	Lando Norris speaking to Max Verstappen after ...	{'original_alt_text_assessment': '4', 'assessm...	4	failure	The alt-text is overly specific and doesn't ac...	Lando Norris speaking to Max Verstappen after ...
8	https://ichef.bbci.co.uk/news/480/cpsprodpb/10...	The Northern Lights snake across the curvature...	{'original_alt_text_assessment': '4', 'assessm...	4	failure	The alt-text is not appropriate. While it desc...	NASA astronaut's footage of the Northern Light...
9	https://ichef.bbci.co.uk/news/480/cpsprodpb/e1...	A woman cries as she holds her injured child, ...	{'original_alt_text_assessment': '4', 'assessm...	4	failure	The alt-text is overly descriptive and verbose...	Israeli strikes in Gaza – a woman holds her cr...

In [9]:

df_mllm_response_1=pd.read_json("C:\cartella_condivisa\MachineLearning\HIISlab\\accessibility\\notebook_miei\LLM_accessibility_validator\outputs\https_www.bbc.com_gpt-4o_2025_11_23-09_47_49\mllm_alttext_assessments.json")
df_mllm_response_1

Out[9]:

	image_url	alt_text	mllm_response
0	https://ichef.bbci.co.uk/news/480/cpsprodpb/fd...	A shot through a window of a wasteland in Pokr...	{'original_alt_text_assessment': '5', 'assessm...
1	https://ichef.bbci.co.uk/news/480/cpsprodpb/72...	A man in a Caracas market speaks while holding...	{'original_alt_text_assessment': '3', 'assessm...
2	https://ichef.bbci.co.uk/ace/standard/480/cpsp...	A man stands in a pile of rubble with a damage...	{'original_alt_text_assessment': '3', 'assessm...
3	https://ichef.bbci.co.uk/images/ic/480x270/p0m...	A collage showing a man in a suit with a newsp...	{'original_alt_text_assessment': '3', 'assessm...
4	https://ichef.bbci.co.uk/news/480/cpsprodpb/2b...	A composite image of Rupert Grint as an adult,...	{'original_alt_text_assessment': '4', 'assessm...
5	https://ichef.bbci.co.uk/images/ic/480x270/p0m...	The Global Story, The Global Story, Is there a...	{'original_alt_text_assessment': '2', 'assessm...
6	https://ichef.bbci.co.uk/news/1024/cpsprodpb/3...	The pink facade of the Saudades art deco-style...	{'original_alt_text_assessment': '2', 'assessm...
7	https://ichef.bbci.co.uk/news/480/cpsprodpb/66...	Lando Norris speaking to Max Verstappen after ...	{'original_alt_text_assessment': '3', 'assessm...
8	https://ichef.bbci.co.uk/news/480/cpsprodpb/10...	The Northern Lights snake across the curvature...	{'original_alt_text_assessment': '4', 'assessm...
9	https://ichef.bbci.co.uk/news/480/cpsprodpb/e1...	A woman cries as she holds her injured child, ...	{'original_alt_text_assessment': '4', 'assessm...

In [10]:

df_mllm_response_parsed_1=apply_parser_to_dataframe(df_mllm_response_1)
df_mllm_response_parsed_1

Out[10]:

	image_url	alt_text	mllm_response	original_alt_text_assessment	assessment	evaluation_result	new_alt_text
0	https://ichef.bbci.co.uk/news/480/cpsprodpb/fd...	A shot through a window of a wasteland in Pokr...	{'original_alt_text_assessment': '5', 'assessm...	5	success	The alt-text is appropriate as it provides a d...	Damaged homes and wasteland in Pokrovsk, Ukrai...
1	https://ichef.bbci.co.uk/news/480/cpsprodpb/72...	A man in a Caracas market speaks while holding...	{'original_alt_text_assessment': '3', 'assessm...	3	warning	The alt-text partially describes the image but...	Man in Caracas market holding coffee, represen...
2	https://ichef.bbci.co.uk/ace/standard/480/cpsp...	A man stands in a pile of rubble with a damage...	{'original_alt_text_assessment': '3', 'assessm...	3	warning	The original alt-text provides descriptive det...	Destruction in Ukraine: rubble, burnt car, and...
3	https://ichef.bbci.co.uk/images/ic/480x270/p0m...	A collage showing a man in a suit with a newsp...	{'original_alt_text_assessment': '3', 'assessm...	3	warning	The alt-text describes the image but doesn't c...	Metaphor for restricted words on social media,...
4	https://ichef.bbci.co.uk/news/480/cpsprodpb/2b...	A composite image of Rupert Grint as an adult,...	{'original_alt_text_assessment': '4', 'assessm...	4	success	The alt-text appropriately identifies the imag...	Rupert Grint as an adult and child, reflecting...
5	https://ichef.bbci.co.uk/images/ic/480x270/p0m...	The Global Story, The Global Story, Is there a...	{'original_alt_text_assessment': '2', 'assessm...	2	failure	The original alt-text does not directly descri...	Protesters holding 'Trump Help!' and 'Refuge P...
6	https://ichef.bbci.co.uk/news/1024/cpsprodpb/3...	The pink facade of the Saudades art deco-style...	{'original_alt_text_assessment': '2', 'assessm...	2	failure	The alt-text is inaccurate and does not match ...	Yellow art deco-style building in Mumbai's Ban...
7	https://ichef.bbci.co.uk/news/480/cpsprodpb/66...	Lando Norris speaking to Max Verstappen after ...	{'original_alt_text_assessment': '3', 'assessm...	3	warning	The alt-text describes the image content direc...	Lando Norris discussing post-race matters afte...
8	https://ichef.bbci.co.uk/news/480/cpsprodpb/10...	The Northern Lights snake across the curvature...	{'original_alt_text_assessment': '4', 'assessm...	4	success	The alt-text describes the image content effec...	Northern Lights over Earth’s curvature filmed ...
9	https://ichef.bbci.co.uk/news/480/cpsprodpb/e1...	A woman cries as she holds her injured child, ...	{'original_alt_text_assessment': '4', 'assessm...	4	success	The alt-text appropriately describes the emoti...	A crying mother holds her injured child in a G...

In [12]:

reference=df_mllm_response_parsed["new_alt_text"].iloc[0]
candidate=df_mllm_response_parsed_1["new_alt_text"].iloc[0]
reference,candidate

Out[12]:

('Devastation in Pokrovsk, Ukraine - a city under siege during the ongoing conflict.',
 "Damaged homes and wasteland in Pokrovsk, Ukraine with smoke rising, highlighting war's impact on the city.")

semantic similarity (bertscore)¶

In [13]:

bertscore =bert_score(reference, candidate,return_similarity_matrix=False)
bertscore

ref_tokens: {'input_ids': tensor([[25594,  1999, 13433, 21638,  4492,  6711,  1010,  5924,  1011,  1037,
          2103,  2104,  6859,  2076,  1996,  7552,  4736,  1012]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
can_tokens: {'input_ids': tensor([[ 5591,  5014,  1998,  5949,  3122,  1999, 13433, 21638,  4492,  6711,
          1010,  5924,  2007,  5610,  4803,  1010, 20655,  2162,  1005,  1055,
          4254,  2006,  1996,  2103,  1012]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}

C:\Users\nicola\AppData\Local\Temp\ipykernel_20916\1344219625.py:6: DeprecationWarning: __array__ implementation doesn't accept a copy keyword, so passing copy=False failed. __array__ must implement 'dtype' and 'copy' keyword arguments. To learn more, see the migration guide https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

Out[13]:

np.float64(0.5812176442146302)

lexical similarity¶

In [14]:

from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn
text=[reference,candidate]
# Vectorize the text data
vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(text)
# Compute cosine similarity between each pair of entries
cos_sim_matrix = cosine_similarity_sklearn(tfidf_matrix)
cos_sim_matrix

Out[14]:

array([[1.        , 0.70703788],
       [0.70703788, 1.        ]])

In [14]:

tfidf_matrix.shape

Out[14]:

(2, 278)

In [15]:

cos_sim_lexical = cosine_similarity(tfidf_matrix[0].toarray().flatten(),tfidf_matrix[1].toarray().flatten())
cos_sim_lexical

Out[15]:

np.float64(0.7070378833564678)

semantic similarity embeddings¶

In [15]:

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode([reference, candidate],output_value="sentence_embedding")# comando per estrarre un embedding solo media di tutti gli embedding delle parole
                                                                                   # "token_embeddings" comando per estrarre un embedding per ogni parola
embeddings.shape

Out[15]:

(2, 768)

In [32]:

# per capire se usa default prompt_name per differenziare i task come modelli avanzati come gemma
print(model.default_prompt_name )

None

In [16]:

cos_sim_matrix_emb = cosine_similarity_sklearn(embeddings)
cos_sim_matrix_emb

Out[16]:

array([[0.9999995 , 0.82111526],
       [0.82111526, 1.        ]], dtype=float32)

In [17]:

cos_sim_emb = cosine_similarity(embeddings[0],embeddings[1])
cos_sim_emb

Out[17]:

np.float32(0.8211156)

In [18]:

# USING MODEL SIMILARITY METHOD
similarities = model.similarity(embeddings, embeddings)
similarities,model.similarity_fn_name

Out[18]:

(tensor([[1.0000, 0.8211],
         [0.8211, 1.0000]]),
 'cosine')

sparse encoder¶

In [20]:

from sentence_transformers import SparseEncoder

# 1. Load a pretrained SparseEncoder model
model = SparseEncoder("naver/splade-cocondenser-ensembledistil")
model.similarity_fn_name = "cosine" # set similarity function to cosine default is dot product

# The sentences to encode
sentences = [reference, candidate]

# 2. Calculate sparse embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [#, 30522] - sparse representation with vocabulary size dimensions

# 3. Calculate the embedding similarities (using dot product by default)
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor()

# 4. Check sparsity statistics
stats = SparseEncoder.sparsity(embeddings)
print(f"Sparsity: {stats['sparsity_ratio']:.2%}")  # Typically >99% zeros
print(f"Avg non-zero dimensions per embedding: {stats['active_dims']:.2f}")

torch.Size([2, 30522])
tensor([[1.0000, 0.6181],
        [0.6181, 1.0000]])
Sparsity: 99.62%
Avg non-zero dimensions per embedding: 116.50

2) Correlation analysis between LLM assessment response pairs¶

In [21]:

import numpy as np
import matplotlib.pyplot as plt

In [22]:

list1=df_mllm_response_parsed["original_alt_text_assessment"].astype(int).tolist()
list2=df_mllm_response_parsed_1["original_alt_text_assessment"].astype(int).tolist()
list1,list2

Out[22]:

([4, 4, 4, 4, 3, 3, 2, 4, 4, 4], [5, 3, 3, 3, 4, 2, 2, 3, 4, 4])

In [23]:

plt.scatter(list1, list2)
plt.plot(
    np.unique(list1),
    np.poly1d(np.polyfit(list1, list2, 1))(np.unique(list1))

)
plt.xlabel("gemma4b")
plt.ylabel("gtp-4o")
plt.show()

Correlation Coefficients¶

In [24]:

import pandas as pd
from scipy.stats import spearmanr, kendalltau

pearson_correlation = np.corrcoef(list1, list2)[0, 1]
spearman_correlation, _ = spearmanr(list1, list2)
kendall_tau_correlation, _ = kendalltau(list1, list2)

correlation_table = pd.DataFrame({
    "Pearson": [pearson_correlation],
    "Spearman": [spearman_correlation],
    "Kendall Tau": [kendall_tau_correlation]
}, index=['Results'])

correlation_table

Out[24]:

	Pearson	Spearman	Kendall Tau
Results	0.53602	0.48319	0.422944

75 KiB Raw Blame History Unescape Escape