wcag_AI_validation/scripts/manage_mllm_response.ipynb

75 KiB
Raw Blame History

1) semantic, sparse, bertscore, lexical similarity between LLM on new_alt_text proposal

In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


#funziona meglio che sentence_embeddings perchè il tokenizer skippa token speciali
def bert_score(reference, candidate, return_similarity_matrix=False):
    
    # Load the BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")
    
    # Tokenize the input text
    ref_tokens = tokenizer(reference, return_tensors="pt", add_special_tokens=False)#add_special_tokens=False mi evita codifica dei caratteri speciali
    can_tokens = tokenizer(candidate, return_tensors="pt", add_special_tokens=False)
    print("ref_tokens:",ref_tokens)
    print("can_tokens:",can_tokens)

    # Get the BERT embeddings
    model.eval()
    with torch.no_grad():
        ref_outputs = model(**ref_tokens)
        ref_embeddings = ref_outputs.last_hidden_state[0]

        can_outputs = model(**can_tokens)
        can_embeddings = can_outputs.last_hidden_state[0]
        #print("can_embeddings:",can_embeddings,can_embeddings.shape)
        
    # Compute cosine similarities
    cosine_similarities = np.zeros((can_embeddings.shape[0], ref_embeddings.shape[0]))
    for i, c in enumerate(can_embeddings):
        for j, r in enumerate(ref_embeddings):
            cosine_similarities[i, j] = cosine_similarity(c, r)
    

    # Align cosine similarities
    max_similarities = cosine_similarities.max(axis=1)

    # Average similarity scores
    bertscore = max_similarities.mean()

    if return_similarity_matrix:
        return bertscore, cosine_similarities
    else:
        return bertscore
c:\Users\nicola\anaconda3\envs\accessibility\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text
In [3]:
#just a test
bertscore, sim_matrix=bert_score("dog", "zebra",return_similarity_matrix=True)
print(bertscore)
print(sim_matrix)
print(sim_matrix.shape)
ref_tokens: {'input_ids': tensor([[3899]]), 'token_type_ids': tensor([[0]]), 'attention_mask': tensor([[1]])}
can_tokens: {'input_ids': tensor([[29145]]), 'token_type_ids': tensor([[0]]), 'attention_mask': tensor([[1]])}
0.4610232412815094
[[0.46102324]]
(1, 1)
C:\Users\nicola\AppData\Local\Temp\ipykernel_20916\1344219625.py:6: DeprecationWarning: __array__ implementation doesn't accept a copy keyword, so passing copy=False failed. __array__ must implement 'dtype' and 'copy' keyword arguments. To learn more, see the migration guide https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
In [4]:
import pandas as pd
In [5]:
df_mllm_response=pd.read_json("C:\cartella_condivisa\MachineLearning\HIISlab\\accessibility\\notebook_miei\LLM_accessibility_validator\outputs\https_www.bbc.com_gemma3-4b_2025_11_23-09_45_08\mllm_alttext_assessments.json")
df_mllm_response
Out[5]:
image_url alt_text mllm_response
0 https://ichef.bbci.co.uk/news/480/cpsprodpb/fd... A shot through a window of a wasteland in Pokr... {'original_alt_text_assessment': '4', 'assessm...
1 https://ichef.bbci.co.uk/news/480/cpsprodpb/72... A man in a Caracas market speaks while holding... {'original_alt_text_assessment': '4', 'assessm...
2 https://ichef.bbci.co.uk/ace/standard/480/cpsp... A man stands in a pile of rubble with a damage... {'original_alt_text_assessment': '4', 'assessm...
3 https://ichef.bbci.co.uk/images/ic/480x270/p0m... A collage showing a man in a suit with a newsp... {'original_alt_text_assessment': '4', 'assessm...
4 https://ichef.bbci.co.uk/news/480/cpsprodpb/2b... A composite image of Rupert Grint as an adult,... {'original_alt_text_assessment': '3', 'assessm...
5 https://ichef.bbci.co.uk/images/ic/480x270/p0m... The Global Story, The Global Story, Is there a... {'original_alt_text_assessment': '3', 'assessm...
6 https://ichef.bbci.co.uk/news/1024/cpsprodpb/3... The pink facade of the Saudades art deco-style... {'original_alt_text_assessment': '2', 'assessm...
7 https://ichef.bbci.co.uk/news/480/cpsprodpb/66... Lando Norris speaking to Max Verstappen after ... {'original_alt_text_assessment': '4', 'assessm...
8 https://ichef.bbci.co.uk/news/480/cpsprodpb/10... The Northern Lights snake across the curvature... {'original_alt_text_assessment': '4', 'assessm...
9 https://ichef.bbci.co.uk/news/480/cpsprodpb/e1... A woman cries as she holds her injured child, ... {'original_alt_text_assessment': '4', 'assessm...
In [6]:
df_mllm_response.iloc[0]['mllm_response']
Out[6]:
{'original_alt_text_assessment': '4',
 'assessment': 'failure',
 'evaluation_result': "The alt-text accurately describes the image content but lacks context. While 'A shot through a window of a wasteland in Pokrovsk' is technically correct, it doesn't convey the significance of the image within the article. The image depicts the devastation caused by the conflict in Pokrovsk, highlighting the ongoing war and the challenges faced by Ukrainian troops. The alt-text should reflect this broader context. Its a descriptive caption, not an informative alt-text.",
 'new_alt_text': 'Devastation in Pokrovsk, Ukraine - a city under siege during the ongoing conflict.'}
In [7]:
def apply_parser_to_dataframe(df, column_name='mllm_response'):
    """
    Apply the MLLM response parser to a dataframe column and expand results into separate columns.
    
    Args:
        df (pd.DataFrame): The input dataframe
        column_name (str): Name of the column containing MLLM responses
        
    Returns:
        pd.DataFrame: Dataframe with additional columns for parsed attributes
    """
    # Apply the parsing function to each row
    #parsed_results = df[column_name].apply(parse_mllm_alt_text_response) #skipped because already parsed
    parsed_results = df[column_name]
    
    # Convert the list of dictionaries to a DataFrame
    parsed_df = pd.DataFrame(parsed_results.tolist())
    
    # Concatenate with the original dataframe
    result_df = pd.concat([df, parsed_df], axis=1)
    
    return result_df
In [8]:
df_mllm_response_parsed=apply_parser_to_dataframe(df_mllm_response)
df_mllm_response_parsed
Out[8]:
image_url alt_text mllm_response original_alt_text_assessment assessment evaluation_result new_alt_text
0 https://ichef.bbci.co.uk/news/480/cpsprodpb/fd... A shot through a window of a wasteland in Pokr... {'original_alt_text_assessment': '4', 'assessm... 4 failure The alt-text accurately describes the image co... Devastation in Pokrovsk, Ukraine - a city unde...
1 https://ichef.bbci.co.uk/news/480/cpsprodpb/72... A man in a Caracas market speaks while holding... {'original_alt_text_assessment': '4', 'assessm... 4 failure The alt-text accurately describes the image co... Man in Caracas market speaks about rising food...
2 https://ichef.bbci.co.uk/ace/standard/480/cpsp... A man stands in a pile of rubble with a damage... {'original_alt_text_assessment': '4', 'assessm... 4 failure The alt-text describes the scene accurately bu... Ukraine: Man surveys damage after missile stri...
3 https://ichef.bbci.co.uk/images/ic/480x270/p0m... A collage showing a man in a suit with a newsp... {'original_alt_text_assessment': '4', 'assessm... 4 failure The alt-text is not appropriate. The image dep... Satirical depiction of information overload an...
4 https://ichef.bbci.co.uk/news/480/cpsprodpb/2b... A composite image of Rupert Grint as an adult,... {'original_alt_text_assessment': '3', 'assessm... 3 failure The alt-text A composite image of Rupert Grin... Rupert Grint: Ron Weasley adult and child
5 https://ichef.bbci.co.uk/images/ic/480x270/p0m... The Global Story, The Global Story, Is there a... {'original_alt_text_assessment': '3', 'assessm... 3 failure The alt-text is overly verbose and doesn't acc... Protest demonstration.
6 https://ichef.bbci.co.uk/news/1024/cpsprodpb/3... The pink facade of the Saudades art deco-style... {'original_alt_text_assessment': '2', 'assessm... 2 failure The alt-text is inappropriate. The image depic... Art Deco building facade in Mumbai, India.
7 https://ichef.bbci.co.uk/news/480/cpsprodpb/66... Lando Norris speaking to Max Verstappen after ... {'original_alt_text_assessment': '4', 'assessm... 4 failure The alt-text is overly specific and doesn't ac... Lando Norris speaking to Max Verstappen after ...
8 https://ichef.bbci.co.uk/news/480/cpsprodpb/10... The Northern Lights snake across the curvature... {'original_alt_text_assessment': '4', 'assessm... 4 failure The alt-text is not appropriate. While it desc... NASA astronaut's footage of the Northern Light...
9 https://ichef.bbci.co.uk/news/480/cpsprodpb/e1... A woman cries as she holds her injured child, ... {'original_alt_text_assessment': '4', 'assessm... 4 failure The alt-text is overly descriptive and verbose... Israeli strikes in Gaza a woman holds her cr...
In [9]:
df_mllm_response_1=pd.read_json("C:\cartella_condivisa\MachineLearning\HIISlab\\accessibility\\notebook_miei\LLM_accessibility_validator\outputs\https_www.bbc.com_gpt-4o_2025_11_23-09_47_49\mllm_alttext_assessments.json")
df_mllm_response_1
Out[9]:
image_url alt_text mllm_response
0 https://ichef.bbci.co.uk/news/480/cpsprodpb/fd... A shot through a window of a wasteland in Pokr... {'original_alt_text_assessment': '5', 'assessm...
1 https://ichef.bbci.co.uk/news/480/cpsprodpb/72... A man in a Caracas market speaks while holding... {'original_alt_text_assessment': '3', 'assessm...
2 https://ichef.bbci.co.uk/ace/standard/480/cpsp... A man stands in a pile of rubble with a damage... {'original_alt_text_assessment': '3', 'assessm...
3 https://ichef.bbci.co.uk/images/ic/480x270/p0m... A collage showing a man in a suit with a newsp... {'original_alt_text_assessment': '3', 'assessm...
4 https://ichef.bbci.co.uk/news/480/cpsprodpb/2b... A composite image of Rupert Grint as an adult,... {'original_alt_text_assessment': '4', 'assessm...
5 https://ichef.bbci.co.uk/images/ic/480x270/p0m... The Global Story, The Global Story, Is there a... {'original_alt_text_assessment': '2', 'assessm...
6 https://ichef.bbci.co.uk/news/1024/cpsprodpb/3... The pink facade of the Saudades art deco-style... {'original_alt_text_assessment': '2', 'assessm...
7 https://ichef.bbci.co.uk/news/480/cpsprodpb/66... Lando Norris speaking to Max Verstappen after ... {'original_alt_text_assessment': '3', 'assessm...
8 https://ichef.bbci.co.uk/news/480/cpsprodpb/10... The Northern Lights snake across the curvature... {'original_alt_text_assessment': '4', 'assessm...
9 https://ichef.bbci.co.uk/news/480/cpsprodpb/e1... A woman cries as she holds her injured child, ... {'original_alt_text_assessment': '4', 'assessm...
In [10]:
df_mllm_response_parsed_1=apply_parser_to_dataframe(df_mllm_response_1)
df_mllm_response_parsed_1
Out[10]:
image_url alt_text mllm_response original_alt_text_assessment assessment evaluation_result new_alt_text
0 https://ichef.bbci.co.uk/news/480/cpsprodpb/fd... A shot through a window of a wasteland in Pokr... {'original_alt_text_assessment': '5', 'assessm... 5 success The alt-text is appropriate as it provides a d... Damaged homes and wasteland in Pokrovsk, Ukrai...
1 https://ichef.bbci.co.uk/news/480/cpsprodpb/72... A man in a Caracas market speaks while holding... {'original_alt_text_assessment': '3', 'assessm... 3 warning The alt-text partially describes the image but... Man in Caracas market holding coffee, represen...
2 https://ichef.bbci.co.uk/ace/standard/480/cpsp... A man stands in a pile of rubble with a damage... {'original_alt_text_assessment': '3', 'assessm... 3 warning The original alt-text provides descriptive det... Destruction in Ukraine: rubble, burnt car, and...
3 https://ichef.bbci.co.uk/images/ic/480x270/p0m... A collage showing a man in a suit with a newsp... {'original_alt_text_assessment': '3', 'assessm... 3 warning The alt-text describes the image but doesn't c... Metaphor for restricted words on social media,...
4 https://ichef.bbci.co.uk/news/480/cpsprodpb/2b... A composite image of Rupert Grint as an adult,... {'original_alt_text_assessment': '4', 'assessm... 4 success The alt-text appropriately identifies the imag... Rupert Grint as an adult and child, reflecting...
5 https://ichef.bbci.co.uk/images/ic/480x270/p0m... The Global Story, The Global Story, Is there a... {'original_alt_text_assessment': '2', 'assessm... 2 failure The original alt-text does not directly descri... Protesters holding 'Trump Help!' and 'Refuge P...
6 https://ichef.bbci.co.uk/news/1024/cpsprodpb/3... The pink facade of the Saudades art deco-style... {'original_alt_text_assessment': '2', 'assessm... 2 failure The alt-text is inaccurate and does not match ... Yellow art deco-style building in Mumbai's Ban...
7 https://ichef.bbci.co.uk/news/480/cpsprodpb/66... Lando Norris speaking to Max Verstappen after ... {'original_alt_text_assessment': '3', 'assessm... 3 warning The alt-text describes the image content direc... Lando Norris discussing post-race matters afte...
8 https://ichef.bbci.co.uk/news/480/cpsprodpb/10... The Northern Lights snake across the curvature... {'original_alt_text_assessment': '4', 'assessm... 4 success The alt-text describes the image content effec... Northern Lights over Earths curvature filmed ...
9 https://ichef.bbci.co.uk/news/480/cpsprodpb/e1... A woman cries as she holds her injured child, ... {'original_alt_text_assessment': '4', 'assessm... 4 success The alt-text appropriately describes the emoti... A crying mother holds her injured child in a G...
In [12]:
reference=df_mllm_response_parsed["new_alt_text"].iloc[0]
candidate=df_mllm_response_parsed_1["new_alt_text"].iloc[0]
reference,candidate
Out[12]:
('Devastation in Pokrovsk, Ukraine - a city under siege during the ongoing conflict.',
 "Damaged homes and wasteland in Pokrovsk, Ukraine with smoke rising, highlighting war's impact on the city.")

semantic similarity (bertscore)

In [13]:
bertscore =bert_score(reference, candidate,return_similarity_matrix=False)
bertscore
ref_tokens: {'input_ids': tensor([[25594,  1999, 13433, 21638,  4492,  6711,  1010,  5924,  1011,  1037,
          2103,  2104,  6859,  2076,  1996,  7552,  4736,  1012]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
can_tokens: {'input_ids': tensor([[ 5591,  5014,  1998,  5949,  3122,  1999, 13433, 21638,  4492,  6711,
          1010,  5924,  2007,  5610,  4803,  1010, 20655,  2162,  1005,  1055,
          4254,  2006,  1996,  2103,  1012]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}
C:\Users\nicola\AppData\Local\Temp\ipykernel_20916\1344219625.py:6: DeprecationWarning: __array__ implementation doesn't accept a copy keyword, so passing copy=False failed. __array__ must implement 'dtype' and 'copy' keyword arguments. To learn more, see the migration guide https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
Out[13]:
np.float64(0.5812176442146302)

lexical similarity

In [14]:
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn
text=[reference,candidate]
# Vectorize the text data
vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(text)
# Compute cosine similarity between each pair of entries
cos_sim_matrix = cosine_similarity_sklearn(tfidf_matrix)
cos_sim_matrix
Out[14]:
array([[1.        , 0.70703788],
       [0.70703788, 1.        ]])
In [14]:
tfidf_matrix.shape
Out[14]:
(2, 278)
In [15]:
cos_sim_lexical = cosine_similarity(tfidf_matrix[0].toarray().flatten(),tfidf_matrix[1].toarray().flatten())
cos_sim_lexical
Out[15]:
np.float64(0.7070378833564678)

semantic similarity embeddings

In [15]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode([reference, candidate],output_value="sentence_embedding")# comando per estrarre un embedding solo media di tutti gli embedding delle parole
                                                                                   # "token_embeddings" comando per estrarre un embedding per ogni parola
embeddings.shape
Out[15]:
(2, 768)
In [32]:
# per capire se usa default prompt_name per differenziare i task come modelli avanzati come gemma
print(model.default_prompt_name )
None
In [16]:
cos_sim_matrix_emb = cosine_similarity_sklearn(embeddings)
cos_sim_matrix_emb
Out[16]:
array([[0.9999995 , 0.82111526],
       [0.82111526, 1.        ]], dtype=float32)
In [17]:
cos_sim_emb = cosine_similarity(embeddings[0],embeddings[1])
cos_sim_emb
Out[17]:
np.float32(0.8211156)
In [18]:
# USING MODEL SIMILARITY METHOD
similarities = model.similarity(embeddings, embeddings)
similarities,model.similarity_fn_name
Out[18]:
(tensor([[1.0000, 0.8211],
         [0.8211, 1.0000]]),
 'cosine')

sparse encoder

In [20]:
from sentence_transformers import SparseEncoder

# 1. Load a pretrained SparseEncoder model
model = SparseEncoder("naver/splade-cocondenser-ensembledistil")
model.similarity_fn_name = "cosine" # set similarity function to cosine default is dot product

# The sentences to encode
sentences = [reference, candidate]

# 2. Calculate sparse embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [#, 30522] - sparse representation with vocabulary size dimensions

# 3. Calculate the embedding similarities (using dot product by default)
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor()

# 4. Check sparsity statistics
stats = SparseEncoder.sparsity(embeddings)
print(f"Sparsity: {stats['sparsity_ratio']:.2%}")  # Typically >99% zeros
print(f"Avg non-zero dimensions per embedding: {stats['active_dims']:.2f}")
torch.Size([2, 30522])
tensor([[1.0000, 0.6181],
        [0.6181, 1.0000]])
Sparsity: 99.62%
Avg non-zero dimensions per embedding: 116.50

2) Correlation analysis between LLM assessment response pairs

In [21]:
import numpy as np
import matplotlib.pyplot as plt
In [22]:
list1=df_mllm_response_parsed["original_alt_text_assessment"].astype(int).tolist()
list2=df_mllm_response_parsed_1["original_alt_text_assessment"].astype(int).tolist()
list1,list2
Out[22]:
([4, 4, 4, 4, 3, 3, 2, 4, 4, 4], [5, 3, 3, 3, 4, 2, 2, 3, 4, 4])
In [23]:
plt.scatter(list1, list2)
plt.plot(
    np.unique(list1),
    np.poly1d(np.polyfit(list1, list2, 1))(np.unique(list1))

)
plt.xlabel("gemma4b")
plt.ylabel("gtp-4o")
plt.show()

Correlation Coefficients

In [24]:
import pandas as pd
from scipy.stats import spearmanr, kendalltau

pearson_correlation = np.corrcoef(list1, list2)[0, 1]
spearman_correlation, _ = spearmanr(list1, list2)
kendall_tau_correlation, _ = kendalltau(list1, list2)

correlation_table = pd.DataFrame({
    "Pearson": [pearson_correlation],
    "Spearman": [spearman_correlation],
    "Kendall Tau": [kendall_tau_correlation]
}, index=['Results'])

correlation_table
Out[24]:
Pearson Spearman Kendall Tau
Results 0.53602 0.48319 0.422944