wcag_AI_validation/scripts/utils.py

import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import torch
from bert_score import score


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def semantic_similarity(text1, text2):
    # Mean-pool BERT token embeddings into one vector per text, then
    # compare the two vectors with cosine similarity.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")
    inputs1 = tokenizer(text1, return_tensors="pt")
    inputs2 = tokenizer(text2, return_tensors="pt")
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)
    embedding1 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()
    return cosine_similarity(embedding1, embedding2)


def semantic_similarity_sentence_transformer(text1, text2):
    # Purpose-built for sentence embeddings
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode([text1, text2], output_value="sentence_embedding")
    return cosine_similarity(embeddings[0], embeddings[1])


def lexical_similarity(text1, text2):
    # TF-IDF over character 1-3 grams measures surface-level (lexical)
    # overlap, with no semantic modelling involved.
    vectorizer = TfidfVectorizer(stop_words=None, analyzer="char", ngram_range=(1, 3))
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    vec1 = tfidf_matrix.toarray()[0]
    vec2 = tfidf_matrix.toarray()[1]
    return cosine_similarity(vec1, vec2)


def bert_score_similarity(texts1, texts2, batch=False):
    # BERTScore F1 between candidate and reference texts; returns a list
    # of scores when batch=True, otherwise a single float.
    P, R, F1 = score(
        texts1,
        texts2,
        lang="en",
        verbose=False,
        model_type="bert-base-uncased",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        batch_size=32,
    )
    return F1.tolist() if batch else F1.item()
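

if __name__ == "__main__":
    # Minimal usage sketch: the strings below are illustrative placeholders,
    # and the first run will download the pretrained models from Hugging Face.
    a = "The submit button lacks an accessible name."
    b = "The form's submit control has no accessible label."

    print("BERT mean-pooled cosine:", semantic_similarity(a, b))
    print("SentenceTransformer cosine:", semantic_similarity_sentence_transformer(a, b))
    print("Char n-gram TF-IDF cosine:", lexical_similarity(a, b))
    print("BERTScore F1:", bert_score_similarity([a], [b]))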