import numpy as np
import torch
from bert_score import score
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertModel, BertTokenizer


def cosine_similarity(a, b):
    # Cosine similarity between two 1-D vectors.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def semantic_similarity(text1, text2):
    # Mean-pool BERT's last hidden states into one embedding per text,
    # then compare the two embeddings with cosine similarity.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")
    # Truncate so inputs longer than BERT's 512-token limit do not raise an error.
    inputs1 = tokenizer(text1, return_tensors="pt", truncation=True)
    inputs2 = tokenizer(text2, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)
    embedding1 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()
    return cosine_similarity(embedding1, embedding2)


def semantic_similarity_sentence_transformer(text1, text2):
    # Purpose-built for sentence embeddings.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode([text1, text2], output_value="sentence_embedding")
    return cosine_similarity(embeddings[0], embeddings[1])


def lexical_similarity(text1, text2):
    # Surface-level similarity: cosine over character 1- to 3-gram TF-IDF vectors.
    vectorizer = TfidfVectorizer(stop_words=None, analyzer="char", ngram_range=(1, 3))
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    vec1 = tfidf_matrix.toarray()[0]
    vec2 = tfidf_matrix.toarray()[1]
    return cosine_similarity(vec1, vec2)


def bert_score_similarity(texts1, texts2, batch=False):
    # BERTScore over lists of candidate and reference texts; returns F1 only.
    P, R, F1 = score(
        texts1,
        texts2,
        lang="en",
        verbose=False,
        model_type="bert-base-uncased",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        batch_size=32,
    )
    return F1.tolist() if batch else F1.item()
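

# Illustrative usage sketch, not part of the original listing: the example
# sentences and printed labels below are assumptions added for demonstration.
# Each similarity function takes plain Python strings, except the BERTScore
# helper, which expects lists of candidates and references.
if __name__ == "__main__":
    a = "The cat sat on the mat."
    b = "A cat was sitting on a rug."

    print("BERT mean-pooled cosine:", semantic_similarity(a, b))
    print("SentenceTransformer cosine:", semantic_similarity_sentence_transformer(a, b))
    print("Char n-gram TF-IDF cosine:", lexical_similarity(a, b))
    # With batch=False, a single F1 score is returned for the one pair.
    print("BERTScore F1:", bert_score_similarity([a], [b]))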