import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from bert_score import score


def cosine_similarity(a, b):
    # Cosine similarity between two 1-D vectors.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def semantic_similarity(text1, text2):
    # Semantic similarity: mean-pooled BERT embeddings compared with cosine similarity.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    inputs1 = tokenizer(text1, return_tensors='pt')
    inputs2 = tokenizer(text2, return_tensors='pt')
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)
    # Mean-pool the last hidden states over the token dimension to get one vector per text.
    embedding1 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()
    return cosine_similarity(embedding1, embedding2)


def lexical_similarity(text1, text2):
    # Lexical (surface-form) similarity: TF-IDF over character 1-3 grams, compared with cosine similarity.
    vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    vec1 = tfidf_matrix.toarray()[0]
    vec2 = tfidf_matrix.toarray()[1]
    return cosine_similarity(vec1, vec2)


def bert_score_similarity(texts1, texts2):
    # BERTScore F1 between two lists of texts (candidates vs. references).
    P, R, F1 = score(
        texts1, texts2,
        lang='en',
        verbose=False,
        model_type='bert-base-uncased',
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    )
    # F1 holds one score per text pair; average so the function also
    # works when more than one pair is passed.
    return F1.mean().item()
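

# Usage sketch (illustrative addition, not part of the original module): the
# example strings below are assumptions, chosen only to show how the three
# similarity functions are called and what kind of scores they return.
if __name__ == '__main__':
    text_a = "The cat sat on the mat."
    text_b = "A cat was sitting on a rug."

    print("Semantic similarity:", semantic_similarity(text_a, text_b))
    print("Lexical similarity: ", lexical_similarity(text_a, text_b))
    # bert_score_similarity expects lists of candidates and references.
    print("BERTScore F1:       ", bert_score_similarity([text_a], [text_b]))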