import re

import numpy as np
import torch
from bert_score import score
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertModel, BertTokenizer


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return text


def cosine_similarity(a, b):
    # Epsilon in the denominator for numerical stability
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)


def semantic_similarity(text1, text2):
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0
    # Note: tokenizer and model are loaded on every call; cache them at module
    # level if this function is used in a loop.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")
    # No preprocessing: the neural model is trained to handle natural text variations.
    # Truncate to BERT's 512-token limit so long inputs do not raise an error.
    inputs1 = tokenizer(text1, return_tensors="pt", truncation=True, max_length=512)
    inputs2 = tokenizer(text2, return_tensors="pt", truncation=True, max_length=512)
    model.eval()
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)
    # Average of all token embeddings as the sentence representation
    embedding1 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()
    return cosine_similarity(embedding1, embedding2)


def semantic_similarity_sentence_transformer(text1, text2):
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0
    # Purpose-built for sentence embeddings.
    # No preprocessing: the neural model is trained to handle natural text variations.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # output_value="sentence_embedding" produces one pooled representation per
    # sentence (the average of its token embeddings).
    embeddings = model.encode(
        [text1, text2],
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )
    return cosine_similarity(embeddings[0], embeddings[1])


def extract_semantic_representation(text):
    # Handle empty strings explicitly; no embedding can be produced for empty input
    if not text.strip():
        return None
    # Purpose-built for sentence embeddings.
    # No preprocessing: the neural model is trained to handle natural text variations.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # output_value="sentence_embedding" produces one pooled representation per
    # sentence (the average of its token embeddings).
    embeddings = model.encode(
        [text],
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )
    return embeddings


def lexical_similarity(text1, text2):
    # vectorizer = TfidfVectorizer(stop_words=None, analyzer="char", ngram_range=(1, 3))
    vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1))
    text1 = preprocess_text(text1)  # Only the lexical measure needs preprocessing
    text2 = preprocess_text(text2)
    tfidf_matrix = vectorizer.fit_transform([text1, text2]).toarray()
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])


def bert_score_similarity(texts1, texts2, batch=False):
    # No preprocessing: the neural model is trained to handle natural text variations.
    P, R, F1 = score(
        texts1,
        texts2,
        lang="en",
        verbose=False,
        model_type="bert-base-uncased",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        batch_size=32,
    )
    return F1.tolist() if batch else F1.item()
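

# Usage sketch: one way the similarity helpers above might be compared on a pair
# of paraphrases. The example sentences are arbitrary placeholders, and running
# this downloads the pretrained models on first use.
if __name__ == "__main__":
    a = "The cat sat on the mat."
    b = "A cat was sitting on the mat."
    print("lexical (TF-IDF)     :", lexical_similarity(a, b))
    print("semantic (BERT mean) :", semantic_similarity(a, b))
    print("semantic (MiniLM)    :", semantic_similarity_sentence_transformer(a, b))
    print("BERTScore F1         :", bert_score_similarity([a], [b]))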