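"""Text-similarity utilities: lexical (TF-IDF), semantic (BERT mean pooling and
Sentence-Transformers), and BERTScore-based metrics."""
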
import re

import numpy as np
import torch
from bert_score import score
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertModel, BertTokenizer


def preprocess_text(text):
    """Lowercase, strip punctuation, and collapse whitespace."""
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return text


def cosine_similarity(a, b):
    """Cosine similarity between two 1-D vectors."""
    return np.dot(a, b) / (
        np.linalg.norm(a) * np.linalg.norm(b) + 1e-10
    )  # Epsilon in the denominator for numerical stability


def semantic_similarity(text1, text2):
    """Cosine similarity of mean-pooled BERT token embeddings."""
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    # No preprocessing: the neural models are trained to handle natural text variations.
    # Truncate to the model's maximum input length to avoid errors on long texts.
    inputs1 = tokenizer(text1, return_tensors="pt", truncation=True)
    inputs2 = tokenizer(text2, return_tensors="pt", truncation=True)

    model.eval()
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    # Use the average of all token embeddings as the sentence representation
    embedding1 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()

    return cosine_similarity(embedding1, embedding2)


def semantic_similarity_sentence_transformer(text1, text2):
    """Cosine similarity of Sentence-Transformers sentence embeddings."""
    # Handle empty strings explicitly
    if not text1.strip() or not text2.strip():
        return 0.0

    # Purpose-built for sentence embeddings; no preprocessing, since the neural
    # models are trained to handle natural text variations
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(
        [text1, text2],
        # "sentence_embedding" produces one representation per sentence
        # (the average of the token embeddings)
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )
    return cosine_similarity(embeddings[0], embeddings[1])


def extract_semantic_representation(text):
    """Return the Sentence-Transformers embedding for a single text."""
    # Handle empty strings explicitly
    if not text.strip():
        return 0.0

    # Purpose-built for sentence embeddings; no preprocessing, since the neural
    # models are trained to handle natural text variations
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(
        [text],
        # "sentence_embedding" produces one representation per sentence
        # (the average of the token embeddings)
        output_value="sentence_embedding",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )
    return embeddings


def lexical_similarity(text1, text2):
    """TF-IDF cosine similarity over word unigrams."""
    # vectorizer = TfidfVectorizer(stop_words=None, analyzer="char", ngram_range=(1, 3))
    vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1))
    text1 = preprocess_text(text1)  # Only the lexical metric needs preprocessing
    text2 = preprocess_text(text2)
    # Guard against empty input, which would leave TfidfVectorizer with an empty vocabulary
    if not text1 or not text2:
        return 0.0
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    vec1 = tfidf_matrix.toarray()[0]
    vec2 = tfidf_matrix.toarray()[1]
    return cosine_similarity(vec1, vec2)


def bert_score_similarity(texts1, texts2, batch=False):
    """BERTScore F1 between parallel lists of candidate and reference texts."""
    # No preprocessing: the neural models are trained to handle natural text variations
    P, R, F1 = score(
        texts1,
        texts2,
        lang="en",
        verbose=False,
        model_type="bert-base-uncased",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        batch_size=32,
    )
    return F1.tolist() if batch else F1.item()
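

# Illustrative usage sketch only; the sample sentences below are assumptions,
# not part of the original project. All models are downloaded on first run.
if __name__ == "__main__":
    s1 = "The cat sat on the mat."
    s2 = "A cat was sitting on the mat."

    print("Lexical (TF-IDF):", lexical_similarity(s1, s2))
    print("Semantic (BERT mean pooling):", semantic_similarity(s1, s2))
    print(
        "Semantic (Sentence-Transformers):",
        semantic_similarity_sentence_transformer(s1, s2),
    )
    print("BERTScore F1:", bert_score_similarity([s1], [s2]))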