import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from bert_score import score


def cosine_similarity(a, b):
    # Cosine similarity between two 1-D vectors.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def semantic_similarity(text1, text2):
    # Semantic similarity: cosine similarity between mean-pooled BERT embeddings.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    inputs1 = tokenizer(text1, return_tensors='pt')
    inputs2 = tokenizer(text2, return_tensors='pt')

    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    # Mean-pool the last hidden states over the token dimension to get one
    # fixed-size vector per text.
    embedding1 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()

    return cosine_similarity(embedding1, embedding2)


def lexical_similarity(text1, text2):
    # Lexical (surface) similarity: cosine similarity between character 1-3 gram
    # TF-IDF vectors of the two texts.
    vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
    tfidf_matrix = vectorizer.fit_transform([text1, text2]).toarray()
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])


def bert_score_similarity(texts1, texts2):
    # BERTScore between parallel lists of candidate and reference texts.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    P, R, F1 = score(texts1, texts2, lang='en', verbose=False,
                     model_type='bert-base-uncased', device=device)
    # F1 holds one score per text pair; .item() only works for a single pair,
    # so return the mean F1 across all pairs.
    return F1.mean().item()
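

# Example usage: a minimal sketch of calling the three similarity functions
# above; the sample sentences are invented purely for illustration.
if __name__ == '__main__':
    sent_a = "The weather is lovely today."
    sent_b = "It is a beautiful, sunny day."

    print("Semantic similarity (BERT embeddings):", semantic_similarity(sent_a, sent_b))
    print("Lexical similarity (char n-gram TF-IDF):", lexical_similarity(sent_a, sent_b))
    print("BERTScore F1:", bert_score_similarity([sent_a], [sent_b]))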