"""Fuzzy linking of predicted person mentions to a pickled knowledge base.

Should also evaluate IF and HOW the actual spaCy KB could be deployed in this
scenario. See also:
https://github.com/seatgeek/fuzzywuzzy?source=post_page---------------------------
"""

# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
import pickle
from difflib import SequenceMatcher
from pprint import pprint


class Knowledge_base:
    """Look up predicted entity mentions in a knowledge base loaded from disk.

    ``self.kb`` is used as a mapping whose keys are the names compared against
    ``PER`` mentions.  Each value is subscripted with ``"birth"`` (printed on a
    match) and ``"names"`` (iterated as alternative names of the entry).
    """

    # NOTE(review): presumably the date of the Commedia, meant for a future
    # plausibility filter on birth years -- currently unused; confirm intent.
    COMMEDIA_DATE = 1321

    def __init__(self, kb_path):
        """Load the knowledge base from the pickle file at ``kb_path``.

        Args:
            kb_path: Path of the pickled knowledge base.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load knowledge bases that come from a trusted source.
        with open(kb_path, 'rb') as infile:
            self.kb = pickle.load(infile)
        # TODO: optionally precompute the alias index:
        # self.utt2ent = self._generate_utter_2_ent()

    def link_entities(self, preds, *, threshold=0.8):
        """Print the best knowledge-base match for every predicted person.

        Only predictions labelled ``'PER'`` are linked; every other label
        (including ``'WORK_OF_ART'``) is currently ignored.  Repeated mentions
        are linked once, in order of first appearance.

        Args:
            preds: Iterable of indexable ``(text, label, ...)`` predictions.
            threshold: A match is reported only when its similarity is
                strictly greater than this value.

        Returns:
            Always ``0``.
        """
        per_preds = [pred[0] for pred in preds if pred[1] == 'PER']
        print(f'Candidate authors (i.e., entitites matched): {per_preds}')
        print('#'*50 + '\nChecking in KB...')

        # TODO: in the author dict I should insert also the single name
        # (e.g., Tommaso --> Tommaso d'aquino)
        # TODO: also compare against the alternative names of the best entry
        # (see _check_other_lang); with a 0.8 threshold 'Tommaso' is not
        # linked to 'Tommaso d'aquino'.
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        for target in dict.fromkeys(per_preds):
            best = self._best_match(target)
            # Candidates below the best one cannot pass the threshold either,
            # so checking the single best score is sufficient.
            if best is not None and best[1] > threshold:
                print(f'Prediction: {target} - {best} - born in {self.kb[best[0]]["birth"]}')
        return 0

    def _best_match(self, target):
        """Return the ``(name, similarity)`` pair most similar to ``target``.

        Ties are resolved in favour of the entry that comes first in the
        knowledge base.  Returns ``None`` when the knowledge base is empty.
        """
        scored = ((auth, self._similar(target, auth)) for auth in self.kb)
        return max(scored, key=lambda pair: pair[1], default=None)

    def _generate_utter_2_ent(self) -> dict:
        """Build a mapping from every alternative name to its entry key.

        When two entries share an alternative name, the entry that comes
        later in the knowledge base wins.
        """
        return {
            utt: ent_en
            for ent_en, entry in self.kb.items()
            for utt in entry['names']
        }

    def _check_other_lang(self, target, original_name) -> list:
        """Score ``original_name`` against all alternative names of an entry.

        Args:
            target: A ``(name, similarity)`` pair; only the name at index 0 is
                used, as the knowledge-base key of the entry to inspect.
            original_name: The mention text to compare.

        Returns:
            ``(alternative_name, similarity)`` pairs, most similar first.
        """
        scores = [
            (name, self._similar(original_name, name))
            for name in self.kb[target[0]]['names']
        ]
        scores.sort(key=lambda tup: tup[1], reverse=True)
        return scores

    def _similar(self, a, b) -> float:
        """Return the ``difflib.SequenceMatcher`` ratio of ``a`` and ``b``."""
        return SequenceMatcher(None, a, b).ratio()