from glob import glob
from os.path import join

import numpy as np
import pandas as pd
import quapy.functional as F


# NOTE(review): none of the constants below is used by the code visible in
# this file; the descriptions are inferred from names only -- confirm against
# the callers.
Ks = [50, 100, 500, 1000]  # presumably rank cut-offs (top-k values)

CLASS_NAMES = ['continent', 'gender', 'years_category']
# other candidates: ['relative_pageviews_category', 'num_sitelinks_category']

DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']

# Maps a class name to one of its values; presumably the value treated as the
# positive class when labels are binarized -- TODO confirm.
protected_group = {
    'gender': 'Female',
    'continent': 'Africa',
    'years_category': 'Pre-1900s',
}


def load_sample(path, class_name):
    """
    Loads a sample json as a dataframe and returns text and labels for
    the given class_name

    :param path: path to a json file
    :param class_name: string representing the target class
    :return: texts, labels for class_name
    """
    df = pd.read_json(path)
    text = df.text.values
    labels = df[class_name].values
    return text, labels


def binarize_labels(labels, positive_class=None):
    """
    Converts labels into a binary 0/1 integer array.

    The input is never modified: the result is built from a boolean mask
    rather than by overwriting entries of `labels` (which may be a view on
    DataFrame data, or a read-only array).

    :param labels: array-like of labels
    :param positive_class: the label value to be mapped to 1 (every other
        value is mapped to 0); if None, `labels` is returned untouched
    :return: an integer array with 1 where the label equals `positive_class`
        and 0 elsewhere, or `labels` itself if `positive_class` is None
    """
    if positive_class is None:
        return labels
    is_positive = np.asarray(labels) == positive_class
    return is_positive.astype(int)


class RetrievedSamples:
    """
    Holds the locations and settings needed to work with retrieved samples.

    The two JSON files (test rankings and test query prevalences) are read
    into DataFrames at construction time; everything else is simply stored.
    """

    def __init__(self,
                 class_home: str,
                 test_rankings_path: str,
                 test_query_prevs_path: str,
                 vectorizer,
                 class_name,
                 positive_class=None,
                 classes=None,
                 ):
        """
        :param class_home: stored as-is (presumably the directory holding the
            per-query files -- TODO confirm)
        :param test_rankings_path: path to a json file, loaded with
            `pd.read_json` into `self.test_rankings_df`
        :param test_query_prevs_path: path to a json file, loaded with
            `pd.read_json` into `self.test_query_prevs_df`
        :param vectorizer: stored as-is
        :param class_name: string representing the target class
        :param positive_class: stored as-is (default None)
        :param classes: stored as-is (default None)
        """
        self.class_home = class_home
        self.test_rankings_df = pd.read_json(test_rankings_path)
        self.test_query_prevs_df = pd.read_json(test_query_prevs_path)
        self.vectorizer = vectorizer
        self.class_name = class_name
        self.positive_class = positive_class
        self.classes = classes

    def get_text_label_score(self, df, filter_rank=1000):
        """
        NOTE(review): the body of this method is missing from the reviewed
        copy of the file. Only the fragment ``df = df[df['rank']`` survives
        (presumably the start of a filter of `df` by rank against
        `filter_rank`). The method has deliberately not been reconstructed by
        guesswork; restore the original body from version control.

        :param df: a dataframe with (at least) a 'rank' column
        :param filter_rank: rank threshold (default 1000)
        """
        raise NotImplementedError(
            'get_text_label_score: the method body was lost in the reviewed '
            'copy of this file and must be restored from version control'
        )

    # NOTE(review): any definitions that originally followed
    # `get_text_label_score` (including `_list_queries`, which `total` calls)
    # are missing from the reviewed copy and must be restored as well.
    #
    # The lines below are the surviving tail of a commented-out block whose
    # beginning is missing too; they are kept verbatim.
    #
    # ... 0 and len(texts) > max_lines:
    #     ranks = sel_df.rank.values
    #     idx = np.argsort(ranks)[:max_lines]
    #     texts = np.asarray(texts)[idx]
    #     labels = np.asarray(labels)[idx]
    # return texts, labels

    def total(self):
        """
        :return: the number of items returned by `self._list_queries()`
        """
        # NOTE(review): `_list_queries` is not defined in the visible part of
        # the file (see the note above).
        return len(self._list_queries())

    def _get_query_id_from_path(self, path):
        """
        Extracts the integer query id embedded in a path, i.e., the text
        between 'training_Query-' and the first 'Sample-200SPLIT'; e.g.,
        'dir/training_Query-12Sample-200SPLIT.json' -> 12

        :param path: string containing both markers
        :return: the query id, as an int
        :raises ValueError: if a marker is missing or the text between the
            markers is not an integer
        """
        prefix = 'training_Query-'
        suffix = 'Sample-200SPLIT'
        end = path.index(suffix)
        # the prefix is searched only in the text preceding the suffix
        start = path.index(prefix, 0, end) + len(prefix)
        return int(path[start:end])