diff --git a/Retrieval/commons.py b/Retrieval/commons.py index 9100b22..b2007ea 100644 --- a/Retrieval/commons.py +++ b/Retrieval/commons.py @@ -3,9 +3,7 @@ import numpy as np from glob import glob from os.path import join -from quapy.data import LabelledCollection -from quapy.protocol import AbstractProtocol -import json +import quapy.functional as F def load_sample(path, class_name): @@ -23,67 +21,86 @@ def load_sample(path, class_name): return text, labels -def get_text_label_score(df, class_name, vectorizer=None, filter_classes=None): - text = df.text.values - labels = df[class_name].values - rel_score = df.score.values - - if filter_classes is not None: - idx = np.isin(labels, filter_classes) - text = text[idx] - labels = labels[idx] - rel_score = rel_score[idx] - - if vectorizer is not None: - text = vectorizer.transform(text) - - order = np.argsort(-rel_score) - return text[order], labels[order], rel_score[order] +def binarize_labels(labels, positive_class=None): + if positive_class is not None: + protected_labels = labels==positive_class + labels[protected_labels] = 1 + labels[~protected_labels] = 0 + labels = labels.astype(int) + return labels class RetrievedSamples: - def __init__(self, class_home: str, test_rankings_path: str, test_query_prevs_path: str, vectorizer, class_name, - classes=None + positive_class=None, + classes=None, ): self.class_home = class_home self.test_rankings_df = pd.read_json(test_rankings_path) self.test_query_prevs_df = pd.read_json(test_query_prevs_path) self.vectorizer = vectorizer self.class_name = class_name - self.classes=classes + self.positive_class = positive_class + self.classes = classes + def get_text_label_score(self, df): + class_name = self.class_name + vectorizer = self.vectorizer + filter_classes = self.classes + + text = df.text.values + labels = df[class_name].values + rel_score = df.score.values + + labels = binarize_labels(labels, self.positive_class) + + if filter_classes is not None: + idx = np.isin(labels, filter_classes) + text = text[idx] + labels = labels[idx] + rel_score = rel_score[idx] + + if vectorizer is not None: + text = vectorizer.transform(text) + + order = np.argsort(-rel_score) + return text[order], labels[order], rel_score[order] def __call__(self): tests_df = self.test_rankings_df class_name = self.class_name - vectorizer = self.vectorizer for file in self._list_queries(): - # print(file) - # loads the training sample train_df = pd.read_json(file) if len(train_df) == 0: print('empty dataframe: ', file) else: - Xtr, ytr, score_tr = get_text_label_score(train_df, class_name, vectorizer, filter_classes=self.classes) + Xtr, ytr, score_tr = self.get_text_label_score(train_df) # loads the test sample query_id = self._get_query_id_from_path(file) sel_df = tests_df[tests_df.qid == query_id] - Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes) + Xte, yte, score_te = self.get_text_label_score(sel_df) # gets the prevalence of all judged relevant documents for the query df = self.test_query_prevs_df q_rel_prevs = df.loc[df.id == query_id][class_name+'_proportions'].values[0] + if self.positive_class is not None: + if self.positive_class not in q_rel_prevs: + print(f'positive class {self.positive_class} not found in the query; skipping') + continue + q_rel_prevs = F.as_binary_prevalence(q_rel_prevs[self.positive_class]) + else: + q_rel_prevs = np.asarray([q_rel_prevs.get(class_i, 0.) 
for class_i in self.classes]) + yield (Xtr, ytr, score_tr), (Xte, yte, score_te), q_rel_prevs def _list_queries(self): diff --git a/Retrieval/experiments.py b/Retrieval/experiments.py index 74b912b..958c4c7 100644 --- a/Retrieval/experiments.py +++ b/Retrieval/experiments.py @@ -6,16 +6,20 @@ from pathlib import Path import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import GridSearchCV +from sklearn.metrics import confusion_matrix +from sklearn.model_selection import GridSearchCV, cross_val_predict +from sklearn.base import clone from sklearn.svm import LinearSVC from scipy.special import rel_entr as KLD import quapy as qp import quapy.functional as F -from Retrieval.commons import RetrievedSamples, load_sample +from Retrieval.commons import RetrievedSamples, load_sample, binarize_labels +from Retrieval.methods import M3rND_ModelB, M3rND_ModelD, AbstractM3rND from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML from quapy.data.base import LabelledCollection +from scipy.sparse import vstack from os.path import join from tqdm import tqdm @@ -50,21 +54,20 @@ To evaluate our approach, I have executed the queries on the test split. You can """ -def methods(classifier, class_name): +def methods(classifier, class_name, binarize=False): kde_param = { 'continent': 0.01, - 'gender': 0.005, + 'gender': 0.03, 'years_category':0.03 } - #yield ('Naive', Naive()) - #yield ('NaiveQuery', Naive()) + yield ('Naive', Naive()) + yield ('NaiveQuery', Naive()) yield ('CC', ClassifyAndCount(classifier)) # yield ('PCC', PCC(classifier)) # yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1)) - #yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1)) - # yield ('PACC-s', PACC(classifier, val_split=5, n_jobs=-1)) + yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1)) # yield ('EMQ', EMQ(classifier, exact_train_prev=True)) # yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt')) # yield ('EMQh', EMQ(classifier, exact_train_prev=False)) @@ -72,26 +75,16 @@ def methods(classifier, class_name): # yield ('EMQ-TS', EMQ(classifier, exact_train_prev=False, recalib='ts')) # yield ('EMQ-NBVS', EMQ(classifier, exact_train_prev=False, recalib='nbvs')) # yield ('EMQ-VS', EMQ(classifier, exact_train_prev=False, recalib='vs')) - # yield ('KDE001', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.001)) - # yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow! 
+ yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name])) # yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) - # yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02)) - # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03)) - # yield ('KDE-silver', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='silverman')) - # yield ('KDE-scott', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='scott')) - # yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name])) - # yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005)) - # yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) - # yield ('KDE01-s', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) - # yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02)) - # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03)) - # yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04)) - # yield ('KDE05', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.05)) - # yield ('KDE07', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.07)) - # yield ('KDE10', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.10)) + if binarize: + yield ('M3b', M3rND_ModelB(classifier)) + yield ('M3b+', M3rND_ModelB(classifier)) + yield ('M3d', M3rND_ModelD(classifier)) + yield ('M3d+', M3rND_ModelD(classifier)) -def train_classifier(train_path): +def train_classifier_fn(train_path): """ Trains a classifier. To do so, it loads the training set, transforms it into a tfidf representation. The classifier is Logistic Regression, with hyperparameters C (range [0.001, 0.01, ..., 1000]) and @@ -101,28 +94,36 @@ def train_classifier(train_path): """ texts, labels = load_sample(train_path, class_name=class_name) + if BINARIZE: + labels = binarize_labels(labels, positive_class=protected_group[class_name]) + tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3) Xtr = tfidf.fit_transform(texts) print(f'Xtr shape={Xtr.shape}') print('training classifier...', end='') classifier = LogisticRegression(max_iter=5000) - classifier = GridSearchCV( + modsel = GridSearchCV( classifier, param_grid={'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]}, n_jobs=-1, cv=5 ) - classifier.fit(Xtr, labels) - classifier = classifier.best_estimator_ - classifier_acc = classifier.best_score_ - print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score') + modsel.fit(Xtr, labels) + classifier = modsel.best_estimator_ + classifier_acc = modsel.best_score_ + best_params = modsel.best_params_ + print(f'[done] best-params={best_params} got {classifier_acc:.4f} score') + + print('generating cross-val predictions for M3') + predictions = cross_val_predict(clone(classifier), Xtr, labels, cv=10, n_jobs=-1, verbose=10) + conf_matrix = confusion_matrix(labels, predictions, labels=classifier.classes_) training = LabelledCollection(Xtr, labels) print('training classes:', training.classes_) print('training prevalence:', training.prevalence()) - return tfidf, classifier + return tfidf, classifier, conf_matrix def reduceAtK(data: LabelledCollection, k): @@ -140,12 +141,12 @@ def benchmark_name(class_name, k): def run_experiment(): + results = { 'mae': {k: [] for k in Ks}, 'mrae': {k: [] for k in Ks}, - 'Dkl_estim': [], - 'Dkl_true': [], - 'Dkl_error': [] + 'rKL_error': [], + 'rND_error': [] } pbar = 
tqdm(experiment_prot(), total=experiment_prot.total()) @@ -153,163 +154,159 @@ def run_experiment(): Xtr, ytr, score_tr = train Xte, yte, score_te = test - if HALF and not method_name.endswith('-s'): - n = len(ytr) // 2 - train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier_trained.classes_) - else: - train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_) + n = len(ytr) // 2 + train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier.classes_) - class_order = train_col.classes_ - q_rel_prevs = np.asarray([q_rel_prevs.get(k, 0.) for k in class_order]) - - # idx, max_score_round_robin = get_idx_score_matrix_per_class(train_col, score_tr) - - if method_name not in ['Naive', 'NaiveQuery'] and not method_name.endswith('-s'): - quantifier.fit(train_col, val_split=train_col, fit_classifier=False) + if method_name not in ['Naive', 'NaiveQuery', 'M3b', 'M3b+', 'M3d', 'M3d+']: + method.fit(train_col, val_split=train_col, fit_classifier=False) elif method_name == 'Naive': - quantifier.fit(train_col) + method.fit(train_col) - test_col = LabelledCollection(Xte, yte, classes=classifier_trained.classes_) - Dkl_estim = [] - Dkl_true = [] + test_col = LabelledCollection(Xte, yte, classes=classifier.classes_) + rKL_estim, rKL_true = [], [] + rND_estim, rND_true = [], [] for k in Ks: test_k = reduceAtK(test_col, k) if method_name == 'NaiveQuery': train_k = reduceAtK(train_col, k) - quantifier.fit(train_k) - # elif method_name.endswith('-s'): - # test_min_score = score_te[k] if k < len(score_te) else score_te[-1] - # train_k = reduce_train_at_score(train_col, idx, max_score_round_robin, test_min_score) - # print(f'{k=}, {test_min_score=} {len(train_k)=}') - # quantifier.fit(train_k, val_split=train_k, fit_classifier=False) + method.fit(train_k) - estim_prev = quantifier.quantify(test_k.instances) + estim_prev = method.quantify(test_k.instances) - eps=(1. / (2 * k)) - mae = qp.error.mae(test_k.prevalence(), estim_prev) - mrae = qp.error.mrae(test_k.prevalence(), estim_prev, eps=eps) - Dkl_at_k_estim = qp.error.kld(estim_prev, q_rel_prevs, eps=eps) - Dkl_at_k_true = qp.error.kld(test_k.prevalence(), q_rel_prevs, eps=eps) + # epsilon value for prevalence smoothing + eps=(1. / (2. * k)) + # error metrics + test_k_prev = test_k.prevalence() + mae = qp.error.mae(test_k_prev, estim_prev) + mrae = qp.error.mrae(test_k_prev, estim_prev, eps=eps) + rKL_at_k_estim = qp.error.kld(estim_prev, q_rel_prevs, eps=eps) + rKL_at_k_true = qp.error.kld(test_k_prev, q_rel_prevs, eps=eps) + + if BINARIZE: + # [1] is the index of the minority or historically disadvantaged group + rND_at_k_estim = np.abs(estim_prev[1] - q_rel_prevs[1]) + rND_at_k_true = np.abs(test_k_prev[1] - q_rel_prevs[1]) + + # collect results results['mae'][k].append(mae) results['mrae'][k].append(mrae) - Dkl_estim.append(Dkl_at_k_estim) - Dkl_true.append(Dkl_at_k_true) - - Z = 1 - Dkl_estim = (1/Z) * sum((1./np.log2(k)) * v for v in Dkl_estim) - Dkl_true = (1/Z) * sum((1./np.log2(k)) * v for v in Dkl_true) - Dkl_error = np.abs(Dkl_true-Dkl_estim) - #print(f'{Dkl_estim=}\t{Dkl_true=}\t{Dkl_error=}') + rKL_estim.append(rKL_at_k_estim) + rKL_true.append(rKL_at_k_true) + if BINARIZE: + rND_estim.append(rND_at_k_estim) + rND_true.append(rND_at_k_true) - results['Dkl_estim'].append(Dkl_estim) - results['Dkl_true'].append(Dkl_true) - results['Dkl_error'].append(Dkl_error) + + # aggregate fairness metrics + def aggregate(rMs, Ks, Z=1): + return (1 / Z) * sum((1. / np.log2(k)) * v for v, k in zip(rMs, Ks)) + + Z = sum((1. 
/ np.log2(k)) for k in Ks)
+        rKL_estim = aggregate(rKL_estim, Ks, Z)
+        rKL_true = aggregate(rKL_true, Ks, Z)
+        rKL_error = np.abs(rKL_true-rKL_estim)
+        results['rKL_error'].append(rKL_error)
+
+        if BINARIZE:
+            rND_estim = aggregate(rND_estim, Ks, Z)
+            rND_true = aggregate(rND_true, Ks, Z)
+
+            if isinstance(method, AbstractM3rND):
+                if method_name.endswith('+'):
+                    conf_matrix_ = method.get_confusion_matrix(*train_col.Xy)
+                else:
+                    conf_matrix_ = conf_matrix.copy()
+                rND_estim = method.fair_measure_correction(rND_estim, conf_matrix_)
+
+            rND_error = np.abs(rND_true - rND_estim)
+            results['rND_error'].append(rND_error)
 
         pbar.set_description(f'{method_name}')
 
     return results
 
 
-def get_idx_score_matrix_per_class(train, score_tr):
-    classes = train.classes_
-    num_classes = len(classes)
-    num_docs = len(train)
-    scores = np.zeros(shape=(num_docs, num_classes), dtype=float)
-    idx = np.full(shape=(num_docs, num_classes), fill_value=-1, dtype=int)
-    X, y = train.Xy
-    for i, class_i in enumerate(classes):
-        class_i_scores = score_tr[y == class_i]
-        rank_i = np.argwhere(y == class_i).flatten()
-        scores[:len(class_i_scores), i] = class_i_scores
-        idx[:len(class_i_scores), i] = rank_i
-    max_score_round_robin = scores.max(axis=1)
-    return idx, max_score_round_robin
-
-def reduce_train_at_score(train, idx, max_score_round_robin, score_te_at_k, min_docs_per_class=5):
-    min_index = np.min(np.argwhere(max_score_round_robin < score_te_at_k))

[... patch text lost in extraction: the remainder of this deleted helper, the tail of the Retrieval/experiments.py diff, and the header of the next file's diff (apparently Retrieval/experiments_modsel.py) ...]

-def reduceAtK(data: LabelledCollection, k):
-    # if k > len(data):
-    #     print(f'[warning] {k=}>{len(data)=}')
-    X, y = data.Xy
-    X = X[:k]
-    y = y[:k]
-    return LabelledCollection(X, y, classes=data.classes_)
-
-
-def run_experiment():
-    results = {
-        'mae': {k: [] for k in Ks},
-        'mrae': {k: [] for k in Ks}
-    }
-
-    pbar = tqdm(experiment_prot(), total=experiment_prot.total())
-    for train, test in pbar:
-        Xtr, ytr, score_tr = train
-        Xte, yte, score_te = test
-
-        if HALF:
-            n = len(ytr) // 2
-            train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier_trained.classes_)
-        else:
-            train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_)
-
-        if method_name not in ['Naive', 'NaiveQuery']:
-            quantifier.fit(train_col, val_split=train_col, fit_classifier=False)
-        elif method_name == 'Naive':
-            quantifier.fit(train_col)
-
-        test_col = LabelledCollection(Xte, yte, classes=classifier_trained.classes_)
-        for k in Ks:
-            test_k = reduceAtK(test_col, k)
-            if method_name == 'NaiveQuery':
-                train_k = reduceAtK(train_col, k)
-                quantifier.fit(train_k)
-
-            estim_prev = quantifier.quantify(test_k.instances)
-
-            mae = qp.error.mae(test_k.prevalence(), estim_prev)
-            mrae = qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1. 
/ (2 * k))) - - results['mae'][k].append(mae) - results['mrae'][k].append(mrae) - - pbar.set_description(f'{method_name}') - - return results - -def benchmark_name(class_name, k): - scape_class_name = class_name.replace('_', '\_') - return f'{scape_class_name}@{k}' +def methods(classifier): + for i, bandwidth in enumerate(np.linspace(0.01, 0.1, 10)): + yield (f'KDE{str(i).zfill(2)}', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=bandwidth)) if __name__ == '__main__': data_home = 'data-modsel' - HALF=True - exp_posfix = '_half_modsel' - Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000] - method_names = [m for m, *_ in methods(None, None)] + method_names = [m for m, *_ in methods(None)] + + class_mode = 'multiclass' dir_names={ 'gender': '100K_GENDER_TREC21_QUERIES/100K-NEW-QUERIES', @@ -104,54 +43,42 @@ if __name__ == '__main__': 'years_category': '100K_YEARS_TREC21_QUERIES/100K-NEW-QUERIES' } - for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']: - tables_mae, tables_mrae = [], [] + for class_name in ['gender', 'continent', 'years_category']: + + tables_mrae = [] benchmarks = [benchmark_name(class_name, k) for k in Ks] for data_size in ['100K']: - table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks, methods=method_names) table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names) - table_mae.format.mean_prec = 5 - table_mae.format.remove_zero = True - table_mae.format.color_mode = 'global' - - tables_mae.append(table_mae) tables_mrae.append(table_mrae) class_home = join(data_home, dir_names[class_name]) - classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier + classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl') test_rankings_path = join(data_home, 'testRanking-TREC21-Queries_Results.json') - results_home = join('results'+exp_posfix, class_name, data_size) + test_query_prevs_path = join('data', 'prevelance_vectors_judged_docs.json') + results_home = join('results', 'modsel', class_name, data_size) - tfidf, classifier_trained = pickle.load(open(classifier_path, 'rb')) + tfidf, classifier, conf_matrix = pickle.load(open(classifier_path, 'rb')) experiment_prot = RetrievedSamples( class_home, test_rankings_path, + test_query_prevs_path, vectorizer=tfidf, class_name=class_name, - classes=classifier_trained.classes_ + classes=classifier.classes_ ) - for method_name, quantifier in methods(classifier_trained, class_name): + for method_name, quantifier in methods(classifier): results_path = join(results_home, method_name + '.pkl') - if os.path.exists(results_path): - print(f'Method {method_name=} already computed') - results = pickle.load(open(results_path, 'rb')) - else: - results = run_experiment() - - os.makedirs(Path(results_path).parent, exist_ok=True) - pickle.dump(results, open(results_path, 'wb'), pickle.HIGHEST_PROTOCOL) + results = qp.util.pickled_resource(results_path, run_experiment) for k in Ks: - table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k]) table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k]) - # Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mae+tables_mrae) - Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae) + Table.LatexPDF(f'./latex/modsel/{class_name}.pdf', tables=tables_mrae) 
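For reference, the per-query fairness aggregation that run_experiment now applies to the rKL and rND series reduces to the following standalone computation. `aggregate` and the normalizer `Z` are copied from the hunk in Retrieval/experiments.py above; the per-k values are made-up placeholders, so this is only a sketch of the log-discounted weighting:

```python
import numpy as np

# cut-off grid used throughout the experiments
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]

def aggregate(rMs, Ks, Z=1):
    # per-cutoff values are discounted by 1/log2(k), so deeper cut-offs weigh less
    return (1 / Z) * sum((1. / np.log2(k)) * v for v, k in zip(rMs, Ks))

# normalizing by the total weight Z maps a constant series back to that constant
Z = sum(1. / np.log2(k) for k in Ks)

rKL_true = [0.05] * len(Ks)                             # placeholder per-k divergences
rKL_estim = [0.04 + 0.002 * i for i in range(len(Ks))]  # placeholder per-k estimates

print(f'{aggregate(rKL_true, Ks, Z):.4f}')  # 0.0500, by construction
rKL_error = np.abs(aggregate(rKL_true, Ks, Z) - aggregate(rKL_estim, Ks, Z))
print(f'{rKL_error=:.4f}')
```

The same Z is reused for rND, so estimated and true series are always compared on the same scale regardless of how many cut-offs are evaluated.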
diff --git a/Retrieval/kdey_bandwith_selection_APP.py b/Retrieval/kdey_bandwith_selection_APP.py deleted file mode 100644 index 658a826..0000000 --- a/Retrieval/kdey_bandwith_selection_APP.py +++ /dev/null @@ -1,77 +0,0 @@ -import itertools -import os.path -import pickle -from collections import defaultdict -from pathlib import Path - -import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import GridSearchCV -from sklearn.svm import LinearSVC - -import quapy as qp -from Retrieval.commons import RetrievedSamples, load_sample -from quapy.protocol import UPP -from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive -from quapy.model_selection import GridSearchQ -from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML -from quapy.data.base import LabelledCollection - -from os.path import join -from tqdm import tqdm - -from result_table.src.table import Table - -""" - -""" - -data_home = 'data' - -datasets = ['continent', 'gender', 'years_category'] #, 'relative_pageviews_category', 'num_sitelinks_category'] - -for class_name in datasets: - - train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier - texts, labels = load_sample(train_data_path, class_name=class_name) - - classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') - tfidf, classifier_trained = pickle.load(open(classifier_path, 'rb')) - classifier_hyper = classifier_trained.get_params() - print(f'{classifier_hyper=}') - - X = tfidf.transform(texts) - print(f'Xtr shape={X.shape}') - - pool = LabelledCollection(X, labels) - train, val = pool.split_stratified(train_prop=0.5, random_state=0) - q = KDEyML(LogisticRegression()) - classifier_hyper = {'classifier__C':[classifier_hyper['C'], 0.00000001], 'classifier__class_weight':[classifier_hyper['class_weight']]} - quantifier_hyper = {'bandwidth': np.linspace(0.01, 0.2, 20)} - hyper = {**classifier_hyper, **quantifier_hyper} - qp.environ['SAMPLE_SIZE'] = 100 - modsel = GridSearchQ( - model=q, - param_grid=hyper, - protocol=UPP(val, sample_size=100), - n_jobs=-1, - error='mrae', - verbose=True - ) - modsel.fit(train) - - print(class_name) - print(f'{modsel.best_params_}') - print(f'{modsel.best_score_}') - - - - - - - - - - - diff --git a/Retrieval/plot_mrae_xaxis_k.py b/Retrieval/plot_mrae_xaxis_k.py new file mode 100644 index 0000000..201ca0f --- /dev/null +++ b/Retrieval/plot_mrae_xaxis_k.py @@ -0,0 +1,125 @@ +import os.path +import pickle +from collections import defaultdict +from pathlib import Path + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.svm import LinearSVC + +import quapy as qp +from Retrieval.commons import RetrievedSamples, load_sample +from Retrieval.experiments import methods, benchmark_name +from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive +from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML +from quapy.data.base import LabelledCollection + +from os.path import join +from tqdm import tqdm + +from result_table.src.table import Table +import matplotlib.pyplot as plt + + + +data_home = 'data' +class_mode = 'multiclass' + +method_names = [name for name, *other in methods(None, 'continent')] + +# Ks = [5, 10, 25, 50, 75, 
100, 250, 500, 750, 1000]
+Ks = [50, 100, 500, 1000]
+DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
+CLASS_NAME = ['gender', 'continent', 'years_category']
+all_results = {}
+
+
+# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
+# class_name -> data_size -> method_name -> k -> stat -> float
+# where stat is "mean", "std", "max"
+def load_all_results():
+    for class_name in CLASS_NAME:
+
+        all_results[class_name] = {}
+
+        for data_size in DATA_SIZE:
+
+            all_results[class_name][data_size] = {}
+
+            results_home = join('results', class_name, class_mode, data_size)
+
+            for method_name in method_names:
+                results_path = join(results_home, method_name + '.pkl')
+                try:
+                    results = pickle.load(open(results_path, 'rb'))
+                except Exception as e:
+                    print(f'missing result {results_path}', e)
+                    continue
+
+                all_results[class_name][data_size][method_name] = {}
+                for k in Ks:
+                    all_results[class_name][data_size][method_name][k] = {}
+                    values = results['mrae']
+                    all_results[class_name][data_size][method_name][k]['mean'] = np.mean(values[k])
+                    all_results[class_name][data_size][method_name][k]['std'] = np.std(values[k])
+                    all_results[class_name][data_size][method_name][k]['max'] = np.max(values[k])
+
+    return all_results
+
+
+results = load_all_results()
+
+# generates one plot per (class_name, data_size) pair, with y-axis=MRAE, in which:
+# - the x-axis displays the Ks
+
+for class_name in CLASS_NAME:
+    for data_size in DATA_SIZE:
+
+        fig, ax = plt.subplots()
+
+        max_means = []
+        for method_name in method_names:
+            # class_name -> data_size -> method_name -> k -> stat -> float
+            means = [
+                results[class_name][data_size][method_name][k]['mean'] for k in Ks
+            ]
+            stds = [
+                results[class_name][data_size][method_name][k]['std'] for k in Ks
+            ]
+            # max_mean = np.max([
+            #     results[class_name][data_size][method_name][k]['max'] for k in Ks
+            # ])
+            max_means.append(max(means))
+
+            means = np.asarray(means)
+            stds = np.asarray(stds)
+
+            line = ax.plot(Ks, means, 'o-', label=method_name, color=None)
+            color = line[-1].get_color()
+            # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)
+
+        ax.set_xlabel('k')
+        ax.set_ylabel('RAE')
+        ax.set_title(f'{class_name} from {data_size}')
+        ax.set_ylim([0, max(max_means)*1.05])
+
+        ax.legend()
+
+        os.makedirs(f'plots/var_k/{class_name}', exist_ok=True)
+        plotpath = f'plots/var_k/{class_name}/{data_size}_mrae.pdf'
+        print(f'saving plot in {plotpath}')
+        plt.savefig(plotpath, bbox_inches='tight')
+
+
+
+
+
+
+
+
+
+
diff --git a/Retrieval/plot_mrae_xaxis_size.py b/Retrieval/plot_mrae_xaxis_size.py
new file mode 100644
index 0000000..2b4403c
--- /dev/null
+++ b/Retrieval/plot_mrae_xaxis_size.py
@@ -0,0 +1,91 @@
+import os.path
+import pickle
+from collections import defaultdict
+from pathlib import Path
+
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.svm import LinearSVC
+
+import quapy as qp
+from Retrieval.commons import RetrievedSamples, load_sample
+from Retrieval.experiments import methods, benchmark_name
+from Retrieval.plot_mrae_xaxis_k import load_all_results
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
+from quapy.data.base import LabelledCollection
+
+from os.path import join
+from tqdm import 
tqdm
+
+from result_table.src.table import Table
+import matplotlib.pyplot as plt
+
+
+
+data_home = 'data'
+class_mode = 'multiclass'
+
+method_names = [name for name, *other in methods(None, 'continent')]
+
+Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
+DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
+CLASS_NAME = ['gender', 'continent', 'years_category']
+all_results = {}
+
+
+# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
+# class_name -> data_size -> method_name -> k -> stat -> float
+results = load_all_results()
+
+# generates one plot per (class_name, k) pair, with y-axis=MRAE, in which:
+# - the x-axis displays the training pool sizes
+
+for class_name in CLASS_NAME:
+    for k in Ks:
+
+        fig, ax = plt.subplots()
+
+        max_means = []
+        for method_name in method_names:
+            # class_name -> data_size -> method_name -> k -> stat -> float
+            means = [
+                results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZE
+            ]
+            stds = [
+                results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZE
+            ]
+            # max_mean = np.max([
+            #     results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
+            # ])
+            max_means.append(max(means))
+
+            style = 'o-' if method_name != 'CC' else '--'
+            line = ax.plot(DATA_SIZE, means, style, label=method_name, color=None)
+            color = line[-1].get_color()
+            # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)
+
+        ax.set_xlabel('training pool size')
+        ax.set_ylabel('RAE')
+        ax.set_title(f'{class_name} from {k=}')
+        ax.set_ylim([0, max(max_means)*1.05])
+
+        ax.legend()
+
+        os.makedirs(f'plots/var_size/{class_name}', exist_ok=True)
+        plotpath = f'plots/var_size/{class_name}/{k}_mrae.pdf'
+        print(f'saving plot in {plotpath}')
+        plt.savefig(plotpath, bbox_inches='tight')
+
+
+
+
+
+
+
+
+
+
diff --git a/Retrieval/plot_results.py b/Retrieval/plot_results.py
deleted file mode 100644
index 26285dd..0000000
--- a/Retrieval/plot_results.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import os.path
-import pickle
-from collections import defaultdict
-from pathlib import Path
-
-import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV
-from sklearn.svm import LinearSVC
-
-import quapy as qp
-from Retrieval.commons import RetrievedSamples, load_sample
-from Retrieval.experiments import methods
-from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
-from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
-from quapy.data.base import LabelledCollection
-
-from os.path import join
-from tqdm import tqdm
-
-from result_table.src.table import Table
-import matplotlib.pyplot as plt
-
-
-def benchmark_name(class_name, k):
-    scape_class_name = class_name.replace('_', '\_')
-    return f'{scape_class_name}@{k}'
-
-
-data_home = 'data'
-
-HALF=True
-exp_posfix = '_half'
-
-method_names = [name for name, *other in methods(None, 'continent')]
-
-Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
-
-for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']:
-
-    benchmarks = [benchmark_name(class_name, k) for k in Ks]
-
-    for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
-
-        fig, ax = plt.subplots()
-
-        class_home = join(data_home, class_name, data_size)
-        test_rankings_path = join(data_home, 'testRanking_Results.json')
-        
results_home = join('results'+exp_posfix, class_name, data_size) - - max_mean = None - for method_name in method_names: - - results_path = join(results_home, method_name + '.pkl') - try: - results = pickle.load(open(results_path, 'rb')) - except Exception as e: - print(f'missing result {results}', e) - - for err in ['mrae']: - means, stds = [], [] - for k in Ks: - values = results[err][k] - means.append(np.mean(values)) - stds.append(np.std(values)) - - means = np.asarray(means) - stds = np.asarray(stds) #/ np.sqrt(len(stds)) - - if max_mean is None: - max_mean = np.max(means) - else: - max_mean = max(max_mean, np.max(means)) - - line = ax.plot(Ks, means, 'o-', label=method_name, color=None) - color = line[-1].get_color() - # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color) - - ax.set_xlabel('k') - ax.set_ylabel(err.upper()) - ax.set_title(f'{class_name} from {data_size}') - ax.set_ylim([0, max_mean]) - - ax.legend() - - # plt.show() - os.makedirs(f'plots/results/{class_name}', exist_ok=True) - plotpath = f'plots/results/{class_name}/{data_size}_{err}.pdf' - print(f'saving plot in {plotpath}') - plt.savefig(plotpath) - - - - - - - - - - - diff --git a/quapy/error.py b/quapy/error.py index f2f5bd0..3e21333 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -158,8 +158,8 @@ def kld(prevs, prevs_hat, eps=None): :return: Kullback-Leibler divergence between the two distributions """ eps = __check_eps(eps) - smooth_prevs = prevs + eps - smooth_prevs_hat = prevs_hat + eps + smooth_prevs = smooth(prevs, eps) + smooth_prevs_hat = smooth(prevs_hat, eps) return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1)
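The quapy/error.py fix at the end is more than cosmetic: adding `eps` without renormalizing yields vectors that no longer sum to one, which biases the divergence. A minimal sketch of the difference, assuming `smooth` is the additive-smoothing-plus-renormalization helper already defined in quapy/error.py:

```python
import numpy as np

def smooth(prevs, eps):
    # assumed to match quapy.error.smooth: additive smoothing + renormalization
    n_classes = prevs.shape[-1]
    return (prevs + eps) / (eps * n_classes + 1)

p = np.asarray([0.9, 0.1, 0.0])
eps = 1. / (2 * 100)  # the eps=1/(2k) convention used in the experiments, at k=100

print((p + eps).sum())       # 1.015 -> the old code compared unnormalized vectors
print(smooth(p, eps).sum())  # 1.0   -> smoothed prevalences remain a distribution
```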