From 36c53639d75ade70f471ed02942d64190833789a Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Tue, 23 Apr 2024 09:53:31 +0200 Subject: [PATCH] model selection for kde in a past TREC dataset --- Retrieval/commons.py | 17 +- Retrieval/experiments.py | 110 ++++++------ Retrieval/kdey_bandwidth_selection_queries.py | 161 ++++++++++++++++++ ...tion.py => kdey_bandwith_selection_APP.py} | 0 Retrieval/plot_results.py | 102 +++++++++++ quapy/method/_kdey.py | 2 +- 6 files changed, 331 insertions(+), 61 deletions(-) create mode 100644 Retrieval/kdey_bandwidth_selection_queries.py rename Retrieval/{kdey_bandwith_selection.py => kdey_bandwith_selection_APP.py} (100%) create mode 100644 Retrieval/plot_results.py diff --git a/Retrieval/commons.py b/Retrieval/commons.py index ae66ed7..26a34b4 100644 --- a/Retrieval/commons.py +++ b/Retrieval/commons.py @@ -64,16 +64,21 @@ class RetrievedSamples: for file in self._list_queries(): + # print(file) + # loads the training sample train_df = pd.read_json(file) - Xtr, ytr, score_tr = get_text_label_score(train_df, class_name, vectorizer, filter_classes=self.classes) + if len(train_df) == 0: + print('empty dataframe: ', file) + else: + Xtr, ytr, score_tr = get_text_label_score(train_df, class_name, vectorizer, filter_classes=self.classes) - # loads the test sample - query_id = self._get_query_id_from_path(file) - sel_df = tests_df[tests_df.qid == int(query_id)] - Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes) + # loads the test sample + query_id = self._get_query_id_from_path(file) + sel_df = tests_df[tests_df.qid == int(query_id)] + Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes) - yield (Xtr, ytr, score_tr), (Xte, yte, score_te) + yield (Xtr, ytr, score_tr), (Xte, yte, score_te) def _list_queries(self): return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json'))) diff --git a/Retrieval/experiments.py b/Retrieval/experiments.py index c1450be..2630b13 100644 --- a/Retrieval/experiments.py +++ b/Retrieval/experiments.py @@ -51,9 +51,9 @@ To evaluate our approach, I have executed the queries on the test split. You can def methods(classifier, class_name): kde_param = { - 'continent': 0.18, - 'gender': 0.12, - 'years_category':0.09 + 'continent': 0.01, + 'gender': 0.005, + 'years_category':0.03 } yield ('Naive', Naive()) @@ -76,13 +76,14 @@ def methods(classifier, class_name): # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03)) # yield ('KDE-silver', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='silverman')) # yield ('KDE-scott', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='scott')) - yield ('KDE-opt', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name])) + yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name])) + # yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005)) yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) - yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02)) - yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03)) - yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04)) - yield ('KDE05', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.05)) - yield ('KDE07', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.07)) + # yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02)) + # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03)) + # yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04)) + # yield ('KDE05', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.05)) + # yield ('KDE07', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.07)) # yield ('KDE10', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.10)) @@ -176,63 +177,64 @@ def run_experiment(): return results -data_home = 'data' - -HALF=True -exp_posfix = '_half' - -method_names = [name for name, *other in methods(None, 'continent')] - Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000] -for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']: - tables_mae, tables_mrae = [], [] +if __name__ == '__main__': + data_home = 'data' - benchmarks = [benchmark_name(class_name, k) for k in Ks] + HALF=True + exp_posfix = '_half' - for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']: + method_names = [name for name, *other in methods(None, 'continent')] - table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks, methods=method_names) - table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names) - table_mae.format.mean_prec = 5 - table_mae.format.remove_zero = True - table_mae.format.color_mode = 'global' + for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']: + tables_mae, tables_mrae = [], [] - tables_mae.append(table_mae) - tables_mrae.append(table_mrae) + benchmarks = [benchmark_name(class_name, k) for k in Ks] - class_home = join(data_home, class_name, data_size) - # train_data_path = join(class_home, 'classifier_training.json') - # classifier_path = join('classifiers', data_size, f'classifier_{class_name}.pkl') - train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier - classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier - test_rankings_path = join(data_home, 'testRanking_Results.json') - results_home = join('results'+exp_posfix, class_name, data_size) + for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']: - tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path) + table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks, methods=method_names) + table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names) + table_mae.format.mean_prec = 5 + table_mae.format.remove_zero = True + table_mae.format.color_mode = 'global' - experiment_prot = RetrievedSamples( - class_home, - test_rankings_path, - vectorizer=tfidf, - class_name=class_name, - classes=classifier_trained.classes_ - ) - for method_name, quantifier in methods(classifier_trained, class_name): + tables_mae.append(table_mae) + tables_mrae.append(table_mrae) - results_path = join(results_home, method_name + '.pkl') - if os.path.exists(results_path): - print(f'Method {method_name=} already computed') - results = pickle.load(open(results_path, 'rb')) - else: - results = run_experiment() + class_home = join(data_home, class_name, data_size) + # train_data_path = join(class_home, 'classifier_training.json') + # classifier_path = join('classifiers', data_size, f'classifier_{class_name}.pkl') + train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier + classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier + test_rankings_path = join(data_home, 'testRanking_Results.json') + results_home = join('results'+exp_posfix, class_name, data_size) - os.makedirs(Path(results_path).parent, exist_ok=True) - pickle.dump(results, open(results_path, 'wb'), pickle.HIGHEST_PROTOCOL) + tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path) - for k in Ks: - table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k]) - table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k]) + experiment_prot = RetrievedSamples( + class_home, + test_rankings_path, + vectorizer=tfidf, + class_name=class_name, + classes=classifier_trained.classes_ + ) + for method_name, quantifier in methods(classifier_trained, class_name): + + results_path = join(results_home, method_name + '.pkl') + if os.path.exists(results_path): + print(f'Method {method_name=} already computed') + results = pickle.load(open(results_path, 'rb')) + else: + results = run_experiment() + + os.makedirs(Path(results_path).parent, exist_ok=True) + pickle.dump(results, open(results_path, 'wb'), pickle.HIGHEST_PROTOCOL) + + for k in Ks: + table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k]) + table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k]) # Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mae+tables_mrae) Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae) diff --git a/Retrieval/kdey_bandwidth_selection_queries.py b/Retrieval/kdey_bandwidth_selection_queries.py new file mode 100644 index 0000000..93d3bdb --- /dev/null +++ b/Retrieval/kdey_bandwidth_selection_queries.py @@ -0,0 +1,161 @@ +import os.path +import pickle +from collections import defaultdict +from pathlib import Path + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.svm import LinearSVC + +import quapy as qp +from Retrieval.commons import RetrievedSamples, load_sample +from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive +from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML +from quapy.data.base import LabelledCollection + +from os.path import join +from tqdm import tqdm + +from result_table.src.table import Table + + + +def methods(classifier, class_name): + yield ('KDE001', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.001)) + yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005)) + yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) + yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02)) + yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03)) + yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04)) + yield ('KDE05', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.05)) + yield ('KDE07', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.07)) + yield ('KDE10', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.10)) + +def reduceAtK(data: LabelledCollection, k): + # if k > len(data): + # print(f'[warning] {k=}>{len(data)=}') + X, y = data.Xy + X = X[:k] + y = y[:k] + return LabelledCollection(X, y, classes=data.classes_) + + +def run_experiment(): + results = { + 'mae': {k: [] for k in Ks}, + 'mrae': {k: [] for k in Ks} + } + + pbar = tqdm(experiment_prot(), total=experiment_prot.total()) + for train, test in pbar: + Xtr, ytr, score_tr = train + Xte, yte, score_te = test + + if HALF: + n = len(ytr) // 2 + train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier_trained.classes_) + else: + train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_) + + if method_name not in ['Naive', 'NaiveQuery']: + quantifier.fit(train_col, val_split=train_col, fit_classifier=False) + elif method_name == 'Naive': + quantifier.fit(train_col) + + test_col = LabelledCollection(Xte, yte, classes=classifier_trained.classes_) + for k in Ks: + test_k = reduceAtK(test_col, k) + if method_name == 'NaiveQuery': + train_k = reduceAtK(train_col, k) + quantifier.fit(train_k) + + estim_prev = quantifier.quantify(test_k.instances) + + mae = qp.error.mae(test_k.prevalence(), estim_prev) + mrae = qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1. / (2 * k))) + + results['mae'][k].append(mae) + results['mrae'][k].append(mrae) + + pbar.set_description(f'{method_name}') + + return results + +def benchmark_name(class_name, k): + scape_class_name = class_name.replace('_', '\_') + return f'{scape_class_name}@{k}' + + +if __name__ == '__main__': + data_home = 'data-modsel' + + HALF=True + exp_posfix = '_half_modsel' + + Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000] + + method_names = [m for m, *_ in methods(None, None)] + + dir_names={ + 'gender': '100K_GENDER_TREC21_QUERIES/100K-NEW-QUERIES', + 'continent': '100K_CONT_TREC21_QUERIES/100K-NEW-QUERIES', + 'years_category': '100K_YEARS_TREC21_QUERIES/100K-NEW-QUERIES' + } + + for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']: + tables_mae, tables_mrae = [], [] + + benchmarks = [benchmark_name(class_name, k) for k in Ks] + + for data_size in ['100K']: + + table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks, methods=method_names) + table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names) + table_mae.format.mean_prec = 5 + table_mae.format.remove_zero = True + table_mae.format.color_mode = 'global' + + tables_mae.append(table_mae) + tables_mrae.append(table_mrae) + + class_home = join(data_home, dir_names[class_name]) + classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier + test_rankings_path = join(data_home, 'testRanking-TREC21-Queries_Results.json') + results_home = join('results'+exp_posfix, class_name, data_size) + + tfidf, classifier_trained = pickle.load(open(classifier_path, 'rb')) + + experiment_prot = RetrievedSamples( + class_home, + test_rankings_path, + vectorizer=tfidf, + class_name=class_name, + classes=classifier_trained.classes_ + ) + for method_name, quantifier in methods(classifier_trained, class_name): + + results_path = join(results_home, method_name + '.pkl') + if os.path.exists(results_path): + print(f'Method {method_name=} already computed') + results = pickle.load(open(results_path, 'rb')) + else: + results = run_experiment() + + os.makedirs(Path(results_path).parent, exist_ok=True) + pickle.dump(results, open(results_path, 'wb'), pickle.HIGHEST_PROTOCOL) + + for k in Ks: + table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k]) + table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k]) + + # Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mae+tables_mrae) + Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae) + + + + + + + diff --git a/Retrieval/kdey_bandwith_selection.py b/Retrieval/kdey_bandwith_selection_APP.py similarity index 100% rename from Retrieval/kdey_bandwith_selection.py rename to Retrieval/kdey_bandwith_selection_APP.py diff --git a/Retrieval/plot_results.py b/Retrieval/plot_results.py new file mode 100644 index 0000000..26285dd --- /dev/null +++ b/Retrieval/plot_results.py @@ -0,0 +1,102 @@ +import os.path +import pickle +from collections import defaultdict +from pathlib import Path + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.svm import LinearSVC + +import quapy as qp +from Retrieval.commons import RetrievedSamples, load_sample +from Retrieval.experiments import methods +from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive +from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML +from quapy.data.base import LabelledCollection + +from os.path import join +from tqdm import tqdm + +from result_table.src.table import Table +import matplotlib.pyplot as plt + + +def benchmark_name(class_name, k): + scape_class_name = class_name.replace('_', '\_') + return f'{scape_class_name}@{k}' + + +data_home = 'data' + +HALF=True +exp_posfix = '_half' + +method_names = [name for name, *other in methods(None, 'continent')] + +Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000] + +for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']: + + benchmarks = [benchmark_name(class_name, k) for k in Ks] + + for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']: + + fig, ax = plt.subplots() + + class_home = join(data_home, class_name, data_size) + test_rankings_path = join(data_home, 'testRanking_Results.json') + results_home = join('results'+exp_posfix, class_name, data_size) + + max_mean = None + for method_name in method_names: + + results_path = join(results_home, method_name + '.pkl') + try: + results = pickle.load(open(results_path, 'rb')) + except Exception as e: + print(f'missing result {results}', e) + + for err in ['mrae']: + means, stds = [], [] + for k in Ks: + values = results[err][k] + means.append(np.mean(values)) + stds.append(np.std(values)) + + means = np.asarray(means) + stds = np.asarray(stds) #/ np.sqrt(len(stds)) + + if max_mean is None: + max_mean = np.max(means) + else: + max_mean = max(max_mean, np.max(means)) + + line = ax.plot(Ks, means, 'o-', label=method_name, color=None) + color = line[-1].get_color() + # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color) + + ax.set_xlabel('k') + ax.set_ylabel(err.upper()) + ax.set_title(f'{class_name} from {data_size}') + ax.set_ylim([0, max_mean]) + + ax.legend() + + # plt.show() + os.makedirs(f'plots/results/{class_name}', exist_ok=True) + plotpath = f'plots/results/{class_name}/{data_size}_{err}.pdf' + print(f'saving plot in {plotpath}') + plt.savefig(plotpath) + + + + + + + + + + + diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index 3504b22..c531f64 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -67,7 +67,7 @@ class KDEBase: selX = X[y==cat] if selX.size==0: selX = [F.uniform_prevalence(len(classes))] - class_cond_X.append(selX) + class_cond_X.append(np.asarray(selX)) return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]