From b9fed349f00d0c88feb66682790225b956693228 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Thu, 29 Feb 2024 01:25:27 +0100 Subject: [PATCH] added quacc and a method to allow quantification training and predict with empty classes --- ClassifierAccuracy/commons.py | 281 ++++++++++++++++-- .../accuracy_prediction_via_quantification.py | 0 .../{main2.py => experiments.py} | 68 ++--- ClassifierAccuracy/gen_tables.py | 2 +- ClassifierAccuracy/models_multiclass.py | 201 ++++++++++--- ClassifierAccuracy/utils.py | 164 ---------- quapy/data/base.py | 56 +++- 7 files changed, 488 insertions(+), 284 deletions(-) rename ClassifierAccuracy/{ => deprecated}/accuracy_prediction_via_quantification.py (100%) rename ClassifierAccuracy/{main2.py => experiments.py} (58%) delete mode 100644 ClassifierAccuracy/utils.py diff --git a/ClassifierAccuracy/commons.py b/ClassifierAccuracy/commons.py index c054a4a..983d827 100644 --- a/ClassifierAccuracy/commons.py +++ b/ClassifierAccuracy/commons.py @@ -1,28 +1,21 @@ +import itertools +import json +import os from collections import defaultdict - -from sklearn.base import BaseEstimator -from sklearn.linear_model import LogisticRegression -import numpy as np +from glob import glob +from os import makedirs +from os.path import join +from pathlib import Path from time import time -from sklearn.metrics import confusion_matrix -from sklearn.naive_bayes import GaussianNB -from sklearn.svm import SVC, LinearSVC -from method.aggregative import PACC, EMQ, ACC -from utils import * +import matplotlib.pyplot as plt +from sklearn.datasets import fetch_rcv1 -import quapy.data.datasets -import quapy as qp +from quapy.method.aggregative import EMQ, ACC from models_multiclass import * from quapy.data import LabelledCollection -from quapy.protocol import UPP from quapy.data.datasets import fetch_UCIMulticlassLabelledCollection, UCI_MULTICLASS_DATASETS - - -def split(data: LabelledCollection): - train_val, test = data.split_stratified(train_prop=0.66, random_state=0) - train, val = train_val.split_stratified(train_prop=0.5, random_state=0) - return train, val, test +from quapy.data.datasets import fetch_reviews def gen_classifiers(): @@ -32,30 +25,103 @@ def gen_classifiers(): #yield 'SVM(linear)', LinearSVC() -def gen_datasets()-> [str,[LabelledCollection,LabelledCollection,LabelledCollection]]: +def gen_multi_datasets(only_names=False)-> [str,[LabelledCollection,LabelledCollection,LabelledCollection]]: for dataset_name in UCI_MULTICLASS_DATASETS: - dataset = fetch_UCIMulticlassLabelledCollection(dataset_name) - yield dataset_name, split(dataset) + if only_names: + yield dataset_name, None + else: + dataset = fetch_UCIMulticlassLabelledCollection(dataset_name) + yield dataset_name, split(dataset) + + +def gen_bin_datasets(only_names=False) -> [str,[LabelledCollection,LabelledCollection,LabelledCollection]]: + if only_names: + for dataset_name in ['imdb', 'CCAT', 'GCAT', 'MCAT']: + yield dataset_name, None + else: + train, U = fetch_reviews('imdb', tfidf=True, min_df=10, pickle=True).train_test + L, V = train.split_stratified(0.5, random_state=0) + yield 'imdb', (L, V, U) + + training = fetch_rcv1(subset='train') + test = fetch_rcv1(subset='test') + class_names = training.target_names.tolist() + for cat in ['CCAT', 'GCAT', 'MCAT']: + class_idx = class_names.index(cat) + tr_labels = training.target[:,class_idx].toarray().flatten() + te_labels = test.target[:,class_idx].toarray().flatten() + tr = LabelledCollection(training.data, tr_labels) + U = LabelledCollection(test.data, te_labels) 
+ L, V = tr.split_stratified(train_prop=0.5, random_state=0) + yield cat, (L, V, U) def gen_CAP(h, acc_fn)->[str, ClassifierAccuracyPrediction]: - yield 'SebCAP', SebastianiCAP(h, acc_fn, ACC) - yield 'SebCAPweight', SebastianiCAP(h, acc_fn, ACC, alpha=0) - yield 'PabCAP', PabloCAP(h, acc_fn, ACC) + #yield 'SebCAP', SebastianiCAP(h, acc_fn, ACC) + yield 'SebCAP-SLD', SebastianiCAP(h, acc_fn, EMQ) + #yield 'SebCAPweight', SebastianiCAP(h, acc_fn, ACC, alpha=0) + #yield 'PabCAP', PabloCAP(h, acc_fn, ACC) yield 'PabCAP-SLD-median', PabloCAP(h, acc_fn, EMQ, aggr='median') + def gen_CAP_cont_table(h)->[str,CAPContingencyTable]: acc_fn = None - # yield 'Naive', NaiveCAP(h, acc_fn) + yield 'Naive', NaiveCAP(h, acc_fn) yield 'CT-PPS-EMQ', ContTableTransferCAP(h, acc_fn, EMQ(LogisticRegression())) - #yield 'CT-PPSh-ACC', ContTableWithHTransferCAP(h, acc_fn, ACC) - yield 'Equations-ACCh', NsquaredEquationsCAP(h, acc_fn, ACC, reuse_h=True) + yield 'QuAcc(EMQ)nxn', QuAccNxN(h, acc_fn, EMQ(LogisticRegression())) + #yield 'QuAcc(EMQ)1xn2', QuAcc1xN2(h, acc_fn, EMQ(LogisticRegression())) + yield 'QuAcc(EMQ)1xn2', QuAcc1xN2(h, acc_fn, EMQ(LogisticRegression())) + #yield 'CT-PPSh-EMQ', ContTableTransferCAP(h, acc_fn, EMQ(LogisticRegression()), reuse_h=True) + #yield 'Equations-ACCh', NsquaredEquationsCAP(h, acc_fn, ACC, reuse_h=True) # yield 'Equations-ACC', NsquaredEquationsCAP(h, acc_fn, ACC) - yield 'Equations-SLD', NsquaredEquationsCAP(h, acc_fn, EMQ) + #yield 'Equations-SLD', NsquaredEquationsCAP(h, acc_fn, EMQ) + + +def get_method_names(): + mock_h = LogisticRegression() + return [m for m, _ in gen_CAP(mock_h, None)] + [m for m, _ in gen_CAP_cont_table(mock_h)] + def gen_acc_measure(): yield 'vanilla_accuracy', vanilla_acc_fn - yield 'macro-F1', macrof1 + #yield 'macro-F1', macrof1 + + +def split(data: LabelledCollection): + train_val, test = data.split_stratified(train_prop=0.66, random_state=0) + train, val = train_val.split_stratified(train_prop=0.5, random_state=0) + return train, val, test + + +def fit_method(method, V): + tinit = time() + method.fit(V) + t_train = time() - tinit + return method, t_train + + +def predictionsCAP(method, test_prot): + tinit = time() + estim_accs = [method.predict(Ui.X) for Ui in test_prot()] + t_test_ave = (time() - tinit) / test_prot.total() + return estim_accs, t_test_ave + + +def predictionsCAPcont_table(method, test_prot, gen_acc_measure): + estim_accs_dict = {} + tinit = time() + estim_tables = [method.predict_ct(Ui.X) for Ui in test_prot()] + for acc_name, acc_fn in gen_acc_measure(): + estim_accs_dict[acc_name] = [acc_fn(cont_table) for cont_table in estim_tables] + t_test_ave = (time() - tinit) / test_prot.total() + return estim_accs_dict, t_test_ave + + +def any_missing(basedir, cls_name, dataset_name, method_name): + for acc_name, _ in gen_acc_measure(): + if not os.path.exists(getpath(basedir, cls_name, acc_name, dataset_name, method_name)): + return True + return False def true_acc(h:BaseEstimator, acc_fn: callable, U: LabelledCollection): @@ -115,4 +181,159 @@ def cap_errors(true_acc, estim_acc): true_acc = np.asarray(true_acc) estim_acc = np.asarray(estim_acc) #return (true_acc - estim_acc)**2 - return np.abs(true_acc - estim_acc) \ No newline at end of file + return np.abs(true_acc - estim_acc) + + +def plot_diagonal(cls_name, measure_name, results, base_dir='plots'): + + makedirs(base_dir, exist_ok=True) + makedirs(join(base_dir, measure_name), exist_ok=True) + + # Create scatter plot + plt.figure(figsize=(10, 10)) + plt.xlim(0, 1) + plt.ylim(0, 1) 
+    plt.plot([0, 1], [0, 1], color='black', linestyle='--')
+
+    for method_name in results.keys():
+        xs = results[method_name]['true_acc']
+        ys = results[method_name]['estim_acc']
+        err = cap_errors(xs, ys).mean()
+        #pear_cor, _ = 0, 0 #pearsonr(xs, ys)
+        plt.scatter(xs, ys, label=f'{method_name} {err:.3f}', alpha=0.6)
+
+    plt.legend()
+
+    # Add labels and title
+    plt.xlabel(f'True {measure_name}')
+    plt.ylabel(f'Estimated {measure_name}')
+
+    # Display the plot
+    # plt.show()
+    plt.savefig(join(base_dir, measure_name, 'diagonal_'+cls_name+'.png'))
+
+
+def getpath(basedir, cls_name, acc_name, dataset_name, method_name):
+    return f"results/{basedir}/{cls_name}/{acc_name}/{dataset_name}/{method_name}.json"
+
+
+def open_results(basedir, cls_name, acc_name, dataset_name='*', method_name='*'):
+    results = defaultdict(lambda : {'true_acc':[], 'estim_acc':[]})
+    if isinstance(method_name, str):
+        method_name = [method_name]
+    if isinstance(dataset_name, str):
+        dataset_name = [dataset_name]
+    for dataset_, method_ in itertools.product(dataset_name, method_name):
+        path = getpath(basedir, cls_name, acc_name, dataset_, method_)
+        for file in glob(path):
+            #print(file)
+            method = Path(file).name.replace('.json','')
+            result = json.load(open(file, 'r'))
+            results[method]['true_acc'].extend(result['true_acc'])
+            results[method]['estim_acc'].extend(result['estim_acc'])
+    return results
+
+
+def save_json_file(path, data):
+    os.makedirs(Path(path).parent, exist_ok=True)
+    with open(path, 'w') as f:
+        json.dump(data, f)
+
+
+def save_json_result(path, true_accs, estim_accs, t_train, t_test):
+    result = {
+        't_train': t_train,
+        't_test_ave': t_test,
+        'true_acc': true_accs,
+        'estim_acc': estim_accs
+    }
+    save_json_file(path, result)
+
+
+def get_dataset_stats(path, test_prot, L, V):
+    test_prevs = [Ui.prevalence() for Ui in test_prot()]
+    shifts = [qp.error.ae(L.prevalence(), Ui_prev) for Ui_prev in test_prevs]
+    info = {
+        'n_classes': L.n_classes,
+        'n_train': len(L),
+        'n_val': len(V),
+        'train_prev': L.prevalence().tolist(),
+        'val_prev': V.prevalence().tolist(),
+        'test_prevs': [x.tolist() for x in test_prevs],
+        'shifts': [x.tolist() for x in shifts],
+        'sample_size': test_prot.sample_size,
+        'num_samples': test_prot.total()
+    }
+    save_json_file(path, info)
+
+
+def gen_tables(basedir, datasets):
+    from tabular import Table
+
+    mock_h = LogisticRegression()
+    methods = [method for method, _ in gen_CAP(mock_h, None)] + [method for method, _ in gen_CAP_cont_table(mock_h)]
+    classifiers = [classifier for classifier, _ in gen_classifiers()]
+    measures = [measure for measure, _ in gen_acc_measure()]
+
+    os.makedirs('tables', exist_ok=True)
+
+    tex_doc = """
+    \\documentclass[10pt,a4paper]{article}
+    \\usepackage[utf8]{inputenc}
+    \\usepackage{amsmath}
+    \\usepackage{amsfonts}
+    \\usepackage{amssymb}
+    \\usepackage{graphicx}
+    \\usepackage{tabularx}
+    \\usepackage{color}
+    \\usepackage{colortbl}
+    \\usepackage{xcolor}
+    \\begin{document}
+    """
+
+    classifier = classifiers[0]
+    metric = "vanilla_accuracy"
+
+    table = Table(datasets, methods)
+    for method, dataset in itertools.product(methods, datasets):
+        path = getpath(basedir, classifier, metric, dataset, method)
+        if not os.path.exists(path):
+            print('missing ', path)
+            continue
+        results = json.load(open(path, 'r'))
+        true_acc = results['true_acc']
+        estim_acc = np.asarray(results['estim_acc'])
+        if any(np.isnan(estim_acc)):
+            print(f'nan values found in {method=} {dataset=}')
+            continue
+        if any(estim_acc>1.00001):
+            print(f'values >1 found in {method=} {dataset=} [max={estim_acc.max()}]')
+            continue
+        if any(estim_acc<-0.00001):
+            print(f'values <0 found in {method=} {dataset=} [min={estim_acc.min()}]')
+            continue
+        errors = cap_errors(true_acc, estim_acc)
+        table.add(dataset, method, errors)
+
+    tex = table.latexTabular()
+    table_name = f'{basedir}_{classifier}_{metric}.tex'
+    with open(f'./tables/{table_name}', 'wt') as foo:
+        foo.write('\\resizebox{\\textwidth}{!}{%\n')
+        foo.write('\\begin{tabular}{c|'+('c'*len(methods))+'}\n')
+        foo.write(tex)
+        foo.write('\\end{tabular}%\n')
+        foo.write('}\n')
+
+    tex_doc += "\input{" + table_name + "}\n"
+
+    tex_doc += """
+    \\end{document}
+    """
+    with open(f'./tables/main.tex', 'wt') as foo:
+        foo.write(tex_doc)
+
+    print("[Tables Done] running latex")
+    os.chdir('./tables/')
+    os.system('pdflatex main.tex')
+    os.system('rm main.aux main.log')
+
diff --git a/ClassifierAccuracy/accuracy_prediction_via_quantification.py b/ClassifierAccuracy/deprecated/accuracy_prediction_via_quantification.py
similarity index 100%
rename from ClassifierAccuracy/accuracy_prediction_via_quantification.py
rename to ClassifierAccuracy/deprecated/accuracy_prediction_via_quantification.py
diff --git a/ClassifierAccuracy/main2.py b/ClassifierAccuracy/experiments.py
similarity index 58%
rename from ClassifierAccuracy/main2.py
rename to ClassifierAccuracy/experiments.py
index a3b8bbf..73ad1b1 100644
--- a/ClassifierAccuracy/main2.py
+++ b/ClassifierAccuracy/experiments.py
@@ -1,46 +1,16 @@
-import itertools
-import os.path
-from collections import defaultdict
-from time import time
-from utils import *
-from models_multiclass import *
-from quapy.protocol import UPP
 from commons import *
 
+PROBLEM = 'multiclass'
+basedir = PROBLEM
 
-def fit_method(method, V):
-    tinit = time()
-    method.fit(V)
-    t_train = time() - tinit
-    return method, t_train
-
-
-def predictionsCAP(method, test_prot):
-    tinit = time()
-    estim_accs = [method.predict(Ui.X) for Ui in test_prot()]
-    t_test_ave = (time() - tinit) / test_prot.total()
-    return estim_accs, t_test_ave
-
-
-def predictionsCAPcont_table(method, test_prot, gen_acc_measure):
-    estim_accs_dict = {}
-    tinit = time()
-    estim_tables = [method.predict_ct(Ui.X) for Ui in test_prot()]
-    for acc_name, acc_fn in gen_acc_measure():
-        estim_accs_dict[acc_name] = [acc_fn(cont_table) for cont_table in estim_tables]
-    t_test_ave = (time() - tinit) / test_prot.total()
-    return estim_accs_dict, t_test_ave
-
-
-def any_missing(cls_name, dataset_name, method_name):
-    for acc_name, _ in gen_acc_measure():
-        if not os.path.exists(getpath(cls_name, acc_name, dataset_name, method_name)):
-            return True
-    return False
-
-
-qp.environ['SAMPLE_SIZE'] = 250
-NUM_TEST = 100
+if PROBLEM == 'binary':
+    qp.environ['SAMPLE_SIZE'] = 1000
+    NUM_TEST = 1000
+    gen_datasets = gen_bin_datasets
+elif PROBLEM == 'multiclass':
+    qp.environ['SAMPLE_SIZE'] = 250
+    NUM_TEST = 100
+    gen_datasets = gen_multi_datasets
 
 
 for (cls_name, h), (dataset_name, (L, V, U)) in itertools.product(gen_classifiers(), gen_datasets()):
@@ -62,7 +32,7 @@ for (cls_name, h), (dataset_name, (L, V, U)) in itertools.product(gen_classifier
     # must be nested in the acc-for
     for acc_name, acc_fn in gen_acc_measure():
         for (method_name, method) in gen_CAP(h, acc_fn):
-            result_path = getpath(cls_name, acc_name, dataset_name, method_name)
+            result_path = getpath(basedir, cls_name, acc_name, dataset_name, method_name)
            if os.path.exists(result_path):
                print(f'\t{method_name}-{acc_name} exists, skipping')
                continue
@@ -75,7 +45,7 @@ for (cls_name, h),
(dataset_name, (L, V, U)) in itertools.product(gen_classifier # instances of CAPContingencyTable instead are generic, and the evaluation measure can # be nested to the predictions to speed up things for (method_name, method) in gen_CAP_cont_table(h): - if not any_missing(cls_name, dataset_name, method_name): + if not any_missing(basedir, cls_name, dataset_name, method_name): print(f'\tmethod {method_name} has all results already computed. Skipping.') continue @@ -84,14 +54,22 @@ for (cls_name, h), (dataset_name, (L, V, U)) in itertools.product(gen_classifier method, t_train = fit_method(method, V) estim_accs_dict, t_test_ave = predictionsCAPcont_table(method, test_prot, gen_acc_measure) for acc_name in estim_accs_dict.keys(): - result_path = getpath(cls_name, acc_name, dataset_name, method_name) + result_path = getpath(basedir, cls_name, acc_name, dataset_name, method_name) save_json_result(result_path, true_accs[acc_name], estim_accs_dict[acc_name], t_train, t_test_ave) print() # generate diagonal plots +print('generating plots') for (cls_name, _), (acc_name, _) in itertools.product(gen_classifiers(), gen_acc_measure()): - results = open_results(cls_name, acc_name) - plot_diagonal(cls_name, acc_name, results) + methods = get_method_names() + results = open_results(basedir, cls_name, acc_name, method_name=methods) + plot_diagonal(cls_name, acc_name, results, base_dir=f'plots/{basedir}/all') + for dataset_name, _ in gen_datasets(only_names=True): + results = open_results(basedir, cls_name, acc_name, dataset_name=dataset_name, method_name=methods) + plot_diagonal(cls_name, acc_name, results, base_dir=f'plots/{basedir}/{dataset_name}') + +print('generating tables') +gen_tables(basedir, datasets=[d for d,_ in gen_datasets(only_names=True)]) diff --git a/ClassifierAccuracy/gen_tables.py b/ClassifierAccuracy/gen_tables.py index e5bd279..e306e74 100644 --- a/ClassifierAccuracy/gen_tables.py +++ b/ClassifierAccuracy/gen_tables.py @@ -1,3 +1,3 @@ -from utils import gen_tables +from commons import gen_tables gen_tables() \ No newline at end of file diff --git a/ClassifierAccuracy/models_multiclass.py b/ClassifierAccuracy/models_multiclass.py index ce67d16..23b9a7c 100644 --- a/ClassifierAccuracy/models_multiclass.py +++ b/ClassifierAccuracy/models_multiclass.py @@ -1,3 +1,5 @@ +from copy import deepcopy + import numpy as np from sklearn.base import BaseEstimator from sklearn.linear_model import LogisticRegression @@ -14,7 +16,7 @@ from sklearn.model_selection import cross_val_predict from quapy.protocol import UPP from quapy.method.base import BaseQuantifier -from quapy.method.aggregative import PACC +from quapy.method.aggregative import PACC, AggregativeQuantifier import quapy.functional as F @@ -102,20 +104,38 @@ class NaiveCAP(CAPContingencyTable): return self.cont_table -class ContTableTransferCAP(CAPContingencyTable): +class CAPContingencyTableQ(CAPContingencyTable): + + def __init__(self, h: BaseEstimator, acc: callable, q_class: AggregativeQuantifier, reuse_h=False): + super().__init__(h, acc) + self.reuse_h = reuse_h + if reuse_h: + assert isinstance(q_class, AggregativeQuantifier), f'quantifier {q_class} is not of type aggregative' + self.q = deepcopy(q_class) + self.q.set_params(classifier=h) + else: + self.q = q_class + + def quantifier_fit(self, val: LabelledCollection): + if self.reuse_h: + self.q.fit(val, fit_classifier=False, val_split=val) + else: + self.q.fit(val) + + +class ContTableTransferCAP(CAPContingencyTableQ): """ """ - def __init__(self, h: BaseEstimator, acc: callable, q: 
BaseQuantifier):
-        super().__init__(h, acc)
-        self.q = q
+    def __init__(self, h: BaseEstimator, acc: callable, q_class, reuse_h=False):
+        super().__init__(h, acc, q_class, reuse_h)
 
     def fit(self, val: LabelledCollection):
         y_hat = self.h.predict(val.X)
         y_true = val.y
         self.cont_table = confusion_matrix(y_true, y_pred=y_hat, labels=val.classes_)
         self.train_prev = val.prevalence()
-        self.q.fit(val)
+        self.quantifier_fit(val)
         return self
 
     def predict_ct(self, test):
@@ -128,52 +148,18 @@ class ContTableTransferCAP(CAPContingencyTable):
         return self.cont_table * adjustment[:, np.newaxis]
 
 
-class ContTableWithHTransferCAP(CAPContingencyTable):
-    """
-
-    """
-    def __init__(self, h: BaseEstimator, acc: callable, q_class):
-        super().__init__(h, acc)
-        self.q = q_class(classifier=h)
-
-    def fit(self, val: LabelledCollection):
-        y_hat = self.h.predict(val.X)
-        y_true = val.y
-        self.cont_table = confusion_matrix(y_true, y_pred=y_hat, labels=val.classes_)
-        self.train_prev = val.prevalence()
-        self.q.fit(val, fit_classifier=False, val_split=val)
-        return self
-
-    def predict_ct(self, test):
-        """
-        :param test: test collection (ignored)
-        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
-        """
-        test_prev_estim = self.q.quantify(test)
-        adjustment = test_prev_estim / self.train_prev
-        return self.cont_table * adjustment[:, np.newaxis]
-
-
-class NsquaredEquationsCAP(CAPContingencyTable):
+class NsquaredEquationsCAP(CAPContingencyTableQ):
     """
 
     """
     def __init__(self, h: BaseEstimator, acc: callable, q_class, reuse_h=False):
-        super().__init__(h, acc)
-        self.reuse_h = reuse_h
-        if reuse_h:
-            self.q = q_class(classifier=h)
-        else:
-            self.q = q_class(classifier=LogisticRegression())
+        super().__init__(h, acc, q_class, reuse_h)
 
     def fit(self, val: LabelledCollection):
         y_hat = self.h.predict(val.X)
         y_true = val.y
         self.cont_table = confusion_matrix(y_true, y_pred=y_hat, labels=val.classes_)
-        if self.reuse_h:
-            self.q.fit(val, fit_classifier=False, val_split=val)
-        else:
-            self.q.fit(val)
+        self.quantifier_fit(val)
         self.A, self.partial_b = self._construct_equations()
         return self
@@ -247,8 +233,22 @@ class NsquaredEquationsCAP(CAPContingencyTable):
         b[-2*(n-1):-(n-1)] = cc_prev_estim[1:]
         b[-(n-1):] = q_prev_estim[1:]
 
+        # try the fast solution (may not be valid)
         x = np.linalg.solve(A, b)
 
+        # keep it only if it is a valid contingency table (entries in [0,1] summing to 1)
+        if any(x<0) or any(x>1) or not np.isclose(x.sum(), 1):
+
+            print('L', end='')
+
+            # try the iterative solution
+            def loss(x):
+                return np.linalg.norm(A @ x - b, ord=2)
+
+            x = F.optim_minimize(loss, n_classes=n**2)
+
+        else:
+            print('.', end='')
+
         cont_table_test = x.reshape(n,n)
         return cont_table_test
@@ -334,3 +334,118 @@ class PabloCAP(ClassifierAccuracyPrediction):
            raise ValueError('unknown aggregation function')
 
 
+class QuAcc:
+    def _get_X_dot(self, X):
+        h = self.h
+        if hasattr(h, 'predict_proba'):
+            P = h.predict_proba(X)[:, 1:]
+        else:
+            n_classes = len(h.classes_)
+            P = h.decision_function(X).reshape(-1, n_classes)
+
+        X_dot = safehstack(X, P)
+        return X_dot
+
+
+class QuAcc1xN2(CAPContingencyTableQ, QuAcc):
+
+    def __init__(self, h: BaseEstimator, acc: callable, q_class: AggregativeQuantifier):
+        self.h = h
+        self.acc = acc
+        self.q = EmptySaveQuantifier(q_class)
+
+    def fit(self, val: LabelledCollection):
+        pred_labels = self.h.predict(val.X)
+        true_labels = val.y
+
+        n = val.n_classes
+        classes_dot = np.arange(n**2)
+        ct_class_idx = classes_dot.reshape(n, n)
+
+        X_dot = self._get_X_dot(val.X)
+        y_dot = ct_class_idx[true_labels, pred_labels]
+        val_dot = LabelledCollection(X_dot, y_dot,
classes=classes_dot) + self.q.fit(val_dot) + + def predict_ct(self, X): + X_dot = self._get_X_dot(X) + return self.q.quantify(X_dot) + + +class QuAccNxN(CAPContingencyTableQ, QuAcc): + + def __init__(self, h: BaseEstimator, acc: callable, q_class: AggregativeQuantifier): + self.h = h + self.acc = acc + self.q_class = q_class + + def fit(self, val: LabelledCollection): + pred_labels = self.h.predict(val.X) + true_labels = val.y + X_dot = self._get_X_dot(val.X) + + self.q = [] + for class_i in self.h.classes_: + X_dot_i = X_dot[pred_labels==class_i] + y_i = true_labels[pred_labels==class_i] + data_i = LabelledCollection(X_dot_i, y_i, classes=val.classes_) + + q_i = EmptySaveQuantifier(deepcopy(self.q_class)) + q_i.fit(data_i) + self.q.append(q_i) + + def predict_ct(self, X): + classes = self.h.classes_ + pred_labels = self.h.predict(X) + X_dot = self._get_X_dot(X) + pred_prev = F.prevalence_from_labels(pred_labels, classes) + cont_table = [] + for class_i, q_i, p_i in zip(classes, self.q, pred_prev): + X_dot_i = X_dot[pred_labels==class_i] + classcond_cond_table_prevs = q_i.quantify(X_dot_i) + cond_table_prevs = p_i * classcond_cond_table_prevs + cont_table.append(cond_table_prevs) + cont_table = np.vstack(cont_table) + return cont_table + + +def safehstack(X, P): + if issparse(X) or issparse(P): + XP = scipy.sparse.hstack([X, P]) + XP = csr_matrix(XP) + else: + XP = np.hstack([X,P]) + return XP + + +class EmptySaveQuantifier(BaseQuantifier): + def __init__(self, surrogate_quantifier: BaseQuantifier): + self.surrogate = surrogate_quantifier + + def fit(self, data: LabelledCollection): + self.n_classes = data.n_classes + class_compact_data, self.old_class_idx = data.compact_classes() + if self.num_non_empty_classes() > 1: + self.surrogate.fit(class_compact_data) + return self + + def quantify(self, instances): + num_instances = instances.shape[0] + if self.num_non_empty_classes() == 0 or num_instances==0: + # returns the uniform prevalence vector + uniform = np.full(fill_value=1./self.n_classes, shape=self.n_classes, dtype=float) + return uniform + elif self.num_non_empty_classes() == 1: + # returns a prevalence vector with 100% of the mass in the only non empty class + prev_vector = np.full(fill_value=0., shape=self.n_classes, dtype=float) + prev_vector[self.old_class_idx[0]] = 1 + return prev_vector + else: + class_compact_prev = self.surrogate.quantify(instances) + prev_vector = np.full(fill_value=0., shape=self.n_classes, dtype=float) + prev_vector[self.old_class_idx] = class_compact_prev + return prev_vector + + def num_non_empty_classes(self): + return len(self.old_class_idx) + diff --git a/ClassifierAccuracy/utils.py b/ClassifierAccuracy/utils.py deleted file mode 100644 index 257f69f..0000000 --- a/ClassifierAccuracy/utils.py +++ /dev/null @@ -1,164 +0,0 @@ -import itertools -import os -from collections import defaultdict - -import matplotlib.pyplot as plt -from pathlib import Path -from os import makedirs -from os.path import join -import numpy as np -import json -from scipy.stats import pearsonr -from sklearn.linear_model import LogisticRegression -from time import time -import quapy as qp -from glob import glob - -from commons import cap_errors -from models_multiclass import ClassifierAccuracyPrediction, CAPContingencyTable - - -def plot_diagonal(cls_name, measure_name, results, base_dir='plots'): - - makedirs(base_dir, exist_ok=True) - makedirs(join(base_dir, measure_name), exist_ok=True) - - # Create scatter plot - plt.figure(figsize=(10, 10)) - plt.xlim(0, 1) - plt.ylim(0, 1) - 
plt.plot([0, 1], [0, 1], color='black', linestyle='--') - - for method_name in results.keys(): - print(method_name, measure_name) - xs = results[method_name]['true_acc'] - ys = results[method_name]['estim_acc'] - print('max xs', np.max(xs)) - print('max ys', np.max(ys)) - err = cap_errors(xs, ys).mean() - #pear_cor, _ = 0, 0 #pearsonr(xs, ys) - plt.scatter(xs, ys, label=f'{method_name} {err:.3f}', alpha=0.6) - - plt.legend() - - # Add labels and title - plt.xlabel(f'True {measure_name}') - plt.ylabel(f'Estimated {measure_name}') - - # Display the plot - # plt.show() - plt.savefig(join(base_dir, measure_name, 'diagonal_'+cls_name+'.png')) - - -def getpath(cls_name, acc_name, dataset_name, method_name): - return f"results/{cls_name}/{acc_name}/{dataset_name}/{method_name}.json" - - -def open_results(cls_name, acc_name, dataset_name='*', method_name='*'): - path = getpath(cls_name, acc_name, dataset_name, method_name) - results = defaultdict(lambda : {'true_acc':[], 'estim_acc':[]}) - for file in glob(path): - #print(file) - method = Path(file).name.replace('.json','') - result = json.load(open(file, 'r')) - results[method]['true_acc'].extend(result['true_acc']) - results[method]['estim_acc'].extend(result['estim_acc']) - return results - - -def save_json_file(path, data): - os.makedirs(Path(path).parent, exist_ok=True) - with open(path, 'w') as f: - json.dump(data, f) - - -def save_json_result(path, true_accs, estim_accs, t_train, t_test): - result = { - 't_train': t_train, - 't_test_ave': t_test, - 'true_acc': true_accs, - 'estim_acc': estim_accs - } - save_json_file(path, result) - - -def get_dataset_stats(path, test_prot, L, V): - test_prevs = [Ui.prevalence() for Ui in test_prot()] - shifts = [qp.error.ae(L.prevalence(), Ui_prev) for Ui_prev in test_prevs] - info = { - 'n_classes': L.n_classes, - 'n_train': len(L), - 'n_val': len(V), - 'train_prev': L.prevalence().tolist(), - 'val_prev': V.prevalence().tolist(), - 'test_prevs': [x.tolist() for x in test_prevs], - 'shifts': [x.tolist() for x in shifts], - 'sample_size': test_prot.sample_size, - 'num_samples': test_prot.total() - } - save_json_file(path, info) - - -def gen_tables(): - from commons import gen_datasets, gen_classifiers, gen_acc_measure, gen_CAP, gen_CAP_cont_table - from tabular import Table - - mock_h = LogisticRegression(), - methods = [method for method, _ in gen_CAP(mock_h, None)] + [method for method, _ in gen_CAP_cont_table(mock_h)] - datasets = [dataset for dataset, _ in gen_datasets()] - classifiers = [classifier for classifier, _ in gen_classifiers()] - measures = [measure for measure, _ in gen_acc_measure()] - - os.makedirs('tables', exist_ok=True) - - tex_doc = """ - \\documentclass[10pt,a4paper]{article} - \\usepackage[utf8]{inputenc} - \\usepackage{amsmath} - \\usepackage{amsfonts} - \\usepackage{amssymb} - \\usepackage{graphicx} - \\usepackage{tabularx} - \\usepackage{color} - \\usepackage{colortbl} - \\usepackage{xcolor} - \\begin{document} - """ - - classifier = classifiers[0] - metric = "vanilla_accuracy" - - table = Table(datasets, methods) - for method, dataset in itertools.product(methods, datasets): - path = f'results/{classifier}/{metric}/{dataset}/{method}.json' - results = json.load(open(path, 'r')) - true_acc = results['true_acc'] - estim_acc = np.asarray(results['estim_acc']) - if any(np.isnan(estim_acc)) or any(estim_acc>1) or any(estim_acc<0): - print(f'error in {method=} {dataset=}') - continue - errors = cap_errors(true_acc, estim_acc) - table.add(dataset, method, errors) - - tex = 
table.latexTabular()
-    table_name = f'{classifier}_{metric}.tex'
-    with open(f'./tables/{table_name}', 'wt') as foo:
-        foo.write('\\resizebox{\\textwidth}{!}{%\n')
-        foo.write('\\begin{tabular}{c|'+('c'*len(methods))+'}\n')
-        foo.write(tex)
-        foo.write('\\end{tabular}%\n')
-        foo.write('}\n')
-
-    tex_doc += "\input{" + table_name + "}\n"
-
-    tex_doc += """
-    \\end{document}
-    """
-    with open(f'./tables/main.tex', 'wt') as foo:
-        foo.write(tex_doc)
-
-    print("[Tables Done] runing latex")
-    os.chdir('./tables/')
-    os.system('pdflatex main.tex')
-    os.system('rm main.aux main.bbl main.blg main.log main.out main.dvi')
-
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 2629084..eb41c44 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -232,11 +232,24 @@ class LabelledCollection:
         :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
             second one with `1-train_prop` elements
         """
+        instances = self.instances
+        labels = self.labels
+        remainder = None
+        for idx in np.argwhere(self.counts()==1):
+            class_with_1 = self.classes_[idx.item()]
+            if remainder is None:
+                remainder = LabelledCollection(instances[labels==class_with_1], [class_with_1], classes=self.classes_)
+            else:
+                remainder += LabelledCollection(instances[labels==class_with_1], [class_with_1], classes=self.classes_)
+            instances = instances[labels!=class_with_1]
+            labels = labels[labels!=class_with_1]
         tr_docs, te_docs, tr_labels, te_labels = train_test_split(
-            self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state
+            instances, labels, train_size=train_prop, stratify=labels, random_state=random_state
         )
         training = LabelledCollection(tr_docs, tr_labels, classes=self.classes_)
         test = LabelledCollection(te_docs, te_labels, classes=self.classes_)
+        if remainder is not None:
+            training += remainder
         return training, test
 
     def split_random(self, train_prop=0.6, random_state=None):
@@ -414,6 +427,47 @@ class LabelledCollection:
             test = self.sampling_from_index(test_index)
             yield train, test
 
+    def empty_classes(self):
+        """
+        Returns a np.ndarray with the empty classes, i.e., the classes in self.classes_ for which
+        the collection contains no instances. If there are none, an empty np.ndarray is returned.
+
+        :return: np.ndarray
+        """
+        idx = np.argwhere(self.counts()==0).flatten()
+        return self.classes_[idx]
+
+    def non_empty_classes(self):
+        """
+        Returns a np.ndarray with the non-empty classes, i.e., the classes in self.classes_ for which
+        the collection contains at least one instance. If there are none, an empty np.ndarray is returned.
+
+        :return: np.ndarray
+        """
+        idx = np.argwhere(self.counts() > 0).flatten()
+        return self.classes_[idx]
+
+    def has_empty_classes(self):
+        """
+        Checks whether the collection has empty classes
+
+        :return: boolean
+        """
+        return len(self.empty_classes()) > 0
+
+    def compact_classes(self):
+        """
+        Generates a new LabelledCollection object containing only the non-empty classes. It also returns
+        a np.ndarray with the positions that the classes of the new collection occupy in the original self.classes_.
+
+        :return: a tuple (LabelledCollection, np.ndarray)
+        """
+        non_empty = self.non_empty_classes()
+        all_classes = self.classes_
+        old_pos = np.searchsorted(all_classes, non_empty)
+        non_empty_collection = LabelledCollection(*self.Xy, classes=non_empty)
+        return non_empty_collection, old_pos
+
 
 class Dataset:
     """