diff --git a/.gitignore b/.gitignore index b5ecfbc..a18696a 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,4 @@ dmypy.json .pyre/ *__pycache__* +*.dataframe diff --git a/distribution_matching/binary_experiments.py b/distribution_matching/binary_experiments.py new file mode 100644 index 0000000..2a17027 --- /dev/null +++ b/distribution_matching/binary_experiments.py @@ -0,0 +1,84 @@ +import pickle +import os +from sklearn.linear_model import LogisticRegression as LR +from distribution_matching.commons import BIN_METHODS, new_method, show_results + +import quapy as qp +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP + + +SEED = 1 + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 100 + qp.environ['N_JOBS'] = -1 + n_bags_val = 250 + n_bags_test = 1000 + optim = 'mae' + result_dir = f'results/binary/{optim}' + + os.makedirs(result_dir, exist_ok=True) + + for method in BIN_METHODS: + + print('Init method', method) + + global_result_path = f'{result_dir}/{method}' + + if not os.path.exists(global_result_path + '.csv'): + with open(global_result_path + '.csv', 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n') + + with open(global_result_path + '.csv', 'at') as csv: + + for dataset in qp.datasets.UCI_DATASETS: + if dataset in ['acute.a', 'acute.b', 'iris.1']: continue # , 'pageblocks.5', 'spambase', 'wdbc']: continue + + print('init', dataset) + + local_result_path = global_result_path + '_' + dataset + if os.path.exists(local_result_path + '.dataframe'): + print(f'result file {local_result_path}.dataframe already exists; skipping') + continue + + with qp.util.temp_seed(SEED): + + param_grid, quantifier = new_method(method, max_iter=3000) + + data = qp.datasets.fetch_UCIDataset(dataset) + + # model selection + train, test = data.train_test + train, val = train.split_stratified() + + protocol = UPP(val, repeats=n_bags_val) + modsel = GridSearchQ( + quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim + ) + + try: + modsel.fit(train) + + print(f'best params {modsel.best_params_}') + print(f'best score {modsel.best_score_}') + pickle.dump( + (modsel.best_params_, modsel.best_score_,), + open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL) + + quantifier = modsel.best_model() + except: + print('something went wrong... 
reporting CC') + quantifier = qp.method.aggregative.CC(LR()).fit(train) + + protocol = UPP(test, repeats=n_bags_test) + report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], + verbose=True) + report.to_csv(f'{local_result_path}.dataframe') + means = report.mean() + csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n') + csv.flush() + + show_results(global_result_path) diff --git a/distribution_matching/commons.py b/distribution_matching/commons.py new file mode 100644 index 0000000..d2ccd40 --- /dev/null +++ b/distribution_matching/commons.py @@ -0,0 +1,78 @@ +import numpy as np +import pandas as pd +from distribution_matching.method_kdey import KDEy +from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC +from distribution_matching.method_dirichlety import DIRy +from sklearn.linear_model import LogisticRegression + + +METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML'] +BIN_METHODS = ['ACC', 'PACC', 'HDy', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML'] + + +hyper_LR = { + 'classifier__C': np.logspace(-3,3,7), + 'classifier__class_weight': ['balanced', None] +} + +def new_method(method, **lr_kwargs): + + lr = LogisticRegression(**lr_kwargs) + + if method == 'CC': + param_grid = hyper_LR + quantifier = CC(lr) + elif method == 'PCC': + param_grid = hyper_LR + quantifier = PCC(lr) + elif method == 'ACC': + param_grid = hyper_LR + quantifier = ACC(lr) + elif method == 'PACC': + param_grid = hyper_LR + quantifier = PACC(lr) + elif method == 'KDEy-ML': + method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} + param_grid = {**method_params, **hyper_LR} + quantifier = KDEy(lr, target='max_likelihood', val_split=10) + elif method in ['KDEy-DM']: + method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} + param_grid = {**method_params, **hyper_LR} + quantifier = KDEy(lr, target='min_divergence', divergence='l2', montecarlo_trials=5000, val_split=10) + elif method == 'DIR': + param_grid = hyper_LR + quantifier = DIRy(lr) + elif method == 'EMQ': + param_grid = hyper_LR + quantifier = EMQ(lr) + elif method == 'HDy-OvA': + param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()} + quantifier = OneVsAllAggregative(HDy(lr)) + elif method == 'DM': + method_params = { + 'nbins': [4,8,16,32], + 'val_split': [10, 0.4], + 'divergence': ['HD', 'topsoe', 'l2'] + } + param_grid = {**method_params, **hyper_LR} + quantifier = DistributionMatching(lr) + + elif method in ['KDE-DMkld']: + method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} + param_grid = {**method_params, **hyper_LR} + quantifier = KDEy(lr, target='min_divergence', divergence='KLD', montecarlo_trials=5000, val_split=10) + + else: + raise NotImplementedError('unknown method', method) + + return param_grid, quantifier + + +def show_results(result_path): + df = pd.read_csv(result_path+'.csv', sep='\t') + + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) + print(pv) + diff --git a/distribution_matching/figures/sensibility_plot.py b/distribution_matching/figures/sensibility_plot.py new file mode 100644 index 0000000..0165fbc --- /dev/null +++ b/distribution_matching/figures/sensibility_plot.py @@ -0,0 +1,26 @@ +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +""" +This script generates plots 
of sensibility for the hyperparameter "bandwidth". +Plots results for MAE, MRAE, and KLD +The rest of hyperparameters were set to their default values +""" + +df_tweet = pd.read_csv('../results_tweet_sensibility/KDEy-MLE.csv', sep='\t') +df_lequa = pd.read_csv('../results_lequa_sensibility/KDEy-MLE.csv', sep='\t') +df = pd.concat([df_tweet, df_lequa]) + +for err in ['MAE', 'MRAE', 'KLD']: + piv = df.pivot_table(index='Bandwidth', columns='Dataset', values=err) + g = sns.lineplot(data=piv, markers=True, dashes=False) + g.set(xlim=(0.01, 0.2)) + g.legend(loc="center left", bbox_to_anchor=(1, 0.5)) + g.set_ylabel(err) + g.set_xticks(np.linspace(0.01, 0.2, 20)) + plt.xticks(rotation=90) + plt.grid() + plt.savefig(f'./sensibility_{err}.pdf', bbox_inches='tight') + plt.clf() \ No newline at end of file diff --git a/distribution_matching/lequa_bandwidth_sensibility.py b/distribution_matching/lequa_bandwidth_sensibility.py new file mode 100644 index 0000000..d212ab3 --- /dev/null +++ b/distribution_matching/lequa_bandwidth_sensibility.py @@ -0,0 +1,56 @@ +import numpy as np +from sklearn.linear_model import LogisticRegression +import os +import pandas as pd +import quapy as qp +from method_kdey import KDEy + + +SEED=1 + +def task(bandwidth): + print('job-init', dataset, bandwidth) + train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset) + + with qp.util.temp_seed(SEED): + quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=bandwidth) + quantifier.fit(train) + report = qp.evaluation.evaluation_report( + quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True) + return report + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B'] + qp.environ['N_JOBS'] = -1 + result_dir = f'results_lequa_sensibility' + + os.makedirs(result_dir, exist_ok=True) + + method = 'KDEy-MLE' + + global_result_path = f'{result_dir}/{method}' + + if not os.path.exists(global_result_path+'.csv'): + with open(global_result_path+'.csv', 'wt') as csv: + csv.write(f'Method\tDataset\tBandwidth\tMAE\tMRAE\tKLD\n') + + dataset = 'T1B' + bandwidths = np.linspace(0.01, 0.2, 20) + + reports = qp.util.parallel(task, bandwidths, n_jobs=-1) + with open(global_result_path + '.csv', 'at') as csv: + for bandwidth, report in zip(bandwidths, reports): + means = report.mean() + local_result_path = global_result_path + '_' + dataset + f'_{bandwidth:.3f}' + report.to_csv(f'{local_result_path}.dataframe') + csv.write(f'{method}\tLeQua-T1B\t{bandwidth}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n') + csv.flush() + + df = pd.read_csv(global_result_path + '.csv', sep='\t') + + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) + print(pv) diff --git a/distribution_matching/lequa_experiments.py b/distribution_matching/lequa_experiments.py index 95b6b5e..24c1125 100644 --- a/distribution_matching/lequa_experiments.py +++ b/distribution_matching/lequa_experiments.py @@ -1,16 +1,12 @@ import pickle import numpy as np -from sklearn.linear_model import LogisticRegression import os -import sys import pandas as pd +from distribution_matching.commons import METHODS, new_method, show_results import quapy as qp -from quapy.method.aggregative import EMQ, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC -from method_kdey import KDEy -from method_dirichlety import DIRy from quapy.model_selection import 
GridSearchQ -from quapy.protocol import UPP + if __name__ == '__main__': @@ -18,16 +14,11 @@ if __name__ == '__main__': qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B'] qp.environ['N_JOBS'] = -1 optim = 'mrae' - result_dir = f'results/results_lequa_{optim}' + result_dir = f'results/lequa/{optim}' os.makedirs(result_dir, exist_ok=True) - hyper_LR = { - 'classifier__C': np.logspace(-3,3,7), - 'classifier__class_weight': ['balanced', None] - } - - for method in ['DIR']:#'HDy-OvA', 'SLD', 'ACC-tv', 'PACC-tv']: #['DM', 'DIR']: #'KDEy-MLE', 'KDE-DM', 'DM', 'DIR']: + for method in METHODS: print('Init method', method) @@ -43,39 +34,7 @@ if __name__ == '__main__': dataset = 'T1B' train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset) print(f'init {dataset} #instances: {len(train)}') - if method == 'KDEy-MLE': - method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} - param_grid = {**method_params, **hyper_LR} - quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10) - elif method in ['KDE-DM']: - method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} - param_grid = {**method_params, **hyper_LR} - quantifier = KDEy(LogisticRegression(), target='min_divergence', divergence='l2', montecarlo_trials=5000, val_split=10) - elif method == 'DIR': - param_grid = hyper_LR - quantifier = DIRy(LogisticRegression()) - elif method == 'SLD': - param_grid = hyper_LR - quantifier = EMQ(LogisticRegression()) - elif method == 'PACC-tv': - param_grid = hyper_LR - quantifier = PACC(LogisticRegression()) - elif method == 'ACC-tv': - param_grid = hyper_LR - quantifier = ACC(LogisticRegression()) - elif method == 'HDy-OvA': - param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()} - quantifier = OneVsAllAggregative(HDy(LogisticRegression())) - elif method == 'DM': - method_params = { - 'nbins': [4,8,16,32], - 'val_split': [10, 0.4], - 'divergence': ['HD', 'topsoe', 'l2'] - } - param_grid = {**method_params, **hyper_LR} - quantifier = DistributionMatching(LogisticRegression()) - else: - raise NotImplementedError('unknown method', method) + param_grid, quantifier = new_method(method) if param_grid is not None: modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim) @@ -98,9 +57,4 @@ if __name__ == '__main__': csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n') csv.flush() - df = pd.read_csv(result_path+'.csv', sep='\t') - - pd.set_option('display.max_columns', None) - pd.set_option('display.max_rows', None) - pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) - print(pv) + show_results(result_path) diff --git a/distribution_matching/method_kdey.py b/distribution_matching/method_kdey.py index 2b25227..9ab10c9 100644 --- a/distribution_matching/method_kdey.py +++ b/distribution_matching/method_kdey.py @@ -1,3 +1,4 @@ +from cgi import test import os import sys from typing import Union, Callable @@ -34,6 +35,7 @@ class KDEy(AggregativeProbabilisticQuantifier): f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}' assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}' assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}' + assert divergence=='KLD', 'in this version I will only allow KLD as a divergence' self.classifier = classifier self.val_split = val_split self.divergence = divergence @@ -54,7 +56,7 @@ class KDEy(AggregativeProbabilisticQuantifier): 
print(f'auto: bandwidth={bandwidth:.5f}') return bandwidth - def get_kde(self, posteriors): + def get_kde_function(self, posteriors): # if self.bandwidth == 'auto': # print('adjusting bandwidth') # @@ -116,27 +118,26 @@ class KDEy(AggregativeProbabilisticQuantifier): self.classifier, y, posteriors, classes, class_count = cross_generate_predictions( data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs ) + print('classifier fit done') if self.bandwidth == 'auto': self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y) - self.val_densities = [self.get_kde(posteriors[y == cat]) for cat in range(data.n_classes)] + self.val_densities = [self.get_kde_function(posteriors[y == cat]) for cat in range(data.n_classes)] self.val_posteriors = posteriors - if self.target == 'min_divergence': + if self.target == 'min_divergence_depr': self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials) self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities] + if self.target == 'min_divergence': + self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities] + self.class_sample_densities = {} + for ci, samples_i in enumerate(self.class_samples): + self.class_sample_densities[ci] = np.asarray([self.pdf(kde_j, samples_i) for kde_j in self.val_densities]).T + print('kde fit done') return self - #def val_pdf(self, prev): - """ - Returns a function that computes the mixture model with the given prev as mixture factor - :param prev: a prevalence vector, ndarray - :return: a function implementing the validation distribution with fixed mixture factor - """ - # return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev)) - def aggregate(self, posteriors: np.ndarray): if self.target == 'min_divergence': return self._target_divergence(posteriors) @@ -145,44 +146,64 @@ class KDEy(AggregativeProbabilisticQuantifier): else: raise ValueError('unknown target') - def _target_divergence_depr(self, posteriors): - # this variant is, I think, ill-formed, since it evaluates the likelihood on the test points, which are - # overconfident in the KDE-test. 
- test_density = self.get_kde(posteriors) - # val_test_posteriors = np.concatenate([self.val_posteriors, posteriors]) - test_likelihood = self.pdf(test_density, posteriors) - divergence = _get_divergence(self.divergence) - n_classes = len(self.val_densities) - - def match(prev): - val_pdf = self.val_pdf(prev) - val_likelihood = val_pdf(posteriors) - - return divergence(val_likelihood, test_likelihood) - - # the initial point is set as the uniform distribution - uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) - - # solutions are bounded to those contained in the unit-simplex - bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] - constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 - r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) - return r.x + # this is the variant I have in the current results, which I think is bugged + # def _target_divergence_depr(self, posteriors): + # # in this variant we evaluate the divergence using a Montecarlo approach + # n_classes = len(self.val_densities) + # + # test_kde = self.get_kde_function(posteriors) + # test_likelihood = self.pdf(test_kde, self.samples) + # + # divergence = _get_divergence(self.divergence) + # + # def match(prev): + # val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities)) + # return divergence(val_likelihood, test_likelihood) + # + # # the initial point is set as the uniform distribution + # uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + # + # # solutions are bounded to those contained in the unit-simplex + # bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] + # constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 + # r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + # return r.x def _target_divergence(self, posteriors): # in this variant we evaluate the divergence using a Montecarlo approach n_classes = len(self.val_densities) - test_kde = self.get_kde(posteriors) - test_likelihood = self.pdf(test_kde, self.samples) - - divergence = _get_divergence(self.divergence) - + test_kde = self.get_kde_function(posteriors) + test_densities_per_class = [self.pdf(test_kde, samples_i) for samples_i in self.class_samples] + + # divergence = _get_divergence(self.divergence) + def kld_monte(pi, qi, eps=1e-8): + # there is no pi in front of the log because the samples are already drawn according to pi + smooth_pi = pi+eps + smooth_qi = qi+eps + return np.mean(np.log(smooth_pi / smooth_qi)) + def match(prev): + # choose the samples according to the prevalence vector + # e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2 + # the points are already pre-sampled and the densities are pre-computed, so all that remains + # is to pick, from each class, a number of variates proportional to its prevalence (same for test) + num_variates_per_class = np.round(prev * self.montecarlo_trials).astype(int) + sample_densities = np.vstack( + [self.class_sample_densities[ci][:num_i] for ci, num_i in enumerate(num_variates_per_class)] + ) + #val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, sample_densities.T)) + val_likelihood = prev @ sample_densities.T + 
#test_likelihood = [] + #for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class): + # test_likelihood.append(samples_i[:num_i]) + #test_likelihood = np.concatenate[test] + test_likelihood = np.concatenate( + [samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)] + ) + return kld_monte(val_likelihood, test_likelihood) + # the initial point is set as the uniform distribution uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) diff --git a/distribution_matching/tweets_experiments.py b/distribution_matching/tweets_experiments.py index 3aa3611..16b18de 100644 --- a/distribution_matching/tweets_experiments.py +++ b/distribution_matching/tweets_experiments.py @@ -1,14 +1,9 @@ import pickle -import numpy as np -from sklearn.linear_model import LogisticRegression import os -import sys import pandas as pd +from distribution_matching.commons import METHODS, new_method, show_results import quapy as qp -from quapy.method.aggregative import EMQ, DistributionMatching, PACC, ACC, CC, PCC, HDy, OneVsAllAggregative -from method_kdey import KDEy -from method_dirichlety import DIRy from quapy.model_selection import GridSearchQ from quapy.protocol import UPP @@ -21,16 +16,11 @@ if __name__ == '__main__': n_bags_val = 250 n_bags_test = 1000 optim = 'mae' - result_dir = f'results/results_tweet_{optim}_redohyper' + result_dir = f'results/tweet/{optim}' os.makedirs(result_dir, exist_ok=True) - hyper_LR = { - 'classifier__C': np.logspace(-3,3,7), - 'classifier__class_weight': ['balanced', None] - } - - for method in ['CC', 'SLD', 'PCC', 'PACC-tv', 'ACC-tv', 'DM', 'HDy-OvA', 'KDEy-MLE', 'KDE-DM', 'DIR']: + for method in METHODS: print('Init method', method) @@ -59,67 +49,7 @@ if __name__ == '__main__': if not is_semeval or not semeval_trained: - if method == 'KDE': # not used - method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} - param_grid = {**method_params, **hyper_LR} - quantifier = KDEy(LogisticRegression(), target='max_likelihood') - elif method == 'KDEy-MLE': - method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} - param_grid = {**method_params, **hyper_LR} - quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10) - elif method in ['KDE-DM']: - method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} - param_grid = {**method_params, **hyper_LR} - quantifier = KDEy(LogisticRegression(), target='min_divergence', divergence='l2', montecarlo_trials=5000, val_split=10) - elif method == 'DIR': - param_grid = hyper_LR - quantifier = DIRy(LogisticRegression()) - elif method == 'SLD': - param_grid = hyper_LR - quantifier = EMQ(LogisticRegression()) - elif method == 'PACC-tv': - param_grid = hyper_LR - quantifier = PACC(LogisticRegression()) - #elif method == 'PACC-kfcv': - # param_grid = hyper_LR - # quantifier = PACC(LogisticRegression(), val_split=10) - elif method == 'PACC': - method_params = {'val_split': [10, 0.4]} - param_grid = {**method_params, **hyper_LR} - quantifier = PACC(LogisticRegression()) - elif method == 'ACC': - method_params = {'val_split': [10, 0.4]} - param_grid = {**method_params, **hyper_LR} - quantifier = ACC(LogisticRegression()) - elif method == 'PCC': - param_grid = hyper_LR - quantifier = PCC(LogisticRegression()) - elif method == 'ACC-tv': - param_grid = hyper_LR - quantifier = ACC(LogisticRegression()) - elif method == 'CC': - param_grid = hyper_LR - quantifier = CC(LogisticRegression()) - elif method == 'HDy-OvA': - param_grid = {'binary_quantifier__'+key:val for 
key,val in hyper_LR.items()} - quantifier = OneVsAllAggregative(HDy(LogisticRegression())) - #elif method == 'DM': - # param_grid = { - # 'nbins': [5,10,15], - # 'classifier__C': np.logspace(-4,4,9), - # 'classifier__class_weight': ['balanced', None] - # } - # quantifier = DistributionMatching(LogisticRegression()) - elif method == 'DM': - method_params = { - 'nbins': [4,8,16,32], - 'val_split': [10, 0.4], - 'divergence': ['HD', 'topsoe', 'l2'] - } - param_grid = {**method_params, **hyper_LR} - quantifier = DistributionMatching(LogisticRegression()) - else: - raise NotImplementedError('unknown method', method) + param_grid, quantifier = new_method(method) # model selection data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True) @@ -151,9 +81,4 @@ if __name__ == '__main__': csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n') csv.flush() - df = pd.read_csv(global_result_path+'.csv', sep='\t') - - pd.set_option('display.max_columns', None) - pd.set_option('display.max_rows', None) - pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) - print(pv) + show_results(global_result_path) diff --git a/quapy/evaluation.py b/quapy/evaluation.py index c198115..869d28f 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -11,7 +11,8 @@ def prediction( model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup: Union[str, bool] = 'auto', - verbose=False): + verbose=False, + verbose_error=None): """ Uses a quantification model to generate predictions for the samples generated via a specific protocol. This function is central to all evaluation processes, and is endowed with an optimization to speed-up the @@ -32,6 +33,7 @@ def prediction( in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is convenient or not. Set to False to deactivate. 
:param verbose: boolean, show or not information in stdout + :param verbose_error: an error function used to display intermediate results when verbose=True (default None) :return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape `(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample """ @@ -61,16 +63,21 @@ def prediction( if apply_optimization: pre_classified = model.classify(protocol.get_labelled_collection().instances) protocol_with_predictions = protocol.on_preclassified_instances(pre_classified) - return __prediction_helper(model.aggregate, protocol_with_predictions, verbose) + return __prediction_helper(model.aggregate, protocol_with_predictions, verbose, verbose_error) else: - return __prediction_helper(model.quantify, protocol, verbose) + return __prediction_helper(model.quantify, protocol, verbose, verbose_error) -def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False): +def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False, verbose_error=None): true_prevs, estim_prevs = [], [] - for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total(), desc='predicting') if verbose else protocol(): + if verbose: + pbar = tqdm(protocol(), total=protocol.total(), desc='predicting') + for sample_instances, sample_prev in pbar if verbose else protocol(): estim_prevs.append(quantification_fn(sample_instances)) true_prevs.append(sample_prev) + if verbose and verbose_error is not None: + err = verbose_error(true_prevs, estim_prevs) + pbar.set_description(f'predicting: ongoing error={err:.5f}') true_prevs = np.asarray(true_prevs) estim_prevs = np.asarray(estim_prevs) diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 20e6180..9901f38 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1091,7 +1091,7 @@ class T50(ThresholdOptimization): Threshold Optimization variant for :class:`ACC` as proposed by `Forman 2006 `_ and `Forman 2008 `_ that looks - for the threshold that makes `tpr` cosest to 0.5. + for the threshold that makes `tpr` closest to 0.5. The goal is to bring improved stability to the denominator of the adjustment. 
:param classifier: a sklearn's Estimator that generates a classifier @@ -1179,7 +1179,7 @@ class MS(ThresholdOptimization): super().__init__(classifier, val_split) def _condition(self, tpr, fpr) -> float: - pass + return True def _optimize_threshold(self, y, probabilities): tprs = [] @@ -1190,9 +1190,26 @@ class MS(ThresholdOptimization): TP, FP, FN, TN = self._compute_table(y, y_) tpr = self._compute_tpr(TP, FP) fpr = self._compute_fpr(FP, TN) - tprs.append(tpr) - fprs.append(fpr) - return np.median(tprs), np.median(fprs) + if self._condition(tpr, fpr): + tprs.append(tpr) + fprs.append(fpr) + return tprs, fprs + + def aggregate(self, classif_predictions): + prevs_estim = self.cc.aggregate(classif_predictions) + + positive_prevs = [] + for tpr, fpr in zip(self.tpr, self.fpr): + if tpr - fpr > 0: + acc = np.clip((prevs_estim[1] - fpr) / (tpr - fpr), 0, 1) + positive_prevs.append(acc) + + if len(positive_prevs) > 0: + adjusted_positive_prev = np.median(positive_prevs) + adjusted_prevs_estim = np.array((1 - adjusted_positive_prev, adjusted_positive_prev)) + return adjusted_prevs_estim + else: + return prevs_estim class MS2(MS): @@ -1215,19 +1232,9 @@ class MS2(MS): def __init__(self, classifier: BaseEstimator, val_split=0.4): super().__init__(classifier, val_split) - def _optimize_threshold(self, y, probabilities): - tprs = [0, 1] - fprs = [0, 1] - candidate_thresholds = np.unique(probabilities[:, 1]) - for candidate_threshold in candidate_thresholds: - y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self._compute_table(y, y_) - tpr = self._compute_tpr(TP, FP) - fpr = self._compute_fpr(FP, TN) - if (tpr - fpr) > 0.25: - tprs.append(tpr) - fprs.append(fpr) - return np.median(tprs), np.median(fprs) + def _condition(self, tpr, fpr) -> float: + return (tpr - fpr) > 0.25 + ClassifyAndCount = CC
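
Note (illustrative, not part of the patch): the kld_monte helper introduced in KDEy._target_divergence estimates KL(p||q) from variates drawn from p, which is why no p(x) factor multiplies the log term. A minimal self-contained sketch of that estimator, using two scipy Gaussians purely as stand-ins for the candidate validation mixture (p, the distribution the variates are drawn from) and the test KDE (q):

import numpy as np
from scipy.stats import norm

# arbitrary stand-in densities: p plays the role of the candidate validation mixture
# (the distribution the Monte Carlo variates are drawn from), q the test-sample KDE
p = norm(loc=0, scale=1)
q = norm(loc=1, scale=1)

x = p.rvs(size=100_000, random_state=0)  # variates drawn from p

# E_{x~p}[log p(x)/q(x)] is approximated by a plain average over the variates;
# no p(x) weight appears in front of the log because the samples already follow p
kld_mc = np.mean(np.log(p.pdf(x) / q.pdf(x)))

# closed form for equal-variance Gaussians: (mu_p - mu_q)**2 / (2 * sigma**2) = 0.5
print(f'Monte Carlo KLD = {kld_mc:.4f} (exact 0.5)')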