From 59500a5a4217d00103f5544aeb7228f5c2d0b5be Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Mon, 11 Dec 2023 16:43:45 +0100
Subject: [PATCH] refactoring

---
 distribution_matching/commons.py              | 47 +++++-----
 .../lequa_nclasses_sensibility.py             | 74 ----------------
 .../lequa_sensibility_analysis.py             |  2 +-
 .../dirichlety.py}                            |  0
 .../{methods_kdey.py => method/kdey.py}       | 31 ++++---
 .../{ => method}/method_kdey.py               |  0
 .../{ => method}/method_kdey_closed.py        |  0
 .../method_kdey_closed_efficient.py           |  0
 .../method_kdey_closed_efficient_correct.py   |  0
 distribution_matching/show_results.py         | 41 ---------
 .../tables/gen_tables_compact.py              | 85 +++++--------------
 .../tables/latex/tables_compact.tex           |  2 +-
 .../tweets_sensibility_analysis.py            | 57 +++++++++++++
 ...experiments.py => ucimulti_experiments.py} |  1 +
 .../ucimulti_sensibility_analysis.py          | 63 ++++++++++++++
 laboratory/main_tweets_auto.py                |  2 +-
 16 files changed, 188 insertions(+), 217 deletions(-)
 delete mode 100644 distribution_matching/lequa_nclasses_sensibility.py
 rename distribution_matching/{method_dirichlety.py => method/dirichlety.py} (100%)
 rename distribution_matching/{methods_kdey.py => method/kdey.py} (90%)
 rename distribution_matching/{ => method}/method_kdey.py (100%)
 rename distribution_matching/{ => method}/method_kdey_closed.py (100%)
 rename distribution_matching/{ => method}/method_kdey_closed_efficient.py (100%)
 rename distribution_matching/{ => method}/method_kdey_closed_efficient_correct.py (100%)
 delete mode 100644 distribution_matching/show_results.py
 create mode 100644 distribution_matching/tweets_sensibility_analysis.py
 rename distribution_matching/{ucimulticlass_experiments.py => ucimulti_experiments.py} (98%)
 create mode 100644 distribution_matching/ucimulti_sensibility_analysis.py

diff --git a/distribution_matching/commons.py b/distribution_matching/commons.py
index 3dd89d9..9fa82af 100644
--- a/distribution_matching/commons.py
+++ b/distribution_matching/commons.py
@@ -1,23 +1,31 @@
 import numpy as np
 import pandas as pd
-from distribution_matching.method_kdey import KDEy
-from distribution_matching.method_kdey_closed import KDEyclosed
-from distribution_matching.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
-from distribution_matching.methods_kdey import KDEyCS, KDEyHD, KDEyML
+from distribution_matching.method.kdex import KDExML
+from distribution_matching.method.method_kdey import KDEy
+from distribution_matching.method.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
+from distribution_matching.method.kdey import KDEyCS, KDEyHD, KDEyML
 from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
-from distribution_matching.method_dirichlety import DIRy
+from distribution_matching.method.dirichlety import DIRy
 from sklearn.linear_model import LogisticRegression
-from distribution_matching.method_kdey_closed_efficient import KDEyclosed_efficient
 
-# the full list of methods tested in the paper (reported in the appendix)
-METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-HD', 'KDEy-HD2', 'DM-CS', 'KDEy-CS','KDEy-CS2', 'DIR', 'EMQ', 'EMQ-BCTS', 'KDEy-ML', 'KDEy-ML2']
+# set to True to get the full list of methods tested in the paper (reported in the appendix)
+# set to False to get the reduced list (shown in the body of the paper)
+FULL_METHOD_LIST = True
 
-# uncomment this other list for the methods shown in the body of the paper (the other methods are not comparable in performance)
-#METHODS = ['PACC', 'DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS', 'EMQ', 'KDEy-ML']
+if FULL_METHOD_LIST:
+    ADJUSTMENT_METHODS = ['ACC', 'PACC']
+    DISTR_MATCH_METHODS = ['HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS']
+    MAX_LIKE_METHODS = ['DIR', 'EMQ', 'EMQ-BCTS', 'KDEy-ML', 'KDEx-ML']
+else:
+    ADJUSTMENT_METHODS = ['PACC']
+    DISTR_MATCH_METHODS = ['DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS']
+    MAX_LIKE_METHODS = ['EMQ', 'KDEy-ML', 'KDEx-ML']
 
+# list of methods to consider
+METHODS = ADJUSTMENT_METHODS + DISTR_MATCH_METHODS + MAX_LIKE_METHODS
 BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
 
-
+# common hyperparameters
 hyper_LR = {
     'classifier__C': np.logspace(-3,3,7),
     'classifier__class_weight': ['balanced', None]
 }
@@ -29,8 +37,9 @@ hyper_kde = {
 
 nbins_range = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]
 
 
-def new_method(method, **lr_kwargs):
+# instantiates a new quantifier based on a string name
+def new_method(method, **lr_kwargs):
     lr = LogisticRegression(**lr_kwargs)
 
     if method == 'CC':
@@ -46,23 +55,19 @@ def new_method(method, **lr_kwargs):
         param_grid = hyper_LR
         quantifier = PACC(lr)
     elif method in ['KDEy-HD']:
-        param_grid = {**hyper_kde, **hyper_LR}
-        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=10000, val_split=10)
-    elif method in ['KDEy-HD2']:
         param_grid = {**hyper_kde, **hyper_LR}
         quantifier = KDEyHD(lr)
     elif method == 'KDEy-CS':
-        param_grid = {**hyper_kde, **hyper_LR}
-        quantifier = KDEyclosed_efficient_corr(lr, val_split=10)
-    elif method == 'KDEy-CS2':
         param_grid = {**hyper_kde, **hyper_LR}
         quantifier = KDEyCS(lr)
     elif method == 'KDEy-ML':
-        param_grid = {**hyper_kde, **hyper_LR}
-        quantifier = KDEy(lr, target='max_likelihood', val_split=10)
-    elif method == 'KDEy-ML2':
         param_grid = {**hyper_kde, **hyper_LR}
         quantifier = KDEyML(lr)
+    elif method == 'KDEx-ML':
+        param_grid = {
+            'bandwidth': np.linspace(0.001, 2, 501)
+        }
+        quantifier = KDExML()
     elif method == 'DIR':
         param_grid = hyper_LR
         quantifier = DIRy(lr)
diff --git a/distribution_matching/lequa_nclasses_sensibility.py b/distribution_matching/lequa_nclasses_sensibility.py
deleted file mode 100644
index d9df8c8..0000000
--- a/distribution_matching/lequa_nclasses_sensibility.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import pickle
-import numpy as np
-import os
-from os.path import join
-import pandas as pd
-from quapy.protocol import UPP
-from quapy.data import LabelledCollection
-from distribution_matching.commons import METHODS, new_method, show_results
-import quapy as qp
-
-
-SEED=1
-
-
-def extract_classes(data:LabelledCollection, classes):
-    X, y = data.Xy
-    counts = data.counts()
-    Xs, ys = [], []
-    for class_i in classes:
-        Xs.append(X[y==class_i])
-        ys.append([class_i]*counts[class_i])
-    Xs = np.concatenate(Xs)
-    ys = np.concatenate(ys)
-    return LabelledCollection(Xs, ys, classes=classes)
-
-
-def task(nclasses):
-    in_classes = np.arange(0, nclasses)
-    train = extract_classes(train_pool, classes=in_classes)
-    test = extract_classes(test_pool, classes=in_classes)
-    with qp.util.temp_seed(SEED):
-        hyper, quantifier = new_method(method)
-        quantifier.set_params(classifier__C=1, classifier__class_weight='balanced')
-        hyper = {h:v for h,v in hyper.items() if not h.startswith('classifier__')}
-        tr, va = train.split_stratified(random_state=SEED)
-        quantifier = qp.model_selection.GridSearchQ(quantifier, hyper, UPP(va), optim).fit(tr)
-        report = qp.evaluation.evaluation_report(quantifier, protocol=UPP(test), error_metrics=['mae', 'mrae', 'kld'], verbose=True)
-        return report
-
-
-# only the quantifier-dependent hyperparameters are explored; the classifier is a LR with default parameters
-if __name__ == '__main__':
-
-    qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
-    qp.environ['N_JOBS'] = -1
-
-
-    for optim in ['mae']: #, 'mrae']:
-
-        result_dir = f'results/lequa/nclasses/{optim}'
-        os.makedirs(result_dir, exist_ok=True)
-
-        for method in ['DM', 'EMQ', 'KDEy-ML']:  # 'KDEy-ML', 'KDEy-DMhd3']:
-
-            result_path = join(result_dir, f'{method}.csv')
-            if os.path.exists(result_path): continue
-
-            train_orig, _, _ = qp.datasets.fetch_lequa2022('T1B')
-
-            train_pool, test_pool = train_orig.split_stratified(0.5, random_state=SEED)
-            arange_classes = np.arange(2, train_orig.n_classes + 1)
-            reports = qp.util.parallel(task, arange_classes, n_jobs=-1)
-            with open(result_path, 'at') as csv:
-                csv.write(f'Method\tDataset\tnClasses\tMAE\tMRAE\tKLD\n')
-                for num_classes, report in zip(arange_classes, reports):
-                    means = report.mean()
-                    report_result_path = join(result_dir, f'{method}_{num_classes}')+'.dataframe'
-                    report.to_csv(report_result_path)
-                    csv.write(f'{method}\tLeQua-T1B\t{num_classes}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
-                    csv.flush()
-
-            means = report.mean()
-            print(means)
-
diff --git a/distribution_matching/lequa_sensibility_analysis.py b/distribution_matching/lequa_sensibility_analysis.py
index e1de526..1213493 100644
--- a/distribution_matching/lequa_sensibility_analysis.py
+++ b/distribution_matching/lequa_sensibility_analysis.py
@@ -3,7 +3,7 @@ from sklearn.linear_model import LogisticRegression
 import os
 import quapy as qp
 from distribution_matching.commons import show_results
-from method_kdey import KDEy
+from distribution_matching.method.method_kdey import KDEy
 from quapy.method.aggregative import DistributionMatching
 
 
diff --git a/distribution_matching/method_dirichlety.py b/distribution_matching/method/dirichlety.py
similarity index 100%
rename from distribution_matching/method_dirichlety.py
rename to distribution_matching/method/dirichlety.py
diff --git a/distribution_matching/methods_kdey.py b/distribution_matching/method/kdey.py
similarity index 90%
rename from distribution_matching/methods_kdey.py
rename to distribution_matching/method/kdey.py
index d5c0df9..c6f9794 100644
--- a/distribution_matching/methods_kdey.py
+++ b/distribution_matching/method/kdey.py
@@ -5,36 +5,35 @@ from sklearn.neighbors import KernelDensity
 
 import quapy as qp
 from quapy.data import LabelledCollection
-from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions
+from quapy.method.aggregative import AggregativeProbabilisticQuantifier, cross_generate_predictions
 import quapy.functional as F
-from scipy.stats import multivariate_normal
-from scipy import optimize
 from sklearn.metrics.pairwise import rbf_kernel
 
 
-class KDEyBase:
+class KDEBase:
 
     BANDWIDTH_METHOD = ['scott', 'silverman']
 
-    def _check_bandwidth(self, bandwidth):
-        assert bandwidth in KDEyBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
-            f'invalid bandwidth, valid ones are {KDEyBase.BANDWIDTH_METHOD} or float values'
+    @classmethod
+    def _check_bandwidth(cls, bandwidth):
+        assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
+            f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
         if isinstance(bandwidth, float):
            assert 0 < bandwidth < 1, "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
 
-    def get_kde_function(self, posteriors, bandwidth):
-        return KernelDensity(bandwidth=bandwidth).fit(posteriors)
+    def get_kde_function(self, X, bandwidth):
+        return KernelDensity(bandwidth=bandwidth).fit(X)
 
-    def pdf(self, kde, posteriors):
-        return np.exp(kde.score_samples(posteriors))
+    def pdf(self, kde, X):
+        return np.exp(kde.score_samples(X))
 
-    def get_mixture_components(self, posteriors, y, n_classes, bandwidth):
-        return [self.get_kde_function(posteriors[y == cat], bandwidth) for cat in range(n_classes)]
+    def get_mixture_components(self, X, y, n_classes, bandwidth):
+        return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
 
 
-class KDEyML(AggregativeProbabilisticQuantifier, KDEyBase):
+class KDEyML(AggregativeProbabilisticQuantifier, KDEBase):
 
     def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
         self._check_bandwidth(bandwidth)
@@ -77,7 +76,7 @@ class KDEyML(AggregativeProbabilisticQuantifier, KDEyBase):
         return F.optim_minimize(neg_loglikelihood, n_classes)
 
 
-class KDEyHD(AggregativeProbabilisticQuantifier, KDEyBase):
+class KDEyHD(AggregativeProbabilisticQuantifier, KDEBase):
 
     def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
                  bandwidth=0.1, n_jobs=None, random_state=0, montecarlo_trials=10000):
@@ -145,7 +144,7 @@ class KDEyHD(AggregativeProbabilisticQuantifier, KDEyBase):
 class KDEyCS(AggregativeProbabilisticQuantifier):
 
     def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
-        self._check_bandwidth(bandwidth)
+        KDEBase._check_bandwidth(bandwidth)
         self.classifier = classifier
         self.val_split = val_split
         self.bandwidth = bandwidth
diff --git a/distribution_matching/method_kdey.py b/distribution_matching/method/method_kdey.py
similarity index 100%
rename from distribution_matching/method_kdey.py
rename to distribution_matching/method/method_kdey.py
diff --git a/distribution_matching/method_kdey_closed.py b/distribution_matching/method/method_kdey_closed.py
similarity index 100%
rename from distribution_matching/method_kdey_closed.py
rename to distribution_matching/method/method_kdey_closed.py
diff --git a/distribution_matching/method_kdey_closed_efficient.py b/distribution_matching/method/method_kdey_closed_efficient.py
similarity index 100%
rename from distribution_matching/method_kdey_closed_efficient.py
rename to distribution_matching/method/method_kdey_closed_efficient.py
diff --git a/distribution_matching/method_kdey_closed_efficient_correct.py b/distribution_matching/method/method_kdey_closed_efficient_correct.py
similarity index 100%
rename from distribution_matching/method_kdey_closed_efficient_correct.py
rename to distribution_matching/method/method_kdey_closed_efficient_correct.py
diff --git a/distribution_matching/show_results.py b/distribution_matching/show_results.py
deleted file mode 100644
index 5f71776..0000000
--- a/distribution_matching/show_results.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import sys
-from pathlib import Path
-import pandas as pd
-
-result_dir = 'results/results_tweet_mae_redohyper'
-#result_dir = 'results_lequa_mrae'
-
-dfs = []
-
-pathlist = Path(result_dir).rglob('*.csv')
-for path in pathlist:
-    path_in_str = str(path)
-
-    try:
-        df = pd.read_csv(path_in_str, sep='\t')
-        df = df[df.iloc[:, 0] != df.columns[0]]
-        if not df.empty:
-            dfs.append(df)
-    except Exception:
-        print('empty')
-
-df = pd.concat(dfs)
-
-for err in ['MAE', 'MRAE', 'KLD']:
-    print('-'*100)
-    print(err)
-    print('-'*100)
-    piv = df.pivot_table(index='Dataset', columns='Method', values=err)
-    piv.loc['mean'] = piv.mean()
-
-    pd.set_option('display.max_columns', None)
-    pd.set_option('display.max_rows', None)
-    pd.set_option('expand_frame_repr', False)
-    print(piv)
-    print()
-
-
-
-
-
-
diff --git a/distribution_matching/tables/gen_tables_compact.py b/distribution_matching/tables/gen_tables_compact.py
index d8a8d9f..5c694e0 100644
--- a/distribution_matching/tables/gen_tables_compact.py
+++ b/distribution_matching/tables/gen_tables_compact.py
@@ -1,4 +1,5 @@
-from distribution_matching.commons import BIN_METHODS, METHODS
+from distribution_matching.commons import (ADJUSTMENT_METHODS, BIN_METHODS, DISTR_MATCH_METHODS, MAX_LIKE_METHODS,
+                                           METHODS, FULL_METHOD_LIST)
 import quapy as qp
 from os import makedirs
 import os
@@ -12,10 +13,9 @@ tables_path = '.'
 MAXTONE = 35  # sets the intensity of the maximum color reached by the worst (red) and best (green) results
 SHOW_STD = False
 
-NUM_ADJUSTMENT_METHODS = 2 if 'ACC' in METHODS else 1
-NUM_MAXIMUM_LIKELIHOOD_METHODS = 4 if 'DIR' in METHODS else 3
-NUM_DISTRIBUTION_MATCHING_PAIRS = 2
-NUM_DISTRIBUTION_MATCHING_METHODS = NUM_DISTRIBUTION_MATCHING_PAIRS*2 + (2 if 'HDy-OvA' in METHODS else 1)
+NUM_ADJUSTMENT_METHODS = len(ADJUSTMENT_METHODS)
+NUM_MAXIMUM_LIKELIHOOD_METHODS = len(MAX_LIKE_METHODS)
+NUM_DISTRIBUTION_MATCHING_METHODS = len(DISTR_MATCH_METHODS)
 
 qp.environ['SAMPLE_SIZE'] = 100
 
@@ -27,21 +27,24 @@ nice_bench = {
     'semeval16': 'SemEval16',
 }
 
-nice_method={
-    'KDEy-MLE': 'KDEy-ML',
-    'KDEy-DMhd4': 'KDEy-HD',
-    'KDEy-closed++': 'KDEy-CS',
-    'EMQ-C': 'EMQ-BCTS'
-}
 
 def save_table(path, table):
     print(f'saving results in {path}')
    with open(path, 'wt') as foo:
         foo.write(table)
 
-def nicerm(key):
-    return '\mathrm{'+nice[key]+'}'
+def new_table(datasets, methods):
+    return Table(
+        benchmarks=datasets,
+        methods=methods,
+        ttest='wilcoxon',
+        prec_mean=5,
+        show_std=SHOW_STD,
+        prec_std=4,
+        clean_zero=(eval=='mae'),
+        average=True,
+        maxtone=MAXTONE
+    )
 
 def make_table(tabs, eval, benchmark_groups, benchmark_names, compact=False):
@@ -54,7 +57,7 @@ def make_table(tabs, eval, benchmark_groups, benchmark_names, compact=False):
 
     # write the latex table
     tabular = """
-    \\begin{tabular}{|c|""" + ('c|' * NUM_ADJUSTMENT_METHODS) + 'c|c' + ('|c|c' * (NUM_DISTRIBUTION_MATCHING_PAIRS)) + ('|c' * NUM_MAXIMUM_LIKELIHOOD_METHODS) + """|} """ + cline + """
+    \\begin{tabular}{|c|""" + ('c|' * NUM_ADJUSTMENT_METHODS) + ('c|' * NUM_DISTRIBUTION_MATCHING_METHODS) + ('c|' * NUM_MAXIMUM_LIKELIHOOD_METHODS) + """} """ + cline + """
     \multicolumn{1}{c}{} &
     \multicolumn{"""+str(NUM_ADJUSTMENT_METHODS)+"""}{|c}{Adjustment} &
     \multicolumn{"""+str(NUM_DISTRIBUTION_MATCHING_METHODS)+"""}{|c|}{Distribution Matching} &
@@ -62,8 +65,7 @@
     \hline
     """
     for i, (tab, group, name) in enumerate(zip(tabs, benchmark_groups, benchmark_names)):
-        tablines = tab.latexTabular(benchmark_replace=nice_bench, method_replace=nice_method, endl='\\\\'+ cline, aslines=True)
-        print(tablines)
+        tablines = tab.latexTabular(benchmark_replace=nice_bench, endl='\\\\'+ cline, aslines=True)
         tablines[0] = tablines[0].replace('\multicolumn{1}{c|}{}', '\\textbf{'+name+'}')
         if not compact:
             tabular += '\n'.join(tablines)
@@ -87,17 +89,7 @@ def gen_tables_uci_multiclass(eval):
 
     datasets = qp.datasets.UCI_MULTICLASS_DATASETS
 
-    tab = Table(
-        benchmarks=datasets,
-        methods=METHODS,
-        ttest='wilcoxon',
-        prec_mean=4,
-        show_std=SHOW_STD,
-        prec_std=4,
-        clean_zero=(eval=='mae'),
-        average=True,
-        maxtone=MAXTONE
-    )
+    tab = new_table(datasets, METHODS)
 
     for dataset in datasets:
         print(f'\t Dataset: {dataset}: ', end='')
@@ -122,17 +114,7 @@ def gen_tables_uci_bin(eval):
 
     exclude = ['acute.a', 'acute.b', 'iris.1', 'balance.2']
     datasets = [x for x in qp.datasets.UCI_DATASETS if x not in exclude]
 
-    tab = Table(
-        benchmarks=datasets,
-        methods=BIN_METHODS,
-        ttest='wilcoxon',
-        prec_mean=4,
-        show_std=SHOW_STD,
-        prec_std=4,
-        clean_zero=(eval=='mae'),
-        average=True,
-        maxtone=MAXTONE
-    )
+    tab = new_table(datasets, BIN_METHODS)
 
     for dataset in datasets:
         print(f'\t Dataset: {dataset}: ', end='')
@@ -156,17 +138,7 @@ def gen_tables_tweet(eval):
 
     datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
 
-    tab = Table(
-        benchmarks=datasets,
-        methods=METHODS,
-        ttest='wilcoxon',
-        prec_mean=4,
-        show_std=SHOW_STD,
-        prec_std=4,
-        clean_zero=(eval=='mae'),
-        average=True,
-        maxtone=MAXTONE
-    )
+    tab = new_table(datasets, METHODS)
 
     for dataset in datasets:
         print(f'\t Dataset: {dataset}: ', end='')
@@ -185,19 +157,8 @@ def gen_tables_lequa(Methods, task, eval):
 
     # generating table for LeQua-T1A or Lequa-T1B; only one table with two rows, one for MAE, another for MRAE
-    dataset_name = 'LeQua-'+task
 
-    tab = Table(
-        benchmarks=[f'Average'],
-        methods=Methods,
-        ttest='wilcoxon',
-        prec_mean=5,
-        show_std=SHOW_STD,
-        prec_std=4,
-        clean_zero=False,
-        average=False,
-        maxtone=MAXTONE
-    )
+    tab = new_table([f'Average'], Methods)
 
     print('Generating table for T1A@Lequa', eval, end='')
     dir_results = f'../results/lequa/{task}/{eval}'
diff --git a/distribution_matching/tables/latex/tables_compact.tex b/distribution_matching/tables/latex/tables_compact.tex
index f15bc23..c6b1d5d 100644
--- a/distribution_matching/tables/latex/tables_compact.tex
+++ b/distribution_matching/tables/latex/tables_compact.tex
@@ -65,7 +65,7 @@
 \centering
 \caption{Multiclass RAE}
 \resizebox{\textwidth}{!}{%
-\input{multiclass_mae}
+\input{multiclass_mrae}
 }%
 \end{table}
 
diff --git a/distribution_matching/tweets_sensibility_analysis.py b/distribution_matching/tweets_sensibility_analysis.py
new file mode 100644
index 0000000..4c795b3
--- /dev/null
+++ b/distribution_matching/tweets_sensibility_analysis.py
@@ -0,0 +1,57 @@
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import os
+
+import quapy as qp
+from distribution_matching.commons import show_results
+from quapy.method.aggregative import DMy
+from distribution_matching.method.method_kdey import KDEy
+from quapy.protocol import UPP
+
+SEED=1
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 100
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    result_dir = f'results/tweet/sensibility'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    for method, param, grid in [
+            ('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
+            ('DM-HD', 'nbins', list(range(2,10)) + list(range(10,34,2)))
+    ]:
+
+        global_result_path = f'{result_dir}/{method}'
+
+        if not os.path.exists(global_result_path+'.csv'):
+            with open(global_result_path+'.csv', 'wt') as csv:
+                csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
+
+        with open(global_result_path+'.csv', 'at') as csv:
+            for val in grid:
+                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
+                    print('init', dataset)
+
+                    local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
+
+                    with qp.util.temp_seed(SEED):
+                        data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
+
+                        if method == 'KDEy-ML':
+                            quantifier = KDEy(LogisticRegression(n_jobs=-1), target='max_likelihood', val_split=10, bandwidth=val)
+                        elif method == 'DM-HD':
+                            quantifier = DMy(LogisticRegression(n_jobs=-1), val_split=10, nbins=val, divergence='HD', n_jobs=-1)
+
+                        quantifier.fit(data.training)
+                        protocol = UPP(data.test, repeats=n_bags_test)
+                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True, n_jobs=-1)
+                        report.to_csv(f'{local_result_path}.dataframe')
+                        means = report.mean()
+                        csv.write(f'{method}\t{data.name}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+                        csv.flush()
+
+    show_results(global_result_path)
diff --git a/distribution_matching/ucimulticlass_experiments.py b/distribution_matching/ucimulti_experiments.py
similarity index 98%
rename from distribution_matching/ucimulticlass_experiments.py
rename to distribution_matching/ucimulti_experiments.py
index 192c25f..b3980bb 100644
--- a/distribution_matching/ucimulticlass_experiments.py
+++ b/distribution_matching/ucimulti_experiments.py
@@ -1,5 +1,6 @@
 import pickle
 import os
+from data.base import LabelledCollection
 from sklearn.linear_model import LogisticRegression
 
 
diff --git a/distribution_matching/ucimulti_sensibility_analysis.py b/distribution_matching/ucimulti_sensibility_analysis.py
new file mode 100644
index 0000000..e70d063
--- /dev/null
+++ b/distribution_matching/ucimulti_sensibility_analysis.py
@@ -0,0 +1,63 @@
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import os
+import quapy as qp
+from distribution_matching.commons import show_results
+from distribution_matching.method.method_kdey import KDEy
+from quapy.method.aggregative import DMy
+from quapy.protocol import UPP
+
+
+SEED=1
+
+def task(val):
+    print('job-init', dataset, val)
+
+    with qp.util.temp_seed(SEED):
+        if method=='KDEy-ML':
+            quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=val)
+        elif method == 'DM-HD':
+            quantifier = DMy(LogisticRegression(), val_split=10, nbins=val, divergence='HD')
+
+        quantifier.fit(data.data)
+        protocol = UPP(data.test, repeats=n_bags_test)
+        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
+                                                 verbose=True, n_jobs=-1)
+        return report
+
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    result_dir = f'results/ucimulti/sensibility'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+
+        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
+
+        for method, param, grid in [
+                ('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
+                ('DM-HD', 'nbins', list(range(2, 10)) + list(range(10, 34, 2)))
+        ]:
+
+            global_result_path = f'{result_dir}/{method}'
+
+            if not os.path.exists(global_result_path+'.csv'):
+                with open(global_result_path+'.csv', 'wt') as csv:
+                    csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
+
+            reports = qp.util.parallel(task, grid, n_jobs=-1)
+            with open(global_result_path + '.csv', 'at') as csv:
+                for val, report in zip(grid, reports):
+                    means = report.mean()
+                    local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
+                    report.to_csv(f'{local_result_path}.dataframe')
+                    csv.write(f'{method}\t{dataset}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+                    csv.flush()
+
+            show_results(global_result_path)
diff --git a/laboratory/main_tweets_auto.py b/laboratory/main_tweets_auto.py
index 72d3a65..98d8229 100644
--- a/laboratory/main_tweets_auto.py
+++ b/laboratory/main_tweets_auto.py
@@ -5,7 +5,7 @@ import pandas as pd
 
 import quapy as qp
 from method.aggregative import DistributionMatching
-from distribution_matching.method_kdey import KDEy
+from distribution_matching.method.method_kdey import KDEy
 from protocol import UPP
 
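
Usage note: the refactored commons.py couples every quantifier with the hyperparameter
grid it should be tuned over, both returned by the new_method factory. The snippet below
is a hypothetical usage sketch, not code from this patch: it mirrors the call pattern of
the experiment scripts (new_method unpacked as grid plus quantifier, model selection run
with quapy's GridSearchQ under the UPP protocol); the dataset name and the 'mae' error
metric are placeholders.

    import quapy as qp
    from quapy.protocol import UPP
    from distribution_matching.commons import new_method

    # the factory returns the hyperparameter grid together with the quantifier
    param_grid, quantifier = new_method('KDEy-ML')

    # placeholder dataset; the tweet experiments fetch data with these same arguments
    data = qp.datasets.fetch_twitter('semeval16', min_df=3, pickle=True)
    train, val = data.training.split_stratified(random_state=1)

    # tune the quantifier on artificial prevalence samples drawn from the validation split
    quantifier = qp.model_selection.GridSearchQ(quantifier, param_grid, UPP(val), 'mae').fit(train)
    estim_prev = quantifier.quantify(data.test.instances)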
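Method note: the KDEyML class consolidated into distribution_matching/method/kdey.py
quantifies by maximum likelihood: it fits one kernel density estimator per class on the
cross-validated posterior probabilities of the training examples (get_mixture_components
above), and then returns the mixture weights that maximize the likelihood of the test
posteriors (a search the repository delegates to F.optim_minimize). What follows is a
minimal self-contained sketch of that idea rather than the repository implementation;
the function names are illustrative and only numpy, scikit-learn and scipy are assumed.

    import numpy as np
    from sklearn.neighbors import KernelDensity
    from scipy.optimize import minimize

    def fit_class_kdes(posteriors, y, n_classes, bandwidth=0.1):
        # one KDE per class, fitted on the posteriors of the examples of that class
        return [KernelDensity(bandwidth=bandwidth).fit(posteriors[y == c])
                for c in range(n_classes)]

    def estimate_prevalence(class_kdes, test_posteriors, eps=1e-10):
        # density of each class-wise KDE at every test posterior; shape (n_classes, n_test)
        densities = np.array([np.exp(kde.score_samples(test_posteriors))
                              for kde in class_kdes])
        n_classes = len(class_kdes)

        def neg_loglikelihood(alpha):
            # negative log-likelihood of the test posteriors under mixture weights alpha
            return -np.sum(np.log(alpha @ densities + eps))

        # search the probability simplex: alpha >= 0 and sum(alpha) == 1
        result = minimize(neg_loglikelihood,
                          x0=np.full(n_classes, 1 / n_classes),
                          bounds=[(0., 1.)] * n_classes,
                          constraints=[{'type': 'eq', 'fun': lambda a: a.sum() - 1}],
                          method='SLSQP')
        return result.x

The same per-class mixture machinery underlies KDEy-HD and KDEy-CS; only the objective
changes (a Monte Carlo approximation of the Hellinger distance, and a closed-form
Cauchy-Schwarz divergence, respectively, instead of the negative log-likelihood).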