diff --git a/examples/uci_experiments.py b/examples/uci_experiments.py
index 07db7cd..b452feb 100644
--- a/examples/uci_experiments.py
+++ b/examples/uci_experiments.py
@@ -29,12 +29,17 @@ def newLR():


 def calibratedLR():
-    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))
+    return CalibratedClassifierCV(newLR())


 __C_range = np.logspace(-3, 3, 7)
-lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
-svmperf_params = {'classifier__C': __C_range}
+lr_params = {
+    'classifier__C': __C_range,
+    'classifier__class_weight': [None, 'balanced']
+}
+svmperf_params = {
+    'classifier__C': __C_range
+}


 def quantification_models():
diff --git a/examples/ucimulti_experiments.py b/examples/ucimulti_experiments.py
new file mode 100644
index 0000000..1b48834
--- /dev/null
+++ b/examples/ucimulti_experiments.py
@@ -0,0 +1,113 @@
+import pickle
+import os
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import UPP
+from pathlib import Path
+
+
+SEED = 1
+
+
+def newLR():
+    return LogisticRegression(max_iter=3000)
+
+
+# typical hyperparameters explored for Logistic Regression
+logreg_grid = {
+    'C': np.logspace(-3, 3, 7),
+    'class_weight': ['balanced', None]
+}
+
+
+def wrap_hyper(classifier_hyper_grid: dict):
+    return {'classifier__' + k: v for k, v in classifier_hyper_grid.items()}
+
+
+METHODS = [
+    ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
+    ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
+    ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}),
+]
+
+
+def show_results(result_path):
+    import pandas as pd
+    df = pd.read_csv(result_path + '.csv', sep='\t')
+    pd.set_option('display.max_columns', None)
+    pd.set_option('display.max_rows', None)
+    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"], margins=True)
+    print(pv)
+
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    result_dir = f'results/ucimulti'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    global_result_path = f'{result_dir}/allmethods'
+    with open(global_result_path + '.csv', 'wt') as csv:
+        csv.write(f'Method\tDataset\tMAE\tMRAE\n')
+
+    for method_name, quantifier, param_grid in METHODS:
+
+        print('Init method', method_name)
+
+        with open(global_result_path + '.csv', 'at') as csv:
+
+            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:5]:
+
+                if dataset in ['covertype', 'diabetes']:
+                    continue
+
+                print('init', dataset)
+
+                local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe')
+
+                if os.path.exists(local_result_path):
+                    print(f'result file {local_result_path} already exists; skipping')
+                    report = qp.util.load_report(local_result_path)
+
+                else:
+                    with qp.util.temp_seed(SEED):
+
+                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
+
+                        # model selection
+                        train, test = data.train_test
+                        train, val = train.split_stratified(random_state=SEED)
+
+                        protocol = UPP(val, repeats=n_bags_val)
+                        modsel = GridSearchQ(
+                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
+                        )
+
+                        try:
+                            modsel.fit(train)
+
+                            print(f'best params {modsel.best_params_}')
+                            print(f'best score {modsel.best_score_}')
+
+                            quantifier = modsel.best_model()
+                        except Exception:
+                            print('something went wrong... trying to fit the default model')
+                            quantifier.fit(train)
+
+                    protocol = UPP(test, repeats=n_bags_test)
+                    report = qp.evaluation.evaluation_report(
+                        quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True
+                    )
+                    report.to_csv(local_result_path)
+
+                means = report.mean()
+                csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
+                csv.flush()
+
+    show_results(global_result_path)
\ No newline at end of file
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index a5a5677..1e0750e 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -591,7 +591,7 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals
     return data


-def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False, min_ipc=100) -> Dataset:
+def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, min_class_support=100, verbose=False) -> Dataset:
     """
     Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.

@@ -614,16 +614,16 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
     :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
         ~/quay_data/ directory)
     :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param min_class_support: minimum number of instances per class. Classes with fewer instances
+        are discarded (default is 100)
     :param verbose: set to True (default is False) to get information (stats) about the dataset
-    :param min_ipc: minimum number of istances per class. Classes with less instances than min_ipc are discarded
-        (deafult is 100)
     :return: a :class:`quapy.data.base.Dataset` instance
     """
-    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose, min_ipc)
+    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, min_class_support, verbose=verbose)
     return Dataset(*data.split_stratified(1 - test_split, random_state=0))


-def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False, min_ipc=100) -> LabelledCollection:
+def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, verbose=False) -> LabelledCollection:
     """
     Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.

@@ -646,9 +646,9 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
     :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
         ~/quay_data/ directory)
     :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param min_class_support: minimum number of instances per class. Classes with fewer instances
+        are discarded (default is 100)
     :param verbose: set to True (default is False) to get information (stats) about the dataset
-    :param min_ipc: minimum number of istances per class. Classes with less instances than min_ipc are discarded
-        (deafult is 100)
     :return: a :class:`quapy.data.base.LabelledCollection` instance
     """
     assert dataset_name in UCI_MULTICLASS_DATASETS, \
@@ -736,13 +736,17 @@
     file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')

     def download(id, name):
-        data = fetch_ucirepo(id=id)
-        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
-        # classes represented as arrays are transformed to tuples to treat them as signle objects
+        df = fetch_ucirepo(id=id)
+
+        # one-hot encode categorical features before converting to numpy
+        df.data.features = pd.get_dummies(df.data.features, drop_first=True)
+
+        X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
+        # classes represented as arrays are transformed to tuples to treat them as single objects
         if name == 'support2':
             y[:, 2] = np.fromiter((str(elm) for elm in y[:, 2]), dtype='object')
         if y.ndim > 1:
             y = np.fromiter((tuple(elm) for elm in y), dtype='object')
         classes = np.sort(np.unique(y))
         y = np.searchsorted(classes, y)
         return LabelledCollection(X, y)
@@ -759,11 +763,11 @@
         return LabelledCollection(X, y)

     data = pickled_resource(file, download, identifier, dataset_name)
-    data = filter_classes(data, min_ipc)
+    data = filter_classes(data, min_class_support)
     if data.n_classes <= 2:
         raise ValueError(
-            f'Dataset {dataset_name} has too few valid classes to be multiclass with {min_ipc=}. '
-            'Try a lower value for min_ipc.'
+            f'After filtering out classes with fewer than {min_class_support} instances, the dataset {dataset_name} '
+            'is no longer multiclass. Try reducing this value.'
         )

     if verbose:
@@ -848,7 +852,6 @@
     return train, val_gen, test_gen


-
 def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
     """
     Loads the IFCB dataset for quantification from `Zenodo `_ (for more
diff --git a/quapy/util.py b/quapy/util.py
index 7f0abc4..9165499 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -6,6 +6,9 @@ import pickle
 import urllib
 from pathlib import Path
 from contextlib import ExitStack
+
+import pandas as pd
+
 import quapy as qp
 import numpy as np

@@ -246,6 +249,28 @@ def _check_sample_size(sample_size):
     return sample_size


+def load_report(path, as_dict=False):
+    def str2prev_arr(strprev):
+        within = strprev.strip('[]').split()
+        float_list = [float(p) for p in within]
+        float_list[-1] = 1. - sum(float_list[:-1])
+        return np.asarray(float_list)
+
+    df = pd.read_csv(path, index_col=0)
+    df['true-prev'] = df['true-prev'].apply(str2prev_arr)
+    df['estim-prev'] = df['estim-prev'].apply(str2prev_arr)
+    if as_dict:
+        d = {}
+        for col in df.columns.values:
+            vals = df[col].values
+            if col in ['true-prev', 'estim-prev']:
+                vals = np.vstack(vals)
+            d[col] = vals
+        return d
+    else:
+        return df
+
+
 class EarlyStop:
     """
     A class implementing the early-stopping condition typically used for training neural networks.
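
A minimal usage sketch of the renamed min_class_support argument (formerly min_ipc). This and the snippets below are illustrations of the changes above, not part of the patch; the dataset name 'letter' and the threshold 200 are assumptions:

    import quapy as qp

    # classes with fewer than 200 instances are discarded up front;
    # a ValueError is raised if fewer than 3 classes survive the filter
    data = qp.datasets.fetch_UCIMulticlassDataset('letter', min_class_support=200, verbose=True)
    train, test = data.train_test
    print(train.n_classes, train.counts())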
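
The new pd.get_dummies(..., drop_first=True) step one-hot encodes any categorical feature columns returned by fetch_ucirepo while leaving numeric columns untouched. A toy illustration with made-up data (with recent pandas the indicator columns are boolean):

    import pandas as pd

    df = pd.DataFrame({'color': ['red', 'green', 'red'], 'size': [1, 2, 3]})
    # 'color' has levels {green, red}; drop_first=True keeps a single indicator
    print(pd.get_dummies(df, drop_first=True))
    #    size  color_red
    # 0     1       True
    # 1     2      False
    # 2     3       True

Dropping the first level avoids perfectly collinear indicator columns.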
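
qp.util.load_report reverses report.to_csv for the evaluation reports produced above: the 'true-prev' and 'estim-prev' columns, serialized as strings, are parsed back into numpy arrays. A round-trip sketch, assuming an already-fit quantifier q, a test LabelledCollection named test, and the illustrative path 'report.dataframe':

    import quapy as qp
    from quapy.protocol import UPP

    qp.environ['SAMPLE_SIZE'] = 500
    report = qp.evaluation.evaluation_report(q, UPP(test, repeats=100), error_metrics=['mae', 'mrae'])
    report.to_csv('report.dataframe')

    df = qp.util.load_report('report.dataframe')               # prevalence columns are np.ndarray again
    d = qp.util.load_report('report.dataframe', as_dict=True)  # prevalence columns stacked into 2D arrays
    print(df['mae'].mean(), d['true-prev'].shape)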
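
The str2prev_arr helper recomputes the last component of each parsed prevalence vector because serialization rounds the values, so the parsed entries need not sum to 1; the last entry is rebuilt as one minus the sum of the rest. The same arithmetic in isolation:

    import numpy as np

    strprev = '[0.3333 0.3333 0.3333]'                 # rounding lost 0.0001 of probability mass
    floats = [float(p) for p in strprev.strip('[]').split()]
    floats[-1] = 1. - sum(floats[:-1])                 # last entry becomes 0.3334
    print(np.asarray(floats), sum(floats))             # [0.3333 0.3333 0.3334] 1.0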