integrating more uci-multiclass datasets

This commit is contained in:
Alejandro Moreo Fernandez 2024-04-12 18:08:00 +02:00
parent 3095d7092c
commit 4abec6629b
5 changed files with 167 additions and 18 deletions

View File

@ -29,12 +29,17 @@ def newLR():
def calibratedLR(): def calibratedLR():
return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)) return CalibratedClassifierCV(newLR())
__C_range = np.logspace(-3, 3, 7) __C_range = np.logspace(-3, 3, 7)
lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']} lr_params = {
svmperf_params = {'classifier__C': __C_range} 'classifier__C': __C_range,
'classifier__class_weight': [None, 'balanced']
}
svmperf_params = {
'classifier__C': __C_range
}
def quantification_models(): def quantification_models():

View File

@ -0,0 +1,113 @@
import pickle
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path
# seed shared by the random operations of this script
SEED = 1


def newLR():
    """
    Builds a new, unfitted logistic regression classifier.

    :return: a `LogisticRegression` instance configured with `max_iter=3000`
    """
    classifier = LogisticRegression(max_iter=3000)
    return classifier
# Grid of hyperparameters typically explored for Logistic Regression:
# 7 log-spaced values of C in [1e-3, 1e3], with and without class balancing
logreg_grid = dict(
    C=np.logspace(-3, 3, 7),
    class_weight=['balanced', None],
)
def wrap_hyper(classifier_hyper_grid: dict):
    """
    Prepends the prefix `classifier__` to every key of a hyperparameter grid.

    :param classifier_hyper_grid: dictionary mapping hyperparameter names to candidate values
    :return: a new dictionary with the same values (and the same key order) in which
        every key `k` has been replaced by `'classifier__' + k`
    """
    prefixed_grid = {}
    for name, candidates in classifier_hyper_grid.items():
        prefixed_grid['classifier__' + name] = candidates
    return prefixed_grid
# Each entry is a triple (method name, quantifier instance, hyperparameter grid).
# All methods explore the logistic regression grid; KDEy-ML additionally explores
# 20 evenly spaced bandwidth values in [0.01, 0.2].
_classifier_grid = wrap_hyper(logreg_grid)
_kdey_grid = dict(_classifier_grid, bandwidth=np.linspace(0.01, 0.2, 20))

METHODS = [
    ('PACC', PACC(newLR()), dict(_classifier_grid)),
    ('EMQ', EMQ(newLR()), dict(_classifier_grid)),
    ('KDEy-ML', KDEyML(newLR()), _kdey_grid),
]
def show_results(result_path):
    """
    Prints a summary table of the results stored in a tab-separated file.

    :param result_path: path to the results file without its extension; the file
        `result_path + '.csv'` is read, and is expected to contain the columns
        `Dataset`, `Method`, `MAE`, and `MRAE`
    """
    import pandas as pd

    results = pd.read_csv(result_path+'.csv', sep='\t')

    # lift the display limits so that the table is printed in full
    for option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option, None)

    # one row per dataset, one column per (metric, method) pair, plus the margins
    summary = results.pivot_table(
        index='Dataset',
        columns="Method",
        values=["MAE", "MRAE"],
        margins=True,
    )
    print(summary)
if __name__ == '__main__':
    # Experiment driver: for every method in METHODS and every selected UCI
    # multiclass dataset, runs model selection on a validation split, evaluates the
    # selected model on the test split, stores the per-sample report, and appends
    # the mean errors to a global tab-separated summary file.
    from copy import deepcopy

    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
    result_dir = 'results/ucimulti'

    os.makedirs(result_dir, exist_ok=True)

    global_result_path = f'{result_dir}/allmethods'

    # (re)create the summary file so that it only contains the header row
    with open(global_result_path + '.csv', 'wt') as summary_file:
        summary_file.write('Method\tDataset\tMAE\tMRAE\n')

    for method_name, quantifier, param_grid in METHODS:

        print('Init method', method_name)

        with open(global_result_path + '.csv', 'at') as summary_file:

            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:5]:

                if dataset in ['covertype', 'diabetes']:
                    continue

                print('init', dataset)

                local_result_path = os.path.join(
                    Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe'
                )

                if os.path.exists(local_result_path):
                    # reuse the report of a previous run instead of recomputing it
                    print(f'result file {local_result_path} already exist; skipping')
                    report = qp.util.load_report(local_result_path)

                else:
                    with qp.util.temp_seed(SEED):

                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)

                        # model selection
                        train, test = data.train_test
                        train, val = train.split_stratified(random_state=SEED)

                        protocol = UPP(val, repeats=n_bags_val)

                        # The template in METHODS is never fitted nor rebound: every
                        # dataset works on its own copy, so that nothing selected or
                        # learnt for one dataset can leak into the next one.
                        modsel = GridSearchQ(
                            deepcopy(quantifier), param_grid, protocol,
                            refit=True, n_jobs=-1, verbose=1, error='mae'
                        )

                        try:
                            modsel.fit(train)

                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')

                            model = modsel.best_model()
                        except Exception as error:
                            # Best-effort fallback; `Exception` (rather than a bare
                            # `except`) lets KeyboardInterrupt/SystemExit propagate,
                            # and the cause is reported instead of being hidden.
                            print(f'model selection failed for {method_name} on {dataset}: {error!r}')
                            print('something went wrong... trying to fit the default model')
                            model = deepcopy(quantifier)
                            model.fit(train)

                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(
                            model, protocol, error_metrics=['mae', 'mrae'], verbose=True
                        )
                        report.to_csv(local_result_path)

                # Only the error columns are averaged; the report may also hold
                # array-valued prevalence columns, which have no scalar mean.
                means = report.mean(numeric_only=True)
                summary_file.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
                summary_file.flush()

    show_results(global_result_path)

View File

@ -591,7 +591,7 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals
return data return data
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False, min_ipc=100) -> Dataset: def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, min_class_support=100, verbose=False) -> Dataset:
""" """
Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
@ -614,16 +614,16 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory) ~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param min_class_support: minimum number of instances per class. Classes with fewer instances
are discarded (default is 100)
:param verbose: set to True (default is False) to get information (stats) about the dataset :param verbose: set to True (default is False) to get information (stats) about the dataset
:param min_ipc: minimum number of istances per class. Classes with less instances than min_ipc are discarded
(deafult is 100)
:return: a :class:`quapy.data.base.Dataset` instance :return: a :class:`quapy.data.base.Dataset` instance
""" """
data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose, min_ipc) data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, min_class_support, verbose=verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0)) return Dataset(*data.split_stratified(1 - test_split, random_state=0))
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False, min_ipc=100) -> LabelledCollection: def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, verbose=False) -> LabelledCollection:
""" """
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
@ -646,9 +646,9 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
:param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
~/quay_data/ directory) ~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param min_class_support: minimum number of instances per class. Classes with fewer instances
are discarded (default is 100)
:param verbose: set to True (default is False) to get information (stats) about the dataset :param verbose: set to True (default is False) to get information (stats) about the dataset
:param min_ipc: minimum number of istances per class. Classes with less instances than min_ipc are discarded
(deafult is 100)
:return: a :class:`quapy.data.base.LabelledCollection` instance :return: a :class:`quapy.data.base.LabelledCollection` instance
""" """
assert dataset_name in UCI_MULTICLASS_DATASETS, \ assert dataset_name in UCI_MULTICLASS_DATASETS, \
@ -736,13 +736,20 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
file = join(data_home, 'uci_multiclass', dataset_name+'.pkl') file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
def download(id, name): def download(id, name):
data = fetch_ucirepo(id=id) df = fetch_ucirepo(id=id)
X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
# classes represented as arrays are transformed to tuples to treat them as signle objects df.data.features = pd.get_dummies(df.data.features, drop_first=True)
X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
# classes represented as arrays are transformed to tuples to treat them as single objects
if name == 'support2': if name == 'support2':
y[:, 2] = np.fromiter((str(elm) for elm in y[:, 2]), dtype='object') y[:, 2] = np.fromiter((str(elm) for elm in y[:, 2]), dtype='object')
raise ValueError('this is support 2')
if y.ndim > 1: if y.ndim > 1:
y = np.fromiter((tuple(elm) for elm in y), dtype='object') y = np.fromiter((tuple(elm) for elm in y), dtype='object')
raise ValueError('more than one y')
classes = np.sort(np.unique(y)) classes = np.sort(np.unique(y))
y = np.searchsorted(classes, y) y = np.searchsorted(classes, y)
return LabelledCollection(X, y) return LabelledCollection(X, y)
@ -759,11 +766,11 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
return LabelledCollection(X, y) return LabelledCollection(X, y)
data = pickled_resource(file, download, identifier, dataset_name) data = pickled_resource(file, download, identifier, dataset_name)
data = filter_classes(data, min_ipc) data = filter_classes(data, min_class_support)
if data.n_classes <= 2: if data.n_classes <= 2:
raise ValueError( raise ValueError(
f'Dataset {dataset_name} has too few valid classes to be multiclass with {min_ipc=}. ' f'After filtering out classes with less than {min_class_support=} instances, the dataset {dataset_name} '
'Try a lower value for min_ipc.' f'is no longer multiclass. Try a reducing this value.'
) )
if verbose: if verbose:
@ -848,7 +855,6 @@ def fetch_lequa2022(task, data_home=None):
return train, val_gen, test_gen return train, val_gen, test_gen
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None): def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
""" """
Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more

View File

@ -21,7 +21,7 @@ class QuaNetTrainer(BaseQuantifier):
Example: Example:
>>> import quapy as qp >>> import quapy as qp
>>> from quapy.method.meta import QuaNet >>> from quapy.method.meta import QuaNet
>>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet >>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet
>>> >>>
>>> # use samples of 100 elements >>> # use samples of 100 elements

View File

@ -6,6 +6,9 @@ import pickle
import urllib import urllib
from pathlib import Path from pathlib import Path
from contextlib import ExitStack from contextlib import ExitStack
import pandas as pd
import quapy as qp import quapy as qp
import numpy as np import numpy as np
@ -246,6 +249,28 @@ def _check_sample_size(sample_size):
return sample_size return sample_size
def load_report(path, as_dict=False):
    """
    Loads an evaluation report previously stored as a csv file.

    The columns `true-prev` and `estim-prev` are stored as text (e.g., `[0.2 0.3 0.5]`)
    and are converted back into numpy arrays. The last component of every prevalence
    vector is not taken from the file, but recomputed as 1 minus the sum of the
    remaining components.

    :param path: path to the csv file; its first column is used as the index
    :param as_dict: set to True to obtain a dictionary instead of a dataframe
        (default is False)
    :return: a `pandas.DataFrame` if `as_dict` is False; otherwise a dictionary mapping
        every column name to an array of values, in which the prevalence columns are
        stacked into 2-dimensional arrays with one row per entry of the report
    """
    prevalence_columns = ('true-prev', 'estim-prev')

    def parse_prevalence(text):
        # drop the surrounding brackets and read the whitespace-separated values
        components = list(map(float, text.strip('[]').split()))
        # the last value is derived from the others
        # (presumably to make the vector sum up to 1 despite the printed precision)
        components[-1] = 1. - sum(components[:-1])
        return np.asarray(components)

    report = pd.read_csv(path, index_col=0)
    for column in prevalence_columns:
        report[column] = report[column].apply(parse_prevalence)

    if not as_dict:
        return report

    columns_as_arrays = {}
    for column in report.columns.values:
        column_values = report[column].values
        if column in prevalence_columns:
            column_values = np.vstack(column_values)
        columns_as_arrays[column] = column_values
    return columns_as_arrays
class EarlyStop: class EarlyStop:
""" """
A class implementing the early-stopping condition typically used for training neural networks. A class implementing the early-stopping condition typically used for training neural networks.