diff --git a/MultiLabel/main.py b/MultiLabel/main.py
index 9e9d85e..671f759 100644
--- a/MultiLabel/main.py
+++ b/MultiLabel/main.py
@@ -1,8 +1,11 @@
+import argparse
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
+import itertools
 from sklearn.multioutput import ClassifierChain
 from tqdm import tqdm
-
+from skmultilearn.dataset import load_dataset, available_data_sets
+from scipy.sparse import csr_matrix
 import quapy as qp
 from MultiLabel.mlclassification import MultilabelStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
@@ -12,7 +15,10 @@ from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC,
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
 import numpy as np
 from data.dataset import Dataset
-from mlevaluation import ml_natural_prevalence_evaluation, ml_artificial_prevalence_evaluation
+from mlevaluation import ml_natural_prevalence_prediction, ml_artificial_prevalence_prediction
+import sys
+import os
+import pickle
 
 
 def cls():
@@ -26,7 +32,7 @@ def calibratedCls():
 
 # DEBUG=True
 # if DEBUG:
-sample_size = 250
+sample_size = 100
 n_samples = 5000
 
 
@@ -35,28 +41,29 @@ def models():
     yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
     yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
     yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
+    yield 'HDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
     # yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls()))
-    yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
-    yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
-    yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
-    yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
+    # yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
+    # yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
+    # yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
+    # yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
     # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
     common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
-    yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
-    yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
-    yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
-    yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
-    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
+    # yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
+    # yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
+    # yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
+    # yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
+    # yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
+    # yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
+    # yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
+    # yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
+    # yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
+    # yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
+    # yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
+    # yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
     # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
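With this hunk, `models()` yields only the naive per-class aggregative quantifiers plus the newly enabled `HDy`; the stacked, chain, and regression-based variants remain available as comments. A minimal sketch of how the generator's (name, model) pairs are consumed (purely illustrative; the actual driver appears in `__main__` below):

```python
# each item produced by models() is a (name, quantifier) pair
for model_name, model in models():
    print(model_name)  # 'NaivePCC', 'NaiveACC', 'NaivePACC', 'HDy', ...
```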
@@ -64,43 +71,98 @@ def models():
 # dataset = 'reuters21578'
-# dataset = 'ohsumed'
-dataset = 'jrcall'
 # picklepath = '/home/moreo/word-class-embeddings/pickles'
-picklepath = './pickles'
-data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
-
-Xtr, Xte = data.vectorize()
-ytr = data.devel_labelmatrix.todense().getA()
-yte = data.test_labelmatrix.todense().getA()
+# data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
+# Xtr, Xte = data.vectorize()
+# ytr = data.devel_labelmatrix.todense().getA()
+# yte = data.test_labelmatrix.todense().getA()
 
 # remove categories with < 10 training documents
 # to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
-to_keep = np.argsort(ytr.sum(axis=0))[-10:]
-ytr = ytr[:, to_keep]
-yte = yte[:, to_keep]
-print(f'num categories = {ytr.shape[1]}')
+# ytr = ytr[:, to_keep]
+# yte = yte[:, to_keep]
+# print(f'num categories = {ytr.shape[1]}')
 
-train = MultilabelledCollection(Xtr, ytr)
-test = MultilabelledCollection(Xte, yte)
-# print(f'Train-prev: {train.prevalence()[:,1]}')
-print(f'Train-counts: {train.counts()}')
-# print(f'Test-prev: {test.prevalence()[:,1]}')
-print(f'Test-counts: {test.counts()}')
-print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}')
+def datasets():
+    dataset_list = sorted(set([x[0] for x in available_data_sets().keys()]))
+    for dataset_name in dataset_list:
+        yield dataset_name
 
-fit_models = {model_name:model.fit(train) for model_name,model in tqdm(models(), 'fitting', total=6)}
-print('NPP:')
-for model_name, model in fit_models.items():
-    err = ml_natural_prevalence_evaluation(model, test, sample_size, repeats=100)
-    print(f'{model_name:10s}\tmae={err:.5f}')
+def get_dataset(dataset_name):
+    Xtr, ytr, feature_names, label_names = load_dataset(dataset_name, 'train')
+    Xte, yte, _, _ = load_dataset(dataset_name, 'test')
+    print(f'n-labels = {len(label_names)}')
 
-print('APP:')
-for model_name, model in fit_models.items():
-    err = ml_artificial_prevalence_evaluation(model, test, sample_size, n_prevalences=21, repeats=10)
-    print(f'{model_name:10s}\tmae={err:.5f}')
+    Xtr = csr_matrix(Xtr)
+    Xte = csr_matrix(Xte)
+
+    ytr = ytr.todense().getA()
+    yte = yte.todense().getA()
+
+    # remove categories without positives in the training or test splits
+    valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
+    ytr = ytr[:, valid_categories]
+    yte = yte[:, valid_categories]
+
+    train = MultilabelledCollection(Xtr, ytr)
+    test = MultilabelledCollection(Xte, yte)
+
+    return train, test
+
+
+def already_run(result_path):
+    if os.path.exists(result_path):
+        print(f'{result_path} already computed. Skipping')
+        return True
+    return False
+
+
+def print_info(train, test):
+    # print((np.abs(np.corrcoef(ytr, rowvar=False))>0.1).sum())
+    # sys.exit(0)
+
+    print(f'Tr documents {len(train)}')
+    print(f'Te documents {len(test)}')
+    print(f'#features {train.instances.shape[1]}')
+    print(f'#classes {train.labels.shape[1]}')
+
+    # print(f'Train-prev: {train.prevalence()[:,1]}')
+    print(f'Train-counts: {train.counts()}')
+    # print(f'Test-prev: {test.prevalence()[:,1]}')
+    print(f'Test-counts: {test.counts()}')
+    print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}')
+
+
+def run_experiment(dataset_name, model_name, model):
+    result_path = f'{opt.results}/{dataset_name}_{model_name}.pkl'
+    if already_run(result_path):
+        return
+
+    print(f'running experiment {dataset_name} x {model_name}')
+    train, test = get_dataset(dataset_name)
+
+    print_info(train, test)
+
+    model.fit(train)
+
+    results = dict()
+    results['npp'] = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
+    results['app'] = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=21, repeats=10)
+    pickle.dump(results, open(result_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Experiments for multi-label quantification')
+    parser.add_argument('--results', type=str, default='./results', metavar='str',
+                        help=f'path where to store the results')
+    opt = parser.parse_args()
+
+    os.makedirs(opt.results, exist_ok=True)
+
+    for datasetname, (modelname,model) in itertools.product(datasets(), models()):
+        run_experiment(datasetname, modelname, model)
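The rewritten driver sweeps every skmultilearn dataset against every model and pickles the raw `(true_prevs, estim_prevs)` pairs instead of printing a single error figure. A minimal sketch of reading one result file back, assuming the `{dataset}_{model}.pkl` naming used by `run_experiment` (`'emotions'` is just one example of an skmultilearn dataset name):

```python
import pickle

# assumes main.py has already produced this file under --results
with open('./results/emotions_NaivePCC.pkl', 'rb') as f:
    results = pickle.load(f)

true_prevs, estim_prevs = results['npp']          # natural-prevalence protocol
true_prevs_app, estim_prevs_app = results['app']  # artificial-prevalence protocol
```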
diff --git a/MultiLabel/mldata.py b/MultiLabel/mldata.py
index d211c33..a87dfcd 100644
--- a/MultiLabel/mldata.py
+++ b/MultiLabel/mldata.py
@@ -9,7 +9,7 @@ from quapy.functional import artificial_prevalence_sampling
 
 class MultilabelledCollection:
     def __init__(self, instances, labels):
-        assert labels.ndim==2, 'data does not seem to be multilabel'
+        assert labels.ndim==2, f'data does not seem to be multilabel {labels}'
         self.instances = instances
         self.labels = labels
         self.classes_ = np.arange(labels.shape[1])
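The reworded assert message above documents the contract of `MultilabelledCollection`: `labels` must be a 2-D indicator matrix with one column per class. A toy illustration (shapes chosen arbitrarily):

```python
import numpy as np
from MultiLabel.mldata import MultilabelledCollection

X = np.random.rand(4, 10)              # 4 documents, 10 features
y = np.array([[1, 0, 1],
              [0, 1, 0],
              [1, 1, 0],
              [0, 0, 1]])              # shape (4, 3): one column per label
data = MultilabelledCollection(X, y)   # labels.ndim == 2, so the assert passes
print(data.classes_)                   # [0 1 2]
```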
diff --git a/MultiLabel/mlevaluation.py b/MultiLabel/mlevaluation.py
index 1863063..e5b7cf1 100644
--- a/MultiLabel/mlevaluation.py
+++ b/MultiLabel/mlevaluation.py
@@ -4,8 +4,42 @@ import numpy as np
 import quapy as qp
 from MultiLabel.mlquantification import MLAggregativeQuantifier
 from mldata import MultilabelledCollection
+import itertools
 
 
+def __check_error(error_metric):
+    if isinstance(error_metric, str):
+        error_metric = qp.error.from_name(error_metric)
+
+    assert hasattr(error_metric, '__call__'), 'invalid error function'
+    return error_metric
+
+
+def _ml_prevalence_predictions(model,
+                               test: MultilabelledCollection,
+                               test_indexes):
+
+    predict_batch_fn = _predict_quantification_batch
+    if isinstance(model, MLAggregativeQuantifier):
+        test = MultilabelledCollection(model.preclassify(test.instances), test.labels)
+        predict_batch_fn = _predict_aggregative_batch
+
+    args = tuple([model, test, test_indexes])
+    true_prevs, estim_prevs = predict_batch_fn(args)
+    return true_prevs, estim_prevs
+
+
+def ml_natural_prevalence_prediction(model,
+                                     test:MultilabelledCollection,
+                                     sample_size,
+                                     repeats=100,
+                                     random_seed=42):
+
+    with qp.util.temp_seed(random_seed):
+        test_indexes = list(test.natural_sampling_index_generator(sample_size=sample_size, repeats=repeats))
+
+    return _ml_prevalence_predictions(model, test, test_indexes)
+
 def ml_natural_prevalence_evaluation(model,
                                      test:MultilabelledCollection,
@@ -14,23 +48,32 @@ def ml_natural_prevalence_evaluation(model,
                                      error_metric:Union[str,Callable]='mae',
                                      random_seed=42):
 
-    if isinstance(error_metric, str):
-        error_metric = qp.error.from_name(error_metric)
+    error_metric = __check_error(error_metric)
 
-    assert hasattr(error_metric, '__call__'), 'invalid error function'
+    true_prevs, estim_prevs = ml_natural_prevalence_prediction(model, test, sample_size, repeats, random_seed)
 
-    test_batch_fn = _test_quantification_batch
-    if isinstance(model, MLAggregativeQuantifier):
-        test = MultilabelledCollection(model.preclassify(test.instances), test.labels)
-        test_batch_fn = _test_aggregation_batch
-
-    with qp.util.temp_seed(random_seed):
-        test_indexes = list(test.natural_sampling_index_generator(sample_size=sample_size, repeats=repeats))
-
-    errs = test_batch_fn(tuple([model, test, test_indexes, error_metric]))
+    errs = [error_metric(true_prev_i, estim_prev_i) for true_prev_i, estim_prev_i in zip(true_prevs, estim_prevs)]
     return np.mean(errs)
 
 
+def ml_artificial_prevalence_prediction(model,
+                                        test:MultilabelledCollection,
+                                        sample_size,
+                                        n_prevalences=21,
+                                        repeats=10,
+                                        random_seed=42):
+
+    test_indexes = []
+    with qp.util.temp_seed(random_seed):
+        for cat in test.classes_:
+            test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size,
+                                                                              category=cat,
+                                                                              n_prevalences=n_prevalences,
+                                                                              repeats=repeats)))
+    test_indexes = list(itertools.chain.from_iterable(test_indexes))
+    return _ml_prevalence_predictions(model, test, test_indexes)
+
+
 def ml_artificial_prevalence_evaluation(model,
                                         test:MultilabelledCollection,
                                         sample_size,
@@ -39,47 +82,30 @@ def ml_artificial_prevalence_evaluation(model,
                                         error_metric:Union[str,Callable]='mae',
                                         random_seed=42):
 
-    if isinstance(error_metric, str):
-        error_metric = qp.error.from_name(error_metric)
+    error_metric = __check_error(error_metric)
 
-    assert hasattr(error_metric, '__call__'), 'invalid error function'
+    true_prevs, estim_prevs = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences, repeats, random_seed)
 
-    test_batch_fn = _test_quantification_batch
-    if isinstance(model, MLAggregativeQuantifier):
-        test = MultilabelledCollection(model.preclassify(test.instances), test.labels)
-        test_batch_fn = _test_aggregation_batch
-
-    test_indexes = []
-    with qp.util.temp_seed(random_seed):
-        for cat in test.classes_:
-            test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size,
-                                                                              category=cat,
-                                                                              n_prevalences=n_prevalences,
-                                                                              repeats=repeats)))
-
-    args = [(model, test, indexes, error_metric) for indexes in test_indexes]
-    macro_errs = qp.util.parallel(test_batch_fn, args, n_jobs=-1)
-
-    return np.mean(macro_errs)
+    errs = [error_metric(true_prev_i, estim_prev_i) for true_prev_i, estim_prev_i in zip(true_prevs, estim_prevs)]
+    return np.mean(errs)
 
 
-def _test_quantification_batch(args):
-    model, test, indexes, error_metric = args
-    errs = []
+def _predict_quantification_batch(args):
+    model, test, indexes = args
+    return __predict_batch_fn(args, model.quantify)
+
+
+def _predict_aggregative_batch(args):
+    model, test, indexes = args
+    return __predict_batch_fn(args, model.aggregate)
+
+
+def __predict_batch_fn(args, quant_fn):
+    model, test, indexes = args
+    trues, estims = [], []
     for index in indexes:
         sample = test.sampling_from_index(index)
-        estim_prevs = model.quantify(sample.instances)
-        true_prevs = sample.prevalence()
-        errs.append(error_metric(true_prevs, estim_prevs))
-    return errs
+        estims.append(quant_fn(sample.instances))
+        trues.append(sample.prevalence())
+    return trues, estims
-
-
-def _test_aggregation_batch(args):
-    model, preclassified_test, indexes, error_metric = args
-    errs = []
-    for index in indexes:
-        sample = preclassified_test.sampling_from_index(index)
-        estim_prevs = model.aggregate(sample.instances)
-        true_prevs = sample.prevalence()
-        errs.append(error_metric(true_prevs, estim_prevs))
-    return errs
\ No newline at end of file
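After this refactoring, each `*_evaluation` function is a thin wrapper that obtains `(true_prevs, estim_prevs)` from the corresponding `*_prediction` function and averages an error metric over the pairs, so predictions can also be stored and re-scored later (as `main.py` now does). A minimal sketch, assuming `model` is an already fitted multi-label quantifier and `test` a `MultilabelledCollection`:

```python
import numpy as np
import quapy as qp
from mlevaluation import ml_natural_prevalence_prediction

true_prevs, estim_prevs = ml_natural_prevalence_prediction(model, test, sample_size=100, repeats=100)

# equivalent to ml_natural_prevalence_evaluation(model, test, 100, error_metric='mae')
mae = np.mean([qp.error.mae(t, e) for t, e in zip(true_prevs, estim_prevs)])
```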
diff --git a/MultiLabel/mlquantification.py b/MultiLabel/mlquantification.py
index 775bd94..13bc719 100644
--- a/MultiLabel/mlquantification.py
+++ b/MultiLabel/mlquantification.py
@@ -186,6 +186,7 @@ class MLRegressionQuantification:
         # self.norm = StandardScaler()
         self.means = means
         self.stds = stds
+        # self.covs = covs
 
     def _prepare_arrays(self, Xs, ys, samples_mean, samples_std):
         Xs = np.asarray(Xs)
@@ -196,6 +197,8 @@ class MLRegressionQuantification:
         if self.stds:
             samples_std = np.asarray(samples_std)
             Xs = np.hstack([Xs, samples_std])
+        # if self.covs:
+
         return Xs, ys
 
     def generate_samples_npp(self, val):
@@ -257,3 +260,6 @@ class MLRegressionQuantification:
         adjusted = adjusted.flatten()
         neg_prevs = 1-adjusted
         return np.asarray([neg_prevs, adjusted]).T
+
+
+# class
\ No newline at end of file
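The last hunk's return statement packs the per-class positive prevalences into the `(n_classes, 2)` layout used elsewhere in the module, with column 0 holding the complementary negative prevalence. A worked toy example of that transformation:

```python
import numpy as np

adjusted = np.asarray([0.2, 0.7, 0.5])          # positive prevalence per class
prevs = np.asarray([1 - adjusted, adjusted]).T  # shape (3, 2)
# prevs == [[0.8, 0.2],
#           [0.3, 0.7],
#           [0.5, 0.5]]
```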