From db1dbe2534969cec7b1a8927501979e0e90fe45b Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Fri, 27 Aug 2021 12:21:53 +0200
Subject: [PATCH] parallelizing stuff

---
 MultiLabel/main.py         | 49 +++++++++++++++++++++++++++++++-------
 MultiLabel/mlevaluation.py | 14 +++++++----
 2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/MultiLabel/main.py b/MultiLabel/main.py
index 671f759..583b5a7 100644
--- a/MultiLabel/main.py
+++ b/MultiLabel/main.py
@@ -23,7 +23,7 @@ import pickle
 
 def cls():
     # return LinearSVC()
-    return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
+    return LogisticRegression(max_iter=1000, solver='lbfgs')
 
 
 def calibratedCls():
@@ -38,10 +38,10 @@ n_samples = 5000
 
 def models():
     yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls()))
-    yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
-    yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
-    yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
-    yield 'HDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
+    # yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
+    # yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
+    # yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
+    # yield 'HDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
     # yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls()))
     # yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
     # yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
@@ -135,6 +135,36 @@ def print_info(train, test):
     print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}')
 
 
+def save_results(npp_results, app_results, result_path):
+    # results are lists of tuples of (true_prevs, estim_prevs)
+    # each true_prevs is an ndarray of ndim=2, but the second dimension is constrained
+    def _prepare_result_lot(lot_results):
+        true_prevs, estim_prevs = lot_results
+        return {
+            'true_prevs': [true_i[:,0].flatten() for true_i in true_prevs],  # removes the constrained prevalence
+            'estim_prevs': [estim_i[:,0].flatten() for estim_i in estim_prevs]  # removes the constrained prevalence
+        }
+    results = {
+        'npp': _prepare_result_lot(npp_results),
+        'app': _prepare_result_lot(app_results),
+    }
+    pickle.dump(results, open(result_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+
+
+def load_results(result_path):
+    def _unpack_result_lot(lot_result):
+        true_prevs = lot_result['true_prevs']
+        true_prevs = [np.vstack([true_i, 1 - true_i]).T for true_i in true_prevs]  # add the constrained prevalence
+        estim_prevs = lot_result['estim_prevs']
+        estim_prevs = [np.vstack([estim_i, 1 - estim_i]).T for estim_i in estim_prevs]  # add the constrained prevalence
+        return true_prevs, estim_prevs
+    results = pickle.load(open(result_path, 'rb'))
+    results_npp = _unpack_result_lot(results['npp'])
+    results_app = _unpack_result_lot(results['app'])
+    return results_npp, results_app
+
+
+
 def run_experiment(dataset_name, model_name, model):
     result_path = f'{opt.results}/{dataset_name}_{model_name}.pkl'
     if already_run(result_path):
@@ -147,10 +177,11 @@ def run_experiment(dataset_name, model_name, model):
 
     model.fit(train)
 
-    results = dict()
-    results['npp'] = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
-    results['app'] = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=21, repeats=10)
-    pickle.dump(results, open(result_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+    results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
+    results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5)
+    save_results(results_npp, results_app, result_path)
+    results_npp2, results_app2 = load_results(result_path)
+    print('pass')
 
 
 if __name__ == '__main__':
diff --git a/MultiLabel/mlevaluation.py b/MultiLabel/mlevaluation.py
index e5b7cf1..e03b11e 100644
--- a/MultiLabel/mlevaluation.py
+++ b/MultiLabel/mlevaluation.py
@@ -5,6 +5,7 @@ import quapy as qp
 from MultiLabel.mlquantification import MLAggregativeQuantifier
 from mldata import MultilabelledCollection
 import itertools
+from tqdm import tqdm
 
 
 def __check_error(error_metric):
@@ -63,15 +64,20 @@ def ml_artificial_prevalence_prediction(model,
                                        repeats=10,
                                        random_seed=42):
 
-    test_indexes = []
+    nested_test_indexes = []
    with qp.util.temp_seed(random_seed):
        for cat in test.classes_:
-            test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size,
+            nested_test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size,
                                                                              category=cat,
                                                                              n_prevalences=n_prevalences,
                                                                              repeats=repeats)))
-    test_indexes = list(itertools.chain.from_iterable(test_indexes))
-    return _ml_prevalence_predictions(model, test, test_indexes)
+    def _predict_batch(test_indexes):
+        return _ml_prevalence_predictions(model, test, test_indexes)
+
+    predictions = qp.util.parallel(_predict_batch, nested_test_indexes, n_jobs=-1)
+    true_prevs = list(itertools.chain.from_iterable(trues for trues, estims in predictions))
+    estim_prevs = list(itertools.chain.from_iterable(estims for trues, estims in predictions))
+    return true_prevs, estim_prevs
 
 
 def ml_artificial_prevalence_evaluation(model,
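
The save_results/load_results pair in main.py relies on each per-label prevalence matrix having two
columns that sum to one, so only the first column is stored and the complementary column is rebuilt on
load. A minimal standalone sketch of that round-trip in plain NumPy (toy data and illustrative names, not
part of the patch):

    import numpy as np

    # toy 2-column prevalence matrix: each row sums to 1
    prevs = np.asarray([[0.2, 0.8],
                        [0.5, 0.5],
                        [0.9, 0.1]])

    stored = prevs[:, 0].flatten()                # keep only the unconstrained column (as in save_results)
    restored = np.vstack([stored, 1 - stored]).T  # rebuild the constrained column (as in load_results)

    assert np.allclose(prevs, restored)           # the round-trip is lossless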
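
The mlevaluation.py hunk splits the sampling indexes into one batch per category and maps _predict_batch
over the batches, which assumes qp.util.parallel behaves like a joblib-style parallel map: apply the
function to every element of the list and return the results in the original order. A minimal sketch of
that assumed behaviour, under the hypothetical name parallel_map (not part of the patch):

    from joblib import Parallel, delayed

    def parallel_map(func, args, n_jobs=-1):
        # assumed semantics of qp.util.parallel: run func on each element of args
        # in parallel workers and return the results in the original order
        return Parallel(n_jobs=n_jobs)(delayed(func)(a) for a in args)

    def _square(x):
        return x * x

    if __name__ == '__main__':
        # each element is processed independently, mirroring the per-category
        # index batches in nested_test_indexes
        assert parallel_map(_square, [1, 2, 3, 4]) == [1, 4, 9, 16]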