forked from moreo/QuaPy
parallelizing stuff
This commit is contained in:
parent b941c0665e
commit db1dbe2534
@@ -23,7 +23,7 @@ import pickle
 
 def cls():
     # return LinearSVC()
-    return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
+    return LogisticRegression(max_iter=1000, solver='lbfgs')
 
 
 def calibratedCls():
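Note: one consistent reading of this first hunk (an assumption; the commit message says only "parallelizing stuff") is that, with parallelism moved up to the evaluation loop in the last hunk, the inner LogisticRegression is kept single-threaded to avoid oversubscribing cores. A minimal sketch of that pattern with joblib and toy data:

import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression

# toy batches: four binary problems of 100 points x 5 features each
rng = np.random.RandomState(0)
batches = [(rng.randn(100, 5), rng.randint(0, 2, 100)) for _ in range(4)]

def fit_one(X, y):
    # single-threaded learner: the outer Parallel already occupies every core
    return LogisticRegression(max_iter=1000, solver='lbfgs').fit(X, y)

models = Parallel(n_jobs=-1)(delayed(fit_one)(X, y) for X, y in batches)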
@@ -38,10 +38,10 @@ n_samples = 5000
 
 def models():
     yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls()))
-    yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
-    yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
-    yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
-    yield 'HDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
+    # yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
+    # yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
+    # yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
+    # yield 'HDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
     # yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls()))
     # yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
     # yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
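For context, models() yields (name, quantifier) pairs that a driver loop would feed to run_experiment (defined in the next hunks). A hypothetical consumption sketch; the dataset name is illustrative, not from the commit:

for model_name, model in models():
    run_experiment('reuters21578', model_name, model)  # 'reuters21578' is a placeholder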
@@ -135,6 +135,36 @@ def print_info(train, test):
     print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}')
 
 
+def save_results(npp_results, app_results, result_path):
+    # results are lists of tuples of (true_prevs, estim_prevs)
+    # each true_prevs is an ndarray of ndim=2, but the second dimension is constrained
+    def _prepare_result_lot(lot_results):
+        true_prevs, estim_prevs = lot_results
+        return {
+            'true_prevs': [true_i[:,0].flatten() for true_i in true_prevs],  # removes the constrained prevalence
+            'estim_prevs': [estim_i[:,0].flatten() for estim_i in estim_prevs]  # removes the constrained prevalence
+        }
+    results = {
+        'npp': _prepare_result_lot(npp_results),
+        'app': _prepare_result_lot(app_results),
+    }
+    pickle.dump(results, open(result_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+
+
+def load_results(result_path):
+    def _unpack_result_lot(lot_result):
+        true_prevs = lot_result['true_prevs']
+        true_prevs = [np.vstack([true_i, 1 - true_i]).T for true_i in true_prevs]  # add the constrained prevalence
+        estim_prevs = lot_result['estim_prevs']
+        estim_prevs = [np.vstack([estim_i, 1 - estim_i]).T for estim_i in estim_prevs]  # add the constrained prevalence
+        return true_prevs, estim_prevs
+    results = pickle.load(open(result_path, 'rb'))
+    results_npp = _unpack_result_lot(results['npp'])
+    results_app = _unpack_result_lot(results['app'])
+    return results_npp, results_app
+
+
+
 def run_experiment(dataset_name, model_name, model):
     result_path = f'{opt.results}/{dataset_name}_{model_name}.pkl'
     if already_run(result_path):
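The save/load pair exploits the fact that each row of a prevalence matrix is constrained to sum to one, i.e. (p, 1-p) per binary category, so only column 0 needs to be stored on disk. A self-contained round-trip check with toy values (not from the repository):

import numpy as np

true_i = np.array([[0.2, 0.8],
                   [0.7, 0.3]])  # two samples; rows sum to 1

packed = true_i[:, 0].flatten()               # what save_results keeps: [0.2, 0.7]
restored = np.vstack([packed, 1 - packed]).T  # what load_results rebuilds

assert np.allclose(true_i, restored)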
@@ -147,10 +177,11 @@ def run_experiment(dataset_name, model_name, model):
 
     model.fit(train)
 
-    results = dict()
-    results['npp'] = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
-    results['app'] = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=21, repeats=10)
-    pickle.dump(results, open(result_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+    results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
+    results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5)
+    save_results(results_npp, results_app, result_path)
+    results_npp2, results_app2 = load_results(result_path)
+    print('pass')
 
 
 if __name__ == '__main__':
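The evaluation grid also shrinks in this hunk: assuming the index generator yields repeats samples at each of the n_prevalences prevalence points, the artificial-prevalence protocol now draws 55 samples per category instead of 210. The arithmetic, purely for illustration:

old_grid = 21 * 10  # samples per category before this commit
new_grid = 11 * 5   # samples per category after
print(old_grid, new_grid, f'{1 - new_grid / old_grid:.0%} fewer')  # 210 55 74% fewer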
@@ -5,6 +5,7 @@ import quapy as qp
 from MultiLabel.mlquantification import MLAggregativeQuantifier
 from mldata import MultilabelledCollection
+import itertools
 from tqdm import tqdm
 
 
 def __check_error(error_metric):
@@ -63,15 +64,20 @@ def ml_artificial_prevalence_prediction(model,
                                         repeats=10,
                                         random_seed=42):
 
-    test_indexes = []
+    nested_test_indexes = []
     with qp.util.temp_seed(random_seed):
         for cat in test.classes_:
-            test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size,
+            nested_test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size,
                                                                               category=cat,
                                                                               n_prevalences=n_prevalences,
                                                                               repeats=repeats)))
-    test_indexes = list(itertools.chain.from_iterable(test_indexes))
-    return _ml_prevalence_predictions(model, test, test_indexes)
+    def _predict_batch(test_indexes):
+        return _ml_prevalence_predictions(model, test, test_indexes)
+
+    predictions = qp.util.parallel(_predict_batch, nested_test_indexes, n_jobs=-1)
+    true_prevs = list(itertools.chain.from_iterable(trues for trues, estims in predictions))
+    estim_prevs = list(itertools.chain.from_iterable(estims for trues, estims in predictions))
+    return true_prevs, estim_prevs
 
 
 def ml_artificial_prevalence_evaluation(model,
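After qp.util.parallel maps _predict_batch over the per-category index lists, each worker returns one (true_prevs, estim_prevs) pair, and the two chain.from_iterable calls unzip and flatten the pairs separately. A toy illustration of that step, using placeholder strings instead of prevalence arrays:

import itertools

predictions = [(['t1', 't2'], ['e1', 'e2']),   # batch for category 0
               (['t3'], ['e3'])]               # batch for category 1

true_prevs = list(itertools.chain.from_iterable(trues for trues, estims in predictions))
estim_prevs = list(itertools.chain.from_iterable(estims for trues, estims in predictions))

print(true_prevs)   # ['t1', 't2', 't3']
print(estim_prevs)  # ['e1', 'e2', 'e3']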