all uci datasets from Pérez-Gállego added, quantification report added

2021-01-28 18:22:43 +01:00 · 2021-01-28 18:22:43 +01:00 · 3aaf57f2f3
parent 1d89301089
commit 3aaf57f2f3
8 changed files with 222 additions and 73 deletions
--- a/TweetSentQuant/evaluate_results.py
+++ b/TweetSentQuant/evaluate_results.py
@ -1,3 +1,4 @@
+import numpy as np
 import quapy as qp
 import settings
 import os
@ -11,8 +12,10 @@ qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
 resultdir = './results'
 methods = ['*']

+
 def evaluate_results(methods, datasets, error_name):
    results_str = []
+    all = []
    error = qp.error.from_name(error_name)
    for method, dataset in itertools.product(methods, datasets):
        for experiment in glob(f'{resultdir}/{dataset}-{method}-{error_name}.pkl'):
@ -21,8 +24,12 @@ def evaluate_results(methods, datasets, error_name):
            result = error(true_prevalences, estim_prevalences)
            string = f'{pathlib.Path(experiment).name}: {result:.3f}'
            results_str.append(string)
+            all.append(result)
    results_str = sorted(results_str)
    for r in results_str:
        print(r)
+    print()
+    print(f'Ave: {np.mean(all):.3f}')

-evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
+
+evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
--- a/TweetSentQuant/experiments.py
+++ b/TweetSentQuant/experiments.py
@ -58,7 +58,7 @@ def quantification_ensembles():
        'verbose': False
    }
    common={
-        'max_sample_size': 500,
+        'max_sample_size': 1000,
        'n_jobs': settings.ENSEMBLE_N_JOBS,
        'param_grid': lr_params,
        'param_mod_sel': param_mod_sel,
@ -69,13 +69,13 @@ def quantification_ensembles():
    # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
    # will be skipped (by setting hyperparameters to None)
    hyper_none = None
-    yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
-    yield 'epaccmaemae', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
+    #yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
+    yield 'epaccmaemae1k', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
    # yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none
    # yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none

-    yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
-    yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
+    #yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
+    #yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
    #yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none
    #yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none

--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@ -1,7 +1,7 @@
 import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 from quapy.functional import artificial_prevalence_sampling, strprev


@ -151,6 +151,12 @@ class LabelledCollection:
                  f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
        return stats_

+    def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
+        """
+        Generator of (train, test) pairs obtained by repeated stratified k-fold cross-validation.
+
+        :param nfolds: number of folds, passed to RepeatedStratifiedKFold as `n_splits`
+        :param nrepeats: number of repetitions, passed to RepeatedStratifiedKFold as `n_repeats`
+        :param random_state: seed passed to the splitter
+        :return: yields, for every split, the pair obtained by calling `sampling_from_index` on the
+            training indexes and on the test indexes (in that order)
+        """
+        splitter = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
+        for tr_idx, te_idx in splitter.split(*self.Xy):
+            yield self.sampling_from_index(tr_idx), self.sampling_from_index(te_idx)

 class Dataset:

@ -190,6 +196,11 @@ class Dataset:
              f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
              f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')

+    @classmethod
+    def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
+        """
+        Generator of Dataset objects, one per (train, test) pair yielded by `data.kFCV`.
+
+        :param data: the collection to split; its own `kFCV` receives `nfolds`, `nrepeats` and `random_state`
+        :return: yields Dataset(train, test) instances whose name reports the fold number (1-based, out of
+            `nfolds`) and the round number (1-based) derived from the position of the split in the sequence
+        """
+        folds = data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)
+        for i, (train, test) in enumerate(folds):
+            fold_name = f'fold {(i%nfolds)+1}/{nfolds} (round={(i//nfolds)+1})'
+            yield Dataset(train, test, name=fold_name)
+

 def isbinary(data):
    if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -1,7 +1,12 @@
+def warn(*args, **kwargs):
+    """No-op that accepts any arguments; it is assigned to `warnings.warn` right after being defined."""
+    pass
+import warnings
+warnings.warn = warn
 import os
 import zipfile
 from os.path import join
 from urllib.error import HTTPError
+from sklearn.model_selection import StratifiedKFold

 import pandas as pd

@ -17,6 +22,29 @@ TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
 TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
                                 'semeval', 'semeval16',
                                 'sst', 'wa', 'wb']
+UCI_DATASETS = ['acute.a', 'acute.b',
+                'balance.1', 'balance.2', 'balance.3',
+                'breast-cancer',
+                'cmc.1', 'cmc.2', 'cmc.3',
+                'ctg.1', 'ctg.2', 'ctg.3',
+                #'diabetes', # <-- I haven't found this one...
+                'german',
+                'haberman',
+                'ionosphere',
+                'iris.1', 'iris.2', 'iris.3',
+                'mammographic',
+                'pageblocks.5',
+                #'phoneme', # <-- I haven't found this one...
+                'semeion',
+                'sonar',
+                'spambase',
+                'spectf',
+                'tictactoe',
+                'transfusion',
+                'wdbc',
+                'wine.1', 'wine.2', 'wine.3',
+                'wine-q-red', 'wine-q-white',
+                'yeast']


 def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
@ -134,27 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
    return data


-UCI_DATASETS = ['acute.a', 'acute.b',
-                'balance.1', 'balance.2', 'balance.3',
-                'breast-cancer',
-                'cmc.1', 'cmc.2', 'cmc.3',
-                'ctg.1', 'ctg.2', 'ctg.3',
-                #'diabetes', # <-- I haven't found this one...
-                'german',
-                'haberman',
-                'ionosphere',
-                'iris.1', 'iris.2', 'iris.3',
-                'mammographic',
-                'pageblocks.5',
-                #'phoneme', # <-- I haven't found this one...
-                'semeion',
-                'sonar',
-                'spambase',
-                'spectf',
-                'tictactoe',
-                'transfusion'] # ongoing...
+def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
+    """
+    Load a UCI dataset as a Dataset with a stratified training/test partition.
+
+    :param dataset_name: name of the dataset, forwarded to `fetch_UCILabelledCollection`
+    :param data_home: forwarded to `fetch_UCILabelledCollection`
+    :param test_split: proportion reserved for test; `split_stratified` is called with `1 - test_split`
+    :param verbose: forwarded to `fetch_UCILabelledCollection`
+    :return: a Dataset built from the parts returned by `split_stratified` (called with `random_state=0`)
+    """
+    collection = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
+    parts = collection.split_stratified(1 - test_split, random_state=0)
+    return Dataset(*parts)

-def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
+
+def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):

    assert dataset_name in UCI_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
@ -188,7 +201,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
        'spambase': 'Spambase Data Set',
        'spectf': 'SPECTF Heart Data',
        'tictactoe': 'Tic-Tac-Toe Endgame Database',
-        'transfusion': 'Blood Transfusion Service Center Data Set '
+        'transfusion': 'Blood Transfusion Service Center Data Set',
+        'wdbc': 'Wisconsin Diagnostic Breast Cancer',
+        'wine.1': 'Wine Recognition Data (1)',
+        'wine.2': 'Wine Recognition Data (2)',
+        'wine.3': 'Wine Recognition Data (3)',
+        'wine-q-red': 'Wine Quality Red (6-10)',
+        'wine-q-white': 'Wine Quality White (6-10)',
+        'yeast': 'Yeast',
    }

    # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
@ -219,7 +239,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
        'spambase': 'spambase',
        'spectf': 'spect',
        'tictactoe': 'tic-tac-toe',
-        'transfusion': 'blood-transfusion'
+        'transfusion': 'blood-transfusion',
+        'wdbc': 'breast-cancer-wisconsin',
+        'wine-q-red': 'wine-quality',
+        'wine-q-white': 'wine-quality',
+        'wine.1': 'wine',
+        'wine.2': 'wine',
+        'wine.3': 'wine',
+        'yeast': 'yeast',
    }

    # the filename is the name of the file within the data_folder indexed by the identifier
@ -231,7 +258,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
        'page-blocks': 'page-blocks.data.Z',
        'undocumented/connectionist-bench/sonar': 'sonar.all-data',
        'spect': ['SPECTF.train', 'SPECTF.test'],
-        'blood-transfusion': 'transfusion.data'
+        'blood-transfusion': 'transfusion.data',
+        'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
+        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
    }

    # the filename containing the dataset description (if any)
@ -242,7 +271,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
        'mammographic-masses': 'mammographic_masses.names',
        'undocumented/connectionist-bench/sonar': 'sonar.names',
        'spect': 'SPECTF.names',
-        'blood-transfusion': 'transfusion.names'
+        'blood-transfusion': 'transfusion.names',
+        'wine-quality': 'winequality.names',
+        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
    }

    identifier = identifier_map[dataset_name]
@ -269,16 +300,15 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
    print(f'Loading {dataset_name} ({fullname})')
    if identifier == 'acute':
        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
+
+        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
+        [df_replace(df, col) for col in range(1, 6)]
+        X = df.loc[:, 0:5].values
        if dataset_name == 'acute.a':
            y = binarize(df[6], pos_class='yes')
        elif dataset_name == 'acute.b':
            y = binarize(df[7], pos_class='yes')

-        mintemp, maxtemp = 35, 42
-        df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False)
-        [df_replace(df, col) for col in range(1, 6)]
-        X = df.loc[:, 0:5].values
-
    if identifier == 'balance-scale':
        df = pd.read_csv(data_path, header=None, sep=',')
        if dataset_name == 'balance.1':
@ -289,14 +319,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
            y = binarize(df[0], pos_class='R')
        X = df.loc[:, 1:].astype(float).values

-    if identifier == 'breast-cancer-wisconsin':
+    if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
        df = pd.read_csv(data_path, header=None, sep=',')
        Xy = df.loc[:, 1:10]
        Xy[Xy=='?']=np.nan
        Xy = Xy.dropna(axis=0)
        X = Xy.loc[:, 1:9]
        X = X.astype(float).values
-        y = binarize(Xy[10], pos_class=4)
+        y = binarize(Xy[10], pos_class=2)
+
+    if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.loc[:, 2:32].astype(float).values
+        y = df[1].values
+        y = binarize(y, pos_class='M')

    if identifier == 'cmc':
        df = pd.read_csv(data_path, header=None, sep=',')
@ -356,8 +392,8 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3

    if identifier == 'mammographic-masses':
        df = pd.read_csv(data_path, header=None, sep=',')
-        Xy[df == '?'] = np.nan
-        Xy = Xy.dropna(axis=0)
+        df[df == '?'] = np.nan
+        Xy = df.dropna(axis=0)
        X = Xy.iloc[:, 0:5]
        X = X.astype(float).values
        y = binarize(Xy.iloc[:,5], pos_class=1)
@ -395,9 +431,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3

    if identifier == 'spect':
        dfs = []
-        for file in  filename:
+        for file in filename:
            data_path = join(data_dir, file)
-            download_file_if_not_exists(f'{URL}/{filename}', data_path)
+            download_file_if_not_exists(f'{URL}/{file}', data_path)
            dfs.append(pd.read_csv(data_path, header=None, sep=','))
        df = pd.concat(dfs)
        X = df.iloc[:, 1:45].astype(float).values
@ -416,9 +452,34 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
        y = df.iloc[:, 4].values
        y = binarize(y, pos_class=1)

+    if identifier == 'wine':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 1:14].astype(float).values
+        y = df[0].values
+        if dataset_name == 'wine.1':
+            y = binarize(y, pos_class=1)
+        elif dataset_name == 'wine.2':
+            y = binarize(y, pos_class=2)
+        elif dataset_name == 'wine.3':
+            y = binarize(y, pos_class=3)
+
+    if identifier == 'wine-quality':
+        filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
+        data_path = join(data_dir, filename)
+        download_file_if_not_exists(f'{URL}/{filename}', data_path)
+        df = pd.read_csv(data_path, sep=';')
+        X = df.iloc[:, 0:11].astype(float).values
+        y = df.iloc[:, 11].values > 5
+
+    if identifier == 'yeast':
+        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
+        X = df.iloc[:, 1:9].astype(float).values
+        y = df.iloc[:, 9].values
+        y = binarize(y, pos_class='NUC')
+
    data = LabelledCollection(X, y)
    data.stats()
-    return Dataset(*data.split_stratified(1-test_split, random_state=0))
+    return data


 def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@ -93,4 +93,5 @@ def binarize(y, pos_class):
    y = np.asarray(y)
    ybin = np.zeros(y.shape, dtype=np.int)
    ybin[y == pos_class] = 1
-    return ybin
+    return ybin
+
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@ -9,7 +9,7 @@ from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier
 from quapy.util import temp_seed
 import quapy.functional as F
-
+import pandas as pd

 def artificial_sampling_prediction(
        model: BaseQuantifier,
@ -62,9 +62,6 @@ def artificial_sampling_prediction(

    pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
    results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs)
-    # results = Parallel(n_jobs=n_jobs)(
-    #     delayed(_predict_prevalences)(index) for index in pbar
-    # )

    true_prevalences, estim_prevalences = zip(*results)
    true_prevalences = np.asarray(true_prevalences)
@ -73,13 +70,65 @@ def artificial_sampling_prediction(
    return true_prevalences, estim_prevalences


+def artificial_sampling_report(
+        model: BaseQuantifier,
+        test: LabelledCollection,
+        sample_size,
+        n_prevpoints=210,
+        n_repetitions=1,
+        n_jobs=1,
+        random_seed=42,
+        error_metrics:Iterable[Union[str,Callable]]='mae',
+        verbose=True):
+    """
+    Build a per-sample report of the predictions obtained with the artificial sampling protocol.
+
+    The prevalences come from `artificial_sampling_prediction`, which receives `model`, `test`,
+    `sample_size`, `n_prevpoints`, `n_repetitions`, `n_jobs`, `random_seed` and `verbose` unchanged.
+
+    :param error_metrics: an error name (resolved with `qp.error.from_name`), or an iterable of error
+        names and/or callables taking `(true_prev, estim_prev)`; a single string counts as one metric
+    :return: a pandas DataFrame with one row per (true, estimated) prevalence pair and the columns
+        'true-prev', 'estim-prev', plus one column per error metric (named after the metric)
+    """
+    if isinstance(error_metrics, str):
+        error_metrics = [error_metrics]
+    else:
+        # materialize once: the metrics are traversed twice below, which would exhaust a generator
+        error_metrics = list(error_metrics)
+
+    error_names = [e if isinstance(e, str) else e.__name__ for e in error_metrics]
+    error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics]
+    assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions'
+
+    true_prevs, estim_prevs = artificial_sampling_prediction(
+        model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
+    )
+
+    # collect the rows first and build the frame in one go: DataFrame.append was removed in pandas 2.0
+    # and returned a new copy of the frame on every call
+    rows = []
+    for true_prev, estim_prev in zip(true_prevs, estim_prevs):
+        row = {'true-prev': true_prev, 'estim-prev': estim_prev}
+        for error_name, error_metric in zip(error_names, error_funcs):
+            row[error_name] = error_metric(true_prev, estim_prev)
+        rows.append(row)
+
+    return pd.DataFrame(rows, columns=['true-prev', 'estim-prev'] + error_names)
+
+
+def artificial_sampling_eval(
+        model: BaseQuantifier,
+        test: LabelledCollection,
+        sample_size,
+        n_prevpoints=210,
+        n_repetitions=1,
+        n_jobs=1,
+        random_seed=42,
+        error_metric:Union[str,Callable]='mae',
+        verbose=True):
+    """
+    Score a quantifier with a single error metric under the artificial sampling protocol.
+
+    The prevalences come from `artificial_sampling_prediction`, which receives `model`, `test`,
+    `sample_size`, `n_prevpoints`, `n_repetitions`, `n_jobs`, `random_seed` and `verbose` unchanged.
+
+    :param error_metric: an error name (resolved with `qp.error.from_name`) or a callable
+    :return: the value of the error function applied to the full arrays of true and estimated prevalences
+    """
+    error_fn = qp.error.from_name(error_metric) if isinstance(error_metric, str) else error_metric
+    assert hasattr(error_fn, '__call__'), 'invalid error function'
+
+    true_prevs, estim_prevs = artificial_sampling_prediction(
+        model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
+    )
+    return error_fn(true_prevs, estim_prevs)
+
+
 def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1):
    if isinstance(err, str):
        err = qp.error.from_name(err)
    scores = qp.util.parallel(_delayed_eval, ((model, Ti, err) for Ti in test_samples), n_jobs=n_jobs)
-    # scores = Parallel(n_jobs=n_jobs)(
-    #     delayed(_delayed_eval)(model, Ti, err) for Ti in test_samples
-    # )
    return np.mean(scores)


--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@ -38,7 +38,7 @@ class Ensemble(BaseQuantifier):
                 quantifier: BaseQuantifier,
                 size=50,
                 red_size=25,
-                 min_pos=1,
+                 min_pos=5,
                 policy='ave',
                 max_sample_size=None,
                 val_split=None,
@ -88,15 +88,8 @@ class Ensemble(BaseQuantifier):
        )
        self.ensemble = qp.util.parallel(
            _delayed_new_instance,
-            tqdm(args, desc='fitting ensamble', total=self.size),
+            tqdm(args, desc='fitting ensamble', total=self.size) if self.verbose else args,
            n_jobs=self.n_jobs)
-        # self.ensemble = Parallel(n_jobs=self.n_jobs)(
-        #     delayed(_delayed_new_instance)(
-        #         self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy,
-        #         verbose=self.verbose, sample_size=sample_size
-        #     ) for prev in tqdm(prevs, desc='fitting ensamble')
-        # )
-

        # static selection policy (the name of a quantification-oriented error function to minimize)
        if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES:
@ -109,9 +102,6 @@ class Ensemble(BaseQuantifier):
        predictions = np.asarray(
            qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs)
        )
-        # predictions = np.asarray(Parallel(n_jobs=self.n_jobs)(
-        #     delayed(_delayed_quantify)(Qi, instances) for Qi in self.ensemble
-        # ))

        if self.policy == 'ptr':
            predictions = self.ptr_policy(predictions)
@ -143,7 +133,7 @@ class Ensemble(BaseQuantifier):
            scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs))
        order = np.argsort(scores)

-        self.ensemble = select_k(self.ensemble, order, k=self.red_size)
+        self.ensemble = _select_k(self.ensemble, order, k=self.red_size)

    def ptr_policy(self, predictions):
        """
@ -154,7 +144,7 @@ class Ensemble(BaseQuantifier):
        tr_prevs = [m[1] for m in self.ensemble]
        ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs]
        order = np.argsort(ptr_differences)
-        return select_k(predictions, order, k=self.red_size)
+        return _select_k(predictions, order, k=self.red_size)

    def ds_policy_get_posteriors(self, data: LabelledCollection):
        """
@ -192,7 +182,7 @@ class Ensemble(BaseQuantifier):
        tr_distributions = [m[2] for m in self.ensemble]
        dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions]
        order = np.argsort(dist)
-        return select_k(predictions, order, k=self.red_size)
+        return _select_k(predictions, order, k=self.red_size)

    @property
    def binary(self):
@ -201,13 +191,10 @@ class Ensemble(BaseQuantifier):
    @property
    def aggregative(self):
        return False
-        #raise NotImplementedError('aggregative functionality not yet supported for Ensemble')

    @property
    def probabilistic(self):
        return False
-        #raise NotImplementedError('probabilistic functionality not yet supported for Ensemble')
-        #return self.base_quantifier.probabilistic


 def get_probability_distribution(posterior_probabilities, bins=8):
@ -217,7 +204,7 @@ def get_probability_distribution(posterior_probabilities, bins=8):
    return distribution


-def select_k(elements, order, k):
+def _select_k(elements, order, k):
    return [elements[idx] for idx in order[:k]]


--- a/test.py
+++ b/test.py
@ -8,15 +8,48 @@ import numpy as np

 from NewMethods.methods import AveragePoolQuantification
 from classification.methods import PCALR
-from classification.neural import NeuralClassifierTrainer, CNNnet
+from data import Dataset
 from method.meta import EPACC
 from quapy.model_selection import GridSearchQ
+from tqdm import tqdm
+import pandas as pd
+
+sample_size=100
+qp.environ['SAMPLE_SIZE'] = sample_size
+
+np.random.seed(0)
+
+nfolds=5
+nrepeats=1
+
+df = pd.DataFrame(columns=['dataset', 'method', 'mse'])
+for datasetname in qp.datasets.UCI_DATASETS[2:]:
+    collection = qp.datasets.fetch_UCILabelledCollection(datasetname, verbose=False)
+    scores = []
+    pbar = tqdm(Dataset.kFCV(collection, nfolds=nfolds, nrepeats=nrepeats), total=nfolds*nrepeats)
+    for data in pbar:
+        pbar.set_description(f'{data.name}')
+        # learner = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid={'C': np.logspace(-3,3,7)}, n_jobs=-1)
+        learner = LogisticRegression(class_weight='balanced')
+        # model = qp.method.aggregative.CC(learner)
+        model = qp.method.meta.EHDy(learner, size=30, red_size=15, verbose=False)
+        model.fit(data.training)
+        err = qp.evaluation.artificial_sampling_eval(model, data.test, sample_size, n_prevpoints=101, n_jobs=-1,
+                                                     error_metric='mse', verbose=False)
+        scores.append(err)
+
+    score = np.mean(scores)
+    df = df.append({
+        'dataset': datasetname,
+        'method': model.__class__.__name__,
+        'mse': score
+    }, ignore_index=True)
+    print(df)

-dataset = qp.datasets.fetch_UCIDataset('transfusion', verbose=True)
 sys.exit(0)


-qp.environ['SAMPLE_SIZE'] = 500
+
 #param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
 param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
 max_evaluations = 500