forked from moreo/QuaPy
All UCI datasets from Pérez-Gállego added; quantification report added
This commit is contained in:
parent
1d89301089
commit
3aaf57f2f3
|
@ -1,3 +1,4 @@
|
|||
import numpy as np
|
||||
import quapy as qp
|
||||
import settings
|
||||
import os
|
||||
|
@ -11,8 +12,10 @@ qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
|
|||
resultdir = './results'
|
||||
methods = ['*']
|
||||
|
||||
|
||||
def evaluate_results(methods, datasets, error_name):
|
||||
results_str = []
|
||||
all = []
|
||||
error = qp.error.from_name(error_name)
|
||||
for method, dataset in itertools.product(methods, datasets):
|
||||
for experiment in glob(f'{resultdir}/{dataset}-{method}-{error_name}.pkl'):
|
||||
|
@ -21,8 +24,12 @@ def evaluate_results(methods, datasets, error_name):
|
|||
result = error(true_prevalences, estim_prevalences)
|
||||
string = f'{pathlib.Path(experiment).name}: {result:.3f}'
|
||||
results_str.append(string)
|
||||
all.append(result)
|
||||
results_str = sorted(results_str)
|
||||
for r in results_str:
|
||||
print(r)
|
||||
print()
|
||||
print(f'Ave: {np.mean(all):.3f}')
|
||||
|
||||
evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
|
||||
|
||||
evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
|
||||
|
|
|
@ -58,7 +58,7 @@ def quantification_ensembles():
|
|||
'verbose': False
|
||||
}
|
||||
common={
|
||||
'max_sample_size': 500,
|
||||
'max_sample_size': 1000,
|
||||
'n_jobs': settings.ENSEMBLE_N_JOBS,
|
||||
'param_grid': lr_params,
|
||||
'param_mod_sel': param_mod_sel,
|
||||
|
@ -69,13 +69,13 @@ def quantification_ensembles():
|
|||
# hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
|
||||
# will be skipped (by setting hyperparameters to None)
|
||||
hyper_none = None
|
||||
yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
|
||||
yield 'epaccmaemae', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
|
||||
#yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
|
||||
yield 'epaccmaemae1k', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
|
||||
# yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none
|
||||
# yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none
|
||||
|
||||
yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
|
||||
yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
|
||||
#yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
|
||||
#yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
|
||||
#yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none
|
||||
#yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse import vstack
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
|
||||
from quapy.functional import artificial_prevalence_sampling, strprev
|
||||
|
||||
|
||||
|
@ -151,6 +151,12 @@ class LabelledCollection:
|
|||
f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
|
||||
return stats_
|
||||
|
||||
def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
    """
    Generator of (train, test) partitions of this collection following a repeated, stratified
    k-fold cross-validation scheme.

    :param nfolds: number of folds per repetition (passed to the splitter as `n_splits`)
    :param nrepeats: number of repetitions (passed to the splitter as `n_repeats`)
    :param random_state: seed passed to `RepeatedStratifiedKFold`
    :return: yields pairs `(train, test)`, each one built by `sampling_from_index` from the
        indexes produced by the splitter
    """
    splitter = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
    for tr_idx, te_idx in splitter.split(*self.Xy):
        training_part = self.sampling_from_index(tr_idx)
        test_part = self.sampling_from_index(te_idx)
        yield training_part, test_part
|
||||
|
||||
class Dataset:
|
||||
|
||||
|
@ -190,6 +196,11 @@ class Dataset:
|
|||
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
|
||||
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
|
||||
|
||||
@classmethod
def kFCV(cls, data: 'LabelledCollection', nfolds=5, nrepeats=1, random_state=0):
    """
    Generator of datasets, one per fold, obtained by k-fold cross-validating a labelled collection.

    Instances are created through `cls`, so that calling this constructor on a subclass yields
    instances of that subclass rather than of the base class.

    :param data: the collection to partition; its `kFCV` method generates the (train, test) pairs
    :param nfolds: number of folds per round
    :param nrepeats: number of rounds (repetitions of the whole k-fold procedure)
    :param random_state: seed forwarded to `data.kFCV`
    :return: yields one instance per (train, test) pair, named
        'fold <f>/<nfolds> (round=<r>)' with 1-based fold and round counters
    """
    partitions = data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)
    for i, (train, test) in enumerate(partitions):
        # pairs arrive round after round: the quotient is the round, the remainder is the fold
        round_idx, fold_idx = divmod(i, nfolds)
        yield cls(train, test, name=f'fold {fold_idx + 1}/{nfolds} (round={round_idx + 1})')
|
||||
|
||||
|
||||
def isbinary(data):
|
||||
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
|
||||
|
|
|
@ -1,7 +1,12 @@
|
|||
def warn(*args, **kwargs):
    """No-op replacement for `warnings.warn`: accepts any arguments and does nothing."""
    return None
|
||||
import warnings
|
||||
warnings.warn = warn
|
||||
import os
|
||||
import zipfile
|
||||
from os.path import join
|
||||
from urllib.error import HTTPError
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
@ -17,6 +22,29 @@ TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
|
|||
TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
|
||||
'semeval', 'semeval16',
|
||||
'sst', 'wa', 'wb']
|
||||
UCI_DATASETS = ['acute.a', 'acute.b',
|
||||
'balance.1', 'balance.2', 'balance.3',
|
||||
'breast-cancer',
|
||||
'cmc.1', 'cmc.2', 'cmc.3',
|
||||
'ctg.1', 'ctg.2', 'ctg.3',
|
||||
#'diabetes', # <-- I haven't found this one...
|
||||
'german',
|
||||
'haberman',
|
||||
'ionosphere',
|
||||
'iris.1', 'iris.2', 'iris.3',
|
||||
'mammographic',
|
||||
'pageblocks.5',
|
||||
#'phoneme', # <-- I haven't found this one...
|
||||
'semeion',
|
||||
'sonar',
|
||||
'spambase',
|
||||
'spectf',
|
||||
'tictactoe',
|
||||
'transfusion',
|
||||
'wdbc',
|
||||
'wine.1', 'wine.2', 'wine.3',
|
||||
'wine-q-red', 'wine-q-white',
|
||||
'yeast']
|
||||
|
||||
|
||||
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
|
||||
|
@ -134,27 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
|
|||
return data
|
||||
|
||||
|
||||
UCI_DATASETS = ['acute.a', 'acute.b',
|
||||
'balance.1', 'balance.2', 'balance.3',
|
||||
'breast-cancer',
|
||||
'cmc.1', 'cmc.2', 'cmc.3',
|
||||
'ctg.1', 'ctg.2', 'ctg.3',
|
||||
#'diabetes', # <-- I haven't found this one...
|
||||
'german',
|
||||
'haberman',
|
||||
'ionosphere',
|
||||
'iris.1', 'iris.2', 'iris.3',
|
||||
'mammographic',
|
||||
'pageblocks.5',
|
||||
#'phoneme', # <-- I haven't found this one...
|
||||
'semeion',
|
||||
'sonar',
|
||||
'spambase',
|
||||
'spectf',
|
||||
'tictactoe',
|
||||
'transfusion'] # ongoing...
|
||||
def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
    """
    Loads a UCI dataset and partitions it, in a stratified way, into a `Dataset`.

    :param dataset_name: name of the dataset, forwarded to `fetch_UCILabelledCollection`
    :param data_home: forwarded to `fetch_UCILabelledCollection`
    :param test_split: proportion held out for test; `split_stratified` is called with
        `1 - test_split`
    :param verbose: forwarded to `fetch_UCILabelledCollection`
    :return: a `Dataset` built from the parts returned by `split_stratified` (fixed `random_state=0`)
    """
    collection = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
    parts = collection.split_stratified(1 - test_split, random_state=0)
    return Dataset(*parts)
|
||||
|
||||
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
|
||||
|
||||
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):
|
||||
|
||||
assert dataset_name in UCI_DATASETS, \
|
||||
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
|
||||
|
@ -188,7 +201,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
|
|||
'spambase': 'Spambase Data Set',
|
||||
'spectf': 'SPECTF Heart Data',
|
||||
'tictactoe': 'Tic-Tac-Toe Endgame Database',
|
||||
'transfusion': 'Blood Transfusion Service Center Data Set '
|
||||
'transfusion': 'Blood Transfusion Service Center Data Set',
|
||||
'wdbc': 'Wisconsin Diagnostic Breast Cancer',
|
||||
'wine.1': 'Wine Recognition Data (1)',
|
||||
'wine.2': 'Wine Recognition Data (2)',
|
||||
'wine.3': 'Wine Recognition Data (3)',
|
||||
'wine-q-red': 'Wine Quality Red (6-10)',
|
||||
'wine-q-white': 'Wine Quality White (6-10)',
|
||||
'yeast': 'Yeast',
|
||||
}
|
||||
|
||||
# the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
|
||||
|
@ -219,7 +239,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
|
|||
'spambase': 'spambase',
|
||||
'spectf': 'spect',
|
||||
'tictactoe': 'tic-tac-toe',
|
||||
'transfusion': 'blood-transfusion'
|
||||
'transfusion': 'blood-transfusion',
|
||||
'wdbc': 'breast-cancer-wisconsin',
|
||||
'wine-q-red': 'wine-quality',
|
||||
'wine-q-white': 'wine-quality',
|
||||
'wine.1': 'wine',
|
||||
'wine.2': 'wine',
|
||||
'wine.3': 'wine',
|
||||
'yeast': 'yeast',
|
||||
}
|
||||
|
||||
# the filename is the name of the file within the data_folder indexed by the identifier
|
||||
|
@ -231,7 +258,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
|
|||
'page-blocks': 'page-blocks.data.Z',
|
||||
'undocumented/connectionist-bench/sonar': 'sonar.all-data',
|
||||
'spect': ['SPECTF.train', 'SPECTF.test'],
|
||||
'blood-transfusion': 'transfusion.data'
|
||||
'blood-transfusion': 'transfusion.data',
|
||||
'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
|
||||
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
|
||||
}
|
||||
|
||||
# the filename containing the dataset description (if any)
|
||||
|
@ -242,7 +271,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
|
|||
'mammographic-masses': 'mammographic_masses.names',
|
||||
'undocumented/connectionist-bench/sonar': 'sonar.names',
|
||||
'spect': 'SPECTF.names',
|
||||
'blood-transfusion': 'transfusion.names'
|
||||
'blood-transfusion': 'transfusion.names',
|
||||
'wine-quality': 'winequality.names',
|
||||
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
|
||||
}
|
||||
|
||||
identifier = identifier_map[dataset_name]
|
||||
|
@ -269,16 +300,15 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
|
|||
print(f'Loading {dataset_name} ({fullname})')
|
||||
if identifier == 'acute':
|
||||
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
|
||||
|
||||
df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
|
||||
[df_replace(df, col) for col in range(1, 6)]
|
||||
X = df.loc[:, 0:5].values
|
||||
if dataset_name == 'acute.a':
|
||||
y = binarize(df[6], pos_class='yes')
|
||||
elif dataset_name == 'acute.b':
|
||||
y = binarize(df[7], pos_class='yes')
|
||||
|
||||
mintemp, maxtemp = 35, 42
|
||||
df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False)
|
||||
[df_replace(df, col) for col in range(1, 6)]
|
||||
X = df.loc[:, 0:5].values
|
||||
|
||||
if identifier == 'balance-scale':
|
||||
df = pd.read_csv(data_path, header=None, sep=',')
|
||||
if dataset_name == 'balance.1':
|
||||
|
@ -289,14 +319,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
|
|||
y = binarize(df[0], pos_class='R')
|
||||
X = df.loc[:, 1:].astype(float).values
|
||||
|
||||
if identifier == 'breast-cancer-wisconsin':
|
||||
if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
|
||||
df = pd.read_csv(data_path, header=None, sep=',')
|
||||
Xy = df.loc[:, 1:10]
|
||||
Xy[Xy=='?']=np.nan
|
||||
Xy = Xy.dropna(axis=0)
|
||||
X = Xy.loc[:, 1:9]
|
||||
X = X.astype(float).values
|
||||
y = binarize(Xy[10], pos_class=4)
|
||||
y = binarize(Xy[10], pos_class=2)
|
||||
|
||||
if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
|
||||
df = pd.read_csv(data_path, header=None, sep=',')
|
||||
X = df.loc[:, 2:32].astype(float).values
|
||||
y = df[1].values
|
||||
y = binarize(y, pos_class='M')
|
||||
|
||||
if identifier == 'cmc':
|
||||
df = pd.read_csv(data_path, header=None, sep=',')
|
||||
|
@ -356,8 +392,8 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
|
|||
|
||||
if identifier == 'mammographic-masses':
|
||||
df = pd.read_csv(data_path, header=None, sep=',')
|
||||
Xy[df == '?'] = np.nan
|
||||
Xy = Xy.dropna(axis=0)
|
||||
df[df == '?'] = np.nan
|
||||
Xy = df.dropna(axis=0)
|
||||
X = Xy.iloc[:, 0:5]
|
||||
X = X.astype(float).values
|
||||
y = binarize(Xy.iloc[:,5], pos_class=1)
|
||||
|
@ -395,9 +431,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
|
|||
|
||||
if identifier == 'spect':
|
||||
dfs = []
|
||||
for file in filename:
|
||||
for file in filename:
|
||||
data_path = join(data_dir, file)
|
||||
download_file_if_not_exists(f'{URL}/{filename}', data_path)
|
||||
download_file_if_not_exists(f'{URL}/{file}', data_path)
|
||||
dfs.append(pd.read_csv(data_path, header=None, sep=','))
|
||||
df = pd.concat(dfs)
|
||||
X = df.iloc[:, 1:45].astype(float).values
|
||||
|
@ -416,9 +452,34 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
|
|||
y = df.iloc[:, 4].values
|
||||
y = binarize(y, pos_class=1)
|
||||
|
||||
if identifier == 'wine':
|
||||
df = pd.read_csv(data_path, header=None, sep=',')
|
||||
X = df.iloc[:, 1:14].astype(float).values
|
||||
y = df[0].values
|
||||
if dataset_name == 'wine.1':
|
||||
y = binarize(y, pos_class=1)
|
||||
elif dataset_name == 'wine.2':
|
||||
y = binarize(y, pos_class=2)
|
||||
elif dataset_name == 'wine.3':
|
||||
y = binarize(y, pos_class=3)
|
||||
|
||||
if identifier == 'wine-quality':
|
||||
filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
|
||||
data_path = join(data_dir, filename)
|
||||
download_file_if_not_exists(f'{URL}/{filename}', data_path)
|
||||
df = pd.read_csv(data_path, sep=';')
|
||||
X = df.iloc[:, 0:11].astype(float).values
|
||||
y = df.iloc[:, 11].values > 5
|
||||
|
||||
if identifier == 'yeast':
|
||||
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
|
||||
X = df.iloc[:, 1:9].astype(float).values
|
||||
y = df.iloc[:, 9].values
|
||||
y = binarize(y, pos_class='NUC')
|
||||
|
||||
data = LabelledCollection(X, y)
|
||||
data.stats()
|
||||
return Dataset(*data.split_stratified(1-test_split, random_state=0))
|
||||
return data
|
||||
|
||||
|
||||
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
||||
|
|
|
@ -93,4 +93,5 @@ def binarize(y, pos_class):
|
|||
y = np.asarray(y)
|
||||
ybin = np.zeros(y.shape, dtype=np.int)
|
||||
ybin[y == pos_class] = 1
|
||||
return ybin
|
||||
return ybin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ from quapy.data import LabelledCollection
|
|||
from quapy.method.base import BaseQuantifier
|
||||
from quapy.util import temp_seed
|
||||
import quapy.functional as F
|
||||
|
||||
import pandas as pd
|
||||
|
||||
def artificial_sampling_prediction(
|
||||
model: BaseQuantifier,
|
||||
|
@ -62,9 +62,6 @@ def artificial_sampling_prediction(
|
|||
|
||||
pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
|
||||
results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs)
|
||||
# results = Parallel(n_jobs=n_jobs)(
|
||||
# delayed(_predict_prevalences)(index) for index in pbar
|
||||
# )
|
||||
|
||||
true_prevalences, estim_prevalences = zip(*results)
|
||||
true_prevalences = np.asarray(true_prevalences)
|
||||
|
@ -73,13 +70,65 @@ def artificial_sampling_prediction(
|
|||
return true_prevalences, estim_prevalences
|
||||
|
||||
|
||||
def artificial_sampling_report(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        n_prevpoints=210,
        n_repetitions=1,
        n_jobs=1,
        random_seed=42,
        error_metrics:Iterable[Union[str,Callable]]='mae',
        verbose=True):
    """
    Builds a report (a pandas DataFrame) with one row per pair of true and estimated prevalences,
    together with the score of every requested error metric for that pair.

    The prevalences are obtained from `artificial_sampling_prediction`, to which `model`, `test`,
    `sample_size`, `n_prevpoints`, `n_repetitions`, `n_jobs`, `random_seed` and `verbose` are
    forwarded unchanged.

    :param error_metrics: a single error name, or an iterable of error names and/or callables;
        names are resolved with `qp.error.from_name`, callables are used as they are and give
        name to their column through `__name__`
    :return: a DataFrame with columns 'true-prev', 'estim-prev' and one column per error metric
    :raises AssertionError: if any of the resolved error metrics is not callable
    """
    if isinstance(error_metrics, str):
        error_metrics = [error_metrics]
    else:
        # materialized because it is traversed twice below; a one-shot iterable (e.g., a
        # generator) would otherwise be exhausted after the first pass
        error_metrics = list(error_metrics)

    error_names = [e if isinstance(e, str) else e.__name__ for e in error_metrics]
    error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics]
    assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions'

    true_prevs, estim_prevs = artificial_sampling_prediction(
        model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
    )

    # rows are collected first and the frame is created only once: DataFrame.append copied the
    # whole frame at every call and is no longer available in pandas >= 2.0
    rows = []
    for true_prev, estim_prev in zip(true_prevs, estim_prevs):
        row = {'true-prev': true_prev, 'estim-prev': estim_prev}
        for error_name, error_metric in zip(error_names, error_funcs):
            row[error_name] = error_metric(true_prev, estim_prev)
        rows.append(row)

    # passing the columns explicitly keeps their order, and keeps them even if there are no rows
    return pd.DataFrame(rows, columns=['true-prev', 'estim-prev'] + error_names)
|
||||
|
||||
|
||||
def artificial_sampling_eval(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        n_prevpoints=210,
        n_repetitions=1,
        n_jobs=1,
        random_seed=42,
        error_metric:Union[str,Callable]='mae',
        verbose=True):
    """
    Evaluates a quantifier by computing one error metric over the true and estimated prevalences
    returned by `artificial_sampling_prediction`.

    :param error_metric: the name of an error (resolved with `qp.error.from_name`) or a callable
        taking the true and the estimated prevalences
    :return: whatever the error metric returns for the prevalences obtained from
        `artificial_sampling_prediction`; all remaining arguments are forwarded to that function
    :raises AssertionError: if the resolved error metric is not callable
    """
    metric = qp.error.from_name(error_metric) if isinstance(error_metric, str) else error_metric
    assert hasattr(metric, '__call__'), 'invalid error function'

    prevalences = artificial_sampling_prediction(
        model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
    )
    true_prevs, estim_prevs = prevalences
    return metric(true_prevs, estim_prevs)
|
||||
|
||||
|
||||
def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1):
    """
    Averages the scores that `_delayed_eval` produces for a model on each test sample.

    :param model: the quantifier to evaluate
    :param test_samples: the samples to evaluate the model on
    :param err: the name of an error (resolved with `qp.error.from_name`) or a callable
    :param n_jobs: forwarded to `qp.util.parallel`
    :return: the mean (`np.mean`) of the scores
    """
    error_fn = qp.error.from_name(err) if isinstance(err, str) else err
    jobs = ((model, sample, error_fn) for sample in test_samples)
    scores = qp.util.parallel(_delayed_eval, jobs, n_jobs=n_jobs)
    return np.mean(scores)
|
||||
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@ class Ensemble(BaseQuantifier):
|
|||
quantifier: BaseQuantifier,
|
||||
size=50,
|
||||
red_size=25,
|
||||
min_pos=1,
|
||||
min_pos=5,
|
||||
policy='ave',
|
||||
max_sample_size=None,
|
||||
val_split=None,
|
||||
|
@ -88,15 +88,8 @@ class Ensemble(BaseQuantifier):
|
|||
)
|
||||
self.ensemble = qp.util.parallel(
|
||||
_delayed_new_instance,
|
||||
tqdm(args, desc='fitting ensamble', total=self.size),
|
||||
tqdm(args, desc='fitting ensamble', total=self.size) if self.verbose else args,
|
||||
n_jobs=self.n_jobs)
|
||||
# self.ensemble = Parallel(n_jobs=self.n_jobs)(
|
||||
# delayed(_delayed_new_instance)(
|
||||
# self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy,
|
||||
# verbose=self.verbose, sample_size=sample_size
|
||||
# ) for prev in tqdm(prevs, desc='fitting ensamble')
|
||||
# )
|
||||
|
||||
|
||||
# static selection policy (the name of a quantification-oriented error function to minimize)
|
||||
if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES:
|
||||
|
@ -109,9 +102,6 @@ class Ensemble(BaseQuantifier):
|
|||
predictions = np.asarray(
|
||||
qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs)
|
||||
)
|
||||
# predictions = np.asarray(Parallel(n_jobs=self.n_jobs)(
|
||||
# delayed(_delayed_quantify)(Qi, instances) for Qi in self.ensemble
|
||||
# ))
|
||||
|
||||
if self.policy == 'ptr':
|
||||
predictions = self.ptr_policy(predictions)
|
||||
|
@ -143,7 +133,7 @@ class Ensemble(BaseQuantifier):
|
|||
scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs))
|
||||
order = np.argsort(scores)
|
||||
|
||||
self.ensemble = select_k(self.ensemble, order, k=self.red_size)
|
||||
self.ensemble = _select_k(self.ensemble, order, k=self.red_size)
|
||||
|
||||
def ptr_policy(self, predictions):
|
||||
"""
|
||||
|
@ -154,7 +144,7 @@ class Ensemble(BaseQuantifier):
|
|||
tr_prevs = [m[1] for m in self.ensemble]
|
||||
ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs]
|
||||
order = np.argsort(ptr_differences)
|
||||
return select_k(predictions, order, k=self.red_size)
|
||||
return _select_k(predictions, order, k=self.red_size)
|
||||
|
||||
def ds_policy_get_posteriors(self, data: LabelledCollection):
|
||||
"""
|
||||
|
@ -192,7 +182,7 @@ class Ensemble(BaseQuantifier):
|
|||
tr_distributions = [m[2] for m in self.ensemble]
|
||||
dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions]
|
||||
order = np.argsort(dist)
|
||||
return select_k(predictions, order, k=self.red_size)
|
||||
return _select_k(predictions, order, k=self.red_size)
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
|
@ -201,13 +191,10 @@ class Ensemble(BaseQuantifier):
|
|||
@property
|
||||
def aggregative(self):
|
||||
return False
|
||||
#raise NotImplementedError('aggregative functionality not yet supported for Ensemble')
|
||||
|
||||
@property
|
||||
def probabilistic(self):
|
||||
return False
|
||||
#raise NotImplementedError('probabilistic functionality not yet supported for Ensemble')
|
||||
#return self.base_quantifier.probabilistic
|
||||
|
||||
|
||||
def get_probability_distribution(posterior_probabilities, bins=8):
|
||||
|
@ -217,7 +204,7 @@ def get_probability_distribution(posterior_probabilities, bins=8):
|
|||
return distribution
|
||||
|
||||
|
||||
def select_k(elements, order, k):
|
||||
def _select_k(elements, order, k):
|
||||
return [elements[idx] for idx in order[:k]]
|
||||
|
||||
|
||||
|
|
39
test.py
39
test.py
|
@ -8,15 +8,48 @@ import numpy as np
|
|||
|
||||
from NewMethods.methods import AveragePoolQuantification
|
||||
from classification.methods import PCALR
|
||||
from classification.neural import NeuralClassifierTrainer, CNNnet
|
||||
from data import Dataset
|
||||
from method.meta import EPACC
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
sample_size=100
|
||||
qp.environ['SAMPLE_SIZE'] = sample_size
|
||||
|
||||
np.random.seed(0)
|
||||
|
||||
nfolds=5
|
||||
nrepeats=1
|
||||
|
||||
df = pd.DataFrame(columns=['dataset', 'method', 'mse'])
|
||||
for datasetname in qp.datasets.UCI_DATASETS[2:]:
|
||||
collection = qp.datasets.fetch_UCILabelledCollection(datasetname, verbose=False)
|
||||
scores = []
|
||||
pbar = tqdm(Dataset.kFCV(collection, nfolds=nfolds, nrepeats=nrepeats), total=nfolds*nrepeats)
|
||||
for data in pbar:
|
||||
pbar.set_description(f'{data.name}')
|
||||
# learner = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid={'C': np.logspace(-3,3,7)}, n_jobs=-1)
|
||||
learner = LogisticRegression(class_weight='balanced')
|
||||
# model = qp.method.aggregative.CC(learner)
|
||||
model = qp.method.meta.EHDy(learner, size=30, red_size=15, verbose=False)
|
||||
model.fit(data.training)
|
||||
err = qp.evaluation.artificial_sampling_eval(model, data.test, sample_size, n_prevpoints=101, n_jobs=-1,
|
||||
error_metric='mse', verbose=False)
|
||||
scores.append(err)
|
||||
|
||||
score = np.mean(scores)
|
||||
df = df.append({
|
||||
'dataset': datasetname,
|
||||
'method': model.__class__.__name__,
|
||||
'mse': score
|
||||
}, ignore_index=True)
|
||||
print(df)
|
||||
|
||||
dataset = qp.datasets.fetch_UCIDataset('transfusion', verbose=True)
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 500
|
||||
|
||||
#param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
|
||||
param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
|
||||
max_evaluations = 500
|
||||
|
|
Loading…
Reference in New Issue