forked from moreo/QuaPy

commit 3aaf57f2f3 (parent 1d89301089)
all uci datasets from Pérez-Gállego added, quantification report added
@@ -1,3 +1,4 @@
+import numpy as np
 import quapy as qp
 import settings
 import os
@@ -11,8 +12,10 @@ qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
 resultdir = './results'
 methods = ['*']


 def evaluate_results(methods, datasets, error_name):
     results_str = []
+    all = []
     error = qp.error.from_name(error_name)
     for method, dataset in itertools.product(methods, datasets):
         for experiment in glob(f'{resultdir}/{dataset}-{method}-{error_name}.pkl'):
@@ -21,8 +24,12 @@ def evaluate_results(methods, datasets, error_name):
             result = error(true_prevalences, estim_prevalences)
             string = f'{pathlib.Path(experiment).name}: {result:.3f}'
             results_str.append(string)
+            all.append(result)
     results_str = sorted(results_str)
     for r in results_str:
         print(r)
+    print()
+    print(f'Ave: {np.mean(all):.3f}')


 evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
@@ -58,7 +58,7 @@ def quantification_ensembles():
         'verbose': False
     }
     common={
-        'max_sample_size': 500,
+        'max_sample_size': 1000,
         'n_jobs': settings.ENSEMBLE_N_JOBS,
         'param_grid': lr_params,
         'param_mod_sel': param_mod_sel,
@@ -69,13 +69,13 @@ def quantification_ensembles():
     # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
     # will be skipped (by setting hyperparameters to None)
     hyper_none = None
-    yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
-    yield 'epaccmaemae', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
+    #yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
+    yield 'epaccmaemae1k', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
     # yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none
     # yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none

-    yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
-    yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
+    #yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
+    #yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
     #yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none
     #yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none
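For orientation, each tuple yielded above pairs a method name with a configured EPACC instance and its hyperparameter grid (None here, since model selection happens inside each ensemble member via 'param_mod_sel'). A minimal sketch of how such triples are typically consumed; the experiment loop, and the train/test objects in it, are assumptions, since the calling script is not part of this diff:

    # hypothetical consumer of quantification_ensembles() (not part of the commit)
    for name, quantifier, hyperparams in quantification_ensembles():
        # hyperparams is None for the EPACC variants: each ensemble member
        # performs its own model selection over 'param_grid'
        quantifier.fit(train)                             # 'train' assumed given
        estim_prev = quantifier.quantify(test.instances)  # 'test' assumed given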
@@ -1,7 +1,7 @@
 import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 from quapy.functional import artificial_prevalence_sampling, strprev


@@ -151,6 +151,12 @@ class LabelledCollection:
               f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
         return stats_

+    def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
+        kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
+        for train_index, test_index in kf.split(*self.Xy):
+            train = self.sampling_from_index(train_index)
+            test = self.sampling_from_index(test_index)
+            yield train, test
+

 class Dataset:

@@ -190,6 +196,11 @@ class Dataset:
               f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
               f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')

+    @classmethod
+    def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
+        for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
+            yield Dataset(train, test, name=f'fold {(i%nfolds)+1}/{nfolds} (round={(i//nfolds)+1})')
+

 def isbinary(data):
     if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
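The two kFCV generators added above expose repeated stratified k-fold splitting at the collection level (yielding (train, test) LabelledCollection pairs) and at the dataset level (yielding named Dataset folds). A minimal usage sketch; ACC over a LogisticRegression learner is an arbitrary stand-in quantifier, not part of the commit:

    from sklearn.linear_model import LogisticRegression
    import quapy as qp

    collection = qp.datasets.fetch_UCILabelledCollection('yeast')
    for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1):
        model = qp.method.aggregative.ACC(LogisticRegression())
        model.fit(data.training)
        print(f'{data.name}: estimated prevalence = {model.quantify(data.test.instances)}')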
@@ -1,7 +1,12 @@
+def warn(*args, **kwargs):
+    pass
+import warnings
+warnings.warn = warn
 import os
 import zipfile
 from os.path import join
 from urllib.error import HTTPError
+from sklearn.model_selection import StratifiedKFold

 import pandas as pd
@@ -17,6 +22,29 @@ TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
 TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
                                     'semeval', 'semeval16',
                                     'sst', 'wa', 'wb']
+UCI_DATASETS = ['acute.a', 'acute.b',
+                'balance.1', 'balance.2', 'balance.3',
+                'breast-cancer',
+                'cmc.1', 'cmc.2', 'cmc.3',
+                'ctg.1', 'ctg.2', 'ctg.3',
+                #'diabetes', # <-- I haven't found this one...
+                'german',
+                'haberman',
+                'ionosphere',
+                'iris.1', 'iris.2', 'iris.3',
+                'mammographic',
+                'pageblocks.5',
+                #'phoneme', # <-- I haven't found this one...
+                'semeion',
+                'sonar',
+                'spambase',
+                'spectf',
+                'tictactoe',
+                'transfusion',
+                'wdbc',
+                'wine.1', 'wine.2', 'wine.3',
+                'wine-q-red', 'wine-q-white',
+                'yeast']


 def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
@@ -134,27 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
     return data


-UCI_DATASETS = ['acute.a', 'acute.b',
-                'balance.1', 'balance.2', 'balance.3',
-                'breast-cancer',
-                'cmc.1', 'cmc.2', 'cmc.3',
-                'ctg.1', 'ctg.2', 'ctg.3',
-                #'diabetes', # <-- I haven't found this one...
-                'german',
-                'haberman',
-                'ionosphere',
-                'iris.1', 'iris.2', 'iris.3',
-                'mammographic',
-                'pageblocks.5',
-                #'phoneme', # <-- I haven't found this one...
-                'semeion',
-                'sonar',
-                'spambase',
-                'spectf',
-                'tictactoe',
-                'transfusion'] # ongoing...
-
-def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
+def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
+    data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
+    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+
+
+def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):
     assert dataset_name in UCI_DATASETS, \
         f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
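After this refactor, fetch_UCIDataset is a thin wrapper: fetch_UCILabelledCollection loads the full dataset as a LabelledCollection, and fetch_UCIDataset stratified-splits it into a Dataset. A minimal sketch of the two entry points, using the 'transfusion' name from the UCI_DATASETS registry above:

    import quapy as qp

    # the whole dataset, e.g. for the kFCV protocol added in this commit
    collection = qp.datasets.fetch_UCILabelledCollection('transfusion')

    # a pre-split Dataset (70/30 stratified split by default)
    dataset = qp.datasets.fetch_UCIDataset('transfusion', test_split=0.3)
    dataset.training.stats()
    dataset.test.stats()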
@@ -188,7 +201,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         'spambase': 'Spambase Data Set',
         'spectf': 'SPECTF Heart Data',
         'tictactoe': 'Tic-Tac-Toe Endgame Database',
-        'transfusion': 'Blood Transfusion Service Center Data Set '
+        'transfusion': 'Blood Transfusion Service Center Data Set',
+        'wdbc': 'Wisconsin Diagnostic Breast Cancer',
+        'wine.1': 'Wine Recognition Data (1)',
+        'wine.2': 'Wine Recognition Data (2)',
+        'wine.3': 'Wine Recognition Data (3)',
+        'wine-q-red': 'Wine Quality Red (6-10)',
+        'wine-q-white': 'Wine Quality White (6-10)',
+        'yeast': 'Yeast',
     }

     # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
@@ -219,7 +239,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         'spambase': 'spambase',
         'spectf': 'spect',
         'tictactoe': 'tic-tac-toe',
-        'transfusion': 'blood-transfusion'
+        'transfusion': 'blood-transfusion',
+        'wdbc': 'breast-cancer-wisconsin',
+        'wine-q-red': 'wine-quality',
+        'wine-q-white': 'wine-quality',
+        'wine.1': 'wine',
+        'wine.2': 'wine',
+        'wine.3': 'wine',
+        'yeast': 'yeast',
     }

     # the filename is the name of the file within the data_folder indexed by the identifier
@@ -231,7 +258,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         'page-blocks': 'page-blocks.data.Z',
         'undocumented/connectionist-bench/sonar': 'sonar.all-data',
         'spect': ['SPECTF.train', 'SPECTF.test'],
-        'blood-transfusion': 'transfusion.data'
+        'blood-transfusion': 'transfusion.data',
+        'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
+        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
     }

     # the filename containing the dataset description (if any)
@@ -242,7 +271,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         'mammographic-masses': 'mammographic_masses.names',
         'undocumented/connectionist-bench/sonar': 'sonar.names',
         'spect': 'SPECTF.names',
-        'blood-transfusion': 'transfusion.names'
+        'blood-transfusion': 'transfusion.names',
+        'wine-quality': 'winequality.names',
+        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
     }

     identifier = identifier_map[dataset_name]
@@ -269,16 +300,15 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         print(f'Loading {dataset_name} ({fullname})')
     if identifier == 'acute':
         df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')

+        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
+        [df_replace(df, col) for col in range(1, 6)]
+        X = df.loc[:, 0:5].values
         if dataset_name == 'acute.a':
             y = binarize(df[6], pos_class='yes')
         elif dataset_name == 'acute.b':
             y = binarize(df[7], pos_class='yes')

-        mintemp, maxtemp = 35, 42
-        df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False)
-        [df_replace(df, col) for col in range(1, 6)]
-        X = df.loc[:, 0:5].values

     if identifier == 'balance-scale':
         df = pd.read_csv(data_path, header=None, sep=',')
         if dataset_name == 'balance.1':
@@ -289,14 +319,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
             y = binarize(df[0], pos_class='R')
         X = df.loc[:, 1:].astype(float).values

-    if identifier == 'breast-cancer-wisconsin':
+    if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
         df = pd.read_csv(data_path, header=None, sep=',')
         Xy = df.loc[:, 1:10]
         Xy[Xy=='?']=np.nan
         Xy = Xy.dropna(axis=0)
         X = Xy.loc[:, 1:9]
         X = X.astype(float).values
-        y = binarize(Xy[10], pos_class=4)
+        y = binarize(Xy[10], pos_class=2)
+
+    if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.loc[:, 2:32].astype(float).values
+        y = df[1].values
+        y = binarize(y, pos_class='M')

     if identifier == 'cmc':
         df = pd.read_csv(data_path, header=None, sep=',')
@@ -356,8 +392,8 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3

     if identifier == 'mammographic-masses':
         df = pd.read_csv(data_path, header=None, sep=',')
-        Xy[df == '?'] = np.nan
-        Xy = Xy.dropna(axis=0)
+        df[df == '?'] = np.nan
+        Xy = df.dropna(axis=0)
         X = Xy.iloc[:, 0:5]
         X = X.astype(float).values
         y = binarize(Xy.iloc[:,5], pos_class=1)
@@ -397,7 +433,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         dfs = []
         for file in filename:
             data_path = join(data_dir, file)
-            download_file_if_not_exists(f'{URL}/{filename}', data_path)
+            download_file_if_not_exists(f'{URL}/{file}', data_path)
             dfs.append(pd.read_csv(data_path, header=None, sep=','))
         df = pd.concat(dfs)
         X = df.iloc[:, 1:45].astype(float).values
@@ -416,9 +452,34 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         y = df.iloc[:, 4].values
         y = binarize(y, pos_class=1)

+    if identifier == 'wine':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 1:14].astype(float).values
+        y = df[0].values
+        if dataset_name == 'wine.1':
+            y = binarize(y, pos_class=1)
+        elif dataset_name == 'wine.2':
+            y = binarize(y, pos_class=2)
+        elif dataset_name == 'wine.3':
+            y = binarize(y, pos_class=3)
+
+    if identifier == 'wine-quality':
+        filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
+        data_path = join(data_dir, filename)
+        download_file_if_not_exists(f'{URL}/{filename}', data_path)
+        df = pd.read_csv(data_path, sep=';')
+        X = df.iloc[:, 0:11].astype(float).values
+        y = df.iloc[:, 11].values > 5
+
+    if identifier == 'yeast':
+        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
+        X = df.iloc[:, 1:9].astype(float).values
+        y = df.iloc[:, 9].values
+        y = binarize(y, pos_class='NUC')
+
     data = LabelledCollection(X, y)
     data.stats()
-    return Dataset(*data.split_stratified(1-test_split, random_state=0))
+    return data


 def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
@@ -94,3 +94,4 @@ def binarize(y, pos_class):
     ybin = np.zeros(y.shape, dtype=np.int)
     ybin[y == pos_class] = 1
     return ybin
+
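For context, binarize is how the new UCI loaders derive their binary targets: it maps a label vector onto {0, 1} given a single positive class. For example:

    import numpy as np

    y = np.array(['NUC', 'MIT', 'NUC', 'CYT'])
    binarize(y, pos_class='NUC')   # -> array([1, 0, 1, 0])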
@@ -9,7 +9,7 @@ from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier
 from quapy.util import temp_seed
 import quapy.functional as F
+import pandas as pd


 def artificial_sampling_prediction(
         model: BaseQuantifier,
@@ -62,9 +62,6 @@ def artificial_sampling_prediction(

     pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
     results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs)
-    # results = Parallel(n_jobs=n_jobs)(
-    #     delayed(_predict_prevalences)(index) for index in pbar
-    # )

     true_prevalences, estim_prevalences = zip(*results)
     true_prevalences = np.asarray(true_prevalences)
@@ -73,13 +70,65 @@ def artificial_sampling_prediction(
     return true_prevalences, estim_prevalences


+def artificial_sampling_report(
+        model: BaseQuantifier,
+        test: LabelledCollection,
+        sample_size,
+        n_prevpoints=210,
+        n_repetitions=1,
+        n_jobs=1,
+        random_seed=42,
+        error_metrics:Iterable[Union[str,Callable]]='mae',
+        verbose=True):
+
+    if isinstance(error_metrics, str):
+        error_metrics=[error_metrics]
+
+    error_names = [e if isinstance(e, str) else e.__name__ for e in error_metrics]
+    error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics]
+    assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions'
+
+    df = pd.DataFrame(columns=['true-prev', 'estim-prev']+error_names)
+    true_prevs, estim_prevs = artificial_sampling_prediction(
+        model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
+    )
+    for true_prev, estim_prev in zip(true_prevs, estim_prevs):
+        series = {'true-prev': true_prev, 'estim-prev': estim_prev}
+        for error_name, error_metric in zip(error_names, error_funcs):
+            score = error_metric(true_prev, estim_prev)
+            series[error_name] = score
+        df = df.append(series, ignore_index=True)
+
+    return df
+
+
+def artificial_sampling_eval(
+        model: BaseQuantifier,
+        test: LabelledCollection,
+        sample_size,
+        n_prevpoints=210,
+        n_repetitions=1,
+        n_jobs=1,
+        random_seed=42,
+        error_metric:Union[str,Callable]='mae',
+        verbose=True):
+
+    if isinstance(error_metric, str):
+        error_metric = qp.error.from_name(error_metric)
+
+    assert hasattr(error_metric, '__call__'), 'invalid error function'
+
+    true_prevs, estim_prevs = artificial_sampling_prediction(
+        model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
+    )
+
+    return error_metric(true_prevs, estim_prevs)
+
+
 def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1):
     if isinstance(err, str):
         err = qp.error.from_name(err)
     scores = qp.util.parallel(_delayed_eval, ((model, Ti, err) for Ti in test_samples), n_jobs=n_jobs)
-    # scores = Parallel(n_jobs=n_jobs)(
-    #     delayed(_delayed_eval)(model, Ti, err) for Ti in test_samples
-    # )
     return np.mean(scores)
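The quantification report announced in the commit message is artificial_sampling_report above: it returns one DataFrame row per artificially sampled test prevalence, holding the true prevalence, the estimated prevalence, and one column per requested error metric. A minimal usage sketch; PACC over LogisticRegression is an arbitrary stand-in quantifier:

    import quapy as qp
    from sklearn.linear_model import LogisticRegression

    dataset = qp.datasets.fetch_UCIDataset('transfusion')
    model = qp.method.aggregative.PACC(LogisticRegression())
    model.fit(dataset.training)
    report = qp.evaluation.artificial_sampling_report(
        model, dataset.test, sample_size=100, n_prevpoints=21,
        error_metrics=['mae', 'mrae'])
    print(report)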
@@ -38,7 +38,7 @@ class Ensemble(BaseQuantifier):
                  quantifier: BaseQuantifier,
                  size=50,
                  red_size=25,
-                 min_pos=1,
+                 min_pos=5,
                  policy='ave',
                  max_sample_size=None,
                  val_split=None,
@@ -88,15 +88,8 @@ class Ensemble(BaseQuantifier):
         )
         self.ensemble = qp.util.parallel(
             _delayed_new_instance,
-            tqdm(args, desc='fitting ensamble', total=self.size),
+            tqdm(args, desc='fitting ensamble', total=self.size) if self.verbose else args,
             n_jobs=self.n_jobs)
-        # self.ensemble = Parallel(n_jobs=self.n_jobs)(
-        #     delayed(_delayed_new_instance)(
-        #         self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy,
-        #         verbose=self.verbose, sample_size=sample_size
-        #     ) for prev in tqdm(prevs, desc='fitting ensamble')
-        # )

         # static selection policy (the name of a quantification-oriented error function to minimize)
         if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES:
@@ -109,9 +102,6 @@ class Ensemble(BaseQuantifier):
         predictions = np.asarray(
             qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs)
         )
-        # predictions = np.asarray(Parallel(n_jobs=self.n_jobs)(
-        #     delayed(_delayed_quantify)(Qi, instances) for Qi in self.ensemble
-        # ))

         if self.policy == 'ptr':
             predictions = self.ptr_policy(predictions)
@@ -143,7 +133,7 @@ class Ensemble(BaseQuantifier):
             scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs))
         order = np.argsort(scores)

-        self.ensemble = select_k(self.ensemble, order, k=self.red_size)
+        self.ensemble = _select_k(self.ensemble, order, k=self.red_size)

     def ptr_policy(self, predictions):
         """
@@ -154,7 +144,7 @@ class Ensemble(BaseQuantifier):
         tr_prevs = [m[1] for m in self.ensemble]
         ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs]
         order = np.argsort(ptr_differences)
-        return select_k(predictions, order, k=self.red_size)
+        return _select_k(predictions, order, k=self.red_size)

     def ds_policy_get_posteriors(self, data: LabelledCollection):
         """
@@ -192,7 +182,7 @@ class Ensemble(BaseQuantifier):
         tr_distributions = [m[2] for m in self.ensemble]
         dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions]
         order = np.argsort(dist)
-        return select_k(predictions, order, k=self.red_size)
+        return _select_k(predictions, order, k=self.red_size)

     @property
     def binary(self):
@@ -201,13 +191,10 @@ class Ensemble(BaseQuantifier):
     @property
     def aggregative(self):
         return False
-        #raise NotImplementedError('aggregative functionality not yet supported for Ensemble')

     @property
     def probabilistic(self):
         return False
-        #raise NotImplementedError('probabilistic functionality not yet supported for Ensemble')
-        #return self.base_quantifier.probabilistic


 def get_probability_distribution(posterior_probabilities, bins=8):
@@ -217,7 +204,7 @@ def get_probability_distribution(posterior_probabilities, bins=8):
     return distribution


-def select_k(elements, order, k):
+def _select_k(elements, order, k):
     return [elements[idx] for idx in order[:k]]
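The helper renamed to _select_k above (now private, since it only serves the selection policies) reorders a list and keeps the first k entries:

    import numpy as np

    scores = [0.30, 0.10, 0.20]
    order = np.argsort(scores)              # -> array([1, 2, 0])
    _select_k(['a', 'b', 'c'], order, k=2)  # -> ['b', 'c']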
test.py (39 changed lines)
@@ -8,15 +8,48 @@ import numpy as np

 from NewMethods.methods import AveragePoolQuantification
 from classification.methods import PCALR
-from classification.neural import NeuralClassifierTrainer, CNNnet
+from data import Dataset
 from method.meta import EPACC
 from quapy.model_selection import GridSearchQ
+from tqdm import tqdm
+import pandas as pd
+
+sample_size=100
+qp.environ['SAMPLE_SIZE'] = sample_size
+
+np.random.seed(0)
+
+nfolds=5
+nrepeats=1
+
+df = pd.DataFrame(columns=['dataset', 'method', 'mse'])
+for datasetname in qp.datasets.UCI_DATASETS[2:]:
+    collection = qp.datasets.fetch_UCILabelledCollection(datasetname, verbose=False)
+    scores = []
+    pbar = tqdm(Dataset.kFCV(collection, nfolds=nfolds, nrepeats=nrepeats), total=nfolds*nrepeats)
+    for data in pbar:
+        pbar.set_description(f'{data.name}')
+        # learner = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid={'C': np.logspace(-3,3,7)}, n_jobs=-1)
+        learner = LogisticRegression(class_weight='balanced')
+        # model = qp.method.aggregative.CC(learner)
+        model = qp.method.meta.EHDy(learner, size=30, red_size=15, verbose=False)
+        model.fit(data.training)
+        err = qp.evaluation.artificial_sampling_eval(model, data.test, sample_size, n_prevpoints=101, n_jobs=-1,
+                                                     error_metric='mse', verbose=False)
+        scores.append(err)
+
+    score = np.mean(scores)
+    df = df.append({
+        'dataset': datasetname,
+        'method': model.__class__.__name__,
+        'mse': score
+    }, ignore_index=True)
+    print(df)

-dataset = qp.datasets.fetch_UCIDataset('transfusion', verbose=True)
 sys.exit(0)


-qp.environ['SAMPLE_SIZE'] = 500
 #param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
 param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
 max_evaluations = 500