forked from moreo/QuaPy
refactor: methods requiring a val_split can now declare a default value in __init__, which is used when fit is called without specifying val_split (the fit argument now defaults to None, i.e., it falls back to the value given in __init__, which is generally set to 0.4); some UCI datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size
This commit is contained in:
parent
54dc2980e6
commit
03cf73aff6
33
README.md
33
README.md
|
@ -1,3 +1,34 @@
|
|||
# QuaPy
|
||||
|
||||
A Quantification framework written in Python.
|
||||
QuaPy is an open source framework for Quantification (a.k.a. Supervised Prevalence Estimation)
|
||||
written in Python.
|
||||
|
||||
QuaPy is rooted in the concept of a data sample, and provides implementations of
|
||||
the most important concepts in the quantification literature, such as the most important
|
||||
quantification baselines, many advanced quantification methods,
|
||||
quantification-oriented model selection, many evaluation measures and protocols
|
||||
used for evaluating quantification methods.
|
||||
QuaPy also integrates commonly used datasets and offers visualization tools
|
||||
for facilitating the analysis and interpretation of results.
|
||||
|
||||
```python
|
||||
import quapy as qp
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
dataset = qp.datasets.fetch_twitter('semeval16')
|
||||
|
||||
# create an "Adjusted Classify & Count" quantifier
|
||||
model = qp.method.aggregative.ACC(LogisticRegression())
|
||||
model.fit(dataset.training)
|
||||
|
||||
prevalences_estim = model.quantify(dataset.test.instances)
|
||||
prevalences_true = dataset.test.prevalence()
|
||||
|
||||
error = qp.error.mae(prevalences_true, prevalences_estim)
|
||||
|
||||
print(f'MAE={error:.3f}')
|
||||
```
|
||||
|
||||
binary, and single-label
|
||||
|
||||
|
||||
|
|
1
TODO.txt
1
TODO.txt
|
@ -25,3 +25,4 @@ Rename EMQ to SLD ?
|
|||
How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
|
||||
to one always?
|
||||
Parallelize the kFCV in ACC and PACC
|
||||
Requirements: xlrd for reading excel
|
|
@ -20,49 +20,64 @@ import shutil
|
|||
DEBUG = False
|
||||
|
||||
|
||||
def quantification_models():
|
||||
def newLR():
|
||||
return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
|
||||
|
||||
__C_range = np.logspace(-4, 5, 10)
|
||||
lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
|
||||
svmperf_params = {'C': __C_range}
|
||||
|
||||
def quantification_models():
|
||||
# methods tested in Gao & Sebastiani 2016
|
||||
# yield 'cc', CC(newLR()), lr_params
|
||||
# yield 'acc', ACC(newLR()), lr_params
|
||||
# yield 'pcc', PCC(newLR()), lr_params
|
||||
# yield 'pacc', PACC(newLR()), lr_params
|
||||
# yield 'sld', EMQ(newLR()), lr_params
|
||||
# yield 'svmq', OneVsAll(SVMQ(args.svmperfpath)), svmperf_params
|
||||
# yield 'svmkld', OneVsAll(SVMKLD(args.svmperfpath)), svmperf_params
|
||||
# yield 'svmnkld', OneVsAll(SVMNKLD(args.svmperfpath)), svmperf_params
|
||||
#
|
||||
# # methods added
|
||||
# yield 'svmmae', OneVsAll(SVMAE(args.svmperfpath)), svmperf_params
|
||||
# yield 'svmmrae', OneVsAll(SVMRAE(args.svmperfpath)), svmperf_params
|
||||
# yield 'hdy', OneVsAll(HDy(newLR())), lr_params
|
||||
yield 'cc', CC(newLR()), lr_params
|
||||
yield 'acc', ACC(newLR()), lr_params
|
||||
yield 'pcc', PCC(newLR()), lr_params
|
||||
yield 'pacc', PACC(newLR()), lr_params
|
||||
yield 'sld', EMQ(newLR()), lr_params
|
||||
yield 'svmq', OneVsAll(SVMQ(args.svmperfpath)), svmperf_params
|
||||
yield 'svmkld', OneVsAll(SVMKLD(args.svmperfpath)), svmperf_params
|
||||
yield 'svmnkld', OneVsAll(SVMNKLD(args.svmperfpath)), svmperf_params
|
||||
|
||||
# methods added
|
||||
yield 'svmmae', OneVsAll(SVMAE(args.svmperfpath)), svmperf_params
|
||||
yield 'svmmrae', OneVsAll(SVMRAE(args.svmperfpath)), svmperf_params
|
||||
yield 'hdy', OneVsAll(HDy(newLR())), lr_params
|
||||
|
||||
|
||||
def quantification_cuda_models():
|
||||
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
print(f'Running QuaNet in {device}')
|
||||
if DEBUG:
|
||||
lr_params={'C':[1,10]}
|
||||
yield 'quanet', QuaNet(PCALR(**newLR().get_params()), settings.SAMPLE_SIZE,
|
||||
lstm_hidden_size=32, lstm_nlayers=1,
|
||||
tr_iter_per_poch=50, va_iter_per_poch=10,
|
||||
patience=3,
|
||||
checkpointdir=args.checkpointdir, device=device), lr_params
|
||||
else:
|
||||
yield 'quanet', QuaNet(PCALR(**newLR().get_params()), settings.SAMPLE_SIZE,
|
||||
checkpointdir=args.checkpointdir, device=device), lr_params
|
||||
learner = PCALR(**newLR().get_params())
|
||||
yield 'quanet', QuaNet(learner, settings.SAMPLE_SIZE, checkpointdir=args.checkpointdir, device=device), lr_params
|
||||
|
||||
|
||||
#param_mod_sel={'sample_size':settings.SAMPLE_SIZE, 'n_prevpoints':21, 'n_repetitions':5}
|
||||
#yield 'epaccmaeptr', EPACC(newLR(), param_grid=lr_params, optim='mae', policy='ptr', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None
|
||||
# yield 'epaccmraeptr', EPACC(newLR(), param_grid=lr_params, optim='mrae', policy='ptr', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None
|
||||
# yield 'epaccmae', EPACC(newLR(), param_grid=lr_params, optim='mae', policy='mae', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None
|
||||
# yield 'epaccmrae', EPACC(newLR(), param_grid=lr_params, optim='mrae', policy='mrae', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None
|
||||
def quantification_ensembles():
|
||||
param_mod_sel = {
|
||||
'sample_size': settings.SAMPLE_SIZE,
|
||||
'n_prevpoints': 21,
|
||||
'n_repetitions': 5,
|
||||
'verbose': False
|
||||
}
|
||||
common={
|
||||
'max_sample_size': 500,
|
||||
'n_jobs': settings.ENSEMBLE_N_JOBS,
|
||||
'param_grid': lr_params,
|
||||
'param_mod_sel': param_mod_sel,
|
||||
'val_split': 0.4
|
||||
}
|
||||
|
||||
#yield 'mlpe', MaximumLikelihoodPrevalenceEstimation(), {}
|
||||
# hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
|
||||
# will be skipped (by setting hyperparameters to None)
|
||||
hyper_none = None
|
||||
yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
|
||||
yield 'epaccmaemae', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
|
||||
yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none
|
||||
yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none
|
||||
|
||||
yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
|
||||
yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
|
||||
yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none
|
||||
yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none
|
||||
|
||||
|
||||
def evaluate_experiment(true_prevalences, estim_prevalences):
|
||||
|
@ -119,10 +134,7 @@ def run(experiment):
|
|||
benchmark_devel.stats()
|
||||
|
||||
# model selection (hyperparameter optimization for a quantification-oriented loss)
|
||||
if hyperparams is None:
|
||||
model.fit(benchmark_devel.training, benchmark_devel.test)
|
||||
best_params = {}
|
||||
else:
|
||||
if hyperparams is not None:
|
||||
model_selection = qp.model_selection.GridSearchQ(
|
||||
model,
|
||||
param_grid=hyperparams,
|
||||
|
@ -137,6 +149,8 @@ def run(experiment):
|
|||
model_selection.fit(benchmark_devel.training, benchmark_devel.test)
|
||||
model = model_selection.best_model()
|
||||
best_params = model_selection.best_params_
|
||||
else:
|
||||
best_params = {}
|
||||
|
||||
# model evaluation
|
||||
test_names = [dataset_name] if dataset_name != 'semeval' else ['semeval13', 'semeval14', 'semeval15']
|
||||
|
@ -183,9 +197,19 @@ if __name__ == '__main__':
|
|||
|
||||
optim_losses = ['mae'] # ['mae', 'mrae']
|
||||
datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
|
||||
models = quantification_models()
|
||||
|
||||
results = Parallel(n_jobs=settings.N_JOBS)(
|
||||
#models = quantification_models()
|
||||
#Parallel(n_jobs=settings.N_JOBS)(
|
||||
# delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
|
||||
#)
|
||||
|
||||
#models = quantification_cuda_models()
|
||||
#Parallel(n_jobs=settings.CUDA_N_JOBS)(
|
||||
# delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
|
||||
#)
|
||||
|
||||
models = quantification_ensembles()
|
||||
Parallel(n_jobs=1)(
|
||||
delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
|
||||
)
|
||||
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import multiprocessing
|
||||
|
||||
N_JOBS = 1 #multiprocessing.cpu_count()
|
||||
N_JOBS = -2 #multiprocessing.cpu_count()
|
||||
CUDA_N_JOBS = 1
|
||||
ENSEMBLE_N_JOBS = -2
|
||||
|
||||
SAMPLE_SIZE = 100
|
||||
|
||||
assert N_JOBS==1 or ENSEMBLE_N_JOBS==1, 'general N_JOBS and ENSEMBLE_N_JOBS should not be both greater than 1'
|
|
@ -92,10 +92,10 @@ class LabelledCollection:
|
|||
labels = self.labels[index]
|
||||
return LabelledCollection(documents, labels, n_classes=self.n_classes)
|
||||
|
||||
def split_stratified(self, train_prop=0.6):
|
||||
def split_stratified(self, train_prop=0.6, random_state=None):
|
||||
# with temp_seed(42):
|
||||
tr_docs, te_docs, tr_labels, te_labels = \
|
||||
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels)
|
||||
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state)
|
||||
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
|
||||
|
||||
def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import os
|
||||
import zipfile
|
||||
from os.path import join
|
||||
from urllib.error import HTTPError
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
@ -137,9 +138,11 @@ UCI_DATASETS = ['acute.a', 'acute.b',
|
|||
'balance.1', 'balance.2', 'balance.3',
|
||||
'breast-cancer',
|
||||
'cmc.1', 'cmc.2', 'cmc.3',
|
||||
'ctg.1', 'ctg.2', 'ctg.3'] # ongoing...
|
||||
'ctg.1', 'ctg.2', 'ctg.3',
|
||||
#'diabetes', # <-- I haven't found this one...
|
||||
'german'] # ongoing...
|
||||
|
||||
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
||||
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
|
||||
|
||||
assert dataset_name in UCI_DATASETS, \
|
||||
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
|
||||
|
@ -147,22 +150,6 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
|||
if data_home is None:
|
||||
data_home = get_quapy_home()
|
||||
|
||||
identifier_map = {
|
||||
'acute.a': 'acute',
|
||||
'acute.b': 'acute',
|
||||
'balance.1': 'balance-scale',
|
||||
'balance.2': 'balance-scale',
|
||||
'balance.3': 'balance-scale',
|
||||
'breast-cancer': 'breast-cancer-wisconsin',
|
||||
'cmc.1': 'cmc',
|
||||
'cmc.2': 'cmc',
|
||||
'cmc.3': 'cmc',
|
||||
'ctg.1': 'ctg',
|
||||
'ctg.2': 'ctg',
|
||||
'ctg.3': 'ctg',
|
||||
|
||||
}
|
||||
|
||||
dataset_fullname = {
|
||||
'acute.a': 'Acute Inflammations (urinary bladder)',
|
||||
'acute.b': 'Acute Inflammations (renal pelvis)',
|
||||
|
@ -176,27 +163,64 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
|||
'ctg.1': 'Cardiotocography Data Set (normal)',
|
||||
'ctg.2': 'Cardiotocography Data Set (suspect)',
|
||||
'ctg.3': 'Cardiotocography Data Set (pathologic)',
|
||||
'german': 'Statlog German Credit Data',
|
||||
}
|
||||
|
||||
data_folder = {
|
||||
'acute': 'diagnosis',
|
||||
'balance-scale': 'balance-scale',
|
||||
'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
|
||||
'cmc': 'cmc'
|
||||
# the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
|
||||
# to download the raw dataset
|
||||
identifier_map = {
|
||||
'acute.a': 'acute',
|
||||
'acute.b': 'acute',
|
||||
'balance.1': 'balance-scale',
|
||||
'balance.2': 'balance-scale',
|
||||
'balance.3': 'balance-scale',
|
||||
'breast-cancer': 'breast-cancer-wisconsin',
|
||||
'cmc.1': 'cmc',
|
||||
'cmc.2': 'cmc',
|
||||
'cmc.3': 'cmc',
|
||||
'ctg.1': '00193',
|
||||
'ctg.2': '00193',
|
||||
'ctg.3': '00193',
|
||||
'german': 'statlog/german'
|
||||
}
|
||||
|
||||
# the filename is the name of the file within the data_folder indexed by the identifier
|
||||
file_name = {
|
||||
'acute': 'diagnosis.data',
|
||||
'balance-scale': 'balance-scale.data',
|
||||
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data',
|
||||
'cmc': 'cmc.data',
|
||||
'00193': 'CTG.xls',
|
||||
'statlog/german': 'german.data-numeric'
|
||||
}
|
||||
|
||||
# the filename containing the dataset description (if any)
|
||||
desc_name = {
|
||||
'acute': 'diagnosis.names',
|
||||
'balance-scale': 'balance-scale.names',
|
||||
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names',
|
||||
'cmc': 'cmc.names',
|
||||
'00193': None,
|
||||
'statlog/german': 'german.doc'
|
||||
}
|
||||
|
||||
identifier = identifier_map[dataset_name]
|
||||
URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
|
||||
data_path = join(data_home, 'uci_datasets', identifier)
|
||||
download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.data', f'{data_path}/{identifier}.data')
|
||||
download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.names', f'{data_path}/{identifier}.names')
|
||||
data_dir = join(data_home, 'uci_datasets', identifier)
|
||||
data_path = join(data_dir, file_name[identifier])
|
||||
download_file_if_not_exists(f'{URL}/{file_name[identifier]}', data_path)
|
||||
|
||||
descfile = desc_name[identifier]
|
||||
if descfile:
|
||||
download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
|
||||
if verbose:
|
||||
print(open(f'{data_path}/{identifier}.names', 'rt').read())
|
||||
print(open(f'{data_dir}/{descfile}', 'rt').read())
|
||||
elif verbose:
|
||||
print('no file description available')
|
||||
|
||||
print(f'Loading {dataset_name} ({dataset_fullname[dataset_name]})')
|
||||
if identifier == 'acute':
|
||||
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, encoding='utf-16', sep='\t')
|
||||
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
|
||||
if dataset_name == 'acute.a':
|
||||
y = binarize(df[6], pos_class='yes')
|
||||
elif dataset_name == 'acute.b':
|
||||
|
@ -208,7 +232,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
|||
X = df.loc[:, 0:5].values
|
||||
|
||||
if identifier == 'balance-scale':
|
||||
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
|
||||
df = pd.read_csv(data_path, header=None, sep=',')
|
||||
if dataset_name == 'balance.1':
|
||||
y = binarize(df[0], pos_class='L')
|
||||
elif dataset_name == 'balance.2':
|
||||
|
@ -218,7 +242,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
|||
X = df.loc[:, 1:].astype(float).values
|
||||
|
||||
if identifier == 'breast-cancer-wisconsin':
|
||||
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
|
||||
df = pd.read_csv(data_path, header=None, sep=',')
|
||||
Xy = df.loc[:, 1:10]
|
||||
Xy[Xy=='?']=np.nan
|
||||
Xy = Xy.dropna(axis=0)
|
||||
|
@ -227,7 +251,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
|||
y = binarize(Xy[10], pos_class=4)
|
||||
|
||||
if identifier == 'cmc':
|
||||
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
|
||||
df = pd.read_csv(data_path, header=None, sep=',')
|
||||
X = df.loc[:, 0:8].astype(float).values
|
||||
y = df[9].astype(int).values
|
||||
if dataset_name == 'cmc.1':
|
||||
|
@ -237,25 +261,32 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
|||
elif dataset_name == 'cmc.3':
|
||||
y = binarize(y, pos_class=3)
|
||||
|
||||
if identifier == '00193':
|
||||
df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
|
||||
df = df[list(range(1,24))] # select columns numbered (number 23 is the target label)
|
||||
# replaces the header with the first row
|
||||
new_header = df.iloc[0] # grab the first row for the header
|
||||
df = df[1:] # take the data less the header row
|
||||
df.columns = new_header # set the header row as the df header
|
||||
X = df.iloc[:, 0:22].astype(float).values
|
||||
y = df['NSP'].astype(int).values
|
||||
if dataset_name == 'ctg.1': # 1==Normal
|
||||
y = binarize(y, pos_class=1)
|
||||
elif dataset_name == 'ctg.2':
|
||||
y = binarize(y, pos_class=2) # 1==Suspect
|
||||
elif dataset_name == 'ctg.3':
|
||||
y = binarize(y, pos_class=3) # 1==Pathologic
|
||||
|
||||
if identifier == 'statlog/german':
|
||||
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
|
||||
X = df.iloc[:, 0:24].astype(float).values
|
||||
y = df[24].astype(int).values
|
||||
y = binarize(y, pos_class=1)
|
||||
|
||||
data = LabelledCollection(X, y)
|
||||
data.stats()
|
||||
raise NotImplementedError()
|
||||
#print(df)
|
||||
#print(df.loc[:, 0:5].values)
|
||||
#print(y)
|
||||
return Dataset(*data.split_stratified(1-test_split, random_state=0))
|
||||
|
||||
# X = __read_csv(f'{data_path}/{identifier}.data', separator='\t')
|
||||
# print(X)
|
||||
|
||||
#X, y = from_csv(f'{data_path}/{dataset_name}.data')
|
||||
#y, classnames = reindex_labels(y)
|
||||
|
||||
|
||||
#def __read_csv(path, separator=','):
|
||||
# x = []
|
||||
# for instance in tqdm(open(path, 'rt', encoding='utf-16').readlines(), desc=f'reading {path}'):
|
||||
# x.append(instance.strip().split(separator))
|
||||
# return x
|
||||
|
||||
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
||||
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
|
|
@ -60,7 +60,6 @@ def artificial_sampling_prediction(
|
|||
estim_prevalence = quantification_func(sample.instances)
|
||||
return true_prevalence, estim_prevalence
|
||||
|
||||
print('predicting')
|
||||
pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
|
||||
results = Parallel(n_jobs=n_jobs)(
|
||||
delayed(_predict_prevalences)(index) for index in pbar
|
||||
|
|
|
@ -172,10 +172,11 @@ class CC(AggregativeQuantifier):
|
|||
|
||||
class ACC(AggregativeQuantifier):
|
||||
|
||||
def __init__(self, learner:BaseEstimator):
|
||||
def __init__(self, learner:BaseEstimator, val_split=0.4):
|
||||
self.learner = learner
|
||||
self.val_split = val_split
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection]=None):
|
||||
"""
|
||||
Trains an ACC quantifier
|
||||
:param data: the training set
|
||||
|
@ -186,7 +187,8 @@ class ACC(AggregativeQuantifier):
|
|||
to estimate the parameters
|
||||
:return: self
|
||||
"""
|
||||
assert val_split is not None, 'val_split cannot be set to None'
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
if isinstance(val_split, int):
|
||||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
|
@ -256,10 +258,11 @@ class PCC(AggregativeProbabilisticQuantifier):
|
|||
|
||||
class PACC(AggregativeProbabilisticQuantifier):
|
||||
|
||||
def __init__(self, learner:BaseEstimator):
|
||||
def __init__(self, learner: BaseEstimator, val_split=0.4):
|
||||
self.learner = learner
|
||||
self.val_split = val_split
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=None):
|
||||
"""
|
||||
Trains a PACC quantifier
|
||||
:param data: the training set
|
||||
|
@ -270,7 +273,9 @@ class PACC(AggregativeProbabilisticQuantifier):
|
|||
to estimate the parameters
|
||||
:return: self
|
||||
"""
|
||||
assert val_split is not None, 'val_split cannot be set to None'
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
|
||||
if isinstance(val_split, int):
|
||||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
|
@ -374,10 +379,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
estimation based on the Hellinger distance. Information Sciences, 218:146–164.
|
||||
"""
|
||||
|
||||
def __init__(self, learner: BaseEstimator):
|
||||
def __init__(self, learner: BaseEstimator, val_split=0.4):
|
||||
self.learner = learner
|
||||
self.val_split = val_split
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.4):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=None):
|
||||
"""
|
||||
Trains a HDy quantifier
|
||||
:param data: the training set
|
||||
|
@ -387,7 +393,9 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
indicating the validation set itself
|
||||
:return: self
|
||||
"""
|
||||
assert val_split is not None, 'val_split cannot be set to None'
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
|
||||
self._check_binary(data, self.__class__.__name__)
|
||||
self.learner, validation = training_helper(
|
||||
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
|
||||
|
@ -498,7 +506,7 @@ class OneVsAll(AggregativeQuantifier):
|
|||
self.binary_quantifier = binary_quantifier
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=None):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True):
|
||||
assert not data.binary, \
|
||||
f'{self.__class__.__name__} expect non-binary data'
|
||||
assert isinstance(self.binary_quantifier, BaseQuantifier), \
|
||||
|
|
|
@ -34,16 +34,26 @@ class Ensemble(BaseQuantifier):
|
|||
Information Fusion, 45, 1-15.
|
||||
"""
|
||||
|
||||
def __init__(self, quantifier: BaseQuantifier, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, verbose=True, max_sample_size=None):
|
||||
def __init__(self,
|
||||
quantifier: BaseQuantifier,
|
||||
size=50,
|
||||
red_size=25,
|
||||
min_pos=1,
|
||||
policy='ave',
|
||||
max_sample_size=None,
|
||||
val_split=None,
|
||||
n_jobs=1,
|
||||
verbose=False):
|
||||
assert policy in Ensemble.VALID_POLICIES, \
|
||||
f'unknown policy={policy}; valid are {Ensemble.VALID_POLICIES}'
|
||||
assert max_sample_size is None or max_sample_size > 0, \
|
||||
'wrong value for max_sample_size; set to a positive number or None'
|
||||
'wrong value for max_sample_size; set it to a positive number or None'
|
||||
self.base_quantifier = quantifier
|
||||
self.size = size
|
||||
self.min_pos = min_pos
|
||||
self.red_size = red_size
|
||||
self.policy = policy
|
||||
self.val_split = val_split
|
||||
self.n_jobs = n_jobs
|
||||
self.post_proba_fn = None
|
||||
self.verbose = verbose
|
||||
|
@ -57,6 +67,8 @@ class Ensemble(BaseQuantifier):
|
|||
self.sout('Fit')
|
||||
if self.policy=='ds' and not data.binary:
|
||||
raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
|
||||
# randomly chooses the prevalences for each member of the ensemble (preventing classes with less than
|
||||
# min_pos positive examples)
|
||||
|
@ -71,7 +83,8 @@ class Ensemble(BaseQuantifier):
|
|||
sample_size = len(data) if self.max_sample_size is None else min(self.max_sample_size, len(data))
|
||||
self.ensemble = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(_delayed_new_instance)(
|
||||
self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy, verbose=self.verbose, sample_size=sample_size
|
||||
self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy,
|
||||
verbose=self.verbose, sample_size=sample_size
|
||||
) for prev in tqdm(prevs, desc='fitting ensamble')
|
||||
)
|
||||
|
||||
|
@ -206,15 +219,20 @@ def _delayed_new_instance(base_quantifier,
|
|||
if verbose:
|
||||
print(f'\tfit-start for prev {F.strprev(prev)}, sample_size={sample_size}')
|
||||
model = deepcopy(base_quantifier)
|
||||
sample_index = data.sampling_index(sample_size, *prev)
|
||||
sample = data.sampling_from_index(sample_index)
|
||||
if val_split is None:
|
||||
model.fit(sample)
|
||||
else:
|
||||
|
||||
if val_split is not None:
|
||||
if isinstance(val_split, float):
|
||||
assert 0 < val_split < 1, 'val_split should be in (0,1)'
|
||||
sample, val_split = sample.split_stratified(train_prop=1-val_split)
|
||||
data, val_split = data.split_stratified(train_prop=1-val_split)
|
||||
|
||||
sample_index = data.sampling_index(sample_size, *prev)
|
||||
sample = data.sampling_from_index(sample_index)
|
||||
|
||||
if val_split is not None:
|
||||
model.fit(sample, val_split=val_split)
|
||||
else:
|
||||
model.fit(sample)
|
||||
|
||||
tr_prevalence = sample.prevalence()
|
||||
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
|
||||
if verbose:
|
||||
|
@ -281,35 +299,31 @@ def _check_error(error):
|
|||
f'the name of an error function in {qp.error.ERROR_NAMES}')
|
||||
|
||||
|
||||
def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None,
|
||||
param_model_sel:dict=None,
|
||||
size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None):
|
||||
def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel:dict=None, **kwargs):
|
||||
if optim is not None:
|
||||
if param_grid is None:
|
||||
raise ValueError(f'param_grid is None but optim was requested.')
|
||||
if param_model_sel is None:
|
||||
raise ValueError(f'param_model_sel is None but optim was requested.')
|
||||
error = _check_error(optim)
|
||||
return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel,
|
||||
size=size, min_pos=min_pos, red_size=red_size,
|
||||
policy=policy, n_jobs=n_jobs, max_sample_size=max_sample_size)
|
||||
return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
|
||||
|
||||
|
||||
def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None):
|
||||
return ensembleFactory(learner, CC, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size)
|
||||
def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
|
||||
return ensembleFactory(learner, CC, param_grid, optim, param_mod_sel, **kwargs)
|
||||
|
||||
|
||||
def EACC(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None):
|
||||
return ensembleFactory(learner, ACC, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size)
|
||||
def EACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
|
||||
return ensembleFactory(learner, ACC, param_grid, optim, param_mod_sel, **kwargs)
|
||||
|
||||
|
||||
def EPACC(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None):
|
||||
return ensembleFactory(learner, PACC, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size)
|
||||
def EPACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
|
||||
return ensembleFactory(learner, PACC, param_grid, optim, param_mod_sel, **kwargs)
|
||||
|
||||
|
||||
def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None):
|
||||
return ensembleFactory(learner, HDy, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size)
|
||||
def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
|
||||
return ensembleFactory(learner, HDy, param_grid, optim, param_mod_sel, **kwargs)
|
||||
|
||||
|
||||
def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None):
|
||||
return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size)
|
||||
def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
|
||||
return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs)
|
|
@ -21,6 +21,7 @@ class GridSearchQ(BaseQuantifier):
|
|||
eval_budget : int = None,
|
||||
error: Union[Callable, str] = qp.error.mae,
|
||||
refit=False,
|
||||
val_split=0.4,
|
||||
n_jobs=1,
|
||||
random_seed=42,
|
||||
timeout=-1,
|
||||
|
@ -63,6 +64,7 @@ class GridSearchQ(BaseQuantifier):
|
|||
self.n_repetitions = n_repetitions
|
||||
self.eval_budget = eval_budget
|
||||
self.refit = refit
|
||||
self.val_split = val_split
|
||||
self.n_jobs = n_jobs
|
||||
self.random_seed = random_seed
|
||||
self.timeout = timeout
|
||||
|
@ -118,12 +120,14 @@ class GridSearchQ(BaseQuantifier):
|
|||
raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
|
||||
f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')
|
||||
|
||||
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float]=0.4):
|
||||
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float]=None):
|
||||
"""
|
||||
:param training: the training set on which to optimize the hyperparameters
|
||||
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
||||
a float in [0,1] indicating the proportion of labelled data to extract from the training set
|
||||
"""
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
training, val_split = self.__check_training_validation(training, val_split)
|
||||
assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
|
||||
self.__check_num_evals(self.n_prevpoints, self.eval_budget, self.n_repetitions, training.n_classes)
|
||||
|
@ -158,7 +162,7 @@ class GridSearchQ(BaseQuantifier):
|
|||
model.fit(training)
|
||||
true_prevalences, estim_prevalences = artificial_sampling_prediction(
|
||||
model, val_split, self.sample_size, self.n_prevpoints, self.n_repetitions, n_jobs, self.random_seed,
|
||||
verbose=True
|
||||
verbose=False
|
||||
)
|
||||
|
||||
score = self.error(true_prevalences, estim_prevalences)
|
||||
|
|
37
test.py
37
test.py
|
@ -13,9 +13,7 @@ from quapy.model_selection import GridSearchQ
|
|||
|
||||
|
||||
|
||||
#qp.datasets.fetch_UCIDataset('acute.b', verbose=True)
|
||||
|
||||
#sys.exit(0)
|
||||
qp.environ['SAMPLE_SIZE'] = 500
|
||||
#param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
|
||||
param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
|
||||
|
@ -26,11 +24,12 @@ binary = False
|
|||
svmperf_home = './svm_perf_quantification'
|
||||
|
||||
if binary:
|
||||
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
|
||||
#dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
|
||||
dataset = qp.datasets.fetch_UCIDataset('german', verbose=True)
|
||||
#qp.data.preprocessing.index(dataset, inplace=True)
|
||||
|
||||
else:
|
||||
dataset = qp.datasets.fetch_twitter('gasp', for_model_selection=False, min_df=5, pickle=True)
|
||||
dataset = qp.datasets.fetch_twitter('gasp', for_model_selection=True, min_df=5, pickle=True)
|
||||
#dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3)
|
||||
|
||||
print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')
|
||||
|
@ -57,10 +56,32 @@ print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.tes
|
|||
# model = qp.method.aggregative.ClassifyAndCount(learner)
|
||||
|
||||
learner = LogisticRegression(max_iter=1000)
|
||||
model = qp.method.meta.EPACC(learner, size=10, red_size=5, max_sample_size=200)
|
||||
# param_grid={'C':[1,10,100]},
|
||||
# optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5},
|
||||
# policy='ptr', n_jobs=1)
|
||||
#model = qp.method.aggregative.PACC(learner)
|
||||
#model = qp.method.aggregative.ACC(learner)
|
||||
model = qp.method.meta.EPACC(learner, size=10, red_size=5, max_sample_size=500, n_jobs=-1,
|
||||
param_grid={'C':[1,10,100]},
|
||||
optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5, 'verbose':True},
|
||||
policy='ptr',
|
||||
val_split=0.4)
|
||||
"""
|
||||
Problemas:
|
||||
- La interfaz es muy fea, hay que conocer practicamente todos los detalles así que no ahorra nada con respecto a crear
|
||||
un objeto con otros anidados dentro
|
||||
- El fit genera las prevalences random, y esto hace que despues de la model selection, un nuevo fit tire todo el trabajo
|
||||
hecho.
|
||||
- El fit de un GridSearchQ tiene dentro un best_estimator, pero después de la model selection, hacer fit otra vez sobre
|
||||
este objeto no se limita a re-entrenar el modelo con los mejores parámetros, sino que inicia una nueva búsqueda
|
||||
en modo grid search.
|
||||
- Posible solución (no vale): sería hacer directamente model selection con el benchmark final, aunque esto haría que los hyper-
|
||||
parámetros se buscasen en un conjunto diferente del resto de models....
|
||||
- Posible solución:
|
||||
- Elegir las prevalences en init
|
||||
-
|
||||
- Problema: el parámetro val_split es muy ambiguo en todo el framework. Por ejemplo, en EPACC podría ser un float que,
|
||||
en el caso de un GridSearchQ podría referir al split de validación para los hyperparámetros o al split que usa PACC
|
||||
para encontrar los parámetros...
|
||||
"""
|
||||
|
||||
# regressor = LinearSVR(max_iter=10000)
|
||||
# param_grid = {'C': np.logspace(-1,3,5)}
|
||||
# model = AveragePoolQuantification(regressor, sample_size, trials=5000, n_components=500, zscore=False)
|
||||
|
|
Loading…
Reference in New Issue