forked from moreo/QuaPy
preparing some experiments
This commit is contained in:
commit
b941c0665e
|
@ -1,8 +1,11 @@
|
||||||
|
import argparse
|
||||||
from sklearn.calibration import CalibratedClassifierCV
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import itertools
|
||||||
from sklearn.multioutput import ClassifierChain
|
from sklearn.multioutput import ClassifierChain
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from skmultilearn.dataset import load_dataset, available_data_sets
|
||||||
|
from scipy.sparse import csr_matrix
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from MultiLabel.mlclassification import MultilabelStackedClassifier
|
from MultiLabel.mlclassification import MultilabelStackedClassifier
|
||||||
from MultiLabel.mldata import MultilabelledCollection
|
from MultiLabel.mldata import MultilabelledCollection
|
||||||
|
@ -12,7 +15,10 @@ from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC,
|
||||||
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
|
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from data.dataset import Dataset
|
from data.dataset import Dataset
|
||||||
from mlevaluation import ml_natural_prevalence_evaluation, ml_artificial_prevalence_evaluation
|
from mlevaluation import ml_natural_prevalence_prediction, ml_artificial_prevalence_prediction
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
|
||||||
def cls():
|
def cls():
|
||||||
|
@ -26,7 +32,7 @@ def calibratedCls():
|
||||||
# DEBUG=True
|
# DEBUG=True
|
||||||
|
|
||||||
# if DEBUG:
|
# if DEBUG:
|
||||||
sample_size = 250
|
sample_size = 100
|
||||||
n_samples = 5000
|
n_samples = 5000
|
||||||
|
|
||||||
|
|
||||||
|
@ -35,28 +41,29 @@ def models():
|
||||||
yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
|
yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
|
||||||
yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
|
yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
|
||||||
yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
|
yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
|
||||||
|
yield 'HDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
|
||||||
# yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls()))
|
# yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls()))
|
||||||
yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
|
# yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
|
||||||
yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
|
# yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
|
||||||
yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
|
# yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
|
||||||
yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
|
# yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
|
||||||
# yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
|
# yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
|
||||||
# yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
|
# yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
|
||||||
# yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
|
# yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
|
||||||
# yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
|
# yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
|
||||||
common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
|
common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
|
||||||
yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
|
# yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
|
||||||
yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
|
# yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
|
||||||
yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
|
# yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
|
||||||
yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
|
# yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
|
||||||
yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
|
# yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
|
||||||
yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
|
# yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
|
||||||
yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
|
# yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
|
||||||
yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
|
# yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
|
||||||
yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
# yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
||||||
yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
# yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
||||||
yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
# yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
||||||
yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
# yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
||||||
# yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
|
# yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
|
||||||
# yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
|
# yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
|
||||||
# yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
|
# yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
|
||||||
|
@ -64,43 +71,98 @@ def models():
|
||||||
|
|
||||||
|
|
||||||
# dataset = 'reuters21578'
|
# dataset = 'reuters21578'
|
||||||
# dataset = 'ohsumed'
|
|
||||||
dataset = 'jrcall'
|
|
||||||
# picklepath = '/home/moreo/word-class-embeddings/pickles'
|
# picklepath = '/home/moreo/word-class-embeddings/pickles'
|
||||||
picklepath = './pickles'
|
# data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
|
||||||
data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
|
# Xtr, Xte = data.vectorize()
|
||||||
|
# ytr = data.devel_labelmatrix.todense().getA()
|
||||||
Xtr, Xte = data.vectorize()
|
# yte = data.test_labelmatrix.todense().getA()
|
||||||
ytr = data.devel_labelmatrix.todense().getA()
|
|
||||||
yte = data.test_labelmatrix.todense().getA()
|
|
||||||
|
|
||||||
# remove categories with < 10 training documents
|
# remove categories with < 10 training documents
|
||||||
# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
|
# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
|
||||||
to_keep = np.argsort(ytr.sum(axis=0))[-10:]
|
# ytr = ytr[:, to_keep]
|
||||||
ytr = ytr[:, to_keep]
|
# yte = yte[:, to_keep]
|
||||||
yte = yte[:, to_keep]
|
# print(f'num categories = {ytr.shape[1]}')
|
||||||
print(f'num categories = {ytr.shape[1]}')
|
|
||||||
|
|
||||||
train = MultilabelledCollection(Xtr, ytr)
|
|
||||||
test = MultilabelledCollection(Xte, yte)
|
|
||||||
|
|
||||||
# print(f'Train-prev: {train.prevalence()[:,1]}')
|
def datasets():
|
||||||
print(f'Train-counts: {train.counts()}')
|
dataset_list = sorted(set([x[0] for x in available_data_sets().keys()]))
|
||||||
# print(f'Test-prev: {test.prevalence()[:,1]}')
|
for dataset_name in dataset_list:
|
||||||
print(f'Test-counts: {test.counts()}')
|
yield dataset_name
|
||||||
print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}')
|
|
||||||
|
|
||||||
fit_models = {model_name:model.fit(train) for model_name,model in tqdm(models(), 'fitting', total=6)}
|
|
||||||
|
|
||||||
print('NPP:')
|
def get_dataset(dataset_name):
|
||||||
for model_name, model in fit_models.items():
|
Xtr, ytr, feature_names, label_names = load_dataset(dataset_name, 'train')
|
||||||
err = ml_natural_prevalence_evaluation(model, test, sample_size, repeats=100)
|
Xte, yte, _, _ = load_dataset(dataset_name, 'test')
|
||||||
print(f'{model_name:10s}\tmae={err:.5f}')
|
print(f'n-labels = {len(label_names)}')
|
||||||
|
|
||||||
print('APP:')
|
Xtr = csr_matrix(Xtr)
|
||||||
for model_name, model in fit_models.items():
|
Xte = csr_matrix(Xte)
|
||||||
err = ml_artificial_prevalence_evaluation(model, test, sample_size, n_prevalences=21, repeats=10)
|
|
||||||
print(f'{model_name:10s}\tmae={err:.5f}')
|
ytr = ytr.todense().getA()
|
||||||
|
yte = yte.todense().getA()
|
||||||
|
|
||||||
|
# remove categories with 5 or fewer positives in either the training or the test split
|
||||||
|
valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
|
||||||
|
ytr = ytr[:, valid_categories]
|
||||||
|
yte = yte[:, valid_categories]
|
||||||
|
|
||||||
|
train = MultilabelledCollection(Xtr, ytr)
|
||||||
|
test = MultilabelledCollection(Xte, yte)
|
||||||
|
|
||||||
|
return train, test
|
||||||
|
|
||||||
|
|
||||||
|
def already_run(result_path):
    """Tell whether the result file at `result_path` already exists.

    Prints a "skipping" notice when it does, so the caller can simply
    return without recomputing the experiment.

    :param result_path: path of the file where the experiment results are stored
    :return: True if the path exists, False otherwise
    """
    found = os.path.exists(result_path)
    if found:
        print(f'{result_path} already computed. Skipping')
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def print_info(train, test):
    """Print a short summary of the training and test collections.

    Reports the number of documents in each split, the number of features and
    classes of the training split, the per-split counts, and the MAE between
    the training and test prevalences (labelled "MLPE").

    :param train: training collection; must support len(), expose `instances`
        and `labels` with a `shape`, and provide `counts()` and `prevalence()`
    :param test: test collection; must support len(), `counts()` and `prevalence()`
    """
    n_train_docs = len(train)
    print(f'Tr documents {n_train_docs}')
    n_test_docs = len(test)
    print(f'Te documents {n_test_docs}')

    n_features = train.instances.shape[1]
    print(f'#features {n_features}')
    n_classes = train.labels.shape[1]
    print(f'#classes {n_classes}')

    # counts are reported instead of the prevalence values themselves
    train_counts = train.counts()
    print(f'Train-counts: {train_counts}')
    test_counts = test.counts()
    print(f'Test-counts: {test_counts}')

    # error obtained by using the training prevalence as the estimate for the test set
    mlpe = qp.error.mae(train.prevalence(), test.prevalence())
    print(f'MLPE: {mlpe:.5f}')
|
||||||
|
|
||||||
|
|
||||||
|
def run_experiment(dataset_name, model_name, model):
    """Fit `model` on one dataset and store its NPP and APP predictions on disk.

    The results are pickled to `{opt.results}/{dataset_name}_{model_name}.pkl`
    as a dict with keys 'npp' and 'app'. If that file already exists the
    experiment is skipped.

    Relies on the module-level names `opt` (parsed command-line options) and
    `sample_size`.

    :param dataset_name: name of the dataset, as accepted by `get_dataset`
    :param model_name: label of the model, used to build the result file name
    :param model: the quantifier to fit; it is fitted in place via `model.fit`
    """
    result_path = f'{opt.results}/{dataset_name}_{model_name}.pkl'
    if already_run(result_path):
        return

    print(f'runing experiment {dataset_name} x {model_name}')
    train, test = get_dataset(dataset_name)

    print_info(train, test)

    model.fit(train)

    results = dict()
    results['npp'] = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
    results['app'] = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=21, repeats=10)

    # Write to a temporary file and move it into place only once the dump has
    # completed: since an existing result file makes the experiment be skipped,
    # a truncated pickle left by an interrupted run would never be recomputed.
    # The `with` block also guarantees the file handle is closed.
    tmp_path = f'{result_path}.tmp'
    with open(tmp_path, 'wb') as fout:
        pickle.dump(results, fout, pickle.HIGHEST_PROTOCOL)
    os.replace(tmp_path, result_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # command-line interface: only the output directory is configurable
    parser = argparse.ArgumentParser(description='Experiments for multi-label quantification')
    parser.add_argument(
        '--results',
        metavar='str',
        type=str,
        default='./results',
        help=f'path where to store the results',
    )
    # `opt` is read as a module-level name by run_experiment
    opt = parser.parse_args()

    os.makedirs(opt.results, exist_ok=True)

    # every dataset is paired with every model yielded by models()
    experiments = itertools.product(datasets(), models())
    for datasetname, (modelname, model) in experiments:
        run_experiment(datasetname, modelname, model)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ from quapy.functional import artificial_prevalence_sampling
|
||||||
|
|
||||||
class MultilabelledCollection:
|
class MultilabelledCollection:
|
||||||
def __init__(self, instances, labels):
|
def __init__(self, instances, labels):
|
||||||
assert labels.ndim==2, 'data does not seem to be multilabel'
|
assert labels.ndim==2, f'data does not seem to be multilabel {labels}'
|
||||||
self.instances = instances
|
self.instances = instances
|
||||||
self.labels = labels
|
self.labels = labels
|
||||||
self.classes_ = np.arange(labels.shape[1])
|
self.classes_ = np.arange(labels.shape[1])
|
||||||
|
|
|
@ -4,8 +4,42 @@ import numpy as np
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from MultiLabel.mlquantification import MLAggregativeQuantifier
|
from MultiLabel.mlquantification import MLAggregativeQuantifier
|
||||||
from mldata import MultilabelledCollection
|
from mldata import MultilabelledCollection
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
|
||||||
|
def __check_error(error_metric):
    """Resolve an error metric given either by name or as a callable.

    :param error_metric: a string, which is looked up through
        `qp.error.from_name`, or an already callable error function
    :return: the callable error function
    :raises AssertionError: if the resolved object is not callable
    """
    resolved = qp.error.from_name(error_metric) if isinstance(error_metric, str) else error_metric
    assert hasattr(resolved, '__call__'), 'invalid error function'
    return resolved
|
||||||
|
|
||||||
|
|
||||||
|
def _ml_prevalence_predictions(model,
                               test: MultilabelledCollection,
                               test_indexes):
    """Collect true and estimated prevalences for a list of test samples.

    When `model` is an `MLAggregativeQuantifier`, the test instances are
    pre-classified once through `model.preclassify` and the samples are then
    handled by `_predict_aggregative_batch`; otherwise the samples are handled
    by `_predict_quantification_batch` on the raw instances.

    :param model: the (already fitted) quantifier
    :param test: the test collection the samples are drawn from
    :param test_indexes: list of indexes, each one defining a sample of `test`
    :return: a tuple `(true_prevs, estim_prevs)` as returned by the batch function
    """
    if isinstance(model, MLAggregativeQuantifier):
        # classify the whole test set only once; samples reuse these outputs
        preclassified = model.preclassify(test.instances)
        test = MultilabelledCollection(preclassified, test.labels)
        predict_batch_fn = _predict_aggregative_batch
    else:
        predict_batch_fn = _predict_quantification_batch

    true_prevs, estim_prevs = predict_batch_fn((model, test, test_indexes))
    return true_prevs, estim_prevs
|
||||||
|
|
||||||
|
|
||||||
|
def ml_natural_prevalence_prediction(model,
                                     test:MultilabelledCollection,
                                     sample_size,
                                     repeats=100,
                                     random_seed=42):
    """Predict prevalences on samples drawn with the natural sampling generator.

    The sample indexes are produced by `test.natural_sampling_index_generator`
    inside a `qp.util.temp_seed(random_seed)` context, and are then passed to
    `_ml_prevalence_predictions`.

    :param model: the (already fitted) quantifier
    :param test: the test collection to sample from
    :param sample_size: size of each sample
    :param repeats: number of samples requested from the generator
    :param random_seed: seed handed to `qp.util.temp_seed`
    :return: the `(true_prevs, estim_prevs)` tuple from `_ml_prevalence_predictions`
    """
    with qp.util.temp_seed(random_seed):
        index_generator = test.natural_sampling_index_generator(sample_size=sample_size, repeats=repeats)
        # materialise inside the seeded context so the draws happen under the seed
        test_indexes = list(index_generator)

    return _ml_prevalence_predictions(model, test, test_indexes)
|
||||||
|
|
||||||
|
|
||||||
def ml_natural_prevalence_evaluation(model,
|
def ml_natural_prevalence_evaluation(model,
|
||||||
test:MultilabelledCollection,
|
test:MultilabelledCollection,
|
||||||
|
@ -14,23 +48,32 @@ def ml_natural_prevalence_evaluation(model,
|
||||||
error_metric:Union[str,Callable]='mae',
|
error_metric:Union[str,Callable]='mae',
|
||||||
random_seed=42):
|
random_seed=42):
|
||||||
|
|
||||||
if isinstance(error_metric, str):
|
error_metric = __check_error(error_metric)
|
||||||
error_metric = qp.error.from_name(error_metric)
|
|
||||||
|
|
||||||
assert hasattr(error_metric, '__call__'), 'invalid error function'
|
true_prevs, estim_prevs = ml_natural_prevalence_prediction(model, test, sample_size, repeats, random_seed)
|
||||||
|
|
||||||
test_batch_fn = _test_quantification_batch
|
errs = [error_metric(true_prev_i, estim_prev_i) for true_prev_i, estim_prev_i in zip(true_prevs, estim_prevs)]
|
||||||
if isinstance(model, MLAggregativeQuantifier):
|
|
||||||
test = MultilabelledCollection(model.preclassify(test.instances), test.labels)
|
|
||||||
test_batch_fn = _test_aggregation_batch
|
|
||||||
|
|
||||||
with qp.util.temp_seed(random_seed):
|
|
||||||
test_indexes = list(test.natural_sampling_index_generator(sample_size=sample_size, repeats=repeats))
|
|
||||||
|
|
||||||
errs = test_batch_fn(tuple([model, test, test_indexes, error_metric]))
|
|
||||||
return np.mean(errs)
|
return np.mean(errs)
|
||||||
|
|
||||||
|
|
||||||
|
def ml_artificial_prevalence_prediction(model,
                                        test:MultilabelledCollection,
                                        sample_size,
                                        n_prevalences=21,
                                        repeats=10,
                                        random_seed=42):
    """Predict prevalences on samples drawn with the artificial sampling generator.

    For every category in `test.classes_`, sample indexes are produced by
    `test.artificial_sampling_index_generator`; the indexes of all categories
    are gathered (in category order) into one flat list, which is then passed
    to `_ml_prevalence_predictions`. The generation takes place inside a
    `qp.util.temp_seed(random_seed)` context.

    :param model: the (already fitted) quantifier
    :param test: the test collection to sample from
    :param sample_size: size of each sample
    :param n_prevalences: forwarded to the index generator
    :param repeats: forwarded to the index generator
    :param random_seed: seed handed to `qp.util.temp_seed`
    :return: the `(true_prevs, estim_prevs)` tuple from `_ml_prevalence_predictions`
    """
    test_indexes = []
    with qp.util.temp_seed(random_seed):
        for cat in test.classes_:
            cat_indexes = test.artificial_sampling_index_generator(
                sample_size=sample_size,
                category=cat,
                n_prevalences=n_prevalences,
                repeats=repeats,
            )
            # consume the generator right away, within the seeded context
            test_indexes.extend(cat_indexes)

    return _ml_prevalence_predictions(model, test, test_indexes)
|
||||||
|
|
||||||
|
|
||||||
def ml_artificial_prevalence_evaluation(model,
|
def ml_artificial_prevalence_evaluation(model,
|
||||||
test:MultilabelledCollection,
|
test:MultilabelledCollection,
|
||||||
sample_size,
|
sample_size,
|
||||||
|
@ -39,47 +82,30 @@ def ml_artificial_prevalence_evaluation(model,
|
||||||
error_metric:Union[str,Callable]='mae',
|
error_metric:Union[str,Callable]='mae',
|
||||||
random_seed=42):
|
random_seed=42):
|
||||||
|
|
||||||
if isinstance(error_metric, str):
|
error_metric = __check_error(error_metric)
|
||||||
error_metric = qp.error.from_name(error_metric)
|
|
||||||
|
|
||||||
assert hasattr(error_metric, '__call__'), 'invalid error function'
|
true_prevs, estim_prevs = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences, repeats, random_seed)
|
||||||
|
|
||||||
test_batch_fn = _test_quantification_batch
|
errs = [error_metric(true_prev_i, estim_prev_i) for true_prev_i, estim_prev_i in zip(true_prevs, estim_prevs)]
|
||||||
if isinstance(model, MLAggregativeQuantifier):
|
return np.mean(errs)
|
||||||
test = MultilabelledCollection(model.preclassify(test.instances), test.labels)
|
|
||||||
test_batch_fn = _test_aggregation_batch
|
|
||||||
|
|
||||||
test_indexes = []
|
|
||||||
with qp.util.temp_seed(random_seed):
|
|
||||||
for cat in test.classes_:
|
|
||||||
test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size,
|
|
||||||
category=cat,
|
|
||||||
n_prevalences=n_prevalences,
|
|
||||||
repeats=repeats)))
|
|
||||||
|
|
||||||
args = [(model, test, indexes, error_metric) for indexes in test_indexes]
|
|
||||||
macro_errs = qp.util.parallel(test_batch_fn, args, n_jobs=-1)
|
|
||||||
|
|
||||||
return np.mean(macro_errs)
|
|
||||||
|
|
||||||
|
|
||||||
def _test_quantification_batch(args):
|
def _predict_quantification_batch(args):
|
||||||
model, test, indexes, error_metric = args
|
model, test, indexes = args
|
||||||
errs = []
|
return __predict_batch_fn(args, model.quantify)
|
||||||
|
|
||||||
|
|
||||||
|
def _predict_aggregative_batch(args):
    """Run the batch prediction using the model's `aggregate` method.

    :param args: a 3-element tuple `(model, test, indexes)`; the test collection
        is expected to have been pre-classified by the caller, since its
        instances are handed to `model.aggregate`
    :return: whatever `__predict_batch_fn` returns for these arguments
    """
    model, _test, _indexes = args  # unpacking also checks that args has 3 elements
    aggregate_fn = model.aggregate
    return __predict_batch_fn(args, aggregate_fn)
|
||||||
|
|
||||||
|
|
||||||
|
def __predict_batch_fn(args, quant_fn):
|
||||||
|
model, test, indexes = args
|
||||||
|
trues, estims = [], []
|
||||||
for index in indexes:
|
for index in indexes:
|
||||||
sample = test.sampling_from_index(index)
|
sample = test.sampling_from_index(index)
|
||||||
estim_prevs = model.quantify(sample.instances)
|
estims.append(quant_fn(sample.instances))
|
||||||
true_prevs = sample.prevalence()
|
trues.append(sample.prevalence())
|
||||||
errs.append(error_metric(true_prevs, estim_prevs))
|
return trues, estims
|
||||||
return errs
|
|
||||||
|
|
||||||
|
|
||||||
def _test_aggregation_batch(args):
|
|
||||||
model, preclassified_test, indexes, error_metric = args
|
|
||||||
errs = []
|
|
||||||
for index in indexes:
|
|
||||||
sample = preclassified_test.sampling_from_index(index)
|
|
||||||
estim_prevs = model.aggregate(sample.instances)
|
|
||||||
true_prevs = sample.prevalence()
|
|
||||||
errs.append(error_metric(true_prevs, estim_prevs))
|
|
||||||
return errs
|
|
|
@ -186,6 +186,7 @@ class MLRegressionQuantification:
|
||||||
# self.norm = StandardScaler()
|
# self.norm = StandardScaler()
|
||||||
self.means = means
|
self.means = means
|
||||||
self.stds = stds
|
self.stds = stds
|
||||||
|
# self.covs = covs
|
||||||
|
|
||||||
def _prepare_arrays(self, Xs, ys, samples_mean, samples_std):
|
def _prepare_arrays(self, Xs, ys, samples_mean, samples_std):
|
||||||
Xs = np.asarray(Xs)
|
Xs = np.asarray(Xs)
|
||||||
|
@ -196,6 +197,8 @@ class MLRegressionQuantification:
|
||||||
if self.stds:
|
if self.stds:
|
||||||
samples_std = np.asarray(samples_std)
|
samples_std = np.asarray(samples_std)
|
||||||
Xs = np.hstack([Xs, samples_std])
|
Xs = np.hstack([Xs, samples_std])
|
||||||
|
# if self.covs:
|
||||||
|
|
||||||
return Xs, ys
|
return Xs, ys
|
||||||
|
|
||||||
def generate_samples_npp(self, val):
|
def generate_samples_npp(self, val):
|
||||||
|
@ -257,3 +260,6 @@ class MLRegressionQuantification:
|
||||||
adjusted = adjusted.flatten()
|
adjusted = adjusted.flatten()
|
||||||
neg_prevs = 1-adjusted
|
neg_prevs = 1-adjusted
|
||||||
return np.asarray([neg_prevs, adjusted]).T
|
return np.asarray([neg_prevs, adjusted]).T
|
||||||
|
|
||||||
|
|
||||||
|
# class
|
Loading…
Reference in New Issue