some refactor and prior effect script
This commit is contained in:
parent
4b7fc77e90
commit
323cd383f9
|
|
@ -21,3 +21,4 @@
|
|||
- consider W as a measure of quantification error (the current e.g., w-CI is the winkler...)
|
||||
- optimize also C and class_weight? [I don't think so, but could be done easily now]
|
||||
|
||||
- remove wikis from repo
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
|
||||
from method.aggregative import KDEyML
|
||||
from quapy.functional import l1_norm, ILRtransformation
|
||||
from scipy.stats import entropy
|
||||
|
||||
|
||||
|
||||
def fetch_UCI_multiclass(data_name):
|
||||
return qp.datasets.fetch_UCIMulticlassDataset(data_name, min_class_support=0.01)
|
||||
|
||||
|
||||
def fetch_UCI_binary(data_name):
|
||||
return qp.datasets.fetch_UCIBinaryDataset(data_name)
|
||||
|
||||
# global configurations
|
||||
|
||||
binary = {
|
||||
'datasets': qp.datasets.UCI_BINARY_DATASETS,
|
||||
'fetch_fn': fetch_UCI_binary,
|
||||
'sample_size': 500
|
||||
}
|
||||
|
||||
multiclass = {
|
||||
'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
|
||||
'fetch_fn': fetch_UCI_multiclass,
|
||||
'sample_size': 1000
|
||||
}
|
||||
multiclass['datasets'].remove('poker_hand') # random performance
|
||||
multiclass['datasets'].remove('hcv') # random performance
|
||||
multiclass['datasets'].remove('letter') # many classes
|
||||
multiclass['datasets'].remove('isolet') # many classes
|
||||
|
||||
|
||||
# utils
|
||||
def experiment_path(dir:Path, dataset_name:str, method_name:str):
|
||||
os.makedirs(dir, exist_ok=True)
|
||||
return dir/f'{dataset_name}__{method_name}.pkl'
|
||||
|
||||
|
||||
def normalized_entropy(p):
|
||||
"""
|
||||
Normalized Shannon entropy in [0, 1]
|
||||
p: array-like, prevalence vector (sums to 1)
|
||||
"""
|
||||
p = np.asarray(p)
|
||||
H = entropy(p) # Shannon entropy
|
||||
H_max = np.log(len(p))
|
||||
return np.clip(H / H_max, 0, 1)
|
||||
|
||||
|
||||
def antagonistic_prevalence(p, strength=1):
|
||||
ilr = ILRtransformation()
|
||||
z = ilr(p)
|
||||
z_ant = - strength * z
|
||||
p_ant = ilr.inverse(z_ant)
|
||||
return p_ant
|
||||
|
||||
|
||||
class KDEyCLR(KDEyML):
|
||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
|
||||
super().__init__(
|
||||
classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
|
||||
random_state=random_state, kernel='aitchison'
|
||||
)
|
||||
|
||||
|
||||
class KDEyILR(KDEyML):
|
||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
|
||||
super().__init__(
|
||||
classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
|
||||
random_state=random_state, kernel='ilr'
|
||||
)
|
||||
|
|
@ -1,48 +1,23 @@
|
|||
import os
|
||||
import warnings
|
||||
from os.path import join
|
||||
from pathlib import Path
|
||||
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.linear_model import LogisticRegression as LR
|
||||
from sklearn.model_selection import GridSearchCV, StratifiedKFold
|
||||
from copy import deepcopy as cp
|
||||
import quapy as qp
|
||||
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
||||
from BayesianKDEy.commons import multiclass, experiment_path, KDEyCLR
|
||||
from BayesianKDEy.temperature_calibration import temp_calibration
|
||||
from build.lib.quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ
|
||||
from quapy.method.base import BinaryQuantifier, BaseQuantifier
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.data import Dataset
|
||||
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
|
||||
from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap
|
||||
from quapy.functional import strprev
|
||||
from quapy.method.confidence import BayesianCC, AggregativeBootstrap
|
||||
from quapy.method.aggregative import KDEyML, ACC
|
||||
from quapy.protocol import UPP
|
||||
import quapy.functional as F
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from scipy.stats import dirichlet
|
||||
from collections import defaultdict
|
||||
from time import time
|
||||
from sklearn.base import clone, BaseEstimator
|
||||
|
||||
|
||||
class KDEyCLR(KDEyML):
|
||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
|
||||
super().__init__(
|
||||
classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
|
||||
random_state=random_state, kernel='aitchison'
|
||||
)
|
||||
|
||||
|
||||
class KDEyILR(KDEyML):
|
||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
|
||||
super().__init__(
|
||||
classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
|
||||
random_state=random_state, kernel='ilr'
|
||||
)
|
||||
|
||||
|
||||
def methods():
|
||||
|
|
@ -160,33 +135,8 @@ def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method
|
|||
return report
|
||||
|
||||
|
||||
def experiment_path(dir:Path, dataset_name:str, method_name:str):
|
||||
os.makedirs(dir, exist_ok=True)
|
||||
return dir/f'{dataset_name}__{method_name}.pkl'
|
||||
|
||||
|
||||
def fetch_UCI_binary(data_name):
|
||||
return qp.datasets.fetch_UCIBinaryDataset(data_name)
|
||||
|
||||
|
||||
def fetch_UCI_multiclass(data_name):
|
||||
return qp.datasets.fetch_UCIMulticlassDataset(data_name, min_class_support=0.01)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
binary = {
|
||||
'datasets': qp.datasets.UCI_BINARY_DATASETS,
|
||||
'fetch_fn': fetch_UCI_binary,
|
||||
'sample_size': 500
|
||||
}
|
||||
|
||||
multiclass = {
|
||||
'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
|
||||
'fetch_fn': fetch_UCI_multiclass,
|
||||
'sample_size': 1000
|
||||
}
|
||||
|
||||
result_dir = Path('./results')
|
||||
|
||||
for setup in [multiclass]: # [binary, multiclass]:
|
||||
|
|
@ -200,15 +150,15 @@ if __name__ == '__main__':
|
|||
is_binary = data.n_classes==2
|
||||
result_subdir = result_dir / ('binary' if is_binary else 'multiclass')
|
||||
hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')
|
||||
for method_name, method, hyper_params, withconf_constructor, method_scope in methods():
|
||||
for method_name, surrogate_quant, hyper_params, withconf_constructor, method_scope in methods():
|
||||
if method_scope == 'only_binary' and not is_binary:
|
||||
continue
|
||||
if method_scope == 'only_multiclass' and is_binary:
|
||||
continue
|
||||
result_path = experiment_path(result_subdir, data_name, method_name)
|
||||
hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)
|
||||
hyper_path = experiment_path(hyper_subdir, data_name, surrogate_quant.__class__.__name__)
|
||||
report = qp.util.pickled_resource(
|
||||
result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path
|
||||
result_path, experiment, data, surrogate_quant, method_name, hyper_params, withconf_constructor, hyper_path
|
||||
)
|
||||
print(f'dataset={data_name}, '
|
||||
f'method={method_name}: '
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import pandas as pd
|
|||
from glob import glob
|
||||
from pathlib import Path
|
||||
import quapy as qp
|
||||
from BayesianKDEy.full_experiments import fetch_UCI_multiclass, fetch_UCI_binary
|
||||
from BayesianKDEy.commons import fetch_UCI_binary, fetch_UCI_multiclass
|
||||
from error import dist_aitchison
|
||||
from quapy.method.confidence import ConfidenceIntervals
|
||||
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import matplotlib.pyplot as plt
|
|||
from matplotlib.colors import ListedColormap
|
||||
from scipy.stats import gaussian_kde
|
||||
|
||||
from BayesianKDEy.commons import antagonistic_prevalence
|
||||
from method.confidence import (ConfidenceIntervals as CI,
|
||||
ConfidenceEllipseSimplex as CE,
|
||||
ConfidenceEllipseCLR as CLR,
|
||||
|
|
@ -302,6 +303,7 @@ def plot_simplex(
|
|||
|
||||
plt.tight_layout()
|
||||
if save_path:
|
||||
os.makedirs(Path(save_path).parent, exist_ok=True)
|
||||
plt.savefig(save_path)
|
||||
else:
|
||||
plt.show()
|
||||
|
|
@ -377,17 +379,17 @@ if __name__ == '__main__':
|
|||
# ],
|
||||
# save_path=f'./plots/prior_test/uniform.png'
|
||||
# )
|
||||
|
||||
alpha = [40, 10, 10]
|
||||
train_prevs = np.random.dirichlet(alpha=alpha, size=n)
|
||||
test_prevs = np.random.dirichlet(alpha=alpha, size=n)
|
||||
plot_simplex(
|
||||
point_layers=[
|
||||
{"points": train_prevs, "label": "train", "style": train_style},
|
||||
{"points": test_prevs, "label": "test", "style": test_style},
|
||||
],
|
||||
save_path=f'./plots/prior_test/informative.png'
|
||||
)
|
||||
#
|
||||
# alpha = [40, 10, 10]
|
||||
# train_prevs = np.random.dirichlet(alpha=alpha, size=n)
|
||||
# test_prevs = np.random.dirichlet(alpha=alpha, size=n)
|
||||
# plot_simplex(
|
||||
# point_layers=[
|
||||
# {"points": train_prevs, "label": "train", "style": train_style},
|
||||
# {"points": test_prevs, "label": "test", "style": test_style},
|
||||
# ],
|
||||
# save_path=f'./plots/prior_test/informative.png'
|
||||
# )
|
||||
|
||||
# train_prevs = np.random.dirichlet(alpha=[8, 1, 1], size=n)
|
||||
# test_prevs = np.random.dirichlet(alpha=[1, 8, 1], size=n)
|
||||
|
|
@ -402,13 +404,16 @@ if __name__ == '__main__':
|
|||
p = 0.6
|
||||
|
||||
K = 3
|
||||
alpha = [p] + [(1. - p) / (K - 1)] * (K - 1)
|
||||
# alpha = [p] + [(1. - p) / (K - 1)] * (K - 1)
|
||||
alpha = [0.095, 0.246, 0.658]
|
||||
alpha = np.array(alpha)
|
||||
|
||||
for c in [100, 500, 1_000]:
|
||||
alpha_c = alpha * c
|
||||
train_prevs = np.random.dirichlet(alpha=alpha_c, size=n)
|
||||
test_prevs = np.random.dirichlet(alpha=alpha_c[::-1], size=n)
|
||||
|
||||
for c in [50, 500, 5_000]:
|
||||
alpha_tr = alpha * c
|
||||
alpha_te = antagonistic_prevalence(alpha, strength=1) * c
|
||||
train_prevs = np.random.dirichlet(alpha=alpha_tr, size=n)
|
||||
test_prevs = np.random.dirichlet(alpha=alpha_te, size=n)
|
||||
plot_simplex(
|
||||
point_layers=[
|
||||
{"points": train_prevs, "label": "train", "style": train_style},
|
||||
|
|
|
|||
|
|
@ -1,11 +1,125 @@
|
|||
import numpy as np
|
||||
from collections import defaultdict
|
||||
|
||||
n = 3
|
||||
import model_selection
|
||||
import quapy as qp
|
||||
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
||||
from BayesianKDEy.temperature_calibration import temp_calibration
|
||||
from commons import *
|
||||
from data import Dataset
|
||||
from protocol import DirichletProtocol
|
||||
from quapy.method.confidence import BayesianCC
|
||||
from quapy.method.aggregative import ACC, AggregativeQuantifier
|
||||
from sklearn.linear_model import LogisticRegression as LR
|
||||
from copy import deepcopy as cp
|
||||
from tqdm import tqdm
|
||||
|
||||
p = 0.5
|
||||
|
||||
alpha = [p] + [(1.-p)/(n-1)]*(n-1)
|
||||
alpha = np.array(alpha)
|
||||
def select_imbalanced_datasets(top_m=5):
|
||||
datasets_prevs = []
|
||||
# choose top-m imbalanced datasets
|
||||
for data_name in multiclass['datasets']:
|
||||
data_prev = multiclass['fetch_fn'](data_name).training.prevalence()
|
||||
balance = normalized_entropy(data_prev)
|
||||
datasets_prevs.append((data_name, balance))
|
||||
datasets_prevs.sort(key=lambda x: x[1])
|
||||
print(datasets_prevs)
|
||||
data_selected = [data_name for data_name, balance in datasets_prevs[:top_m]]
|
||||
return data_selected
|
||||
|
||||
for c in [1_000, 5_000, 10_000]:
|
||||
print(alpha*c)
|
||||
|
||||
def methods():
|
||||
acc_hyper = {}
|
||||
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
|
||||
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
|
||||
|
||||
#yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0)
|
||||
yield f'BaKDE-Ait', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison',
|
||||
mcmc_seed=0,
|
||||
engine='numpyro',
|
||||
temperature=None,
|
||||
prior='uniform',
|
||||
**hyper)
|
||||
|
||||
|
||||
def run_test(test, alpha_test, alpha_train, concentration, prior_type, bay_quant, train_prev, results):
|
||||
test_generator = DirichletProtocol(test, alpha=alpha_test, repeats=100, random_state=0)
|
||||
for i, (sample_X, true_prev) in tqdm(enumerate(test_generator()), total=test_generator.total(),
|
||||
desc=f'{method_name} informative alpha with {concentration=}'):
|
||||
estim_prev, region = bay_quant.predict_conf(sample_X)
|
||||
|
||||
results['prior-type'].append(prior_type)
|
||||
results['train-prev'].append(train_prev)
|
||||
results['concentration'].append(concentration)
|
||||
results['train-alpha'].append(alpha_train)
|
||||
results['test-alpha'].append(alpha_test)
|
||||
results['true-prevs'].append(true_prev)
|
||||
results['point-estim'].append(estim_prev)
|
||||
results['shift'].append(qp.error.ae(true_prev, train_prev))
|
||||
results['ae'].append(qp.error.ae(prevs_true=true_prev, prevs_hat=estim_prev))
|
||||
results['sre'].append(qp.error.sre(prevs_true=true_prev, prevs_hat=estim_prev, prevs_train=train_prev))
|
||||
results['rae'].append(qp.error.rae(prevs_true=true_prev, prevs_hat=estim_prev))
|
||||
results['coverage'].append(region.coverage(true_prev))
|
||||
results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000))
|
||||
results['samples'].append(region.samples)
|
||||
|
||||
|
||||
def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, grid: dict, bay_constructor, hyper_choice_path: Path):
|
||||
with qp.util.temp_seed(0):
|
||||
|
||||
training, test = dataset.train_test
|
||||
|
||||
# model selection
|
||||
best_hyperparams = qp.util.pickled_resource(
|
||||
hyper_choice_path, model_selection, training, cp(point_quantifier), grid
|
||||
)
|
||||
|
||||
bay_quant = bay_constructor(best_hyperparams)
|
||||
if hasattr(bay_quant, 'temperature') and bay_quant.temperature is None:
|
||||
train, val = data.training.split_stratified(train_prop=0.6, random_state=0)
|
||||
temperature = temp_calibration(bay_quant, train, val, temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.], n_jobs=-1)
|
||||
bay_quant.temperature = temperature
|
||||
bay_quant.fit(*training.Xy)
|
||||
|
||||
# test
|
||||
train_prev = training.prevalence()
|
||||
results = defaultdict(list)
|
||||
|
||||
for concentration in [50, 500, 5_000]:
|
||||
alpha_train = train_prev * concentration
|
||||
bay_quant.prior = alpha_train
|
||||
|
||||
# informative prior
|
||||
alpha_test_informative = alpha_train
|
||||
prior_type = 'informative'
|
||||
run_test(test, alpha_test_informative, alpha_train, concentration, prior_type, bay_quant, train_prev, results)
|
||||
|
||||
# informative prior
|
||||
alpha_test_wrong = antagonistic_prevalence(train_prev, strength=1) * concentration
|
||||
prior_type = 'wrong'
|
||||
run_test(test, alpha_test_wrong, alpha_train, concentration, prior_type, bay_quant, train_prev, results)
|
||||
|
||||
report = {
|
||||
'optim_hyper': best_hyperparams,
|
||||
'train-prev': train_prev,
|
||||
'results': {k: np.asarray(v) for k, v in results.items()}
|
||||
}
|
||||
|
||||
return report
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
result_dir = Path('./results/prior_effect')
|
||||
selected = select_imbalanced_datasets()
|
||||
qp.environ['SAMPLE_SIZE'] = multiclass['sample_size']
|
||||
for data_name in selected:
|
||||
data = multiclass['fetch_fn'](data_name)
|
||||
for method_name, surrogate_quant, hyper_params, bay_constructor, method_scope in methods():
|
||||
result_path = experiment_path(result_dir, data_name, method_name)
|
||||
hyper_path = experiment_path(result_dir/'hyperparams', data_name, surrogate_quant.__class__.__name__)
|
||||
|
||||
print(f'Launching {method_name} in dataset {data_name}')
|
||||
experiment(dataset=data,
|
||||
point_quantifier=surrogate_quant,
|
||||
grid=hyper_params,
|
||||
bay_constructor=bay_constructor,
|
||||
hyper_choice_path=hyper_path)
|
||||
|
|
|
|||
|
|
@ -10,7 +10,8 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold
|
|||
from copy import deepcopy as cp
|
||||
import quapy as qp
|
||||
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
||||
from BayesianKDEy.full_experiments import experiment, experiment_path, KDEyCLR
|
||||
from BayesianKDEy.full_experiments import experiment
|
||||
from BayesianKDEy.commons import experiment_path, KDEyCLR
|
||||
from build.lib.quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
|
||||
from quapy.method.base import BinaryQuantifier, BaseQuantifier
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||
import pandas as pd
|
||||
import quapy as qp
|
||||
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
||||
from BayesianKDEy.full_experiments import experiment_path
|
||||
from BayesianKDEy.commons import experiment_path
|
||||
from quapy.protocol import UPP
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ import copy
|
|||
def temp_calibration(method:WithConfidenceABC,
|
||||
train:LabelledCollection,
|
||||
val:LabelledCollection,
|
||||
temp_grid=[1, 1.5, 2],
|
||||
temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.],
|
||||
num_samples=100,
|
||||
nominal_coverage=0.95,
|
||||
amplitude_threshold='auto',
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import unittest
|
|||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
import BayesianKDEy.commons
|
||||
import quapy as qp
|
||||
from quapy.method.aggregative import ACC
|
||||
from quapy.method.meta import Ensemble
|
||||
|
|
@ -47,7 +48,7 @@ class TestMethods(unittest.TestCase):
|
|||
learner.fit(*dataset.training.Xy)
|
||||
|
||||
for model in AGGREGATIVE_METHODS:
|
||||
if not dataset.binary and model in BINARY_METHODS:
|
||||
if not BayesianKDEy.commons.binary and model in BINARY_METHODS:
|
||||
print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}')
|
||||
continue
|
||||
|
||||
|
|
@ -61,7 +62,7 @@ class TestMethods(unittest.TestCase):
|
|||
for dataset in TestMethods.datasets:
|
||||
|
||||
for model in NON_AGGREGATIVE_METHODS:
|
||||
if not dataset.binary and model in BINARY_METHODS:
|
||||
if not BayesianKDEy.commons.binary and model in BINARY_METHODS:
|
||||
print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}')
|
||||
continue
|
||||
|
||||
|
|
@ -76,7 +77,7 @@ class TestMethods(unittest.TestCase):
|
|||
|
||||
base_quantifier = ACC(LogisticRegression())
|
||||
for dataset, policy in itertools.product(TestMethods.datasets, Ensemble.VALID_POLICIES):
|
||||
if not dataset.binary and policy == 'ds':
|
||||
if not BayesianKDEy.commons.binary and policy == 'ds':
|
||||
print(f'skipping the test of binary policy ds on non-binary dataset {dataset}')
|
||||
continue
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue