From 3243fd90f80489acce3cc656a239d83e93b90612 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 23 Oct 2023 11:32:35 +0200 Subject: [PATCH] running final experiments, one dedicated DM for each divergence with fine-grained exploration of nbins --- distribution_matching/commons.py | 34 +++++++++++++++++-- distribution_matching/method_dirichlety.py | 9 +++-- distribution_matching/tweets_experiments.py | 3 +- .../ucibinary_experiments.py | 4 +-- .../ucimulticlass_experiments.py | 17 ++++++---- quapy/CHANGE_LOG.txt | 2 -- quapy/classification/calibration.py | 2 +- quapy/functional.py | 22 ++++++++++++ quapy/method/aggregative.py | 5 ++- 9 files changed, 80 insertions(+), 18 deletions(-) diff --git a/distribution_matching/commons.py b/distribution_matching/commons.py index 2d9ba68..970dafa 100644 --- a/distribution_matching/commons.py +++ b/distribution_matching/commons.py @@ -8,7 +8,7 @@ from distribution_matching.method_dirichlety import DIRy from sklearn.linear_model import LogisticRegression from method_kdey_closed_efficient import KDEyclosed_efficient -METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+', +METHODS = ['EMQ', 'EMQ-C', 'DM', 'DM-T', 'DM-HD', 'KDEy-DMhd3', 'DM-CS', 'KDEy-closed++', 'KDEy-ML'] #['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+', BIN_METHODS = [x.replace('-OvA', '') for x in METHODS] @@ -34,7 +34,7 @@ def new_method(method, **lr_kwargs): param_grid = hyper_LR quantifier = PACC(lr) elif method == 'KDEy-ML': - method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)} + method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} param_grid = {**method_params, **hyper_LR} quantifier = KDEy(lr, target='max_likelihood', val_split=10) elif method == 'KDEy-closed': @@ -59,6 
+59,10 @@ def new_method(method, **lr_kwargs): elif method == 'EMQ': param_grid = hyper_LR quantifier = EMQ(lr) + elif method == 'EMQ-C': + method_params = {'exact_train_prev': [False], 'recalib': ['bcts']} + param_grid = {**method_params, **hyper_LR} + quantifier = EMQ(lr) elif method == 'HDy-OvA': param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()} quantifier = OneVsAllAggregative(HDy(lr)) @@ -70,6 +74,30 @@ def new_method(method, **lr_kwargs): } param_grid = {**method_params, **hyper_LR} quantifier = DistributionMatching(lr) + elif method == 'DM-T': + method_params = { + 'nbins': [2,3,4,5,6,7,8,9,10,12,14,16,18,20,22,24,26,28,30,32,64], + 'val_split': [10], + 'divergence': ['topsoe'] + } + param_grid = {**method_params, **hyper_LR} + quantifier = DistributionMatching(lr) + elif method == 'DM-HD': + method_params = { + 'nbins': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64], + 'val_split': [10], + 'divergence': ['HD'] + } + param_grid = {**method_params, **hyper_LR} + quantifier = DistributionMatching(lr) + elif method == 'DM-CS': + method_params = { + 'nbins': [2,3,4,5,6,7,8,9,10,12,14,16,18,20,22,24,26,28,30,32,64], + 'val_split': [10], + 'divergence': ['CS'] + } + param_grid = {**method_params, **hyper_LR} + quantifier = DistributionMatching(lr) # experimental elif method in ['KDEy-DMkld']: @@ -95,7 +123,7 @@ def new_method(method, **lr_kwargs): # can be stored. This means that the reference distribution is V and not T. Then I have found that an # f-divergence is defined as D(p||q) \int_{R^n}q(x)f(p(x)/q(x))dx = E_{x~q}[f(p(x)/q(x))], so if I am sampling # V then I am computing D(T||V) (and not D(V||T) as I thought). 
- method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)} + method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)} param_grid = {**method_params, **hyper_LR} quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10) elif method == 'DM-HD': diff --git a/distribution_matching/method_dirichlety.py b/distribution_matching/method_dirichlety.py index 160d561..c3bfcc8 100644 --- a/distribution_matching/method_dirichlety.py +++ b/distribution_matching/method_dirichlety.py @@ -21,7 +21,7 @@ import dirichlet class DIRy(AggregativeProbabilisticQuantifier): - MAXITER = 10000 + MAXITER = 100000 def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'): self.classifier = classifier @@ -38,7 +38,12 @@ class DIRy(AggregativeProbabilisticQuantifier): data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs ) - self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)] + self.val_parameters = [] + for cat in range(data.n_classes): + dir_i = dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) + self.val_parameters.append(dir_i) + # print(cat) + # self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)] return self diff --git a/distribution_matching/tweets_experiments.py b/distribution_matching/tweets_experiments.py index c5f659a..db66b2e 100644 --- a/distribution_matching/tweets_experiments.py +++ b/distribution_matching/tweets_experiments.py @@ -24,6 +24,7 @@ if __name__ == '__main__': for method in METHODS: print('Init method', method) + if method == 'EMQ-C': continue global_result_path = f'{result_dir}/{method}' @@ -36,7 +37,7 @@ if __name__ == '__main__': # this variable controls that the mod sel has already been done, and skip this otherwise semeval_trained = False - for dataset in 
qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST[::-1]: + for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST: print('init', dataset) local_result_path = global_result_path + '_' + dataset diff --git a/distribution_matching/ucibinary_experiments.py b/distribution_matching/ucibinary_experiments.py index 99c1157..2aecbab 100644 --- a/distribution_matching/ucibinary_experiments.py +++ b/distribution_matching/ucibinary_experiments.py @@ -69,8 +69,8 @@ if __name__ == '__main__': quantifier = modsel.best_model() except: - print('something went wrong... reporting CC') - quantifier = qp.method.aggregative.CC(LR()).fit(train) + print('something went wrong... trying to fit the default model') + quantifier.fit(train) protocol = UPP(test, repeats=n_bags_test) report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], diff --git a/distribution_matching/ucimulticlass_experiments.py b/distribution_matching/ucimulticlass_experiments.py index 4bf31b0..d2d0fc8 100644 --- a/distribution_matching/ucimulticlass_experiments.py +++ b/distribution_matching/ucimulticlass_experiments.py @@ -1,5 +1,8 @@ import pickle import os + +from sklearn.linear_model import LogisticRegression + from distribution_matching.commons import METHODS, new_method, show_results import quapy as qp @@ -22,9 +25,9 @@ if __name__ == '__main__': os.makedirs(result_dir, exist_ok=True) for method in METHODS: - if method == 'HDy-OvA': continue - if method == 'DIR': continue - if method != 'KDEy-ML': continue + #if method == 'HDy-OvA': continue + #if method == 'DIR': continue + # if method != 'EMQ-C': continue print('Init method', method) @@ -71,12 +74,14 @@ if __name__ == '__main__': quantifier = modsel.best_model() except: - print('something went wrong... reporting CC') - quantifier = qp.method.aggregative.CC(LR()).fit(train) + print('something went wrong... 
trying to fit the default model') + quantifier.fit(train) + # quantifier = qp.method.aggregative.CC(LogisticRegression()).fit(train) + protocol = UPP(test, repeats=n_bags_test) report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], - verbose=True) + verbose=True, n_jobs=-1) report.to_csv(f'{local_result_path}.dataframe') means = report.mean() csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n') diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt index d34c464..ba0bf04 100644 --- a/quapy/CHANGE_LOG.txt +++ b/quapy/CHANGE_LOG.txt @@ -9,8 +9,6 @@ Change Log 0.1.8 - qp.evaluation now runs in parallel -- remove dependencies with LabelledCollection in the library. - Change Log 0.1.7 ---------------- diff --git a/quapy/classification/calibration.py b/quapy/classification/calibration.py index a3f1543..4047b6d 100644 --- a/quapy/classification/calibration.py +++ b/quapy/classification/calibration.py @@ -59,7 +59,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabi elif isinstance(k, float): if not (0 < k < 1): raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)') - return self.fit_cv(X, y) + return self.fit_tr_val(X, y) def fit_cv(self, X, y): """ diff --git a/quapy/functional.py b/quapy/functional.py index a1f0ba2..843a450 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -90,11 +90,32 @@ def TopsoeDistance(P, Q, epsilon=1e-20): :param P: real-valued array-like of shape `(k,)` representing a discrete distribution :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution + :param epsilon: small value to smooth the distributions for numerical stability :return: float """ return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon))) +def CauchySchwarz(P, Q, epsilon=1e-20): + """ + Cauchy-Schwarz divergence between two 
(discretized) distributions `P` and `Q`. + The Cauchy-Schwarz divergence for two discrete distributions of `k` bins is defined as: + + .. math:: + CS(P,Q) = -\\log \\frac{ \\sum_{i=1}^k p_i q_i }{ + \\sqrt{ \\left( \\sum_{i=1}^k p^2_i \\right) \\left( \\sum_{i=1}^k q^2_i \\right) } + } + + :param P: real-valued array-like of shape `(k,)` representing a discrete distribution + :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution + :param epsilon: small value to smooth the distributions for numerical stability + :return: float + """ + P += epsilon + Q += epsilon + return - np.log(sum(P * Q) / np.sqrt(sum(P ** 2) * sum(Q ** 2))) + + def uniform_prevalence_sampling(n_classes, size=1): """ Implements the `Kraemer algorithm `_ @@ -276,3 +297,4 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08): return False return True + diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 9901f38..31be9f9 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -604,10 +604,13 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): def _get_divergence(divergence: Union[str, Callable]): if isinstance(divergence, str): - if divergence=='HD': + divergence = divergence.lower() + if divergence=='hd': return F.HellingerDistance elif divergence=='topsoe': return F.TopsoeDistance + elif divergence.lower()=='cs': + return F.CauchySchwarz elif divergence.lower()=='l2': return lambda a,b: np.linalg.norm(a-b) elif divergence.lower()=='l1':