From 3243fd90f80489acce3cc656a239d83e93b90612 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo <alejandro.moreo@isti.cnr.it>
Date: Mon, 23 Oct 2023 11:32:35 +0200
Subject: [PATCH] running final experiments, one dedicated DM for each
 divergence with fine-grained exploration of nbins

---
 distribution_matching/commons.py              | 34 +++++++++++++++++--
 distribution_matching/method_dirichlety.py    |  9 +++--
 distribution_matching/tweets_experiments.py   |  3 +-
 .../ucibinary_experiments.py                  |  4 +--
 .../ucimulticlass_experiments.py              | 17 ++++++----
 quapy/CHANGE_LOG.txt                          |  2 --
 quapy/classification/calibration.py           |  2 +-
 quapy/functional.py                           | 22 ++++++++++++
 quapy/method/aggregative.py                   |  5 ++-
 9 files changed, 80 insertions(+), 18 deletions(-)

diff --git a/distribution_matching/commons.py b/distribution_matching/commons.py
index 2d9ba68..970dafa 100644
--- a/distribution_matching/commons.py
+++ b/distribution_matching/commons.py
@@ -8,7 +8,7 @@ from distribution_matching.method_dirichlety import DIRy
 from sklearn.linear_model import LogisticRegression
 from method_kdey_closed_efficient import KDEyclosed_efficient
 
-METHODS  = ['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+',
+METHODS  = ['EMQ', 'EMQ-C', 'DM', 'DM-T', 'DM-HD', 'KDEy-DMhd3', 'DM-CS', 'KDEy-closed++', 'KDEy-ML'] #['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+',
 BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
 
 
@@ -34,7 +34,7 @@ def new_method(method, **lr_kwargs):
         param_grid = hyper_LR
         quantifier = PACC(lr)
     elif method == 'KDEy-ML':
-        method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
         param_grid = {**method_params, **hyper_LR}
         quantifier = KDEy(lr, target='max_likelihood', val_split=10)
     elif method == 'KDEy-closed':
@@ -59,6 +59,10 @@ def new_method(method, **lr_kwargs):
     elif method == 'EMQ':
         param_grid = hyper_LR
         quantifier = EMQ(lr)
+    elif method == 'EMQ-C':
+        method_params = {'exact_train_prev': [False], 'recalib': ['bcts']}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = EMQ(lr)
     elif method == 'HDy-OvA':
         param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()}
         quantifier = OneVsAllAggregative(HDy(lr))
@@ -70,6 +74,30 @@ def new_method(method, **lr_kwargs):
         }
         param_grid = {**method_params, **hyper_LR}
         quantifier = DistributionMatching(lr)
+    elif method == 'DM-T':
+        method_params = {
+            'nbins': [2,3,4,5,6,7,8,9,10,12,14,16,18,20,22,24,26,28,30,32,64],
+            'val_split': [10],
+            'divergence': ['topsoe']
+        }
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = DistributionMatching(lr)
+    elif method == 'DM-HD':
+        method_params = {
+            'nbins': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64],
+            'val_split': [10],
+            'divergence': ['HD']
+        }
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = DistributionMatching(lr)
+    elif method == 'DM-CS':
+        method_params = {
+            'nbins': [2,3,4,5,6,7,8,9,10,12,14,16,18,20,22,24,26,28,30,32,64],
+            'val_split': [10],
+            'divergence': ['CS']
+        }
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = DistributionMatching(lr)
 
     # experimental
     elif method in ['KDEy-DMkld']:
@@ -95,7 +123,7 @@ def new_method(method, **lr_kwargs):
         # can be stored. This means that the reference distribution is V and not T. Then I have found that an
         # f-divergence is defined as D(p||q) \int_{R^n}q(x)f(p(x)/q(x))dx = E_{x~q}[f(p(x)/q(x))], so if I am sampling
         # V then I am computing D(T||V) (and not D(V||T) as I thought).
-        method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
         param_grid = {**method_params, **hyper_LR}
         quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
     elif method == 'DM-HD':
diff --git a/distribution_matching/method_dirichlety.py b/distribution_matching/method_dirichlety.py
index 160d561..c3bfcc8 100644
--- a/distribution_matching/method_dirichlety.py
+++ b/distribution_matching/method_dirichlety.py
@@ -21,7 +21,7 @@ import dirichlet
 
 class DIRy(AggregativeProbabilisticQuantifier):
 
-    MAXITER = 10000
+    MAXITER = 100000
 
     def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'):
         self.classifier = classifier
@@ -38,7 +38,12 @@ class DIRy(AggregativeProbabilisticQuantifier):
             data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
         )
 
-        self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)]
+        self.val_parameters = []
+        for cat in range(data.n_classes):
+            dir_i = dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER)
+            self.val_parameters.append(dir_i)
+            # print(cat)
+        # self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)]
 
         return self
 
diff --git a/distribution_matching/tweets_experiments.py b/distribution_matching/tweets_experiments.py
index c5f659a..db66b2e 100644
--- a/distribution_matching/tweets_experiments.py
+++ b/distribution_matching/tweets_experiments.py
@@ -24,6 +24,7 @@ if __name__ == '__main__':
         for method in METHODS:
 
             print('Init method', method)
+            if method == 'EMQ-C': continue
 
             global_result_path = f'{result_dir}/{method}'
 
@@ -36,7 +37,7 @@ if __name__ == '__main__':
                 # this variable controls that the mod sel has already been done, and skip this otherwise
                 semeval_trained = False
 
-                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST[::-1]:
+                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
                     print('init', dataset)
 
                     local_result_path = global_result_path + '_' + dataset
diff --git a/distribution_matching/ucibinary_experiments.py b/distribution_matching/ucibinary_experiments.py
index 99c1157..2aecbab 100644
--- a/distribution_matching/ucibinary_experiments.py
+++ b/distribution_matching/ucibinary_experiments.py
@@ -69,8 +69,8 @@ if __name__ == '__main__':
 
                             quantifier = modsel.best_model()
                         except:
-                            print('something went wrong... reporting CC')
-                            quantifier = qp.method.aggregative.CC(LR()).fit(train)
+                            print('something went wrong... trying to fit the default model')
+                            quantifier.fit(train)
 
                         protocol = UPP(test, repeats=n_bags_test)
                         report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
diff --git a/distribution_matching/ucimulticlass_experiments.py b/distribution_matching/ucimulticlass_experiments.py
index 4bf31b0..d2d0fc8 100644
--- a/distribution_matching/ucimulticlass_experiments.py
+++ b/distribution_matching/ucimulticlass_experiments.py
@@ -1,5 +1,8 @@
 import pickle
 import os
+
+from sklearn.linear_model import LogisticRegression
+
 from distribution_matching.commons import METHODS, new_method, show_results
 
 import quapy as qp
@@ -22,9 +25,9 @@ if __name__ == '__main__':
         os.makedirs(result_dir, exist_ok=True)
 
         for method in METHODS:
-            if method == 'HDy-OvA': continue
-            if method == 'DIR': continue
-            if method != 'KDEy-ML': continue
+            #if method == 'HDy-OvA': continue
+            #if method == 'DIR': continue
+            # if method != 'EMQ-C': continue
 
             print('Init method', method)
 
@@ -71,12 +74,14 @@ if __name__ == '__main__':
 
                             quantifier = modsel.best_model()
                         except:
-                            print('something went wrong... reporting CC')
-                            quantifier = qp.method.aggregative.CC(LR()).fit(train)
+                            print('something went wrong... trying to fit the default model')
+                            quantifier.fit(train)
+                            # quantifier = qp.method.aggregative.CC(LogisticRegression()).fit(train)
+
 
                         protocol = UPP(test, repeats=n_bags_test)
                         report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
-                                                                verbose=True)
+                                                                verbose=True, n_jobs=-1)
                         report.to_csv(f'{local_result_path}.dataframe')
                         means = report.mean()
                         csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index d34c464..ba0bf04 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -9,8 +9,6 @@ Change Log 0.1.8
 - qp.evaluation now runs in parallel <improve, remove or fix the ongoing error, put at the qp. level instead of
     qp.evaluation because I don't like the qp.evaluation.evaluate thing>
 
-- <fix> remove dependencies with LabelledCollection in the library.
-
 
 Change Log 0.1.7
 ----------------
diff --git a/quapy/classification/calibration.py b/quapy/classification/calibration.py
index a3f1543..4047b6d 100644
--- a/quapy/classification/calibration.py
+++ b/quapy/classification/calibration.py
@@ -59,7 +59,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabi
         elif isinstance(k, float):
             if not (0 < k < 1):
                 raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
-            return self.fit_cv(X, y)
+            return self.fit_tr_val(X, y)
 
     def fit_cv(self, X, y):
         """
diff --git a/quapy/functional.py b/quapy/functional.py
index a1f0ba2..843a450 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -90,11 +90,32 @@ def TopsoeDistance(P, Q, epsilon=1e-20):
 
     :param P: real-valued array-like of shape `(k,)` representing a discrete distribution
     :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
+    :param epsilon: small value to smooth the distributions for numerical stability
     :return: float
     """
     return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
                   
 
+def CauchySchwarz(P, Q, epsilon=1e-20):
+    """
+    Cauchy-Schwarz divergence between two (discretized) distributions `P` and `Q`.
+    The Cauchy-Schwarz divergence for two discrete distributions of `k` bins is defined as:
+
+    .. math::
+        CS(P,Q) = -\\log \\frac{ \\sum_{i=1}^k  p_i q_i }{
+            \\sqrt{ \\left( \\sum_{i=1}^k  p^2_i \\right) \\left( \\sum_{i=1}^k  q^2_i \\right) }
+        }
+
+    :param P: real-valued array-like of shape `(k,)` representing a discrete distribution
+    :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
+    :param epsilon: small value to smooth the distributions for numerical stability
+    :return: float
+    """
+    P += epsilon
+    Q += epsilon
+    return - np.log(sum(P * Q) / np.sqrt(sum(P ** 2) * sum(Q ** 2)))
+
+
 def uniform_prevalence_sampling(n_classes, size=1):
     """
     Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_
@@ -276,3 +297,4 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
         return False
     return True
 
+
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 9901f38..31be9f9 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -604,10 +604,13 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
 
 def _get_divergence(divergence: Union[str, Callable]):
     if isinstance(divergence, str):
-        if divergence=='HD':
+        divergence = divergence.lower()
+        if divergence=='hd':
             return F.HellingerDistance
         elif divergence=='topsoe':
             return F.TopsoeDistance
+        elif divergence.lower()=='cs':
+            return F.CauchySchwarz
         elif divergence.lower()=='l2':
             return lambda a,b: np.linalg.norm(a-b)
         elif divergence.lower()=='l1':