From 1b1f41dc28ec2b0b46e0e80b2e6b274b868bd686 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Thu, 3 Feb 2022 11:00:25 +0100
Subject: [PATCH] Investigate the poor performance of quantifiers with
 relevance sampling

---
 eDiscovery/functions.py | 5 +++--
 eDiscovery/method.py    | 6 ++++--
 eDiscovery/run.sh       | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/eDiscovery/functions.py b/eDiscovery/functions.py
index 170dfa8..32a41a0 100644
--- a/eDiscovery/functions.py
+++ b/eDiscovery/functions.py
@@ -49,8 +49,9 @@ def NewQuantifier(quantifiername, classifiername):
         def newQ():
             # return PACC(NewClassifier(classifiername), val_split=0.4)
             # return CC(CalibratedClassifierCV(NewClassifier(classifiername)))
-            return ClassWeightPCC()
-        return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
+            # return ClassWeightPCC()
+            return CC(NewClassifier(classifiername))
+        return RegionProbAdjustmentGlobal(newQ, k=20, clustering='kmeans')
     raise ValueError('unknown quantifier', quantifiername)
 
 
diff --git a/eDiscovery/method.py b/eDiscovery/method.py
index 5b74708..e781fb4 100644
--- a/eDiscovery/method.py
+++ b/eDiscovery/method.py
@@ -203,7 +203,7 @@ class RegionProbAdjustmentGlobal(BaseQuantifier):
         # g = self._get_regions(data.instances)
         X, y = data.Xy
         self.g_quantifiers = {}
-        trivial=0
+        trivial, trivial_data = 0, 0
         for gi in np.unique(g):
             qi_data = LabelledCollection(X[g==gi], y[g==gi], classes_=data.classes_)
             if qi_data.counts()[1] <= 1:
@@ -213,12 +213,14 @@ class RegionProbAdjustmentGlobal(BaseQuantifier):
                 # if qi_data.prevalence()[0] == 1:  # all negatives
                 self.g_quantifiers[gi] = TrivialRejectorQuantifier()
                 trivial+=1
+                trivial_data += len(qi_data)
             elif qi_data.counts()[0] <= 1:  # (almost) all positives
                 self.g_quantifiers[gi] = TrivialAcceptorQuantifier()
                 trivial += 1
+                trivial_data += len(qi_data)
             else:
                 self.g_quantifiers[gi] = self.quantifier_fn().fit(qi_data)
-        print(f'trivials={trivial}')
+        print(f'trivials={trivial} amounting to {trivial_data*100.0/len(data):.2f}% of the data')
 
         return self
 
diff --git a/eDiscovery/run.sh b/eDiscovery/run.sh
index f1c1035..541bd2a 100755
--- a/eDiscovery/run.sh
+++ b/eDiscovery/run.sh
@@ -7,7 +7,7 @@ k=100
 initsize=500
 initprev=-1
 seed=1
-Q=GRPACC
+Q=URBQ
 CLS=lr
 sampling=relevance_sampling