From 1b1f41dc28ec2b0b46e0e80b2e6b274b868bd686 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Thu, 3 Feb 2022 11:00:25 +0100 Subject: [PATCH] trying to understand the poor performance of quantifiers with rel sampling --- eDiscovery/functions.py | 5 +++-- eDiscovery/method.py | 6 ++++-- eDiscovery/run.sh | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/eDiscovery/functions.py b/eDiscovery/functions.py index 170dfa8..32a41a0 100644 --- a/eDiscovery/functions.py +++ b/eDiscovery/functions.py @@ -49,8 +49,9 @@ def NewQuantifier(quantifiername, classifiername): def newQ(): # return PACC(NewClassifier(classifiername), val_split=0.4) # return CC(CalibratedClassifierCV(NewClassifier(classifiername))) - return ClassWeightPCC() - return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans') + # return ClassWeightPCC() + return CC(NewClassifier(classifiername)) + return RegionProbAdjustmentGlobal(newQ, k=20, clustering='kmeans') raise ValueError('unknown quantifier', quantifiername) diff --git a/eDiscovery/method.py b/eDiscovery/method.py index 5b74708..e781fb4 100644 --- a/eDiscovery/method.py +++ b/eDiscovery/method.py @@ -203,7 +203,7 @@ class RegionProbAdjustmentGlobal(BaseQuantifier): # g = self._get_regions(data.instances) X, y = data.Xy self.g_quantifiers = {} - trivial=0 + trivial, trivial_data = 0, 0 for gi in np.unique(g): qi_data = LabelledCollection(X[g==gi], y[g==gi], classes_=data.classes_) if qi_data.counts()[1] <= 1: @@ -213,12 +213,14 @@ class RegionProbAdjustmentGlobal(BaseQuantifier): # if qi_data.prevalence()[0] == 1: # all negatives self.g_quantifiers[gi] = TrivialRejectorQuantifier() trivial+=1 + trivial_data += len(qi_data) elif qi_data.counts()[0] <= 1: # (almost) all positives self.g_quantifiers[gi] = TrivialAcceptorQuantifier() trivial += 1 + trivial_data += len(qi_data) else: self.g_quantifiers[gi] = self.quantifier_fn().fit(qi_data) - print(f'trivials={trivial}') + print(f'trivials={trivial} amounting to {trivial_data*100.0/len(data):.2f}% of the data') return self diff --git a/eDiscovery/run.sh b/eDiscovery/run.sh index f1c1035..541bd2a 100755 --- a/eDiscovery/run.sh +++ b/eDiscovery/run.sh @@ -7,7 +7,7 @@ k=100 initsize=500 initprev=-1 seed=1 -Q=GRPACC +Q=URBQ CLS=lr sampling=relevance_sampling