diff --git a/eDiscovery/functions.py b/eDiscovery/functions.py index 170dfa8..32a41a0 100644 --- a/eDiscovery/functions.py +++ b/eDiscovery/functions.py @@ -49,8 +49,9 @@ def NewQuantifier(quantifiername, classifiername): def newQ(): # return PACC(NewClassifier(classifiername), val_split=0.4) # return CC(CalibratedClassifierCV(NewClassifier(classifiername))) - return ClassWeightPCC() - return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans') + # return ClassWeightPCC() + return CC(NewClassifier(classifiername)) + return RegionProbAdjustmentGlobal(newQ, k=20, clustering='kmeans') raise ValueError('unknown quantifier', quantifiername) diff --git a/eDiscovery/method.py b/eDiscovery/method.py index 5b74708..e781fb4 100644 --- a/eDiscovery/method.py +++ b/eDiscovery/method.py @@ -203,7 +203,7 @@ class RegionProbAdjustmentGlobal(BaseQuantifier): # g = self._get_regions(data.instances) X, y = data.Xy self.g_quantifiers = {} - trivial=0 + trivial, trivial_data = 0, 0 for gi in np.unique(g): qi_data = LabelledCollection(X[g==gi], y[g==gi], classes_=data.classes_) if qi_data.counts()[1] <= 1: @@ -213,12 +213,14 @@ class RegionProbAdjustmentGlobal(BaseQuantifier): # if qi_data.prevalence()[0] == 1: # all negatives self.g_quantifiers[gi] = TrivialRejectorQuantifier() trivial+=1 + trivial_data += len(qi_data) elif qi_data.counts()[0] <= 1: # (almost) all positives self.g_quantifiers[gi] = TrivialAcceptorQuantifier() trivial += 1 + trivial_data += len(qi_data) else: self.g_quantifiers[gi] = self.quantifier_fn().fit(qi_data) - print(f'trivials={trivial}') + print(f'trivials={trivial} amounting to {trivial_data*100.0/len(data):.2f}% of the data') return self diff --git a/eDiscovery/run.sh b/eDiscovery/run.sh index f1c1035..541bd2a 100755 --- a/eDiscovery/run.sh +++ b/eDiscovery/run.sh @@ -7,7 +7,7 @@ k=100 initsize=500 initprev=-1 seed=1 -Q=GRPACC +Q=URBQ CLS=lr sampling=relevance_sampling