1
0
Fork 0

trying to understand the poor performance of quantifiers with rel sampling

This commit is contained in:
Alejandro Moreo Fernandez 2022-02-03 11:00:25 +01:00
parent 6ea627449c
commit 1b1f41dc28
3 changed files with 8 additions and 5 deletions

View File

@@ -49,8 +49,9 @@ def NewQuantifier(quantifiername, classifiername):
def newQ():
# return PACC(NewClassifier(classifiername), val_split=0.4)
# return CC(CalibratedClassifierCV(NewClassifier(classifiername)))
return ClassWeightPCC()
return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
# return ClassWeightPCC()
return CC(NewClassifier(classifiername))
return RegionProbAdjustmentGlobal(newQ, k=20, clustering='kmeans')
raise ValueError('unknown quantifier', quantifiername)

View File

@@ -203,7 +203,7 @@ class RegionProbAdjustmentGlobal(BaseQuantifier):
# g = self._get_regions(data.instances)
X, y = data.Xy
self.g_quantifiers = {}
trivial=0
trivial, trivial_data = 0, 0
for gi in np.unique(g):
qi_data = LabelledCollection(X[g==gi], y[g==gi], classes_=data.classes_)
if qi_data.counts()[1] <= 1:
@@ -213,12 +213,14 @@ class RegionProbAdjustmentGlobal(BaseQuantifier):
# if qi_data.prevalence()[0] == 1: # all negatives
self.g_quantifiers[gi] = TrivialRejectorQuantifier()
trivial+=1
trivial_data += len(qi_data)
elif qi_data.counts()[0] <= 1: # (almost) all positives
self.g_quantifiers[gi] = TrivialAcceptorQuantifier()
trivial += 1
trivial_data += len(qi_data)
else:
self.g_quantifiers[gi] = self.quantifier_fn().fit(qi_data)
print(f'trivials={trivial}')
print(f'trivials={trivial} amounting to {trivial_data*100.0/len(data):.2f}% of the data')
return self

View File

@@ -7,7 +7,7 @@ k=100
initsize=500
initprev=-1
seed=1
Q=GRPACC
Q=URBQ
CLS=lr
sampling=relevance_sampling