forked from moreo/QuaPy
quantification stumps
This commit is contained in:
parent
970008c9f7
commit
ee007bd0d5
|
@ -16,6 +16,17 @@ import os
|
||||||
from scipy.stats import ttest_rel
|
from scipy.stats import ttest_rel
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
The idea of this method is to make a first guess of the test class distribution (maybe with PACC) and then
|
||||||
|
train a method without adjustment (maybe PCC) setting the class_weight param in such a way that best compensates
|
||||||
|
for the positive and negative contribution wrt the guessed distribution. The method can be iterative, though I
|
||||||
|
have not seen any major improvements (if any at all) in doing more than 1 iteration.
|
||||||
|
This file is the proof of concept with artificial data and nice plots. The quantifier is implemented in file
|
||||||
|
class_weight_model.py.
|
||||||
|
So far, it looks like it works for artificial datasets, and for UCI (without model selection for now) it works better than PACC.
|
||||||
|
For reviews it does not improve over PACC though.
|
||||||
|
"""
|
||||||
|
|
||||||
x_min, x_max = 0, 11
|
x_min, x_max = 0, 11
|
||||||
y_min, y_max = 0, x_max
|
y_min, y_max = 0, x_max
|
||||||
center0 = (2*x_max/5,2*x_max/5)
|
center0 = (2*x_max/5,2*x_max/5)
|
||||||
|
|
|
@ -0,0 +1,87 @@
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
import numpy as np
|
||||||
|
import quapy as qp
|
||||||
|
import quapy.functional as F
|
||||||
|
from data import LabelledCollection
|
||||||
|
from method.aggregative import ACC
|
||||||
|
from method.base import BaseQuantifier
|
||||||
|
from tqdm import tqdm
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
# Fetch the 'kindle' reviews dataset with tf-idf enabled and min_df=10; the
# returned object is accessed below through its `.training` and `.test` parts.
data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10)
|
||||||
|
|
||||||
|
class DecisionStump(BaseEstimator):
    """Training-free binary classifier that thresholds one feature column.

    An instance is labelled 1 when the value stored in column ``feat_id`` is
    strictly positive, and 0 otherwise.
    """

    def __init__(self, feat_id):
        self.feat_id = feat_id
        # The label set is fixed and exposed right away, since fit() learns nothing.
        self.classes_ = np.asarray((0, 1), dtype=int)

    def fit(self, X, y):
        """Do nothing and return ``self``; the decision rule has no parameters."""
        return self

    def predict(self, X):
        """Return a flat int array holding 1 where the selected feature is > 0.

        ``X`` must support ``X[:, j].toarray()`` (the column is densified
        before thresholding).
        """
        column = X[:, self.feat_id]
        dense_values = column.toarray().flatten()
        is_positive = dense_values > 0
        return is_positive.astype(int)
|
||||||
|
|
||||||
|
|
||||||
|
class QuantificationStump(BaseQuantifier):
    """Quantifier that wraps a single-feature ``DecisionStump`` in ``ACC``.

    Parameters
    ----------
    feat_id
        Index of the feature column the underlying stump thresholds.
    """

    def __init__(self, feat_id):
        self.feat_id = feat_id

    def fit(self, data: LabelledCollection):
        """Build the ACC quantifier on top of the stump and return ``self``.

        The stump has nothing to learn, so ACC is asked not to fit the learner
        (``fit_learner=False``) and ``data`` itself is passed as ``val_split``.
        """
        self.qs = ACC(DecisionStump(self.feat_id))
        self.qs.fit(data, fit_learner=False, val_split=data)
        self.classes = data.classes_
        return self

    def quantify(self, instances):
        """Delegate to the wrapped ACC quantifier and return its estimate."""
        return self.qs.quantify(instances)

    def set_params(self, **parameters):
        """Not supported.

        Raises
        ------
        NotImplementedError
            Always.
        """
        # `NotImplemented` is a non-callable constant meant for binary operators;
        # the exception class to raise here is `NotImplementedError`.
        raise NotImplementedError('set_params is not supported by QuantificationStump')

    def get_params(self, deep=True):
        """Not supported.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError('get_params is not supported by QuantificationStump')

    @property
    def classes_(self):
        """Class labels recorded from the collection passed to ``fit``."""
        return self.classes
|
||||||
|
|
||||||
|
|
||||||
|
# Split the training set into a part used to fit the stumps and a held-out
# part (dev) used to rank them.
train, dev = data.training.split_stratified()
# NOTE(review): presumably a 1000-item sample with class prevalences 0.3 / 0.7
# -- confirm against LabelledCollection.sampling.
test = data.test.sampling(1000, 0.3, 0.7)

print(f'test prevalence = {F.strprev(test.prevalence())}')

nF = train.instances.shape[1]

# Evaluation protocol on dev: the score matrix and the progress-bar total are
# both sized as n_prevalences * repeats, so keep them derived from one place.
n_prevalences, repeats = 11, 5
n_samples = n_prevalences * repeats

# One quantification stump per feature.
qs = np.asarray([QuantificationStump(i).fit(train) for i in tqdm(range(nF))])

# scores[i, j] = absolute error of stump i on the j-th dev sample.
scores = np.zeros(shape=(nF, n_samples))
dev_samples = dev.artificial_sampling_generator(500, n_prevalences=n_prevalences, repeats=repeats)
for j, dev_sample in tqdm(enumerate(dev_samples), total=n_samples):
    sample_prev = dev_sample.prevalence()
    for i, qs_i in enumerate(qs):
        # Quantify the drawn sample itself: its prevalence is the reference the
        # estimate is compared against (the whole dev split would give the same
        # estimate for every j).
        estim_prev = qs_i.quantify(dev_sample.instances)
        error = qp.error.ae(sample_prev, estim_prev)
        scores[i, j] = error

# Keep the k stumps with the lowest mean error across dev samples.
k = 250
scores = scores.mean(axis=1)
order = np.argsort(scores)
qs = qs[order][:k]

# Entry [1] of each estimate is collected (presumably the positive-class
# prevalence -- confirm against the quantifier's output layout).
prevs = np.asarray([qs_i.quantify(test.instances)[1] for qs_i in tqdm(qs)])

print(f'test estimation mean {prevs.mean():.3f}, median = {np.median(prevs)}')
|
||||||
|
|
||||||
|
# sns.histplot(data=prevs, binwidth=3)
|
||||||
|
# An "interface" to matplotlib.axes.Axes.hist() method
|
||||||
|
# n, bins, patches = plt.hist(x=prevs, bins='auto', alpha=0.7)
|
||||||
|
# plt.grid(axis='y', alpha=0.75)
|
||||||
|
# plt.xlabel('Value')
|
||||||
|
# plt.ylabel('Frequency')
|
||||||
|
# plt.title('My Very Own Histogram')
|
||||||
|
# maxfreq = n.max()
|
||||||
|
# Set a clean upper y-axis limit.
|
||||||
|
# plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
|
||||||
|
# plt.show()
|
Loading…
Reference in New Issue