diff --git a/NewMethods/ClassWeightQuantification.py b/NewMethods/ClassWeightQuantification.py index 1b75f3e..456f2e8 100644 --- a/NewMethods/ClassWeightQuantification.py +++ b/NewMethods/ClassWeightQuantification.py @@ -16,6 +16,17 @@ import os from scipy.stats import ttest_rel +""" +The idea of this method is to make a first guess of the test class distribution (maybe with PACC) and then +train a method without adjustment (maybe PCC) setting the class_weight param in such a way that best compensates +for the positive and negative contribution wrt the guessed distribution. The method can be iterative, though I +have not seen any major inprovements (if at all) in doing more than 1 iteration. +This file is the proof of concept with artificial data and nice plots. The quantifier is implemented in file +class_weight_model.py. +So far, it looks like for artificial datasets works, for UCI (without model selection for now) works better than PACC. +For reviews it does not improve over PACC though. +""" + x_min, x_max = 0, 11 y_min, y_max = 0, x_max center0 = (2*x_max/5,2*x_max/5) diff --git a/NewMethods/quantification_stumps.py b/NewMethods/quantification_stumps.py new file mode 100644 index 0000000..a2256b5 --- /dev/null +++ b/NewMethods/quantification_stumps.py @@ -0,0 +1,87 @@ +from sklearn.base import BaseEstimator +import numpy as np +import quapy as qp +import quapy.functional as F +from data import LabelledCollection +from method.aggregative import ACC +from method.base import BaseQuantifier +from tqdm import tqdm +import seaborn as sns +import matplotlib.pyplot as plt + + +data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10) + +class DecisionStump(BaseEstimator): + def __init__(self, feat_id): + self.feat_id = feat_id + self.classes_ = np.asarray([0,1], dtype=int) + + def fit(self, X, y): + return self + + def predict(self, X): + return (X[:,self.feat_id].toarray().flatten()>0).astype(int) + + +class QuantificationStump(BaseQuantifier): + def __init__(self, feat_id): + self.feat_id = feat_id + + def fit(self, data: LabelledCollection): + self.qs = ACC(DecisionStump(self.feat_id)) + self.qs.fit(data, fit_learner=False, val_split=data) + self.classes = data.classes_ + return self + + def quantify(self, instances): + return self.qs.quantify(instances) + + def set_params(self, **parameters): + raise NotImplemented() + + def get_params(self, deep=True): + raise NotImplemented() + + @property + def classes_(self): + return self.classes + + +train, dev = data.training.split_stratified() +test = data.test.sampling(1000, 0.3, 0.7) + +print(f'test prevalence = {F.strprev(test.prevalence())}') + +nF = train.instances.shape[1] + +qs_scores = [] +qs = np.asarray([QuantificationStump(i).fit(train) for i in tqdm(range(nF))]) +scores = np.zeros(shape=(nF, 11*5)) +for j, dev_sample in tqdm(enumerate(dev.artificial_sampling_generator(500, n_prevalences=11, repeats=5)), total=11*5): + sample_prev = dev_sample.prevalence() + for i, qs_i in enumerate(qs): + estim_prev = qs_i.quantify(dev.instances) + error = qp.error.ae(sample_prev, estim_prev) + scores[i,j] = error + +k=250 +scores = scores.mean(axis=1) +order = np.argsort(scores) +qs = qs[order][:k] + +prevs = np.asarray([qs_i.quantify(test.instances)[1] for qs_i in tqdm(qs)]) + +print(f'test estimation mean {prevs.mean():.3f}, median = {np.median(prevs)}') + +# sns.histplot(data=prevs, binwidth=3) +# An "interface" to matplotlib.axes.Axes.hist() method +# n, bins, patches = plt.hist(x=prevs, bins='auto', alpha=0.7) +# plt.grid(axis='y', alpha=0.75) +# plt.xlabel('Value') +# plt.ylabel('Frequency') +# plt.title('My Very Own Histogram') +# maxfreq = n.max() +# Set a clean upper y-axis limit. +# plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10) +# plt.show() \ No newline at end of file