diff --git a/NewMethods/evaluate_results.py b/NewMethods/evaluate_results.py
index 2b8a4d0..ae71593 100644
--- a/NewMethods/evaluate_results.py
+++ b/NewMethods/evaluate_results.py
@@ -32,4 +32,4 @@ def evaluate_results(methods, datasets, error_name):
     print(f'Ave: {np.mean(all):.3f}')
 
 
-evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
+evaluate_results(methods=['*'], datasets=['*'], error_name='mae')
diff --git a/NewMethods/experiments.py b/NewMethods/experiments.py
index 40cdce7..df18d74 100644
--- a/NewMethods/experiments.py
+++ b/NewMethods/experiments.py
@@ -1,5 +1,6 @@
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
+from NewMethods.fgsld.fgsld_quantifiers import FakeFGLSD
 from classification.methods import PCALR
 from method.meta import QuaNet
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
@@ -36,8 +37,10 @@ def experimental_models():
     svmperf_params = {'C': __C_range}
     #yield 'paccsld', PACCSLD(newLR()), lr_params
     # yield 'hdysld', OneVsAll(HDySLD(newLR())), lr_params  # <-- promising!
-    yield 'PACC(5)', PACC(newLR(), val_split=5), {}
-    yield 'PACC(10)', PACC(newLR(), val_split=10), {}
+    #yield 'PACC(5)', PACC(newLR(), val_split=5), {}
+    #yield 'PACC(10)', PACC(newLR(), val_split=10), {}
+    yield 'FGSLD(3)', FakeFGLSD(newLR(), nbins=3, isomerous=False, recompute_bins=True), {}
+    yield 'FGSLD(5)', FakeFGLSD(newLR(), nbins=5, isomerous=False, recompute_bins=True), {}
 
 
@@ -209,7 +212,7 @@ if __name__ == '__main__':
     print(f'Result folder: {args.results}')
     np.random.seed(0)
 
-    optim_losses = ['mae', 'mrae']
+    optim_losses = ['mae']
     datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
 
     qp.util.parallel(run, itertools.product(optim_losses, datasets, experimental_models()), n_jobs=settings.N_JOBS)
diff --git a/NewMethods/fgsld/em.py b/NewMethods/fgsld/em.py
index 0f6ab6d..007c41c 100644
--- a/NewMethods/fgsld/em.py
+++ b/NewMethods/fgsld/em.py
@@ -5,7 +5,7 @@ from collections import namedtuple
 from sklearn.metrics import brier_score_loss
 from sklearn.preprocessing import MultiLabelBinarizer
 
-from metrics import smoothmacroF1, isometric_brier_decomposition, isomerous_brier_decomposition
+from NewMethods.fgsld.metrics import smoothmacroF1, isometric_brier_decomposition, isomerous_brier_decomposition
 
 History = namedtuple('History', ('posteriors', 'priors', 'y', 'iteration', 'stopping_criterium'))
 MeasureSingleHistory = namedtuple('MeasureSingleHistory', (
diff --git a/NewMethods/fgsld/fglsd_test.py b/NewMethods/fgsld/fglsd_test.py
index 72fb90d..d02f07a 100644
--- a/NewMethods/fgsld/fglsd_test.py
+++ b/NewMethods/fgsld/fglsd_test.py
@@ -1,13 +1,15 @@
 from sklearn.calibration import CalibratedClassifierCV
+from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
-from fgsld.fgsld_quantifiers import FakeFGLSD
+from fgsld_quantifiers import FakeFGLSD
 from method.aggregative import EMQ, CC
 import quapy as qp
+import numpy as np
 
 
 qp.environ['SAMPLE_SIZE'] = 500
 
-dataset = qp.datasets.fetch_reviews('kindle')
+dataset = qp.datasets.fetch_reviews('hp')
 qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
 
 training = dataset.training
@@ -15,22 +17,22 @@
 test = dataset.test
 
 cls = CalibratedClassifierCV(LinearSVC())
+#cls = LogisticRegression()
+
 
 method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
 
 for model, model_name in [
     (CC(cls), 'CC'),
-    # (FakeFGLSD(cls, nbins=5, isomerous=False, recompute_bins=False), 'FGSLD-isometric-stat-5'),
-    (FakeFGLSD(cls, nbins=5, isomerous=True, recompute_bins=True), 'FGSLD-isometric-dyn-5'),
-    # (FakeFGLSD(cls, nbins=5, isomerous=True, recompute_bins=False), 'FGSLD-isomerous-stat-5'),
-    # (FakeFGLSD(cls, nbins=10, isomerous=True, recompute_bins=True), 'FGSLD-isomerous-dyn-10'),
-    #(FakeFGLSD(cls, nbins=5, isomerous=False), 'FGSLD-5'),
-    #(FakeFGLSD(cls, nbins=10, isomerous=False), 'FGSLD-10'),
-    #(FakeFGLSD(cls, nbins=50, isomerous=False), 'FGSLD-50'),
-    #(FakeFGLSD(cls, nbins=100, isomerous=False), 'FGSLD-100'),
-#    (FakeFGLSD(cls, nbins=1, isomerous=False), 'FGSLD-1'),
-    #(FakeFGLSD(cls, nbins=10, isomerous=True), 'FGSLD-10-ISO'),
-    # (FakeFGLSD(cls, nbins=50, isomerous=False), 'FGSLD-50'),
+#    (FakeFGLSD(cls, nbins=20, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-20'),
+    (FakeFGLSD(cls, nbins=11, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-11'),
+    #(FakeFGLSD(cls, nbins=8, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-8'),
+    #(FakeFGLSD(cls, nbins=6, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-6'),
+    (FakeFGLSD(cls, nbins=5, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-5'),
+    #(FakeFGLSD(cls, nbins=4, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-4'),
+    (FakeFGLSD(cls, nbins=3, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-3'),
+#    (FakeFGLSD(cls, nbins=1, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-1'),
+#    (FakeFGLSD(cls, nbins=3, isomerous=False, recompute_bins=False), 'FGSLD-isometric-sta-3'),
     (EMQ(cls), 'SLD'),
 ]:
     print('running ', model_name)
@@ -42,6 +44,8 @@ for model, model_name in [
     true_prevs.append(true_prev)
     estim_prevs.append(estim_prev)
     tr_prevs.append(training.prevalence())
+    #if hasattr(model, 'iterations'):
+    #    print(f'iterations ave={np.mean(model.iterations):.3f}, min={np.min(model.iterations):.3f}, max={np.max(model.iterations):.3f}')
 
 
 qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, train_prev=tr_prevs[0], savepath='./plot_fglsd.png')
diff --git a/NewMethods/fgsld/fgsld_quantifiers.py b/NewMethods/fgsld/fgsld_quantifiers.py
index 7c76de9..94c8681 100644
--- a/NewMethods/fgsld/fgsld_quantifiers.py
+++ b/NewMethods/fgsld/fgsld_quantifiers.py
@@ -14,6 +14,7 @@ class FakeFGLSD(BaseQuantifier):
         self.nbins = nbins
         self.isomerous = isomerous
         self.recompute_bins = recompute_bins
+        self.iterations = []
 
     def fit(self, data: LabelledCollection):
         self.Xtr, self.ytr = data.Xy
@@ -24,6 +25,7 @@ class FakeFGLSD(BaseQuantifier):
         tr_priors = F.prevalence_from_labels(self.ytr, n_classes=2)
         fgsld = FineGrainedSLD(self.Xtr, instances, self.ytr, tr_priors, self.learner, n_bins=self.nbins)
         priors, posteriors = fgsld.run(self.isomerous, compute_bins_at_every_iter=self.recompute_bins)
+        self.iterations.append(fgsld.iterations)
         return priors
 
     def get_params(self, deep=True):
diff --git a/NewMethods/fgsld/fine_grained_sld.py b/NewMethods/fgsld/fine_grained_sld.py
index f45e125..cd3e38f 100644
--- a/NewMethods/fgsld/fine_grained_sld.py
+++ b/NewMethods/fgsld/fine_grained_sld.py
@@ -1,9 +1,9 @@
 import numpy as np
-from metrics import isomerous_bins, isometric_bins
-from em import History, get_measures_single_history
+from NewMethods.fgsld.metrics import isomerous_bins, isometric_bins
+from NewMethods.fgsld.em import History, get_measures_single_history
 from sklearn.model_selection import cross_val_predict
 import math
-
+from scipy.special import softmax
 
 class FineGrainedSLD:
     def __init__(self, x_tr, x_te, y_tr, tr_priors, clf, n_bins=10):
@@ -16,7 +16,7 @@ class FineGrainedSLD:
         self.history: [History] = []
         self.multi_class = False
 
-    def run(self, isomerous_binning, epsilon=1e-6, compute_bins_at_every_iter=True, return_posteriors_hist=False):
+    def run(self, isomerous_binning, epsilon=1e-6, compute_bins_at_every_iter=True):
         """
         Run the FGSLD algorithm.
 
@@ -26,22 +26,17 @@
-        :param return_posteriors_hist: whether to return posteriors at every iteration or not.
-        :return: If `return_posteriors_hist` is true, the returned posteriors will be a list of numpy arrays, else a single numpy array with posteriors at last iteration.
+        :return: a tuple (priors, posteriors), with the estimated class prevalences and the corrected posteriors at the last iteration.
         """
-        smoothing_tr = 1 / (2 * self.tr_preds.shape[0])
-        smoothing_te = 1 / (2 * self.te_preds.shape[0])
+        smoothing_tr = 1e-9  # 1 / (2 * self.tr_preds.shape[0])
+        smoothing_te = 1e-9  # 1 / (2 * self.te_preds.shape[0])
         s = 0
         tr_bin_priors = np.zeros((self.n_bins, self.tr_preds.shape[1]), dtype=np.float)
         te_bin_priors = np.zeros((self.n_bins, self.te_preds.shape[1]), dtype=np.float)
         tr_bins = self.__create_bins(training=True, isomerous_binning=isomerous_binning)
-        te_bins = self.__create_bins(training=False, isomerous_binning=isomerous_binning)
         self.__compute_bins_priors(tr_bin_priors, self.tr_preds, tr_bins, smoothing_tr)
+        te_preds_cp = self.te_preds.copy()
 
         val = 2 * epsilon
-        if return_posteriors_hist:
-            posteriors_hist = [self.te_preds.copy()]
         while not val < epsilon and s < 1000:
-            assert np.all(np.around(self.te_preds.sum(axis=1), 4) == 1), f"Probabilities do not sum to 1:\ns={s}, " \
-                                                                         f"probs={self.te_preds.sum(axis=1)}"
-            if compute_bins_at_every_iter:
+            if compute_bins_at_every_iter or s == 0:
                 te_bins = self.__create_bins(training=False, isomerous_binning=isomerous_binning)
 
             if s == 0:
@@ -50,34 +45,47 @@
                 te_bin_priors_prev = te_bin_priors.copy()
             self.__compute_bins_priors(te_bin_priors, self.te_preds, te_bins, smoothing_te)
 
-            te_preds_cp = self.te_preds.copy()
             for label_idx, bins in te_bins.items():
                 for i, bin_ in enumerate(bins):
                     if bin_.shape[0] == 0:
                         continue
-                    te = te_bin_priors[i][label_idx]
-                    tr = tr_bin_priors[i][label_idx]
-                    # local_min = (math.floor(tr * 10) / 10)
+                    alpha = 1
+                    beta = 0.1
+                    local_te = te_bin_priors[i][label_idx]
+                    global_te = self.te_preds[:, label_idx].mean()
+                    te = local_te*alpha + global_te*(1-alpha)
+                    local_tr = tr_bin_priors[i][label_idx]
+                    global_tr = self.tr_priors[label_idx]
+                    tr = local_tr*beta + global_tr*(1-beta)
+                    #local_min = (math.floor(tr * self.n_bins) / self.n_bins)
                     # local_max = local_min + .1
                     # trans = lambda l: min(max((l - local_min) / 1, 0), 1)
-                    trans = lambda l: l
-                    self.te_preds[:, label_idx][bin_] = (te_preds_cp[:, label_idx][bin_]) * \
-                                                        (trans(te) / trans(tr))
+                    assert not isomerous_binning, 'not tested'
+                    #trans = lambda l: l - local_min
+                    # trans = lambda l: l
+                    # ratio = (trans(te) / trans(tr))
+                    #ratio = np.clip(ratio, 0.1, 2)
+                    #ratio = ratio**3
+                    #self.te_preds[:, label_idx][bin_] = (te_preds_cp[:, label_idx][bin_]) * ratio
+                    old_posterior = te_preds_cp[:, label_idx][bin_]
+                    lr = 1
+                    self.te_preds[:, label_idx][bin_] = np.clip(old_posterior + (te - tr) * lr, 0, None)
+                    #self.te_preds[:, label_idx][bin_] = (te_preds_cp[:, label_idx][bin_]) * ratio
 
             # Normalization step
             self.te_preds = (self.te_preds / self.te_preds.sum(axis=1, keepdims=True))
+            #self.te_preds = softmax(self.te_preds, axis=1)
 
-            val = 0
-            for label_idx in range(te_bin_priors.shape[1]):
-                temp = max(abs((te_bin_priors[:, label_idx] / te_bin_priors_prev[:, label_idx]) - 1))
-                if temp > val:
-                    val = temp
+            val = np.max(np.abs(te_bin_priors / te_bin_priors_prev - 1))
             s += 1
+
+        self.iterations = s
+
+        priors = self.te_preds.mean(axis=0)
+        posteriors = self.te_preds
+
-        if return_posteriors_hist:
-            posteriors_hist.append(self.te_preds.copy())
-        if return_posteriors_hist:
-            return self.te_preds.mean(axis=0), posteriors_hist
-        return self.te_preds.mean(axis=0), self.te_preds
+        return priors, posteriors
 
     def __compute_bins_priors(self, bin_priors_placeholder, posteriors, bins, smoothing):
@@ -85,23 +93,10 @@
     def __compute_bins_priors(self, bin_priors_placeholder, posteriors, bins, smoothing):
         for label_idx, bins in bins.items():
             for i, bin_ in enumerate(bins):
                 if bin_.shape[0] == 0:
                     bin_priors_placeholder[i, label_idx] = smoothing
                     continue
-                numerator = posteriors[:, label_idx][bin_].mean()
+                numerator = posteriors[bin_, label_idx].mean()
                 bin_prior = (numerator + smoothing) / (1 + self.n_bins * smoothing)  # normalize priors
                 bin_priors_placeholder[i, label_idx] = bin_prior
 
-    def __find_bin_idx(self, label_bins: [np.array], idx: int or list):
-        if hasattr(idx, '__len__'):
-            idxs = np.zeros(len(idx), dtype=np.int)
-            for i, bin_ in enumerate(label_bins):
-                for j, id_ in enumerate(idx):
-                    if id_ in bin_:
-                        idxs[j] = i
-            return idxs
-        else:
-            for i, bin_ in enumerate(label_bins):
-                if idx in bin_:
-                    return i
-
     def __create_bins(self, training: bool, isomerous_binning: bool):
         bins = {}
         preds = self.tr_preds if training else self.te_preds
@@ -111,6 +106,6 @@
         else:
             intervals = np.linspace(0., 1., num=self.n_bins, endpoint=False)
         for label_idx in range(preds.shape[1]):
-            bins_ = isometric_bins(label_idx, preds, intervals, 0.1)
+            bins_ = isometric_bins(label_idx, preds, intervals)
             bins[label_idx] = [bins_[i] for i in intervals]
         return bins
diff --git a/NewMethods/fgsld/metrics.py b/NewMethods/fgsld/metrics.py
index c95e757..5a2662e 100644
--- a/NewMethods/fgsld/metrics.py
+++ b/NewMethods/fgsld/metrics.py
@@ -73,10 +73,19 @@ def brier_decomposition(bins, true_labels, predicted_labels, class_=1):
     return calibration_score / (labels_len * len(bins)), refinement_score / (labels_len * len(bins))
 
 
-def isometric_bins(label_index, predicted_labels, bin_intervals, step):
+#def isometric_bins(label_index, predicted_labels, bin_intervals, step):
+#    predicted_class_label = predicted_labels[:, label_index]
+#    return {interv: np.where(np.logical_and(interv <= predicted_class_label, predicted_class_label < interv + step))[0]
+#            for interv in bin_intervals}
+
+def isometric_bins(label_index, predicted_labels, bin_intervals):
+    def next_intv(i):
+        return bin_intervals[i + 1] if (i + 1) < len(bin_intervals) else 1.
     predicted_class_label = predicted_labels[:, label_index]
-    return {interv: np.where(np.logical_and(interv <= predicted_class_label, predicted_class_label < interv + step))[0]
-            for interv in bin_intervals}
+    return {
+        interv: np.where(np.logical_and(interv <= predicted_class_label, predicted_class_label < next_intv(i)))[0]
+        for i, interv in enumerate(bin_intervals)
+    }
 
 
 def isomerous_bins(label_index, predicted_labels, n):
diff --git a/NewMethods/fgsld/plot_fglsd.png b/NewMethods/fgsld/plot_fglsd.png
index d1dd2d6..a48f09f 100644
Binary files a/NewMethods/fgsld/plot_fglsd.png and b/NewMethods/fgsld/plot_fglsd.png differ
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 392f866..31a061c 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -352,6 +352,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
 
     @classmethod
     def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON):
+        #print('training-priors', tr_prev)
         Px = posterior_probabilities
         Ptr = np.copy(tr_prev)
         qs = np.copy(Ptr)  # qs (the running estimate) is initialized as the training prevalence
@@ -359,11 +360,14 @@
         s, converged = 0, False
         qs_prev_ = None
         while not converged and s < EMQ.MAX_ITER:
-            # E-step: ps is Ps(y=+1|xi)
+            #print('iter: ', s)
+            # E-step: ps is Ps(y|xi)
             ps_unnormalized = (qs / Ptr) * Px
-            ps = ps_unnormalized / ps_unnormalized.sum(axis=1).reshape(-1,1)
+            ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True)
+            #print(f'\tratio=', qs / Ptr)
+            #print(f'\torigin_posteriors ', Px)
 
-            # M-step: qs_pos is Ps+1(y=+1)
+            # M-step: qs is the new estimate of P(y), the mean of the E-step posteriors
             qs = ps.mean(axis=0)
 
             if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s>10:
@@ -373,7 +377,6 @@
             s += 1
 
         if not converged:
-            #raise UserWarning('the method has reached the maximum number of iterations; it might have not converged')
             print('[warning] the method has reached the maximum number of iterations; it might have not converged')
 
         return qs, ps