diff --git a/plot_example.py b/plot_example.py
new file mode 100644
index 0000000..346455e
--- /dev/null
+++ b/plot_example.py
@@ -0,0 +1,48 @@
+from sklearn.model_selection import GridSearchCV
+import numpy as np
+import quapy as qp
+from sklearn.linear_model import LogisticRegression
+
+sample_size = 500
+qp.environ['SAMPLE_SIZE'] = sample_size
+
+
+
+def gen_data():
+
+    data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
+
+    models = [
+        qp.method.aggregative.CC,
+        qp.method.aggregative.ACC,
+        qp.method.aggregative.PCC,
+        qp.method.aggregative.PACC,
+        qp.method.aggregative.HDy,
+        qp.method.aggregative.EMQ,
+        qp.method.meta.ECC,
+        qp.method.meta.EACC,
+        qp.method.meta.EHDy,
+    ]
+
+    method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
+    for Quantifier in models:
+        print(f'training {Quantifier.__name__}')
+        lr = LogisticRegression(max_iter=1000, class_weight='balanced')
+        # lr = GridSearchCV(lr, param_grid={'C':np.logspace(-3,3,7)}, n_jobs=-1)
+        model = Quantifier(lr).fit(data.training)
+        true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(
+            model, data.test, sample_size, n_repetitions=20, n_prevpoints=11)
+
+        method_names.append(Quantifier.__name__)
+        true_prevs.append(true_prev)
+        estim_prevs.append(estim_prev)
+        tr_prevs.append(data.training.prevalence())
+
+    return method_names, true_prevs, estim_prevs, tr_prevs
+
+method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('./plots/plot_data.pkl', gen_data)
+
+qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=11, savepath='./plots/err_drift.png')
+qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots/bin_diag.png')
+qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
+qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=11, savepath='./plots/bin_bias_bin.png')
diff --git a/quapy/__init__.py b/quapy/__init__.py
index ac09f24..a2f98fd 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -4,6 +4,8 @@ from . import functional
 from . import method
 from . import data
 from . import evaluation
+from . import plot
+from . import util
 from method.aggregative import isaggregative, isprobabilistic
 
 
@@ -17,4 +19,4 @@ environ = {
 
 
 def isbinary(x):
-    return data.isbinary(x) or method.aggregative.isbinary(x)
\ No newline at end of file
+    return data.isbinary(x) or method.isbinary(x)
\ No newline at end of file
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index 4c56b1e..3fc3907 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -30,9 +30,9 @@ def artificial_sampling_prediction(
     :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
     any other random process.
     :param verbose: if True, shows a progress bar
-    :return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
+    :return: two ndarrays of shape (m,n) with m the number of samples (n_prevpoints*n_repetitions) and n the
     number of classes. The first one contains the true prevalences for the samples generated while the second one
-    containing the the prevalences estimations
+    contains the prevalence estimates
     """
 
     with temp_seed(random_seed):
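The two arrays returned by artificial_sampling_prediction line up row by row, which is what makes the plotting code later in this patch possible. As a minimal sketch of that contract (the helper below is illustrative, not part of the API), a mean absolute error over all generated samples reduces to plain NumPy:

    import numpy as np

    # true_prevs and estim_prevs stand for the two (m, n) arrays returned by
    # artificial_sampling_prediction, with m = n_prevpoints * n_repetitions
    # samples and n classes (illustrative names, not part of the API)
    def mean_absolute_error(true_prevs, estim_prevs):
        # absolute error per sample and class, averaged over classes, then over samples
        return np.abs(true_prevs - estim_prevs).mean(axis=1).mean()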
diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py
index f6bc8d5..47d2b3a 100644
--- a/quapy/method/__init__.py
+++ b/quapy/method/__init__.py
@@ -5,13 +5,13 @@ from . import meta
 
 AGGREGATIVE_METHODS = {
-    aggregative.ClassifyAndCount,
-    aggregative.AdjustedClassifyAndCount,
-    aggregative.ProbabilisticClassifyAndCount,
-    aggregative.ProbabilisticAdjustedClassifyAndCount,
-    aggregative.ExplicitLossMinimisation,
-    aggregative.ExpectationMaximizationQuantifier,
-    aggregative.HellingerDistanceY
+    aggregative.CC,
+    aggregative.ACC,
+    aggregative.PCC,
+    aggregative.PACC,
+    aggregative.ELM,
+    aggregative.EMQ,
+    aggregative.HDy
 }
 
 
 NON_AGGREGATIVE_METHODS = {
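These renames (and the matching ones throughout the patch) keep the long class names alive as aliases, defined at the end of the aggregative.py diff below, so code written against the old names should be unaffected. A quick check of that guarantee:

    import quapy as qp

    # both spellings point to the very same class objects
    assert qp.method.aggregative.ClassifyAndCount is qp.method.aggregative.CC
    assert qp.method.aggregative.HellingerDistanceY is qp.method.aggregative.HDy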
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index c973613..2613d8e 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -1,5 +1,6 @@
 import numpy as np
 from copy import deepcopy
+from sklearn.base import BaseEstimator, clone
 import functional as F
 import error
 from method.base import BaseQuantifier, BinaryQuantifier
@@ -130,13 +131,13 @@ def training_helper(learner,
 # Methods
 # ------------------------------------
 
-class ClassifyAndCount(AggregativeQuantifier):
+class CC(AggregativeQuantifier):
     """
-    The most basic Quantification method. One that simply classifies all instances and countes how many have been
-    attributed each of the classes in order to compute class prevalence estimates.
+    The most basic quantification method. It simply classifies all instances and counts how many have been
+    attributed to each of the classes in order to compute class prevalence estimates.
     """
 
-    def __init__(self, learner):
+    def __init__(self, learner:BaseEstimator):
         self.learner = learner
 
     def fit(self, data: LabelledCollection, fit_learner=True):
@@ -153,9 +154,9 @@ class ClassifyAndCount(AggregativeQuantifier):
         return F.prevalence_from_labels(classif_predictions, self.n_classes)
 
 
-class AdjustedClassifyAndCount(AggregativeQuantifier):
+class ACC(AggregativeQuantifier):
 
-    def __init__(self, learner):
+    def __init__(self, learner:BaseEstimator):
         self.learner = learner
 
     def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
@@ -169,7 +170,7 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
         :return: self
         """
         self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
-        self.cc = ClassifyAndCount(self.learner)
+        self.cc = CC(self.learner)
         y_ = self.classify(validation.instances)
         y = validation.labels
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
@@ -182,7 +183,7 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
 
     def aggregate(self, classif_predictions):
         prevs_estim = self.cc.aggregate(classif_predictions)
-        return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
+        return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
 
     @classmethod
     def solve_adjustment(cls, PteCondEstim, prevs_estim):
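For context on solve_adjustment: the CC estimate p_hat relates to the true prevalence p through p_hat = M p, where M[i,j] = P(predicted=i | true=j) is the misclassification matrix estimated on the validation split (Pte_cond_estim_ above), and ACC inverts that relation. A toy sketch with made-up numbers (the actual method may additionally clip or renormalize the solution):

    import numpy as np

    M = np.asarray([[0.9, 0.2],
                    [0.1, 0.8]])        # hypothetical P(predicted=i | true=j)
    p_true = np.asarray([0.3, 0.7])
    p_hat = M @ p_true                  # what CC would report in expectation
    print(np.linalg.solve(M, p_hat))    # recovers [0.3, 0.7]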
""" - def __init__(self, learner): + def __init__(self, learner:BaseEstimator): self.learner = learner def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3): @@ -353,7 +354,7 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier): return np.asarray([1-pos_class_prev, pos_class_prev]) -class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier): +class ELM(AggregativeQuantifier, BinaryQuantifier): def __init__(self, svmperf_base, loss, **kwargs): self.svmperf_base = svmperf_base @@ -374,38 +375,38 @@ class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier): -class SVMQ(ExplicitLossMinimisation): +class SVMQ(ELM): def __init__(self, svmperf_base, **kwargs): super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs) -class SVMKLD(ExplicitLossMinimisation): +class SVMKLD(ELM): def __init__(self, svmperf_base, **kwargs): super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs) -class SVMNKLD(ExplicitLossMinimisation): +class SVMNKLD(ELM): def __init__(self, svmperf_base, **kwargs): super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs) -class SVMAE(ExplicitLossMinimisation): +class SVMAE(ELM): def __init__(self, svmperf_base, **kwargs): super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs) -class SVMRAE(ExplicitLossMinimisation): +class SVMRAE(ELM): def __init__(self, svmperf_base, **kwargs): super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs) -CC = ClassifyAndCount -ACC = AdjustedClassifyAndCount -PCC = ProbabilisticClassifyAndCount -PACC = ProbabilisticAdjustedClassifyAndCount -ELM = ExplicitLossMinimisation -EMQ = ExpectationMaximizationQuantifier -HDy = HellingerDistanceY +ClassifyAndCount = CC +AdjustedClassifyAndCount = ACC +ProbabilisticClassifyAndCount = PCC +ProbabilisticAdjustedClassifyAndCount = PACC +ExplicitLossMinimisation = ELM +ExpectationMaximizationQuantifier = EMQ +HellingerDistanceY = HDy class OneVsAll(AggregativeQuantifier): @@ -414,7 +415,7 @@ class OneVsAll(AggregativeQuantifier): quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1. This variant was used, along with the ExplicitLossMinimization quantifier in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. - Social Network Analysis and Mining6(19), 1–22 (2016) + Social Network Analysis and Mining 6(19), 1–22 (2016) """ def __init__(self, binary_quantifier, n_jobs=-1): @@ -484,15 +485,14 @@ class OneVsAll(AggregativeQuantifier): self.dict_binary_quantifiers[c].fit(bindata) -def isaggregative(model): +def isaggregative(model:BaseQuantifier): return isinstance(model, AggregativeQuantifier) -def isprobabilistic(model): +def isprobabilistic(model:BaseQuantifier): return isinstance(model, AggregativeProbabilisticQuantifier) -def isbinary(model): - return isinstance(model, BinaryQuantifier) + diff --git a/quapy/method/base.py b/quapy/method/base.py index 6134dea..2627b87 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -20,12 +20,24 @@ class BaseQuantifier(metaclass=ABCMeta): @abstractmethod def get_params(self, deep=True): ... + @property + def binary(self): + return False + class BinaryQuantifier(BaseQuantifier): def _check_binary(self, data: LabelledCollection, quantifier_name): assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \ f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.' 
@@ -299,14 +300,14 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
         return qs
 
 
-class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
+class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     """
     Implementation of the method based on the Hellinger Distance y (HDy) proposed by
     González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution estimation based on the
     Hellinger distance. Information Sciences, 218:146-164.
     """
 
-    def __init__(self, learner):
+    def __init__(self, learner:BaseEstimator):
         self.learner = learner
 
     def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
@@ -353,7 +354,7 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         return np.asarray([1-pos_class_prev, pos_class_prev])
 
 
-class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):
+class ELM(AggregativeQuantifier, BinaryQuantifier):
 
     def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
@@ -374,38 +375,38 @@
 
 
-class SVMQ(ExplicitLossMinimisation):
+class SVMQ(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
 
 
-class SVMKLD(ExplicitLossMinimisation):
+class SVMKLD(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
 
 
-class SVMNKLD(ExplicitLossMinimisation):
+class SVMNKLD(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
 
 
-class SVMAE(ExplicitLossMinimisation):
+class SVMAE(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs)
 
 
-class SVMRAE(ExplicitLossMinimisation):
+class SVMRAE(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
 
 
-CC = ClassifyAndCount
-ACC = AdjustedClassifyAndCount
-PCC = ProbabilisticClassifyAndCount
-PACC = ProbabilisticAdjustedClassifyAndCount
-ELM = ExplicitLossMinimisation
-EMQ = ExpectationMaximizationQuantifier
-HDy = HellingerDistanceY
+ClassifyAndCount = CC
+AdjustedClassifyAndCount = ACC
+ProbabilisticClassifyAndCount = PCC
+ProbabilisticAdjustedClassifyAndCount = PACC
+ExplicitLossMinimisation = ELM
+ExpectationMaximizationQuantifier = EMQ
+HellingerDistanceY = HDy
@@ -414,7 +415,7 @@
 class OneVsAll(AggregativeQuantifier):
     """
     Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
-    quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
-    This variant was used, along with the ExplicitLossMinimization quantifier in
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
+    This variant was used, along with the ExplicitLossMinimisation quantifier, in
     Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
-    Social Network Analysis and Mining6(19), 1–22 (2016)
+    Social Network Analysis and Mining 6(19), 1–22 (2016)
     """
 
     def __init__(self, binary_quantifier, n_jobs=-1):
@@ -484,15 +485,14 @@
         self.dict_binary_quantifiers[c].fit(bindata)
 
 
-def isaggregative(model):
+def isaggregative(model:BaseQuantifier):
     return isinstance(model, AggregativeQuantifier)
 
 
-def isprobabilistic(model):
+def isprobabilistic(model:BaseQuantifier):
     return isinstance(model, AggregativeProbabilisticQuantifier)
 
 
-def isbinary(model):
-    return isinstance(model, BinaryQuantifier)
+
diff --git a/quapy/method/base.py b/quapy/method/base.py
index 6134dea..2627b87 100644
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@@ -20,12 +20,24 @@ class BaseQuantifier(metaclass=ABCMeta):
     @abstractmethod
     def get_params(self, deep=True): ...
 
+    @property
+    def binary(self):
+        return False
+
 
 class BinaryQuantifier(BaseQuantifier):
 
     def _check_binary(self, data: LabelledCollection, quantifier_name):
         assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
-                            f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'
+                            f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'
 
+    @property
+    def binary(self):
+        return True
+
+
+def isbinary(model:BaseQuantifier):
+    return model.binary
+
 
 # class OneVsAll(AggregativeQuantifier):
 # """
diff --git a/quapy/method/neural.py b/quapy/method/neural.py
index da4b35c..9adf175 100644
--- a/quapy/method/neural.py
+++ b/quapy/method/neural.py
@@ -76,11 +76,11 @@ class QuaNetTrainer(BaseQuantifier):
         self.tr_prev = data.prevalence()
 
         self.quantifiers = {
-            'cc': ClassifyAndCount(self.learner).fit(data, fit_learner=False),
-            'acc': AdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
-            'pcc': ProbabilisticClassifyAndCount(self.learner).fit(data, fit_learner=False),
-            'pacc': ProbabilisticAdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
-            'emq': ExpectationMaximizationQuantifier(self.learner).fit(data, fit_learner=False),
+            'cc': CC(self.learner).fit(data, fit_learner=False),
+            'acc': ACC(self.learner).fit(data, fit_learner=False),
+            'pcc': PCC(self.learner).fit(data, fit_learner=False),
+            'pacc': PACC(self.learner).fit(data, fit_learner=False),
+            'emq': EMQ(self.learner).fit(data, fit_learner=False),
         }
 
         self.status = {
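A note on the isbinary relocation in base.py above: binariness is now a polymorphic `binary` property rather than an isinstance check against BinaryQuantifier, so any quantifier (including meta-quantifiers that wrap binary methods) can declare it by overriding the property. Duck typing makes this easy to see (minimal illustration; the import path is an assumption based on the package layout shown in this patch):

    from quapy.method.base import isbinary

    class Stub:
        binary = True   # any object exposing `binary` participates

    assert isbinary(Stub())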
diff --git a/quapy/plot.py b/quapy/plot.py
new file mode 100644
index 0000000..2b9375b
--- /dev/null
+++ b/quapy/plot.py
@@ -0,0 +1,202 @@
+from collections import defaultdict
+import matplotlib.pyplot as plt
+from matplotlib import cm
+import numpy as np
+import quapy as qp
+
+
+plt.rcParams['figure.figsize'] = [12, 8]
+plt.rcParams['figure.dpi'] = 200
+
+
+def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
+    fig, ax = plt.subplots()
+    ax.set_aspect('equal')
+    ax.grid()
+    ax.plot([0, 1], [0, 1], '--k', label='ideal', zorder=1)
+
+    for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
+        true_prev = true_prev[:,pos_class]
+        estim_prev = estim_prev[:,pos_class]
+
+        x_ticks = np.unique(true_prev)
+        x_ticks.sort()
+        y_ave = np.asarray([estim_prev[true_prev == x].mean() for x in x_ticks])
+        y_std = np.asarray([estim_prev[true_prev == x].std() for x in x_ticks])
+
+        ax.errorbar(x_ticks, y_ave, fmt='-', marker='o', label=method, markersize=3, zorder=2)
+        ax.fill_between(x_ticks, y_ave - y_std, y_ave + y_std, alpha=0.25)
+
+    ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
+    ax.set_ylim(0, 1)
+    ax.set_xlim(0, 1)
+
+    box = ax.get_position()
+    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+    save_or_show(savepath)
+
+
+def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
+    fig, ax = plt.subplots()
+    ax.grid()
+
+    data, labels = [], []
+    for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
+        true_prev = true_prev[:,pos_class]
+        estim_prev = estim_prev[:,pos_class]
+        data.append(estim_prev-true_prev)
+        labels.append(method)
+
+    ax.boxplot(data, labels=labels, patch_artist=False, showmeans=True)
+    ax.set(ylabel='error bias', title=title)
+
+    save_or_show(savepath)
+
+
+def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10,
+                     vertical_xticks=False, savepath=None):
+    from pylab import boxplot, plot, setp
+
+    fig, ax = plt.subplots()
+    ax.grid()
+
+    bins = np.linspace(0, 1, nbins)
+    binwidth = 1/(nbins - 1)
+    data = {}
+    for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
+        true_prev = true_prev[:,pos_class]
+        estim_prev = estim_prev[:,pos_class]
+
+        data[method] = []
+        inds = np.digitize(true_prev, bins, right=True)
+        for ind in range(len(bins)):
+            selected = inds==ind
+            data[method].append(estim_prev[selected] - true_prev[selected])
+
+    nmethods = len(method_names)
+    boxwidth = binwidth/(nmethods+1)
+    for i,bin in enumerate(bins[:-1]):
+        boxdata = [data[method][i] for method in method_names]
+        positions = [bin+(i*boxwidth)+boxwidth for i,_ in enumerate(method_names)]
+        box = boxplot(boxdata, showmeans=False, positions=positions, widths=boxwidth, sym='+', patch_artist=True)
+        for boxid in range(len(method_names)):
+            c = colormap.colors[boxid]
+            setp(box['fliers'][boxid], color=c, marker='+', markersize=3., markeredgecolor=c)
+            setp(box['boxes'][boxid], color=c)
+            setp(box['medians'][boxid], color='k')
+
+    major_xticks_positions, minor_xticks_positions = [], []
+    major_xticks_labels, minor_xticks_labels = [], []
+    for i,b in enumerate(bins[:-1]):
+        major_xticks_positions.append(b)
+        minor_xticks_positions.append(b + binwidth / 2)
+        major_xticks_labels.append('')
+        minor_xticks_labels.append(f'[{bins[i]:.2f}-{bins[i + 1]:.2f})')
+    ax.set_xticks(major_xticks_positions)
+    ax.set_xticks(minor_xticks_positions, minor=True)
+    ax.set_xticklabels(major_xticks_labels)
+    ax.set_xticklabels(minor_xticks_labels, minor=True, rotation='vertical' if vertical_xticks else 'horizontal')
+
+    if vertical_xticks:
+        # Pad margins so that markers don't get clipped by the axes
+        plt.margins(0.2)
+        # Tweak spacing to prevent clipping of tick-labels
+        plt.subplots_adjust(bottom=0.15)
+
+    # build the legend from the list hs, initialized with the "ideal" quantifier (one with 0 bias across all bins,
+    # i.e., a line from (0,0) to (1,0)). The remaining handles are dummy labelled plots that are hidden (setting
+    # set_visible to False for all but the first element) once the legend has been placed
+    hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
+    for colorid in range(len(method_names)):
+        h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k',
+                  mec=colormap.colors[colorid], linewidth=1.)
+        hs.append(h)
+    box = ax.get_position()
+    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+    ax.legend(hs, ['ideal']+method_names, loc='center left', bbox_to_anchor=(1, 0.5))
+    [h.set_visible(False) for h in hs[1:]]
+
+    # x-axis and y-axis labels and limits
+    ax.set(xlabel='prevalence', ylabel='error bias', title=title)
+    # ax.set_ylim(-1, 1)
+    ax.set_xlim(0, 1)
+
+    save_or_show(savepath)
+
+
+def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True,
+                   title='Quantification error as a function of distribution shift',
+                   savepath=None):
+
+    fig, ax = plt.subplots()
+    ax.grid()
+
+    x_error = qp.error.ae
+    y_error = getattr(qp.error, error_name)
+    ndims = tr_prevs[0].shape[-1]
+
+    # join all data, and keep the order in which the methods appeared for the first time
+    data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))})
+    method_order = []
+    for method, test_prevs_i, estim_prevs_i, tr_prev_i in zip(method_names, true_prevs, estim_prevs, tr_prevs):
+        tr_prev_i = np.repeat(tr_prev_i.reshape(1,-1), repeats=test_prevs_i.shape[0], axis=0)
+
+        tr_test_drifts = x_error(test_prevs_i, tr_prev_i)
+        data[method]['x'] = np.concatenate([data[method]['x'], tr_test_drifts])
+
+        method_drifts = y_error(test_prevs_i, estim_prevs_i)
+        data[method]['y'] = np.concatenate([data[method]['y'], method_drifts])
+
+        if method not in method_order:
+            method_order.append(method)
+
+    bins = np.linspace(0, 1, n_bins)
+    binwidth = 1 / (n_bins - 1)
+    min_x, max_x = None, None
+    for method in method_order:
+        tr_test_drifts = data[method]['x']
+        method_drifts = data[method]['y']
+
+        inds = np.digitize(tr_test_drifts, bins, right=True)
+        xs, ys, ystds = [], [], []
+        for ind in range(len(bins)):
+            selected = inds==ind
+            if selected.sum() > 0:
+                xs.append(ind*binwidth)
+                ys.append(np.mean(method_drifts[selected]))
+                ystds.append(np.std(method_drifts[selected]))
+
+        xs = np.asarray(xs)
+        ys = np.asarray(ys)
+        ystds = np.asarray(ystds)
+
+        min_x_method, max_x_method = xs.min(), xs.max()
+        min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
+        max_x = max_x_method if max_x is None or max_x_method > max_x else max_x
+
+        ax.errorbar(xs, ys, fmt='-', marker='o', label=method, markersize=3, zorder=2)
+        if show_std:
+            ax.fill_between(xs, ys-ystds, ys+ystds, alpha=0.25)
+
+    ax.set(xlabel='Distribution shift between training set and test sample',
+           ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
+           title=title)
+    box = ax.get_position()
+    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+    ax.set_xlim(min_x, max_x)
+
+    save_or_show(savepath)
+
+
+def save_or_show(savepath):
+    # if savepath is specified, then saves the plot in that path; otherwise the plot is shown
+    if savepath is not None:
+        qp.util.create_parent_dir(savepath)
+        # plt.tight_layout()
+        plt.savefig(savepath)
+    else:
+        plt.show()
+
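All three binned plots rely on the same bucketing: bins holds the bin edge values and np.digitize(..., right=True) maps each prevalence (or drift) value to the index of its upper edge, which is then used to group the per-sample errors. Concretely:

    import numpy as np

    bins = np.linspace(0, 1, 5)     # edges: [0., 0.25, 0.5, 0.75, 1.]
    vals = np.asarray([0.1, 0.25, 0.6, 0.9])
    print(np.digitize(vals, bins, right=True))   # -> [1 1 3 4]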
diff --git a/quapy/util.py b/quapy/util.py
index bdfbfb9..3bb9446 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -64,6 +64,10 @@ def get_quapy_home():
     return home
 
 
+def create_parent_dir(path):
+    os.makedirs(Path(path).parent, exist_ok=True)
+
+
 def pickled_resource(pickle_path:str, generation_func:callable, *args):
     if pickle_path is None:
         return generation_func(*args)
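Finally, judging from its use in plot_example.py above, pickled_resource is what lets the plotting script regenerate its data only once: the first call runs the generation function with the given arguments and pickles the result at the given path, while subsequent calls load the pickle instead. A usage sketch (the path and function below are hypothetical):

    import quapy as qp

    def expensive(n):
        return list(range(n))

    # first run computes and caches; later runs load './cache/numbers.pkl'
    numbers = qp.util.pickled_resource('./cache/numbers.pkl', expensive, 1000)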