From 9c8d29156cf10209de7da7d4c10e393225b62294 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Fri, 4 Dec 2020 19:32:08 +0100 Subject: [PATCH 1/2] aggregative methods adapted. Explicit loss minimization methods (SVMQ, SVMKLD, ...) added and with support to binary or single-label. HDy added --- TODO.txt | 1 + quapy/classification/svmperf.py | 13 +-- quapy/dataset/base.py | 4 +- quapy/error.py | 17 +++- quapy/functional.py | 30 ++++++ quapy/method/__init__.py | 7 +- quapy/method/aggregative.py | 165 +++++++++++++++++++++++--------- quapy/method/base.py | 2 +- quapy/utils/__init__.py | 1 + quapy/utils/util.py | 22 +++++ test.py | 46 +++++++++ 11 files changed, 244 insertions(+), 64 deletions(-) create mode 100644 quapy/utils/__init__.py create mode 100644 quapy/utils/util.py create mode 100644 test.py diff --git a/TODO.txt b/TODO.txt index d25ed25..dee2055 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,3 +1,4 @@ Documentation with sphinx +Document methods with paper references The parallel training in svmperf seems not to work Add "prepare svmperf for quantification" script \ No newline at end of file diff --git a/quapy/classification/svmperf.py b/quapy/classification/svmperf.py index eb788c4..ceab225 100644 --- a/quapy/classification/svmperf.py +++ b/quapy/classification/svmperf.py @@ -20,12 +20,9 @@ class SVMperf(BaseEstimator, ClassifierMixin): self.verbose = verbose self.loss = loss - def set_c(self, C): - self.param_C = '-c ' + str(C) - def set_params(self, **parameters): assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported' - self.set_c(parameters['C']) + self.C = parameters['C'] def fit(self, X, y): assert self.loss in SVMperf.valid_losses, \ @@ -33,8 +30,8 @@ class SVMperf(BaseEstimator, ClassifierMixin): self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn') self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify') - self.loss_cmd = '-l ' + str(self.valid_losses[self.loss]) - self.set_c(self.C) + self.loss_cmd = 
'-w 3 -l ' + str(self.valid_losses[self.loss]) + self.c_cmd = '-c ' + str(self.C) self.classes_ = sorted(np.unique(y)) self.n_classes_ = len(self.classes_) @@ -49,7 +46,7 @@ class SVMperf(BaseEstimator, ClassifierMixin): dump_svmlight_file(X, y, traindat, zero_based=False) - cmd = ' '.join([self.svmperf_learn, self.param_C, self.loss_cmd, traindat, self.model]) + cmd = ' '.join([self.svmperf_learn, self.c_cmd, self.loss_cmd, traindat, self.model]) if self.verbose: print('[Running]', cmd) p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT) @@ -60,7 +57,7 @@ class SVMperf(BaseEstimator, ClassifierMixin): return self - def predict(self, X, y=None): + def predict(self, X): confidence_scores = self.decision_function(X) predictions = (confidence_scores > 0) * 1 return predictions diff --git a/quapy/dataset/base.py b/quapy/dataset/base.py index 29a188f..6b16928 100644 --- a/quapy/dataset/base.py +++ b/quapy/dataset/base.py @@ -43,13 +43,13 @@ class LabelledCollection: @property def binary(self): - return self.n_classes==2 + return self.n_classes == 2 def sampling_index(self, size, *prevs, shuffle=True): if len(prevs) == self.n_classes-1: prevs = prevs + (1-sum(prevs),) assert len(prevs) == self.n_classes, 'unexpected number of prevalences' - assert sum(prevs) == 1, f'prevalences ({prevs}) out of range (sum={sum(prevs)})' + assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' taken = 0 indexes_sample = [] diff --git a/quapy/error.py b/quapy/error.py index ff9a6e0..fa9ae10 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -1,5 +1,6 @@ from sklearn.metrics import f1_score -from settings import SAMPLE_SIZE + +SAMPLE_SIZE = None def f1e(y_true, y_pred): @@ -20,11 +21,21 @@ def ae(p, p_hat): return abs(p_hat-p).mean(axis=-1) -def mrae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)): +def __check_eps(eps): + if eps is None: + if SAMPLE_SIZE is None: + raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set') + else: + eps = 1. / (2. 
* SAMPLE_SIZE) + return eps + + +def mrae(p, p_hat, eps=None): return rae(p, p_hat, eps).mean() -def rae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)): +def rae(p, p_hat, eps=None): + eps = __check_eps(eps) p = smooth(p, eps) p_hat = smooth(p_hat, eps) return (abs(p-p_hat)/p).mean(axis=-1) diff --git a/quapy/functional.py b/quapy/functional.py index f44a85b..dc7bdc0 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -15,6 +15,26 @@ def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, retur return prevs +def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01): + """ + Produces uniformly separated values of prevalence. By default, produces an array of 21 prevalences, with step 0.05 + and with the limits smoothed, i.e.: + [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99] + :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21) + :param repeat: number of times each prevalence is to be repeated (defaults to 1) + :param smooth_limits_epsilon: the quantity added to the lower limit 0 and subtracted from the upper limit 1 + :return: an array of uniformly separated prevalence values + """ + p = np.linspace(0., 1., num=n_prevalences, endpoint=True) + p[0] += smooth_limits_epsilon + p[-1] -= smooth_limits_epsilon + if p[0] > p[1]: + raise ValueError(f'the smoothing in the limits is greater than the prevalence step') + if repeat > 1: + p = np.repeat(p, repeat) + return p + + def prevalence_from_labels(labels, n_classes): unique, counts = np.unique(labels, return_counts=True) by_class = defaultdict(lambda:0, dict(zip(unique, counts))) @@ -47,3 +67,13 @@ def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True): return adjusted +def normalize_prevalence(prevalences): + assert prevalences.ndim==1, 'unexpected shape' + accum = prevalences.sum() + if accum > 0: + return prevalences / accum + else: + # if all classifiers are trivial rejectors + return np.ones_like(prevalences) / prevalences.size + + diff
--git a/quapy/method/__init__.py b/quapy/method/__init__.py index a8e98d0..df5cfd5 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -9,6 +9,7 @@ AGGREGATIVE_METHODS = { agg.ProbabilisticAdjustedClassifyAndCount, agg.ExplicitLossMinimisation, agg.ExpectationMaximizationQuantifier, + agg.HellingerDistanceY } NON_AGGREGATIVE_METHODS = { @@ -19,12 +20,6 @@ QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS # common alisases -CC = agg.ClassifyAndCount -ACC = agg.AdjustedClassifyAndCount -PCC = agg.ProbabilisticClassifyAndCount -PACC = agg.ProbabilisticAdjustedClassifyAndCount -ELM = agg.ExplicitLossMinimisation -EMQ = agg.ExpectationMaximizationQuantifier MLPE = nagg.MaximumLikelihoodPrevalenceEstimation diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index ee16baf..0862588 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -9,6 +9,8 @@ from sklearn.calibration import CalibratedClassifierCV from joblib import Parallel, delayed + + # Abstract classes # ------------------------------------ @@ -21,8 +23,8 @@ class AggregativeQuantifier(BaseQuantifier): @abstractmethod def fit(self, data: LabelledCollection, fit_learner=True, *args): ... 
- def classify(self, documents): - return self.learner.predict(documents) + def classify(self, instances): + return self.learner.predict(instances) def get_params(self, deep=True): return self.learner.get_params() @@ -70,7 +72,7 @@ def training_helper(learner, :param fit_learner: whether or not to fit the learner :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the learner is not probabilistic, then a CalibratedCV instance of it is trained) - :param train_val_split: if specified, indicates the proportion of training documents on which to fit the learner + :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0 or None otherwise) """ @@ -118,8 +120,8 @@ class ClassifyAndCount(AggregativeQuantifier): self.learner, _ = training_helper(self.learner, data, fit_learner) return self - def quantify(self, documents, *args): - classification = self.classify(documents) # classify + def quantify(self, instances, *args): + classification = self.classify(instances) # classify return F.prevalence_from_labels(classification, self.n_classes) # & count @@ -138,8 +140,8 @@ class AdjustedClassifyAndCount(AggregativeQuantifier): self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts() return self - def quantify(self, documents, *args): - prevs_estim = self.cc.quantify(documents) + def quantify(self, instances, *args): + prevs_estim = self.cc.quantify(instances) # solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim A = self.Pte_cond_estim_ B = prevs_estim @@ -163,8 +165,8 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier): self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) return self - def quantify(self, documents, *args): - posteriors = 
self.soft_classify(documents) # classify + def quantify(self, instances, *args): + posteriors = self.soft_classify(instances) # classify prevalences = F.prevalence_from_probabilities(posteriors, binarize=False) # & count return prevalences @@ -186,8 +188,8 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier): self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts() return self - def quantify(self, documents, *args): - prevs_estim = self.pcc.quantify(documents) + def quantify(self, instances, *args): + prevs_estim = self.pcc.quantify(instances) A = self.Pte_cond_estim_ B = prevs_estim try: @@ -252,53 +254,82 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier): return qs -# todo: from here -def train_task(c, learners, data): - learners[c].fit(data.documents, data.labels == c) +class HellingerDistanceY(AggregativeProbabilisticQuantifier): + """ + Implementation of the method based on the Hellinger Distance y (HDy) proposed by + González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution + estimation based on the Hellinger distance. Information Sciences, 218:146–164. + """ + + def __init__(self, learner): + self.learner = learner + + def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6): + assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification' + self.learner, validation = training_helper( + self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split) + Px = self.soft_classify(validation.instances) + self.Pxy1 = Px[validation.labels == 1] + self.Pxy0 = Px[validation.labels == 0] + return self + + def quantify(self, instances, *args): + # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10, + # and the final estimated a priori probability was taken as the median of these 11 estimates." + # (González-Castro, et al., 2013).
+ + Px = self.soft_classify(instances) + + prev_estimations = [] + for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] + Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True) + Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True) + + Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True) + + prev_selected, min_dist = None, None + for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0): + Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density + hdy = HellingerDistanceY.HellingerDistance(Px_train, Px_test) + if prev_selected is None or hdy < min_dist: + prev_selected, min_dist = prev, hdy + prev_estimations.append(prev_selected) + + pos_class_prev = np.median(prev_estimations) + return np.asarray([1-pos_class_prev, pos_class_prev]) + + @classmethod + def HellingerDistance(cls, P, Q): + return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2)) -def binary_quant_task(c, learners, X): - predictions_ci = learners[c].predict(X) - return predictions_ci.mean() # since the predictions array is binary +class OneVsAll(AggregativeQuantifier): - -class OneVsAllELM(AggregativeQuantifier): - - def __init__(self, svmperf_base, loss, n_jobs=-1, **kwargs): - self.svmperf_base = svmperf_base - self.loss = loss + def __init__(self, binary_method, n_jobs=-1, **kwargs): + self.binary_method = binary_method self.n_jobs = n_jobs self.kwargs = kwargs - def fit(self, data: LabelledCollection, fit_learner=True, *args): - assert fit_learner, 'the method requires that fit_learner=True' - - self.learners = {c: SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs) for c in data.classes_} + def fit(self, data: LabelledCollection, **kwargs): + assert not data.binary, f'{self.__class__.__name__} expect non-binary data' + self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_} Parallel(n_jobs=self.n_jobs, backend='threading')( - 
delayed(train_task)(c, self.learners, data) for c in self.learners.keys() + delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_ ) return self - def quantify(self, X, y=None): + def quantify(self, X, *args): prevalences = np.asarray( Parallel(n_jobs=self.n_jobs, backend='threading')( - delayed(binary_quant_task)(c, self.learners, X) for c in self.learners.keys() + delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes ) ) - prevalences /= prevalences.sum() - return prevalences + print('one vs all: ', prevalences) + return F.normalize_prevalence(prevalences) @property def classes(self): - return sorted(self.learners.keys()) - - def preclassify_collection(self, data: LabelledCollection): - classifications = [] - for class_ in data.classes_: - classifications.append(self.learners[class_].predict(data.instances)) - classifications = np.vstack(classifications).T - precomputed = LabelledCollection(classifications, data.labels) - return precomputed + return sorted(self.class_method.keys()) def set_params(self, **parameters): self.kwargs=parameters @@ -306,20 +337,57 @@ class OneVsAllELM(AggregativeQuantifier): def get_params(self, deep=True): return self.kwargs + def _delayed_binary_predict(self, c, learners, X): + return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence + + def _delayed_binary_fit(self, c, learners, data, **kwargs): + bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) + learners[c].fit(bindata, **kwargs) + class ExplicitLossMinimisation(AggregativeQuantifier): def __init__(self, svmperf_base, loss, **kwargs): - self.learner = SVMperf(svmperf_base, loss=loss, **kwargs) + self.svmperf_base = svmperf_base + self.loss = loss + self.kwargs = kwargs def fit(self, data: LabelledCollection, fit_learner=True, *args): assert fit_learner, 'the method requires that fit_learner=True' - self.learner.fit(data.instances, 
data.labels) + if data.binary: + self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs) + else: + self.learner = OneVsAll( + binary_method=ExplicitLossMinimisationBinary, + n_jobs=-1, + svmperf_base=self.svmperf_base, + loss=self.loss, + **self.kwargs + ) + return self.learner.fit(data, *args) + + def quantify(self, instances, *args): + return self.learner.quantify(instances, *args) + + +class ExplicitLossMinimisationBinary(AggregativeQuantifier): + + def __init__(self, svmperf_base, loss, **kwargs): + self.svmperf_base = svmperf_base + self.loss = loss + self.kwargs = kwargs + + def fit(self, data: LabelledCollection, fit_learner=True, *args): + assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification' + assert fit_learner, 'the method requires that fit_learner=True' + self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels) return self def quantify(self, X, y=None): predictions = self.learner.predict(X) - return F.prevalence_from_labels(predictions, self.learner.n_classes_) + prev = F.prevalence_from_labels(predictions, self.learner.n_classes_) + print('binary: ', prev) + return prev def classify(self, X, y=None): return self.learner.predict(X) @@ -349,3 +417,12 @@ class SVMRAE(ExplicitLossMinimisation): def __init__(self, svmperf_base, **kwargs): super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs) + +CC = ClassifyAndCount +ACC = AdjustedClassifyAndCount +PCC = ProbabilisticClassifyAndCount +PACC = ProbabilisticAdjustedClassifyAndCount +ELM = ExplicitLossMinimisation +EMQ = ExpectationMaximizationQuantifier +HDy = HellingerDistanceY + diff --git a/quapy/method/base.py b/quapy/method/base.py index 4679a8f..bf7ff54 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -10,7 +10,7 @@ class BaseQuantifier(metaclass=ABCMeta): def fit(self, data: qp.LabelledCollection, *args): ... 
@abstractmethod - def quantify(self, documents, *args): ... + def quantify(self, instances, *args): ... @abstractmethod def set_params(self, **parameters): ... diff --git a/quapy/utils/__init__.py b/quapy/utils/__init__.py new file mode 100644 index 0000000..907cc97 --- /dev/null +++ b/quapy/utils/__init__.py @@ -0,0 +1 @@ +from . import util \ No newline at end of file diff --git a/quapy/utils/util.py b/quapy/utils/util.py new file mode 100644 index 0000000..bd5071a --- /dev/null +++ b/quapy/utils/util.py @@ -0,0 +1,22 @@ +import itertools +import multiprocessing +from joblib import Parallel, delayed + + +def get_parallel_slices(n_tasks, n_jobs=-1): + if n_jobs == -1: + n_jobs = multiprocessing.cpu_count() + batch = int(n_tasks / n_jobs) + remainder = n_tasks % n_jobs + return [slice(job * batch, (job + 1) * batch + (remainder if job == n_jobs - 1 else 0)) for job in + range(n_jobs)] + + +def parallelize(func, args, n_jobs): + slices = get_parallel_slices(len(args), n_jobs) + results = Parallel(n_jobs=n_jobs)( + delayed(func)(args[slice_i]) for slice_i in slices + ) + return list(itertools.chain.from_iterable(results)) + + diff --git a/test.py b/test.py new file mode 100644 index 0000000..fe842a2 --- /dev/null +++ b/test.py @@ -0,0 +1,46 @@ +from sklearn.linear_model import LogisticRegression +from sklearn.svm import LinearSVC +import quapy as qp +import quapy.functional as F + + +# load a textual binary dataset and create a tfidf bag of words +train_path = './datasets/reviews/kindle/train.txt' +test_path = './datasets/reviews/kindle/test.txt' +dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text) +dataset.training = dataset.training.sampling(1000, 0.4, 0.6) +dataset.test = dataset.test.sampling(500, 0.6, 0.4) +qp.preprocessing.text2tfidf(dataset, inplace=True) +qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True) + +# load a sparse matrix ternary dataset +#train_path = './datasets/twitter/train/sst.train+dev.feature.txt' +#test_path = 
'./datasets/twitter/test/sst.test.feature.txt' +#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse) +#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5) +#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3) + +# training a quantifier +learner = LogisticRegression() +# q = qp.method.aggregative.ClassifyAndCount(learner) +# q = qp.method.aggregative.AdjustedClassifyAndCount(learner) +# q = qp.method.aggregative.AdjustedClassifyAndCount(learner) +# q = qp.method.aggregative.ProbabilisticClassifyAndCount(learner) +# q = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner) +# q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner) +# q = qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000) +# q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000) +q = qp.method.aggregative.HDy(learner) +q.fit(dataset.training) + +# estimating class prevalences +prevalences_estim = q.quantify(dataset.test.instances) +prevalences_true = dataset.test.prevalence() + +# evaluation (one single prediction) +error = qp.error.mae(prevalences_true, prevalences_estim) + +print(f'method {q.__class__.__name__}') +print(f'true prevalence {F.strprev(prevalences_true)}') +print(f'estim prevalence {F.strprev(prevalences_estim)}') +print(f'MAE={error:.3f}') \ No newline at end of file From 2361186a01c53e744f4291e2e2299700216ff139 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Wed, 9 Dec 2020 12:46:50 +0100 Subject: [PATCH 2/2] aggregation methods updated --- quapy/__init__.py | 2 +- quapy/{dataset => data}/__init__.py | 0 quapy/{dataset => data}/base.py | 0 quapy/{dataset => data}/preprocessing.py | 10 ++-- quapy/{dataset => data}/reader.py | 0 quapy/method/__init__.py | 8 ++- quapy/method/aggregative.py | 63 ++++++++++++++---------- quapy/method/base.py | 3 +- test.py | 38 ++++++++------ 9 files changed, 71 insertions(+), 53 
deletions(-) rename quapy/{dataset => data}/__init__.py (100%) rename quapy/{dataset => data}/base.py (100%) rename quapy/{dataset => data}/preprocessing.py (97%) rename quapy/{dataset => data}/reader.py (100%) diff --git a/quapy/__init__.py b/quapy/__init__.py index 59e21fe..701641d 100644 --- a/quapy/__init__.py +++ b/quapy/__init__.py @@ -1,4 +1,4 @@ -from .dataset import * +from .data import * from . import functional from . import method from . import error diff --git a/quapy/dataset/__init__.py b/quapy/data/__init__.py similarity index 100% rename from quapy/dataset/__init__.py rename to quapy/data/__init__.py diff --git a/quapy/dataset/base.py b/quapy/data/base.py similarity index 100% rename from quapy/dataset/base.py rename to quapy/data/base.py diff --git a/quapy/dataset/preprocessing.py b/quapy/data/preprocessing.py similarity index 97% rename from quapy/dataset/preprocessing.py rename to quapy/data/preprocessing.py index a6259b2..b08bcab 100644 --- a/quapy/dataset/preprocessing.py +++ b/quapy/data/preprocessing.py @@ -1,9 +1,10 @@ import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from dataset.base import Dataset +from data.base import Dataset from scipy.sparse import spmatrix from utils.util import parallelize from .base import LabelledCollection +from tqdm import tqdm def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs): @@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs): :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True) consisting of lists of integer values representing indices. 
""" - __check_type(dataset.training.instances, list, str) - __check_type(dataset.test.instances, list, str) + __check_type(dataset.training.instances, np.ndarray, str) + __check_type(dataset.test.instances, np.ndarray, str) indexer = IndexTransformer(min_df=min_df, **kwargs) training_index = indexer.fit_transform(dataset.training.instances) @@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None): f'unexpected type of element (expected {container_type}, found {type(container)})' - class IndexTransformer: def __init__(self, **kwargs): @@ -140,7 +140,7 @@ class IndexTransformer: return self.fit(X).transform(X, n_jobs=n_jobs) def vocabulary_size(self): - return len(self.vocabulary_) + 1 # the reserved unk token + return len(self.vocabulary_) def add_word(self, word): if word in self.vocabulary_: diff --git a/quapy/dataset/reader.py b/quapy/data/reader.py similarity index 100% rename from quapy/dataset/reader.py rename to quapy/data/reader.py diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index df5cfd5..88acd16 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -1,5 +1,6 @@ +from . import base from . import aggregative as agg -from . import non_aggregative as nagg +from . 
import non_aggregative AGGREGATIVE_METHODS = { @@ -13,13 +14,10 @@ AGGREGATIVE_METHODS = { } NON_AGGREGATIVE_METHODS = { - nagg.MaximumLikelihoodPrevalenceEstimation + non_aggregative.MaximumLikelihoodPrevalenceEstimation } QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS -# common alisases -MLPE = nagg.MaximumLikelihoodPrevalenceEstimation - diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 0862588..99204ab 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1,14 +1,14 @@ import numpy as np -from .base import * -from ..error import mae +from copy import deepcopy import functional as F -from ..classification.svmperf import SVMperf -from ..dataset import LabelledCollection +import error +from method.base import BaseQuantifier +from quapy.classification.svmperf import SVMperf +from quapy.data import LabelledCollection from sklearn.metrics import confusion_matrix from sklearn.calibration import CalibratedClassifierCV from joblib import Parallel, delayed - - +from abc import abstractmethod # Abstract classes @@ -23,6 +23,14 @@ class AggregativeQuantifier(BaseQuantifier): @abstractmethod def fit(self, data: LabelledCollection, fit_learner=True, *args): ... + @property + def learner(self): + return self.learner_ + + @learner.setter + def learner(self, value): + self.learner_ = value + def classify(self, instances): return self.learner.predict(instances) @@ -69,12 +77,12 @@ def training_helper(learner, Training procedure common to all Aggregative Quantifiers. :param learner: the learner to be fit :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner. 
- :param fit_learner: whether or not to fit the learner + :param fit_learner: whether or not to fit the learner (if False, then bypasses any action) :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the learner is not probabilistic, then a CalibratedCV instance of it is trained) :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0 - or None otherwise) + or None otherwise) to be used as a validation set for any subsequent parameter fitting """ if fit_learner: if ensure_probabilistic: @@ -239,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier): # M-step: qs_pos is Ps+1(y=+1) qs = ps.mean(axis=0) - if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10: + if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10: converged = True qs_prev_ = qs @@ -265,7 +273,8 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier): self.learner = learner def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6): - assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification' + assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \ + f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.' self.learner, validation = training_helper( self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split) Px = self.soft_classify(validation.instances) @@ -304,15 +313,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier): class OneVsAll(AggregativeQuantifier): + """ + Allows any binary quantifier to perform quantification on single-label datasets. 
The method maintains one binary + quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1. + """ - def __init__(self, binary_method, n_jobs=-1, **kwargs): + def __init__(self, binary_method, n_jobs=-1): self.binary_method = binary_method self.n_jobs = n_jobs - self.kwargs = kwargs def fit(self, data: LabelledCollection, **kwargs): assert not data.binary, f'{self.__class__.__name__} expect non-binary data' - self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_} + assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier' + self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_} Parallel(n_jobs=self.n_jobs, backend='threading')( delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_ ) @@ -332,10 +345,10 @@ class OneVsAll(AggregativeQuantifier): return sorted(self.class_method.keys()) def set_params(self, **parameters): - self.kwargs=parameters + self.binary_method.set_params(**parameters) def get_params(self, deep=True): - return self.kwargs + return self.binary_method.get_params() def _delayed_binary_predict(self, c, learners, X): return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence @@ -346,6 +359,12 @@ class OneVsAll(AggregativeQuantifier): class ExplicitLossMinimisation(AggregativeQuantifier): + """ + A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary + quantifier for each class and then l1-normalizes the class predictions so that they sum up to one. + This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+ Social Network Analysis and Mining 6(19), 1–22 (2016) + """ def __init__(self, svmperf_base, loss, **kwargs): self.svmperf_base = svmperf_base @@ -354,16 +373,9 @@ class ExplicitLossMinimisation(AggregativeQuantifier): def fit(self, data: LabelledCollection, fit_learner=True, *args): assert fit_learner, 'the method requires that fit_learner=True' - if data.binary: - self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs) - else: - self.learner = OneVsAll( - binary_method=ExplicitLossMinimisationBinary, - n_jobs=-1, - svmperf_base=self.svmperf_base, - loss=self.loss, - **self.kwargs - ) + self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs) + if not data.binary: + self.learner = OneVsAll(self.learner, n_jobs=-1) return self.learner.fit(data, *args) def quantify(self, instances, *args): @@ -393,6 +405,7 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier): return self.learner.predict(X) + class SVMQ(ExplicitLossMinimisation): def __init__(self, svmperf_base, **kwargs): super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs) diff --git a/quapy/method/base.py b/quapy/method/base.py index bf7ff54..e65b45e 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -1,5 +1,4 @@ from abc import ABCMeta, abstractmethod -import quapy as qp # Base Quantifier abstract class @@ -7,7 +6,7 @@ import quapy as qp class BaseQuantifier(metaclass=ABCMeta): @abstractmethod - def fit(self, data: qp.LabelledCollection, *args): ... + def fit(self, data, *args): ... @abstractmethod def quantify(self, instances, *args): ...
diff --git a/test.py b/test.py index fe842a2..90167fe 100644 --- a/test.py +++ b/test.py @@ -2,23 +2,25 @@ from sklearn.linear_model import LogisticRegression from sklearn.svm import LinearSVC import quapy as qp import quapy.functional as F - +from method.aggregative import OneVsAll # load a textual binary dataset and create a tfidf bag of words +#from method.aggregative import OneVsAll, BaseQuantifier + train_path = './datasets/reviews/kindle/train.txt' test_path = './datasets/reviews/kindle/test.txt' -dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text) -dataset.training = dataset.training.sampling(1000, 0.4, 0.6) -dataset.test = dataset.test.sampling(500, 0.6, 0.4) -qp.preprocessing.text2tfidf(dataset, inplace=True) -qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True) +#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text) +#dataset.training = dataset.training.sampling(1000, 0.4, 0.6) +#dataset.test = dataset.test.sampling(500, 0.6, 0.4) +#qp.preprocessing.text2tfidf(dataset, inplace=True) +#qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True) # load a sparse matrix ternary dataset -#train_path = './datasets/twitter/train/sst.train+dev.feature.txt' -#test_path = './datasets/twitter/test/sst.test.feature.txt' -#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse) -#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5) -#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3) +train_path = './datasets/twitter/train/sst.train+dev.feature.txt' +test_path = './datasets/twitter/test/sst.test.feature.txt' +dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse) +dataset.training = dataset.training.sampling(500, 0.3, 0.4, 0.3) +dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3) # training a quantifier learner = LogisticRegression() @@ -30,17 +32,23 @@ learner = LogisticRegression() # q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner) # q = 
qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000) # q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000) -q = qp.method.aggregative.HDy(learner) -q.fit(dataset.training) +#model = qp.method.aggregative.HDy(learner) +# + +model = qp.method.aggregative.HDy(learner) +model = OneVsAll(model) +print(model.get_params()) + +model.fit(dataset.training) # estimating class prevalences -prevalences_estim = q.quantify(dataset.test.instances) +prevalences_estim = model.quantify(dataset.test.instances) prevalences_true = dataset.test.prevalence() # evaluation (one single prediction) error = qp.error.mae(prevalences_true, prevalences_estim) -print(f'method {q.__class__.__name__}') +print(f'method {model.__class__.__name__}') print(f'true prevalence {F.strprev(prevalences_true)}') print(f'estim prevalence {F.strprev(prevalences_estim)}') print(f'MAE={error:.3f}') \ No newline at end of file