diff --git a/examples/16.confidence_regions.py b/examples/16.confidence_regions.py
index 27fbfbd..c8e95dd 100644
--- a/examples/16.confidence_regions.py
+++ b/examples/16.confidence_regions.py
@@ -36,7 +36,7 @@ with qp.util.temp_seed(0):
     true_prev = shifted_test.prevalence()

     # by calling "quantify_conf", we obtain the point estimate and the confidence intervals around it
-    pred_prev, conf_intervals = pacc.quantify_conf(shifted_test.X)
+    pred_prev, conf_intervals = pacc.predict_conf(shifted_test.X)

     # conf_intervals is an instance of ConfidenceRegionABC, which provides some useful utilities like:
     # - coverage: a function which computes the fraction of true values that belong to the confidence region
diff --git a/examples/18.ReadMe_for_text_analysis.py b/examples/18.ReadMe_for_text_analysis.py
new file mode 100644
index 0000000..d3e1c49
--- /dev/null
+++ b/examples/18.ReadMe_for_text_analysis.py
@@ -0,0 +1,23 @@
+from sklearn.feature_extraction.text import CountVectorizer
+import quapy as qp
+from quapy.method.non_aggregative import ReadMe
+import quapy.functional as F
+
+reviews = qp.datasets.fetch_reviews('imdb').reduce(n_train=1000, random_state=0)
+
+encode_0_1 = CountVectorizer(min_df=5, binary=True)
+train, test = qp.data.preprocessing.instance_transformation(reviews, encode_0_1, inplace=True).train_test
+
+readme = ReadMe(bootstrap_trials=100, bagging_trials=100, bagging_range=100, random_state=0, verbose=True)
+readme.fit(*train.Xy)
+
+for test_prev in [[0.25, 0.75], [0.5, 0.5], [0.75, 0.25]]:
+    sample = reviews.test.sampling(500, *test_prev, random_state=0)
+    prev_estim, conf = readme.predict_conf(sample.X)
+    err = qp.error.mae(sample.prevalence(), prev_estim)
+    print(f'true-prevalence={F.strprev(sample.prevalence())},\n'
+          f'predicted-prevalence={F.strprev(prev_estim)},\n'
+          f'MAE={err:.4f}')
+    print(conf)
+
+
diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py
index a4be4fd..bc57d65 100644
--- a/quapy/data/preprocessing.py
+++ b/quapy/data/preprocessing.py
@@ -10,6 +10,36 @@ from quapy.util import map_parallel
 from .base import LabelledCollection


+def instance_transformation(dataset:Dataset, transformer, inplace=False):
+    """
+    Transforms a :class:`quapy.data.base.Dataset` by applying the `fit_transform` and `transform` functions
+    of a (sklearn-style) transformer.
+
+    :param dataset: a :class:`quapy.data.base.Dataset` whose training and test instances are in a format
+        compatible with the given transformer (e.g., lists of str for text vectorizers)
+    :param transformer: a sklearn TransformerMixin implementing the `fit_transform` and `transform` functions
+    :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
+    :return: a new :class:`quapy.data.base.Dataset` with transformed instances (if inplace=False) or a reference to the
+        current Dataset (if inplace=True) where the instances have been transformed
+    """
+    training_transformed = transformer.fit_transform(dataset.training.instances)
+    test_transformed = transformer.transform(dataset.test.instances)
+
+    if inplace:
+        dataset.training = LabelledCollection(training_transformed, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_transformed, dataset.test.labels, dataset.classes_)
+        if hasattr(transformer, 'vocabulary_'):
+            dataset.vocabulary = transformer.vocabulary_
+        return dataset
+    else:
+        training = LabelledCollection(training_transformed, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_transformed, dataset.test.labels.copy(), dataset.classes_)
+        if hasattr(transformer, 'vocabulary_'):
+            return Dataset(training, test, transformer.vocabulary_)
+        else:
+            return Dataset(training, test)
+
+
 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
     """
     Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of
@@ -29,18 +59,7 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
     __check_type(dataset.test.instances, np.ndarray, str)

     vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
-    training_documents = vectorizer.fit_transform(dataset.training.instances)
-    test_documents = vectorizer.transform(dataset.test.instances)
-
-    if inplace:
-        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
-        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
-        dataset.vocabulary = vectorizer.vocabulary_
-        return dataset
-    else:
-        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
-        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
-        return Dataset(training, test, vectorizer.vocabulary_)
+    return instance_transformation(dataset, vectorizer, inplace)


 def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py
index 7f70bb8..07b2b1e 100644
--- a/quapy/method/confidence.py
+++ b/quapy/method/confidence.py
@@ -88,18 +88,30 @@ class WithConfidenceABC(ABC):
     METHODS = ['intervals', 'ellipse', 'ellipse-clr']

     @abstractmethod
-    def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
+    def predict_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
         """
-        Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but
+        Adds the method `predict_conf` to the interface. This method returns not only the point-estimate, but
         also the confidence region around it.

         :param instances: a np.ndarray of shape (n_instances, n_features,)
-        :confidence_level: float in (0, 1)
+        :param confidence_level: float in (0, 1), default is 0.95
         :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
             (n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
         """
         ...

+    def quantify_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
+        """
+        Alias of `predict_conf`. This method returns not only the point-estimate, but
+        also the confidence region around it.
+
+        :param instances: a np.ndarray of shape (n_instances, n_features,)
+        :param confidence_level: float in (0, 1), default is 0.95
+        :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
+            (n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
+        """
+        return self.predict_conf(instances=instances, confidence_level=confidence_level)
+
     @classmethod
     def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals'):
         """
@@ -227,6 +239,7 @@ class ConfidenceEllipseCLR(ConfidenceRegionABC):
     """

     def __init__(self, X, confidence_level=0.95):
+        X = np.asarray(X)
         self.clr = CLRtransformation()
         Z = self.clr(X)
         self.mean_ = np.mean(X, axis=0)
@@ -297,6 +310,9 @@ class ConfidenceIntervals(ConfidenceRegionABC):

         return proportion

+    def __repr__(self):
+        return '['+', '.join(f'({low:.4f}, {high:.4f})' for (low,high) in zip(self.I_low, self.I_high))+']'
+

 class CLRtransformation:
     """
@@ -429,7 +445,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
         self.aggregation_fit(classif_predictions, labels)
         return self

-    def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
+    def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
         predictions = self.quantifier.classify(instances)
         return self.aggregate_conf(predictions, confidence_level=confidence_level)

@@ -549,7 +565,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
         samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
         return np.asarray(samples.mean(axis=0), dtype=float)

-    def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
+    def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
         classif_predictions = self.classify(instances)
         point_estimate = self.aggregate(classif_predictions)
         samples = self.get_prevalence_samples()  # available after calling "aggregate" function
diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py
index eff2283..5e84762 100644
--- a/quapy/method/non_aggregative.py
+++ b/quapy/method/non_aggregative.py
@@ -1,11 +1,14 @@
 from typing import Union, Callable
 import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.utils import resample
+from sklearn.preprocessing import normalize
+from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC
 from quapy.functional import get_divergence
-from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier, BinaryQuantifier
 import quapy.functional as F
+from scipy.optimize import lsq_linear


 class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
@@ -149,53 +152,89 @@ class DMx(BaseQuantifier):

         return F.argmin_prevalence(loss, n_classes, method=self.search)


-# class ReadMe(BaseQuantifier):
-#
-#     def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
-#         raise NotImplementedError('under development ...')
-#         self.bootstrap_trials = bootstrap_trials
-#         self.bootstrap_range = bootstrap_range
-#         self.bagging_trials = bagging_trials
-#         self.bagging_range = bagging_range
-#         self.vectorizer_kwargs = vectorizer_kwargs
-#
-#     def fit(self, data: LabelledCollection):
-#         X, y = data.Xy
-#         self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
-#         X = self.vectorizer.fit_transform(X)
-#         self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)}
-#
-#     def predict(self, X):
-#         X = self.vectorizer.transform(X)
-#
-#         # number of features
-#         num_docs, num_feats = X.shape
-#
-#         # bootstrap
-#         p_boots = []
-#         for _ in range(self.bootstrap_trials):
-#             docs_idx = np.random.choice(num_docs, size=self.bootstra_range, replace=False)
-#             class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
-#             Xboot = X[docs_idx]
-#
-#             # bagging
-#             p_bags = []
-#             for _ in range(self.bagging_trials):
-#                 feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
-#                 class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
-#                 Xbag = Xboot[:,feat_idx]
-#                 p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
-#                 p_bags.append(p)
-#             p_boots.append(np.mean(p_bags, axis=0))
-#
-#         p_mean = np.mean(p_boots, axis=0)
-#         p_std = np.std(p_bags, axis=0)
-#
-#         return p_mean
-#
-#
-#     def std_constrained_linear_ls(self, X, class_cond_X: dict):
-#         pass
+class ReadMe(BaseQuantifier, WithConfidenceABC):
+
+    def __init__(self,
+                 bootstrap_trials=100,
+                 bagging_trials=100,
+                 bagging_range=250,
+                 confidence_level=0.95,
+                 region='intervals',
+                 random_state=None,
+                 verbose=False):
+        self.bootstrap_trials = bootstrap_trials
+        self.bagging_trials = bagging_trials
+        self.bagging_range = bagging_range
+        self.confidence_level = confidence_level
+        self.region = region
+        self.random_state = random_state
+        self.verbose = verbose
+
+    def fit(self, X, y):
+        self.rng = np.random.default_rng(self.random_state)
+        self.classes_ = np.unique(y)
+        n_features = X.shape[1]
+
+        if self.bagging_range is None:
+            self.bagging_range = int(np.sqrt(n_features))
+
+        Xsize = X.shape[0]
+
+        # Bootstrap loop: draw document-level resamples once, at fit time
+        self.Xboots, self.yboots = [], []
+        for _ in range(self.bootstrap_trials):
+            idx = self.rng.choice(Xsize, size=Xsize, replace=True)
+            self.Xboots.append(X[idx])
+            self.yboots.append(y[idx])
+
+        return self
+
+    def predict_conf(self, X, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
+        from tqdm import tqdm
+        n_features = X.shape[1]
+
+        boots_prevalences = []
+
+        for Xboots, yboots in tqdm(
+                zip(self.Xboots, self.yboots),
+                desc='bootstrap predictions', total=self.bootstrap_trials, disable=not self.verbose
+        ):
+            bagging_estimates = []
+            for _ in range(self.bagging_trials):
+                feat_idx = self.rng.choice(n_features, size=self.bagging_range, replace=False)
+                Xboots_bagging = Xboots[:, feat_idx]
+                X_boots_bagging = X[:, feat_idx]
+                bagging_prev = self._quantify_iteration(Xboots_bagging, yboots, X_boots_bagging)
+                bagging_estimates.append(bagging_prev)
+
+            boots_prevalences.append(np.mean(bagging_estimates, axis=0))
+
+        conf = WithConfidenceABC.construct_region(boots_prevalences, confidence_level, method=self.region)
+        prev_estim = conf.point_estimate()
+
+        return prev_estim, conf
+
+
+    def predict(self, X):
+        prev_estim, _ = self.predict_conf(X)
+        return prev_estim
+
+
+    def _quantify_iteration(self, Xtr, ytr, Xte):
+        """Single ReadMe estimate: solves PX = PX_given_Y.T @ p for the class prevalences p via bounded least squares."""
+        n_classes = len(self.classes_)
+        PX_given_Y = np.zeros((n_classes, Xtr.shape[1]))
+        for i, c in enumerate(self.classes_):
+            PX_given_Y[i] = np.asarray(Xtr[ytr == c].sum(axis=0)).ravel()  # works for dense and sparse Xtr
+        PX_given_Y = normalize(PX_given_Y, norm='l1', axis=1)
+
+        PX = np.asarray(Xte.sum(axis=0)).reshape(1, -1)
+        PX = normalize(PX, norm='l1', axis=1)
+
+        res = lsq_linear(A=PX_given_Y.T, b=PX.ravel(), bounds=(0, 1))
+        pY = np.maximum(res.x, 0)
+        return pY / pY.sum()
+

 def _get_features_range(X):
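
For reference, the estimation step inside `_quantify_iteration` boils down to a bounded least-squares problem: the feature distribution observed in the test sample, PX, is modelled as a mixture of the class-conditional feature distributions estimated on the (bootstrapped, feature-bagged) training sample, PX_given_Y, weighted by the unknown class prevalences. The following standalone sketch reproduces just that step outside QuaPy; the synthetic distributions, sizes, and variable names are illustrative assumptions, not part of the patch.

import numpy as np
from scipy.optimize import lsq_linear

rng = np.random.default_rng(0)

# illustrative setup: 3 classes, 50 (binary) features
n_classes, n_features = 3, 50
true_prev = np.asarray([0.2, 0.3, 0.5])

# class-conditional feature distributions P(X|Y), one row per class (drawn at random here)
PX_given_Y = rng.dirichlet(np.ones(n_features), size=n_classes)

# feature distribution of the test sample P(X): a mixture of the class-conditional ones
PX = true_prev @ PX_given_Y

# ReadMe's core step: solve PX ~= PX_given_Y.T @ p subject to 0 <= p <= 1, then renormalize
res = lsq_linear(A=PX_given_Y.T, b=PX, bounds=(0, 1))
p = np.maximum(res.x, 0)
p /= p.sum()

print('true prevalence     :', true_prev)
print('estimated prevalence:', np.round(p, 4))  # close to [0.2, 0.3, 0.5] in this noiseless setting

Averaging this estimate over many bootstrap resamples of the documents and many random subsets of the features is what produces the point estimate and the confidence region returned by `ReadMe.predict_conf`.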
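
The confidence regions themselves are built through `WithConfidenceABC.construct_region`, exactly as `ReadMe.predict_conf` does above. Below is a minimal sketch of that path in isolation; the Dirichlet-generated "bootstrap" estimates are made up for illustration, and `method='intervals'` is assumed to yield a `ConfidenceIntervals` object, whose new `__repr__` the last line exercises.

import numpy as np
from quapy.method.confidence import WithConfidenceABC

rng = np.random.default_rng(0)

# stand-in for prevalence estimates obtained from 100 bootstrap repetitions (illustrative only)
boot_prevs = rng.dirichlet(alpha=[20, 30, 50], size=100)

region = WithConfidenceABC.construct_region(boot_prevs, confidence_level=0.95, method='intervals')
print('point estimate:', region.point_estimate())  # the region's point estimate (e.g., the mean of the samples)
print('intervals     :', region)                   # per-class (low, high) bounds, printed via the new __repr__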