From 5b772c7eda68e0da1e43d10ef80b736d48e571fd Mon Sep 17 00:00:00 2001
From: Andrea Esuli
Date: Wed, 5 May 2021 17:12:44 +0200
Subject: [PATCH] Bug fixes on use of classes_. Tests.

---
 quapy/data/base.py           | 76 +++++++++++++++++-------------
 quapy/data/datasets.py       |  8 ++--
 quapy/data/preprocessing.py  | 20 ++++----
 quapy/functional.py          |  6 +--
 quapy/method/aggregative.py  | 89 +++++++++++++++++++-----------------
 quapy/method/base.py         |  2 +-
 quapy/tests/test_datasets.py | 19 ++++++--
 quapy/tests/test_methods.py  | 63 ++++++++++++++++++++++---
 8 files changed, 179 insertions(+), 104 deletions(-)

diff --git a/quapy/data/base.py b/quapy/data/base.py
index 6b2ddec..e68bcfa 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -2,40 +2,52 @@ import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
+
 from quapy.functional import artificial_prevalence_sampling, strprev
 
 
 class LabelledCollection:
+    '''
+    A LabelledCollection is a set of objects, each with a label associated with it.
+    '''
 
-    def __init__(self, instances, labels, n_classes=None):
+    def __init__(self, instances, labels, classes_=None):
+        """
+        :param instances: list of objects
+        :param labels: list of labels, same length as instances
+        :param classes_: optional, list of classes from which labels are taken. When used, it must contain all values present in labels.
+        """
         if issparse(instances):
             self.instances = instances
-        elif isinstance(instances, list) and len(instances)>0 and isinstance(instances[0], str):
+        elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
             # lists of strings occupy too much as ndarrays (although python-objects add a heavy overload)
             self.instances = np.asarray(instances, dtype=object)
         else:
             self.instances = np.asarray(instances)
-        self.labels = np.asarray(labels, dtype=int)
+        self.labels = np.asarray(labels)
         n_docs = len(self)
-        if n_classes is None:
+        if classes_ is None:
             self.classes_ = np.unique(self.labels)
             self.classes_.sort()
         else:
-            self.classes_ = np.arange(n_classes)
-        self.index = {class_i: np.arange(n_docs)[self.labels == class_i] for class_i in self.classes_}
+            self.classes_ = np.unique(np.asarray(classes_))
+            self.classes_.sort()
+            if len(set(self.labels).difference(set(classes_))) > 0:
+                raise ValueError('labels contains values not included in classes_')
+        self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
 
     @classmethod
-    def load(cls, path:str, loader_func:callable):
+    def load(cls, path: str, loader_func: callable):
         return LabelledCollection(*loader_func(path))
 
     def __len__(self):
         return self.instances.shape[0]
 
     def prevalence(self):
-        return self.counts()/len(self)
+        return self.counts() / len(self)
 
     def counts(self):
-        return np.asarray([len(self.index[ci]) for ci in self.classes_])
+        return np.asarray([len(self.index[class_]) for class_ in self.classes_])
 
     @property
     def n_classes(self):
@@ -48,21 +60,21 @@ class LabelledCollection:
     def sampling_index(self, size, *prevs, shuffle=True):
         if len(prevs) == 0:  # no prevalence was indicated; returns an index for uniform sampling
            return np.random.choice(len(self), size, replace=False)
-        if len(prevs) == self.n_classes-1:
-            prevs = prevs + (1-sum(prevs),)
+        if len(prevs) == self.n_classes - 1:
+            prevs = prevs + (1 - sum(prevs),)
         assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
         assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
taken = 0 indexes_sample = [] - for i, class_i in enumerate(self.classes_): - if i == self.n_classes-1: + for i, class_ in enumerate(self.classes_): + if i == self.n_classes - 1: n_requested = size - taken else: n_requested = int(size * prevs[i]) - n_candidates = len(self.index[class_i]) - index_sample = self.index[class_i][ + n_candidates = len(self.index[class_]) + index_sample = self.index[class_][ np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates)) ] if n_requested > 0 else [] @@ -90,21 +102,22 @@ class LabelledCollection: def sampling_from_index(self, index): documents = self.instances[index] labels = self.labels[index] - return LabelledCollection(documents, labels, n_classes=self.n_classes) + return LabelledCollection(documents, labels, classes_=self.classes_) def split_stratified(self, train_prop=0.6, random_state=None): # with temp_seed(42): tr_docs, te_docs, tr_labels, te_labels = \ - train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state) + train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, + random_state=random_state) return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels) def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1): - dimensions=self.n_classes + dimensions = self.n_classes for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats): yield self.sampling(sample_size, *prevs) def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1): - dimensions=self.n_classes + dimensions = self.n_classes for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats): yield self.sampling_index(sample_size, *prevs) @@ -142,10 +155,10 @@ class LabelledCollection: else: nfeats = '?' 
stats_ = {'instances': ninstances, - 'type': instance_type, - 'features': nfeats, - 'classes': self.n_classes, - 'prevs': strprev(self.prevalence())} + 'type': instance_type, + 'features': nfeats, + 'classes': self.classes_, + 'prevs': strprev(self.prevalence())} if show: print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, ' f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}') @@ -155,13 +168,14 @@ class LabelledCollection: kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state) for train_index, test_index in kf.split(*self.Xy): train = self.sampling_from_index(train_index) - test = self.sampling_from_index(test_index) + test = self.sampling_from_index(test_index) yield train, test + class Dataset: def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''): - assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections' + assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections' self.training = training self.test = test self.vocabulary = vocabulary @@ -172,8 +186,8 @@ class Dataset: return Dataset(*collection.split_stratified(train_prop=train_size)) @property - def n_classes(self): - return self.training.n_classes + def classes_(self): + return self.training.classes_ @property def binary(self): @@ -195,19 +209,15 @@ class Dataset: print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, ' f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, ' f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}') - return {'train': tr_stats ,'test':te_stats} + return {'train': tr_stats, 'test': te_stats} @classmethod def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0): for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)): - yield Dataset(train, test, name=f'fold {(i%nfolds)+1}/{nfolds} (round={(i//nfolds)+1})') + yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})') def isbinary(data): if isinstance(data, Dataset) or isinstance(data, LabelledCollection): return data.binary return False - - - - diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 79d0bbf..575ffca 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -47,7 +47,7 @@ UCI_DATASETS = ['acute.a', 'acute.b', 'yeast'] -def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False): +def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset: """ Load a Reviews dataset as a Dataset instance, as used in: Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification." @@ -91,7 +91,7 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle return data -def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False): +def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset: """ Load a Twitter dataset as a Dataset instance, as used in: Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. 
@@ -162,12 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
     return data
 
 
-def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
+def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
     data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
     return Dataset(*data.split_stratified(1 - test_split, random_state=0))
 
 
-def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):
+def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
 
     assert dataset_name in UCI_DATASETS, \
         f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py
index 77752f0..ee1627e 100644
--- a/quapy/data/preprocessing.py
+++ b/quapy/data/preprocessing.py
@@ -29,13 +29,13 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
     test_documents = vectorizer.transform(dataset.test.instances)
 
     if inplace:
-        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes)
-        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes)
+        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
         dataset.vocabulary = vectorizer.vocabulary_
         return dataset
     else:
-        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test, vectorizer.vocabulary_)
 
 
@@ -66,8 +66,8 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
         dataset.test.instances = Xte
         return dataset
     else:
-        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test)
 
 
@@ -100,13 +100,13 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
     test_index = indexer.transform(dataset.test.instances)
 
     if inplace:
-        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes)
-        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes)
+        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.classes_)
         dataset.vocabulary = indexer.vocabulary_
         return dataset
     else:
-        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test, indexer.vocabulary_)
diff --git a/quapy/functional.py b/quapy/functional.py
index 726b214..39a867b 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -36,12 +36,12 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
     return p
 
 
-def prevalence_from_labels(labels, n_classes):
+def prevalence_from_labels(labels, classes_):
     if labels.ndim != 1:
         raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
     unique, counts = np.unique(labels, return_counts=True)
     by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
-    prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
+    prevalences = np.asarray([by_class[class_] for class_ in classes_], dtype=float)
     prevalences /= prevalences.sum()
     return prevalences
 
@@ -51,7 +51,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
         raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
     if binarize:
         predictions = np.argmax(posteriors, axis=-1)
-        return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
+        return prevalence_from_labels(predictions, np.arange(posteriors.shape[1]))
     else:
         prevalences = posteriors.mean(axis=0)
         prevalences /= prevalences.sum()
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 332fea0..ff94c21 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -1,6 +1,7 @@
 from abc import abstractmethod
 from copy import deepcopy
 from typing import Union
+
 import numpy as np
 from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator
@@ -8,6 +9,7 @@ from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import StratifiedKFold
 from tqdm import tqdm
+
 import quapy as qp
 import quapy.functional as F
 from quapy.classification.svmperf import SVMperf
@@ -43,7 +45,7 @@ class AggregativeQuantifier(BaseQuantifier):
         return self.aggregate(classif_predictions)
 
     @abstractmethod
-    def aggregate(self, classif_predictions:np.ndarray): ...
+    def aggregate(self, classif_predictions: np.ndarray): ...
 
     def get_params(self, deep=True):
         return self.learner.get_params()
@@ -84,7 +86,7 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
 
     def set_params(self, **parameters):
         if isinstance(self.learner, CalibratedClassifierCV):
-            parameters = {'base_estimator__'+k:v for k,v in parameters.items()}
+            parameters = {'base_estimator__' + k: v for k, v in parameters.items()}
         self.learner.set_params(**parameters)
 
     @property
@@ -98,7 +100,7 @@ def training_helper(learner,
                     data: LabelledCollection,
                     fit_learner: bool = True,
                     ensure_probabilistic=False,
-                    val_split:Union[LabelledCollection, float]=None):
+                    val_split: Union[LabelledCollection, float] = None):
     """
     Training procedure common to all Aggregative Quantifiers.
:param learner: the learner to be fit @@ -122,13 +124,14 @@ def training_helper(learner, if isinstance(val_split, float): if not (0 < val_split < 1): raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)') - train, unused = data.split_stratified(train_prop=1-val_split) - elif val_split.__class__.__name__ == LabelledCollection.__name__: #isinstance(val_split, LabelledCollection): + train, unused = data.split_stratified(train_prop=1 - val_split) + elif val_split.__class__.__name__ == LabelledCollection.__name__: # isinstance(val_split, LabelledCollection): train = data unused = val_split else: - raise ValueError(f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split ' - 'proportion, or a LabelledCollection indicating the validation split') + raise ValueError( + f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split ' + 'proportion, or a LabelledCollection indicating the validation split') else: train, unused = data, None @@ -153,7 +156,7 @@ class CC(AggregativeQuantifier): attributed each of the classes in order to compute class prevalence estimates. """ - def __init__(self, learner:BaseEstimator): + def __init__(self, learner: BaseEstimator): self.learner = learner def fit(self, data: LabelledCollection, fit_learner=True): @@ -167,16 +170,16 @@ class CC(AggregativeQuantifier): return self def aggregate(self, classif_predictions): - return F.prevalence_from_labels(classif_predictions, self.n_classes) + return F.prevalence_from_labels(classif_predictions, self.classes_) class ACC(AggregativeQuantifier): - def __init__(self, learner:BaseEstimator, val_split=0.4): + def __init__(self, learner: BaseEstimator, val_split=0.4): self.learner = learner self.val_split = val_split - def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection]=None): + def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): """ Trains a ACC quantifier :param data: the training set @@ -262,7 +265,7 @@ class PACC(AggregativeProbabilisticQuantifier): self.learner = learner self.val_split = val_split - def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=None): + def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): """ Trains a PACC quantifier :param data: the training set @@ -294,7 +297,8 @@ class PACC(AggregativeProbabilisticQuantifier): y_ = np.vstack(y_) # fit the learner on all data - self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, val_split=None) + self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, + val_split=None) else: self.learner, val_data = training_helper( @@ -307,8 +311,8 @@ class PACC(AggregativeProbabilisticQuantifier): # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi confusion = np.empty(shape=(data.n_classes, data.n_classes)) - for yi in range(data.n_classes): - confusion[yi] = y_[y==yi].mean(axis=0) + for i,class_ in enumerate(data.classes_): + confusion[i] = y_[y == class_].mean(axis=0) self.Pte_cond_estim_ = confusion.T @@ -338,7 +342,7 @@ class EMQ(AggregativeProbabilisticQuantifier): def fit(self, data: LabelledCollection, fit_learner=True): self.learner, _ = 
training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) - self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes) + self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_) return self def aggregate(self, classif_posteriors, epsilon=EPSILON): @@ -366,7 +370,7 @@ class EMQ(AggregativeProbabilisticQuantifier): # M-step: qs = ps.mean(axis=0) - if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s>10: + if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s > 10: converged = True qs_prev_ = qs @@ -389,7 +393,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): self.learner = learner self.val_split = val_split - def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=None): + def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None): """ Trains a HDy quantifier :param data: the training set @@ -405,13 +409,15 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): self._check_binary(data, self.__class__.__name__) self.learner, validation = training_helper( self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) - Px = self.posterior_probabilities(validation.instances)[:,1] # takes only the P(y=+1|x) - self.Pxy1 = Px[validation.labels == 1] - self.Pxy0 = Px[validation.labels == 0] + Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x) + self.Pxy1 = Px[validation.labels == self.learner.classes_[1]] + self.Pxy0 = Px[validation.labels == self.learner.classes_[0]] # pre-compute the histogram for positive and negative examples - self.bins = np.linspace(10, 110, 11, dtype=int) #[10, 20, 30, ..., 100, 110] - self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins} - self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins} + self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110] + self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in + self.bins} + self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in + self.bins} return self def aggregate(self, classif_posteriors): @@ -419,12 +425,12 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): # and the final estimated a priori probability was taken as the median of these 11 estimates." # (González-Castro, et al., 2013). 
- Px = classif_posteriors[:,1] # takes only the P(y=+1|x) + Px = classif_posteriors[:, 1] # takes only the P(y=+1|x) prev_estimations = [] - #for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] - #Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True) - #Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True) + # for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] + # Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True) + # Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True) for bins in self.bins: Pxy0_density = self.Pxy0_density[bins] Pxy1_density = self.Pxy1_density[bins] @@ -433,14 +439,14 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): prev_selected, min_dist = None, None for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0): - Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density + Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density hdy = F.HellingerDistance(Px_train, Px_test) if prev_selected is None or hdy < min_dist: prev_selected, min_dist = prev, hdy prev_estimations.append(prev_selected) - pos_class_prev = np.median(prev_estimations) - return np.asarray([1-pos_class_prev, pos_class_prev]) + class1_prev = np.median(prev_estimations) + return np.asarray([1 - class1_prev, class1_prev]) class ELM(AggregativeQuantifier, BinaryQuantifier): @@ -457,8 +463,8 @@ class ELM(AggregativeQuantifier, BinaryQuantifier): self.learner.fit(data.instances, data.labels) return self - def aggregate(self, classif_predictions:np.ndarray): - return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_) + def aggregate(self, classif_predictions: np.ndarray): + return F.prevalence_from_labels(classif_predictions, self.classes_) def classify(self, X, y=None): return self.learner.predict(X) @@ -470,6 +476,7 @@ class SVMQ(ELM): Quantification-oriented learning based on reliable classifiers. Pattern Recognition, 48(2):591–604. """ + def __init__(self, svmperf_base=None, **kwargs): super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs) @@ -480,6 +487,7 @@ class SVMKLD(ELM): Optimizing text quantifiers for multivariate loss functions. ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27. """ + def __init__(self, svmperf_base=None, **kwargs): super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs) @@ -490,6 +498,7 @@ class SVMNKLD(ELM): Optimizing text quantifiers for multivariate loss functions. ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27. 
""" + def __init__(self, svmperf_base=None, **kwargs): super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs) @@ -531,7 +540,7 @@ class OneVsAll(AggregativeQuantifier): f'{self.__class__.__name__} expect non-binary data' assert isinstance(self.binary_quantifier, BaseQuantifier), \ f'{self.binary_quantifier} does not seem to be a Quantifier' - assert fit_learner==True, 'fit_learner must be True' + assert fit_learner == True, 'fit_learner must be True' self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_} self.__parallel(self._delayed_binary_fit, data) @@ -559,11 +568,11 @@ class OneVsAll(AggregativeQuantifier): def aggregate(self, classif_predictions_bin): if self.probabilistic: - assert classif_predictions_bin.shape[1]==self.n_classes and classif_predictions_bin.shape[2]==2, \ + assert classif_predictions_bin.shape[1] == self.n_classes and classif_predictions_bin.shape[2] == 2, \ 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \ 'probabilities (2 dimensions) for each document (row) and class (columns)' else: - assert set(np.unique(classif_predictions_bin)).issubset({0,1}), \ + assert set(np.unique(classif_predictions_bin)).issubset({0, 1}), \ 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \ 'predictions for each document (row) and class (columns)' prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin) @@ -606,7 +615,7 @@ class OneVsAll(AggregativeQuantifier): return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1] def _delayed_binary_fit(self, c, data): - bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) + bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True]) self.dict_binary_quantifiers[c].fit(bindata) @property @@ -616,9 +625,3 @@ class OneVsAll(AggregativeQuantifier): @property def probabilistic(self): return self.binary_quantifier.probabilistic - - - - - - diff --git a/quapy/method/base.py b/quapy/method/base.py index 59a6bbf..0c2729f 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -19,8 +19,8 @@ class BaseQuantifier(metaclass=ABCMeta): @abstractmethod def get_params(self, deep=True): ... - @abstractmethod @property + @abstractmethod def classes_(self): ... 
    # these methods allows meta-learners to reimplement the decision based on their constituents, and not
diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py
index 1358f71..88209e8 100644
--- a/quapy/tests/test_datasets.py
+++ b/quapy/tests/test_datasets.py
@@ -7,7 +7,11 @@ from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DA
 @pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
 def test_fetch_reviews(dataset_name):
     dataset = fetch_reviews(dataset_name)
-    print(dataset.n_classes, len(dataset.training), len(dataset.test))
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')
+    dataset.test.stats()
 
 
 @pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN)
@@ -18,7 +22,11 @@ def test_fetch_twitter(dataset_name):
         if dataset_name == 'semeval' and ve.args[0].startswith(
                 'dataset "semeval" can only be used for model selection.'):
             dataset = fetch_twitter(dataset_name, for_model_selection=True)
-    print(dataset.n_classes, len(dataset.training), len(dataset.test))
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')
+    dataset.test.stats()
 
 
 @pytest.mark.parametrize('dataset_name', UCI_DATASETS)
@@ -28,5 +36,10 @@ def test_fetch_UCIDataset(dataset_name):
     except FileNotFoundError as fnfe:
         if dataset_name == 'pageblocks.5' and fnfe.args[0].find(
                 'If this is the first time you attempt to load this dataset') > 0:
+            print('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.')
             return
-    print(dataset.n_classes, len(dataset.training), len(dataset.test))
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')
+    dataset.test.stats()
diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py
index d32916d..c036692 100644
--- a/quapy/tests/test_methods.py
+++ b/quapy/tests/test_methods.py
@@ -1,23 +1,23 @@
 import numpy
 import pytest
 from sklearn.linear_model import LogisticRegression
-from sklearn.naive_bayes import MultinomialNB
 from sklearn.svm import LinearSVC
 
 import quapy as qp
+from quapy.data import Dataset, LabelledCollection
 from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS, EXPLICIT_LOSS_MINIMIZATION_METHODS
 from quapy.method.meta import Ensemble
 
 datasets = [pytest.param(qp.datasets.fetch_twitter('hcr'), id='hcr'),
             pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
 
-learners = [LogisticRegression, MultinomialNB, LinearSVC]
+learners = [LogisticRegression, LinearSVC]
 
 
 @pytest.mark.parametrize('dataset', datasets)
 @pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS))
 @pytest.mark.parametrize('learner', learners)
-def test_aggregative_methods(dataset, aggregative_method, learner):
+def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
     model = aggregative_method(learner())
 
     if model.binary and not dataset.binary:
@@ -36,7 +36,7 @@
 
 @pytest.mark.parametrize('dataset', datasets)
 @pytest.mark.parametrize('elm_method', EXPLICIT_LOSS_MINIMIZATION_METHODS)
-def test_elm_methods(dataset, elm_method):
+def test_elm_methods(dataset: Dataset, elm_method):
     try:
         model = elm_method()
     except AssertionError as ae:
@@ -60,7 +60,7 @@
 
 @pytest.mark.parametrize('dataset', datasets)
@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS) -def test_non_aggregative_methods(dataset, non_aggregative_method): +def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method): model = non_aggregative_method() if model.binary and not dataset.binary: @@ -81,7 +81,7 @@ def test_non_aggregative_methods(dataset, non_aggregative_method): @pytest.mark.parametrize('learner', learners) @pytest.mark.parametrize('dataset', datasets) @pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES) -def test_ensemble_method(base_method, learner, dataset, policy): +def test_ensemble_method(base_method, learner, dataset: Dataset, policy): qp.environ['SAMPLE_SIZE'] = len(dataset.training) model = Ensemble(quantifier=base_method(learner()), size=5, policy=policy, n_jobs=-1) if model.binary and not dataset.binary: @@ -100,10 +100,12 @@ def test_ensemble_method(base_method, learner, dataset, policy): def test_quanet_method(): dataset = qp.datasets.fetch_reviews('kindle', pickle=True) + dataset = Dataset(dataset.training.sampling(100, *dataset.training.prevalence()), + dataset.test.sampling(100, *dataset.test.prevalence())) qp.data.preprocessing.index(dataset, min_df=5, inplace=True) from quapy.classification.neural import CNNnet - cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) + cnn = CNNnet(dataset.vocabulary_size, dataset.training.n_classes) from quapy.classification.neural import NeuralClassifierTrainer learner = NeuralClassifierTrainer(cnn, device='cuda') @@ -123,3 +125,50 @@ def test_quanet_method(): error = qp.error.mae(true_prevalences, estim_prevalences) assert type(error) == numpy.float64 + + +def models_to_test_for_str_label_names(): + models = list() + learner = LogisticRegression + for method in AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS): + models.append(method(learner())) + for method in NON_AGGREGATIVE_METHODS: + models.append(method()) + return models + + +@pytest.mark.parametrize('model', models_to_test_for_str_label_names()) +def test_str_label_names(model): + dataset = qp.datasets.fetch_reviews('imdb', pickle=True) + dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()), + dataset.test.sampling(1000, *dataset.test.prevalence())) + qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True) + + model.fit(dataset.training) + + int_estim_prevalences = model.quantify(dataset.test.instances) + true_prevalences = dataset.test.prevalence() + + error = qp.error.mae(true_prevalences, int_estim_prevalences) + assert type(error) == numpy.float64 + + dataset_str = Dataset(LabelledCollection(dataset.training.instances, + ['one' if label == 1 else 'zero' for label in dataset.training.labels]), + LabelledCollection(dataset.test.instances, + ['one' if label == 1 else 'zero' for label in dataset.test.labels])) + + model.fit(dataset_str.training) + + str_estim_prevalences = model.quantify(dataset_str.test.instances) + true_prevalences = dataset_str.test.prevalence() + + error = qp.error.mae(true_prevalences, str_estim_prevalences) + assert type(error) == numpy.float64 + + print(true_prevalences) + print(int_estim_prevalences) + print(str_estim_prevalences) + + numpy.testing.assert_almost_equal(int_estim_prevalences[1], + str_estim_prevalences[list(model.classes_).index('one')]) +