From 75af15ae4ac581bd16a412af256b30db3449caed Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Wed, 28 Feb 2024 08:46:54 +0100
Subject: [PATCH 01/15] force all samples be with replacement in
 base.LabelledCollection, irrespective of the sample size requested

---
 quapy/data/base.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/quapy/data/base.py b/quapy/data/base.py
index 9cc6441..2629084 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -108,8 +108,7 @@ class LabelledCollection:
         """
         Returns an index to be used to extract a random sample of desired size and desired prevalence
         values. If the prevalence values are not specified, then returns the index of a uniform sampling.
-        For each class, the sampling is drawn with replacement if the requested prevalence is larger than
-        the actual prevalence of the class, or without replacement otherwise.
+        For each class, the sampling is drawn with replacement.
 
         :param size: integer, the requested size
         :param prevs: the prevalence for each class; the prevalence value for the last class can be left empty since
@@ -153,7 +152,7 @@ class LabelledCollection:
         for class_, n_requested in n_requests.items():
             n_candidates = len(self.index[class_])
             index_sample = self.index[class_][
-                np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
+                np.random.choice(n_candidates, size=n_requested, replace=True)
             ] if n_requested > 0 else []
             indexes_sample.append(index_sample)
 
@@ -168,8 +167,7 @@ class LabelledCollection:
     def uniform_sampling_index(self, size, random_state=None):
         """
         Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
-        with replacement if the requested size is greater than the number of instances, or without replacement
-        otherwise.
+        with replacement.
 
         :param size: integer, the size of the uniform sample
         :param random_state: if specified, guarantees reproducibility of the split.
@@ -179,13 +177,12 @@ class LabelledCollection:
             ng = RandomState(seed=random_state)
         else:
             ng = np.random
-        return ng.choice(len(self), size, replace=size > len(self))
+        return ng.choice(len(self), size, replace=True)
 
     def sampling(self, size, *prevs, shuffle=True, random_state=None):
         """
         Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
-        values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
-        the actual prevalence of the class, or with replacement otherwise.
+        values. For each class, the sampling is drawn with replacement.
 
         :param size: integer, the requested size
         :param prevs: the prevalence for each class; the prevalence value for the last class can be left empty since
@@ -202,8 +199,7 @@ class LabelledCollection:
     def uniform_sampling(self, size, random_state=None):
         """
         Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
-        with replacement if the requested size is greater than the number of instances, or without replacement
-        otherwise.
+        with replacement.
 
         :param size: integer, the requested size
         :param random_state: if specified, guarantees reproducibility of the split.
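A minimal sketch of the behavior this patch enables (illustrative only, not part of the patch; the 'hp' reviews dataset and the requested sizes are arbitrary examples):

    import quapy as qp

    train = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10).training

    # sampling is now always drawn with replacement, so a sample larger than
    # the collection itself (or than any single class) is legal
    sample = train.sampling(len(train) + 500, 0.25, 0.75, random_state=0)
    print(len(sample), sample.prevalence())

    # the same holds for uniform sampling
    uniform = train.uniform_sampling(len(train) + 500, random_state=0)
    print(len(uniform))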
From 9207114cfa9f307363212aeaf6febd2535124d39 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 15 Apr 2024 18:00:38 +0200 Subject: [PATCH 02/15] improving unit tests --- quapy/method/__init__.py | 23 ++++++++++++++++++ quapy/method/aggregative.py | 2 +- quapy/tests/test_evaluation.py | 13 ++++++---- quapy/tests/test_hierarchy.py | 44 ++++++++++++++++++++++++++++++---- 4 files changed, 73 insertions(+), 9 deletions(-) diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index 51b02c2..e0d5c1f 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -17,12 +17,35 @@ AGGREGATIVE_METHODS = { aggregative.MAX, aggregative.MS, aggregative.MS2, + aggregative.DMy, aggregative.KDEyML, aggregative.KDEyCS, aggregative.KDEyHD, aggregative.BayesianCC } +BINARY_METHODS = { + aggregative.HDy, + aggregative.DyS, + aggregative.SMM, + aggregative.X, + aggregative.T50, + aggregative.MAX, + aggregative.MS, + aggregative.MS2, +} + +MULTICLASS_METHODS = { + aggregative.CC, + aggregative.ACC, + aggregative.PCC, + aggregative.PACC, + aggregative.EMQ, + aggregative.KDEyML, + aggregative.KDEyCS, + aggregative.KDEyHD, + aggregative.BayesianCC +} NON_AGGREGATIVE_METHODS = { non_aggregative.MaximumLikelihoodPrevalenceEstimation, diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 46e56d7..5a7812d 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1522,8 +1522,8 @@ AdjustedClassifyAndCount = ACC ProbabilisticClassifyAndCount = PCC ProbabilisticAdjustedClassifyAndCount = PACC ExpectationMaximizationQuantifier = EMQ -DistributionMatchingY = DMy SLD = EMQ +DistributionMatchingY = DMy HellingerDistanceY = HDy MedianSweep = MS MedianSweep2 = MS2 diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py index 5c50218..137a38a 100644 --- a/quapy/tests/test_evaluation.py +++ b/quapy/tests/test_evaluation.py @@ -6,14 +6,17 @@ import quapy as qp from sklearn.linear_model import LogisticRegression from time import time -from quapy.error import QUANTIFICATION_ERROR_SINGLE, QUANTIFICATION_ERROR, QUANTIFICATION_ERROR_NAMES, \ - QUANTIFICATION_ERROR_SINGLE_NAMES +from quapy.error import QUANTIFICATION_ERROR_SINGLE_NAMES from quapy.method.aggregative import EMQ, PCC from quapy.method.base import BaseQuantifier class EvalTestCase(unittest.TestCase): + def test_eval_speedup(self): + """ + Checks whether the speed-up heuristics used by qp.evaluation work, i.e., actually save time + """ data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True) train, test = data.training, data.test @@ -55,8 +58,11 @@ class EvalTestCase(unittest.TestCase): self.assertEqual(tend_no_optim>(tend_optim/2), True) def test_evaluation_output(self): + """ + Checks the evaluation functions return correct types for different error_metrics + """ - data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True) + data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True).reduce(n_train=100, n_test=100) train, test = data.training, data.test qp.environ['SAMPLE_SIZE']=100 @@ -79,6 +85,5 @@ class EvalTestCase(unittest.TestCase): self.assertEqual(scores.mean(), score) - if __name__ == '__main__': unittest.main() diff --git a/quapy/tests/test_hierarchy.py b/quapy/tests/test_hierarchy.py index 33af5da..0797729 100644 --- a/quapy/tests/test_hierarchy.py +++ b/quapy/tests/test_hierarchy.py @@ -1,19 +1,54 @@ import unittest from sklearn.linear_model import LogisticRegression + +from quapy.method import AGGREGATIVE_METHODS, 
BINARY_METHODS from quapy.method.aggregative import * +import inspect class HierarchyTestCase(unittest.TestCase): def test_aggregative(self): lr = LogisticRegression() - for m in [CC(lr), PCC(lr), ACC(lr), PACC(lr)]: - self.assertEqual(isinstance(m, AggregativeQuantifier), True) + for m in AGGREGATIVE_METHODS: + self.assertEqual(isinstance(m(lr), AggregativeQuantifier), True) + + def test_inspect_aggregative(self): + + import quapy.method.aggregative as aggregative + + members = inspect.getmembers(aggregative) + classes = set([cls for name, cls in members if inspect.isclass(cls)]) + quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)] + quantifiers = [cls for cls in quantifiers if issubclass(cls, AggregativeQuantifier)] + quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls) ] + + for cls in quantifiers: + self.assertIn(cls, AGGREGATIVE_METHODS) def test_binary(self): lr = LogisticRegression() - for m in [HDy(lr)]: - self.assertEqual(isinstance(m, BinaryQuantifier), True) + for m in BINARY_METHODS: + self.assertEqual(isinstance(m(lr), BinaryQuantifier), True) + + def test_inspect_binary(self): + + import quapy.method.base as base + import quapy.method.aggregative as aggregative + import quapy.method.non_aggregative as non_aggregative + import quapy.method.meta as meta + + members = inspect.getmembers(base) + members+= inspect.getmembers(aggregative) + members += inspect.getmembers(non_aggregative) + members += inspect.getmembers(meta) + classes = set([cls for name, cls in members if inspect.isclass(cls)]) + quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)] + quantifiers = [cls for cls in quantifiers if issubclass(cls, BinaryQuantifier)] + quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls) ] + + for cls in quantifiers: + self.assertIn(cls, BINARY_METHODS) def test_probabilistic(self): lr = LogisticRegression() @@ -27,3 +62,4 @@ class HierarchyTestCase(unittest.TestCase): if __name__ == '__main__': unittest.main() + From 561b672200575de50fef9d25b046ad8bee44479d Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Tue, 16 Apr 2024 15:12:22 +0200 Subject: [PATCH 03/15] updated unit tests --- examples/model_selection.py | 13 +- quapy/data/base.py | 2 +- quapy/data/datasets.py | 8 +- quapy/functional.py | 15 ++ quapy/method/__init__.py | 1 + quapy/method/_threshold_optim.py | 2 +- quapy/method/aggregative.py | 19 ++- quapy/method/base.py | 4 +- quapy/tests/test_base.py | 5 - quapy/tests/test_datasets.py | 61 -------- quapy/tests/test_hierarchy.py | 65 --------- quapy/tests/test_methods.py | 234 ------------------------------ quapy/tests/test_modsel.py | 74 ++++------ quapy/tests/test_replicability.py | 6 +- 14 files changed, 82 insertions(+), 427 deletions(-) delete mode 100644 quapy/tests/test_base.py delete mode 100644 quapy/tests/test_datasets.py delete mode 100644 quapy/tests/test_hierarchy.py delete mode 100644 quapy/tests/test_methods.py diff --git a/examples/model_selection.py b/examples/model_selection.py index 08fbe34..130b542 100644 --- a/examples/model_selection.py +++ b/examples/model_selection.py @@ -12,12 +12,11 @@ from time import time In this example, we show how to perform model selection on a DistributionMatching quantifier. 
""" -model = KDEyML(LogisticRegression()) +model = DMy(LogisticRegression()) qp.environ['SAMPLE_SIZE'] = 100 qp.environ['N_JOBS'] = -1 -# training, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test with qp.util.temp_seed(0): @@ -34,19 +33,21 @@ with qp.util.temp_seed(0): # We will explore a classification-dependent hyper-parameter (e.g., the 'C' # hyper-parameter of LogisticRegression) and a quantification-dependent hyper-parameter - # (e.g., the number of bins in a DistributionMatching quantifier. + # (e.g., the number of bins in a DistributionMatching quantifier). # Classifier-dependent hyper-parameters have to be marked with a prefix "classifier__" # in order to let the quantifier know this hyper-parameter belongs to its underlying # classifier. + # We consider 7 values for the classifier and 7 values for the quantifier. + # QuaPy is optimized so that only 7 classifiers are trained, and then reused to test the + # different configurations of the quantifier. In other words, quapy avoids to train + # the classifier 7x7 times. param_grid = { 'classifier__C': np.logspace(-3,3,7), - 'classifier__class_weight': ['balanced', None], - 'bandwidth': np.linspace(0.01, 0.2, 20), + 'nbins': [2, 3, 4, 5, 10, 15, 20] } tinit = time() - # model = OLD_GridSearchQ( model = qp.model_selection.GridSearchQ( model=model, param_grid=param_grid, diff --git a/quapy/data/base.py b/quapy/data/base.py index 2629084..e52230e 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -123,7 +123,7 @@ class LabelledCollection: if len(prevs) == self.n_classes - 1: prevs = prevs + (1 - sum(prevs),) assert len(prevs) == self.n_classes, 'unexpected number of prevalences' - assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' + assert np.isclose(sum(prevs), 1), f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' # Decide how many instances should be taken for each class in order to satisfy the requested prevalence # accurately, and the number of instances in the sample (exactly). 
If int(size * prevs[i]) (which is diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 5b9806f..0f732e8 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -50,7 +50,9 @@ UCI_MULTICLASS_DATASETS = ['dry-bean', 'digits', 'letter'] -LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B'] +LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B'] +LEQUA2022_TEXT_TASKS = ['T2A', 'T2B'] +LEQUA2022_TASKS = LEQUA2022_VECTOR_TASKS + LEQUA2022_TEXT_TASKS _TXA_SAMPLE_SIZE = 250 _TXB_SAMPLE_SIZE = 1000 @@ -209,7 +211,7 @@ def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose :return: a :class:`quapy.data.base.Dataset` instance """ data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose) - return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name) def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: @@ -583,7 +585,7 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver :return: a :class:`quapy.data.base.Dataset` instance """ data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) - return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name) def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: diff --git a/quapy/functional.py b/quapy/functional.py index 856534a..fa17a5c 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -189,6 +189,19 @@ def check_prevalence_vector(prevalences: ArrayLike, raise_exception: bool=False, return valid +def uniform_prevalence(n_classes): + """ + Returns a vector representing the uniform distribution for `n_classes` + + :param n_classes: number of classes + :return: np.ndarray with all values 1/n_classes + """ + assert isinstance(n_classes, int) and n_classes>0, \ + (f'param {n_classes} not understood; must be a positive integer representing the ' + f'number of classes ') + return np.full(shape=n_classes, fill_value=1./n_classes) + + def normalize_prevalence(prevalences: ArrayLike, method='l1'): """ Normalizes a vector or matrix of prevalence values. The normalization consists of applying a L1 normalization in @@ -606,3 +619,5 @@ def solve_adjustment( raise ValueError(f"Solver {solver} not known.") else: raise ValueError(f'unknown {solver=}') + + diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index e0d5c1f..e1d6309 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -3,6 +3,7 @@ from . import aggregative from . import non_aggregative from . import meta + AGGREGATIVE_METHODS = { aggregative.CC, aggregative.ACC, diff --git a/quapy/method/_threshold_optim.py b/quapy/method/_threshold_optim.py index 6a38fdb..a9d2723 100644 --- a/quapy/method/_threshold_optim.py +++ b/quapy/method/_threshold_optim.py @@ -27,7 +27,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier): :class:`quapy.data.base.LabelledCollection` (the split itself). 
""" - def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None): + def __init__(self, classifier: BaseEstimator, val_split=None, n_jobs=None): self.classifier = classifier self.val_split = val_split self.n_jobs = qp._get_njobs(n_jobs) diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 5a7812d..3470726 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -82,6 +82,13 @@ class AggregativeQuantifier(BaseQuantifier, ABC): :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data :param fit_classifier: whether to train the learner (default is True). Set to False if the learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. :return: self """ self._check_init_parameters() @@ -111,6 +118,12 @@ class AggregativeQuantifier(BaseQuantifier, ABC): if fit_classifier: self._check_non_empty_classes(data) + if predict_on is None: + if not fit_classifier: + predict_on = data + if isinstance(self.val_split, LabelledCollection) and self.val_split!=predict_on: + raise ValueError(f'{fit_classifier=} but a LabelledCollection was provided as val_split ' + f'in __init__ that is not the same as the LabelledCollection provided in fit.') if predict_on is None: predict_on = self.val_split @@ -467,7 +480,7 @@ class ACC(AggregativeCrispQuantifier): if self.method not in ACC.METHODS: raise ValueError(f"unknown method; valid ones are {ACC.METHODS}") if self.norm not in ACC.NORMALIZATIONS: - raise ValueError(f"unknown clipping; valid ones are {ACC.NORMALIZATIONS}") + raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): """ @@ -577,8 +590,8 @@ class PACC(AggregativeSoftQuantifier): raise ValueError(f"unknown solver; valid ones are {ACC.SOLVERS}") if self.method not in ACC.METHODS: raise ValueError(f"unknown method; valid ones are {ACC.METHODS}") - if self.clipping not in ACC.NORMALIZATIONS: - raise ValueError(f"unknown clipping; valid ones are {ACC.NORMALIZATIONS}") + if self.norm not in ACC.NORMALIZATIONS: + raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): """ diff --git a/quapy/method/base.py b/quapy/method/base.py index f34acf6..58cd6f1 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -54,7 +54,7 @@ class OneVsAll: pass -def newOneVsAll(binary_quantifier, n_jobs=None): +def newOneVsAll(binary_quantifier: BaseQuantifier, n_jobs=None): assert isinstance(binary_quantifier, BaseQuantifier), \ f'{binary_quantifier} does not seem to be a Quantifier' if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier): @@ -69,7 +69,7 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier): quantifier for each class, and then l1-normalizes the outputs so that 
the class prevalence values sum up to 1.
     """
 
-    def __init__(self, binary_quantifier, n_jobs=None):
+    def __init__(self, binary_quantifier: BaseQuantifier, n_jobs=None):
         assert isinstance(binary_quantifier, BaseQuantifier), \
             f'{binary_quantifier} does not seem to be a Quantifier'
         if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier):
diff --git a/quapy/tests/test_base.py b/quapy/tests/test_base.py
deleted file mode 100644
index 4fd9faa..0000000
--- a/quapy/tests/test_base.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import pytest
-
-def test_import():
-    import quapy as qp
-    assert qp.__version__ is not None
diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py
deleted file mode 100644
index 4ed5aa9..0000000
--- a/quapy/tests/test_datasets.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import pytest
-
-from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
-    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_BINARY_DATASETS, LEQUA2022_TASKS, UCI_MULTICLASS_DATASETS,\
-    fetch_reviews, fetch_twitter, fetch_UCIBinaryDataset, fetch_lequa2022, fetch_UCIMulticlassLabelledCollection
-
-
-@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
-def test_fetch_reviews(dataset_name):
-    dataset = fetch_reviews(dataset_name)
-    print(f'Dataset {dataset_name}')
-    print('Training set stats')
-    dataset.training.stats()
-    print('Test set stats')
-    dataset.test.stats()
-
-
-@pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN)
-def test_fetch_twitter(dataset_name):
-    try:
-        dataset = fetch_twitter(dataset_name)
-    except ValueError as ve:
-        if dataset_name == 'semeval' and ve.args[0].startswith(
-                'dataset "semeval" can only be used for model selection.'):
-            dataset = fetch_twitter(dataset_name, for_model_selection=True)
-    print(f'Dataset {dataset_name}')
-    print('Training set stats')
-    dataset.training.stats()
-    print('Test set stats')
-
-
-@pytest.mark.parametrize('dataset_name', UCI_BINARY_DATASETS)
-def test_fetch_UCIDataset(dataset_name):
-    try:
-        dataset = fetch_UCIBinaryDataset(dataset_name)
-    except FileNotFoundError as fnfe:
-        if dataset_name == 'pageblocks.5' and fnfe.args[0].find(
-                'If this is the first time you attempt to load this dataset') > 0:
-            print('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.')
-            return
-    print(f'Dataset {dataset_name}')
-    print('Training set stats')
-    dataset.training.stats()
-    print('Test set stats')
-
-
-@pytest.mark.parametrize('dataset_name', UCI_MULTICLASS_DATASETS)
-def test_fetch_UCIMultiDataset(dataset_name):
-    dataset = fetch_UCIMulticlassLabelledCollection(dataset_name)
-    print(f'Dataset {dataset_name}')
-    print('Training set stats')
-    dataset.stats()
-    print('Test set stats')
-
-
-@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
-def test_fetch_lequa2022(dataset_name):
-    train, gen_val, gen_test = fetch_lequa2022(dataset_name)
-    print(train.stats())
-    print('Val:', gen_val.total())
-    print('Test:', gen_test.total())
diff --git a/quapy/tests/test_hierarchy.py b/quapy/tests/test_hierarchy.py
deleted file mode 100644
index 0797729..0000000
--- a/quapy/tests/test_hierarchy.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import unittest
-from sklearn.linear_model import LogisticRegression
-
-from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS
-from quapy.method.aggregative import *
-import inspect
-
-
-class HierarchyTestCase(unittest.TestCase):
-
-    def test_aggregative(self):
-        lr = LogisticRegression()
- for m in AGGREGATIVE_METHODS: - self.assertEqual(isinstance(m(lr), AggregativeQuantifier), True) - - def test_inspect_aggregative(self): - - import quapy.method.aggregative as aggregative - - members = inspect.getmembers(aggregative) - classes = set([cls for name, cls in members if inspect.isclass(cls)]) - quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)] - quantifiers = [cls for cls in quantifiers if issubclass(cls, AggregativeQuantifier)] - quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls) ] - - for cls in quantifiers: - self.assertIn(cls, AGGREGATIVE_METHODS) - - def test_binary(self): - lr = LogisticRegression() - for m in BINARY_METHODS: - self.assertEqual(isinstance(m(lr), BinaryQuantifier), True) - - def test_inspect_binary(self): - - import quapy.method.base as base - import quapy.method.aggregative as aggregative - import quapy.method.non_aggregative as non_aggregative - import quapy.method.meta as meta - - members = inspect.getmembers(base) - members+= inspect.getmembers(aggregative) - members += inspect.getmembers(non_aggregative) - members += inspect.getmembers(meta) - classes = set([cls for name, cls in members if inspect.isclass(cls)]) - quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)] - quantifiers = [cls for cls in quantifiers if issubclass(cls, BinaryQuantifier)] - quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls) ] - - for cls in quantifiers: - self.assertIn(cls, BINARY_METHODS) - - def test_probabilistic(self): - lr = LogisticRegression() - for m in [CC(lr), ACC(lr)]: - self.assertEqual(isinstance(m, AggregativeCrispQuantifier), True) - self.assertEqual(isinstance(m, AggregativeSoftQuantifier), False) - for m in [PCC(lr), PACC(lr)]: - self.assertEqual(isinstance(m, AggregativeCrispQuantifier), False) - self.assertEqual(isinstance(m, AggregativeSoftQuantifier), True) - - -if __name__ == '__main__': - unittest.main() - diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py deleted file mode 100644 index 3fbe991..0000000 --- a/quapy/tests/test_methods.py +++ /dev/null @@ -1,234 +0,0 @@ -import numpy as np -import pytest -from sklearn.linear_model import LogisticRegression -from sklearn.svm import LinearSVC - -import method.aggregative -import quapy as qp -from quapy.model_selection import GridSearchQ -from quapy.method.base import BinaryQuantifier -from quapy.data import Dataset, LabelledCollection -from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS -from quapy.method.meta import Ensemble -from quapy.protocol import APP -from quapy.method.aggregative import DMy -from quapy.method.meta import MedianEstimator - -# datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'), -# pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')] - -tinydatasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True).reduce(), id='tiny_hcr'), - pytest.param(qp.datasets.fetch_UCIBinaryDataset('ionosphere').reduce(), id='tiny_ionosphere')] - -learners = [LogisticRegression, LinearSVC] - - -@pytest.mark.parametrize('dataset', tinydatasets) -@pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS) -@pytest.mark.parametrize('learner', learners) -def test_aggregative_methods(dataset: Dataset, aggregative_method, learner): - model = aggregative_method(learner()) - - if isinstance(model, BinaryQuantifier) and not dataset.binary: - print(f'skipping the test of binary model {type(model)} on non-binary dataset 
{dataset}') - return - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -@pytest.mark.parametrize('dataset', tinydatasets) -@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS) -def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method): - model = non_aggregative_method() - - if isinstance(model, BinaryQuantifier) and not dataset.binary: - print(f'skipping the test of binary model {model} on non-binary dataset {dataset}') - return - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -@pytest.mark.parametrize('base_method', [method.aggregative.ACC, method.aggregative.PACC]) -@pytest.mark.parametrize('learner', [LogisticRegression]) -@pytest.mark.parametrize('dataset', tinydatasets) -@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES) -def test_ensemble_method(base_method, learner, dataset: Dataset, policy): - - qp.environ['SAMPLE_SIZE'] = 20 - - base_quantifier=base_method(learner()) - - if not dataset.binary and policy=='ds': - print(f'skipping the test of binary policy ds on non-binary dataset {dataset}') - return - - model = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1) - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -def test_quanet_method(): - try: - import quapy.classification.neural - except ModuleNotFoundError: - print('skipping QuaNet test due to missing torch package') - return - - qp.environ['SAMPLE_SIZE'] = 100 - - # load the kindle dataset as text, and convert words to numerical indexes - dataset = qp.datasets.fetch_reviews('kindle', pickle=True).reduce(200, 200) - qp.data.preprocessing.index(dataset, min_df=5, inplace=True) - - from quapy.classification.neural import CNNnet - cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) - - from quapy.classification.neural import NeuralClassifierTrainer - learner = NeuralClassifierTrainer(cnn, device='cuda') - - from quapy.method.meta import QuaNet - model = QuaNet(learner, device='cuda') - - if isinstance(model, BinaryQuantifier) and not dataset.binary: - print(f'skipping the test of binary model {model} on non-binary dataset {dataset}') - return - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -def test_str_label_names(): - model = qp.method.aggregative.CC(LogisticRegression()) - - dataset = qp.datasets.fetch_reviews('imdb', pickle=True) - dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()), - dataset.test.sampling(1000, 0.25, 0.75)) - qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True) - - np.random.seed(0) - model.fit(dataset.training) - - int_estim_prevalences = model.quantify(dataset.test.instances) - true_prevalences = dataset.test.prevalence() - - error = qp.error.mae(true_prevalences, int_estim_prevalences) - assert type(error) == np.float64 - - 
dataset_str = Dataset(LabelledCollection(dataset.training.instances, - ['one' if label == 1 else 'zero' for label in dataset.training.labels]), - LabelledCollection(dataset.test.instances, - ['one' if label == 1 else 'zero' for label in dataset.test.labels])) - assert all(dataset_str.training.classes_ == dataset_str.test.classes_), 'wrong indexation' - np.random.seed(0) - model.fit(dataset_str.training) - - str_estim_prevalences = model.quantify(dataset_str.test.instances) - true_prevalences = dataset_str.test.prevalence() - - error = qp.error.mae(true_prevalences, str_estim_prevalences) - assert type(error) == np.float64 - - print(true_prevalences) - print(int_estim_prevalences) - print(str_estim_prevalences) - - np.testing.assert_almost_equal(int_estim_prevalences[1], - str_estim_prevalences[list(model.classes_).index('one')]) - -# helper -def __fit_test(quantifier, train, test): - quantifier.fit(train) - test_samples = APP(test) - true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, test_samples) - return qp.error.mae(true_prevs, estim_prevs), estim_prevs - - -def test_median_meta(): - """ - This test compares the performance of the MedianQuantifier with respect to computing the median of the predictions - of a differently parameterized quantifier. We use the DistributionMatching base quantifier and the median is - computed across different values of nbins - """ - - qp.environ['SAMPLE_SIZE'] = 100 - - # grid of values - nbins_grid = list(range(2, 11)) - - dataset = 'kindle' - train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test - prevs = [] - errors = [] - for nbins in nbins_grid: - with qp.util.temp_seed(0): - q = DMy(LogisticRegression(), nbins=nbins) - mae, estim_prevs = __fit_test(q, train, test) - prevs.append(estim_prevs) - errors.append(mae) - print(f'{dataset} DistributionMatching(nbins={nbins}) got MAE {mae:.4f}') - prevs = np.asarray(prevs) - mae = np.mean(errors) - print(f'\tMAE={mae:.4f}') - - q = DMy(LogisticRegression()) - q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) - median_mae, prev = __fit_test(q, train, test) - print(f'\tMAE={median_mae:.4f}') - - np.testing.assert_almost_equal(np.median(prevs, axis=0), prev) - assert median_mae < mae, 'the median-based quantifier provided a higher error...' - - -def test_median_meta_modsel(): - """ - This test checks the median-meta quantifier with model selection - """ - - qp.environ['SAMPLE_SIZE'] = 100 - - dataset = 'kindle' - train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test - train, val = train.split_stratified(random_state=0) - - nbins_grid = [2, 4, 5, 10, 15] - - q = DMy(LogisticRegression()) - q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) - median_mae, _ = __fit_test(q, train, test) - print(f'\tMAE={median_mae:.4f}') - - q = DMy(LogisticRegression()) - lr_params = {'classifier__C': np.logspace(-1, 1, 3)} - q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) - q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1) - optimized_median_ave, _ = __fit_test(q, train, test) - print(f'\tMAE={optimized_median_ave:.4f}') - - assert optimized_median_ave < median_mae, "the optimized method yielded worse performance..." 
\ No newline at end of file
diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py
index 75cfaaf..fe416c7 100644
--- a/quapy/tests/test_modsel.py
+++ b/quapy/tests/test_modsel.py
@@ -2,9 +2,9 @@ import unittest
 import numpy as np
 from sklearn.linear_model import LogisticRegression
-from sklearn.svm import SVC
 
 import quapy as qp
+import util
 from quapy.method.aggregative import PACC
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import APP
@@ -14,13 +14,16 @@ import time
 
 class ModselTestCase(unittest.TestCase):
 
     def test_modsel(self):
+        """
+        Checks whether a model selection exploration selects a good hyperparameter
+        """
 
         q = PACC(LogisticRegression(random_state=1, max_iter=5000))
 
-        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
+        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce()
         training, validation = data.training.split_stratified(0.7, random_state=1)
 
-        param_grid = {'classifier__C': np.logspace(-3,3,7)}
+        param_grid = {'classifier__C': [0.000001, 10.]}
         app = APP(validation, sample_size=100, random_state=1)
         q = GridSearchQ(
             q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
@@ -32,54 +35,40 @@ class ModselTestCase(unittest.TestCase):
         self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0)
 
     def test_modsel_parallel(self):
+        """
+        Checks whether a parallelized model selection is actually faster than a sequential exploration while
+        obtaining the same optimal parameters
+        """
 
         q = PACC(LogisticRegression(random_state=1, max_iter=5000))
 
-        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
+        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500)
         training, validation = data.training.split_stratified(0.7, random_state=1)
-        # test = data.test
 
         param_grid = {'classifier__C': np.logspace(-3,3,7)}
         app = APP(validation, sample_size=100, random_state=1)
-        q = GridSearchQ(
+
+        print('starting model selection in sequential exploration')
+        tinit = time.time()
+        modsel = GridSearchQ(
+            q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=1, verbose=True
+        ).fit(training)
+        tend_seq = time.time()-tinit
+        best_c_seq = modsel.best_params_['classifier__C']
+        print(f'[done] took {tend_seq:.2f}s best C = {best_c_seq}')
+
+        print('starting model selection in parallel exploration')
+        tinit = time.time()
+        modsel = GridSearchQ(
             q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
         ).fit(training)
-        print('best params', q.best_params_)
-        print('best score', q.best_score_)
+        tend_par = time.time() - tinit
+        best_c_par = modsel.best_params_['classifier__C']
+        print(f'[done] took {tend_par:.2f}s best C = {best_c_par}')
 
-        self.assertEqual(q.best_params_['classifier__C'], 10.0)
-        self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0)
+        self.assertEqual(best_c_seq, best_c_par)
+        self.assertLess(tend_par, tend_seq)
 
-    def test_modsel_parallel_speedup(self):
-        class SlowLR(LogisticRegression):
-            def fit(self, X, y, sample_weight=None):
-                time.sleep(1)
-                return super(SlowLR, self).fit(X, y, sample_weight)
-
-        q = PACC(SlowLR(random_state=1, max_iter=5000))
-
-        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
-        training, validation = data.training.split_stratified(0.7, random_state=1)
-
-        param_grid = {'classifier__C': np.logspace(-3, 3, 7)}
-        app = APP(validation, sample_size=100, random_state=1)
-
-        tinit = time.time()
-        GridSearchQ(
-            q, param_grid, protocol=app, error='mae', refit=False,
timeout=-1, n_jobs=1, verbose=True - ).fit(training) - tend_nooptim = time.time()-tinit - - tinit = time.time() - GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True - ).fit(training) - tend_optim = time.time() - tinit - - print(f'parallel training took {tend_optim:.4f}s') - print(f'sequential training took {tend_nooptim:.4f}s') - - self.assertEqual(tend_optim < (0.5*tend_nooptim), True) def test_modsel_timeout(self): @@ -91,11 +80,10 @@ class ModselTestCase(unittest.TestCase): q = PACC(SlowLR()) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() training, validation = data.training.split_stratified(0.7, random_state=1) - # test = data.test - param_grid = {'classifier__C': np.logspace(-3,3,7)} + param_grid = {'classifier__C': np.logspace(-1,1,3)} app = APP(validation, sample_size=100, random_state=1) print('Expecting TimeoutError to be raised') diff --git a/quapy/tests/test_replicability.py b/quapy/tests/test_replicability.py index 8633fc4..434d44b 100644 --- a/quapy/tests/test_replicability.py +++ b/quapy/tests/test_replicability.py @@ -8,7 +8,7 @@ from quapy.method.aggregative import PACC import quapy.functional as F -class MyTestCase(unittest.TestCase): +class TestReplicability(unittest.TestCase): def test_prediction_replicability(self): @@ -26,7 +26,7 @@ class MyTestCase(unittest.TestCase): prev2 = pacc.fit(dataset.training).quantify(dataset.test.X) str_prev2 = strprev(prev2, prec=5) - self.assertEqual(str_prev1, str_prev2) # add assertion here + self.assertEqual(str_prev1, str_prev2) def test_samping_replicability(self): @@ -78,7 +78,7 @@ class MyTestCase(unittest.TestCase): def test_parallel_replicability(self): - train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').train_test + train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').reduce().train_test test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0]) From db6ff4ab9ede1bbafaf57b6e05bfeb5504cb5117 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Tue, 16 Apr 2024 17:46:58 +0200 Subject: [PATCH 04/15] refactored unittests --- quapy/tests/test_base.py | 11 ++++ quapy/tests/test_datasets.py | 119 ++++++++++++++++++++++++++++++++++ quapy/tests/test_hierarchy.py | 46 +++++++++++++ quapy/tests/test_methods.py | 92 ++++++++++++++++++++++++++ 4 files changed, 268 insertions(+) create mode 100644 quapy/tests/test_base.py create mode 100644 quapy/tests/test_datasets.py create mode 100644 quapy/tests/test_hierarchy.py create mode 100644 quapy/tests/test_methods.py diff --git a/quapy/tests/test_base.py b/quapy/tests/test_base.py new file mode 100644 index 0000000..7e2b4f8 --- /dev/null +++ b/quapy/tests/test_base.py @@ -0,0 +1,11 @@ +import unittest + + +class ImportTest(unittest.TestCase): + def test_import(self): + import quapy as qp + self.assertIsNotNone(qp.__version__) + + +if __name__ == '__main__': + unittest.main() diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py new file mode 100644 index 0000000..daa9207 --- /dev/null +++ b/quapy/tests/test_datasets.py @@ -0,0 +1,119 @@ +import unittest + +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression + +import quapy.functional as F +from quapy.method.aggregative import PCC +from quapy.data.datasets import * + + +class TestDatasets(unittest.TestCase): + + def new_quantifier(self): + return PCC(LogisticRegression(C=0.001, 
max_iter=100)) + + def _check_dataset(self, dataset): + q = self.new_quantifier() + print(f'testing method {q} in {dataset.name}...', end='') + q.fit(dataset.training) + estim_prevalences = q.quantify(dataset.test.instances) + self.assertTrue(F.check_prevalence_vector(estim_prevalences)) + print(f'[done]') + + def _check_samples(self, gen, q, max_samples_test=5, vectorizer=None): + for X, p in gen(): + if vectorizer is not None: + X = vectorizer.transform(X) + estim_prevalences = q.quantify(X) + self.assertTrue(F.check_prevalence_vector(estim_prevalences)) + max_samples_test -= 1 + if max_samples_test == 0: + break + + def test_reviews(self): + for dataset_name in REVIEWS_SENTIMENT_DATASETS: + print(f'loading dataset {dataset_name}...', end='') + dataset = fetch_reviews(dataset_name, tfidf=True, min_df=10) + dataset.stats() + dataset.reduce() + print(f'[done]') + self._check_dataset(dataset) + + def test_twitter(self): + for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST: + print(f'loading dataset {dataset_name}...', end='') + dataset = fetch_twitter(dataset_name, min_df=10) + dataset.stats() + dataset.reduce() + print(f'[done]') + self._check_dataset(dataset) + + def test_UCIBinaryDataset(self): + for dataset_name in UCI_BINARY_DATASETS: + try: + print(f'loading dataset {dataset_name}...', end='') + dataset = fetch_UCIBinaryDataset(dataset_name) + dataset.stats() + dataset.reduce() + print(f'[done]') + self._check_dataset(dataset) + except FileNotFoundError as fnfe: + if dataset_name == 'pageblocks.5' and fnfe.args[0].find( + 'If this is the first time you attempt to load this dataset') > 0: + print('The pageblocks.5 dataset requires some hand processing to be usable; skipping this test.') + continue + + def test_UCIMultiDataset(self): + for dataset_name in UCI_MULTICLASS_DATASETS: + print(f'loading dataset {dataset_name}...', end='') + dataset = fetch_UCIMulticlassDataset(dataset_name) + dataset.stats() + n_classes = dataset.n_classes + uniform_prev = F.uniform_prevalence(n_classes) + dataset.training = dataset.training.sampling(100, *uniform_prev) + dataset.test = dataset.test.sampling(100, *uniform_prev) + print(f'[done]') + self._check_dataset(dataset) + + def test_lequa2022(self): + + for dataset_name in LEQUA2022_VECTOR_TASKS: + print(f'loading dataset {dataset_name}...', end='') + train, gen_val, gen_test = fetch_lequa2022(dataset_name) + train.stats() + n_classes = train.n_classes + train = train.sampling(100, *F.uniform_prevalence(n_classes)) + q = self.new_quantifier() + q.fit(train) + self._check_samples(gen_val, q, max_samples_test=5) + self._check_samples(gen_test, q, max_samples_test=5) + + for dataset_name in LEQUA2022_TEXT_TASKS: + print(f'loading dataset {dataset_name}...', end='') + train, gen_val, gen_test = fetch_lequa2022(dataset_name) + train.stats() + n_classes = train.n_classes + train = train.sampling(100, *F.uniform_prevalence(n_classes)) + tfidf = TfidfVectorizer() + train.instances = tfidf.fit_transform(train.instances) + q = self.new_quantifier() + q.fit(train) + self._check_samples(gen_val, q, max_samples_test=5, vectorizer=tfidf) + self._check_samples(gen_test, q, max_samples_test=5, vectorizer=tfidf) + + + def test_IFCB(self): + print(f'loading dataset IFCB.') + for mod_sel in [False, True]: + train, gen = fetch_IFCB(single_sample_train=True, for_model_selection=mod_sel) + train.stats() + n_classes = train.n_classes + train = train.sampling(100, *F.uniform_prevalence(n_classes)) + q = self.new_quantifier() + q.fit(train) + self._check_samples(gen, q, 
max_samples_test=5) + + +if __name__ == '__main__': + unittest.main() diff --git a/quapy/tests/test_hierarchy.py b/quapy/tests/test_hierarchy.py new file mode 100644 index 0000000..0cf9b9b --- /dev/null +++ b/quapy/tests/test_hierarchy.py @@ -0,0 +1,46 @@ +import unittest +from sklearn.linear_model import LogisticRegression + +from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS +from quapy.method.aggregative import * +import inspect + + +class HierarchyTestCase(unittest.TestCase): + + def test_aggregative(self): + lr = LogisticRegression() + for m in AGGREGATIVE_METHODS: + self.assertEqual(isinstance(m(lr), AggregativeQuantifier), True) + + def test_inspect_aggregative(self): + + import quapy.method.aggregative as methods + + members = inspect.getmembers(methods) + classes = set([cls for name, cls in members if inspect.isclass(cls)]) + quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)] + quantifiers = [cls for cls in quantifiers if issubclass(cls, AggregativeQuantifier)] + quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls) ] + + for cls in quantifiers: + self.assertIn(cls, AGGREGATIVE_METHODS) + + def test_binary(self): + lr = LogisticRegression() + for m in BINARY_METHODS: + self.assertEqual(isinstance(m(lr), BinaryQuantifier), True) + + def test_probabilistic(self): + lr = LogisticRegression() + for m in [CC(lr), ACC(lr)]: + self.assertEqual(isinstance(m, AggregativeCrispQuantifier), True) + self.assertEqual(isinstance(m, AggregativeSoftQuantifier), False) + for m in [PCC(lr), PACC(lr)]: + self.assertEqual(isinstance(m, AggregativeCrispQuantifier), False) + self.assertEqual(isinstance(m, AggregativeSoftQuantifier), True) + + +if __name__ == '__main__': + unittest.main() + diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py new file mode 100644 index 0000000..69d627b --- /dev/null +++ b/quapy/tests/test_methods.py @@ -0,0 +1,92 @@ +import itertools +import unittest + +from sklearn.linear_model import LogisticRegression + +import quapy as qp +from quapy.method.aggregative import ACC +from quapy.method.meta import Ensemble +from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS, NON_AGGREGATIVE_METHODS +from quapy.functional import check_prevalence_vector + +class TestMethods(unittest.TestCase): + + tiny_dataset_multiclass = qp.datasets.fetch_UCIMulticlassDataset('academic-success').reduce(n_test=10) + tiny_dataset_binary = qp.datasets.fetch_UCIBinaryDataset('ionosphere').reduce(n_test=10) + datasets = [tiny_dataset_binary, tiny_dataset_multiclass] + + def test_aggregative(self): + for dataset in TestMethods.datasets: + learner = LogisticRegression() + learner.fit(*dataset.training.Xy) + + for model in AGGREGATIVE_METHODS: + if not dataset.binary and model in BINARY_METHODS: + print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}') + continue + + q = model(learner) + print('testing', q) + q.fit(dataset.training, fit_classifier=False) + estim_prevalences = q.quantify(dataset.test.X) + self.assertTrue(check_prevalence_vector(estim_prevalences)) + + def test_non_aggregative(self): + for dataset in TestMethods.datasets: + + for model in NON_AGGREGATIVE_METHODS: + if not dataset.binary and model in BINARY_METHODS: + print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}') + continue + + q = model() + print(f'testing {q} on dataset {dataset.name}') + q.fit(dataset.training) + estim_prevalences = q.quantify(dataset.test.X) + 
self.assertTrue(check_prevalence_vector(estim_prevalences)) + + def test_ensembles(self): + + qp.environ['SAMPLE_SIZE'] = 10 + + base_quantifier = ACC(LogisticRegression()) + for dataset, policy in itertools.product(TestMethods.datasets, Ensemble.VALID_POLICIES): + if not dataset.binary and policy == 'ds': + print(f'skipping the test of binary policy ds on non-binary dataset {dataset}') + continue + + print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}') + ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1) + ensemble.fit(dataset.training) + estim_prevalences = ensemble.quantify(dataset.test.instances) + self.assertTrue(check_prevalence_vector(estim_prevalences)) + + def test_quanet(self): + try: + import quapy.classification.neural + except ModuleNotFoundError: + print('the torch package is not installed; skipping unit test for QuaNet') + return + + qp.environ['SAMPLE_SIZE'] = 10 + + # load the kindle dataset as text, and convert words to numerical indexes + dataset = qp.datasets.fetch_reviews('kindle', pickle=True).reduce() + qp.data.preprocessing.index(dataset, min_df=5, inplace=True) + + from quapy.classification.neural import CNNnet + cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) + + from quapy.classification.neural import NeuralClassifierTrainer + learner = NeuralClassifierTrainer(cnn, device='cpu') + + from quapy.method.meta import QuaNet + model = QuaNet(learner, device='cpu', n_epochs=2, tr_iter_per_poch=10, va_iter_per_poch=10, patience=2) + + model.fit(dataset.training) + estim_prevalences = model.quantify(dataset.test.instances) + self.assertTrue(check_prevalence_vector(estim_prevalences)) + + +if __name__ == '__main__': + unittest.main() From 69b8327fe925276d9225e1785588a74c6e4fa567 Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 11:44:23 +0200 Subject: [PATCH 05/15] Remove an erroneous import in the unit tests and add extra test dependencies. --- quapy/tests/test_modsel.py | 1 - setup.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index fe416c7..bf3e6f2 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -4,7 +4,6 @@ import numpy as np from sklearn.linear_model import LogisticRegression import quapy as qp -import util from quapy.method.aggregative import PACC from quapy.model_selection import GridSearchQ from quapy.protocol import APP diff --git a/setup.py b/setup.py index 1f6c6fb..9f7df5c 100644 --- a/setup.py +++ b/setup.py @@ -125,6 +125,7 @@ setup( # projects. 
extras_require={ # Optional 'bayes': ['jax', 'jaxlib', 'numpyro'], + 'tests': ['certifi'], }, # If there are data files included in your packages that need to be From 31a697559cc740a7cdd888bbb6dbb06d01738adb Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 11:47:55 +0200 Subject: [PATCH 06/15] Unittest on GitHub Actions --- .github/workflows/ci.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..c7b0809 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,31 @@ +name: CI + +on: + pull_request: + push: + branches: + - main + - devel + +jobs: + + # take out unit tests + test: + name: Unit tests (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.11" + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + python -m pip install -e .[tests] + - name: Test with unittest + run: python -m unittest From f3e543152cd7f131d3a538a3e4a87b30bb13019e Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 12:28:42 +0200 Subject: [PATCH 07/15] CI needs to install the bayes extra dependency --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c7b0809..09cd522 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,6 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel - python -m pip install -e .[tests] + python -m pip install -e .[bayes,tests] - name: Test with unittest run: python -m unittest From 72b43bd2f8fd3ae95cedcd4bacd0fd2d4cbe83bf Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 13:46:59 +0200 Subject: [PATCH 08/15] Omit large datasets (LeQua, IFCB) during CI to avoid overful memory of GitHub Actions runners --- .github/workflows/ci.yml | 2 ++ quapy/tests/test_datasets.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 09cd522..1ba6d09 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,6 +17,8 @@ jobs: matrix: python-version: - "3.11" + env: + QUAPY_TESTS_OMIT_LARGE_DATASETS: True steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py index daa9207..a8587b2 100644 --- a/quapy/tests/test_datasets.py +++ b/quapy/tests/test_datasets.py @@ -1,3 +1,4 @@ +import os import unittest from sklearn.feature_extraction.text import TfidfVectorizer @@ -77,6 +78,9 @@ class TestDatasets(unittest.TestCase): self._check_dataset(dataset) def test_lequa2022(self): + if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'): + print("omitting test_lequa2022 because QUAPY_TESTS_OMIT_LARGE_DATASETS is set") + return for dataset_name in LEQUA2022_VECTOR_TASKS: print(f'loading dataset {dataset_name}...', end='') @@ -104,6 +108,10 @@ class TestDatasets(unittest.TestCase): def test_IFCB(self): + if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'): + print("omitting test_IFCB because QUAPY_TESTS_OMIT_LARGE_DATASETS is set") + return + print(f'loading dataset IFCB.') for mod_sel in 
[False, True]: train, gen = fetch_IFCB(single_sample_train=True, for_model_selection=mod_sel) From a64620c377c291f8ed706dc059e58034c7c125e3 Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 14:46:37 +0200 Subject: [PATCH 09/15] Dataset.reduce() allows to fix the random_state to have reproducible unit tests. This is required to ensure that the expected hyper-parameters are always chosen, independent of randomness --- quapy/data/base.py | 14 +++++++++++--- quapy/tests/test_modsel.py | 6 +++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/quapy/data/base.py b/quapy/data/base.py index e52230e..ceb7402 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -549,7 +549,7 @@ class Dataset: yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})') - def reduce(self, n_train=100, n_test=100): + def reduce(self, n_train=100, n_test=100, random_state=None): """ Reduce the number of instances in place for quick experiments. Preserves the prevalence of each set. @@ -557,6 +557,14 @@ class Dataset: :param n_test: number of test documents to keep (default 100) :return: self """ - self.training = self.training.sampling(n_train, *self.training.prevalence()) - self.test = self.test.sampling(n_test, *self.test.prevalence()) + self.training = self.training.sampling( + n_train, + *self.training.prevalence(), + random_state = random_state + ) + self.test = self.test.sampling( + n_test, + *self.test.prevalence(), + random_state = random_state + ) return self \ No newline at end of file diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index bf3e6f2..64b0ff4 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -19,7 +19,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': [0.000001, 10.]} @@ -41,7 +41,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500, random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': np.logspace(-3,3,7)} @@ -79,7 +79,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(SlowLR()) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': np.logspace(-1,1,3)} From e6f380dc5f26b66a2fab09f94bb63f804ce763df Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 18 Apr 2024 09:38:33 +0200 Subject: [PATCH 10/15] update changelog --- CHANGE_LOG.txt | 7 +++++++ quapy/data/datasets.py | 7 +++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt index e218b10..0dd3d0c 100644 --- a/CHANGE_LOG.txt +++ b/CHANGE_LOG.txt @@ -1,10 +1,17 @@ Change Log 0.1.9 ---------------- +- Added Continuous Integration with GitHub Actions (thanks to Mirko Bunse!) - Added Bayesian CC method (thanks to Pawel Czyz!). 
The method is described in detail in the paper Ziegler, Albert, and Paweł Czyż. "Bayesian Quantification with Black-Box Estimators." arXiv preprint arXiv:2302.09159 (2023). +- Removed binary UCI datasets {acute.a, acute.b, balance.2} from the list qp.data.datasets.UCI_BINARY_DATASETS + (the datasets are still loadable from the fetch_UCIBinaryLabelledCollection and fetch_UCIBinaryDataset + functions, though). The reason is that these datasets tend to yield results (for all methods) that are + one or two orders of magnitude greater than for other datasets, and this has a disproportionate impact in + methods average (I suspect there is something wrong in those datasets). + Change Log 0.1.8 ---------------- diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 0f732e8..8e1c406 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -20,8 +20,11 @@ TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders', TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders', 'semeval', 'semeval16', 'sst', 'wa', 'wb'] -UCI_BINARY_DATASETS = ['acute.a', 'acute.b', - 'balance.1', 'balance.2', 'balance.3', +UCI_BINARY_DATASETS = [ + #'acute.a', 'acute.b', + 'balance.1', + #'balance.2', + 'balance.3', 'breast-cancer', 'cmc.1', 'cmc.2', 'cmc.3', 'ctg.1', 'ctg.2', 'ctg.3', From 2000c33372a1f57c20d68cedf719c06a153694e3 Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Thu, 18 Apr 2024 10:08:49 +0200 Subject: [PATCH 11/15] Composable methods integrated from qunfold, which is an extra dependency for quapy.method.composable --- docs/source/quapy.method.rst | 8 ++++ quapy/method/composable.py | 90 ++++++++++++++++++++++++++++++++++++ quapy/tests/test_methods.py | 31 +++++++++++++ setup.py | 2 + 4 files changed, 131 insertions(+) create mode 100644 quapy/method/composable.py diff --git a/docs/source/quapy.method.rst b/docs/source/quapy.method.rst index 8026e0a..31a357a 100644 --- a/docs/source/quapy.method.rst +++ b/docs/source/quapy.method.rst @@ -52,6 +52,14 @@ quapy.method.non\_aggregative module :undoc-members: :show-inheritance: +quapy.method.composable module +------------------------ + +.. automodule:: quapy.method.composable + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/quapy/method/composable.py b/quapy/method/composable.py new file mode 100644 index 0000000..f7f3a61 --- /dev/null +++ b/quapy/method/composable.py @@ -0,0 +1,90 @@ +"""This module allows the composition of quantification methods from loss functions and feature transformations. 
From 2000c33372a1f57c20d68cedf719c06a153694e3 Mon Sep 17 00:00:00 2001
From: Mirko Bunse
Date: Thu, 18 Apr 2024 10:08:49 +0200
Subject: [PATCH 11/15] Composable methods integrated from qunfold, which is
 an extra dependency for quapy.method.composable

---
 docs/source/quapy.method.rst |  8 ++++
 quapy/method/composable.py   | 90 ++++++++++++++++++++++++++++++++++++
 quapy/tests/test_methods.py  | 31 +++++++++++++
 setup.py                     |  2 +
 4 files changed, 131 insertions(+)
 create mode 100644 quapy/method/composable.py

diff --git a/docs/source/quapy.method.rst b/docs/source/quapy.method.rst
index 8026e0a..31a357a 100644
--- a/docs/source/quapy.method.rst
+++ b/docs/source/quapy.method.rst
@@ -52,6 +52,14 @@ quapy.method.non\_aggregative module
    :undoc-members:
    :show-inheritance:
 
+quapy.method.composable module
+------------------------------
+
+.. automodule:: quapy.method.composable
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 Module contents
 ---------------
diff --git a/quapy/method/composable.py b/quapy/method/composable.py
new file mode 100644
index 0000000..f7f3a61
--- /dev/null
+++ b/quapy/method/composable.py
@@ -0,0 +1,90 @@
+"""This module allows the composition of quantification methods from loss functions and feature transformations.
+This functionality is realized through an integration of the qunfold package: https://github.com/mirkobunse/qunfold."""
+
+import qunfold
+from qunfold.quapy import QuaPyWrapper
+from qunfold.sklearn import CVClassifier
+from qunfold import (
+    LeastSquaresLoss, # losses
+    BlobelLoss,
+    EnergyLoss,
+    HellingerSurrogateLoss,
+    CombinedLoss,
+    TikhonovRegularization,
+    TikhonovRegularized,
+    ClassTransformer, # transformers
+    HistogramTransformer,
+    DistanceTransformer,
+    KernelTransformer,
+    EnergyKernelTransformer,
+    LaplacianKernelTransformer,
+    GaussianKernelTransformer,
+    GaussianRFFKernelTransformer,
+)
+
+__all__ = [ # control public members, e.g., for auto-documentation in sphinx; omit QuaPyWrapper
+    "ComposableQuantifier",
+    "CVClassifier",
+    "LeastSquaresLoss",
+    "BlobelLoss",
+    "EnergyLoss",
+    "HellingerSurrogateLoss",
+    "CombinedLoss",
+    "TikhonovRegularization",
+    "TikhonovRegularized",
+    "ClassTransformer",
+    "HistogramTransformer",
+    "DistanceTransformer",
+    "KernelTransformer",
+    "EnergyKernelTransformer",
+    "LaplacianKernelTransformer",
+    "GaussianKernelTransformer",
+    "GaussianRFFKernelTransformer",
+]
+
+def ComposableQuantifier(loss, transformer, **kwargs):
+    """A generic quantification / unfolding method that solves a linear system of equations.
+
+    This class represents any quantifier that can be described in terms of a loss function, a feature transformation, and a regularization term. In this implementation, the loss is minimized through unconstrained second-order minimization. Valid probability estimates are ensured through a soft-max trick by Bunse (2022).
+
+    Args:
+        loss: An instance of a loss class from `quapy.method.composable`.
+        transformer: An instance of a transformer class from `quapy.method.composable`.
+        solver (optional): The `method` argument in `scipy.optimize.minimize`. Defaults to `"trust-ncg"`.
+        solver_options (optional): The `options` argument in `scipy.optimize.minimize`. Defaults to `{"gtol": 1e-8, "maxiter": 1000}`.
+        seed (optional): A random number generator seed from which a numpy RandomState is created. Defaults to `None`.
+
+    Examples:
+        Here, we create the ordinal variant of ACC (Bunse et al., 2023). This variant consists of the original feature transformation of ACC and of the original loss of ACC, the latter of which is regularized towards smooth solutions.
+
+            >>> from quapy.method.composable import (
+            >>>     ComposableQuantifier,
+            >>>     TikhonovRegularized,
+            >>>     LeastSquaresLoss,
+            >>>     ClassTransformer,
+            >>> )
+            >>> from sklearn.ensemble import RandomForestClassifier
+            >>> o_acc = ComposableQuantifier(
+            >>>     TikhonovRegularized(LeastSquaresLoss(), 0.01),
+            >>>     ClassTransformer(RandomForestClassifier(oob_score=True))
+            >>> )
+
+        Here, we perform hyper-parameter optimization with the ordinal ACC.
+
+            >>> quapy.model_selection.GridSearchQ(
+            >>>     model = o_acc,
+            >>>     param_grid = { # try both splitting criteria
+            >>>         "transformer__classifier__estimator__criterion": ["gini", "entropy"],
+            >>>     },
+            >>>     # ...
+            >>> )
+
+        To use a classifier that does not provide the `oob_score` argument, such as logistic regression, you have to configure a cross validation of this classifier. Here, we employ 10 cross validation folds; 5 folds are the default.
+
+            >>> from quapy.method.composable import CVClassifier
+            >>> from sklearn.linear_model import LogisticRegression
+            >>> acc_lr = ComposableQuantifier(
+            >>>     LeastSquaresLoss(),
+            >>>     ClassTransformer(CVClassifier(LogisticRegression(), 10))
+            >>> )
+    """
+    return QuaPyWrapper(qunfold.GenericMethod(loss, transformer, **kwargs))
diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py
index 69d627b..cf5bf39 100644
--- a/quapy/tests/test_methods.py
+++ b/quapy/tests/test_methods.py
@@ -9,6 +9,29 @@ from quapy.method.meta import Ensemble
 from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS, NON_AGGREGATIVE_METHODS
 from quapy.functional import check_prevalence_vector
 
+# a random selection of composed methods to test the qunfold integration
+from quapy.method.composable import (
+    ComposableQuantifier,
+    LeastSquaresLoss,
+    HellingerSurrogateLoss,
+    ClassTransformer,
+    HistogramTransformer,
+    CVClassifier,
+)
+COMPOSABLE_METHODS = [
+    ComposableQuantifier( # ACC
+        LeastSquaresLoss(),
+        ClassTransformer(CVClassifier(LogisticRegression()))
+    ),
+    ComposableQuantifier( # HDy
+        HellingerSurrogateLoss(),
+        HistogramTransformer(
+            3, # 3 bins per class
+            preprocessor = ClassTransformer(CVClassifier(LogisticRegression()))
+        )
+    ),
+]
+
 class TestMethods(unittest.TestCase):
 
     tiny_dataset_multiclass = qp.datasets.fetch_UCIMulticlassDataset('academic-success').reduce(n_test=10)
@@ -87,6 +110,14 @@ class TestMethods(unittest.TestCase):
             estim_prevalences = model.quantify(dataset.test.instances)
             self.assertTrue(check_prevalence_vector(estim_prevalences))
 
+    def test_composable(self):
+        for dataset in TestMethods.datasets:
+            for q in COMPOSABLE_METHODS:
+                print('testing', q)
+                q.fit(dataset.training)
+                estim_prevalences = q.quantify(dataset.test.X)
+                self.assertTrue(check_prevalence_vector(estim_prevalences))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/setup.py b/setup.py
index 9f7df5c..6e0f394 100644
--- a/setup.py
+++ b/setup.py
@@ -125,7 +125,9 @@ setup(
     # projects.
     extras_require={ # Optional
         'bayes': ['jax', 'jaxlib', 'numpyro'],
+        'composable': ['qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.3'],
         'tests': ['certifi'],
+        'docs' : ['sphinx-rtd-theme'],
     },
 
     # If there are data files included in your packages that need to be
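The docstring's model-selection example above is fragmentary; stitched together, a runnable version could look as follows (a sketch assuming the composable extra, i.e., qunfold, is installed, and assuming QuaPy's protocol-based GridSearchQ API; the dataset, protocol, and error measure are illustrative choices):

    import quapy as qp
    from quapy.method.composable import (
        ComposableQuantifier,
        TikhonovRegularized,
        LeastSquaresLoss,
        ClassTransformer,
    )
    from quapy.protocol import APP
    from sklearn.ensemble import RandomForestClassifier

    qp.environ['SAMPLE_SIZE'] = 100
    data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=0)
    training, validation = data.training.split_stratified(0.7, random_state=0)

    # ordinal ACC, as in the docstring example above
    o_acc = ComposableQuantifier(
        TikhonovRegularized(LeastSquaresLoss(), 0.01),
        ClassTransformer(RandomForestClassifier(oob_score=True)),
    )

    model = qp.model_selection.GridSearchQ(
        model=o_acc,
        param_grid={  # try both splitting criteria
            'transformer__classifier__estimator__criterion': ['gini', 'entropy'],
        },
        protocol=APP(validation),  # artificial-prevalence protocol over the validation set
        error='mae',
    ).fit(training)
    print(model.best_params_, model.best_score_)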
From e111860128cfd86464ce51625622b3c4688c9001 Mon Sep 17 00:00:00 2001
From: Mirko Bunse
Date: Thu, 18 Apr 2024 10:21:21 +0200
Subject: [PATCH 12/15] Fix the CI by installing the composable dependencies

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1ba6d09..85d0dd1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,6 +28,6 @@
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip setuptools wheel
-        python -m pip install -e .[bayes,tests]
+        python -m pip install -e .[bayes,composable,tests]
     - name: Test with unittest
       run: python -m unittest

From bf33c134fc80bd79d9d4d7b7bcba41c939b70d15 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 19 Apr 2024 14:23:35 +0200
Subject: [PATCH 13/15] Update _kdey.py

fix in KDEy: makes the method robust to cases in which the number of positives
for some class is smaller than the number k of folds. In such cases, the KDE
for that class is created from the uniform prevalence vector
---
 quapy/method/_kdey.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py
index a6ecbea..e678563 100644
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@@ -62,8 +62,13 @@ class KDEBase:
         :param bandwidth: float, the bandwidth of the kernel
         :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
         """
-        return [self.get_kde_function(X[y == cat], bandwidth) for cat in classes]
-
+        class_cond_X = []
+        for cat in classes:
+            selX = X[y==cat]
+            if selX.size==0:
+                selX = [F.uniform_prevalence(len(classes))]
+            class_cond_X.append(selX)
+        return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
 
 
 class KDEyML(AggregativeSoftQuantifier, KDEBase):
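The failure mode this patch guards against can be sketched as follows; a standalone illustration of the same fallback logic, not the library code (the helper name and the toy data are invented for the example, and the uniform point mimics F.uniform_prevalence):

    import numpy as np
    from sklearn.neighbors import KernelDensity

    def fit_class_kdes(posteriors, y, classes, bandwidth=0.1):
        # posteriors: (n, n_classes) posterior probabilities; one KDE per class
        kdes = []
        for cat in classes:
            sel = posteriors[y == cat]
            if sel.size == 0:
                # degenerate class (e.g., too few positives for the k folds):
                # fall back to a single point, the uniform prevalence vector
                sel = np.full((1, len(classes)), 1 / len(classes))
            kdes.append(KernelDensity(bandwidth=bandwidth).fit(sel))
        return kdes

    # class 2 has no examples at all; without the fallback, fitting its KDE raises an error
    posteriors = np.asarray([[.9, .05, .05], [.2, .7, .1], [.6, .3, .1]])
    y = np.asarray([0, 1, 0])
    kdes = fit_class_kdes(posteriors, y, classes=[0, 1, 2])
    print(len(kdes))  # -> 3, one (possibly degenerate) KDE per class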
From e92264c280cfe7da225b1b2e7e1b10c658293450 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 24 Apr 2024 17:03:57 +0200
Subject: [PATCH 14/15] adding wiki documents to the sphinx documentation in
 order to allow for collaboration

---
 docs/Makefile                                 |  15 +-
 docs/build/html/_sources/index.rst.txt        |  17 +
 docs/build/html/_sources/quapy.method.rst.txt |   8 +
 docs/build/html/genindex.html                 | 114 +++-
 docs/build/html/index.html                    |  85 ++-
 docs/build/html/modules.html                  |   2 +
 docs/build/html/objects.inv                   | Bin 3532 -> 4066 bytes
 docs/build/html/py-modindex.html              |  14 +
 docs/build/html/quapy.data.html               |  14 +-
 docs/build/html/quapy.html                    |  82 ++-
 docs/build/html/quapy.method.html             | 609 +++++++++++++++++-
 docs/build/html/search.html                   |   9 +
 docs/build/html/searchindex.js                |   2 +-
 docs/source/conf.py                           |   2 +
 docs/source/index.rst                         |  17 +
 docs/source/wiki_editable/Datasets.md         | 440 +++++++++++++
 docs/source/wiki_editable/Evaluation.md       | 159 +++++
 .../wiki_editable/ExplicitLossMinimization.md |  26 +
 docs/source/wiki_editable/Methods.md          | 526 +++++++++++++++
 docs/source/wiki_editable/Model-Selection.md  | 145 +++++
 docs/source/wiki_editable/Plotting.md         | 250 +++++++
 docs/source/wiki_editable/Protocols.md        | 177 +++++
 .../wiki_examples/selected_plots/bin_bias.png | Bin 0 -> 63858 bytes
 .../selected_plots/bin_bias_bin_cc.png        | Bin 0 -> 110404 bytes
 .../selected_plots/bin_bias_cc.png            | Bin 0 -> 72541 bytes
 .../wiki_examples/selected_plots/bin_diag.png | Bin 0 -> 189871 bytes
 .../selected_plots/bin_diag_cc.png            | Bin 0 -> 344915 bytes
 .../selected_plots/err_drift.png              | Bin 0 -> 248874 bytes
 28 files changed, 2685 insertions(+), 28 deletions(-)
 create mode 100644 docs/source/wiki_editable/Datasets.md
 create mode 100644 docs/source/wiki_editable/Evaluation.md
 create mode 100644 docs/source/wiki_editable/ExplicitLossMinimization.md
 create mode 100644 docs/source/wiki_editable/Methods.md
 create mode 100644 docs/source/wiki_editable/Model-Selection.md
 create mode 100644 docs/source/wiki_editable/Plotting.md
 create mode 100644 docs/source/wiki_editable/Protocols.md
 create mode 100644 docs/source/wiki_editable/wiki_examples/selected_plots/bin_bias.png
 create mode 100644 docs/source/wiki_editable/wiki_examples/selected_plots/bin_bias_bin_cc.png
 create mode 100644 docs/source/wiki_editable/wiki_examples/selected_plots/bin_bias_cc.png
 create mode 100644 docs/source/wiki_editable/wiki_examples/selected_plots/bin_diag.png
 create mode 100644 docs/source/wiki_editable/wiki_examples/selected_plots/bin_diag_cc.png
 create mode 100644 docs/source/wiki_editable/wiki_examples/selected_plots/err_drift.png

diff --git a/docs/Makefile b/docs/Makefile
index d0c3cbf..bb42adc 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -14,7 +14,20 @@ help:
 
 .PHONY: help Makefile
 
+# Convert Markdown files to reStructuredText before building HTML
+markdown_to_rst:
+	@echo "Converting Markdown files to reStructuredText"
+	@mkdir -p $(SOURCEDIR)/wiki/wiki_examples/selected_plots
+	@cp $(SOURCEDIR)/wiki_editable/wiki_examples/selected_plots/* $(SOURCEDIR)/wiki/wiki_examples/selected_plots/
+	@find $(SOURCEDIR)/wiki_editable -name '*.md' -exec sh -c 'pandoc -f markdown -t rst "$$1" -o "$(SOURCEDIR)/wiki/$$(basename "$$1" .md).rst"' _ {} \;
+	@echo "Conversion complete."
+
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
+html: markdown_to_rst
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# # Catch-all target: route all unknown targets to Sphinx using the new
+# # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+# %: Makefile
+# 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/build/html/_sources/index.rst.txt b/docs/build/html/_sources/index.rst.txt
index cc5b4dc..a4150cd 100644
--- a/docs/build/html/_sources/index.rst.txt
+++ b/docs/build/html/_sources/index.rst.txt
@@ -21,6 +21,23 @@ GitHub
 
 QuaPy is hosted in GitHub at `https://github.com/HLT-ISTI/QuaPy <https://github.com/HLT-ISTI/QuaPy>`_
 
+Wiki Documents
+--------------
+
+In this section you can find useful information concerning different aspects of QuaPy, with examples:
+
+.. toctree::
+   :maxdepth: 1
+
+   wiki/Datasets
+   wiki/Evaluation
+   wiki/ExplicitLossMinimization
+   wiki/Methods
+   wiki/Model-Selection
+   wiki/Plotting
+   wiki/Protocols
+
+
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
diff --git a/docs/build/html/_sources/quapy.method.rst.txt b/docs/build/html/_sources/quapy.method.rst.txt
index 8026e0a..31a357a 100644
--- a/docs/build/html/_sources/quapy.method.rst.txt
+++ b/docs/build/html/_sources/quapy.method.rst.txt
@@ -52,6 +52,14 @@ quapy.method.non\_aggregative module
    :undoc-members:
    :show-inheritance:
 
+quapy.method.composable module
+------------------------------
+
+.. automodule:: quapy.method.composable
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 Module contents
 ---------------
diff --git a/docs/build/html/genindex.html b/docs/build/html/genindex.html
index 0099e44..7dcd630 100644
--- a/docs/build/html/genindex.html
+++ b/docs/build/html/genindex.html
@@ -43,6 +43,15 @@