diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..dd40642
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,108 @@
+name: CI
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+      - devel
+    tags:
+      - "[0-9]+.[0-9]+.[0-9]+"
+
+jobs:
+
+  # run the unit tests
+  test:
+    name: Unit tests (Python ${{ matrix.python-version }})
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version:
+          - "3.11"
+    env:
+      QUAPY_TESTS_OMIT_LARGE_DATASETS: True
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+          python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@main"
+          python -m pip install -e .[bayes,tests]
+      - name: Test with unittest
+        run: python -m unittest
+
+  # build and push documentation to gh-pages (only if pushed to the master branch)
+  docs:
+    name: Documentation
+    runs-on: ubuntu-latest
+    if: github.ref == 'refs/heads/master'
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel "jax[cpu]"
+          python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@main"
+          python -m pip install -e .[neural,docs]
+      - name: Build documentation
+        run: sphinx-build -M html docs/source docs/build
+      - name: Publish documentation
+        run: |
+          git clone ${{ github.server_url }}/${{ github.repository }}.git --branch gh-pages --single-branch __gh-pages/
+          cp -r docs/build/html/* __gh-pages/
+          cd __gh-pages/
+          git config --local user.email "action@github.com"
+          git config --local user.name "GitHub Action"
+          git add .
+          git commit -am "Documentation based on ${{ github.sha }}" || true
+      - name: Push changes
+        uses: ad-m/github-push-action@master
+        with:
+          branch: gh-pages
+          directory: __gh-pages/
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+  release:
+    name: Build & Publish Release
+    runs-on: ubuntu-latest
+    if: startsWith(github.ref, 'refs/tags/')
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install build dependencies
+        run: |
+          python -m pip install --upgrade pip build twine
+      - name: Build package
+        run: python -m build
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          # use the following two lines instead when testing a release against TestPyPI:
+          # password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+          # repository_url: https://test.pypi.org/legacy/
+          password: ${{ secrets.PYPI_API_TOKEN }}
+          repository_url: https://upload.pypi.org/legacy/
+      - name: Create GitHub Release
+        id: create_release
+        uses: actions/create-release@v1
+        with:
+          tag_name: ${{ github.ref_name }}
+          release_name: Release ${{ github.ref_name }}
+          body: |
+            Changes in this release:
+            - see commit history for details
+          draft: false
+          prerelease: false
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
deleted file mode 100644
index 383e65c..0000000
--- a/.github/workflows/pylint.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Pylint
-
-on: [push]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.8", "3.9", "3.10"]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install pylint
-      - name: Analysing the code with pylint
-        run: |
-          pylint $(git ls-files '*.py')
diff --git a/.gitignore b/.gitignore
index b9703a3..9253112 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,8 +69,12 @@ instance/
 # Scrapy stuff:
 .scrapy
 
+# vscode config:
+.vscode/
+
 # Sphinx documentation
-docs/_build/
+docs/_build/doctest
+docs/_build/doctrees
 
 # PyBuilder
 target/
@@ -85,6 +89,11 @@ ipython_config.py
 
 # pyenv
 .python-version
 
+# poetry
+poetry.toml
+pyproject.toml
+poetry.lock
+
 # pipenv
 # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 # However, in case of collaboration, if having platform-specific dependencies or dependencies
@@ -130,3 +139,33 @@ dmypy.json
 .pyre/
 
 *__pycache__*
+*.pdf
+*.zip
+*.png
+*.csv
+*.pkl
+*.dataframe
+
+# other projects
+LeQua2022
+MultiLabel
+NewMethods
+Ordinal
+Retrieval
+eDiscovery
+poster-cikm
+slides-cikm
+slides-short-cikm
+quick_experiment
+svm_perf_quantification/svm_struct
+svm_perf_quantification/svm_light
+TweetSentQuant
+
+.idea
diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt
new file mode 100644
index 0000000..b6066b9
--- /dev/null
+++ b/CHANGE_LOG.txt
@@ -0,0 +1,217 @@
+Change Log 0.2.1
+-----------------
+
+- Improved documentation of confidence regions.
+- Added the ReadMe method by Daniel Hopkins and Gary King.
+- The internal index in LabelledCollection is now "lazy", and is only constructed if required.
+
+Change Log 0.2.0
+-----------------
+
+- Base code refactor:
+  - Removed the coupling between LabelledCollection and the quantification methods; the fit interface changes:
+      def fit(data: LabelledCollection): -> def fit(X, y):
+  - Added the function "predict" (the function "quantify" is still present as an alias, for the nostalgic).
+  - The behavior of aggregative methods in terms of fit_classifier, and how the val_split is to be treated, is
+    now indicated exclusively at construction time; it is no longer possible to indicate it at fit time.
+    This is because, in v<=0.1.9, one could create a method (e.g., ACC) and then indicate:
+      my_acc.fit(tr_data, fit_classifier=False, val_split=val_data)
+    in which case the first argument is unused, and this was ambiguous with
+      my_acc.fit(the_data, fit_classifier=False)
+    in which case the_data is to be used for validation purposes. However, val_split could also be set as a
+    fraction, indicating that only part of the_data was to be used for validation while the rest went
+    unused... it was certainly confusing. A sketch of the new interface follows.
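+    For illustration (a minimal sketch mirroring the README quickstart; "yeast" is just an example dataset):
+      import quapy as qp
+      from quapy.method.aggregative import ACC
+      training, test = qp.datasets.fetch_UCIBinaryDataset("yeast").train_test
+      Xtr, ytr = training.Xy
+      model = ACC()
+      model.fit(Xtr, ytr)                  # new: fit(X, y) instead of fit(data: LabelledCollection)
+      prevalence = model.predict(test.X)   # new: predict(X); quantify(X) remains as an alias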
+  - This change imposes a versioning constraint with qunfold, which now must be >= 0.1.6.
+- EMQ has been modified so that the representation function "classify" now only provides posterior
+  probabilities and, if required, these are recalibrated (e.g., by "bcts") within the aggregation function.
+  - A new parameter "on_calib_error" is passed to the constructor, which indicates the policy to follow
+    in case the calibration functions of the abstention package fail (which happens sometimes). Options include:
+    - 'raise': raises a RuntimeException (default)
+    - 'backup': silently re-runs without calibration
+  - The parameter "recalib" has been renamed "calib"; see the sketch below.
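+    For example (a minimal sketch, reusing Xtr, ytr from the sketch above; the parameter names are those
+    introduced in this entry):
+      from quapy.method.aggregative import EMQ
+      emq = EMQ(calib='bcts', on_calib_error='backup')  # recalibrate with bcts; skip calibration if it fails
+      emq.fit(Xtr, ytr)
+      prevalence = emq.predict(test.X)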
+- Added aggregative bootstrap for deriving confidence regions (confidence intervals, ellipses in the simplex,
+  or ellipses in the CLR space). This method is efficient, as it leverages the two phases of aggregative
+  quantifiers: it applies resampling only to the aggregation phase, thus avoiding training many quantifiers
+  or classifying the instances of a sample multiple times. See:
+  - quapy/method/confidence.py (new)
+  - the new example no. 16.confidence_regions.py
+- BayesianCC moved to confidence.py, where the methods having to do with confidence intervals belong.
+- Improved documentation of the qp.plot module.
+
+
+Change Log 0.1.9
+----------------
+
+- Added the LeQua 2024 datasets, and the normalized match distance to qp.error
+
+- Improved data loaders for the UCI binary and UCI multiclass datasets (thanks to Lorenzo Volpi!); these
+  datasets can be loaded with standardised covariates (default)
+
+- Added a default classifier for aggregative quantifiers, which can now be instantiated without specifying
+  the classifier. The default classifier can be accessed in qp.environ['DEFAULT_CLS'] and is assigned to
+  sklearn.linear_model.LogisticRegression(max_iter=3000). If the classifier is not specified, then a clone
+  of said classifier is used. E.g.:
+  > pacc = PACC()
+  is equivalent to:
+  > pacc = PACC(classifier=LogisticRegression(max_iter=3000))
+
+- Improved error logging in model selection. In v0.1.8 only Status.INVALID was reported; in v0.1.9 it is
+  now accompanied by a textual description of the error
+
+- The number of parallel workers can now be set via an environment variable by running, e.g.:
+  > N_JOBS=10 python3 your_script.py
+  which has the same effect as writing the following code at the beginning of your_script.py:
+  > import quapy as qp
+  > qp.environ["N_JOBS"] = 10
+
+- Some examples have been added to the ./examples/ dir, which now contains numbered examples from basics (0)
+  to advanced topics (higher numbers)
+
+- Moved the wiki documents to the ./docs/ folder, so that they become editable via PR for the community
+
+- Added composable methods from Mirko Bunse's qunfold library (thanks to Mirko Bunse!)
+
+- Added Continuous Integration with GitHub Actions (thanks to Mirko Bunse!)
+
+- Added the Bayesian CC method (thanks to Pawel Czyz!). The method is described in detail in the paper
+  Ziegler, Albert, and Paweł Czyż. "Bayesian Quantification with Black-Box Estimators."
+  arXiv preprint arXiv:2302.09159 (2023).
+
+- Removed the binary UCI datasets {acute.a, acute.b, balance.2} from the list
+  qp.data.datasets.UCI_BINARY_DATASETS (the datasets are still loadable from the
+  fetch_UCIBinaryLabelledCollection and fetch_UCIBinaryDataset functions, though). The reason is that these
+  datasets tend to yield results (for all methods) that are one or two orders of magnitude greater than for
+  other datasets, and this had a disproportionate impact on the methods' averages (I suspect there is
+  something wrong with those datasets).
+
+
+Change Log 0.1.8
+----------------
+
+- Added Kernel Density Estimation methods (KDEyML, KDEyCS, KDEyHD) as proposed in the paper:
+  Moreo, A., González, P., & del Coz, J. J. Kernel Density Estimation for Multiclass Quantification.
+  arXiv preprint arXiv:2401.00490, 2024
+
+- Substantial internal refactor: aggregative methods now inherit a pattern by which the fit method consists of:
+  a) fitting the classifier and returning the representations of the training instances (typically the
+     posterior probabilities, the label predictions, or the classifier scores, typically obtained through
+     kFCV);
+  b) fitting an aggregation function.
+  The function implemented in step a) is inherited from the super class; each new aggregative method now has
+  to implement only the "aggregative_fit" of step b). This pattern was already implemented for the prediction
+  (thus allowing evaluation functions to be performed very quickly), and is now available also for training.
+  The main benefit is that model selection can now nest the training of quantifiers at two levels: one for
+  the classifier, and another for the aggregation function. As a result, a method with a param grid of 10
+  combinations for the classifier and 10 combinations for the quantifier now implies 10 trainings of the
+  classifier + 10*10 trainings of the aggregation function (which is typically much faster than the
+  classifier training), whereas in versions <0.1.8 this amounted to training 10*10 (classifiers+aggregations).
+
+- Added different solvers for the ACC and PACC quantifiers. In quapy < 0.1.8 these quantifiers try to solve
+  the system of equations Ax=B exactly (by means of np.linalg.solve). As noted by Mirko Bunse (thanks!), such
+  an exact solution sometimes does not exist. In cases like this, quapy < 0.1.8 resorted to CC for providing
+  a plausible solution. ACC and PACC now resort to an approximated solution in such cases (minimizing the
+  L2-norm of the difference Ax-B), as proposed by Mirko Bunse. A quick experiment reveals that this heuristic
+  greatly improves the results of ACC and PACC on T2A@LeQua.
+
+- Fixed the ThresholdOptimization methods (X, T50, MAX, MS and MS2). Thanks to Tobias Schumacher and
+  colleagues for pointing this out in Appendix A of "Schumacher, T., Strohmaier, M., & Lemmerich, F. (2021).
+  A comparative evaluation of quantification methods. arXiv:2103.03223v3 [cs.LG]"
+
+- Added HDx and DistributionMatchingX to the non-aggregative quantifiers (see also the new example
+  "comparing_HDy_HDx.py")
+
+- New UCI multiclass datasets added (thanks to Pablo González). The 5 UCI multiclass datasets are those
+  meeting the following criteria:
+  - >1000 instances
+  - >2 classes
+  - classification datasets
+  - Python API available
+
+- New IFCB (plankton) dataset added (thanks to Pablo González). See qp.datasets.fetch_IFCB.
+
+- Added the new evaluation measures NAE and NRAE (thanks to Andrea Esuli)
+
+- Added the new meta method "MedianEstimator": an ensemble of binary base quantifiers that receives as input
+  a dictionary of hyperparameters, which it explores exhaustively, fitting and generating predictions for
+  each combination of hyperparameters, and that returns, as the prevalence estimate, the median across all
+  predictions. A usage sketch is given below.
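+  For example (a minimal sketch, assuming MedianEstimator is importable from qp.method.meta and accepts a
+  binary base quantifier plus a hyperparameter grid; training_data stands for any binary LabelledCollection):
+  > from quapy.method.meta import MedianEstimator
+  > from quapy.method.aggregative import PACC
+  > from sklearn.linear_model import LogisticRegression
+  > median_q = MedianEstimator(PACC(LogisticRegression()), param_grid={'classifier__C': [0.1, 1, 10]})
+  > median_q.fit(training_data)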
+
+- Added the "custom_protocol.py" example.
+
+- New API documentation template.
+
+Change Log 0.1.7
+----------------
+
+- Protocols are now abstracted as instances of AbstractProtocol. There is a new class extending
+  AbstractProtocol, called AbstractStochasticSeededProtocol, which implements a seeding policy that allows
+  replicating the series of samplings. There are some examples of protocols: APP, NPP, UPP, DomainMixer
+  (experimental). The idea is to start the sample generation by simply calling the __call__ method.
+  This change has a great impact on the framework, since many functions in qp.evaluation, qp.model_selection,
+  and sampling functions in LabelledCollection relied on the old functions. E.g., the functionality of
+  qp.evaluation.artificial_prevalence_report or qp.evaluation.natural_prevalence_report is now obtained by
+  means of qp.evaluation.report, which takes a protocol as an argument. I have not maintained compatibility
+  with the old interfaces because I did not really like them. Check the wiki guide and the examples for more
+  details.
+
+- The exploration of hyperparameters in model selection can now be run in parallel (there was an n_jobs
+  argument in QuaPy 0.1.6, but only the evaluation part for one specific hyperparameter was run in parallel).
+
+- The prediction function has been refactored, so that it applies the optimization for aggregative
+  quantifiers (which consists of pre-classifying all instances, and then only invoking aggregate on the
+  samples) only in cases in which the total number of classifications would be smaller than the number of
+  classifications with the standard procedure. The user can now specify "force", "auto", True, or False, in
+  order to actively decide whether to apply it or not.
+
+- examples directory created!
+
+- DyS, Topsoe distance, and binary search added (thanks to Pablo González)
+
+- Multi-thread reproducibility via seeding (thanks to Pablo González)
+
+- n_jobs is now taken from the environment if set to None
+
+- ACC, PACC, and Forman's threshold variants have been parallelized.
+
+- cross_val_predict (for quantification) added to model_selection; it would be nice to allow the user to
+  specify a test protocol, or None to bypass it?
+
+- Bugfix: adding two labelled collections (with +) now checks for consistency in the classes
+
+- Newer versions of numpy raise a warning when accessing types (e.g., np.float). I have replaced all such
+  instances with the plain python type (e.g., float).
+
+- New dependency "abstention" (to add to the project requirements and setup). Calibration methods from
+  https://github.com/kundajelab/abstention added.
+
+- The internal classifier of aggregative methods is now called "classifier" instead of "learner".
+
+- When optimizing the hyperparameters of an aggregative quantifier, the classifier's specific hyperparameters
+  should be marked with a "classifier__" prefix (just like in scikit-learn with estimators), while the
+  quantifier's specific hyperparameters are named directly. For example, the PCC(LogisticRegression())
+  quantifier has hyperparameters "classifier__C", "classifier__class_weight", etc., instead of "C" and
+  "class_weight" as in v0.1.6. A model selection sketch is given below.
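+  For instance (a minimal sketch, assuming qp.model_selection.GridSearchQ with the protocol-based interface
+  described above; val_data stands for any held-out validation LabelledCollection):
+  > from quapy.model_selection import GridSearchQ
+  > from quapy.protocol import APP
+  > from quapy.method.aggregative import PCC
+  > from sklearn.linear_model import LogisticRegression
+  > param_grid = {'classifier__C': [0.01, 1, 100], 'classifier__class_weight': [None, 'balanced']}
+  > model = GridSearchQ(PCC(LogisticRegression()), param_grid, protocol=APP(val_data), error='mae')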
+
+- Hyperparameters yielding inconsistent runs raise a ValueError exception, while hyperparameter combinations
+  yielding internal errors of surrogate functions are reported and skipped, without stopping the grid search.
+
+- DistributionMatching methods added. This is a general framework for distribution matching methods that
+  caters for multiclass quantification. That is to say, one could get a multiclass variant of the (originally
+  binary) HDy method aligned with Firat's formulation.
+
+- The internal method properties "binary", "aggregative", and "probabilistic" have been removed; these
+  conditions are checked via isinstance.
+
+- Quantifiers (i.e., classes that inherit from BaseQuantifier) are not forced to implement classes_ or
+  n_classes; these can be used anyway internally, but the framework will not suppose (nor impose) that a
+  quantifier implements them.
+
+- qp.evaluation.prediction has been optimized so that, if a quantifier is of type aggregative, and if the
+  evaluation protocol is of type OnLabelledCollection, then the computation is faster. In this specific case,
+  the predictions are issued only once and for all, and not for each sample. An exception to this (which is
+  also implemented) is when the number of instances across all samples is anyway smaller than the number of
+  instances in the original labelled collection; in this case the heuristic is of no help, and is therefore
+  not applied.
+
+- The distinction between "classify" and "posterior_probabilities" has been removed in aggregative
+  quantifiers, so that probabilistic classifiers return posterior probabilities, while non-probabilistic
+  quantifiers return crisp decisions.
+
+- OneVsAll fixed. There are now two classes: a generic one, OneVsAllGeneric, that works with any quantifier
+  (e.g., any instance of BaseQuantifier), and a subclass of it called OneVsAllAggregative, which implements
+  the classify / aggregate interface. Both are instances of OneVsAll. There is a method getOneVsAll that
+  returns the best instance based on the type of quantifier; a sketch follows.
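+  For example (a minimal sketch; the import location of getOneVsAll is assumed to be qp.method.aggregative):
+  > from quapy.method.aggregative import PACC, getOneVsAll
+  > from sklearn.linear_model import LogisticRegression
+  > ova = getOneVsAll(PACC(LogisticRegression()))  # an OneVsAllAggregative, since PACC is aggregative
+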
diff --git a/README.md b/README.md
index e48f87e..730a433 100644
--- a/README.md
+++ b/README.md
@@ -13,9 +13,9 @@ for facilitating the analysis and interpretation of the experimental results.
 
 ### Last updates:
 
-* Version 0.1.7 is released! major changes can be consulted [here](quapy/CHANGE_LOG.txt).
-* A detailed documentation is now available [here](https://hlt-isti.github.io/QuaPy/)
-* The developer API documentation is available [here](https://hlt-isti.github.io/QuaPy/build/html/modules.html)
+* Version 0.2.0 is released! Major changes can be consulted [here](CHANGE_LOG.txt).
+* The developer API documentation is available [here](https://hlt-isti.github.io/QuaPy/index.html)
+* Manuals are available [here](https://hlt-isti.github.io/QuaPy/manuals.html)
 
 ### Installation
 
@@ -25,7 +25,7 @@ pip install quapy
 
 ### Cite QuaPy
 
-If you find QuaPy useful (and we hope you will), plese consider citing the original paper in your research:
+If you find QuaPy useful (and we hope you will), please consider citing the original paper in your research:
 
 ```
 @inproceedings{moreo2021quapy,
@@ -46,19 +46,18 @@ of the test set.
 
 ```python
 import quapy as qp
-from sklearn.linear_model import LogisticRegression
 
-dataset = qp.datasets.fetch_twitter('semeval16')
+training, test = qp.datasets.fetch_UCIBinaryDataset("yeast").train_test
 
 # create an "Adjusted Classify & Count" quantifier
-model = qp.method.aggregative.ACC(LogisticRegression())
-model.fit(dataset.training)
+model = qp.method.aggregative.ACC()
+Xtr, ytr = training.Xy
+model.fit(Xtr, ytr)
 
-estim_prevalence = model.quantify(dataset.test.instances)
-true_prevalence = dataset.test.prevalence()
+estim_prevalence = model.predict(test.X)
+true_prevalence = test.prevalence()
 
 error = qp.error.mae(true_prevalence, estim_prevalence)
-
 print(f'Mean Absolute Error (MAE)={error:.3f}')
 ```
@@ -69,7 +68,7 @@ class prevalence of the training set. For this reason, any quantification model
 should be tested across many samples, even ones characterized by class prevalence values different or very
 different from those found in the training set. QuaPy implements sampling procedures and evaluation
 protocols that automate this workflow.
-See the [Wiki](https://github.com/HLT-ISTI/QuaPy/wiki) for detailed examples.
+See the [documentation](https://hlt-isti.github.io/QuaPy/manuals.html) for detailed examples.
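+
+For instance, a minimal evaluation sketch (assuming the quickstart model above; APP, the artificial-prevalence
+protocol, generates many test samples at widely varying prevalence values):
+
+```python
+from quapy.protocol import APP
+
+qp.environ['SAMPLE_SIZE'] = 100  # the size of each generated sample
+mae = qp.evaluation.evaluate(model, protocol=APP(test), error_metric='mae')
+print(f'MAE across APP samples = {mae:.4f}')
+```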
 
 ## Features
@@ -81,7 +80,8 @@ quantification methods based on structured output learning, HDy, QuaNet, quantif
   * 32 UCI Machine Learning datasets.
   * 11 Twitter quantification-by-sentiment datasets.
   * 3 product reviews quantification-by-sentiment datasets.
-  * 4 tasks from LeQua competition (_new in v0.1.7!_)
+  * 4 tasks from the LeQua 2022 competition and 4 tasks from the LeQua 2024 competition.
+  * The IFCB dataset for plankton quantification.
 * Native support for binary and single-label multiclass quantification scenarios.
 * Model selection functionality that minimizes quantification-oriented loss functions.
 * Visualization tools for analysing the experimental results.
@@ -96,22 +96,29 @@ quantification methods based on structured output learning, HDy, QuaNet, quantif
 * pandas, xlrd
 * matplotlib
 
+## Contributing
+
+If you want to contribute improvements to QuaPy, please open a pull request against the "devel" branch.
 
 ## Documentation
 
-The [developer API documentation](https://hlt-isti.github.io/QuaPy/build/html/modules.html) is available [here](https://hlt-isti.github.io/QuaPy/build/html/index.html).
+Check out the [developer API documentation here](https://hlt-isti.github.io/QuaPy/index.html).
 
-Check out our [Wiki](https://github.com/HLT-ISTI/QuaPy/wiki), in which many examples
+Check out the [Manuals](https://hlt-isti.github.io/QuaPy/manuals.html), in which many code examples
 are provided:
 
-* [Datasets](https://github.com/HLT-ISTI/QuaPy/wiki/Datasets)
-* [Evaluation](https://github.com/HLT-ISTI/QuaPy/wiki/Evaluation)
-* [Protocols](https://github.com/HLT-ISTI/QuaPy/wiki/Protocols)
-* [Methods](https://github.com/HLT-ISTI/QuaPy/wiki/Methods)
-* [SVMperf](https://github.com/HLT-ISTI/QuaPy/wiki/ExplicitLossMinimization)
-* [Model Selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection)
-* [Plotting](https://github.com/HLT-ISTI/QuaPy/wiki/Plotting)
+* [Datasets](https://hlt-isti.github.io/QuaPy/manuals/datasets.html)
+* [Evaluation](https://hlt-isti.github.io/QuaPy/manuals/evaluation.html)
+* [Protocols](https://hlt-isti.github.io/QuaPy/manuals/protocols.html)
+* [Methods](https://hlt-isti.github.io/QuaPy/manuals/methods.html)
+* [SVMperf](https://hlt-isti.github.io/QuaPy/manuals/explicit-loss-minimization.html)
+* [Model Selection](https://hlt-isti.github.io/QuaPy/manuals/model-selection.html)
+* [Plotting](https://hlt-isti.github.io/QuaPy/manuals/plotting.html)
 
 ## Acknowledgments:
 
-SoBigData++
+SoBigData++
+
+This work has been supported by the QuaDaSh project
+_"Finanziato dall'Unione europea---Next Generation EU,
+Missione 4 Componente 2 CUP B53D23026250001"_
+(i.e., "Funded by the European Union, Next Generation EU, Mission 4 Component 2 CUP B53D23026250001").
diff --git a/TODO.txt b/TODO.txt
index d3f2b3d..de40ed9 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,95 +1,60 @@
-ensembles seem to be broken; they have an internal model selection which takes the parameters, but since quapy now
-    works with protocols it would need to know the validation set in order to pass something like
-    "protocol: APP(val, etc.)"
-sample_size should not be mandatory when qp.environ['SAMPLE_SIZE'] has been specified
-clean all the cumbersome methods that have to be implemented for new quantifiers (e.g., n_classes_ prop, etc.)
-make truly parallel the GridSearchQ
-make more examples in the "examples" directory
-merge with master, because I had to fix some problems with QuaNet due to an issue notified via GitHub!
-added cross_val_predict in qp.model_selection (i.e., a cross_val_predict for quantification) --would be nice to have
-    it parallelized
+Adapt examples; remaining: example 4 onwards
+not working: 15 (qunfold)
 
-check the OneVsAll module(s)
+Solve the warnings issue; right now there is a warning ignore in method/__init__.py
 
-check the set_params of neural.py, because the separation of estimator__ is not implemented; see also
-    __check_params_colision
+Add 'platt' to the calib options in EMQ?
 
-HDy can be customized so that the number of bins is specified, instead of explored within the fit method
+Allow n_prevpoints in APP to be specified by a user-defined grid?
 
-Packaging:
-==========================================
-Document methods with paper references
-unit-tests
-clean wiki_examples!
+Update READMEs, wiki, & examples for the new fit-predict interface
 
-Refactor:
-==========================================
-Unify ThresholdOptimization methods as an extension of PACC (and not ACC); the fit methods are almost identical and
-    use a prob classifier (take into account that PACC uses pcc internally, whereas the threshold methods use cc
-    instead). The fit method of ACC and PACC has a block for estimating the validation estimates that should be
-    unified as well...
-Refactor protocols. APP and NPP related functionalities are duplicated in functional, LabelledCollection, and
-    evaluation
+Add the fix suggested by Alexander:
+
+For a more general application, I would maybe first establish a per-class threshold value of plausible prevalence
+based on the number of actual positives and the required sample size; e.g., for sample_size=100 and actual
+positives [10, 100, 500] -> [0.1, 1.0, 1.0], meaning that class 0 can be sampled at most at 0.1 prevalence, while
+the others can be sampled up to 1.0 prevalence. Then, when a prevalence value is requested, e.g., [0.33, 0.33, 0.33],
+we may either clip each value and normalize (as you suggest for the extreme case, e.g., [0.1, 0.33, 0.33]/sum) or
+scale each value by the per-class thresholds, i.e., [0.33*0.1, 0.33*1, 0.33*1]/sum. A worked sketch follows.
+- This affects LabelledCollection
+- This functionality should be accessible via sampling protocols and evaluation functions
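+
+(A worked sketch of the two policies; the variable names are illustrative, not part of the codebase:)
+    import numpy as np
+    caps = np.asarray([0.1, 1.0, 1.0])        # per-class maximum samplable prevalence
+    requested = np.asarray([0.33, 0.33, 0.33])
+    clipped = np.minimum(requested, caps)
+    clipped /= clipped.sum()                  # clip & normalize -> approx. [0.13, 0.43, 0.43]
+    scaled = requested * caps
+    scaled /= scaled.sum()                    # scale & normalize -> approx. [0.05, 0.48, 0.48]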
+
+Solve the pre-trained classifier issues. An example is the coptic-codes script I did, which needed a mock_lr to
+work in order to have access to classes_; think also of the case in which the precomputed outputs are already
+generated, as in the unifying-problems code.
+
+To remove LabelledCollection from the methods:
+
+- The mess stems from the confusing semantics of fit in aggregative methods, which receives 3 parameters:
+    - data: a LabelledCollection, which can be:
+        - the training set, if the classifier must be trained
+        - None, if the classifier must not be trained
+        - the validation set (which conflicts with val_split), if the classifier must not be trained
+    - fit_classifier: states whether the classifier must be trained or not, and this changes the semantics of
+      the other two parameters
+    - val_split: which can be:
+        - a number: the number of kFCV folds, which implies fit_classifier=True and data=the whole training set
+        - a fraction in [0,1]: the portion used for validation; implies fit_classifier=True and data=train+val
+        - a labelled collection: the specific validation set; implies neither fit_classifier=True nor False
+- The way to remove the methods' dependency on LabelledCollection should be as follows:
+    - The constructor states whether the classifier received as a parameter must be trained or is already
+      trained; that is, there is a fit_classifier=True or False.
+    - fit_classifier=True:
+        - data in fit is the whole training set, validation included
+        - val_split:
+            - int: number of folds in kFCV
+            - proportion in [0,1]
+    - fit_classifier=False:
 
-New features:
-==========================================
-Add "measures for evaluating ordinal"?
-Add datasets for topic.
-Do we want to cover cross-lingual quantification natively in QuaPy, or does it make more sense as an application on top?
-
-Current issues:
-==========================================
-Revise the class structure of quantification methods and the methods they inherit... There is some confusion regarding
-    methods isbinary, isprobabilistic, and the like. The attribute "learner_" in aggregative quantifiers is also
-    confusing, since there is a getter and a setter.
-Remove the "deep" in get_params. There is no real compatibility with scikit-learn as for now.
-SVMperf-based learners do not remove temp files in __del__?
-In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the
-    negative class). This is not covered in this new implementation, in which the binary case is not treated as such,
-    but as an instance of single-label with 2 labels. Check
-Add automatic reindexing of class labels in LabelledCollection (currently, class indexes should be ordered and with
-    no gaps)
-OVR I believe is currently tied to aggregative methods. We should provide a general interface also for general
-    quantifiers
-Currently, being "binary" only adds one checker; we should figure out how to impose the check to be automatically
-    performed
-Add random seed management to support replicability (see temp_seed in util.py).
-GridSearchQ is not truly parallelized. It only parallelizes the predictions.
-In the context of a quantifier (e.g., QuaNet or CC), the parameters of the learner should be prefixed with
-    "estimator__"; in QuaNet this is resolved with a __check_params_colision, but this should be improved. It might
-    be cumbersome to impose the "estimator__" prefix for, e.g., quantifiers like CC though... This should be changed
-    everywhere...
-QuaNet needs refactoring. The base quantifiers ACC and PACC receive val_data with instances already transformed.
-    This issue is due to a bad design.
-
-Improvements:
-==========================================
-Explore the hyperparameter "number of bins" in HDy
-Rename EMQ to SLD?
-Parallelize the kFCV in ACC and PACC?
-Parallelize model selection trainings
-We might want to think of (improving and) adding the class Tabular (it is defined and used on branch tweetsent). A
-    more recent version is in the project ql4facct. This class is meant to generate latex tables from results
-    (highlighting best results, computing statistical tests, colouring cells, producing rankings, producing
-    averages, etc.). Trying to generate tables is typically a bad idea, but in this specific case we do have pretty
-    good control of what an experiment looks like. (Do we want to abstract experimental results? This could be
-    useful not only for tables but also for plots.)
-Add a proper logging system. Currently we use print.
-It might be good to simplify the number of methods that have to be implemented for any new Quantifier. At the
-    moment, there are many functions like get_params, set_params, and, especially, @property classes_, which are
-    cumbersome to implement for quick experiments. A possible solution is to impose get_params and set_params only
-    in cases in which the model extends some "ModelSelectable" interface. The classes_ should have a default
-    implementation.
-
-Checks:
-==========================================
-How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum
-    up to one always?
-Re-check how hyperparameters from the quantifier and hyperparameters from the classifier (in aggregative
-    quantifiers) are handled. In scikit-learn the hyperparameters from a wrapper method are indicated directly,
-    whereas the hyperparams from the internal learner are prefixed with "estimator__". In QuaPy, combinations
-    having to do with the classifier can be computed at the beginning, and then in an internal loop the hyperparams
-    of the quantifier can be explored, passing fit_learner=False.
-Re-check Ensembles. As for now, they are strongly tied to aggregative quantifiers.
-Re-think the environment variables. Maybe add new ones (like, for example, parameters for the plots).
-Do we want to wrap prevalences (currently simple np.ndarray) as a class? This might be convenient for some
-    interfaces (e.g., for specifying artificial prevalences in samplings, for printing them -- currently supported
-    through F.strprev(), etc.). This might, however, add some overhead, and prevent/hinder post-processing with
-    numpy.
-Would be nice to get a better integration with sklearn.
-
+- [TODO] document confidence in manuals
+- [TODO] Test the return_type="index" in protocols and finish the "distributing_samples.py" example
+- [TODO] Add EDy (an implementation is available at quantificationlib)
+- [TODO] add ensemble methods SC-MQ, MC-SQ, MC-MQ
+- [TODO] add HistNetQ
+- [TODO] add CDE-iteration and Bayes-CDE methods
+- [TODO] add Friedman's method and DeBias
+- [TODO] check the ignore-warnings stuff;
+  check https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings
+- [TODO] nmd and md are not selectable from qp.evaluation.evaluate as a string
\ No newline at end of file
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d0c3cbf
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/build/html/Datasets.html b/docs/build/html/Datasets.html
deleted file mode 100644
index 775690d..0000000
--- a/docs/build/html/Datasets.html
+++ /dev/null
@@ -1,831 +0,0 @@
-
-
-
- -
-

Datasets

-

QuaPy makes available several datasets that have been used in -quantification literature, as well as an interface to allow -anyone import their custom datasets.

-

A Dataset object in QuaPy is roughly a pair of LabelledCollection objects, -one playing the role of the training set, another the test set. -LabelledCollection is a data class consisting of the (iterable) -instances and labels. This class handles most of the sampling functionality in QuaPy. -Take a look at the following code:

-
import quapy as qp
-import quapy.functional as F
-
-instances = [
-    '1st positive document', '2nd positive document',
-    'the only negative document',
-    '1st neutral document', '2nd neutral document', '3rd neutral document'
-]
-labels = [2, 2, 0, 1, 1, 1]
-
-data = qp.data.LabelledCollection(instances, labels)
-print(F.strprev(data.prevalence(), prec=2))
-
-
-

Output the class prevalences (showing 2 digit precision):

-
[0.17, 0.50, 0.33]
-
-
-

One can easily produce new samples at desired class prevalence values:

-
sample_size = 10
-prev = [0.4, 0.1, 0.5]
-sample = data.sampling(sample_size, *prev)
-
-print('instances:', sample.instances)
-print('labels:', sample.labels)
-print('prevalence:', F.strprev(sample.prevalence(), prec=2))
-
-
-

Which outputs:

-
instances: ['the only negative document' '2nd positive document'
- '2nd positive document' '2nd neutral document' '1st positive document'
- 'the only negative document' 'the only negative document'
- 'the only negative document' '2nd positive document'
- '1st positive document']
-labels: [0 2 2 1 2 0 0 0 2 2]
-prevalence: [0.40, 0.10, 0.50]
-
-
-

Samples can be made consistent across different runs (e.g., to test -different methods on the same exact samples) by sampling and retaining -the indexes, that can then be used to generate the sample:

-
index = data.sampling_index(sample_size, *prev)
-for method in methods:
-    sample = data.sampling_from_index(index)
-    ...
-
-
-

However, generating samples for evaluation purposes is tackled in QuaPy -by means of the evaluation protocols (see the dedicated entries in the Wiki -for evaluation and -protocols).

-
-

Reviews Datasets

-

Three datasets of reviews about Kindle devices, Harry Potter’s series, and -the well-known IMDb movie reviews can be fetched using a unified interface. -For example:

-
import quapy as qp
-data = qp.datasets.fetch_reviews('kindle')
-
-
-

These datasets have been used in:

-
Esuli, A., Moreo, A., & Sebastiani, F. (2018, October). 
-A recurrent neural network for sentiment quantification. 
-In Proceedings of the 27th ACM International Conference on 
-Information and Knowledge Management (pp. 1775-1778).
-
-
-

The list of reviews ids is available in:

-
qp.datasets.REVIEWS_SENTIMENT_DATASETS
-
-
-

Some statistics of the fhe available datasets are summarized below:

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Dataset

classes

train size

test size

train prev

test prev

type

hp

2

9533

18399

[0.018, 0.982]

[0.065, 0.935]

text

kindle

2

3821

21591

[0.081, 0.919]

[0.063, 0.937]

text

imdb

2

25000

25000

[0.500, 0.500]

[0.500, 0.500]

text

-
-
-

Twitter Sentiment Datasets

-

11 Twitter datasets for sentiment analysis. -Text is not accessible, and the documents were made available -in tf-idf format. Each dataset presents two splits: a train/val -split for model selection purposes, and a train+val/test split -for model evaluation. The following code exemplifies how to load -a twitter dataset for model selection.

-
import quapy as qp
-data = qp.datasets.fetch_twitter('gasp', for_model_selection=True)
-
-
-

The datasets were used in:

-
Gao, W., & Sebastiani, F. (2015, August). 
-Tweet sentiment: From classification to quantification. 
-In 2015 IEEE/ACM International Conference on Advances in 
-Social Networks Analysis and Mining (ASONAM) (pp. 97-104). IEEE.
-
-
-

Three of the datasets (semeval13, semeval14, and semeval15) share the -same training set (semeval), meaning that the training split one would get -when requesting any of them is the same. The dataset “semeval” can only -be requested with “for_model_selection=True”. -The lists of the Twitter dataset’s ids can be consulted in:

-
# a list of 11 dataset ids that can be used for model selection or model evaluation
-qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
-
-# 9 dataset ids in which "semeval13", "semeval14", and "semeval15" are replaced with "semeval"
-qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN  
-
-
-

Some details can be found below:

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Dataset

classes

train size

test size

features

train prev

test prev

type

gasp

3

8788

3765

694582

[0.421, 0.496, 0.082]

[0.407, 0.507, 0.086]

sparse

hcr

3

1594

798

222046

[0.546, 0.211, 0.243]

[0.640, 0.167, 0.193]

sparse

omd

3

1839

787

199151

[0.463, 0.271, 0.266]

[0.437, 0.283, 0.280]

sparse

sanders

3

2155

923

229399

[0.161, 0.691, 0.148]

[0.164, 0.688, 0.148]

sparse

semeval13

3

11338

3813

1215742

[0.159, 0.470, 0.372]

[0.158, 0.430, 0.412]

sparse

semeval14

3

11338

1853

1215742

[0.159, 0.470, 0.372]

[0.109, 0.361, 0.530]

sparse

semeval15

3

11338

2390

1215742

[0.159, 0.470, 0.372]

[0.153, 0.413, 0.434]

sparse

semeval16

3

8000

2000

889504

[0.157, 0.351, 0.492]

[0.163, 0.341, 0.497]

sparse

sst

3

2971

1271

376132

[0.261, 0.452, 0.288]

[0.207, 0.481, 0.312]

sparse

wa

3

2184

936

248563

[0.305, 0.414, 0.281]

[0.282, 0.446, 0.272]

sparse

wb

3

4259

1823

404333

[0.270, 0.392, 0.337]

[0.274, 0.392, 0.335]

sparse

-
-
-

UCI Machine Learning

-

A set of 32 datasets from the UCI Machine Learning repository -used in:

-
Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
-Using ensembles for problems with characterizable changes 
-in data distribution: A case study on quantification.
-Information Fusion, 34, 87-100.
-
-
-

The list does not exactly coincide with that used in Pérez-Gállego et al. 2017 -since we were unable to find the datasets with ids “diabetes” and “phoneme”.

-

These dataset can be loaded by calling, e.g.:

-
import quapy as qp
-data = qp.datasets.fetch_UCIDataset('yeast', verbose=True)
-
-
-

This call will return a Dataset object in which the training and -test splits are randomly drawn, in a stratified manner, from the whole -collection at 70% and 30%, respectively. The verbose=True option indicates -that the dataset description should be printed in standard output. -The original data is not split, -and some papers submit the entire collection to a kFCV validation. -In order to accommodate with these practices, one could first instantiate -the entire collection, and then creating a generator that will return one -training+test dataset at a time, following a kFCV protocol:

-
import quapy as qp
-collection = qp.datasets.fetch_UCILabelledCollection("yeast")
-for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
-    ...
-
-
-

Above code will allow to conduct a 2x5FCV evaluation on the “yeast” dataset.

-

All datasets come in numerical form (dense matrices); some statistics -are summarized below.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Dataset

classes

instances

features

prev

type

acute.a

2

120

6

[0.508, 0.492]

dense

acute.b

2

120

6

[0.583, 0.417]

dense

balance.1

2

625

4

[0.539, 0.461]

dense

balance.2

2

625

4

[0.922, 0.078]

dense

balance.3

2

625

4

[0.539, 0.461]

dense

breast-cancer

2

683

9

[0.350, 0.650]

dense

cmc.1

2

1473

9

[0.573, 0.427]

dense

cmc.2

2

1473

9

[0.774, 0.226]

dense

cmc.3

2

1473

9

[0.653, 0.347]

dense

ctg.1

2

2126

22

[0.222, 0.778]

dense

ctg.2

2

2126

22

[0.861, 0.139]

dense

ctg.3

2

2126

22

[0.917, 0.083]

dense

german

2

1000

24

[0.300, 0.700]

dense

haberman

2

306

3

[0.735, 0.265]

dense

ionosphere

2

351

34

[0.641, 0.359]

dense

iris.1

2

150

4

[0.667, 0.333]

dense

iris.2

2

150

4

[0.667, 0.333]

dense

iris.3

2

150

4

[0.667, 0.333]

dense

mammographic

2

830

5

[0.514, 0.486]

dense

pageblocks.5

2

5473

10

[0.979, 0.021]

dense

semeion

2

1593

256

[0.901, 0.099]

dense

sonar

2

208

60

[0.534, 0.466]

dense

spambase

2

4601

57

[0.606, 0.394]

dense

spectf

2

267

44

[0.794, 0.206]

dense

tictactoe

2

958

9

[0.653, 0.347]

dense

transfusion

2

748

4

[0.762, 0.238]

dense

wdbc

2

569

30

[0.627, 0.373]

dense

wine.1

2

178

13

[0.669, 0.331]

dense

wine.2

2

178

13

[0.601, 0.399]

dense

wine.3

2

178

13

[0.730, 0.270]

dense

wine-q-red

2

1599

11

[0.465, 0.535]

dense

wine-q-white

2

4898

11

[0.335, 0.665]

dense

yeast

2

1484

8

[0.711, 0.289]

dense

-
-

Issues:

-

All datasets will be downloaded automatically the first time they are requested, and -stored in the quapy_data folder for faster further reuse. -However, some datasets require special actions that at the moment are not fully -automated.

-
    -
  • Datasets with ids “ctg.1”, “ctg.2”, and “ctg.3” (Cardiotocography Data Set) load -an Excel file, which requires the user to install the xlrd Python module in order -to open it.

  • -
  • The dataset with id “pageblocks.5” (Page Blocks Classification (5)) needs to -open a “unix compressed file” (extension .Z), which is not directly doable with -standard Pythons packages like gzip or zip. This file would need to be uncompressed using -OS-dependent software manually. Information on how to do it will be printed the first -time the dataset is invoked.

  • -
-
-
-
-

LeQua Datasets

-

QuaPy also provides the datasets used for the LeQua competition. -In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification -problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide -raw documents instead. -Tasks T1A and T2A are binary sentiment quantification problems, while T2A and T2B -are multiclass quantification problems consisting of estimating the class prevalence -values of 28 different merchandise products.

-

Every task consists of a training set, a set of validation samples (for model selection) -and a set of test samples (for evaluation). QuaPy returns this data as a LabelledCollection -(training) and two generation protocols (for validation and test samples), as follows:

-
training, val_generator, test_generator = fetch_lequa2022(task=task)
-
-
-

See the lequa2022_experiments.py in the examples folder for further details on how to -carry out experiments using these datasets.

-

The datasets are downloaded only once, and stored for fast reuse.

-

Some statistics are summarized below:

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Dataset

classes

train size

validation samples

test samples

docs by sample

type

T1A

2

5000

1000

5000

250

vector

T1B

28

20000

1000

5000

1000

vector

T2A

2

5000

1000

5000

250

text

T2B

28

20000

1000

5000

1000

text

-

For further details on the datasets, we refer to the original -paper:

-
Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
-A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify.
-
-
-
-
-

Adding Custom Datasets

-

QuaPy provides data loaders for simple formats dealing with -text, following the format:

-
class-id \t first document's pre-processed text \n
-class-id \t second document's pre-processed text \n
-...
-
-
-

and sparse representations of the form:

-
{-1, 0, or +1} col(int):val(float) col(int):val(float) ... \n
-...
-
-
-

The code in charge in loading a LabelledCollection is:

-
@classmethod
-def load(cls, path:str, loader_func:callable):
-    return LabelledCollection(*loader_func(path))
-
-
-

indicating that any loader_func (e.g., a user-defined one) which -returns valid arguments for initializing a LabelledCollection object will allow -to load any collection. In particular, the LabelledCollection receives as -arguments the instances (as an iterable) and the labels (as an iterable) and, -additionally, the number of classes can be specified (it would otherwise be -inferred from the labels, but that requires at least one positive example for -all classes to be present in the collection).

-

The same loader_func can be passed to a Dataset, along with two -paths, in order to create a training and test pair of LabelledCollection, -e.g.:

-
import quapy as qp
-
-train_path = '../my_data/train.dat'
-test_path = '../my_data/test.dat'
-
-def my_custom_loader(path):
-    with open(path, 'rb') as fin:
-        ...
-    return instances, labels
-
-data = qp.data.Dataset.load(train_path, test_path, my_custom_loader)
-
-
-
-

Data Processing

-

QuaPy implements a number of preprocessing functions in the package qp.data.preprocessing, including:

-
    -
  • text2tfidf: tfidf vectorization

  • -
  • reduce_columns: reducing the number of columns based on term frequency

  • -
  • standardize: transforms the column values into z-scores (i.e., subtract the mean and normalizes by the standard deviation, so -that the column values have zero mean and unit variance).

  • -
  • index: transforms textual tokens into lists of numeric ids)

  • -
-
-
-
- - -
-
-
-
- -
-
- - - - \ No newline at end of file diff --git a/docs/build/html/Evaluation.html b/docs/build/html/Evaluation.html deleted file mode 100644 index 1b41a03..0000000 --- a/docs/build/html/Evaluation.html +++ /dev/null @@ -1,281 +0,0 @@ - - - - - - - - - - Evaluation — QuaPy 0.1.7 documentation - - - - - - - - - - - - - - - - - - - -
-
-
-
- -
-

Evaluation

-

Quantification is an appealing tool in scenarios of dataset shift, -and particularly in scenarios of prior-probability shift. -That is, the interest in estimating the class prevalences arises -under the belief that those class prevalences might have changed -with respect to the ones observed during training. -In other words, one could simply return the training prevalence -as a predictor of the test prevalence if this change is assumed -to be unlikely (as is the case in general scenarios of -machine learning governed by the iid assumption). -In brief, quantification requires dedicated evaluation protocols, -which are implemented in QuaPy and explained here.

-
-

Error Measures

-

The module quapy.error implements the following error measures for quantification:

-
    -
  • mae: mean absolute error

  • -
  • mrae: mean relative absolute error

  • -
  • mse: mean squared error

  • -
  • mkld: mean Kullback-Leibler Divergence

  • -
  • mnkld: mean normalized Kullback-Leibler Divergence

  • -
-

Functions ae, rae, se, kld, and nkld are also available, -which return the individual errors (i.e., without averaging the whole).

-

Some errors of classification are also available:

-
    -
  • acce: accuracy error (1-accuracy)

  • -
  • f1e: F-1 score error (1-F1 score)

  • -
-

The error functions implement the following interface, e.g.:

-
mae(true_prevs, prevs_hat)
-
-
-

in which the first argument is a ndarray containing the true -prevalences, and the second argument is another ndarray with -the estimations produced by some method.

-

Some error functions, e.g., mrae, mkld, and mnkld, are -smoothed for numerical stability. In those cases, there is a -third argument, e.g.:

-
def mrae(true_prevs, prevs_hat, eps=None): ...
-
-
-

indicating the value for the smoothing parameter epsilon. -Traditionally, this value is set to 1/(2T) in past literature, -with T the sampling size. One could either pass this value -to the function each time, or to set a QuaPy’s environment -variable SAMPLE_SIZE once, and omit this argument -thereafter (recommended); -e.g.:

-
qp.environ['SAMPLE_SIZE'] = 100  # once for all
-true_prev = np.asarray([0.5, 0.3, 0.2])  # let's assume 3 classes
-estim_prev = np.asarray([0.1, 0.3, 0.6])
-error = qp.error.mrae(true_prev, estim_prev)
-print(f'mrae({true_prev}, {estim_prev}) = {error:.3f}')
-
-
-

will print:

-
mrae([0.500, 0.300, 0.200], [0.100, 0.300, 0.600]) = 0.914
-
-
-

Finally, it is possible to instantiate QuaPy’s quantification -error functions from strings using, e.g.:

-
error_function = qp.error.from_name('mse')
-error = error_function(true_prev, estim_prev)
-
-
-
-
-

Evaluation Protocols

-

An evaluation protocol is an evaluation procedure that uses -one specific sample generation procotol to genereate many -samples, typically characterized by widely varying amounts of -shift with respect to the original distribution, that are then -used to evaluate the performance of a (trained) quantifier. -These protocols are explained in more detail in a dedicated entry -in the wiki. For the moment being, let us assume we already have -chosen and instantiated one specific such protocol, that we here -simply call prot. Let also assume our model is called -quantifier and that our evaluatio measure of choice is -mae. The evaluation comes down to:

-
mae = qp.evaluation.evaluate(quantifier, protocol=prot, error_metric='mae')
-print(f'MAE = {mae:.4f}')
-
-
-

It is often desirable to evaluate our system using more than one -single evaluatio measure. In this case, it is convenient to generate -a report. A report in QuaPy is a dataframe accounting for all the -true prevalence values with their corresponding prevalence values -as estimated by the quantifier, along with the error each has given -rise.

-
report = qp.evaluation.evaluation_report(quantifier, protocol=prot, error_metrics=['mae', 'mrae', 'mkld'])
-
-
-

From a pandas’ dataframe, it is straightforward to visualize all the results, -and compute the averaged values, e.g.:

-
pd.set_option('display.expand_frame_repr', False)
-report['estim-prev'] = report['estim-prev'].map(F.strprev)
-print(report)
-
-print('Averaged values:')
-print(report.mean())
-
-
-

This will produce an output like:

-
           true-prev      estim-prev       mae      mrae      mkld
-0     [0.308, 0.692]  [0.314, 0.686]  0.005649  0.013182  0.000074
-1     [0.896, 0.104]  [0.909, 0.091]  0.013145  0.069323  0.000985
-2     [0.848, 0.152]  [0.809, 0.191]  0.039063  0.149806  0.005175
-3     [0.016, 0.984]  [0.033, 0.967]  0.017236  0.487529  0.005298
-4     [0.728, 0.272]  [0.751, 0.249]  0.022769  0.057146  0.001350
-...              ...             ...       ...       ...       ...
-4995    [0.72, 0.28]  [0.698, 0.302]  0.021752  0.053631  0.001133
-4996  [0.868, 0.132]  [0.888, 0.112]  0.020490  0.088230  0.001985
-4997  [0.292, 0.708]  [0.298, 0.702]  0.006149  0.014788  0.000090
-4998    [0.24, 0.76]  [0.220, 0.780]  0.019950  0.054309  0.001127
-4999  [0.948, 0.052]  [0.965, 0.035]  0.016941  0.165776  0.003538
-
-[5000 rows x 5 columns]
-Averaged values:
-mae     0.023588
-mrae    0.108779
-mkld    0.003631
-dtype: float64
-
-Process finished with exit code 0
-
-
-

Alternatively, we can simply generate all the predictions by:

-
true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, protocol=prot)
-
-
-

All the evaluation functions implement specific optimizations for speeding-up -the evaluation of aggregative quantifiers (i.e., of instances of AggregativeQuantifier). -The optimization comes down to generating classification predictions (either crisp or soft) -only once for the entire test set, and then applying the sampling procedure to the -predictions, instead of generating samples of instances and then computing the -classification predictions every time. This is only possible when the protocol -is an instance of OnLabelledCollectionProtocol. The optimization is only -carried out when the number of classification predictions thus generated would be -smaller than the number of predictions required for the entire protocol; e.g., -if the original dataset contains 1M instances, but the protocol is such that it would -at most generate 20 samples of 100 instances, then it would be preferable to postpone the -classification for each sample. This behaviour is indicated by setting -aggr_speedup=”auto”. Conversely, when indicating aggr_speedup=”force” QuaPy will -precompute all the predictions irrespectively of the number of instances and number of samples. -Finally, this can be deactivated by setting aggr_speedup=False. Note that this optimization -is not only applied for the final evaluation, but also for the internal evaluations carried -out during model selection. Since these are typically many, the heuristic can help reduce the -execution time a lot.

-
-
- - -
-
-
-
- -
-
- - - - \ No newline at end of file diff --git a/docs/build/html/Installation.html b/docs/build/html/Installation.html deleted file mode 100644 index b63e795..0000000 --- a/docs/build/html/Installation.html +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - Installation — QuaPy 0.1.7 documentation - - - - - - - - - - - - - - - - - - - -
-
-
-
- -
-

Installation

-

QuaPy can be easily installed via pip

-
pip install quapy
-
-
-

See pip page for older versions.

-
-

Requirements

-
    -
  • scikit-learn, numpy, scipy

  • -
  • pytorch (for QuaNet)

  • -
  • svmperf patched for quantification (see below)

  • -
  • joblib

  • -
  • tqdm

  • -
  • pandas, xlrd

  • -
  • matplotlib

  • -
-
-
-

SVM-perf with quantification-oriented losses

-

In order to run experiments involving SVM(Q), SVM(KLD), SVM(NKLD), -SVM(AE), or SVM(RAE), you have to first download the -svmperf -package, apply the patch -svm-perf-quantification-ext.patch, -and compile the sources. -The script -prepare_svmperf.sh, -does all the job. Simply run:

-
./prepare_svmperf.sh
-
-
-

The resulting directory ./svm_perf_quantification contains the -patched version of svmperf with quantification-oriented losses.

-

The -svm-perf-quantification-ext.patch -is an extension of the patch made available by -Esuli et al. 2015 -that allows SVMperf to optimize for -the Q measure as proposed by -Barranquero et al. 2015 -and for the KLD and NKLD as proposed by -Esuli et al. 2015 -for quantification. -This patch extends the former by also allowing SVMperf to optimize for -AE and RAE.

diff --git a/docs/build/html/Methods.html b/docs/build/html/Methods.html
deleted file mode 100644
--- a/docs/build/html/Methods.html
+++ /dev/null

Quantification Methods

Quantification methods can be categorized as belonging to the aggregative and non-aggregative groups. Most methods included in QuaPy at the moment are aggregative (though we plan to add many more methods in the near future), i.e., methods characterized by the fact that quantification is performed as an aggregation function of the individual products of classification.

Any quantifier in QuaPy should extend the class BaseQuantifier, and implement some abstract methods:

    @abstractmethod
    def fit(self, data: LabelledCollection): ...

    @abstractmethod
    def quantify(self, instances): ...

The meaning of those functions should be familiar to anyone used to working with scikit-learn, since the class structure of QuaPy is directly inspired by scikit-learn's Estimators. The functions fit and quantify are used to train the model and to provide class estimations (the reason why scikit-learn's structure has not been adopted as is in QuaPy responds to the fact that scikit-learn's predict function is expected to return one output for each input element, e.g., a predicted label for each instance in a sample, while in quantification the output for a sample is one single array of class prevalences). Quantifiers also extend scikit-learn's BaseEstimator, in order to simplify the use of set_params and get_params in model selection.
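As an illustration, the following is a minimal sketch (a hypothetical example, not part of QuaPy) of a custom quantifier implementing this interface; it simply memorizes the training prevalence and returns it for any test sample:

from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier

class TrainPrevalenceQuantifier(BaseQuantifier):
    # a deliberately naive quantifier: it ignores the test instances altogether

    def fit(self, data: LabelledCollection):
        self.train_prevalence = data.prevalence()
        return self

    def quantify(self, instances):
        return self.train_prevalence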

Aggregative Methods

All quantification methods are implemented as part of the qp.method package. In particular, aggregative methods are defined in qp.method.aggregative, and extend AggregativeQuantifier(BaseQuantifier). The methods that any aggregative quantifier must implement are:

    @abstractmethod
    def fit(self, data: LabelledCollection, fit_learner=True): ...

    @abstractmethod
    def aggregate(self, classif_predictions: np.ndarray): ...

since, as mentioned before, aggregative methods base their prediction on the individual predictions of a classifier. Indeed, a default implementation of BaseQuantifier.quantify is already provided, which looks like:

    def quantify(self, instances):
        classif_predictions = self.classify(instances)
        return self.aggregate(classif_predictions)

Aggregative quantifiers are expected to maintain a classifier (which is accessed through the @property classifier). This classifier is given as input to the quantifier, and can already be fit on external data (in which case the fit_learner argument should be set to False), or be fit by the quantifier's fit (default).
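For instance, a minimal sketch reusing a classifier trained elsewhere (svm_pretrained here stands for a hypothetical classifier that has already been fit on external data) would be:

# plug in the pre-trained classifier and skip retraining it
model = qp.method.aggregative.ACC(svm_pretrained)
model.fit(training, fit_learner=False)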


Another class of aggregative methods are the probabilistic aggregative methods, which should inherit from the abstract class AggregativeProbabilisticQuantifier(AggregativeQuantifier). The particularity of probabilistic aggregative methods (w.r.t. non-probabilistic ones) is that the default quantifier is defined in terms of the posterior probabilities returned by a probabilistic classifier, and not by the crisp decisions of a hard classifier. In any case, the interface classify(instances) remains unchanged.


One advantage of aggregative methods (either probabilistic or not) is that the evaluation according to any sampling procedure (e.g., the artificial sampling protocol) can be achieved very efficiently, since the entire set can be pre-classified once, and the quantification estimations for different samples can directly reuse these predictions, without requiring to classify each element every time. QuaPy leverages this property to speed up any procedure having to do with quantification over samples, as is customarily done in model selection or in evaluation.

The Classify & Count variants

QuaPy implements the four CC variants, i.e.:

  • CC (Classify & Count), the simplest aggregative quantifier; one that simply relies on the label predictions of a classifier to deliver class estimates.
  • ACC (Adjusted Classify & Count), the adjusted variant of CC.
  • PCC (Probabilistic Classify & Count), the probabilistic variant of CC that relies on the soft estimations (or posterior probabilities) returned by a (probabilistic) classifier.
  • PACC (Probabilistic Adjusted Classify & Count), the adjusted variant of PCC.

The following code serves as a complete example using CC equipped with an SVM as the classifier:

import quapy as qp
import quapy.functional as F
from sklearn.svm import LinearSVC

training, test = qp.datasets.fetch_twitter('hcr', pickle=True).train_test

# instantiate a classifier learner, in this case an SVM
svm = LinearSVC()

# instantiate a Classify & Count with the SVM
# (an alias is available in qp.method.aggregative.ClassifyAndCount)
model = qp.method.aggregative.CC(svm)
model.fit(training)
estim_prevalence = model.quantify(test.instances)

The same code could be used to instantiate an ACC, by simply replacing the instantiation of the model with:

model = qp.method.aggregative.ACC(svm)

Note that the adjusted variants (ACC and PACC) need to estimate some parameters for performing the adjustment (e.g., the true positive rate and the false positive rate in case of binary classification) that are estimated on a validation split of the labelled set. In this case, the init method of ACC defines an additional parameter, val_split, which by default is set to 0.4, meaning that 40% of the labelled data will be used for estimating the parameters for adjusting the predictions. This parameter can also be set to an integer, indicating that the parameters should be estimated by means of k-fold cross-validation, for which the integer indicates the number k of folds. Finally, val_split can be set to a specific held-out validation set (i.e., an instance of LabelledCollection).
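In short, the three options look as follows (my_validation here stands for a hypothetical pre-existing LabelledCollection):

model = qp.method.aggregative.ACC(svm, val_split=0.3)            # hold out 30% of the training data
model = qp.method.aggregative.ACC(svm, val_split=5)              # 5-fold cross-validation
model = qp.method.aggregative.ACC(svm, val_split=my_validation)  # a specific LabelledCollection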

The specification of val_split can be postponed to the invocation of the fit method (if val_split was also set in the constructor, the one specified at fit time would prevail), e.g.:

model = qp.method.aggregative.ACC(svm)
# perform 5-fold cross validation for estimating ACC's parameters
# (overrides the default val_split=0.4 in the constructor)
model.fit(training, val_split=5)

The following code illustrates the case in which PCC is used:

model = qp.method.aggregative.PCC(svm)
model.fit(training)
estim_prevalence = model.quantify(test.instances)
print('classifier:', model.classifier)

In this case, QuaPy will print:

The learner LinearSVC does not seem to be probabilistic. The learner will be calibrated.
classifier: CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)

The first output indicates that the learner (LinearSVC in this case) is not a probabilistic classifier (i.e., it does not implement the predict_proba method) and so, the classifier will be converted to a probabilistic one through calibration. As a result, the classifier that is printed in the second line points to a CalibratedClassifierCV instance. Note that calibration can only be applied to hard classifiers when fit_learner=True; an exception will be raised otherwise.

Lastly, everything we said about ACC and PCC applies to PACC as well.

Expectation Maximization (EMQ)

The Expectation Maximization Quantifier (EMQ), also known as SLD, is available at qp.method.aggregative.EMQ or via the alias qp.method.aggregative.ExpectationMaximizationQuantifier. The method is described in:

Saerens, M., Latinne, P., and Decaestecker, C. (2002). Adjusting the outputs of a classifier to new a priori probabilities: A simple procedure. Neural Computation, 14(1):21–41.


EMQ works with a probabilistic classifier (if the classifier given as input is a hard one, a calibration will be attempted). Although this method was originally proposed for improving the posterior probabilities of a probabilistic classifier, and not for improving the estimation of prior probabilities, EMQ ranks almost always among the most effective quantifiers in the experiments we have carried out.


An example of use can be found below:

import quapy as qp
from sklearn.linear_model import LogisticRegression

dataset = qp.datasets.fetch_twitter('hcr', pickle=True)

model = qp.method.aggregative.EMQ(LogisticRegression())
model.fit(dataset.training)
estim_prevalence = model.quantify(dataset.test.instances)

New in v0.1.7: EMQ now accepts two new parameters in the construction method. The first is exact_train_prev, which allows using the true training prevalence as the departing prevalence estimation (default behaviour), or instead an approximation of it as suggested by Alexandari et al. (2020) (by setting exact_train_prev=False). The other parameter is recalib, which allows indicating a calibration method, among those proposed by Alexandari et al. (2020), including the Bias-Corrected Temperature Scaling, Vector Scaling, etc. See the API documentation for further details.
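A minimal sketch combining both parameters (assuming that 'bcts' is the code for Bias-Corrected Temperature Scaling; check the API documentation for the exact identifiers) could be:

model = qp.method.aggregative.EMQ(LogisticRegression(), exact_train_prev=False, recalib='bcts')
model.fit(dataset.training)
estim_prevalence = model.quantify(dataset.test.instances)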


Hellinger Distance y (HDy)

Implementation of the method based on the Hellinger Distance y (HDy) proposed by González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution estimation based on the Hellinger distance. Information Sciences, 218:146–164.

It is implemented in qp.method.aggregative.HDy (also accessible through the alias qp.method.aggregative.HellingerDistanceY). This method works with a probabilistic classifier (hard classifiers can be used as well and will be calibrated) and requires a validation set to estimate the parameters of the mixture model. Just like ACC and PACC, this quantifier receives a val_split argument in the constructor (or in the fit method, in which case the previous value is overridden) that can either be a float indicating the proportion of training data to be taken as the validation set (in a random stratified split), or a validation set (i.e., an instance of LabelledCollection) itself.

HDy was proposed as a binary quantifier and the implementation provided in QuaPy accepts only binary datasets.


The following code shows an example of use:

import quapy as qp
from sklearn.linear_model import LogisticRegression

# load a binary dataset
dataset = qp.datasets.fetch_reviews('hp', pickle=True)
qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)

model = qp.method.aggregative.HDy(LogisticRegression())
model.fit(dataset.training)
estim_prevalence = model.quantify(dataset.test.instances)

New in v0.1.7: QuaPy now provides an implementation of the generalized "Distribution Matching" approaches for multiclass, inspired by the framework of Firat (2016). One can instantiate a variant of HDy for multiclass quantification as follows:

multiclassHDy = qp.method.aggregative.DistributionMatching(classifier=LogisticRegression(), divergence='HD', cdf=False)

New in v0.1.7: QuaPy now provides an implementation of the "DyS" framework proposed by Maletzke et al. (2020) and the "SMM" method proposed by Hassan et al. (2019) (thanks to Pablo González for the contributions!)

Threshold Optimization methods

New in v0.1.7: QuaPy now implements Forman's threshold optimization methods; see, e.g., (Forman 2006) and (Forman 2008). These include: T50, MAX, X, Median Sweep (MS), and its variant MS2.
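These are binary quantifiers that are instantiated like any other aggregative method; a minimal sketch (assuming the classes are exposed under quapy.method.aggregative with those very names) would be:

from quapy.method.aggregative import MS2
from sklearn.linear_model import LogisticRegression

model = MS2(LogisticRegression())
model.fit(dataset.training)  # assumes a binary dataset, e.g., the 'hp' reviews used above
estim_prevalence = model.quantify(dataset.test.instances)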

Explicit Loss Minimization

Explicit Loss Minimization (ELM) represents a family of methods based on structured output learning, i.e., quantifiers relying on classifiers that have been optimized targeting a quantification-oriented evaluation measure. The original methods are implemented in QuaPy as classify & count (CC) quantifiers that use Joachims' SVMperf as the underlying classifier, properly set to optimize for the desired loss.

In QuaPy, this can be achieved by calling the following functions, one per quantification-oriented loss:

  • SVMQ, for SVM(Q), which optimizes the Q measure
  • SVMKLD, for SVM(KLD), which optimizes the Kullback-Leibler Divergence
  • SVMNKLD, for SVM(NKLD), which optimizes the Normalized Kullback-Leibler Divergence
  • SVMAE, for SVM(AE), which optimizes Absolute Error
  • SVMRAE, for SVM(RAE), which optimizes Relative Absolute Error

The last two methods (SVM(AE) and SVM(RAE)) have been implemented in QuaPy in order to make available ELM variants for what nowadays are considered the most well-behaved evaluation metrics in quantification.
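A minimal usage sketch (assuming SVMperf has already been patched and compiled as explained next, and reusing the dataset loaded above) could be:

qp.environ['SVMPERF_HOME'] = './svm_perf_quantification'
model = qp.method.aggregative.SVMKLD()
model.fit(dataset.training)  # requires a binary dataset
estim_prevalence = model.quantify(dataset.test.instances)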

In order to make these models work, you would need to run the script prepare_svmperf.sh (distributed along with QuaPy) that downloads SVMperf's source code, applies a patch that implements the quantification-oriented losses, and compiles the sources.

If you want to add any custom loss, you would need to modify the source code of SVMperf in order to implement it, and assign a valid loss code to it. Then you must re-compile the whole thing and instantiate the quantifier in QuaPy as follows:

# you can either set the path to your custom svm_perf_quantification implementation
# in the environment variable, or as an argument to the constructor of ELM
qp.environ['SVMPERF_HOME'] = './path/to/svm_perf_quantification'

# assign an alias to your custom loss and the id you have assigned to it
svmperf = qp.classification.svmperf.SVMperf
svmperf.valid_losses['mycustomloss'] = 28

# instantiate the ELM method indicating the loss
model = qp.method.aggregative.ELM(loss='mycustomloss')

All ELM methods are binary quantifiers since they rely on SVMperf, which currently supports only binary classification. ELM variants (any binary quantifier in general) can be extended to operate in single-label scenarios trivially by adopting a "one-vs-all" strategy (as, e.g., in Gao, W. and Sebastiani, F. (2016). From classification to quantification in tweet sentiment analysis. Social Network Analysis and Mining, 6(19):1–22). In QuaPy this is possible by using the OneVsAll class.

There are two ways of instantiating this class: OneVsAllGeneric, which works for any quantifier, and OneVsAllAggregative, which is optimized for aggregative quantifiers. In general, you can simply use the getOneVsAll function and QuaPy will choose the more convenient of the two.

import quapy as qp
from quapy.method.aggregative import SVMQ
# note: the import location of getOneVsAll is an assumption; check where it lives in your version
from quapy.method.base import getOneVsAll

# load a single-label dataset (this one contains 3 classes)
dataset = qp.datasets.fetch_twitter('hcr', pickle=True)

# let qp know where svmperf is
qp.environ['SVMPERF_HOME'] = '../svm_perf_quantification'

model = getOneVsAll(SVMQ(), n_jobs=-1)  # run the binary quantifiers in parallel
model.fit(dataset.training)
estim_prevalence = model.quantify(dataset.test.instances)

Check the examples explicit_loss_minimization.py and one_vs_all.py for more details.

Meta Models

By meta models we mean quantification methods that are defined on top of other quantification methods, and that thus do not squarely belong to either the aggregative or the non-aggregative group (indeed, meta models could use quantifiers from any of those groups). Meta models are implemented in the qp.method.meta module.

Ensembles

QuaPy implements (some of) the variants proposed in:

  • Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. Information Fusion, 34, 87-100.
  • Pérez-Gállego, P., Castaño, A., Quevedo, J. R., & del Coz, J. J. (2019). Dynamic ensemble selection for quantification tasks. Information Fusion, 45, 1-15.

The following code shows how to instantiate an Ensemble of 30 Adjusted Classify & Count (ACC) quantifiers operating with a Logistic Regressor (LR) as the base classifier, and using the average as the aggregation policy (see the original article for further details). The last parameter indicates to use all processors for parallelization.

import quapy as qp
from quapy.method.aggregative import ACC
from quapy.method.meta import Ensemble
from sklearn.linear_model import LogisticRegression

dataset = qp.datasets.fetch_UCIDataset('haberman')

model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='ave', n_jobs=-1)
model.fit(dataset.training)
estim_prevalence = model.quantify(dataset.test.instances)

Other aggregation policies implemented in QuaPy include:

  • 'ptr' for applying a dynamic selection based on the training prevalence of the ensemble's members
  • 'ds' for applying a dynamic selection based on the Hellinger Distance
  • any valid quantification measure (e.g., 'mse') for performing a static selection based on the performance estimated for each member of the ensemble in terms of that evaluation metric.

When using any of the above options, it is important to set the red_size parameter, which indicates the number of members to retain.
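For example, a sketch along the lines of the example above, configuring a dynamic-selection ensemble that retains the 10 members whose training prevalence best fits each test sample, would be:

model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='ptr', red_size=10, n_jobs=-1)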

Please check the model selection wiki if you want to optimize the hyperparameters of the ensemble for classification or quantification.

The QuaNet neural network

QuaPy offers an implementation of QuaNet, a deep learning model presented in:

Esuli, A., Moreo, A., & Sebastiani, F. (2018, October). A recurrent neural network for sentiment quantification. In Proceedings of the 27th ACM International Conference on Information and Knowledge Management (pp. 1775-1778).

This model requires torch to be installed. QuaNet also requires a classifier that can provide embedded representations of the inputs. In the original paper, QuaNet was tested using an LSTM as the base classifier. In the following example, we show an instantiation of QuaNet that instead uses a CNN as a probabilistic classifier, taking its last-layer representation as the document embedding:

import quapy as qp
from quapy.method.meta import QuaNet
from quapy.classification.neural import NeuralClassifierTrainer, CNNnet

# use samples of 100 elements
qp.environ['SAMPLE_SIZE'] = 100

# load the kindle dataset as text, and convert words to numerical indexes
dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
qp.data.preprocessing.index(dataset, min_df=5, inplace=True)

# the text classifier is a CNN trained by NeuralClassifierTrainer
cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
learner = NeuralClassifierTrainer(cnn, device='cuda')

# train QuaNet
model = QuaNet(learner, device='cuda')
model.fit(dataset.training)
estim_prevalence = model.quantify(dataset.test.instances)
diff --git a/docs/build/html/Model-Selection.html b/docs/build/html/Model-Selection.html
deleted file mode 100644
--- a/docs/build/html/Model-Selection.html
+++ /dev/null

Model Selection

As a supervised machine learning task, quantification methods can strongly depend on a good choice of model hyper-parameters. The process whereby those hyper-parameters are chosen is typically known as Model Selection, and typically consists of testing different settings and picking the one that performed best in a held-out validation set in terms of any given evaluation measure.

Targeting a Quantification-oriented loss

The task being optimized determines the evaluation protocol, i.e., the criteria according to which the performance of any given method for solving it is to be assessed. As a task in its own right, quantification should impose its own model selection strategies, i.e., strategies aimed at finding appropriate configurations specifically designed for the task of quantification.

Quantification has long been regarded as an add-on of classification, and thus the model selection strategies customarily adopted in classification have simply been applied to quantification (see the next section). It has been argued in Moreo, Alejandro, and Fabrizio Sebastiani. Re-Assessing the "Classify and Count" Quantification Method. ECIR 2021: Advances in Information Retrieval, pp. 75–91, that specific model selection strategies should be adopted for quantification. That is, model selection strategies for quantification should target quantification-oriented losses and be tested in a variety of scenarios exhibiting different degrees of prior probability shift.

The class qp.model_selection.GridSearchQ implements a grid-search exploration over the space of hyper-parameter combinations that evaluates each combination of hyper-parameters by means of a given quantification-oriented error metric (e.g., any of the error functions implemented in qp.error) and according to a sampling generation protocol.


The following is an example (also included in the examples folder) of model selection for quantification:

import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import DistributionMatching
from sklearn.linear_model import LogisticRegression
import numpy as np

"""
In this example, we show how to perform model selection on a DistributionMatching quantifier.
"""

model = DistributionMatching(LogisticRegression())

qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1  # explore hyper-parameters in parallel

training, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

# The model will be returned by the fit method of GridSearchQ.
# Every combination of hyper-parameters will be evaluated by confronting the
# quantifier thus configured against a series of samples generated by means
# of a sample generation protocol. For this example, we will use the
# artificial-prevalence protocol (APP), that generates samples with prevalence
# values in the entire range of values from a grid (e.g., [0, 0.1, 0.2, ..., 1]).
# We devote 30% of the dataset for this exploration.
training, validation = training.split_stratified(train_prop=0.7)
protocol = APP(validation)

# We will explore a classification-dependent hyper-parameter (e.g., the 'C'
# hyper-parameter of LogisticRegression) and a quantification-dependent hyper-parameter
# (e.g., the number of bins in a DistributionMatching quantifier).
# Classifier-dependent hyper-parameters have to be marked with a prefix "classifier__"
# in order to let the quantifier know this hyper-parameter belongs to its underlying
# classifier.
param_grid = {
    'classifier__C': np.logspace(-3, 3, 7),
    'nbins': [8, 16, 32, 64],
}

model = qp.model_selection.GridSearchQ(
    model=model,
    param_grid=param_grid,
    protocol=protocol,
    error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
    refit=True,   # retrain on the whole labelled set once done
    verbose=True  # show information as the process goes on
).fit(training)

print(f'model selection ended: best hyper-parameters={model.best_params_}')
model = model.best_model_

# evaluation in terms of MAE
# we use the same evaluation protocol (APP) on the test set
mae_score = qp.evaluation.evaluate(model, protocol=APP(test), error_metric='mae')

print(f'MAE={mae_score:.5f}')

In this example, the system outputs:

[GridSearchQ]: starting model selection with self.n_jobs =-1
[GridSearchQ]: hyperparams={'classifier__C': 0.01, 'nbins': 64}	 got mae score 0.04021 [took 1.1356s]
[GridSearchQ]: hyperparams={'classifier__C': 0.01, 'nbins': 32}	 got mae score 0.04286 [took 1.2139s]
[GridSearchQ]: hyperparams={'classifier__C': 0.01, 'nbins': 16}	 got mae score 0.04888 [took 1.2491s]
[GridSearchQ]: hyperparams={'classifier__C': 0.001, 'nbins': 8}	 got mae score 0.05163 [took 1.5372s]
[...]
[GridSearchQ]: hyperparams={'classifier__C': 1000.0, 'nbins': 32}	 got mae score 0.02445 [took 2.9056s]
[GridSearchQ]: optimization finished: best params {'classifier__C': 100.0, 'nbins': 32} (score=0.02234) [took 7.3114s]
[GridSearchQ]: refitting on the whole development set
model selection ended: best hyper-parameters={'classifier__C': 100.0, 'nbins': 32}
MAE=0.03102

The parameter val_split can alternatively be used to indicate a validation set (i.e., an instance of LabelledCollection) instead of a proportion. This could be useful if one wants to have control over the specific data split to be used across different model selection experiments.

Targeting a Classification-oriented loss

Optimizing a model for quantification can be computationally costly. In aggregative methods, one could alternatively try to optimize the classifier's hyper-parameters for classification. Although this is theoretically suboptimal, many articles in the quantification literature have opted for this strategy.

In QuaPy, this is achieved by simply instantiating the classifier learner as a GridSearchCV from scikit-learn. The following code illustrates how to do that:

from sklearn.model_selection import GridSearchCV

learner = GridSearchCV(
    LogisticRegression(),
    param_grid={'C': np.logspace(-4, 5, 10), 'class_weight': ['balanced', None]},
    cv=5)
model = DistributionMatching(learner).fit(dataset.training)

However, this is conceptually flawed, since the model should be optimized for the task at hand (quantification), and not for a surrogate task (classification); i.e., the model should be requested to deliver low quantification errors, rather than low classification errors.

diff --git a/docs/build/html/Plotting.html b/docs/build/html/Plotting.html
deleted file mode 100644
--- a/docs/build/html/Plotting.html
+++ /dev/null

Plotting

The module qp.plot implements some basic plotting functions that can help analyse the performance of a quantification method.

All plotting functions receive as inputs the outcomes of some experiments and include, for each experiment, the following three main arguments:

  • method_names, a list containing the names of the quantification methods
  • true_prevs, a list containing matrices of true prevalences
  • estim_prevs, a list containing matrices of estimated prevalences (should be of the same shape as the corresponding matrix in true_prevs)

Note that a method (as indicated by a name in method_names) can appear more than once. This could occur when various datasets are involved in the experiments. In this case, all experiments for the method will be merged and the plot will represent the method's performance across various datasets.


This is a very simple example of a valid input for the plotting functions:

method_names = ['classify & count', 'EMQ', 'classify & count']
true_prevs = [
    np.array([[0.5, 0.5], [0.25, 0.75]]),
    np.array([[0.0, 1.0], [0.25, 0.75], [0.0, 0.1]]),
    np.array([[0.0, 1.0], [0.25, 0.75], [0.0, 0.1]]),
]
estim_prevs = [
    np.array([[0.45, 0.55], [0.6, 0.4]]),
    np.array([[0.0, 1.0], [0.5, 0.5], [0.2, 0.8]]),
    np.array([[0.1, 0.9], [0.3, 0.7], [0.0, 0.1]]),
]

in which classify & count has been tested on two datasets and the EMQ method has been tested on only one dataset. For the first experiment, only two (binary) quantifications have been tested, while for the second and third experiments three instances have been tested.

In general, we would like to test the performance of the quantification methods across different scenarios showcasing the accuracy of the quantifier in predicting class prevalences for a wide range of prior distributions. This can easily be achieved by means of the artificial sampling protocol that is implemented in QuaPy.

The following code shows how to perform one simple experiment in which the 4 CC-variants, all equipped with a linear SVM, are applied to one binary dataset of reviews about Kindle devices and tested across the entire spectrum of class priors (taking 21 splits of the interval [0,1], i.e., using prevalence steps of 0.05, and generating 100 random samples at each prevalence).

import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import CC, ACC, PCC, PACC
from sklearn.svm import LinearSVC

qp.environ['SAMPLE_SIZE'] = 500

def gen_data():

    def base_classifier():
        return LinearSVC(class_weight='balanced')

    def models():
        yield 'CC', CC(base_classifier())
        yield 'ACC', ACC(base_classifier())
        yield 'PCC', PCC(base_classifier())
        yield 'PACC', PACC(base_classifier())

    train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5).train_test

    method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []

    for method_name, model in models():
        model.fit(train)
        true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))

        method_names.append(method_name)
        true_prevs.append(true_prev)
        estim_prevs.append(estim_prev)
        tr_prevs.append(train.prevalence())

    return method_names, true_prevs, estim_prevs, tr_prevs

method_names, true_prevs, estim_prevs, tr_prevs = gen_data()

The plots that can be generated are explained below.

Diagonal Plot

The diagonal plot shows a very insightful view of the quantifier's performance. It plots the predicted class prevalence (on the y-axis) against the true class prevalence (on the x-axis). Unfortunately, it is limited to binary quantification, although one can simply generate as many diagonal plots as there are classes, by indicating which class should be considered the target of the plot.

The following call will produce the plot:

qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, train_prev=tr_prevs[0], savepath='./plots/bin_diag.png')

The last argument is optional, and indicates the path where to save the plot (the file extension will determine the format; typical extensions are '.png' or '.pdf'). If this path is not provided, then the plot will be shown but not saved. The resulting plot should look like:

[figure: diagonal plot on Kindle]

Note that in this case, we are also indicating the training prevalence, which is plotted on the diagonal as a cyan dot. The color bands indicate the standard deviations of the predictions, and can be hidden by setting the argument show_std=False (see the complete list of arguments in the documentation).

Finally, note how most quantifiers, and especially the "unadjusted" variants CC and PCC, are strongly biased towards the prevalence seen during training.

Quantification bias

This plot aims at evincing the bias that any quantifier displays with respect to the training prevalences by means of box plots. This plot can be generated by:

qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')

and should look like:

[figure: bias plot on Kindle]

The box plots show some interesting facts:

  • all methods are biased towards the training prevalence, but especially so CC and PCC (an unbiased quantifier would have a box centered at 0)
  • the bias is always positive, indicating that all methods tend to overestimate the positive class prevalence
  • CC and PCC have high variability, while ACC and especially PACC exhibit lower variability.

Again, these plots could be generated for experiments ranging across -different datasets, and the plot will merge all data accordingly.

-

Another illustrative example consists of training different CC quantifiers at different (artificially sampled) training prevalences. For this example, we generate training samples of 5000 documents containing 10%, 20%, …, 90% of positives from the IMDb dataset, and generate the bias plot again. This example can be run by rewriting the gen_data() function like this:

import numpy as np

def gen_data():

    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
    model = CC(LinearSVC())

    method_data = []
    for training_prevalence in np.linspace(0.1, 0.9, 9):
        training_size = 5000
        # since the problem is binary, it suffices to specify the negative prevalence, since the positive is constrained
        train_sample = train.sampling(training_size, 1-training_prevalence)
        model.fit(train_sample)
        true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
        method_name = 'CC$_{' + f'{int(100*training_prevalence)}' + r'\%}$'
        method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))

    return zip(*method_data)

and the plot should now look like:

[figure: bias plot on IMDb]

which clearly shows a negative bias for CC variants trained on data containing more negatives (i.e., < 50% positives) and positive biases in cases containing more positives (i.e., > 50%). The CC trained at 50% behaves as an unbiased estimator of the positive class prevalence.


The function qp.plot.binary_bias_bins allows the user to generate box plots broken down by bins of true test prevalence. To this aim, an argument nbins is passed, which indicates how many isometric subintervals to take. For example, the following plot is produced for nbins=3:
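A call along the lines of the previous ones (a sketch, reusing the same data as above) would be:

qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=3, savepath='./plots/bin_bias_bin.png')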

[figure: bias plot on IMDb, broken down by bins]

Interestingly enough, the seemingly unbiased estimator (CC at 50%) happens to display a positive bias (or a tendency to overestimate) in cases of low prevalence (i.e., when the true prevalence of the positive class is below 33%), and a negative bias (or a tendency to underestimate) in cases of high prevalence (i.e., when the true prevalence is beyond 67%).


Out of curiosity, the diagonal plot for this experiment looks like:

[figure: diagonal plot on IMDb]

showing pretty clearly the dependency of CC on the prior probabilities of the labeled set it was trained on.

Error by Drift

The plots discussed above are useful for analyzing and comparing the performance of different quantification methods, but are limited to the binary case. The "error by drift" is a plot that shows the error in predictions as a function of the (prior probability) drift between each test sample and the training set. Interestingly, the error and drift can both be measured in terms of any evaluation measure for quantification (like the ones available in qp.error) and can thus be computed irrespective of the number of classes.

The following shows how to generate the plot for the 4 CC variants, using 10 bins for the drift and absolute error as the measure of the error (the drift in the x-axis is always computed in terms of absolute error, since other errors are harder to interpret):

qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
    error_name='ae', n_bins=10, savepath='./plots/err_drift.png')

[figure: error-by-drift plot on IMDb]

Note that all methods work reasonably well in cases of low prevalence drift (i.e., any CC-variant is a good quantifier whenever the IID assumption is approximately preserved). The higher the drift, the worse those quantifiers tend to perform, although it is clear that PACC yields the lowest error for the most difficult cases.

Remember that any plot can be generated across many datasets, and that this would probably result in a more solid comparison. In those cases, however, it is likely that the variances of each method get higher, to the detriment of the visualization. We recommend setting show_std=False in those cases in order to hide the color bands.

diff --git a/docs/build/html/_modules/index.html b/docs/build/html/_modules/index.html
new file mode 100644
--- /dev/null
+++ b/docs/build/html/_modules/index.html
diff --git a/docs/build/html/_modules/quapy/classification/calibration.html b/docs/build/html/_modules/quapy/classification/calibration.html
new file mode 100644
--- /dev/null
+++ b/docs/build/html/_modules/quapy/classification/calibration.html
Source code for quapy.classification.calibration

from copy import deepcopy

from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import cross_val_predict, train_test_split
import numpy as np


# Wrappers of the calibration methods defined by Alexandari et al. in <http://proceedings.mlr.press/v119/alexandari20a.html>
# requires "pip install abstention"
# see https://github.com/kundajelab/abstention
class RecalibratedProbabilisticClassifier:
    """
    Abstract class for (re)calibration method from `abstention.calibration`, as defined in
    `Alexandari, A., Kundaje, A., & Shrikumar, A. (2020, November). Maximum likelihood with bias-corrected calibration
    is hard-to-beat at label shift adaptation. In International Conference on Machine Learning (pp. 222-232). PMLR.
    <http://proceedings.mlr.press/v119/alexandari20a.html>`_
    """
    pass
class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabilisticClassifier):
    """
    Applies a (re)calibration method from `abstention.calibration`, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.

    :param classifier: a scikit-learn probabilistic classifier
    :param calibrator: the calibration object (an instance of abstention.calibration.CalibratorFactory)
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer); default=None
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, calibrator, val_split=5, n_jobs=None, verbose=False):
        self.classifier = classifier
        self.calibrator = calibrator
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose
    def fit(self, X, y):
        """
        Fits the calibration for the probabilistic classifier.

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :param y: array-like of shape `(n_samples,)` with the class labels
        :return: self
        """
        k = self.val_split
        if isinstance(k, int):
            if k < 2:
                raise ValueError('wrong value for val_split: the number of folds must be >= 2')
            return self.fit_cv(X, y)
        elif isinstance(k, float):
            if not (0 < k < 1):
                raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
            return self.fit_tr_val(X, y)
    def fit_cv(self, X, y):
        """
        Fits the calibration in a cross-validation manner, i.e., it generates posterior probabilities for all
        training instances via cross-validation, and then retrains the classifier on all training instances.
        The posterior probabilities thus generated are used for calibrating the outputs of the classifier.

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :param y: array-like of shape `(n_samples,)` with the class labels
        :return: self
        """
        posteriors = cross_val_predict(
            self.classifier, X, y, cv=self.val_split, n_jobs=self.n_jobs, verbose=self.verbose, method='predict_proba'
        )
        self.classifier.fit(X, y)
        nclasses = len(np.unique(y))
        self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[y], posterior_supplied=True)
        return self
    def fit_tr_val(self, X, y):
        """
        Fits the calibration in a train/val-split manner, i.e., it partitions the training instances into a
        training and a validation set, and then uses the training samples to learn a classifier which is then used
        to generate posterior probabilities for the held-out validation data. These posteriors are used to calibrate
        the classifier. The classifier is not retrained on the whole dataset.

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :param y: array-like of shape `(n_samples,)` with the class labels
        :return: self
        """
        Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=self.val_split, stratify=y)
        self.classifier.fit(Xtr, ytr)
        posteriors = self.classifier.predict_proba(Xva)
        nclasses = len(np.unique(yva))
        self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[yva], posterior_supplied=True)
        return self
    def predict(self, X):
        """
        Predicts class labels for the data instances in `X`

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :return: array-like of shape `(n_samples,)` with the class label predictions
        """
        return self.classifier.predict(X)
    def predict_proba(self, X):
        """
        Generates posterior probabilities for the data instances in `X`

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :return: array-like of shape `(n_samples, n_classes)` with posterior probabilities
        """
        posteriors = self.classifier.predict_proba(X)
        return self.calibration_function(posteriors)

    @property
    def classes_(self):
        """
        Returns the classes on which the classifier has been trained on

        :return: array-like of shape `(n_classes)`
        """
        return self.classifier.classes_
class NBVSCalibration(RecalibratedProbabilisticClassifierBase):
    """
    Applies the No-Bias Vector Scaling (NBVS) calibration method from `abstention.calibration`, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

    :param classifier: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
        self.classifier = classifier
        self.calibrator = NoBiasVectorScaling(verbose=verbose)
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose
class BCTSCalibration(RecalibratedProbabilisticClassifierBase):
    """
    Applies the Bias-Corrected Temperature Scaling (BCTS) calibration method from `abstention.calibration`, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

    :param classifier: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
        self.classifier = classifier
        self.calibrator = TempScaling(verbose=verbose, bias_positions='all')
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose
class TSCalibration(RecalibratedProbabilisticClassifierBase):
    """
    Applies the Temperature Scaling (TS) calibration method from `abstention.calibration`, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

    :param classifier: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
        self.classifier = classifier
        self.calibrator = TempScaling(verbose=verbose)
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose
class VSCalibration(RecalibratedProbabilisticClassifierBase):
    """
    Applies the Vector Scaling (VS) calibration method from `abstention.calibration`, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

    :param classifier: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
        self.classifier = classifier
        self.calibrator = VectorScaling(verbose=verbose)
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose
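A minimal usage sketch of these wrappers (with hypothetical random data) could be:

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.classification.calibration import BCTSCalibration

X = np.random.rand(500, 10)             # hypothetical training instances
y = np.random.randint(0, 2, size=500)   # hypothetical binary labels

calibrated = BCTSCalibration(LogisticRegression(), val_split=5)
calibrated.fit(X, y)
posteriors = calibrated.predict_proba(X)  # calibrated posterior probabilities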
diff --git a/docs/build/html/_modules/quapy/classification/methods.html b/docs/build/html/_modules/quapy/classification/methods.html
new file mode 100644
--- /dev/null
+++ b/docs/build/html/_modules/quapy/classification/methods.html

Source code for quapy.classification.methods

from sklearn.base import BaseEstimator
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
class LowRankLogisticRegression(BaseEstimator):
    """
    An example of a classification method (i.e., an object that implements `fit`, `predict`, and `predict_proba`)
    that also generates embedded inputs (i.e., that implements `transform`), as those required for
    :class:`quapy.method.neural.QuaNet`. This is a mock method to allow for easily instantiating
    :class:`quapy.method.neural.QuaNet` on array-like real-valued instances.
    The transformation consists of applying :class:`sklearn.decomposition.TruncatedSVD`
    while classification is performed using :class:`sklearn.linear_model.LogisticRegression` on the low-rank space.

    :param n_components: the number of principal components to retain
    :param kwargs: parameters for the
        `Logistic Regression <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__ classifier
    """

    def __init__(self, n_components=100, **kwargs):
        self.n_components = n_components
        self.classifier = LogisticRegression(**kwargs)
    def get_params(self):
        """
        Get hyper-parameters for this estimator.

        :return: a dictionary with parameter names mapped to their values
        """
        params = {'n_components': self.n_components}
        params.update(self.classifier.get_params())
        return params
    def set_params(self, **params):
        """
        Set the parameters of this estimator.

        :param params: a `**kwargs` dictionary with the estimator parameters for
            `Logistic Regression <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__
            and eventually also `n_components` for `TruncatedSVD`
        """
        params_ = dict(params)
        if 'n_components' in params_:
            self.n_components = params_['n_components']
            del params_['n_components']
        self.classifier.set_params(**params_)
    def fit(self, X, y):
        """
        Fit the model according to the given training data. The fit consists of
        fitting `TruncatedSVD` and then `LogisticRegression` on the low-rank representation.

        :param X: array-like of shape `(n_samples, n_features)` with the instances
        :param y: array-like of shape `(n_samples, n_classes)` with the class labels
        :return: `self`
        """
        nF = X.shape[1]
        self.pca = None
        if nF > self.n_components:
            self.pca = TruncatedSVD(self.n_components).fit(X)
        X = self.transform(X)
        self.classifier.fit(X, y)
        self.classes_ = self.classifier.classes_
        return self
    def predict(self, X):
        """
        Predicts labels for the instances `X` embedded into the low-rank space.

        :param X: array-like of shape `(n_samples, n_features)` instances to classify
        :return: a `numpy` array of length `n` containing the label predictions, where `n` is the number of
            instances in `X`
        """
        X = self.transform(X)
        return self.classifier.predict(X)
    def predict_proba(self, X):
        """
        Predicts posterior probabilities for the instances `X` embedded into the low-rank space.

        :param X: array-like of shape `(n_samples, n_features)` instances to classify
        :return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities
        """
        X = self.transform(X)
        return self.classifier.predict_proba(X)
    def transform(self, X):
        """
        Returns the low-rank approximation of `X` with `n_components` dimensions, or `X` unaltered if
        `n_components` >= `X.shape[1]`.

        :param X: array-like of shape `(n_samples, n_features)` instances to embed
        :return: array-like of shape `(n_samples, n_components)` with the embedded instances
        """
        if self.pca is None:
            return X
        return self.pca.transform(X)
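A minimal usage sketch (with hypothetical random data) follows; note how transform yields the low-rank representation that QuaNet uses as document embeddings:

import numpy as np
from quapy.classification.methods import LowRankLogisticRegression

X = np.random.rand(1000, 500)            # 1000 hypothetical instances with 500 features
y = np.random.randint(0, 2, size=1000)   # hypothetical binary labels

clf = LowRankLogisticRegression(n_components=100).fit(X, y)
posteriors = clf.predict_proba(X)   # shape (1000, 2)
embeddings = clf.transform(X)       # shape (1000, 100)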
diff --git a/docs/build/html/_modules/quapy/classification/neural.html b/docs/build/html/_modules/quapy/classification/neural.html
new file mode 100644
--- /dev/null
+++ b/docs/build/html/_modules/quapy/classification/neural.html

Source code for quapy.classification.neural

import os
from abc import ABCMeta, abstractmethod
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

import quapy as qp
from quapy.data import LabelledCollection
from quapy.util import EarlyStop
class NeuralClassifierTrainer:
    """
    Trains a neural network for text classification.

    :param net: an instance of `TextClassifierNet` implementing the forward pass
    :param lr: learning rate (default 1e-3)
    :param weight_decay: weight decay (default 0)
    :param patience: number of epochs that do not show any improvement in validation
        to wait before applying early stop (default 10)
    :param epochs: maximum number of training epochs (default 200)
    :param batch_size: batch size for training (default 64)
    :param batch_size_test: batch size for test (default 512)
    :param padding_length: maximum number of tokens to consider in a document (default 300)
    :param device: specify 'cpu' (default) or 'cuda' for enabling gpu
    :param checkpointpath: where to store the parameters of the best model found so far
        according to the evaluation in the held-out validation split (default '../checkpoint/classifier_net.dat')
    """

    def __init__(self,
                 net: 'TextClassifierNet',
                 lr=1e-3,
                 weight_decay=0,
                 patience=10,
                 epochs=200,
                 batch_size=64,
                 batch_size_test=512,
                 padding_length=300,
                 device='cuda',
                 checkpointpath='../checkpoint/classifier_net.dat'):

        super().__init__()

        assert isinstance(net, TextClassifierNet), f'net is not an instance of {TextClassifierNet.__name__}'
        self.net = net.to(device)
        self.vocab_size = self.net.vocabulary_size
        self.trainer_hyperparams = {
            'lr': lr,
            'weight_decay': weight_decay,
            'patience': patience,
            'epochs': epochs,
            'batch_size': batch_size,
            'batch_size_test': batch_size_test,
            'padding_length': padding_length,
            'device': torch.device(device)
        }
        self.learner_hyperparams = self.net.get_params()
        self.checkpointpath = checkpointpath

        print(f'[NeuralNetwork running on {device}]')
        os.makedirs(Path(checkpointpath).parent, exist_ok=True)
    def reset_net_params(self, vocab_size, n_classes):
        """Reinitialize the network parameters

        :param vocab_size: the size of the vocabulary
        :param n_classes: the number of target classes
        """
        self.net = self.net.__class__(vocab_size, n_classes, **self.learner_hyperparams)
        self.net = self.net.to(self.trainer_hyperparams['device'])
        self.net.xavier_uniform()
    def get_params(self):
        """Get hyper-parameters for this estimator

        :return: a dictionary with parameter names mapped to their values
        """
        return {**self.net.get_params(), **self.trainer_hyperparams}
    def set_params(self, **params):
        """Set the parameters of this trainer and the learner it is training.
        In this current version, parameter names for the trainer and learner should
        be disjoint.

        :param params: a `**kwargs` dictionary with the parameters
        """
        trainer_hyperparams = self.trainer_hyperparams
        learner_hyperparams = self.net.get_params()
        for key, val in params.items():
            if key in trainer_hyperparams and key in learner_hyperparams:
                raise ValueError(f'the use of parameter {key} is ambiguous since it can refer to '
                                 f'a parameter of the Trainer or the learner {self.net.__name__}')
            elif key not in trainer_hyperparams and key not in learner_hyperparams:
                raise ValueError(f'parameter {key} is not valid')

            if key in trainer_hyperparams:
                trainer_hyperparams[key] = val
            else:
                learner_hyperparams[key] = val

        self.trainer_hyperparams = trainer_hyperparams
        self.learner_hyperparams = learner_hyperparams
    @property
    def device(self):
        """ Gets the device in which the network is allocated

        :return: device
        """
        return next(self.net.parameters()).device

    def _train_epoch(self, data, status, pbar, epoch):
        self.net.train()
        criterion = torch.nn.CrossEntropyLoss()
        losses, predictions, true_labels = [], [], []
        for xi, yi in data:
            self.optim.zero_grad()
            logits = self.net.forward(xi)
            loss = criterion(logits, yi)
            loss.backward()
            self.optim.step()
            losses.append(loss.item())
            preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1)

            status["loss"] = np.mean(losses)
            predictions.extend(preds.tolist())
            true_labels.extend(yi.detach().cpu().numpy().tolist())
            status["acc"] = accuracy_score(true_labels, predictions)
            status["f1"] = f1_score(true_labels, predictions, average='macro')
            self.__update_progress_bar(pbar, epoch)

    def _test_epoch(self, data, status, pbar, epoch):
        self.net.eval()
        criterion = torch.nn.CrossEntropyLoss()
        losses, predictions, true_labels = [], [], []
        with torch.no_grad():
            for xi, yi in data:
                logits = self.net.forward(xi)
                loss = criterion(logits, yi)
                losses.append(loss.item())
                preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1)
                predictions.extend(preds.tolist())
                true_labels.extend(yi.detach().cpu().numpy().tolist())

            status["loss"] = np.mean(losses)
            status["acc"] = accuracy_score(true_labels, predictions)
            status["f1"] = f1_score(true_labels, predictions, average='macro')
            self.__update_progress_bar(pbar, epoch)

    def __update_progress_bar(self, pbar, epoch):
        pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={epoch} '
                             f'tr-loss={self.status["tr"]["loss"]:.5f} '
                             f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% '
                             f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% '
                             f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} '
                             f'val-loss={self.status["va"]["loss"]:.5f} '
                             f'val-acc={100 * self.status["va"]["acc"]:.2f}% '
                             f'macroF1={100 * self.status["va"]["f1"]:.2f}%')
+[docs] + def fit(self, instances, labels, val_split=0.3): + """ + Fits the model according to the given training data. + + :param instances: list of lists of indexed tokens + :param labels: array-like of shape `(n_samples,)` with the class labels + :param val_split: proportion of training documents to be taken as the validation set (default 0.3) + :return: `self` + """ + train, val = LabelledCollection(instances, labels).split_stratified(1-val_split) + self.classes_ = train.classes_ + opt = self.trainer_hyperparams + checkpoint = self.checkpointpath + self.reset_net_params(self.vocab_size, train.n_classes) + + train_generator = TorchDataset(train.instances, train.labels).asDataloader( + opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device']) + valid_generator = TorchDataset(val.instances, val.labels).asDataloader( + opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']) + + self.status = {'tr': {'loss': -1, 'acc': -1, 'f1': -1}, + 'va': {'loss': -1, 'acc': -1, 'f1': -1}} + + self.optim = torch.optim.Adam(self.net.parameters(), lr=opt['lr'], weight_decay=opt['weight_decay']) + self.early_stop = EarlyStop(opt['patience'], lower_is_better=False) + + with tqdm(range(1, opt['epochs'] + 1)) as pbar: + for epoch in pbar: + self._train_epoch(train_generator, self.status['tr'], pbar, epoch) + self._test_epoch(valid_generator, self.status['va'], pbar, epoch) + + self.early_stop(self.status['va']['f1'], epoch) + if self.early_stop.IMPROVED: + torch.save(self.net.state_dict(), checkpoint) + elif self.early_stop.STOP: + print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} ' + f'for epoch {self.early_stop.best_epoch}') + self.net.load_state_dict(torch.load(checkpoint)) + break + + print('performing one training pass over the validation set...') + self._train_epoch(valid_generator, self.status['tr'], pbar, epoch=0) + print('[done]') + + return self
+ + +
+[docs] + def predict(self, instances): + """ + Predicts labels for the instances + + :param instances: list of lists of indexed tokens + :return: a `numpy` array of length `n` containing the label predictions, where `n` is the number of + instances + """ + return np.argmax(self.predict_proba(instances), axis=-1)
+ + +
+[docs] + def predict_proba(self, instances): + """ + Predicts posterior probabilities for the instances + + :param instances: list of lists of indexed tokens + :return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities + """ + self.net.eval() + opt = self.trainer_hyperparams + with torch.no_grad(): + posteriors = [] + for xi in TorchDataset(instances).asDataloader( + opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']): + posteriors.append(self.net.predict_proba(xi)) + return np.concatenate(posteriors)
+ + +
+[docs] + def transform(self, instances): + """ + Returns the embeddings of the instances + + :param instances: list of lists of indexed tokens + :return: array-like of shape `(n_samples, embed_size)` with the embedded instances, + where `embed_size` is defined by the classification network + """ + self.net.eval() + embeddings = [] + opt = self.trainer_hyperparams + with torch.no_grad(): + for xi in TorchDataset(instances).asDataloader( + opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']): + embeddings.append(self.net.document_embedding(xi).detach().cpu().numpy()) + return np.concatenate(embeddings)
+
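For orientation, a minimal usage sketch of the trainer above; it assumes, as in QuaPy, that the enclosing class is `NeuralClassifierTrainer` and that it receives the network plus the trainer hyper-parameters (device, epochs, etc.) at construction time. The variables `train_instances`, `train_labels` and `test_instances` are hypothetical placeholders for pre-indexed data:

import quapy as qp
from quapy.classification.neural import NeuralClassifierTrainer, CNNnet

# `train_instances`/`test_instances` stand for lists of lists of token ids and
# `train_labels` for their integer class labels, e.g., as produced by QuaPy's
# text preprocessing utilities (placeholders, not defined here)
net = CNNnet(vocabulary_size=5000, n_classes=2)
trainer = NeuralClassifierTrainer(net, device='cpu')
trainer.fit(train_instances, train_labels, val_split=0.3)
posteriors = trainer.predict_proba(test_instances)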
+ + + +
+[docs] +class TorchDataset(torch.utils.data.Dataset): + """ + Wraps labelled instances as a Torch :class:`torch.utils.data.Dataset` that can be converted into a + :class:`torch.utils.data.DataLoader` object (see :meth:`asDataloader`) + + :param instances: list of lists of indexed tokens + :param labels: array-like of shape `(n_samples,)` with the class labels + """ + + def __init__(self, instances, labels=None): + self.instances = instances + self.labels = labels + + def __len__(self): + return len(self.instances) + + def __getitem__(self, index): + return {'doc': self.instances[index], 'label': self.labels[index] if self.labels is not None else None} +
+[docs] + def asDataloader(self, batch_size, shuffle, pad_length, device): + """ + Converts the labelled collection into a Torch DataLoader with dynamic padding for + the batch + + :param batch_size: batch size + :param shuffle: whether or not to shuffle instances + :param pad_length: the maximum length for the list of tokens (dynamic padding is + applied, meaning that if the longest document in the batch is shorter than + `pad_length`, then the batch is padded up to its length, and not to `pad_length`) + :param device: the device on which to allocate the tensors ('cpu' or 'cuda') + :return: a :class:`torch.utils.data.DataLoader` object + """ + def collate(batch): + data = [torch.LongTensor(item['doc'][:pad_length]) for item in batch] + data = pad_sequence(data, batch_first=True, padding_value=qp.environ['PAD_INDEX']).to(device) + targets = [item['label'] for item in batch] + if targets[0] is None: + return data + else: + targets = torch.as_tensor(targets, dtype=torch.long).to(device) + return [data, targets] + + torchDataset = TorchDataset(self.instances, self.labels) + return torch.utils.data.DataLoader(torchDataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)
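A small sketch of the dynamic padding performed by the `collate` function above; the padding index is normally configured in `qp.environ`, and is set explicitly here only for the sake of the example:

import quapy as qp
from quapy.classification.neural import TorchDataset

qp.environ['PAD_INDEX'] = 0  # set explicitly for this sketch

docs = [[2, 5, 9], [7, 4], [3, 8, 1, 6, 2]]  # three indexed documents
loader = TorchDataset(docs).asDataloader(batch_size=3, shuffle=False, pad_length=10, device='cpu')
batch = next(iter(loader))
print(batch.shape)  # torch.Size([3, 5]): padded up to the longest document (5), not to pad_length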
+
+ + + +
+[docs] +class TextClassifierNet(torch.nn.Module, metaclass=ABCMeta): + """ + Abstract Text classifier (`torch.nn.Module`) + """ + +
+[docs] + @abstractmethod + def document_embedding(self, x): + """Embeds documents (i.e., performs the forward pass up to the + next-to-last layer). + + :param x: a batch of instances, typically generated by a torch's `DataLoader` + instance (see :class:`quapy.classification.neural.TorchDataset`) + :return: a torch tensor of shape `(n_samples, n_dimensions)`, where + `n_samples` is the number of documents, and `n_dimensions` is the + dimensionality of the embedding + """ + ...
+ + +
+[docs] + def forward(self, x): + """Performs the forward pass. + + :param x: a batch of instances, typically generated by a torch's `DataLoader` + instance (see :class:`quapy.classification.neural.TorchDataset`) + :return: a tensor of shape `(n_instances, n_classes)` with the decision scores + for each of the instances and classes + """ + doc_embedded = self.document_embedding(x) + return self.output(doc_embedded)
+ + +
+[docs] + def dimensions(self): + """Gets the number of dimensions of the embedding space + + :return: integer + """ + return self.dim
+ + +
+[docs] + def predict_proba(self, x): + """ + Predicts posterior probabilities for the instances in `x` + + :param x: a torch tensor of indexed tokens with shape `(n_instances, pad_length)` + where `n_instances` is the number of instances in the batch, and `pad_length` + is the padded length of the batch + :return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities + """ + logits = self(x) + return torch.softmax(logits, dim=1).detach().cpu().numpy()
+ + +
+[docs] + def xavier_uniform(self): + """ + Performs Xavier initialization of the network parameters + """ + for p in self.parameters(): + if p.dim() > 1 and p.requires_grad: + torch.nn.init.xavier_uniform_(p)
+ + +
+[docs] + @abstractmethod + def get_params(self): + """ + Get hyper-parameters for this estimator + + :return: a dictionary with parameter names mapped to their values + """ + ...
+ + + @property + def vocabulary_size(self): + """ + Return the size of the vocabulary + + :return: integer + """ + ...
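To make the contract concrete, here is a minimal (hypothetical) subclass sketch: only `document_embedding` and `get_params` are abstract, and an `output` layer and `dim` attribute are expected; `forward`, `predict_proba` and `xavier_uniform` are inherited from the base class:

import torch
from quapy.classification.neural import TextClassifierNet

class BowNet(TextClassifierNet):  # hypothetical example network
    def __init__(self, vocabulary_size, n_classes, repr_size=64):
        super().__init__()
        self.vocabulary_size_ = vocabulary_size
        self.hyperparams = {'repr_size': repr_size}
        self.dim = repr_size
        # EmbeddingBag averages the word embeddings of each document (mode='mean' by default)
        self.embed = torch.nn.EmbeddingBag(vocabulary_size, repr_size)
        self.output = torch.nn.Linear(repr_size, n_classes)

    def document_embedding(self, x):
        return self.embed(x)

    def get_params(self):
        return self.hyperparams

    @property
    def vocabulary_size(self):
        return self.vocabulary_size_

net = BowNet(vocabulary_size=5000, n_classes=3)
x = torch.randint(0, 5000, (2, 12))   # 2 documents of 12 tokens
print(net.predict_proba(x).shape)     # (2, 3)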
+ + + +
+[docs] +class LSTMnet(TextClassifierNet): + """ + An implementation of :class:`quapy.classification.neural.TextClassifierNet` based on + Long Short Term Memory networks. + + :param vocabulary_size: the size of the vocabulary + :param n_classes: number of target classes + :param embedding_size: the dimensionality of the word embeddings space (default 100) + :param hidden_size: the dimensionality of the hidden space (default 256) + :param repr_size: the dimensionality of the document embeddings space (default 100) + :param lstm_class_nlayers: number of LSTM layers (default 1) + :param drop_p: drop probability for dropout (default 0.5) + """ + + def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, lstm_class_nlayers=1, + drop_p=0.5): + + super().__init__() + self.vocabulary_size_ = vocabulary_size + self.n_classes = n_classes + self.hyperparams={ + 'embedding_size': embedding_size, + 'hidden_size': hidden_size, + 'repr_size': repr_size, + 'lstm_class_nlayers': lstm_class_nlayers, + 'drop_p': drop_p + } + + self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size) + self.lstm = torch.nn.LSTM(embedding_size, hidden_size, lstm_class_nlayers, dropout=drop_p, batch_first=True) + self.dropout = torch.nn.Dropout(drop_p) + + self.dim = repr_size + self.doc_embedder = torch.nn.Linear(hidden_size, self.dim) + self.output = torch.nn.Linear(self.dim, n_classes) + + def __init_hidden(self, set_size): + opt = self.hyperparams + var_hidden = torch.zeros(opt['lstm_class_nlayers'], set_size, opt['hidden_size']) + var_cell = torch.zeros(opt['lstm_class_nlayers'], set_size, opt['hidden_size']) + if next(self.lstm.parameters()).is_cuda: + var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda() + return var_hidden, var_cell + +
+[docs] + def document_embedding(self, x): + """Embeds documents (i.e., performs the forward pass up to the + next-to-last layer). + + :param x: a batch of instances, typically generated by a torch's `DataLoader` + instance (see :class:`quapy.classification.neural.TorchDataset`) + :return: a torch tensor of shape `(n_samples, n_dimensions)`, where + `n_samples` is the number of documents, and `n_dimensions` is the + dimensionality of the embedding + """ + embedded = self.word_embedding(x) + rnn_output, rnn_hidden = self.lstm(embedded, self.__init_hidden(x.size()[0])) + abstracted = self.dropout(F.relu(rnn_hidden[0][-1])) + abstracted = self.doc_embedder(abstracted) + return abstracted
+ + +
+[docs] + def get_params(self): + """ + Get hyper-parameters for this estimator + + :return: a dictionary with parameter names mapped to their values + """ + return self.hyperparams
+ + + @property + def vocabulary_size(self): + """ + Return the size of the vocabulary + + :return: integer + """ + return self.vocabulary_size_
+ + + +
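A quick shape check with the defaults above (note that with `lstm_class_nlayers=1` PyTorch warns that the LSTM-internal dropout has no effect; the dropout applied to the document embedding is still active):

import torch
from quapy.classification.neural import LSTMnet

net = LSTMnet(vocabulary_size=1000, n_classes=3)
x = torch.randint(0, 1000, (8, 20))      # a batch of 8 documents, 20 tokens each
print(net(x).shape)                      # torch.Size([8, 3]): decision scores
print(net.document_embedding(x).shape)   # torch.Size([8, 100]): document embeddings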
+[docs] +class CNNnet(TextClassifierNet): + """ + An implementation of :class:`quapy.classification.neural.TextClassifierNet` based on + Convolutional Neural Networks. + + :param vocabulary_size: the size of the vocabulary + :param n_classes: number of target classes + :param embedding_size: the dimensionality of the word embeddings space (default 100) + :param hidden_size: the dimensionality of the hidden space (default 256) + :param repr_size: the dimensionality of the document embeddings space (default 100) + :param kernel_heights: list of kernel lengths (default [3,5,7]), i.e., the number of + consecutive tokens that each kernel covers + :param stride: convolutional stride (default 1) + :param padding: convolutional padding (default 0) + :param drop_p: drop probability for dropout (default 0.5) + """ + + def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, + kernel_heights=[3, 5, 7], stride=1, padding=0, drop_p=0.5): + super(CNNnet, self).__init__() + + self.vocabulary_size_ = vocabulary_size + self.n_classes = n_classes + self.hyperparams={ + 'embedding_size': embedding_size, + 'hidden_size': hidden_size, + 'repr_size': repr_size, + 'kernel_heights':kernel_heights, + 'stride': stride, + 'drop_p': drop_p + } + self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size) + in_channels = 1 + self.conv1 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[0], embedding_size), stride, padding) + self.conv2 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[1], embedding_size), stride, padding) + self.conv3 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[2], embedding_size), stride, padding) + self.dropout = nn.Dropout(drop_p) + + self.dim = repr_size + self.doc_embedder = torch.nn.Linear(len(kernel_heights) * hidden_size, self.dim) + self.output = nn.Linear(self.dim, n_classes) + + def __conv_block(self, input, conv_layer): + conv_out = conv_layer(input) # conv_out.size() = (batch_size, out_channels, dim, 1) + activation = F.relu(conv_out.squeeze(3)) # activation.size() = (batch_size, out_channels, dim1) + max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # maxpool_out.size() = (batch_size, out_channels) + return max_out +
+[docs] + def document_embedding(self, input): + """Embeds documents (i.e., performs the forward pass up to the + next-to-last layer). + + :param input: a batch of instances, typically generated by a torch's `DataLoader` + instance (see :class:`quapy.classification.neural.TorchDataset`) + :return: a torch tensor of shape `(n_samples, n_dimensions)`, where + `n_samples` is the number of documents, and `n_dimensions` is the + dimensionality of the embedding + """ + input = self.word_embedding(input) + input = input.unsqueeze(1) # input.size() = (batch_size, 1, num_seq, embedding_length) + + max_out1 = self.__conv_block(input, self.conv1) + max_out2 = self.__conv_block(input, self.conv2) + max_out3 = self.__conv_block(input, self.conv3) + + all_out = torch.cat((max_out1, max_out2, max_out3), 1) # all_out.size() = (batch_size, num_kernels*out_channels) + abstracted = self.dropout(F.relu(all_out)) # (batch_size, num_kernels*out_channels) + abstracted = self.doc_embedder(abstracted) + return abstracted
+ + +
+[docs] + def get_params(self): + """ + Get hyper-parameters for this estimator + + :return: a dictionary with parameter names mapped to their values + """ + return self.hyperparams
+ + + @property + def vocabulary_size(self): + """ + Return the size of the vocabulary + + :return: integer + """ + return self.vocabulary_size_
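The analogous check for the CNN; note that the documents must span at least the largest kernel height (7 with the defaults) for all three convolutions to produce valid outputs:

import torch
from quapy.classification.neural import CNNnet

net = CNNnet(vocabulary_size=1000, n_classes=2)
x = torch.randint(0, 1000, (4, 50))  # 4 documents of 50 tokens
print(net(x).shape)                  # torch.Size([4, 2])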
+ + + + + + +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/classification/svmperf.html b/docs/build/html/_modules/quapy/classification/svmperf.html new file mode 100644 index 0000000..959ad48 --- /dev/null +++ b/docs/build/html/_modules/quapy/classification/svmperf.html @@ -0,0 +1,268 @@ + + + + + + quapy.classification.svmperf — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.classification.svmperf

+import random
+import shutil
+import subprocess
+import tempfile
+from os import remove, makedirs
+from os.path import join, exists
+from subprocess import PIPE, STDOUT
+import numpy as np
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.datasets import dump_svmlight_file
+
+
+
+[docs] +class SVMperf(BaseEstimator, ClassifierMixin): + """A wrapper for the `SVM-perf package <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`__ by Thorsten Joachims. + When using losses for quantification, the source code has to be patched. See + the `installation documentation <https://hlt-isti.github.io/QuaPy/build/html/Installation.html#svm-perf-with-quantification-oriented-losses>`__ + for further details. + + References: + + * `Esuli et al.2015 <https://dl.acm.org/doi/abs/10.1145/2700406?casa_token=8D2fHsGCVn0AAAAA:ZfThYOvrzWxMGfZYlQW_y8Cagg-o_l6X_PcF09mdETQ4Tu7jK98mxFbGSXp9ZSO14JkUIYuDGFG0>`__ + * `Barranquero et al.2015 <https://www.sciencedirect.com/science/article/abs/pii/S003132031400291X>`__ + + :param svmperf_base: path to directory containing the binary files `svm_perf_learn` and `svm_perf_classify` + :param C: trade-off between training error and margin (default 0.01) + :param verbose: set to True to print svm-perf std outputs + :param loss: the loss to optimize for. Available losses are "01", "f1", "kld", "nkld", "q", "qacc", "qf1", "qgm", "mae", "mrae". + :param host_folder: directory where to store the trained model; set to None (default) for using a tmp directory + (temporal directories are automatically deleted) + """ + + # losses with their respective codes in svm_perf implementation + valid_losses = {'01':0, 'f1':1, 'kld':12, 'nkld':13, 'q':22, 'qacc':23, 'qf1':24, 'qgm':25, 'mae':26, 'mrae':27} + + def __init__(self, svmperf_base, C=0.01, verbose=False, loss='01', host_folder=None): + assert exists(svmperf_base), f'path {svmperf_base} does not seem to point to a valid path' + self.svmperf_base = svmperf_base + self.C = C + self.verbose = verbose + self.loss = loss + self.host_folder = host_folder + + # def set_params(self, **parameters): + # """ + # Set the hyper-parameters for svm-perf. Currently, only the `C` and `loss` parameters are supported + # + # :param parameters: a `**kwargs` dictionary `{'C': <float>}` + # """ + # assert sorted(list(parameters.keys())) == ['C', 'loss'], \ + # 'currently, only the C and loss parameters are supported' + # self.C = parameters.get('C', self.C) + # self.loss = parameters.get('loss', self.loss) + # + # def get_params(self, deep=True): + # return {'C': self.C, 'loss': self.loss} + +
+[docs] + def fit(self, X, y): + """ + Trains the SVM for the multivariate performance loss + + :param X: training instances + :param y: a binary vector of labels + :return: `self` + """ + assert self.loss in SVMperf.valid_losses, \ + f'unsupported loss {self.loss}, valid ones are {list(SVMperf.valid_losses.keys())}' + + self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn') + self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify') + self.loss_cmd = '-w 3 -l ' + str(self.valid_losses[self.loss]) + self.c_cmd = '-c ' + str(self.C) + + self.classes_ = sorted(np.unique(y)) + self.n_classes_ = len(self.classes_) + + local_random = random.Random() + # this allows running parallel instances of predict + random_code = 'svmperfprocess'+'-'.join(str(local_random.randint(0, 1000000)) for _ in range(5)) + if self.host_folder is None: + # tmp dirs are removed after fit terminates in multiprocessing... + self.tmpdir = tempfile.TemporaryDirectory(suffix=random_code).name + else: + self.tmpdir = join(self.host_folder, '.' + random_code) + makedirs(self.tmpdir, exist_ok=True) + + self.model = join(self.tmpdir, 'model-'+random_code) + traindat = join(self.tmpdir, f'train-{random_code}.dat') + + dump_svmlight_file(X, y, traindat, zero_based=False) + + cmd = ' '.join([self.svmperf_learn, self.c_cmd, self.loss_cmd, traindat, self.model]) + if self.verbose: + print('[Running]', cmd) + p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT) + if not exists(self.model): + # stderr was redirected to stdout (stderr=STDOUT), so the error trace is in p.stdout + print(p.stdout.decode('utf-8')) + remove(traindat) + + if self.verbose: + print(p.stdout.decode('utf-8')) + + return self
+ + +
+[docs] + def predict(self, X): + """ + Predicts labels for the instances `X` + + :param X: array-like of shape `(n_samples, n_features)` instances to classify + :return: a `numpy` array of length `n` containing the label predictions, where `n` is the number of + instances in `X` + """ + confidence_scores = self.decision_function(X) + predictions = (confidence_scores > 0) * 1 + return predictions
+ + +
+[docs] + def decision_function(self, X, y=None): + """ + Evaluate the decision function for the samples in `X`. + + :param X: array-like of shape `(n_samples, n_features)` containing the instances to classify + :param y: unused + :return: array-like of shape `(n_samples,)` containing the decision scores of the instances + """ + assert hasattr(self, 'tmpdir'), 'predict called before fit' + assert self.tmpdir is not None, 'model directory corrupted' + assert exists(self.model), 'model not found' + if y is None: + y = np.zeros(X.shape[0]) + + # in order to allow for parallel runs of predict, a random code is assigned + local_random = random.Random() + random_code = '-'.join(str(local_random.randint(0, 1000000)) for _ in range(5)) + predictions_path = join(self.tmpdir, 'predictions' + random_code + '.dat') + testdat = join(self.tmpdir, 'test' + random_code + '.dat') + dump_svmlight_file(X, y, testdat, zero_based=False) + + cmd = ' '.join([self.svmperf_classify, testdat, self.model, predictions_path]) + if self.verbose: + print('[Running]', cmd) + p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT) + + if self.verbose: + print(p.stdout.decode('utf-8')) + + scores = np.loadtxt(predictions_path) + remove(testdat) + remove(predictions_path) + + return scores
+ + + def __del__(self): + if hasattr(self, 'tmpdir'): + shutil.rmtree(self.tmpdir, ignore_errors=True)
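A usage sketch; the `svmperf_base` path is hypothetical and must point to a compiled (and, for the quantification-oriented losses, patched) copy of SVM-perf, as described in the installation documentation:

from sklearn.datasets import make_classification
from quapy.classification.svmperf import SVMperf

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
svm = SVMperf(svmperf_base='./svm_perf_quantification', loss='kld', C=0.01)
svm.fit(X, y)
scores = svm.decision_function(X)   # real-valued margins
preds = svm.predict(X)              # binary predictions, thresholded at 0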
+ + +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/data/_ifcb.html b/docs/build/html/_modules/quapy/data/_ifcb.html new file mode 100644 index 0000000..942a5e6 --- /dev/null +++ b/docs/build/html/_modules/quapy/data/_ifcb.html @@ -0,0 +1,165 @@ + + + + + + quapy.data._ifcb — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.data._ifcb

+import os
+import pandas as pd
+from quapy.protocol import AbstractProtocol
+
+
+[docs] +class IFCBTrainSamplesFromDir(AbstractProtocol): + + def __init__(self, path_dir:str, classes: list): + self.path_dir = path_dir + self.classes = classes + self.samples = [] + for filename in os.listdir(path_dir): + if filename.endswith('.csv'): + self.samples.append(filename) + + def __call__(self): + for sample in self.samples: + s = pd.read_csv(os.path.join(self.path_dir,sample)) + # the first column holds the class label; the remaining columns are the features + X = s.iloc[:, 1:].to_numpy() + y = s.iloc[:, 0].to_numpy() + yield X, y +
+[docs] + def total(self): + """ + Returns the total number of samples that the protocol generates. + + :return: The number of training samples to generate. + """ + return len(self.samples)
+
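A sketch of how these protocols are consumed (the path and class names are hypothetical); each generated sample is a pair of instances and per-instance labels:

from quapy.data._ifcb import IFCBTrainSamplesFromDir

protocol = IFCBTrainSamplesFromDir(path_dir='ifcb/train', classes=['cls_a', 'cls_b'])
print(protocol.total())   # number of .csv samples found in the directory
for X, y in protocol():
    pass                  # X: feature matrix, y: class labels of the sample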
+ + + +
+[docs] +class IFCBTestSamples(AbstractProtocol): + + def __init__(self, path_dir:str, test_prevalences_path: str): + self.path_dir = path_dir + self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path)) + + def __call__(self): + for _, test_sample in self.test_prevalences.iterrows(): + #Load the sample from disk + X = pd.read_csv(os.path.join(self.path_dir,test_sample['sample']+'.csv')).to_numpy() + prevalences = test_sample.iloc[1:].to_numpy().astype(float) + yield X, prevalences + +
+[docs] + def total(self): + """ + Returns the total number of samples that the protocol generates. + + :return: The number of test samples to generate. + """ + return len(self.test_prevalences.index)
+
+ +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/data/_lequa2022.html b/docs/build/html/_modules/quapy/data/_lequa2022.html new file mode 100644 index 0000000..f2a8fab --- /dev/null +++ b/docs/build/html/_modules/quapy/data/_lequa2022.html @@ -0,0 +1,307 @@ + + + + + + quapy.data._lequa2022 — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.data._lequa2022

+from typing import Tuple, Union
+import pandas as pd
+import numpy as np
+import os
+
+from quapy.protocol import AbstractProtocol
+
+DEV_SAMPLES = 1000
+TEST_SAMPLES = 5000
+
+ERROR_TOL = 1E-3
+
+
+
+[docs] +def load_category_map(path): + cat2code = {} + with open(path, 'rt') as fin: + for line in fin: + category, code = line.split() + cat2code[category] = int(code) + code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x: x[1])] + return cat2code, code2cat
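The category-map file is expected to contain one `<category> <code>` pair per line; a hypothetical example:

# contents of a hypothetical 'category_map.txt':
#   negative 0
#   positive 1
cat2code, code2cat = load_category_map('category_map.txt')
# cat2code == {'negative': 0, 'positive': 1}
# code2cat == ['negative', 'positive']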
+ + + +
+[docs] +def load_raw_documents(path): + df = pd.read_csv(path) + documents = list(df["text"].values) + labels = None + if "label" in df.columns: + labels = df["label"].values.astype(int) + return documents, labels
+ + + +
+[docs] +def load_vector_documents(path): + D = pd.read_csv(path).to_numpy(dtype=float) + labelled = D.shape[1] == 301 + if labelled: + X, y = D[:, 1:], D[:, 0].astype(int).flatten() + else: + X, y = D, None + return X, y
+ + + +
+[docs] +class SamplesFromDir(AbstractProtocol): + + def __init__(self, path_dir:str, ground_truth_path:str, load_fn): + self.path_dir = path_dir + self.load_fn = load_fn + self.true_prevs = ResultSubmission.load(ground_truth_path) + + def __call__(self): + for id, prevalence in self.true_prevs.iterrows(): + sample, _ = self.load_fn(os.path.join(self.path_dir, f'{id}.txt')) + yield sample, prevalence
+ + + +
+[docs] +class ResultSubmission: + + def __init__(self): + self.df = None + + def __init_df(self, categories: int): + if not isinstance(categories, int) or categories < 2: + raise TypeError('wrong format for categories: an int (>=2) was expected') + df = pd.DataFrame(columns=list(range(categories))) + df.index.set_names('id', inplace=True) + self.df = df + + @property + def n_categories(self): + return len(self.df.columns.values) + +
+[docs] + def add(self, sample_id: int, prevalence_values: np.ndarray): + if not isinstance(sample_id, int): + raise TypeError(f'error: expected int for sample_id, found {type(sample_id)}') + if not isinstance(prevalence_values, np.ndarray): + raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}') + if self.df is None: + self.__init_df(categories=len(prevalence_values)) + if sample_id in self.df.index.values: + raise ValueError(f'error: prevalence values for "{sample_id}" already added') + if prevalence_values.ndim != 1 or prevalence_values.size != self.n_categories: + raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}') + if (prevalence_values < 0).any() or (prevalence_values > 1).any(): + raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"') + if np.abs(prevalence_values.sum() - 1) > ERROR_TOL: + raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}" ' + f'(error tolerance {ERROR_TOL})') + + self.df.loc[sample_id] = prevalence_values
+ + + def __len__(self): + return len(self.df) + +
+[docs] + @classmethod + def load(cls, path: str) -> 'ResultSubmission': + df = ResultSubmission.check_file_format(path) + r = ResultSubmission() + r.df = df + return r
+ + +
+[docs] + def dump(self, path: str): + ResultSubmission.check_dataframe_format(self.df) + self.df.to_csv(path)
+ + +
+[docs] + def prevalence(self, sample_id: int): + sel = self.df.loc[sample_id] + if sel.empty: + return None + else: + return sel.values.flatten()
+ + +
+[docs] + def iterrows(self): + for index, row in self.df.iterrows(): + prevalence = row.values.flatten() + yield index, prevalence
+ + +
+[docs] + @classmethod + def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: + try: + df = pd.read_csv(path, index_col=0) + except Exception as e: + print(f'the file {path} does not seem to be a valid csv file. ') + print(e) + raise + return ResultSubmission.check_dataframe_format(df, path=path)
+ + +
+[docs] + @classmethod + def check_dataframe_format(cls, df, path=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: + hint_path = '' # if given, show the data path in the error message + if path is not None: + hint_path = f' in {path}' + + if df.index.name != 'id' or len(df.columns) < 2: + raise ValueError(f'wrong header{hint_path}, ' + f'the format of the header should be "id,0,...,n-1", ' + f'where n is the number of categories') + if [int(ci) for ci in df.columns.values] != list(range(len(df.columns))): + raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n-1, ' + f'where n is the number of categories') + if df.empty: + raise ValueError(f'error{hint_path}: results file is empty') + elif len(df) != DEV_SAMPLES and len(df) != TEST_SAMPLES: + raise ValueError(f'wrong number of prevalence values found{hint_path}; ' + f'expected {DEV_SAMPLES} for development sets and ' + f'{TEST_SAMPLES} for test sets; found {len(df)}') + + ids = set(df.index.values) + expected_ids = set(range(len(df))) + if ids != expected_ids: + missing = expected_ids - ids + if missing: + raise ValueError(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}') + unexpected = ids - expected_ids + if unexpected: + raise ValueError(f'there are {len(unexpected)} unexpected ids{hint_path}: {sorted(unexpected)}') + + for category_id in df.columns: + if (df[category_id] < 0).any() or (df[category_id] > 1).any(): + raise ValueError(f'error{hint_path} column "{category_id}" contains values out of range [0,1]') + + prevs = df.values + round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ERROR_TOL + if round_errors.any(): + raise ValueError(f'error: prevalence values in rows with id {np.where(round_errors)[0].tolist()} ' + f'do not sum up to 1 (error tolerance {ERROR_TOL}), ' + f'probably due to some rounding errors.') + + return df
+
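A sketch of assembling a (dummy) submission; note that `dump` validates the frame, so a complete development submission must contain exactly `DEV_SAMPLES` rows:

import numpy as np
from quapy.data._lequa2022 import ResultSubmission, DEV_SAMPLES

submission = ResultSubmission()
for sample_id in range(DEV_SAMPLES):
    submission.add(sample_id, np.asarray([0.25, 0.75]))  # dummy binary prevalences
submission.dump('submission.csv')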
+ + + +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/data/base.html b/docs/build/html/_modules/quapy/data/base.html new file mode 100644 index 0000000..e3a2e89 --- /dev/null +++ b/docs/build/html/_modules/quapy/data/base.html @@ -0,0 +1,728 @@ + + + + + + quapy.data.base — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.data.base

+import itertools
+from functools import cached_property
+from typing import Iterable
+
+import numpy as np
+from scipy.sparse import issparse
+from scipy.sparse import vstack
+from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
+from numpy.random import RandomState
+from quapy.functional import strprev
+from quapy.util import temp_seed
+
+
+
+[docs] +class LabelledCollection: + """ + A LabelledCollection is a set of objects, each with a label attached to it. + This class implements several sampling routines and other utilities. + + :param instances: array-like (np.ndarray, list, or csr_matrix are supported) + :param labels: array-like with the same length of instances + :param classes: optional, list of classes from which labels are taken. If not specified, the classes are inferred + from the labels. The classes must be indicated in cases in which some of the labels might have no examples + (i.e., a prevalence of 0) + """ + + def __init__(self, instances, labels, classes=None): + if issparse(instances): + self.instances = instances + elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str): + # lists of strings occupy too much memory as ndarrays (although object dtype also adds a noticeable overhead) + self.instances = np.asarray(instances, dtype=object) + else: + self.instances = np.asarray(instances) + self.labels = np.asarray(labels) + n_docs = len(self) + if classes is None: + self.classes_ = np.unique(self.labels) + self.classes_.sort() + else: + self.classes_ = np.unique(np.asarray(classes)) + self.classes_.sort() + if len(set(self.labels).difference(set(classes))) > 0: + raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})') + self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} +
+[docs] + @classmethod + def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs): + """ + Loads a labelled set of data and convert it into a :class:`LabelledCollection` instance. The function in charge + of reading the instances must be specified. This function can be a custom one, or any of the reading functions + defined in :mod:`quapy.data.reader` module. + + :param path: string, the path to the file containing the labelled instances + :param loader_func: a custom function that implements the data loader and returns a tuple with instances and + labels + :param classes: array-like, the classes according to which the instances are labelled + :param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances, i.e., + these arguments are used to call `loader_func(path, **loader_kwargs)` + :return: a :class:`LabelledCollection` object + """ + return LabelledCollection(*loader_func(path, **loader_kwargs), classes)
+ + + def __len__(self): + """ + Returns the length of this collection (number of labelled instances) + + :return: integer + """ + return self.instances.shape[0] + +
+[docs] + def prevalence(self): + """ + Returns the prevalence, or relative frequency, of the classes in the codeframe. + + :return: a np.ndarray of shape `(n_classes)` with the relative frequencies of each class, in the same order + as listed by `self.classes_` + """ + return self.counts() / len(self)
+ + +
+[docs] + def counts(self): + """ + Returns the number of instances for each of the classes in the codeframe. + + :return: a np.ndarray of shape `(n_classes)` with the number of instances of each class, in the same order + as listed by `self.classes_` + """ + return np.asarray([len(self.index[class_]) for class_ in self.classes_])
+ + + @property + def n_classes(self): + """ + The number of classes + + :return: integer + """ + return len(self.classes_) + + @property + def binary(self): + """ + Returns True if the number of classes is 2 + + :return: boolean + """ + return self.n_classes == 2 + +
+[docs] + def sampling_index(self, size, *prevs, shuffle=True, random_state=None): + """ + Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the + prevalence values are not specified, then returns the index of a uniform sampling. + For each class, the sampling is drawn with replacement if the requested prevalence is larger than + the actual prevalence of the class, or without replacement otherwise. + + :param size: integer, the requested size + :param prevs: the prevalence for each class; the prevalence value for the last class can be left empty since + it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in + `self.classes_`) can be specified, while the other class takes prevalence value `1-p` + :param shuffle: if set to True (default), shuffles the index before returning it + :param random_state: seed for reproducing sampling + :return: a np.ndarray of shape `(size)` with the indexes + """ + if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling + return self.uniform_sampling_index(size, random_state=random_state) + if len(prevs) == self.n_classes - 1: + prevs = prevs + (1 - sum(prevs),) + assert len(prevs) == self.n_classes, 'unexpected number of prevalences' + assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' + + # Decide how many instances should be taken for each class in order to satisfy the requested prevalence + # accurately, and the number of instances in the sample (exactly). If int(size * prevs[i]) (which is + # <= size * prevs[i]) examples are drawn from class i, there could be a remainder number of instances to take + # to satisfy the size constraint. The remainder is distributed along the classes with probability = prevs. + # (This aims at avoiding the remainder being placed in a class for which the prevalence requested is 0.) + n_requests = {class_: round(size * prevs[i]) for i, class_ in enumerate(self.classes_)} + remainder = size - sum(n_requests.values()) + with temp_seed(random_state): + # due to rounding, the remainder can be 0, >0, or <0 + if remainder > 0: + # when the remainder is >0 we randomly add 1 to the requests for each class; + # more prevalent classes are more likely to be taken in order to minimize the impact in the final prevalence + for rand_class in np.random.choice(self.classes_, size=remainder, p=prevs): + n_requests[rand_class] += 1 + elif remainder < 0: + # when the remainder is <0 we randomly remove 1 from the requests, unless the request is 0 for a chosen + # class; we repeat until remainder==0 + while remainder!=0: + rand_class = np.random.choice(self.classes_, p=prevs) + if n_requests[rand_class] > 0: + n_requests[rand_class] -= 1 + remainder += 1 + + indexes_sample = [] + for class_, n_requested in n_requests.items(): + n_candidates = len(self.index[class_]) + index_sample = self.index[class_][ + np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates)) + ] if n_requested > 0 else [] + + indexes_sample.append(index_sample) + + indexes_sample = np.concatenate(indexes_sample).astype(int) + + if shuffle: + indexes_sample = np.random.permutation(indexes_sample) + + return indexes_sample
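For instance, drawing a sample of 100 instances at prevalence [0.8, 0.2] from a toy binary collection (the prevalence of the last class is left implicit):

import numpy as np
from quapy.data.base import LabelledCollection

X = np.random.rand(1000, 5)
y = np.random.randint(0, 2, size=1000)
lc = LabelledCollection(X, y)
idx = lc.sampling_index(100, 0.8, random_state=0)  # class 1 gets 1-0.8=0.2
sample = lc.sampling_from_index(idx)
print(sample.prevalence())  # [0.8 0.2]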
+ + +
+[docs] + def uniform_sampling_index(self, size, random_state=None): + """ + Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn + with replacement if the requested size is greater than the number of instances, or without replacement + otherwise. + + :param size: integer, the size of the uniform sample + :param random_state: if specified, guarantees reproducibility of the split. + :return: a np.ndarray of shape `(size)` with the indexes + """ + if random_state is not None: + ng = RandomState(seed=random_state) + else: + ng = np.random + return ng.choice(len(self), size, replace=size > len(self))
+ + +
+[docs] + def sampling(self, size, *prevs, shuffle=True, random_state=None): + """ + Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence + values. For each class, the sampling is drawn with replacement if the requested prevalence is larger than + the actual prevalence of the class, or without replacement otherwise. + + :param size: integer, the requested size + :param prevs: the prevalence for each class; the prevalence value for the last class can be left empty since + it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in + `self.classes_`) can be specified, while the other class takes prevalence value `1-p` + :param shuffle: if set to True (default), shuffles the index before returning it + :param random_state: seed for reproducing sampling + :return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or + prevalence == `prevs` if the exact prevalence values can be met as proportions of instances) + """ + prev_index = self.sampling_index(size, *prevs, shuffle=shuffle, random_state=random_state) + return self.sampling_from_index(prev_index)
+ + +
+[docs] + def uniform_sampling(self, size, random_state=None): + """ + Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn + with replacement if the requested size is greater than the number of instances, or without replacement + otherwise. + + :param size: integer, the requested size + :param random_state: if specified, guarantees reproducibility of the split. + :return: an instance of :class:`LabelledCollection` with length == `size` + """ + unif_index = self.uniform_sampling_index(size, random_state=random_state) + return self.sampling_from_index(unif_index)
+ + +
+[docs] + def sampling_from_index(self, index): + """ + Returns an instance of :class:`LabelledCollection` whose elements are sampled from this collection using the + index. + + :param index: np.ndarray + :return: an instance of :class:`LabelledCollection` + """ + documents = self.instances[index] + labels = self.labels[index] + return LabelledCollection(documents, labels, classes=self.classes_)
+ + +
+[docs] + def split_stratified(self, train_prop=0.6, random_state=None): + """ + Returns two instances of :class:`LabelledCollection` split with stratification from this collection, at desired + proportion. + + :param train_prop: the proportion of elements to include in the left-most returned collection (typically used + as the training collection). The rest of elements are included in the right-most returned collection + (typically used as a test collection). + :param random_state: if specified, guarantees reproducibility of the split. + :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the + second one with `1-train_prop` elements + """ + tr_docs, te_docs, tr_labels, te_labels = train_test_split( + self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state + ) + training = LabelledCollection(tr_docs, tr_labels, classes=self.classes_) + test = LabelledCollection(te_docs, te_labels, classes=self.classes_) + return training, test
+ + +
+[docs] + def split_random(self, train_prop=0.6, random_state=None): + """ + Returns two instances of :class:`LabelledCollection` split randomly from this collection, at desired + proportion. + + :param train_prop: the proportion of elements to include in the left-most returned collection (typically used + as the training collection). The rest of elements are included in the right-most returned collection + (typically used as a test collection). + :param random_state: if specified, guarantees reproducibility of the split. + :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the + second one with `1-train_prop` elements + """ + indexes = np.random.RandomState(seed=random_state).permutation(len(self)) + if isinstance(train_prop, int): + assert train_prop < len(self), \ + 'argument train_prop cannot be greater than the number of elements in the collection' + splitpoint = train_prop + elif isinstance(train_prop, float): + assert 0 < train_prop < 1, \ + 'argument train_prop out of range (0,1)' + splitpoint = int(np.round(len(self)*train_prop)) + left, right = indexes[:splitpoint], indexes[splitpoint:] + training = self.sampling_from_index(left) + test = self.sampling_from_index(right) + return training, test
+ + + def __add__(self, other): + """ + Returns a new :class:`LabelledCollection` as the union of this collection with another collection. + Both labelled collections must have the same classes. + + :param other: another :class:`LabelledCollection` + :return: a :class:`LabelledCollection` representing the union of both collections + """ + if not all(np.sort(self.classes_)==np.sort(other.classes_)): + raise NotImplementedError(f'unsupported operation for collections on different classes; ' + f'expected {self.classes_}, found {other.classes_}') + return LabelledCollection.join(self, other) + +
+[docs] + @classmethod + def join(cls, *args: Iterable['LabelledCollection']): + """ + Returns a new :class:`LabelledCollection` as the union of the collections given in input. + + :param args: instances of :class:`LabelledCollection` + :return: a :class:`LabelledCollection` representing the union of all the given collections + """ + + args = [lc for lc in args if lc is not None] + assert len(args) > 0, 'empty list is not allowed for join' + + assert all([isinstance(lc, LabelledCollection) for lc in args]), \ + 'only instances of LabelledCollection allowed' + + first_instances = args[0].instances + first_type = type(first_instances) + assert all([type(lc.instances)==first_type for lc in args[1:]]), \ + 'not all the collections are of instances of the same type' + + if issparse(first_instances) or isinstance(first_instances, np.ndarray): + first_ndim = first_instances.ndim + assert all([lc.instances.ndim == first_ndim for lc in args[1:]]), \ + 'not all the ndarrays are of the same dimension' + if first_ndim > 1: + first_shape = first_instances.shape[1:] + assert all([lc.instances.shape[1:] == first_shape for lc in args[1:]]), \ + 'not all the ndarrays are of the same shape' + if issparse(first_instances): + instances = vstack([lc.instances for lc in args]) + else: + instances = np.concatenate([lc.instances for lc in args]) + elif isinstance(first_instances, list): + instances = list(itertools.chain.from_iterable(lc.instances for lc in args)) + else: + raise NotImplementedError('unsupported operation for collection types') + labels = np.concatenate([lc.labels for lc in args]) + classes = np.unique(labels) # np.unique already returns the classes sorted + return LabelledCollection(instances, labels, classes=classes)
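A small example of merging compatible collections:

import numpy as np
from quapy.data.base import LabelledCollection

lc_a = LabelledCollection(np.asarray([[0., 1.], [1., 0.]]), [0, 1])
lc_b = LabelledCollection(np.asarray([[.5, .5]]), [1])
merged = LabelledCollection.join(lc_a, lc_b)
print(len(merged), merged.classes_)  # 3 [0 1]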
+ + + @property + def Xy(self): + """ + Gets the instances and labels. This is useful when working with `sklearn` estimators, e.g.: + + >>> svm = LinearSVC().fit(*my_collection.Xy) + + :return: a tuple `(instances, labels)` from this collection + """ + return self.instances, self.labels + + @property + def Xp(self): + """ + Gets the instances and the true prevalence. This is useful when implementing evaluation protocols from + a :class:`LabelledCollection` object. + + :return: a tuple `(instances, prevalence)` from this collection + """ + return self.instances, self.prevalence() + + @property + def X(self): + """ + An alias to self.instances + + :return: self.instances + """ + return self.instances + + @property + def y(self): + """ + An alias to self.labels + + :return: self.labels + """ + return self.labels + + @property + def p(self): + """ + An alias to self.prevalence() + + :return: self.prevalence() + """ + return self.prevalence() + + +
+[docs] + def stats(self, show=True): + """ + Returns (and optionally prints) a dictionary with some stats of this collection. E.g.,: + + >>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5) + >>> data.training.stats() + >>> #instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919] + + :param show: if set to True (default), prints the stats in standard output + :return: a dictionary containing some stats of this collection. Keys include `#instances` (the number of + instances), `type` (the type representing the instances), `#features` (the number of features, if the + instances are in array-like format), `#classes` (the classes of the collection), `prevs` (the prevalence + values for each class) + """ + ninstances = len(self) + instance_type = type(self.instances[0]) + if instance_type == list: + nfeats = len(self.instances[0]) + elif instance_type == np.ndarray or issparse(self.instances): + nfeats = self.instances.shape[1] + else: + nfeats = '?' + stats_ = {'instances': ninstances, + 'type': instance_type, + 'features': nfeats, + 'classes': self.classes_, + 'prevs': strprev(self.prevalence())} + if show: + print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, ' + f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}') + return stats_
+ + +
+[docs] + def kFCV(self, nfolds=5, nrepeats=1, random_state=None): + """ + Generator of stratified folds to be used in k-fold cross validation. + + :param nfolds: integer (default 5), the number of folds to generate + :param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run + :param random_state: integer (default None); if specified, guarantees that the folds generated are reproducible + :return: yields `nfolds * nrepeats` folds for k-fold cross validation + """ + kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state) + for train_index, test_index in kf.split(*self.Xy): + train = self.sampling_from_index(train_index) + test = self.sampling_from_index(test_index) + yield train, test
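Continuing the toy collection `lc` from the sampling example above, a typical cross-validation loop looks as follows:

for train, test in lc.kFCV(nfolds=5, nrepeats=1, random_state=0):
    print(f'{len(train)} training / {len(test)} test instances')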
+
+ + + +
+[docs] +class Dataset: + """ + Abstraction of training and test :class:`LabelledCollection` objects. + + :param training: a :class:`LabelledCollection` instance + :param test: a :class:`LabelledCollection` instance + :param vocabulary: if indicated, is a dictionary of the terms used in this textual dataset + :param name: a string representing the name of the dataset + """ + + def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''): + assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections' + self.training = training + self.test = test + self.vocabulary = vocabulary + self.name = name + +
+[docs] + @classmethod + def SplitStratified(cls, collection: LabelledCollection, train_size=0.6): + """ + Generates a :class:`Dataset` from a stratified split of a :class:`LabelledCollection` instance. + See :meth:`LabelledCollection.split_stratified` + + :param collection: :class:`LabelledCollection` + :param train_size: the proportion of training documents (the rest forms the test split) + :return: an instance of :class:`Dataset` + """ + return Dataset(*collection.split_stratified(train_prop=train_size))
+ + + @property + def classes_(self): + """ + The classes according to which the training collection is labelled + + :return: The classes according to which the training collection is labelled + """ + return self.training.classes_ + + @property + def n_classes(self): + """ + The number of classes according to which the training collection is labelled + + :return: integer + """ + return self.training.n_classes + + @property + def binary(self): + """ + Returns True if the training collection is labelled according to two classes + + :return: boolean + """ + return self.training.binary + +
+[docs] + @classmethod + def load(cls, train_path, test_path, loader_func: callable, classes=None, **loader_kwargs): + """ + Loads a training and a test labelled set of data and convert it into a :class:`Dataset` instance. + The function in charge of reading the instances must be specified. This function can be a custom one, or any of + the reading functions defined in :mod:`quapy.data.reader` module. + + :param train_path: string, the path to the file containing the training instances + :param test_path: string, the path to the file containing the test instances + :param loader_func: a custom function that implements the data loader and returns a tuple with instances and + labels + :param classes: array-like, the classes according to which the instances are labelled + :param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances. + See :meth:`LabelledCollection.load` for further details. + :return: a :class:`Dataset` object + """ + + training = LabelledCollection.load(train_path, loader_func, classes, **loader_kwargs) + test = LabelledCollection.load(test_path, loader_func, classes, **loader_kwargs) + return Dataset(training, test)
+ + + @property + def vocabulary_size(self): + """ + If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary + + :return: integer + """ + return len(self.vocabulary) + + @property + def train_test(self): + """ + Alias to `self.training` and `self.test` + + :return: the training and test collections + :return: the training and test collections + """ + return self.training, self.test + +
+[docs] + def stats(self, show=True): + """ + Returns (and optionally prints) a dictionary with some stats of this dataset. E.g.,: + + >>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5) + >>> data.stats() + >>> Dataset=kindle #tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937] + + :param show: if set to True (default), prints the stats in standard output + :return: a dictionary containing some stats of this collection for the training and test collections. The keys + are `train` and `test`, and point to dedicated dictionaries of stats, for each collection, with keys + `#instances` (the number of instances), `type` (the type representing the instances), + `#features` (the number of features, if the instances are in array-like format), `#classes` (the classes of + the collection), `prevs` (the prevalence values for each class) + """ + tr_stats = self.training.stats(show=False) + te_stats = self.test.stats(show=False) + if show: + print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, ' + f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, ' + f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}') + return {'train': tr_stats, 'test': te_stats}
+ + +
+[docs] + @classmethod + def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0): + """ + Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around + :meth:`LabelledCollection.kFCV` that returns :class:`Dataset` instances made of training and test folds. + + :param data: the :class:`LabelledCollection` to be split into folds + :param nfolds: integer (default 5), the number of folds to generate + :param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run + :param random_state: integer (default 0), guarantees that the folds generated are reproducible + :return: yields `nfolds * nrepeats` folds for k-fold cross validation as instances of :class:`Dataset` + """ + for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)): + yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
+ + + +
+[docs] + def reduce(self, n_train=100, n_test=100): + """ + Reduce the number of instances in place for quick experiments. Preserves the prevalence of each set. + + :param n_train: number of training documents to keep (default 100) + :param n_test: number of test documents to keep (default 100) + :return: self + """ + self.training = self.training.sampling(n_train, *self.training.prevalence()) + self.test = self.test.sampling(n_test, *self.test.prevalence()) + return self
+
+ +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/data/datasets.html b/docs/build/html/_modules/quapy/data/datasets.html new file mode 100644 index 0000000..b910036 --- /dev/null +++ b/docs/build/html/_modules/quapy/data/datasets.html @@ -0,0 +1,919 @@ + + + + + + quapy.data.datasets — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.data.datasets

+
[docs]def warn(*args, **kwargs): + pass
+import warnings +warnings.warn = warn +import os +import zipfile +from os.path import join +import pandas as pd +from ucimlrepo import fetch_ucirepo +from quapy.data.base import Dataset, LabelledCollection +from quapy.data.preprocessing import text2tfidf, reduce_columns +from quapy.data.reader import * +from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource + + +REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb'] +TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders', + 'semeval13', 'semeval14', 'semeval15', 'semeval16', + 'sst', 'wa', 'wb'] +TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders', + 'semeval', 'semeval16', + 'sst', 'wa', 'wb'] +UCI_BINARY_DATASETS = ['acute.a', 'acute.b', + 'balance.1', 'balance.2', 'balance.3', + 'breast-cancer', + 'cmc.1', 'cmc.2', 'cmc.3', + 'ctg.1', 'ctg.2', 'ctg.3', + #'diabetes', # <-- I haven't found this one... + 'german', + 'haberman', + 'ionosphere', + 'iris.1', 'iris.2', 'iris.3', + 'mammographic', + 'pageblocks.5', + #'phoneme', # <-- I haven't found this one... + 'semeion', + 'sonar', + 'spambase', + 'spectf', + 'tictactoe', + 'transfusion', + 'wdbc', + 'wine.1', 'wine.2', 'wine.3', + 'wine-q-red', 'wine-q-white', + 'yeast'] + +UCI_MULTICLASS_DATASETS = ['dry-bean', + 'wine-quality', + 'academic-success', + 'digits', + 'letter'] + +LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B'] + +_TXA_SAMPLE_SIZE = 250 +_TXB_SAMPLE_SIZE = 1000 + +LEQUA2022_SAMPLE_SIZE = { + 'TXA': _TXA_SAMPLE_SIZE, + 'TXB': _TXB_SAMPLE_SIZE, + 'T1A': _TXA_SAMPLE_SIZE, + 'T1B': _TXB_SAMPLE_SIZE, + 'T2A': _TXA_SAMPLE_SIZE, + 'T2B': _TXB_SAMPLE_SIZE, + 'binary': _TXA_SAMPLE_SIZE, + 'multiclass': _TXB_SAMPLE_SIZE +} + + +
[docs]def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset: + """ + Loads a Reviews dataset as a Dataset instance, as used in + `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification." + Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_. + The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS` + + :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb' + :param tfidf: set to True to transform the raw documents into tfidf weighted matrices + :param min_df: minimum number of documents that should contain a term in order for the term to be + kept (ignored if tfidf==False) + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quapy_data/ directory) + :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for + faster subsequent invocations + :return: a :class:`quapy.data.base.Dataset` instance + """ + assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \ + f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \ + f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}' + if data_home is None: + data_home = get_quapy_home() + + URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt' + URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt' + os.makedirs(join(data_home, 'reviews'), exist_ok=True) + train_path = join(data_home, 'reviews', dataset_name, 'train.txt') + test_path = join(data_home, 'reviews', dataset_name, 'test.txt') + download_file_if_not_exists(URL_TRAIN, train_path) + download_file_if_not_exists(URL_TEST, test_path) + + pickle_path = None + if pickle: + pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl') + data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text) + + if tfidf: + text2tfidf(data, inplace=True) + if min_df is not None: + reduce_columns(data, min_df=min_df, inplace=True) + + data.name = dataset_name + + return data
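A typical invocation, mirroring the examples already shown in the docstrings above:

import quapy as qp

data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5, pickle=True)
train, test = data.train_test
data.stats()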
+ + +
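A minimal usage sketch for the loader above (assuming network access on the first call; names as defined in this module):

>>> from quapy.data.datasets import fetch_reviews
>>> data = fetch_reviews('imdb', tfidf=True, min_df=5, pickle=True)
>>> train, test = data.training, data.test
>>> train.stats()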
[docs]def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
+    """
+    Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
+    `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+    Social Network Analysis and Mining 6(19), 1-22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
+    Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
+    The list of valid dataset names corresponding to training sets can be accessed in
+    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
+    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`
+
+    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
+        'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
+    :param for_model_selection: if True, then returns the train split as the training set and the devel split
+        as the test set; if False, then returns the train+devel split as the training set and the test set as the
+        test set
+    :param min_df: minimum number of documents that should contain a term in order for the term to be kept
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
+        faster subsequent invocations
+    :return: a :class:`quapy.data.base.Dataset` instance
+    """
+    assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
+        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
+        f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
+        f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval13", "semeval14", "semeval15" share ' \
+        f'a common training set "semeval")'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
+    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
+    if not os.path.exists(unzipped_path):
+        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
+        download_file(URL, downloaded_path)
+        with zipfile.ZipFile(downloaded_path) as file:
+            file.extractall(data_home)
+        os.remove(downloaded_path)
+
+    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
+        trainset_name = 'semeval'
+        testset_name = 'semeval' if for_model_selection else dataset_name
+        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
+              f"(called 'semeval'); returning training-set='{trainset_name}' and test-set='{testset_name}'")
+    else:
+        if dataset_name == 'semeval' and not for_model_selection:
+            raise ValueError('dataset "semeval" can only be used for model selection. '
+                             'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
+        trainset_name = testset_name = dataset_name
+
+    if for_model_selection:
+        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
+        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
+    else:
+        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
+        if dataset_name == 'semeval16':  # there is a different test name in the case of semeval16 only
+            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
+        else:
+            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
+
+    pickle_path = None
+    if pickle:
+        mode = "train-dev" if for_model_selection else "train+dev-test"
+        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
+    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)
+
+    if min_df is not None:
+        reduce_columns(data, min_df=min_df, inplace=True)
+
+    data.name = dataset_name
+
+    return data
+ + +
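A minimal usage sketch (the `for_model_selection` flag toggles between the dev split and the test split, as documented above):

>>> from quapy.data.datasets import fetch_twitter
>>> data = fetch_twitter('semeval16', for_model_selection=False, min_df=3, pickle=True)
>>> data.training.stats()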
[docs]def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
+    """
+    Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
+    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
+    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
+    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
+    and
+    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
+    Dynamic ensemble selection for quantification tasks.
+    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
+    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCIBinaryLabelledCollection` for
+    further information on how to use these collections), and so a train-test split is generated at the desired
+    proportion.
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS`
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param test_split: proportion of documents to be included in the test set. The rest makes up the training set
+    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
+    :return: a :class:`quapy.data.base.Dataset` instance
+    """
+    data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose)
+    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+ + +
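A minimal usage sketch (the `train_test` property of :class:`quapy.data.base.Dataset` is the same one used in the multiclass examples below):

>>> from quapy.data.datasets import fetch_UCIBinaryDataset
>>> data = fetch_UCIBinaryDataset('yeast', test_split=0.3)
>>> train, test = data.train_test
>>> train.stats()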
[docs]def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
+    """
+    Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
+    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
+    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
+    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
+    and
+    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
+    Dynamic ensemble selection for quantification tasks.
+    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
+    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2
+    evaluation protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5-fold
+    cross-validation. This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
+
+    >>> import quapy as qp
+    >>> collection = qp.datasets.fetch_UCIBinaryLabelledCollection("yeast")
+    >>> for data in qp.data.base.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
+    >>>     ...
+
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS`
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
+    :return: a :class:`quapy.data.base.LabelledCollection` instance
+    """
+
+    assert dataset_name in UCI_BINARY_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
+        f'Valid ones are {UCI_BINARY_DATASETS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    dataset_fullname = {
+        'acute.a': 'Acute Inflammations (urinary bladder)',
+        'acute.b': 'Acute Inflammations (renal pelvis)',
+        'balance.1': 'Balance Scale Weight & Distance Database (left)',
+        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
+        'balance.3': 'Balance Scale Weight & Distance Database (right)',
+        'breast-cancer': 'Breast Cancer Wisconsin (Original)',
+        'cmc.1': 'Contraceptive Method Choice (no use)',
+        'cmc.2': 'Contraceptive Method Choice (long term)',
+        'cmc.3': 'Contraceptive Method Choice (short term)',
+        'ctg.1': 'Cardiotocography Data Set (normal)',
+        'ctg.2': 'Cardiotocography Data Set (suspect)',
+        'ctg.3': 'Cardiotocography Data Set (pathologic)',
+        'german': 'Statlog German Credit Data',
+        'haberman': "Haberman's Survival Data",
+        'ionosphere': 'Johns Hopkins University Ionosphere DB',
+        'iris.1': 'Iris Plants Database (setosa)',
+        'iris.2': 'Iris Plants Database (versicolour)',
+        'iris.3': 'Iris Plants Database (virginica)',
+        'mammographic': 'Mammographic Mass',
+        'pageblocks.5': 'Page Blocks Classification (5)',
+        'semeion': 'Semeion Handwritten Digit (8)',
+        'sonar': 'Sonar, Mines vs. 
Rocks', + 'spambase': 'Spambase Data Set', + 'spectf': 'SPECTF Heart Data', + 'tictactoe': 'Tic-Tac-Toe Endgame Database', + 'transfusion': 'Blood Transfusion Service Center Data Set', + 'wdbc': 'Wisconsin Diagnostic Breast Cancer', + 'wine.1': 'Wine Recognition Data (1)', + 'wine.2': 'Wine Recognition Data (2)', + 'wine.3': 'Wine Recognition Data (3)', + 'wine-q-red': 'Wine Quality Red (6-10)', + 'wine-q-white': 'Wine Quality White (6-10)', + 'yeast': 'Yeast', + } + + # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use + # to download the raw dataset + identifier_map = { + 'acute.a': 'acute', + 'acute.b': 'acute', + 'balance.1': 'balance-scale', + 'balance.2': 'balance-scale', + 'balance.3': 'balance-scale', + 'breast-cancer': 'breast-cancer-wisconsin', + 'cmc.1': 'cmc', + 'cmc.2': 'cmc', + 'cmc.3': 'cmc', + 'ctg.1': '00193', + 'ctg.2': '00193', + 'ctg.3': '00193', + 'german': 'statlog/german', + 'haberman': 'haberman', + 'ionosphere': 'ionosphere', + 'iris.1': 'iris', + 'iris.2': 'iris', + 'iris.3': 'iris', + 'mammographic': 'mammographic-masses', + 'pageblocks.5': 'page-blocks', + 'semeion': 'semeion', + 'sonar': 'undocumented/connectionist-bench/sonar', + 'spambase': 'spambase', + 'spectf': 'spect', + 'tictactoe': 'tic-tac-toe', + 'transfusion': 'blood-transfusion', + 'wdbc': 'breast-cancer-wisconsin', + 'wine-q-red': 'wine-quality', + 'wine-q-white': 'wine-quality', + 'wine.1': 'wine', + 'wine.2': 'wine', + 'wine.3': 'wine', + 'yeast': 'yeast', + } + + # the filename is the name of the file within the data_folder indexed by the identifier + file_name = { + 'acute': 'diagnosis.data', + '00193': 'CTG.xls', + 'statlog/german': 'german.data-numeric', + 'mammographic-masses': 'mammographic_masses.data', + 'page-blocks': 'page-blocks.data.Z', + 'undocumented/connectionist-bench/sonar': 'sonar.all-data', + 'spect': ['SPECTF.train', 'SPECTF.test'], + 'blood-transfusion': 'transfusion.data', + 'wine-quality': ['winequality-red.csv', 'winequality-white.csv'], + 'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data' + } + + # the filename containing the dataset description (if any) + desc_name = { + 'acute': 'diagnosis.names', + '00193': None, + 'statlog/german': 'german.doc', + 'mammographic-masses': 'mammographic_masses.names', + 'undocumented/connectionist-bench/sonar': 'sonar.names', + 'spect': 'SPECTF.names', + 'blood-transfusion': 'transfusion.names', + 'wine-quality': 'winequality.names', + 'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names' + } + + identifier = identifier_map[dataset_name] + filename = file_name.get(identifier, f'{identifier}.data') + descfile = desc_name.get(identifier, f'{identifier}.names') + fullname = dataset_fullname[dataset_name] + + URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}' + data_dir = join(data_home, 'uci_datasets', identifier) + if isinstance(filename, str): # filename could be a list of files, in which case it will be processed later + data_path = join(data_dir, filename) + download_file_if_not_exists(f'{URL}/{filename}', data_path) + + if descfile: + try: + download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}') + if verbose: + print(open(f'{data_dir}/{descfile}', 'rt').read()) + except Exception: + print('could not read the description file') + elif verbose: + print('no file description available') + + if verbose: + print(f'Loading 
{dataset_name} ({fullname})')
+    if identifier == 'acute':
+        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
+
+        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
+        for col in range(1, 6):
+            _df_replace(df, col)
+        X = df.loc[:, 0:5].values
+        if dataset_name == 'acute.a':
+            y = binarize(df[6], pos_class='yes')
+        elif dataset_name == 'acute.b':
+            y = binarize(df[7], pos_class='yes')
+
+    if identifier == 'balance-scale':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        if dataset_name == 'balance.1':
+            y = binarize(df[0], pos_class='L')
+        elif dataset_name == 'balance.2':
+            y = binarize(df[0], pos_class='B')
+        elif dataset_name == 'balance.3':
+            y = binarize(df[0], pos_class='R')
+        X = df.loc[:, 1:].astype(float).values
+
+    if identifier == 'breast-cancer-wisconsin' and dataset_name == 'breast-cancer':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        Xy = df.loc[:, 1:10]
+        Xy[Xy == '?'] = np.nan
+        Xy = Xy.dropna(axis=0)
+        X = Xy.loc[:, 1:9]
+        X = X.astype(float).values
+        y = binarize(Xy[10], pos_class=2)
+
+    if identifier == 'breast-cancer-wisconsin' and dataset_name == 'wdbc':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.loc[:, 2:32].astype(float).values
+        y = df[1].values
+        y = binarize(y, pos_class='M')
+
+    if identifier == 'cmc':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.loc[:, 0:8].astype(float).values
+        y = df[9].astype(int).values
+        if dataset_name == 'cmc.1':
+            y = binarize(y, pos_class=1)
+        elif dataset_name == 'cmc.2':
+            y = binarize(y, pos_class=2)
+        elif dataset_name == 'cmc.3':
+            y = binarize(y, pos_class=3)
+
+    if identifier == '00193':
+        df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
+        df = df[list(range(1, 24))]  # select columns 1-23 (column 23 contains the target label NSP)
+        # replace the header with the first row
+        new_header = df.iloc[0]  # grab the first row for the header
+        df = df[1:]  # take the data less the header row
+        df.columns = new_header  # set the header row as the df header
+        X = df.iloc[:, 0:22].astype(float).values
+        y = df['NSP'].astype(int).values
+        if dataset_name == 'ctg.1':
+            y = binarize(y, pos_class=1)  # 1==Normal
+        elif dataset_name == 'ctg.2':
+            y = binarize(y, pos_class=2)  # 2==Suspect
+        elif dataset_name == 'ctg.3':
+            y = binarize(y, pos_class=3)  # 3==Pathologic
+
+    if identifier == 'statlog/german':
+        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
+        X = df.iloc[:, 0:24].astype(float).values
+        y = df[24].astype(int).values
+        y = binarize(y, pos_class=1)
+
+    if identifier == 'haberman':
+        df = pd.read_csv(data_path, header=None)
+        X = df.iloc[:, 0:3].astype(float).values
+        y = df[3].astype(int).values
+        y = binarize(y, pos_class=2)
+
+    if identifier == 'ionosphere':
+        df = pd.read_csv(data_path, header=None)
+        X = df.iloc[:, 0:34].astype(float).values
+        y = df[34].values
+        y = binarize(y, pos_class='b')
+
+    if identifier == 'iris':
+        df = pd.read_csv(data_path, header=None)
+        X = df.iloc[:, 0:4].astype(float).values
+        y = df[4].values
+        if dataset_name == 'iris.1':
+            y = binarize(y, pos_class='Iris-setosa')  # 1==Setosa
+        elif dataset_name == 'iris.2':
+            y = binarize(y, pos_class='Iris-versicolor')  # 2==Versicolor
+        elif dataset_name == 'iris.3':
+            y = binarize(y, pos_class='Iris-virginica')  # 3==Virginica
+
+    if identifier == 'mammographic-masses':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        df[df == '?'] = np.nan
+        Xy = df.dropna(axis=0)
+        X = Xy.iloc[:, 0:5]
+        X = X.astype(float).values
+        y = binarize(Xy.iloc[:, 5], 
pos_class=1)
+
+    if identifier == 'page-blocks':
+        data_path_ = data_path.replace('.Z', '')
+        if not os.path.exists(data_path_):
+            raise FileNotFoundError(f'file {data_path_} does not exist. If this is the first time you '
+                                    f'attempt to load this dataset, then you have to manually unzip {data_path} '
+                                    f'and name the extracted file {data_path_} (unfortunately, neither zipfile nor '
+                                    f'gzip can handle unix-compressed files automatically; the GitHub repo '
+                                    f'https://github.com/umeat/unlzw offers a solution to this problem).')
+        df = pd.read_csv(data_path_, header=None, delim_whitespace=True)
+        X = df.iloc[:, 0:10].astype(float).values
+        y = df[10].values
+        y = binarize(y, pos_class=5)  # 5==block "graphic"
+
+    if identifier == 'semeion':
+        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
+        X = df.iloc[:, 0:256].astype(float).values
+        y = df[263].values  # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
+        y = binarize(y, pos_class=1)
+
+    if identifier == 'undocumented/connectionist-bench/sonar':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 0:60].astype(float).values
+        y = df[60].values
+        y = binarize(y, pos_class='R')
+
+    if identifier == 'spambase':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 0:57].astype(float).values
+        y = df[57].values
+        y = binarize(y, pos_class=1)
+
+    if identifier == 'spect':
+        dfs = []
+        for file in filename:
+            data_path = join(data_dir, file)
+            download_file_if_not_exists(f'{URL}/{file}', data_path)
+            dfs.append(pd.read_csv(data_path, header=None, sep=','))
+        df = pd.concat(dfs)
+        X = df.iloc[:, 1:45].astype(float).values
+        y = df[0].values
+        y = binarize(y, pos_class=0)
+
+    if identifier == 'tic-tac-toe':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 0:9].replace('o', 0).replace('b', 1).replace('x', 2).values
+        y = df[9].values
+        y = binarize(y, pos_class='negative')
+
+    if identifier == 'blood-transfusion':
+        df = pd.read_csv(data_path, sep=',')
+        X = df.iloc[:, 0:4].astype(float).values
+        y = df.iloc[:, 4].values
+        y = binarize(y, pos_class=1)
+
+    if identifier == 'wine':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 1:14].astype(float).values
+        y = df[0].values
+        if dataset_name == 'wine.1':
+            y = binarize(y, pos_class=1)
+        elif dataset_name == 'wine.2':
+            y = binarize(y, pos_class=2)
+        elif dataset_name == 'wine.3':
+            y = binarize(y, pos_class=3)
+
+    if identifier == 'wine-quality':
+        filename = filename[0] if dataset_name == 'wine-q-red' else filename[1]
+        data_path = join(data_dir, filename)
+        download_file_if_not_exists(f'{URL}/{filename}', data_path)
+        df = pd.read_csv(data_path, sep=';')
+        X = df.iloc[:, 0:11].astype(float).values
+        y = df.iloc[:, 11].values > 5
+
+    if identifier == 'yeast':
+        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
+        X = df.iloc[:, 1:9].astype(float).values
+        y = df.iloc[:, 9].values
+        y = binarize(y, pos_class='NUC')
+
+    data = LabelledCollection(X, y)
+    if verbose:
+        data.stats()
+    return data
+ + +
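A sketch of the 5FCVx2 protocol mentioned in the docstring above, assuming (consistently with that docstring) that `Dataset.kFCV` yields one :class:`quapy.data.base.Dataset` per fold:

>>> import quapy as qp
>>> from quapy.data.base import Dataset
>>> collection = qp.datasets.fetch_UCIBinaryLabelledCollection('yeast')
>>> for fold in Dataset.kFCV(collection, nfolds=5, nrepeats=2):
>>>     train, test = fold.train_test
>>>     ...  # fit a quantifier on train and evaluate it on test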
[docs]def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
+    """
+    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
+
+    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
+    - It has more than 1000 instances
+    - It is suited for classification
+    - It has more than two classes
+    - It is available for Python import (requires ucimlrepo package)
+
+    >>> import quapy as qp
+    >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
+    >>> train, test = dataset.train_test
+    >>> ...
+
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
+
+    The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param test_split: proportion of documents to be included in the test set. The rest makes up the training set
+    :param verbose: set to True (default is False) to get information (stats) about the dataset
+    :return: a :class:`quapy.data.base.Dataset` instance
+    """
+    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
+    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+ + +
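A short usage sketch complementing the docstring example above (classes_ holds the re-indexed class labels):

>>> from quapy.data.datasets import fetch_UCIMulticlassDataset
>>> data = fetch_UCIMulticlassDataset('letter', test_split=0.3)
>>> train, test = data.train_test
>>> print(train.classes_)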
[docs]def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
+    """
+    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
+
+    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
+    - It has more than 1000 instances
+    - It is suited for classification
+    - It has more than two classes
+    - It is available for Python import (requires ucimlrepo package)
+
+    >>> import quapy as qp
+    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
+    >>> X, y = collection.Xy
+    >>> ...
+
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
+
+    The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param verbose: set to True (default is False) to get information (stats) about the dataset
+    :return: a :class:`quapy.data.base.LabelledCollection` instance
+    """
+    assert dataset_name in UCI_MULTICLASS_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset from the ' \
+        f'UCI Machine Learning datasets repository (multiclass). ' \
+        f'Valid ones are {UCI_MULTICLASS_DATASETS}'
+
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    identifiers = {
+        "dry-bean": 602,
+        "wine-quality": 186,
+        "academic-success": 697,
+        "digits": 80,
+        "letter": 59
+    }
+
+    full_names = {
+        "dry-bean": "Dry Bean Dataset",
+        "wine-quality": "Wine Quality",
+        "academic-success": "Predict students' dropout and academic success",
+        "digits": "Optical Recognition of Handwritten Digits",
+        "letter": "Letter Recognition"
+    }
+
+    identifier = identifiers[dataset_name]
+    fullname = full_names[dataset_name]
+
+    if verbose:
+        print(f'Loading UCI Multiclass {dataset_name} ({fullname})')
+
+    file = join(data_home, 'uci_multiclass', dataset_name + '.pkl')
+
+    def download(id):
+        data = fetch_ucirepo(id=id)
+        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
+        classes = np.sort(np.unique(y))
+        y = np.searchsorted(classes, y)
+        return LabelledCollection(X, y)
+
+    data = pickled_resource(file, download, identifier)
+
+    if verbose:
+        data.stats()
+
+    return data
+ + +def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): + df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False) + + +
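All of these loaders share the same caching pattern via `quapy.util.pickled_resource`: the generation function runs only on the first call, and its result is pickled for reuse. A sketch of the same pattern for a custom loader (the file name and generator are illustrative, not part of this module):

>>> from quapy.util import pickled_resource
>>> from quapy.data.base import LabelledCollection
>>> def generate():  # hypothetical expensive loader
>>>     return LabelledCollection([[0.0], [1.0]], [0, 1])
>>> data = pickled_resource('./my_collection.pkl', generate)  # cached after the first call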
[docs]def fetch_lequa2022(task, data_home=None):
+    """
+    Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.
+    In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification
+    problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead.
+    Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B are multiclass quantification
+    problems consisting of estimating the class prevalence values of 28 different merchandise products.
+    We refer the reader to `Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
+    A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify.
+    <https://ceur-ws.org/Vol-3180/paper-146.pdf>`_ for a detailed description
+    of the tasks and datasets.
+
+    The datasets are downloaded only once, and stored for fast reuse.
+
+    See `lequa2022_experiments.py` provided in the example folder, which can serve as a guide on how to use these
+    datasets.
+
+    :param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
+        :class:`quapy.data.base.LabelledCollection`, and `val_gen` and `test_gen` are instances of
+        :class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`,
+        that return a series of samples stored in a directory which are labelled by prevalence.
+    """
+
+    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
+
+    assert task in LEQUA2022_TASKS, \
+        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL_TRAINDEV = f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
+    URL_TEST = f'https://zenodo.org/record/6546188/files/{task}.test.zip'
+    URL_TEST_PREV = f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
+
+    lequa_dir = join(data_home, 'lequa2022')
+    os.makedirs(lequa_dir, exist_ok=True)
+
+    def download_unzip_and_remove(unzipped_path, url):
+        tmp_path = join(lequa_dir, task + '_tmp.zip')
+        download_file_if_not_exists(url, tmp_path)
+        with zipfile.ZipFile(tmp_path) as file:
+            file.extractall(unzipped_path)
+        os.remove(tmp_path)
+
+    if not os.path.exists(join(lequa_dir, task)):
+        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
+        download_unzip_and_remove(lequa_dir, URL_TEST)
+        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
+
+    if task in ['T1A', 'T1B']:
+        load_fn = load_vector_documents
+    elif task in ['T2A', 'T2B']:
+        load_fn = load_raw_documents
+
+    tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
+    train = LabelledCollection.load(tr_path, loader_func=load_fn)
+
+    val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
+    val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
+    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
+
+    test_samples_path = join(lequa_dir, task, 'public', 'test_samples')
+    test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
+    test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)
+
+    return train, val_gen, test_gen
+ + +
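A minimal usage sketch (assuming, consistently with :class:`quapy.protocol.AbstractProtocol`, that calling the returned generators yields (sample, prevalence) pairs):

>>> from quapy.data.datasets import fetch_lequa2022
>>> train, val_gen, test_gen = fetch_lequa2022('T1A')
>>> for sample, prev in val_gen():
>>>     ...  # each validation sample comes with its true prevalence vector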
[docs]def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
+    """
+    Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
+    information on this dataset, please follow the zenodo link).
+    This dataset is based on the data available publicly at
+    `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
+    The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
+    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
+
+    The datasets are downloaded only once, and stored for fast reuse.
+
+    :param single_sample_train: a boolean. If True, the training set is returned as a single
+        :class:`quapy.data.base.LabelledCollection` (all examples together); if False, a generator of training
+        samples is returned, in which each example has an individual label.
+    :param for_model_selection: if True, then returns a 30% split of the training set (86 out of 286 samples)
+        to be used for model selection; if False, then returns the full training set as the training set and the
+        test set as the test set
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :return: a tuple `(train, test_gen)` where `train` is an instance of
+        :class:`quapy.data.base.LabelledCollection` (if `single_sample_train` is True) or of
+        :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` (if False), i.e., a sampling protocol that returns a
+        series of samples labelled example by example; `test_gen` is an instance of
+        :class:`quapy.data._ifcb.IFCBTestSamples`, i.e., a sampling protocol that returns a series of samples
+        labelled by prevalence. 
+ """ + + from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples, get_sample_list, generate_modelselection_split + + if data_home is None: + data_home = get_quapy_home() + + URL_TRAIN=f'https://zenodo.org/records/10036244/files/IFCB.train.zip' + URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip' + URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip' + + ifcb_dir = join(data_home, 'ifcb') + os.makedirs(ifcb_dir, exist_ok=True) + + def download_unzip_and_remove(unzipped_path, url): + tmp_path = join(ifcb_dir, 'ifcb_tmp.zip') + download_file_if_not_exists(url, tmp_path) + with zipfile.ZipFile(tmp_path) as file: + file.extractall(unzipped_path) + os.remove(tmp_path) + + if not os.path.exists(os.path.join(ifcb_dir,'train')): + download_unzip_and_remove(ifcb_dir, URL_TRAIN) + if not os.path.exists(os.path.join(ifcb_dir,'test')): + download_unzip_and_remove(ifcb_dir, URL_TEST) + if not os.path.exists(os.path.join(ifcb_dir,'test_prevalences.csv')): + download_unzip_and_remove(ifcb_dir, URL_TEST_PREV) + + # Load test prevalences and classes + test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv') + test_true_prev = pd.read_csv(test_true_prev_path) + classes = test_true_prev.columns[1:] + + #Load train and test samples + train_samples_path = join(ifcb_dir,'train') + test_samples_path = join(ifcb_dir,'test') + + if for_model_selection: + # In this case, return 70% of training data as the training set and 30% as the test set + samples = get_sample_list(train_samples_path) + train, test = generate_modelselection_split(samples, split=0.3) + train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train) + + # Test prevalence is computed from class labels + test_gen = IFCBTestSamples(path_dir=train_samples_path, test_prevalences=None, samples=test, classes=classes) + else: + # In this case, we use all training samples as the training set and the test samples as the test set + train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes) + test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences=test_true_prev) + + # In the case the user wants it, join all the train samples in one LabelledCollection + if single_sample_train: + train = LabelledCollection.join(*[lc for lc in train_gen()]) + return train, test_gen + else: + return train_gen, test_gen
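A minimal usage sketch (assuming, as for the LeQua loaders, that the returned test protocol yields (sample, prevalence) pairs when called):

>>> from quapy.data.datasets import fetch_IFCB
>>> train, test_gen = fetch_IFCB(single_sample_train=True)
>>> for sample, prev in test_gen():
>>>     ...  # test samples labelled by prevalence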
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/data/preprocessing.html b/docs/build/html/_modules/quapy/data/preprocessing.html new file mode 100644 index 0000000..a50aa64 --- /dev/null +++ b/docs/build/html/_modules/quapy/data/preprocessing.html @@ -0,0 +1,373 @@ + + + + + + quapy.data.preprocessing — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.data.preprocessing

+import numpy as np
+from scipy.sparse import spmatrix
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from sklearn.preprocessing import StandardScaler
+from tqdm import tqdm
+
+import quapy as qp
+from quapy.data.base import Dataset
+from quapy.util import map_parallel
+from .base import LabelledCollection
+
+
+
+[docs]
+def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
+    """
+    Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of
+    tfidf weighted sparse vectors
+
+    :param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are
+        lists of str
+    :param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)
+    :param sublinear_tf: whether or not to apply the log scaling to the tf counters (default True)
+    :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
+    :param kwargs: the rest of parameters of the transformation (as for sklearn's
+        `TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_)
+    :return: a new :class:`quapy.data.base.Dataset` in `csr_matrix` format (if inplace=False) or a reference to the
+        current Dataset (if inplace=True) where the instances are stored in a `csr_matrix` of real-valued tfidf scores
+    """
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)
+
+    vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
+    training_documents = vectorizer.fit_transform(dataset.training.instances)
+    test_documents = vectorizer.transform(dataset.test.instances)
+
+    if inplace:
+        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
+        dataset.vocabulary = vectorizer.vocabulary_
+        return dataset
+    else:
+        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
+        return Dataset(training, test, vectorizer.vocabulary_)
+ + + +
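A minimal usage sketch, vectorizing one of the raw-text datasets from `quapy.data.datasets`:

>>> import quapy as qp
>>> from quapy.data.preprocessing import text2tfidf
>>> data = qp.datasets.fetch_reviews('kindle')   # raw text instances
>>> data = text2tfidf(data, min_df=3)            # new Dataset with csr_matrix instances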
+[docs] +def reduce_columns(dataset: Dataset, min_df=5, inplace=False): + """ + Reduces the dimensionality of the instances, represented as a `csr_matrix` (or any subtype of + `scipy.sparse.spmatrix`), of training and test documents by removing the columns of words which are not present + in at least `min_df` instances in the training set + + :param dataset: a :class:`quapy.data.base.Dataset` in which instances are represented in sparse format (any + subtype of scipy.sparse.spmatrix) + :param min_df: integer, minimum number of instances below which the columns are removed + :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default) + :return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current + :class:`quapy.data.base.Dataset` (inplace=True) where the dimensions corresponding to infrequent terms + in the training set have been removed + """ + __check_type(dataset.training.instances, spmatrix) + __check_type(dataset.test.instances, spmatrix) + assert dataset.training.instances.shape[1] == dataset.test.instances.shape[1], 'unaligned vector spaces' + + def filter_by_occurrences(X, W): + column_prevalence = np.asarray((X > 0).sum(axis=0)).flatten() + take_columns = column_prevalence >= min_df + X = X[:, take_columns] + W = W[:, take_columns] + return X, W + + Xtr, Xte = filter_by_occurrences(dataset.training.instances, dataset.test.instances) + if inplace: + dataset.training.instances = Xtr + dataset.test.instances = Xte + return dataset + else: + training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.classes_) + test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.classes_) + return Dataset(training, test)
+ + + +
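A minimal usage sketch (continuing from a tfidf-vectorized dataset, since reduce_columns expects sparse instances):

>>> import quapy as qp
>>> from quapy.data.preprocessing import text2tfidf, reduce_columns
>>> data = text2tfidf(qp.datasets.fetch_reviews('kindle'), min_df=1)
>>> data = reduce_columns(data, min_df=10)  # drops terms seen in fewer than 10 training documents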
+[docs] +def standardize(dataset: Dataset, inplace=False): + """ + Standardizes the real-valued columns of a :class:`quapy.data.base.Dataset`. + Standardization, aka z-scoring, of a variable `X` comes down to subtracting the average and normalizing by the + standard deviation. + + :param dataset: a :class:`quapy.data.base.Dataset` object + :param inplace: set to True if the transformation is to be applied inplace, or to False (default) if a new + :class:`quapy.data.base.Dataset` is to be returned + :return: an instance of :class:`quapy.data.base.Dataset` + """ + s = StandardScaler(copy=not inplace) + training = s.fit_transform(dataset.training.instances) + test = s.transform(dataset.test.instances) + if inplace: + return dataset + else: + return Dataset(training, test, dataset.vocabulary, dataset.name)
+ + + +
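A minimal usage sketch; note that sklearn's StandardScaler requires dense input when centering, so this is meant for datasets with dense real-valued instances (e.g., the UCI ones):

>>> import quapy as qp
>>> from quapy.data.preprocessing import standardize
>>> data = qp.datasets.fetch_UCIBinaryDataset('yeast')
>>> zdata = standardize(data)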
+[docs]
+def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
+    """
+    Indexes the tokens of a textual :class:`quapy.data.base.Dataset` of string documents.
+    To index a document means to replace each different token by a unique numerical index.
+    Rare words (i.e., words occurring less than `min_df` times) are replaced by a special token `UNK`
+
+    :param dataset: a :class:`quapy.data.base.Dataset` object where the instances of training and test documents
+        are lists of str
+    :param min_df: minimum number of occurrences below which the term is replaced by a `UNK` index
+    :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
+    :param kwargs: the rest of parameters of the transformation (as for sklearn's
+        `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_)
+    :return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
+        :class:`quapy.data.base.Dataset` (inplace=True) consisting of lists of integer values representing indices.
+    """
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)
+
+    indexer = IndexTransformer(min_df=min_df, **kwargs)
+    training_index = indexer.fit_transform(dataset.training.instances)
+    test_index = indexer.transform(dataset.test.instances)
+
+    training_index = np.asarray(training_index, dtype=object)
+    test_index = np.asarray(test_index, dtype=object)
+
+    if inplace:
+        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.classes_)
+        dataset.vocabulary = indexer.vocabulary_
+        return dataset
+    else:
+        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.classes_)
+        return Dataset(training, test, indexer.vocabulary_)
+
+
+
+def __check_type(container, container_type=None, element_type=None):
+    if container_type:
+        assert isinstance(container, container_type), \
+            f'unexpected type of container (expected {container_type}, found {type(container)})'
+    if element_type:
+        assert isinstance(container[0], element_type), \
+            f'unexpected type of element (expected {element_type}, found {type(container[0])})'
+
+
+[docs]
+class IndexTransformer:
+    """
+    This class implements a sklearn-style transformer that indexes text as numerical ids for the tokens it
+    contains, following the vocabulary that would be generated by sklearn's
+    `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
+
+    :param kwargs: keyworded arguments from
+        `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
+    """
+
+    def __init__(self, **kwargs):
+        self.vect = CountVectorizer(**kwargs)
+        self.unk = -1  # a valid index is assigned after fit
+        self.pad = -2  # a valid index is assigned after fit
+[docs] + def fit(self, X): + """ + Fits the transformer, i.e., decides on the vocabulary, given a list of strings. + + :param X: a list of strings + :return: self + """ + self.vect.fit(X) + self.analyzer = self.vect.build_analyzer() + self.vocabulary_ = self.vect.vocabulary_ + self.unk = self.add_word(qp.environ['UNK_TOKEN'], qp.environ['UNK_INDEX']) + self.pad = self.add_word(qp.environ['PAD_TOKEN'], qp.environ['PAD_INDEX']) + return self
+ + +
+[docs]
+    def transform(self, X, n_jobs=None):
+        """
+        Transforms the strings in `X` as lists of numerical ids
+
+        :param X: a list of strings
+        :param n_jobs: the number of parallel workers to carry out this task
+        :return: a `np.ndarray` of numerical ids
+        """
+        # the indexing of X is parallelized across n_jobs workers, each processing a slice of the documents
+        assert self.unk != -1, 'transform called before fit'
+        n_jobs = qp._get_njobs(n_jobs)
+        return map_parallel(func=self._index, args=X, n_jobs=n_jobs)
+ + + + def _index(self, documents): + vocab = self.vocabulary_.copy() + return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')] + +
+[docs] + def fit_transform(self, X, n_jobs=None): + """ + Fits the transform on `X` and transforms it. + + :param X: a list of strings + :param n_jobs: the number of parallel workers to carry out this task + :return: a `np.ndarray` of numerical ids + """ + return self.fit(X).transform(X, n_jobs=n_jobs)
+ + +
+[docs] + def vocabulary_size(self): + """ + Gets the length of the vocabulary according to which the document tokens have been indexed + + :return: integer + """ + return len(self.vocabulary_)
+ + +
+[docs]
+    def add_word(self, word, id=None, nogaps=True):
+        """
+        Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
+        Useful to define special tokens for codifying unknown words, or padding tokens.
+
+        :param word: string, surface form of the token
+        :param id: integer, numerical value to assign to the token (leave as None for indicating the next valid id,
+            default)
+        :param nogaps: if set to True (default) asserts that the indicated id does not create numerical gaps with
+            the ids stored so far
+        :return: integer, the numerical id for the new token
+        """
+        if word in self.vocabulary_:
+            raise ValueError(f'word {word} already in dictionary')
+        if id is None:
+            # add the word with the next id
+            self.vocabulary_[word] = len(self.vocabulary_)
+        else:
+            id2word = {id_: word_ for word_, id_ in self.vocabulary_.items()}
+            if id in id2word:
+                old_word = id2word[id]
+                self.vocabulary_[word] = id
+                del self.vocabulary_[old_word]
+                self.add_word(old_word)
+            elif nogaps:
+                if id > self.vocabulary_size() + 1:
+                    raise ValueError(f'word {word} added with id {id}, while the current vocabulary size '
+                                     f'is of {self.vocabulary_size()}, and id gaps are not allowed')
+        return self.vocabulary_[word]
+
+ + +
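A minimal usage sketch of the transformer defined above:

>>> from quapy.data.preprocessing import IndexTransformer
>>> indexer = IndexTransformer(min_df=1)
>>> indexed = indexer.fit_transform(['a first document', 'a second document'])
>>> indexer.vocabulary_size()  # vocabulary size, including the UNK and PAD special tokens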
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/data/reader.html b/docs/build/html/_modules/quapy/data/reader.html new file mode 100644 index 0000000..4c9c163 --- /dev/null +++ b/docs/build/html/_modules/quapy/data/reader.html @@ -0,0 +1,244 @@ + + + + + + quapy.data.reader — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.data.reader

+import numpy as np
+from scipy.sparse import dok_matrix
+from tqdm import tqdm
+
+
+
+[docs]
+def from_text(path, encoding='utf-8', verbose=1, class2int=True):
+    """
+    Reads a labelled collection of documents.
+    File format: <0 or 1>\t<document>\n
+
+    :param path: path to the labelled collection
+    :param encoding: the text encoding used to open the file
+    :param verbose: if >0 (default) shows some progress information in standard output
+    :param class2int: if True (default), the labels are converted to int
+    :return: a list of sentences, and a list of labels
+    """
+    all_sentences, all_labels = [], []
+    if verbose > 0:
+        file = tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}')
+    else:
+        file = open(path, 'rt', encoding=encoding).readlines()
+    for line in file:
+        line = line.strip()
+        if line:
+            try:
+                label, sentence = line.split('\t')
+                sentence = sentence.strip()
+                if class2int:
+                    label = int(label)
+                if sentence:
+                    all_sentences.append(sentence)
+                    all_labels.append(label)
+            except ValueError:
+                print(f'format error in {line}')
+    return all_sentences, all_labels
+ + + +
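A tiny round-trip sketch of the expected file format (one "<label>\t<document>" pair per line; the file name is illustrative):

>>> from quapy.data.reader import from_text
>>> with open('reviews.txt', 'wt') as foo:
>>>     _ = foo.write('1\tgreat stuff\n0\tawful stuff\n')
>>> texts, labels = from_text('reviews.txt', verbose=0)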
+[docs]
+def from_sparse(path):
+    """
+    Reads a labelled collection of real-valued instances expressed in sparse format
+    File format: <-1 or 0 or 1>[\s col(int):val(float)]\n
+
+    :param path: path to the labelled collection
+    :return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels
+    """
+
+    def split_col_val(col_val):
+        col, val = col_val.split(':')
+        col, val = int(col) - 1, float(val)
+        return col, val
+
+    all_documents, all_labels = [], []
+    max_col = 0
+    for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'):
+        parts = line.strip().split()
+        if parts:
+            all_labels.append(int(parts[0]))
+            cols, vals = zip(*[split_col_val(col_val) for col_val in parts[1:]])
+            cols, vals = np.asarray(cols), np.asarray(vals)
+            max_col = max(max_col, cols.max())
+            all_documents.append((cols, vals))
+    n_docs = len(all_labels)
+    X = dok_matrix((n_docs, max_col + 1), dtype=float)
+    for i, (cols, vals) in tqdm(enumerate(all_documents), total=len(all_documents),
+                                desc=f'-- filling matrix of shape {X.shape}'):
+        X[i, cols] = vals
+    X = X.tocsr()
+    y = np.asarray(all_labels) + 1
+    return X, y
+ + + +
+[docs]
+def from_csv(path, encoding='utf-8'):
+    """
+    Reads a csv file in which columns are separated by ','.
+    File format: <label>,<feat1>,<feat2>,...,<featn>\n
+
+    :param path: path to the csv file
+    :param encoding: the text encoding used to open the file
+    :return: a ndarray (float) with the covariates, and a np.ndarray with the labels
+    """
+
+    X, y = [], []
+    for instance in tqdm(open(path, 'rt', encoding=encoding).readlines(), desc=f'reading {path}'):
+        yi, *xi = instance.strip().split(',')
+        X.append(list(map(float, xi)))
+        y.append(yi)
+    X = np.asarray(X)
+    y = np.asarray(y)
+    return X, y
+ + + +
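A tiny sketch of the expected csv format; note the labels are returned as strings, which `reindex_labels` (below) can map to integer class indexes:

>>> from quapy.data.reader import from_csv, reindex_labels
>>> with open('data.csv', 'wt') as foo:
>>>     _ = foo.write('pos,0.5,2.3\npos,1.1,0.2\nneg,0.0,1.7\n')
>>> X, y = from_csv('data.csv')      # X: (3, 2) float ndarray; y: ndarray of str labels
>>> y, classnames = reindex_labels(y)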
+[docs] +def reindex_labels(y): + """ + Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes. + E.g.: + + >>> reindex_labels(['B', 'B', 'A', 'C']) + >>> (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1')) + + :param y: the list or array of original labels + :return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes. + """ + y = np.asarray(y) + classnames = np.asarray(sorted(np.unique(y))) + label2index = {label: index for index, label in enumerate(classnames)} + indexed = np.empty(y.shape, dtype=int) + for label in classnames: + indexed[y==label] = label2index[label] + return indexed, classnames
+ + + +
+[docs]
+def binarize(y, pos_class):
+    """
+    Binarizes a categorical array-like collection of labels towards the positive class `pos_class`. E.g.,:
+
+    >>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)
+    >>> array([0, 1, 0, 0, 0, 0])
+
+    :param y: array-like of labels
+    :param pos_class: integer, the positive class
+    :return: a binary np.ndarray, in which the value 1 corresponds to positions in which `y` had the `pos_class`
+        label, and 0 otherwise
+    """
+    y = np.asarray(y)
+    ybin = np.zeros(y.shape, dtype=int)
+    ybin[y == pos_class] = 1
+    return ybin
+ + +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/error.html b/docs/build/html/_modules/quapy/error.html new file mode 100644 index 0000000..1613468 --- /dev/null +++ b/docs/build/html/_modules/quapy/error.html @@ -0,0 +1,433 @@ + + + + + + quapy.error — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.error

+"""Implementation of error measures used for quantification"""
+
+import numpy as np
+from sklearn.metrics import f1_score
+import quapy as qp
+
+
+
[docs]def from_name(err_name): + """Gets an error function from its name. E.g., `from_name("mae")` + will return function :meth:`quapy.error.mae` + + :param err_name: string, the error name + :return: a callable implementing the requested error + """ + assert err_name in ERROR_NAMES, f'unknown error {err_name}' + callable_error = globals()[err_name] + return callable_error
+ + +
[docs]def f1e(y_true, y_pred): + """F1 error: simply computes the error in terms of macro :math:`F_1`, i.e., + :math:`1-F_1^M`, where :math:`F_1` is the harmonic mean of precision and recall, + defined as :math:`\\frac{2tp}{2tp+fp+fn}`, with `tp`, `fp`, and `fn` standing + for true positives, false positives, and false negatives, respectively. + `Macro` averaging means the :math:`F_1` is computed for each category independently, + and then averaged. + + :param y_true: array-like of true labels + :param y_pred: array-like of predicted labels + :return: :math:`1-F_1^M` + """ + return 1. - f1_score(y_true, y_pred, average='macro')
+ + +
[docs]def acce(y_true, y_pred): + """Computes the error in terms of 1-accuracy. The accuracy is computed as + :math:`\\frac{tp+tn}{tp+fp+fn+tn}`, with `tp`, `fp`, `fn`, and `tn` standing + for true positives, false positives, false negatives, and true negatives, + respectively + + :param y_true: array-like of true labels + :param y_pred: array-like of predicted labels + :return: 1-accuracy + """ + return 1. - (y_true == y_pred).mean()
+ + +
[docs]def mae(prevs, prevs_hat): + """Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs. + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :return: mean absolute error + """ + return ae(prevs, prevs_hat).mean()
+ + +
[docs]def ae(prevs, prevs_hat): + """Computes the absolute error between the two prevalence vectors. + Absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as + :math:`AE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}|\\hat{p}(y)-p(y)|`, + where :math:`\\mathcal{Y}` are the classes of interest. + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :return: absolute error + """ + assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs).mean(axis=-1)
+ + +
[docs]def nae(prevs, prevs_hat): + """Computes the normalized absolute error between the two prevalence vectors. + Normalized absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as + :math:`NAE(p,\\hat{p})=\\frac{AE(p,\\hat{p})}{z_{AE}}`, + where :math:`z_{AE}=\\frac{2(1-\\min_{y\\in \\mathcal{Y}} p(y))}{|\\mathcal{Y}|}`, and :math:`\\mathcal{Y}` + are the classes of interest. + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :return: normalized absolute error + """ + assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs).sum(axis=-1)/(2*(1-prevs.min(axis=-1)))
+ + +
[docs]def mnae(prevs, prevs_hat): + """Computes the mean normalized absolute error (see :meth:`quapy.error.nae`) across the sample pairs. + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :return: mean normalized absolute error + """ + return nae(prevs, prevs_hat).mean()
+ + +
[docs]def mse(prevs, prevs_hat): + """Computes the mean squared error (see :meth:`quapy.error.se`) across the sample pairs. + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the + true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the + predicted prevalence values + :return: mean squared error + """ + return se(prevs, prevs_hat).mean()
+ + +
[docs]def se(prevs, prevs_hat): + """Computes the squared error between the two prevalence vectors. + Squared error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as + :math:`SE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}(\\hat{p}(y)-p(y))^2`, + where + :math:`\\mathcal{Y}` are the classes of interest. + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :return: squared error + """ + return ((prevs_hat - prevs) ** 2).mean(axis=-1)
+ + +
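A quick numeric check of `ae` and `se` on a single prevalence pair (the expected values are hand-computed from the formulas above):

>>> import numpy as np
>>> import quapy.error as err
>>> p = np.asarray([0.5, 0.3, 0.2])
>>> p_hat = np.asarray([0.4, 0.4, 0.2])
>>> err.ae(p, p_hat)   # (0.1 + 0.1 + 0.0) / 3 = 0.0666...
>>> err.se(p, p_hat)   # (0.01 + 0.01 + 0.0) / 3 = 0.0066...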
[docs]def mkld(prevs, prevs_hat, eps=None): + """Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the + sample pairs. The distributions are smoothed using the `eps` factor + (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :param eps: smoothing factor. KLD is not defined in cases in which the distributions contain + zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. + If `eps=None`, the sample size will be taken from the environment variable `SAMPLE_SIZE` + (which has thus to be set beforehand). + :return: mean Kullback-Leibler divergence + """ + return kld(prevs, prevs_hat, eps).mean()
+ + +
[docs]def kld(prevs, prevs_hat, eps=None): + """Computes the Kullback-Leibler divergence between the two prevalence distributions. + Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` + is computed as + :math:`KLD(p,\\hat{p})=D_{KL}(p||\\hat{p})= + \\sum_{y\\in \\mathcal{Y}} p(y)\\log\\frac{p(y)}{\\hat{p}(y)}`, + where :math:`\\mathcal{Y}` are the classes of interest. + The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :param eps: smoothing factor. KLD is not defined in cases in which the distributions contain + zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. + If `eps=None`, the sample size will be taken from the environment variable `SAMPLE_SIZE` + (which has thus to be set beforehand). + :return: Kullback-Leibler divergence between the two distributions + """ + eps = __check_eps(eps) + smooth_prevs = prevs + eps + smooth_prevs_hat = prevs_hat + eps + return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1)
+ + +
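A short sketch of how the `eps` smoothing is driven by the environment (when `eps=None`, the sample size is read from `qp.environ['SAMPLE_SIZE']`):

>>> import numpy as np
>>> import quapy as qp
>>> qp.environ['SAMPLE_SIZE'] = 100        # so that eps defaults to 1/(2*100)
>>> p = np.asarray([0.5, 0.5])
>>> p_hat = np.asarray([0.8, 0.2])
>>> qp.error.kld(p, p_hat)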
[docs]def mnkld(prevs, prevs_hat, eps=None): + """Computes the mean Normalized Kullback-Leibler divergence (see :meth:`quapy.error.nkld`) + across the sample pairs. The distributions are smoothed using the `eps` factor + (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain + zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. + If `eps=None`, the sample size will be taken from the environment variable `SAMPLE_SIZE` + (which has thus to be set beforehand). + :return: mean Normalized Kullback-Leibler divergence + """ + return nkld(prevs, prevs_hat, eps).mean()
+ + +
[docs]def nkld(prevs, prevs_hat, eps=None): + """Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions. + Normalized Kullback-Leibler divergence between two prevalence distributions :math:`p` and + :math:`\\hat{p}` is computed as + :math:`NKLD(p,\\hat{p}) = 2\\frac{e^{KLD(p,\\hat{p})}}{e^{KLD(p,\\hat{p})}+1}-1`. + The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :param eps: smoothing factor. NKLD is not defined in cases in which the distributions + contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample + size. If `eps=None`, the sample size will be taken from the environment variable + `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: Normalized Kullback-Leibler divergence between the two distributions + """ + ekld = np.exp(kld(prevs, prevs_hat, eps)) + return 2. * ekld / (1 + ekld) - 1.
+ + +
[docs]def mrae(prevs, prevs_hat, eps=None): + """Computes the mean relative absolute error (see :meth:`quapy.error.rae`) across + the sample pairs. The distributions are smoothed using the `eps` factor (see + :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :param eps: smoothing factor. `mrae` is not defined in cases in which the true + distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, + with :math:`T` the sample size. If `eps=None`, the sample size will be taken from + the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: mean relative absolute error + """ + return rae(prevs, prevs_hat, eps).mean()
+ + +
[docs]def rae(prevs, prevs_hat, eps=None): + """Computes the relative absolute error between the two prevalence vectors. + Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` + is computed as + :math:`RAE(p,\\hat{p})= + \\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}\\frac{|\\hat{p}(y)-p(y)|}{p(y)}`, + where :math:`\\mathcal{Y}` are the classes of interest. + The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :param eps: smoothing factor. `rae` is not defined in cases in which the true distribution + contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the + sample size. If `eps=None`, the sample size will be taken from the environment variable + `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: relative absolute error + """ + eps = __check_eps(eps) + prevs = smooth(prevs, eps) + prevs_hat = smooth(prevs_hat, eps) + return (abs(prevs - prevs_hat) / prevs).mean(axis=-1)
+ + +
[docs]def nrae(prevs, prevs_hat, eps=None): + """Computes the normalized relative absolute error between the two prevalence vectors. + Normalized relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` + is computed as + :math:`NRAE(p,\\hat{p})= \\frac{RAE(p,\\hat{p})}{z_{RAE}}`, + where + :math:`z_{RAE} = \\frac{|\\mathcal{Y}|-1+\\frac{1-\\min_{y\\in \\mathcal{Y}} p(y)}{\\min_{y\\in \\mathcal{Y}} p(y)}}{|\\mathcal{Y}|}` + and :math:`\\mathcal{Y}` are the classes of interest. + The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :param eps: smoothing factor. `nrae` is not defined in cases in which the true distribution + contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the + sample size. If `eps=None`, the sample size will be taken from the environment variable + `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: normalized relative absolute error + """ + eps = __check_eps(eps) + prevs = smooth(prevs, eps) + prevs_hat = smooth(prevs_hat, eps) + min_p = prevs.min(axis=-1) + return (abs(prevs - prevs_hat) / prevs).sum(axis=-1)/(prevs.shape[-1]-1+(1-min_p)/min_p)
+ + +
[docs]def mnrae(prevs, prevs_hat, eps=None): + """Computes the mean normalized relative absolute error (see :meth:`quapy.error.nrae`) across + the sample pairs. The distributions are smoothed using the `eps` factor (see + :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :param eps: smoothing factor. `mnrae` is not defined in cases in which the true + distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, + with :math:`T` the sample size. If `eps=None`, the sample size will be taken from + the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: mean normalized relative absolute error + """ + return nrae(prevs, prevs_hat, eps).mean()
+ + +
[docs]def smooth(prevs, eps): + """ Smooths a prevalence distribution with :math:`\\epsilon` (`eps`) as: + :math:`\\underline{p}(y)=\\frac{\\epsilon+p(y)}{\\epsilon|\\mathcal{Y}|+ + \\displaystyle\\sum_{y\\in \\mathcal{Y}}p(y)}` + + :param prevs: array-like of shape `(n_classes,)` with the prevalence values to be smoothed + :param eps: smoothing factor + :return: array-like of shape `(n_classes,)` with the smoothed distribution + """ + n_classes = prevs.shape[-1] + return (prevs + eps) / (eps * n_classes + 1)
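A quick check of the formula above (an illustrative computation, not from the sources):

import numpy as np
from quapy.error import smooth

print(smooth(np.asarray([0.0, 1.0]), eps=0.005))
# [0.0049505 0.9950495], i.e., (p + 0.005) / (0.005*2 + 1): zeros are lifted and the vector still sums to 1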
+ + +def __check_eps(eps=None): + if eps is None: + sample_size = qp.environ['SAMPLE_SIZE'] + if sample_size is None: + raise ValueError('eps was not defined, and qp.environ["SAMPLE_SIZE"] was not set') + eps = 1. / (2. * sample_size) + return eps + + +CLASSIFICATION_ERROR = {f1e, acce} +QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld} +QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld} +QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae} +CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR} +QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR} +QUANTIFICATION_ERROR_SINGLE_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR_SINGLE} +QUANTIFICATION_ERROR_SMOOTH_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR_SMOOTH} +ERROR_NAMES = \ + CLASSIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_SINGLE_NAMES + +f1_error = f1e +acc_error = acce +mean_absolute_error = mae +absolute_error = ae +mean_relative_absolute_error = mrae +relative_absolute_error = rae +normalized_absolute_error = nae +normalized_relative_absolute_error = nrae +mean_normalized_absolute_error = mnae +mean_normalized_relative_absolute_error = mnrae +
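Since the smoothed errors above resolve `eps` from the environment when it is not given, a typical workflow (a hedged sketch; the prevalence arrays are illustrative) sets `SAMPLE_SIZE` once and then calls the metrics without passing `eps`:

import numpy as np
import quapy as qp

qp.environ['SAMPLE_SIZE'] = 100   # __check_eps resolves eps = 1/(2*100)
true_prevs = np.asarray([[0.6, 0.4], [0.2, 0.8]])
estim_prevs = np.asarray([[0.5, 0.5], [0.3, 0.7]])
print(qp.error.mrae(true_prevs, estim_prevs))  # mean over the two sample pairs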
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/evaluation.html b/docs/build/html/_modules/quapy/evaluation.html new file mode 100644 index 0000000..56d34a5 --- /dev/null +++ b/docs/build/html/_modules/quapy/evaluation.html @@ -0,0 +1,291 @@ + + + + + + quapy.evaluation — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.evaluation

+from typing import Union, Callable, Iterable
+import numpy as np
+from tqdm import tqdm
+import quapy as qp
+from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol, IterateProtocol
+from quapy.method.base import BaseQuantifier
+import pandas as pd
+
+
+
[docs]def prediction( + model: BaseQuantifier, + protocol: AbstractProtocol, + aggr_speedup: Union[str, bool] = 'auto', + verbose=False): + """ + Uses a quantification model to generate predictions for the samples generated via a specific protocol. + This function is central to all evaluation processes, and is endowed with an optimization to speed up the + prediction of protocols that generate samples from a large collection. The optimization applies to aggregative + quantifiers only, and to OnLabelledCollectionProtocol protocols, and comes down to generating the classification + predictions once and for all, and then generating samples over the classification predictions (instead of over + the raw instances), so that the classifier prediction is never called again. This behaviour is obtained by + setting `aggr_speedup` to 'auto' or True, and is only carried out if the overall process is convenient in terms + of computations (e.g., if the number of classification predictions needed for the original collection exceeds the + number of classification predictions needed for all samples, then the optimization is not undertaken). + + :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier` + :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also instance of + :class:`quapy.protocol.OnLabelledCollectionProtocol`, then the aggregation speed-up can be run. This is the protocol + in charge of generating the samples for which the model has to issue class prevalence predictions. + :param aggr_speedup: whether or not to apply the speed-up. Set to "force" for applying it even if the number of + instances in the original collection on which the protocol acts is larger than the number of instances + in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is + convenient or not. Set to False to deactivate.
+ :param verbose: boolean, show or not information in stdout + :return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape + `(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample + """ + assert aggr_speedup in [False, True, 'auto', 'force'], 'invalid value for aggr_speedup' + + sout = lambda x: print(x) if verbose else None + + apply_optimization = False + + if aggr_speedup in [True, 'auto', 'force']: + # checks whether the prediction can be made more efficiently; this check consists in verifying if the model is + # of type aggregative, if the protocol is based on LabelledCollection, and if the total number of documents to + # classify using the protocol would exceed the number of test documents in the original collection + from quapy.method.aggregative import AggregativeQuantifier + if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol): + if aggr_speedup == 'force': + apply_optimization = True + sout(f'forcing aggregative speedup') + elif hasattr(protocol, 'sample_size'): + nD = len(protocol.get_labelled_collection()) + samplesD = protocol.total() * protocol.sample_size + if nD < samplesD: + apply_optimization = True + sout(f'speeding up the prediction for the aggregative quantifier, ' + f'total classifications {nD} instead of {samplesD}') + + if apply_optimization: + pre_classified = model.classify(protocol.get_labelled_collection().instances) + protocol_with_predictions = protocol.on_preclassified_instances(pre_classified) + return __prediction_helper(model.aggregate, protocol_with_predictions, verbose) + else: + return __prediction_helper(model.quantify, protocol, verbose)
+ + +def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False): + true_prevs, estim_prevs = [], [] + for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total(), desc='predicting') if verbose else protocol(): + estim_prevs.append(quantification_fn(sample_instances)) + true_prevs.append(sample_prev) + + true_prevs = np.asarray(true_prevs) + estim_prevs = np.asarray(estim_prevs) + + return true_prevs, estim_prevs + + +
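For instance, a self-contained sketch with synthetic data (the classifier, quantification method, and protocol choices are illustrative, not prescribed by this module):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import CC
from quapy.protocol import APP
from quapy.evaluation import prediction

# a toy binary dataset
X = np.random.randn(1000, 2)
y = (X[:, 0] > 0).astype(int)
train, test = LabelledCollection(X, y).split_stratified(0.6)

quantifier = CC(LogisticRegression()).fit(train)
true_prevs, estim_prevs = prediction(quantifier, APP(test, sample_size=100), aggr_speedup='auto')
print(true_prevs.shape, estim_prevs.shape)  # both of shape (n_samples, 2)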
[docs]def evaluation_report(model: BaseQuantifier, + protocol: AbstractProtocol, + error_metrics: Iterable[Union[str,Callable]] = 'mae', + aggr_speedup: Union[str, bool] = 'auto', + verbose=False): + """ + Generates a report (a pandas DataFrame) containing information on the evaluation of the model according + to a specific protocol and in terms of one or more evaluation metrics (errors). + + :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier` + :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also instance of + :class:`quapy.protocol.OnLabelledCollectionProtocol`, then the aggregation speed-up can be run. This is the protocol + in charge of generating the samples in which the model is evaluated. + :param error_metrics: a string, or list of strings, representing the name(s) of an error function in `qp.error` + (e.g., 'mae', the default value), or a callable function, or a list of callable functions, implementing + the error function itself. + :param aggr_speedup: whether or not to apply the speed-up. Set to "force" for applying it even if the number of + instances in the original collection on which the protocol acts is larger than the number of instances + in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is + convenient or not. Set to False to deactivate. + :param verbose: boolean, show or not information in stdout + :return: a pandas DataFrame containing the columns 'true-prev' (the true prevalence of each sample), + 'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics + have been indicated, each displaying the score in terms of that metric for every sample. + """ + + true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose) + return _prevalence_report(true_prevs, estim_prevs, error_metrics)
+ + +def _prevalence_report(true_prevs, estim_prevs, error_metrics: Iterable[Union[str, Callable]] = 'mae'): + + if isinstance(error_metrics, str): + error_metrics = [error_metrics] + + error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics] + assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions' + error_names = [e.__name__ for e in error_funcs] + + row_entries = [] + for true_prev, estim_prev in zip(true_prevs, estim_prevs): + series = {'true-prev': true_prev, 'estim-prev': estim_prev} + for error_name, error_metric in zip(error_names, error_funcs): + score = error_metric(true_prev, estim_prev) + series[error_name] = score + row_entries.append(series) + + df = pd.DataFrame.from_records(row_entries) + return df + + +
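Continuing the sketch above (hedged in the same way: `quantifier` and `test` are the illustrative objects built there), the report variant collects per-sample scores in a DataFrame:

import quapy as qp
from quapy.evaluation import evaluation_report

qp.environ['SAMPLE_SIZE'] = 100  # needed by smoothed metrics such as 'mrae'
report = evaluation_report(quantifier, APP(test, sample_size=100), error_metrics=['mae', 'mrae'])
print(report.columns.tolist())   # ['true-prev', 'estim-prev', 'mae', 'mrae']
print(report['mae'].mean())      # aggregate the per-sample errors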
[docs]def evaluate( + model: BaseQuantifier, + protocol: AbstractProtocol, + error_metric: Union[str, Callable], + aggr_speedup: Union[str, bool] = 'auto', + verbose=False): + """ + Evaluates a quantification model according to a specific sample generation protocol and in terms of one + evaluation metric (error). + + :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier` + :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also instance of + :class:`quapy.protocol.OnLabelledCollectionProtocol`, then the aggregation speed-up can be run. This is the + protocol in charge of generating the samples in which the model is evaluated. + :param error_metric: a string representing the name(s) of an error function in `qp.error` + (e.g., 'mae'), or a callable function implementing the error function itself. + :param aggr_speedup: whether or not to apply the speed-up. Set to "force" for applying it even if the number of + instances in the original collection on which the protocol acts is larger than the number of instances + in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is + convenient or not. Set to False to deactivate. + :param verbose: boolean, show or not information in stdout + :return: if the error metric is not averaged (e.g., 'ae', 'rae'), returns an array of shape `(n_samples,)` with + the error scores for each sample; if the error metric is averaged (e.g., 'mae', 'mrae') then returns + a single float + """ + + if isinstance(error_metric, str): + error_metric = qp.error.from_name(error_metric) + true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose) + return error_metric(true_prevs, estim_prevs)
+ + +
[docs]def evaluate_on_samples( + model: BaseQuantifier, + samples: Iterable[qp.data.LabelledCollection], + error_metric: Union[str, Callable], + verbose=False): + """ + Evaluates a quantification model on a given set of samples and in terms of one evaluation metric (error). + + :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier` + :param samples: a list of samples on which the quantifier is to be evaluated + :param error_metric: a string representing the name(s) of an error function in `qp.error` + (e.g., 'mae'), or a callable function implementing the error function itself. + :param verbose: boolean, show or not information in stdout + :return: if the error metric is not averaged (e.g., 'ae', 'rae'), returns an array of shape `(n_samples,)` with + the error scores for each sample; if the error metric is averaged (e.g., 'mae', 'mrae') then returns + a single float + """ + + return evaluate(model, IterateProtocol(samples), error_metric, aggr_speedup=False, verbose=verbose)
+ + + + + +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/functional.html b/docs/build/html/_modules/quapy/functional.html new file mode 100644 index 0000000..1b02248 --- /dev/null +++ b/docs/build/html/_modules/quapy/functional.html @@ -0,0 +1,468 @@ + + + + + + quapy.functional — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.functional

+import itertools
+from collections import defaultdict
+from typing import Union, Callable
+
+import scipy
+import numpy as np
+
+
+
[docs]def prevalence_linspace(n_prevalences=21, repeats=1, smooth_limits_epsilon=0.01): + """ + Produces an array of uniformly separated values of prevalence. + By default, produces an array of 21 prevalence values, with + step 0.05 and with the limits smoothed, i.e.: + [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99] + + :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21) + :param repeats: number of times each prevalence is to be repeated (defaults to 1) + :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1 + :return: an array of uniformly separated prevalence values + """ + p = np.linspace(0., 1., num=n_prevalences, endpoint=True) + p[0] += smooth_limits_epsilon + p[-1] -= smooth_limits_epsilon + if p[0] > p[1]: + raise ValueError(f'the smoothing in the limits is greater than the prevalence step') + if repeats > 1: + p = np.repeat(p, repeats) + return p
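A quick example of the smoothed limits (the argument values are illustrative):

from quapy.functional import prevalence_linspace

print(prevalence_linspace(n_prevalences=5, smooth_limits_epsilon=0.01))
# [0.01 0.25 0.5  0.75 0.99]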
+ + +
[docs]def prevalence_from_labels(labels, classes): + """ + Computes the prevalence values from a vector of labels. + + :param labels: array-like of shape `(n_instances,)` with the label for each instance + :param classes: the class labels. This is needed in order to correctly compute the prevalence vector even when + some classes have no examples. + :return: an ndarray of shape `(len(classes),)` with the class prevalence values + """ + if labels.ndim != 1: + raise ValueError(f'param labels does not seem to be a ndarray of label predictions') + unique, counts = np.unique(labels, return_counts=True) + by_class = defaultdict(lambda:0, dict(zip(unique, counts))) + prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=float) + prevalences /= prevalences.sum() + return prevalences
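For example (illustrative labels; note how a class absent from `labels` still gets a slot):

import numpy as np
from quapy.functional import prevalence_from_labels

labels = np.asarray([0, 0, 1, 2, 2, 2])
print(prevalence_from_labels(labels, classes=[0, 1, 2]))     # [0.333... 0.166... 0.5]
print(prevalence_from_labels(labels, classes=[0, 1, 2, 3]))  # class 3 unseen -> prevalence 0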
+ + +
[docs]def prevalence_from_probabilities(posteriors, binarize: bool = False): + """ + Returns a vector of prevalence values from a matrix of posterior probabilities. + + :param posteriors: array-like of shape `(n_instances, n_classes,)` with posterior probabilities for each class + :param binarize: set to True (default is False) for computing the prevalence values on crisp decisions (i.e., + converting the vectors of posterior probabilities into class indices, by taking the argmax). + :return: array of shape `(n_classes,)` containing the prevalence values + """ + if posteriors.ndim != 2: + raise ValueError(f'param posteriors does not seem to be a ndarray of posterior probabilities') + if binarize: + predictions = np.argmax(posteriors, axis=-1) + return prevalence_from_labels(predictions, np.arange(posteriors.shape[1])) + else: + prevalences = posteriors.mean(axis=0) + prevalences /= prevalences.sum() + return prevalences
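For example (illustrative posteriors):

import numpy as np
from quapy.functional import prevalence_from_probabilities

post = np.asarray([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])
print(prevalence_from_probabilities(post))                 # soft counts: [0.566... 0.433...]
print(prevalence_from_probabilities(post, binarize=True))  # crisp (argmax) counts: [0.666... 0.333...]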
+ + +
[docs]def as_binary_prevalence(positive_prevalence: Union[float, np.ndarray], clip_if_necessary=False): + """ + Helper that, given a float representing the prevalence for the positive class, returns a np.ndarray of two + values representing a binary distribution. + + :param positive_prevalence: prevalence for the positive class + :param clip_if_necessary: if True, clips the value in [0,1] in order to guarantee the resulting distribution + is valid. If False, it then checks that the value is in the valid range, and raises an error if not. + :return: np.ndarray of shape `(2,)` + """ + if clip_if_necessary: + positive_prevalence = np.clip(positive_prevalence, 0, 1) + else: + assert 0 <= positive_prevalence <= 1, 'the value provided is not a valid prevalence for the positive class' + return np.asarray([1-positive_prevalence, positive_prevalence]).T
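For example:

from quapy.functional import as_binary_prevalence

print(as_binary_prevalence(0.25))                         # [0.75 0.25]
print(as_binary_prevalence(1.2, clip_if_necessary=True))  # clipped to [0. 1.]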
+ + + +
[docs]def HellingerDistance(P, Q) -> float: + """ + Computes the Hellinger Distance (HD) between (discretized) distributions `P` and `Q`. + The HD for two discrete distributions of `k` bins is defined as: + + .. math:: + HD(P,Q) = \\frac{ 1 }{ \\sqrt{ 2 } } \\sqrt{ \\sum_{i=1}^k ( \\sqrt{p_i} - \\sqrt{q_i} )^2 } + + :param P: real-valued array-like of shape `(k,)` representing a discrete distribution + :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution + :return: float + """ + return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
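A worked example (note, as a caveat, that the code above omits the 1/sqrt(2) factor shown in the docstring formula, so it returns sqrt(2) times the normalized HD):

import numpy as np
from quapy.functional import HellingerDistance

P = np.asarray([0.5, 0.5])
Q = np.asarray([0.9, 0.1])
# sqrt((sqrt(.5)-sqrt(.9))**2 + (sqrt(.5)-sqrt(.1))**2) ~= 0.4595
print(HellingerDistance(P, Q))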
+ + +
[docs]def TopsoeDistance(P, Q, epsilon=1e-20): + """ + Topsoe distance between two (discretized) distributions `P` and `Q`. + The Topsoe distance for two discrete distributions of `k` bins is defined as: + + .. math:: + Topsoe(P,Q) = \\sum_{i=1}^k \\left( p_i \\log\\left(\\frac{ 2 p_i + \\epsilon }{ p_i+q_i+\\epsilon }\\right) + + q_i \\log\\left(\\frac{ 2 q_i + \\epsilon }{ p_i+q_i+\\epsilon }\\right) \\right) + + :param P: real-valued array-like of shape `(k,)` representing a discrete distribution + :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution + :return: float + """ + return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
+ + +
[docs]def uniform_prevalence_sampling(n_classes, size=1): + """ + Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_ + for sampling uniformly at random from the unit simplex. This implementation is adapted from this + `post <https://cs.stackexchange.com/questions/3227/uniform-sampling-from-a-simplex>`_. + + :param n_classes: integer, number of classes (dimensionality of the simplex) + :param size: number of samples to return + :return: `np.ndarray` of shape `(size, n_classes,)` if `size>1`, or of shape `(n_classes,)` otherwise + """ + if n_classes == 2: + u = np.random.rand(size) + u = np.vstack([1-u, u]).T + else: + u = np.random.rand(size, n_classes-1) + u.sort(axis=-1) + _0s = np.zeros(shape=(size, 1)) + _1s = np.ones(shape=(size, 1)) + a = np.hstack([_0s, u]) + b = np.hstack([u, _1s]) + u = b-a + if size == 1: + u = u.flatten() + return u
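For example:

from quapy.functional import uniform_prevalence_sampling

u = uniform_prevalence_sampling(n_classes=3, size=5)
print(u.shape)         # (5, 3)
print(u.sum(axis=-1))  # every row sums to 1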
+ + +uniform_simplex_sampling = uniform_prevalence_sampling + + +
[docs]def strprev(prevalences, prec=3): + """ + Returns a string representation for a prevalence vector. E.g., + + >>> strprev([1/3, 2/3], prec=2) + '[0.33, 0.67]' + + :param prevalences: a vector of prevalence values + :param prec: float precision + :return: string + """ + return '['+ ', '.join([f'{p:.{prec}f}' for p in prevalences]) + ']'
+ + +
[docs]def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True): + """ + Implements the adjustment of ACC and PACC for the binary case. The adjustment for a prevalence estimate of the + positive class `p` comes down to computing: + + .. math:: + ACC(p) = \\frac{ p - fpr }{ tpr - fpr } + + :param prevalence_estim: float, the estimated value for the positive class + :param tpr: float, the true positive rate of the classifier + :param fpr: float, the false positive rate of the classifier + :param clip: set to True (default) to clip values that might exceed the range [0,1] + :return: float, the adjusted count + """ + + den = tpr - fpr + if den == 0: + den += 1e-8 + adjusted = (prevalence_estim - fpr) / den + if clip: + adjusted = np.clip(adjusted, 0., 1.) + return adjusted
+ + +
[docs]def normalize_prevalence(prevalences): + """ + Normalize a vector or matrix of prevalence values. The normalization consists of applying an L1 normalization in + cases in which the prevalence values are not all zeros, and of converting the prevalence values into `1/n_classes` in + cases in which all values are zero. + + :param prevalences: array-like of shape `(n_classes,)` or of shape `(n_samples, n_classes,)` with prevalence values + :return: a normalized vector or matrix of prevalence values + """ + prevalences = np.asarray(prevalences) + n_classes = prevalences.shape[-1] + accum = prevalences.sum(axis=-1, keepdims=True) + prevalences = np.true_divide(prevalences, accum, where=accum>0) + allzeros = accum.flatten()==0 + if any(allzeros): + if prevalences.ndim == 1: + prevalences = np.full(shape=n_classes, fill_value=1./n_classes) + else: + prevalences[accum.flatten()==0] = np.full(shape=n_classes, fill_value=1./n_classes) + return prevalences
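For example (illustrative inputs):

from quapy.functional import normalize_prevalence

print(normalize_prevalence([2, 2, 0]))  # L1-normalized: [0.5 0.5 0. ]
print(normalize_prevalence([0, 0, 0]))  # all-zeros fall back to uniform: [0.333... 0.333... 0.333...]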
+ + +def __num_prevalence_combinations_depr(n_prevpoints:int, n_classes:int, n_repeats:int=1): + """ + Computes the number of prevalence combinations in the n_classes-dimensional simplex if `nprevpoints` equally distant + prevalence values are generated and `n_repeats` repetitions are requested. + + :param n_classes: integer, number of classes + :param n_prevpoints: integer, number of prevalence points. + :param n_repeats: integer, number of repetitions for each prevalence combination + :return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the + number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0] + """ + __cache={} + def __f(nc,np): + if (nc,np) in __cache: # cached result + return __cache[(nc,np)] + if nc==1: # stop condition + return 1 + else: # recursive call + x = sum([__f(nc-1, np-i) for i in range(np)]) + __cache[(nc,np)] = x + return x + return __f(n_classes, n_prevpoints) * n_repeats + + +
[docs]def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1): + """ + Computes the number of valid prevalence combinations in the n_classes-dimensional simplex if `n_prevpoints` equally + distant prevalence values are generated and `n_repeats` repetitions are requested. + The computation comes down to calculating: + + .. math:: + \\binom{N+C-1}{C-1} \\times r + + where `N` is `n_prevpoints-1`, i.e., the number of probability mass blocks to allocate, `C` is the number of + classes, and `r` is `n_repeats`. This solution comes from the + `Stars and Bars <https://brilliant.org/wiki/integer-equations-star-and-bars/>`_ problem. + + :param n_classes: integer, number of classes + :param n_prevpoints: integer, number of prevalence points. + :param n_repeats: integer, number of repetitions for each prevalence combination + :return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the + number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0] + """ + N = n_prevpoints-1 + C = n_classes + r = n_repeats + return int(scipy.special.binom(N + C - 1, C - 1) * r)
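A quick sanity check of the stars-and-bars count (illustrative values):

from quapy.functional import num_prevalence_combinations

# N=4 mass blocks, C=3 classes -> binom(4+3-1, 3-1) = binom(6, 2) = 15
print(num_prevalence_combinations(n_prevpoints=5, n_classes=3, n_repeats=1))  # 15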
+ + +
[docs]def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1): + """ + Searches for the largest number of (equidistant) prevalence points to define for each of the `n_classes` classes so + that the number of valid prevalence values generated as combinations of prevalence points (points in a + `n_classes`-dimensional simplex) does not exceed `combinations_budget`. + + :param combinations_budget: integer, maximum number of combinations allowed + :param n_classes: integer, number of classes + :param n_repeats: integer, number of repetitions for each prevalence combination + :return: the largest number of prevalence points that generates at most `combinations_budget` valid prevalence combinations + """ + assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers' + n_prevpoints = 1 + while True: + combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats) + if combinations > combinations_budget: + return n_prevpoints-1 + else: + n_prevpoints += 1
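For example (illustrative budget): with 3 classes, `n_prevpoints=k` yields k*(k+1)/2 combinations, and 13*14/2=91 <= 100 < 14*15/2=105:

from quapy.functional import get_nprevpoints_approximation

print(get_nprevpoints_approximation(combinations_budget=100, n_classes=3))  # 13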
+ + +
[docs]def check_prevalence_vector(p, raise_exception=False, tolerance=1e-08): + """ + Checks that p is a valid prevalence vector, i.e., that it contains values in [0,1] and that the values sum up to 1. + + :param p: the prevalence vector to check + :param raise_exception: if True, raises a ValueError when the check fails (default False, in which case the + function simply returns False) + :param tolerance: absolute tolerance used when checking that the values sum up to 1 + :return: True if `p` is valid, False otherwise + """ + p = np.asarray(p) + if not all(p>=0): + if raise_exception: + raise ValueError('the prevalence vector contains negative numbers') + return False + if not all(p<=1): + if raise_exception: + raise ValueError('the prevalence vector contains values >1') + return False + if not np.isclose(p.sum(), 1, atol=tolerance): + if raise_exception: + raise ValueError('the prevalence vector does not sum up to 1') + return False + return True
+ + +
[docs]def get_divergence(divergence: Union[str, Callable]): + """ + Resolves a divergence function from its name, or returns the argument unchanged if it is already a callable. + + :param divergence: either a string ('HD' for the Hellinger distance, 'topsoe' for the Topsoe distance) or a callable + :return: a callable implementing the requested divergence + """ + if isinstance(divergence, str): + if divergence=='HD': + return HellingerDistance + elif divergence=='topsoe': + return TopsoeDistance + else: + raise ValueError(f'unknown divergence {divergence}') + elif callable(divergence): + return divergence + else: + raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
+ + +
[docs]def argmin_prevalence(loss, n_classes, method='optim_minimize'): + """ + Searches for the prevalence vector that minimizes a loss, dispatching to the requested search strategy. + + :param loss: callable, the function to minimize + :param n_classes: integer, number of classes (dimensionality of the prevalence vector) + :param method: the search strategy: 'optim_minimize' (default), 'linear_search', or 'ternary_search' (not + yet implemented) + :return: the prevalence vector that minimizes the loss + """ + if method == 'optim_minimize': + return optim_minimize(loss, n_classes) + elif method == 'linear_search': + return linear_search(loss, n_classes) + elif method == 'ternary_search': + raise NotImplementedError() + else: + raise NotImplementedError()
+ + +
[docs]def optim_minimize(loss, n_classes): + """ + Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex + that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's + SLSQP routine. + + :param loss: (callable) the function to minimize + :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector + :return: (ndarray) the best prevalence vector found + """ + from scipy import optimize + + # the initial point is set as the uniform distribution + uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + + # solutions are bounded to those contained in the unit-simplex + bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 + r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + return r.x
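A toy minimization over the probability simplex (the target vector is illustrative):

import numpy as np
from quapy.functional import optim_minimize

target = np.asarray([0.2, 0.5, 0.3])
loss = lambda prev: ((prev - target) ** 2).sum()
print(optim_minimize(loss, n_classes=3))  # ~[0.2 0.5 0.3]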
+ + + +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/_kdey.html b/docs/build/html/_modules/quapy/method/_kdey.html new file mode 100644 index 0000000..4e96e56 --- /dev/null +++ b/docs/build/html/_modules/quapy/method/_kdey.html @@ -0,0 +1,462 @@ + + + + + + quapy.method._kdey — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.method._kdey

+from typing import Union
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.neighbors import KernelDensity
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import AggregativeSoftQuantifier
+import quapy.functional as F
+
+from sklearn.metrics.pairwise import rbf_kernel
+
+
+
[docs]class KDEBase: + """ + Common ancestor for KDE-based methods. Implements some common routines. + """ + + BANDWIDTH_METHOD = ['scott', 'silverman'] + + @classmethod + def _check_bandwidth(cls, bandwidth): + """ + Checks that the bandwidth parameter is correct + + :param bandwidth: either a string (see BANDWIDTH_METHOD) or a float + :return: nothing, but raises an exception for invalid values + """ + assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \ + f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values' + if isinstance(bandwidth, float): + assert 0 < bandwidth < 1, "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex" +
[docs] def get_kde_function(self, X, bandwidth): + """ + Wraps the KDE function from scikit-learn. + + :param X: data for which the density function is to be estimated + :param bandwidth: the bandwidth of the kernel + :return: a scikit-learn's KernelDensity object + """ + return KernelDensity(bandwidth=bandwidth).fit(X)
+ +
[docs] def pdf(self, kde, X): + """ + Wraps the density evaluation of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this + function returns :math:`e^{s}` + + :param kde: a previously fit KDE function + :param X: the data for which the density is to be estimated + :return: np.ndarray with the densities + """ + return np.exp(kde.score_samples(X))
+ +
[docs] def get_mixture_components(self, X, y, n_classes, bandwidth): + """ + Returns an array containing the mixture components, i.e., the KDE functions for each class. + + :param X: the data containing the covariates + :param y: the class labels + :param n_classes: integer, the number of classes + :param bandwidth: float, the bandwidth of the kernel + :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates + """ + return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
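The class-conditional KDEs above are plain scikit-learn estimators; the following standalone sketch (with mock posterior probabilities) mirrors what `get_kde_function` and `pdf` do:

import numpy as np
from sklearn.neighbors import KernelDensity

posteriors = np.random.dirichlet([1, 1, 1], size=50)  # mock points on the probability simplex
kde = KernelDensity(bandwidth=0.1).fit(posteriors)    # what get_kde_function wraps
densities = np.exp(kde.score_samples(posteriors))     # what pdf returns: exponentiated log-scores
print(densities.shape)  # (50,)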
+ + + +
[docs]class KDEyML(AggregativeSoftQuantifier, KDEBase): + """ + Kernel Density Estimation model for quantification (KDEy) relying on the Kullback-Leibler divergence (KLD) as + the divergence measure to be minimized. This method was first proposed in the paper + `Kernel Density Estimation for Multiclass Quantification <https://arxiv.org/abs/2401.00490>`_, in which + the authors show that minimizing the distribution matching criterion for KLD is akin to performing + maximum likelihood (ML). + + The distribution matching optimization problem comes down to solving: + + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + + where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) + :math:`\\alpha` defined by + + :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` + + where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the + KDE function that uses the datapoints in X as the kernel centers. + + In KDEy-ML, the divergence is taken to be the Kullback-Leibler Divergence. This is equivalent to solving: + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} - + \\mathbb{E}_{q_{\\widetilde{U}}} \\left[ \\log \\boldsymbol{p}_{\\alpha}(\\widetilde{x}) \\right]` + + which corresponds to the maximum likelihood estimate. + + :param classifier: a scikit-learn's Estimator that generates a probabilistic classifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 10), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. + :param bandwidth: float, the bandwidth of the Kernel + :param n_jobs: number of parallel workers + :param random_state: a seed to be set before fitting any base quantifier (default None) + """ + + def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None): + self._check_bandwidth(bandwidth) + self.classifier = classifier + self.val_split = val_split + self.bandwidth = bandwidth + self.n_jobs = n_jobs + self.random_state=random_state +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth) + return self
+ +
[docs] def aggregate(self, posteriors: np.ndarray): + """ + Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood + of the data (i.e., that minimizes the negative log-likelihood) + + :param posteriors: instances in the sample converted into posterior probabilities + :return: a vector of class prevalence estimates + """ + np.random.RandomState(self.random_state) + epsilon = 1e-10 + n_classes = len(self.mix_densities) + test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities] + + def neg_loglikelihood(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) + + return F.optim_minimize(neg_loglikelihood, n_classes)
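An end-to-end sketch (hedged: synthetic data, illustrative hyperparameters, and the fit(LabelledCollection) interface documented in this version):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method._kdey import KDEyML

X = np.random.randn(1500, 2)
y = (X[:, 0] + X[:, 1] > 0).astype(int)
train, test = LabelledCollection(X, y).split_stratified(0.7)

kdey = KDEyML(LogisticRegression(), bandwidth=0.1)
kdey.fit(train)
print(kdey.quantify(test.instances))  # estimated [prev(class 0), prev(class 1)]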
+ + +
[docs]class KDEyHD(AggregativeSoftQuantifier, KDEBase): + """ + Kernel Density Estimation model for quantification (KDEy) relying on the squared Hellinger Distance (HD) as + the divergence measure to be minimized. This method was first proposed in the paper + `Kernel Density Estimation for Multiclass Quantification <https://arxiv.org/abs/2401.00490>`_, in which + the authors proposed a Monte Carlo approach for minimizing the divergence. + + The distribution matching optimization problem comes down to solving: + + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + + where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) + :math:`\\alpha` defined by + + :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` + + where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the + KDE function that uses the datapoints in X as the kernel centers. + + In KDEy-HD, the divergence is taken to be the squared Hellinger Distance, an f-divergence with corresponding + f-generator function given by: + + :math:`f(u)=(\\sqrt{u}-1)^2` + + The authors proposed a Monte Carlo solution that relies on importance sampling: + + :math:`\\hat{D}_f(p||q)= \\frac{1}{t} \\sum_{i=1}^t f\\left(\\frac{p(x_i)}{q(x_i)}\\right) \\frac{q(x_i)}{r(x_i)}` + + where the datapoints (trials) :math:`x_1,\\ldots,x_t\\sim_{\\mathrm{iid}} r` with :math:`r` the + uniform distribution. + + :param classifier: a scikit-learn's Estimator that generates a probabilistic classifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 10), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. + :param divergence: a string representing the divergence to minimize (currently, only 'HD', i.e., the + squared Hellinger Distance, is supported) + :param bandwidth: float, the bandwidth of the Kernel + :param n_jobs: number of parallel workers + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param montecarlo_trials: number of Monte Carlo trials (default 10000) + """ + + def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD', + bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000): + + self._check_bandwidth(bandwidth) + self.classifier = classifier + self.val_split = val_split + self.divergence = divergence + self.bandwidth = bandwidth + self.n_jobs = n_jobs + self.random_state=random_state + self.montecarlo_trials = montecarlo_trials +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth) + + N = self.montecarlo_trials + rs = self.random_state + n = data.n_classes + self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.mix_densities]) + self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities]) + self.reference_density = np.mean(self.reference_classwise_densities, axis=0) # equiv. to (uniform @ self.reference_classwise_densities) + + return self
+ +
[docs] def aggregate(self, posteriors: np.ndarray): + # we retain all n*N examples (sampled from a mixture with uniform parameter), and then + # apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS + n_classes = len(self.mix_densities) + + test_kde = self.get_kde_function(posteriors, self.bandwidth) + test_densities = self.pdf(test_kde, self.reference_samples) + + def f_squared_hellinger(u): + return (np.sqrt(u)-1)**2 + + # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway + if self.divergence.lower() == 'hd': + f = f_squared_hellinger + else: + raise ValueError('only squared HD is currently implemented') + + epsilon = 1e-10 + qs = test_densities + epsilon + rs = self.reference_density + epsilon + iw = qs/rs #importance weights + p_class = self.reference_classwise_densities + epsilon + fracs = p_class/qs + + def divergence(prev): + # ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs + ps_div_qs = prev @ fracs + return np.mean( f(ps_div_qs) * iw ) + + return F.optim_minimize(divergence, n_classes)
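The importance-sampling estimator at the heart of KDEy-HD can be checked in isolation; a hedged numeric sketch with known closed-form densities (instead of the KDEs used above):

import numpy as np

rng = np.random.default_rng(0)
f = lambda u: (np.sqrt(u) - 1) ** 2          # f-generator of the squared HD

x = rng.uniform(size=100_000)                # trials drawn from r = U(0,1)
p = 2 * x                                    # density of Beta(2,1)
q = 2 * (1 - x)                              # density of Beta(1,2)
r = np.ones_like(x)                          # density of U(0,1)

eps = 1e-10
est = np.mean(f((p + eps) / (q + eps)) * ((q + eps) / r))
print(est)  # ~0.43, close to the exact value 2 - pi/2 for these two densities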
+ + +
[docs]class KDEyCS(AggregativeSoftQuantifier): + """ + Kernel Density Estimation model for quantification (KDEy) relying on the Cauchy-Schwarz divergence (CS) as + the divergence measure to be minimized. This method was first proposed in the paper + `Kernel Density Estimation for Multiclass Quantification <https://arxiv.org/abs/2401.00490>`_, in which + the authors proposed a Monte Carlo approach for minimizing the divergence. + + The distribution matching optimization problem comes down to solving: + + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + + where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) + :math:`\\alpha` defined by + + :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` + + where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the + KDE function that uses the datapoints in X as the kernel centers. + + In KDEy-CS, the divergence is taken to be the Cauchy-Schwarz divergence given by: + + :math:`\\mathcal{D}_{\\mathrm{CS}}(p||q)=-\\log\\left(\\frac{\\int p(x)q(x)dx}{\\sqrt{\\int p(x)^2dx \\int q(x)^2dx}}\\right)` + + The authors showed that this distribution matching admits a closed-form solution. + + :param classifier: a scikit-learn's Estimator that generates a probabilistic classifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 10), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. + :param bandwidth: float, the bandwidth of the Kernel + :param n_jobs: number of parallel workers + """ + + def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None): + KDEBase._check_bandwidth(bandwidth) + self.classifier = classifier + self.val_split = val_split + self.bandwidth = bandwidth + self.n_jobs = n_jobs +
[docs] def gram_matrix_mix_sum(self, X, Y=None): + # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) + # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are + # two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth) + h = self.bandwidth + variance = 2 * (h**2) + nD = X.shape[1] + gamma = 1/(2*variance) + norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD))) + gram = norm_factor * rbf_kernel(X, Y, gamma=gamma) + return gram.sum()
+ +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + + P, y = classif_predictions.Xy + n = data.n_classes + + assert all(sorted(np.unique(y)) == np.arange(n)), \ + 'label name gaps not allowed in current implementation' + + # counts_inv keeps track of the relative weight of each datapoint within its class + # (i.e., the weight in its KDE model) + counts_inv = 1 / (data.counts()) + + # tr_tr_sums corresponds to symbol \overline{B} in the paper + tr_tr_sums = np.zeros(shape=(n,n), dtype=float) + for i in range(n): + for j in range(n): + if i > j: + tr_tr_sums[i,j] = tr_tr_sums[j,i] + else: + block = self.gram_matrix_mix_sum(P[y == i], P[y == j] if i!=j else None) + tr_tr_sums[i, j] = block + + # keep track of these data structures for the test phase + self.Ptr = P + self.ytr = y + self.tr_tr_sums = tr_tr_sums + self.counts_inv = counts_inv + + return self
+ + +
[docs] def aggregate(self, posteriors: np.ndarray): + Ptr = self.Ptr + Pte = posteriors + y = self.ytr + tr_tr_sums = self.tr_tr_sums + + M, nD = Pte.shape + Minv = (1/M) # t in the paper + n = Ptr.shape[1] + + # becomes a constant that does not affect the optimization, no need to compute it + # partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv) + + # tr_te_sums corresponds to \overline{a}*(1/Li)*(1/M) in the paper (note the constants + # are already aggregated to tr_te_sums, so these multiplications are not carried out + # at each iteration of the optimization phase) + tr_te_sums = np.zeros(shape=n, dtype=float) + for i in range(n): + tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte) + + def divergence(alpha): + # called \overline{r} in the paper + alpha_ratio = alpha * self.counts_inv + + # recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M) + partA = -np.log((alpha_ratio @ tr_te_sums) * Minv) + partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio) + return partA + partB #+ partC + + return F.optim_minimize(divergence, n)
+ +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/_neural.html b/docs/build/html/_modules/quapy/method/_neural.html new file mode 100644 index 0000000..706a7cc --- /dev/null +++ b/docs/build/html/_modules/quapy/method/_neural.html @@ -0,0 +1,520 @@ + + + + + + quapy.method._neural — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.method._neural

+import os
+from pathlib import Path
+import random
+
+import torch
+from torch.nn import MSELoss
+from torch.nn.functional import relu
+
+from quapy.protocol import UPP
+from quapy.method.aggregative import *
+from quapy.util import EarlyStop
+from tqdm import tqdm
+
+
+
[docs]class QuaNetTrainer(BaseQuantifier): + """ + Implementation of `QuaNet <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_, a neural network for + quantification. This implementation uses `PyTorch <https://pytorch.org/>`_ and can take advantage of GPU + for speeding up the training phase. + + Example: + + >>> import quapy as qp + >>> from quapy.method.meta import QuaNet + >>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet + >>> + >>> # use samples of 100 elements + >>> qp.environ['SAMPLE_SIZE'] = 100 + >>> + >>> # load the kindle dataset as text, and convert words to numerical indexes + >>> dataset = qp.datasets.fetch_reviews('kindle', pickle=True) + >>> qp.data.preprocessing.index(dataset, min_df=5, inplace=True) + >>> + >>> # the text classifier is a CNN trained by NeuralClassifierTrainer + >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) + >>> classifier = NeuralClassifierTrainer(cnn, device='cuda') + >>> + >>> # train QuaNet (QuaNet is an alias to QuaNetTrainer) + >>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda') + >>> model.fit(dataset.training) + >>> estim_prevalence = model.quantify(dataset.test.instances) + + :param classifier: an object implementing `fit` (i.e., that can be trained on labelled data), + `predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and + `transform` (i.e., that can generate embedded representations of the unlabelled instances). + :param sample_size: integer, the sample size; default is None, meaning that the sample size should be + taken from qp.environ["SAMPLE_SIZE"] + :param n_epochs: integer, maximum number of training epochs + :param tr_iter_per_poch: integer, number of training iterations before considering an epoch complete + :param va_iter_per_poch: integer, number of validation iterations to perform after each epoch + :param lr: float, the learning rate + :param lstm_hidden_size: integer, hidden dimensionality of the LSTM cells + :param lstm_nlayers: integer, number of LSTM layers + :param ff_layers: list of integers, dimensions of the densely-connected FF layers on top of the + quantification embedding + :param bidirectional: boolean, indicates whether the LSTM is bidirectional or not + :param qdrop_p: float, dropout probability + :param patience: integer, number of epochs showing no improvement in the validation set before stopping the + training phase (early stopping) + :param checkpointdir: string, a path where to store models' checkpoints + :param checkpointname: string (optional), the name of the model's checkpoint + :param device: string, indicate "cpu" or "cuda" + """ + + def __init__(self, + classifier, + sample_size=None, + n_epochs=100, + tr_iter_per_poch=500, + va_iter_per_poch=100, + lr=1e-3, + lstm_hidden_size=64, + lstm_nlayers=1, + ff_layers=[1024, 512], + bidirectional=True, + qdrop_p=0.5, + patience=10, + checkpointdir='../checkpoint', + checkpointname=None, + device='cuda'): + + assert hasattr(classifier, 'transform'), \ + f'the classifier {classifier.__class__.__name__} does not seem to be able to produce document embeddings ' \ + f'since it does not implement the method "transform"' + assert hasattr(classifier, 'predict_proba'), \ + f'the classifier {classifier.__class__.__name__} does not seem to be able to produce posterior probabilities ' \ + f'since it does not implement the method "predict_proba"' + self.classifier = classifier + self.sample_size = qp._get_sample_size(sample_size) + self.n_epochs = n_epochs + 
self.tr_iter = tr_iter_per_poch + self.va_iter = va_iter_per_poch + self.lr = lr + self.quanet_params = { + 'lstm_hidden_size': lstm_hidden_size, + 'lstm_nlayers': lstm_nlayers, + 'ff_layers': ff_layers, + 'bidirectional': bidirectional, + 'qdrop_p': qdrop_p + } + + self.patience = patience + if checkpointname is None: + local_random = random.Random() + random_code = '-'.join(str(local_random.randint(0, 1000000)) for _ in range(5)) + checkpointname = 'QuaNet-'+random_code + self.checkpointdir = checkpointdir + self.checkpoint = os.path.join(checkpointdir, checkpointname) + self.device = torch.device(device) + + self.__check_params_colision(self.quanet_params, self.classifier.get_params()) + self._classes_ = None + +
[docs] def fit(self, data: LabelledCollection, fit_classifier=True): + """ + Trains QuaNet. + + :param data: the training data on which to train QuaNet. If `fit_classifier=True`, the data will be split in + 40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If + `fit_classifier=False`, the data will be split in 66/34 for training QuaNet and validating it, respectively. + :param fit_classifier: if True, trains the classifier on a split containing 40% of the data + :return: self + """ + self._classes_ = data.classes_ + os.makedirs(self.checkpointdir, exist_ok=True) + + if fit_classifier: + classifier_data, unused_data = data.split_stratified(0.4) + train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20% + self.classifier.fit(*classifier_data.Xy) + else: + classifier_data = None + train_data, valid_data = data.split_stratified(0.66) + + # estimate the hard and soft stats tpr and fpr of the classifier + self.tr_prev = data.prevalence() + + # compute the posterior probabilities of the instances + valid_posteriors = self.classifier.predict_proba(valid_data.instances) + train_posteriors = self.classifier.predict_proba(train_data.instances) + + # turn instances' original representations into embeddings + valid_data_embed = LabelledCollection(self.classifier.transform(valid_data.instances), valid_data.labels, self._classes_) + train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_) + + self.quantifiers = { + 'cc': CC(self.classifier).fit(None, fit_classifier=False), + 'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), + 'pcc': PCC(self.classifier).fit(None, fit_classifier=False), + 'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), + } + if classifier_data is not None: + self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False) + + self.status = { + 'tr-loss': -1, + 'va-loss': -1, + 'tr-mae': -1, + 'va-mae': -1, + } + + nQ = len(self.quantifiers) + nC = data.n_classes + self.quanet = QuaNetModule( + doc_embedding_size=train_data_embed.instances.shape[1], + n_classes=data.n_classes, + stats_size=nQ*nC, + order_by=0 if data.binary else None, + **self.quanet_params + ).to(self.device) + print(self.quanet) + + self.optim = torch.optim.Adam(self.quanet.parameters(), lr=self.lr) + early_stop = EarlyStop(self.patience, lower_is_better=True) + + checkpoint = self.checkpoint + + for epoch_i in range(1, self.n_epochs): + self._epoch(train_data_embed, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True) + self._epoch(valid_data_embed, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False) + + early_stop(self.status['va-loss'], epoch_i) + if early_stop.IMPROVED: + torch.save(self.quanet.state_dict(), checkpoint) + elif early_stop.STOP: + print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} ' + f'for epoch {early_stop.best_epoch}') + self.quanet.load_state_dict(torch.load(checkpoint)) + break + + return self
+ + def _get_aggregative_estims(self, posteriors): + label_predictions = np.argmax(posteriors, axis=-1) + prevs_estim = [] + for quantifier in self.quantifiers.values(): + predictions = posteriors if isinstance(quantifier, AggregativeSoftQuantifier) else label_predictions + prevs_estim.extend(quantifier.aggregate(predictions)) + + # there is no real need for adding static estims like the TPR or FPR from training since those are constant + + return prevs_estim + +
[docs] def quantify(self, instances): + posteriors = self.classifier.predict_proba(instances) + embeddings = self.classifier.transform(instances) + quant_estims = self._get_aggregative_estims(posteriors) + self.quanet.eval() + with torch.no_grad(): + prevalence = self.quanet.forward(embeddings, posteriors, quant_estims) + if self.device == torch.device('cuda'): + prevalence = prevalence.cpu() + prevalence = prevalence.numpy().flatten() + return prevalence
+ + def _epoch(self, data: LabelledCollection, posteriors, iterations, epoch, early_stop, train): + mse_loss = MSELoss() + + self.quanet.train(mode=train) + losses = [] + mae_errors = [] + sampler = UPP( + data, + sample_size=self.sample_size, + repeats=iterations, + random_state=None if train else 0 # different samples during train, same samples during validation + ) + pbar = tqdm(sampler.samples_parameters(), total=sampler.total()) + for it, index in enumerate(pbar): + sample_data = data.sampling_from_index(index) + sample_posteriors = posteriors[index] + quant_estims = self._get_aggregative_estims(sample_posteriors) + ptrue = torch.as_tensor([sample_data.prevalence()], dtype=torch.float, device=self.device) + if train: + self.optim.zero_grad() + phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims) + loss = mse_loss(phat, ptrue) + mae = mae_loss(phat, ptrue) + loss.backward() + self.optim.step() + else: + with torch.no_grad(): + phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims) + loss = mse_loss(phat, ptrue) + mae = mae_loss(phat, ptrue) + + losses.append(loss.item()) + mae_errors.append(mae.item()) + + mse = np.mean(losses) + mae = np.mean(mae_errors) + if train: + self.status['tr-loss'] = mse + self.status['tr-mae'] = mae + else: + self.status['va-loss'] = mse + self.status['va-mae'] = mae + + if train: + pbar.set_description(f'[QuaNet] ' + f'epoch={epoch} [it={it}/{iterations}]\t' + f'tr-mseloss={self.status["tr-loss"]:.5f} tr-maeloss={self.status["tr-mae"]:.5f}\t' + f'val-mseloss={self.status["va-loss"]:.5f} val-maeloss={self.status["va-mae"]:.5f} ' + f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}') + +
[docs] def get_params(self, deep=True): + classifier_params = self.classifier.get_params() + classifier_params = {'classifier__'+k:v for k,v in classifier_params.items()} + return {**classifier_params, **self.quanet_params}
+ +
[docs] def set_params(self, **parameters):
+        learner_params = {}
+        for key, val in parameters.items():
+            if key in self.quanet_params:
+                self.quanet_params[key] = val
+            elif key.startswith('classifier__'):
+                learner_params[key.replace('classifier__', '')] = val
+            else:
+                raise ValueError(f'unknown parameter {key}')
+        self.classifier.set_params(**learner_params)
+
+    def __check_params_colision(self, quanet_params, learner_params):
+        quanet_keys = set(quanet_params.keys())
+        learner_keys = set(learner_params.keys())
+        intersection = quanet_keys.intersection(learner_keys)
+        if len(intersection) > 0:
+            raise ValueError(f'the use of parameters {intersection} is ambiguous since they can refer to '
+                             f'the parameters of QuaNet or to those of the learner {self.classifier.__class__.__name__}')
+
[docs] def clean_checkpoint(self): + """ + Removes the checkpoint + """ + os.remove(self.checkpoint)
+ +
[docs] def clean_checkpoint_dir(self): + """ + Removes anything contained in the checkpoint directory + """ + import shutil + shutil.rmtree(self.checkpointdir, ignore_errors=True)
+ + @property + def classes_(self): + return self._classes_
+ + +
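+
+# --- usage sketch (editorial addition, not part of the original module) ----------------
+# A hedged example of the QuaNetTrainer workflow. It assumes an indexed text dataset and
+# a neural classifier exposing both predict_proba() and transform(), such as quapy's
+# NeuralClassifierTrainer; the constructor options sketched here (e.g., sample_size)
+# should be checked against the actual signature.
+def _example_quanet_usage():
+    import quapy as qp
+    from quapy.classification.neural import NeuralClassifierTrainer, CNNnet
+
+    dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
+    dataset = qp.data.preprocessing.index(dataset, min_df=5)
+    cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
+    classifier = NeuralClassifierTrainer(cnn, device='cpu')
+    quanet = QuaNetTrainer(classifier, sample_size=100, device='cpu')
+    quanet.fit(dataset.training)  # 40/40/20 split, as documented in fit()
+    return quanet.quantify(dataset.test.instances)
+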
[docs]def mae_loss(output, target): + """ + Torch-like wrapper for the Mean Absolute Error + + :param output: predictions + :param target: ground truth values + :return: mean absolute error loss + """ + return torch.mean(torch.abs(output - target))
+ + +
[docs]class QuaNetModule(torch.nn.Module): + """ + Implements the `QuaNet <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_ forward pass. + See :class:`QuaNetTrainer` for training QuaNet. + + :param doc_embedding_size: integer, the dimensionality of the document embeddings + :param n_classes: integer, number of classes + :param stats_size: integer, number of statistics estimated by simple quantification methods + :param lstm_hidden_size: integer, hidden dimensionality of the LSTM cell + :param lstm_nlayers: integer, number of LSTM layers + :param ff_layers: list of integers, dimensions of the densely-connected FF layers on top of the + quantification embedding + :param bidirectional: boolean, whether or not to use bidirectional LSTM + :param qdrop_p: float, dropout probability + :param order_by: integer, class for which the document embeddings are to be sorted + """ + + def __init__(self, + doc_embedding_size, + n_classes, + stats_size, + lstm_hidden_size=64, + lstm_nlayers=1, + ff_layers=[1024, 512], + bidirectional=True, + qdrop_p=0.5, + order_by=0): + + super().__init__() + + self.n_classes = n_classes + self.order_by = order_by + self.hidden_size = lstm_hidden_size + self.nlayers = lstm_nlayers + self.bidirectional = bidirectional + self.ndirections = 2 if self.bidirectional else 1 + self.qdrop_p = qdrop_p + self.lstm = torch.nn.LSTM(doc_embedding_size + n_classes, # +n_classes stands for the posterior probs. (concatenated) + lstm_hidden_size, lstm_nlayers, bidirectional=bidirectional, + dropout=qdrop_p, batch_first=True) + self.dropout = torch.nn.Dropout(self.qdrop_p) + + lstm_output_size = self.hidden_size * self.ndirections + ff_input_size = lstm_output_size + stats_size + prev_size = ff_input_size + self.ff_layers = torch.nn.ModuleList() + for lin_size in ff_layers: + self.ff_layers.append(torch.nn.Linear(prev_size, lin_size)) + prev_size = lin_size + self.output = torch.nn.Linear(prev_size, n_classes) + + @property + def device(self): + return torch.device('cuda') if next(self.parameters()).is_cuda else torch.device('cpu') + + def _init_hidden(self): + directions = 2 if self.bidirectional else 1 + var_hidden = torch.zeros(self.nlayers * directions, 1, self.hidden_size) + var_cell = torch.zeros(self.nlayers * directions, 1, self.hidden_size) + if next(self.lstm.parameters()).is_cuda: + var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda() + return var_hidden, var_cell + +
[docs] def forward(self, doc_embeddings, doc_posteriors, statistics):
+        device = self.device
+        doc_embeddings = torch.as_tensor(doc_embeddings, dtype=torch.float, device=device)
+        doc_posteriors = torch.as_tensor(doc_posteriors, dtype=torch.float, device=device)
+        statistics = torch.as_tensor(statistics, dtype=torch.float, device=device)
+
+        if self.order_by is not None:
+            order = torch.argsort(doc_posteriors[:, self.order_by])
+            doc_embeddings = doc_embeddings[order]
+            doc_posteriors = doc_posteriors[order]
+
+        embedded_posteriors = torch.cat((doc_embeddings, doc_posteriors), dim=-1)
+
+        # in quapy, an entire sample counts as a single instance, so the batch size is 1;
+        # the shape should be (1, number-of-instances, embedding-size + n_classes)
+        embedded_posteriors = embedded_posteriors.unsqueeze(0)
+
+        self.lstm.flatten_parameters()
+        _, (rnn_hidden, _) = self.lstm(embedded_posteriors, self._init_hidden())
+        rnn_hidden = rnn_hidden.view(self.nlayers, self.ndirections, 1, self.hidden_size)
+        quant_embedding = rnn_hidden[0].view(-1)
+        quant_embedding = torch.cat((quant_embedding, statistics))
+
+        abstracted = quant_embedding.unsqueeze(0)
+        for linear in self.ff_layers:
+            abstracted = self.dropout(relu(linear(abstracted)))
+
+        logits = self.output(abstracted).view(1, -1)
+        prevalence = torch.softmax(logits, -1)
+
+        return prevalence
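+
+# --- shape-check sketch (editorial addition, not part of the original module) ----------
+# A small hedged example exercising QuaNetModule.forward with random inputs, to make the
+# expected shapes explicit: n documents, embedding size 100, 2 classes, and a statistics
+# vector of length stats_size.
+def _example_quanet_module_shapes():
+    n, emb_size, n_classes, stats_size = 50, 100, 2, 8
+    module = QuaNetModule(doc_embedding_size=emb_size, n_classes=n_classes, stats_size=stats_size)
+    embeddings = np.random.rand(n, emb_size)
+    posteriors = np.random.dirichlet(np.ones(n_classes), size=n)
+    stats = np.random.rand(stats_size)
+    prevalence = module(embeddings, posteriors, stats)
+    return prevalence  # a tensor of shape (1, n_classes) whose single row sums to 1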
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/method/_threshold_optim.html b/docs/build/html/_modules/quapy/method/_threshold_optim.html
new file mode 100644
index 0000000..486aa61
--- /dev/null
+++ b/docs/build/html/_modules/quapy/method/_threshold_optim.html
@@ -0,0 +1,364 @@
+quapy.method._threshold_optim — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation
Source code for quapy.method._threshold_optim

+from abc import abstractmethod
+
+import numpy as np
+from sklearn.base import BaseEstimator
+import quapy as qp
+import quapy.functional as F
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import BinaryAggregativeQuantifier
+
+
+
[docs]class ThresholdOptimization(BinaryAggregativeQuantifier):
+    """
+    Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by
+    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
+    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_.
+    The goal is to bring improved stability to the denominator of the adjustment.
+    The different variants are based on different heuristics for choosing a decision threshold
+    that would allow for more true positives and many more false positives, on the grounds that
+    this delivers larger (and hence more stable) denominators.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
+        misclassification rates are to be estimated.
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
+        validation data, or as an integer, indicating that the misclassification rates should be estimated via
+        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults to 5), or as a
+        :class:`quapy.data.base.LabelledCollection` (the split itself).
+    :param n_jobs: number of parallel workers
+    """
+
+    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
+        self.classifier = classifier
+        self.val_split = val_split
+        self.n_jobs = qp._get_njobs(n_jobs)
+
[docs] @abstractmethod + def condition(self, tpr, fpr) -> float: + """ + Implements the criterion according to which the threshold should be selected. + This function should return the (float) score to be minimized. + + :param tpr: float, true positive rate + :param fpr: float, false positive rate + :return: float, a score for the given `tpr` and `fpr` + """ + ...
+ +
[docs] def discard(self, tpr, fpr) -> bool: + """ + Indicates whether a combination of tpr and fpr should be discarded + + :param tpr: float, true positive rate + :param fpr: float, false positive rate + :return: true if the combination is to be discarded, false otherwise + """ + return (tpr - fpr) == 0
+
+    def _eval_candidate_thresholds(self, decision_scores, y):
+        """
+        Searches for the best `tpr` and `fpr` according to the score obtained at different
+        decision thresholds. The scoring function is implemented in :meth:`condition`.
+
+        :param decision_scores: array-like with the classification scores
+        :param y: true labels of the validation set (or of the training set, when using `k`-fold cross validation)
+        :return: best `tpr`, `fpr`, and `threshold` according to :meth:`condition`
+        """
+        candidate_thresholds = np.unique(decision_scores)
+
+        candidates = []
+        scores = []
+        for candidate_threshold in candidate_thresholds:
+            y_ = self.classes_[1 * (decision_scores >= candidate_threshold)]
+            TP, FP, FN, TN = self._compute_table(y, y_)
+            tpr = self._compute_tpr(TP, FN)
+            fpr = self._compute_fpr(FP, TN)
+            if not self.discard(tpr, fpr):
+                candidate_score = self.condition(tpr, fpr)
+                candidates.append([tpr, fpr, candidate_threshold])
+                scores.append(candidate_score)
+
+        if len(candidates) == 0:
+            # if no candidate gives rise to a valid combination of tpr and fpr, this method defaults to the standard
+            # classify & count; this is akin to assigning tpr=1, fpr=0, threshold=0
+            tpr, fpr, threshold = 1, 0, 0
+            candidates.append([tpr, fpr, threshold])
+            scores.append(0)
+
+        candidates = np.asarray(candidates)
+        candidates = candidates[np.argsort(scores)]  # sort candidates by candidate_score
+
+        return candidates
+
[docs] def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds): + # This function performs the adjusted count for given tpr, fpr, and threshold. + # Note that, due to broadcasting, tprs, fprs, and thresholds could be arrays of length > 1 + prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0) + prevs_estims = (prevs_estims - fprs) / (tprs - fprs) + prevs_estims = F.as_binary_prevalence(prevs_estims, clip_if_necessary=True) + return prevs_estims.squeeze()
+
+    def _compute_table(self, y, y_):
+        TP = np.logical_and(y == y_, y == self.pos_label).sum()
+        FP = np.logical_and(y != y_, y == self.neg_label).sum()
+        FN = np.logical_and(y != y_, y == self.pos_label).sum()
+        TN = np.logical_and(y == y_, y == self.neg_label).sum()
+        return TP, FP, FN, TN
+
+    def _compute_tpr(self, TP, FN):
+        # note: the tpr is TP/(TP+FN); the second argument must be the false negatives
+        if TP + FN == 0:
+            return 1
+        return TP / (TP + FN)
+
+    def _compute_fpr(self, FP, TN):
+        if FP + TN == 0:
+            return 0
+        return FP / (FP + TN)
+
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + decision_scores, y = classif_predictions.Xy + # the standard behavior is to keep the best threshold only + self.tpr, self.fpr, self.threshold = self._eval_candidate_thresholds(decision_scores, y)[0] + return self
+ +
[docs] def aggregate(self, classif_predictions: np.ndarray): + # the standard behavior is to compute the adjusted count using the best threshold found + return self.aggregate_with_threshold(classif_predictions, self.tpr, self.fpr, self.threshold)
+ + +
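+
+# --- worked example (editorial addition, not part of the original module) ---------------
+# The correction implemented by aggregate_with_threshold is the standard adjusted count at
+# a given threshold: prev = (cc_prev - fpr) / (tpr - fpr). For instance, if 40% of the test
+# items score above the threshold (cc_prev=0.4) and the validation estimates are tpr=0.7
+# and fpr=0.1, then prev = (0.4 - 0.1) / (0.7 - 0.1) = 0.5.
+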
[docs]class T50(ThresholdOptimization):
+    """
+    Threshold Optimization variant for :class:`ACC` as proposed by
+    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
+    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
+    for the threshold that makes `tpr` closest to 0.5.
+    The goal is to bring improved stability to the denominator of the adjustment.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
+        misclassification rates are to be estimated.
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
+        validation data, or as an integer, indicating that the misclassification rates should be estimated via
+        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults to 5), or as a
+        :class:`quapy.data.base.LabelledCollection` (the split itself).
+    """
+
+    def __init__(self, classifier: BaseEstimator, val_split=5):
+        super().__init__(classifier, val_split)
+
[docs] def condition(self, tpr, fpr) -> float: + return abs(tpr - 0.5)
+ + +
[docs]class MAX(ThresholdOptimization):
+    """
+    Threshold Optimization variant for :class:`ACC` as proposed by
+    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
+    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
+    for the threshold that maximizes `tpr-fpr`.
+    The goal is to bring improved stability to the denominator of the adjustment.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
+        misclassification rates are to be estimated.
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
+        validation data, or as an integer, indicating that the misclassification rates should be estimated via
+        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults to 5), or as a
+        :class:`quapy.data.base.LabelledCollection` (the split itself).
+    """
+
+    def __init__(self, classifier: BaseEstimator, val_split=5):
+        super().__init__(classifier, val_split)
+
[docs] def condition(self, tpr, fpr) -> float:
+        # MAX strives to maximize (tpr - fpr), which is equivalent to minimizing (fpr - tpr)
+        return (fpr - tpr)
+ + +
[docs]class X(ThresholdOptimization):
+    """
+    Threshold Optimization variant for :class:`ACC` as proposed by
+    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
+    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
+    for the threshold that yields `tpr=1-fpr`.
+    The goal is to bring improved stability to the denominator of the adjustment.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
+        misclassification rates are to be estimated.
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
+        validation data, or as an integer, indicating that the misclassification rates should be estimated via
+        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults to 5), or as a
+        :class:`quapy.data.base.LabelledCollection` (the split itself).
+    """
+
+    def __init__(self, classifier: BaseEstimator, val_split=5):
+        super().__init__(classifier, val_split)
+
[docs] def condition(self, tpr, fpr) -> float: + return abs(1 - (tpr + fpr))
+ + +
[docs]class MS(ThresholdOptimization):
+    """
+    Median Sweep. Threshold Optimization variant for :class:`ACC` as proposed by
+    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
+    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that generates
+    class prevalence estimates for all decision thresholds and returns the median of them all.
+    The goal is to bring improved stability to the denominator of the adjustment.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
+        misclassification rates are to be estimated.
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
+        validation data, or as an integer, indicating that the misclassification rates should be estimated via
+        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults to 5), or as a
+        :class:`quapy.data.base.LabelledCollection` (the split itself).
+    """
+    def __init__(self, classifier: BaseEstimator, val_split=5):
+        super().__init__(classifier, val_split)
+
[docs] def condition(self, tpr, fpr) -> float: + return 1
+ +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + decision_scores, y = classif_predictions.Xy + # keeps all candidates + tprs_fprs_thresholds = self._eval_candidate_thresholds(decision_scores, y) + self.tprs = tprs_fprs_thresholds[:, 0] + self.fprs = tprs_fprs_thresholds[:, 1] + self.thresholds = tprs_fprs_thresholds[:, 2] + return self
+ +
[docs] def aggregate(self, classif_predictions: np.ndarray): + prevalences = self.aggregate_with_threshold(classif_predictions, self.tprs, self.fprs, self.thresholds) + if prevalences.ndim==2: + prevalences = np.median(prevalences, axis=0) + return prevalences
+ + +
[docs]class MS2(MS):
+    """
+    Median Sweep 2. Threshold Optimization variant for :class:`ACC` as proposed by
+    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
+    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that generates
+    class prevalence estimates for all decision thresholds and returns the median of those cases in
+    which `tpr-fpr>0.25`.
+    The goal is to bring improved stability to the denominator of the adjustment.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
+        misclassification rates are to be estimated.
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
+        validation data, or as an integer, indicating that the misclassification rates should be estimated via
+        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults to 5), or as a
+        :class:`quapy.data.base.LabelledCollection` (the split itself).
+    """
+    def __init__(self, classifier: BaseEstimator, val_split=5):
+        super().__init__(classifier, val_split)
+
[docs] def discard(self, tpr, fpr) -> bool: + return (tpr-fpr) <= 0.25
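+
+# --- usage sketch (editorial addition, not part of the original module) -----------------
+# A hedged example of the variants defined above; `train` (a binary LabelledCollection)
+# and `X_test` are hypothetical placeholders.
+def _example_threshold_variants(train, X_test):
+    from sklearn.linear_model import LogisticRegression
+    for method in [T50, MAX, X, MS, MS2]:
+        quantifier = method(LogisticRegression(), val_split=5)
+        quantifier.fit(train)  # fits the classifier and sweeps the candidate thresholds
+        print(method.__name__, quantifier.quantify(X_test))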
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/method/aggregative.html b/docs/build/html/_modules/quapy/method/aggregative.html
new file mode 100644
index 0000000..8311baa
--- /dev/null
+++ b/docs/build/html/_modules/quapy/method/aggregative.html
@@ -0,0 +1,1440 @@
+quapy.method.aggregative — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation
Source code for quapy.method.aggregative

+from abc import ABC, abstractmethod
+from copy import deepcopy
+from typing import Callable, Union
+import numpy as np
+from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
+from scipy import optimize
+from sklearn.base import BaseEstimator
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.metrics import confusion_matrix
+from sklearn.model_selection import cross_val_predict
+
+import quapy as qp
+import quapy.functional as F
+from quapy.functional import get_divergence
+from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
+from quapy.classification.svmperf import SVMperf
+from quapy.data import LabelledCollection
+from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
+
+
+# Abstract classes
+# ------------------------------------
+
+
[docs]class AggregativeQuantifier(BaseQuantifier, ABC):
+    """
+    Abstract class for quantification methods that base their estimations on the aggregation of classification
+    results. Aggregative quantifiers implement a pipeline that consists of generating classification predictions
+    and aggregating them. For this reason, the training phase is implemented by :meth:`classifier_fit_predict` followed
+    by :meth:`aggregation_fit`, while the testing phase is implemented by :meth:`classify` followed by
+    :meth:`aggregate`. Subclasses of this abstract class must provide implementations for these methods.
+    Aggregative quantifiers also maintain a :attr:`classifier` attribute.
+
+    The method :meth:`fit` comes with a default implementation based on :meth:`classifier_fit_predict`
+    and :meth:`aggregation_fit`.
+
+    The method :meth:`quantify` comes with a default implementation based on :meth:`classify`
+    and :meth:`aggregate`.
+    """
+
+    val_split_ = None
+
+    @property
+    def val_split(self):
+        return self.val_split_
+
+    @val_split.setter
+    def val_split(self, val_split):
+        if isinstance(val_split, LabelledCollection):
+            print('warning: setting val_split with a LabelledCollection will be inefficient in '
+                  'model selection. Rather, pass the LabelledCollection at fit time')
+        self.val_split_ = val_split
+
+    def _check_init_parameters(self):
+        """
+        Implements any check to be performed on the parameters of the init method before undertaking
+        the training of the quantifier. This allows for a quick execution stop when the
+        parameters are not valid.
+
+        :return: Nothing. May raise an exception.
+        """
+        pass
+
+    def _check_non_empty_classes(self, data: LabelledCollection):
+        """
+        Asserts all classes have positive instances.
+
+        :param data: LabelledCollection
+        :return: Nothing. May raise an exception.
+        """
+        sample_prevs = data.prevalence()
+        empty_classes = np.argwhere(sample_prevs==0).flatten()
+        if len(empty_classes)>0:
+            empty_class_names = data.classes_[empty_classes]
+            raise ValueError(f'classes {empty_class_names} have no training examples')
+
[docs] def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
+        """
+        Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.
+
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param fit_classifier: whether to train the learner (default is True). Set to False if the
+            learner has been trained outside the quantifier.
+        :param val_split: specifies the data to be used for generating the classifier predictions on which
+            the aggregation function is fit (see :meth:`classifier_fit_predict`); if None (default), the
+            value indicated at construction time is used
+        :return: self
+        """
+        self._check_init_parameters()
+        classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split)
+        self.aggregation_fit(classif_predictions, data)
+        return self
+ +
[docs] def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, predict_on=None):
+        """
+        Trains the classifier if requested (`fit_classifier=True`) and generates the predictions needed to
+        train the aggregation function.
+
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param fit_classifier: whether to train the learner (default is True). Set to False if the
+            learner has been trained outside the quantifier.
+        :param predict_on: specifies the set on which predictions need to be issued. This parameter can
+            be specified as None (default) to indicate no prediction is needed; a float in (0, 1) to
+            indicate the proportion of instances to be used for predictions (the remainder is used for
+            training); an integer >1 to indicate that the predictions must be generated via k-fold
+            cross-validation, using this integer as k; or the data sample itself on which to generate
+            the predictions.
+        """
+        assert isinstance(fit_classifier, bool), 'unexpected type for "fit_classifier", must be boolean'
+
+        self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba'))
+
+        if fit_classifier:
+            self._check_non_empty_classes(data)
+
+        if predict_on is None:
+            predict_on = self.val_split
+
+        if predict_on is None:
+            if fit_classifier:
+                self.classifier.fit(*data.Xy)
+            predictions = None
+        elif isinstance(predict_on, float):
+            if fit_classifier:
+                if not (0. < predict_on < 1.):
+                    raise ValueError(f'proportion {predict_on=} out of range, must be in (0,1)')
+                train, val = data.split_stratified(train_prop=(1 - predict_on))
+                self.classifier.fit(*train.Xy)
+                predictions = LabelledCollection(self.classify(val.X), val.y, classes=data.classes_)
+            else:
+                raise ValueError(f'wrong type for predict_on: since fit_classifier=False, '
+                                 f'the set on which predictions have to be issued must be '
+                                 f'explicitly indicated')
+
+        elif isinstance(predict_on, LabelledCollection):
+            if fit_classifier:
+                self.classifier.fit(*data.Xy)
+            predictions = LabelledCollection(self.classify(predict_on.X), predict_on.y, classes=predict_on.classes_)
+
+        elif isinstance(predict_on, int):
+            if fit_classifier:
+                if predict_on <= 1:
+                    raise ValueError(f'invalid value {predict_on} in fit. '
+                                     f'Specify an integer >1 for kFCV estimation.')
+                else:
+                    n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
+                    predictions = cross_val_predict(
+                        self.classifier, *data.Xy, cv=predict_on, n_jobs=n_jobs, method=self._classifier_method())
+                    predictions = LabelledCollection(predictions, data.y, classes=data.classes_)
+                    self.classifier.fit(*data.Xy)
+            else:
+                raise ValueError(f'wrong type for predict_on: since fit_classifier=False, '
+                                 f'the set on which predictions have to be issued must be '
+                                 f'explicitly indicated')
+
+        else:
+            raise ValueError(
+                f'error: param "predict_on" ({type(predict_on)}) not understood; '
+                f'use either a float indicating the split proportion, an integer >1 indicating '
+                f'the number of folds for kFCV, or a LabelledCollection indicating the validation partition')
+
+        return predictions
+ +
[docs] @abstractmethod + def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Trains the aggregation function. + + :param classif_predictions: a LabelledCollection containing the label predictions issued + by the classifier + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + """ + ...
+ + @property + def classifier(self): + """ + Gives access to the classifier + + :return: the classifier (typically an sklearn's Estimator) + """ + return self.classifier_ + + @classifier.setter + def classifier(self, classifier): + """ + Setter for the classifier + + :param classifier: the classifier + """ + self.classifier_ = classifier + +
[docs] def classify(self, instances):
+        """
+        Provides the label predictions for the given instances. The predictions should respect the format expected by
+        :meth:`aggregate`, e.g., posterior probabilities for probabilistic quantifiers, or crisp predictions for
+        non-probabilistic quantifiers. The predictions are generated by the classifier method indicated by
+        :meth:`_classifier_method` ("decision_function" by default).
+
+        :param instances: array-like of shape `(n_instances, n_features,)`
+        :return: np.ndarray of shape `(n_instances,)` with label predictions
+        """
+        return getattr(self.classifier, self._classifier_method())(instances)
+
+    def _classifier_method(self):
+        """
+        Name of the classifier method to be used for issuing label predictions. The default one is "decision_function".
+
+        :return: string
+        """
+        return 'decision_function'
+
+    def _check_classifier(self, adapt_if_necessary=False):
+        """
+        Guarantees that the underlying classifier implements the method required for issuing predictions, i.e.,
+        the method indicated by :meth:`_classifier_method`
+
+        :param adapt_if_necessary: if True, the method will try to comply with the required specifications
+        """
+        assert hasattr(self.classifier, self._classifier_method()), \
+            f"the classifier does not implement the required {self._classifier_method()} method"
+
[docs] def quantify(self, instances): + """ + Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated + by the classifier. + + :param instances: array-like + :return: `np.ndarray` of shape `(n_classes)` with class prevalence estimates. + """ + classif_predictions = self.classify(instances) + return self.aggregate(classif_predictions)
+ +
[docs] @abstractmethod + def aggregate(self, classif_predictions: np.ndarray): + """ + Implements the aggregation of label predictions. + + :param classif_predictions: `np.ndarray` of label predictions + :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. + """ + ...
+ + @property + def classes_(self): + """ + Class labels, in the same order in which class prevalence values are to be computed. + This default implementation actually returns the class labels of the learner. + + :return: array-like + """ + return self.classifier.classes_
+ + +
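+
+# --- usage sketch (editorial addition, not part of the original module) -----------------
+# The two-phase design above implies that quantify(X) decomposes into aggregate(classify(X)).
+# Hedged example with hypothetical `train` and `X_test`; this decomposition is what lets
+# model selection reuse classifier predictions across hyperparameter configurations.
+def _example_two_phase_api(train, X_test):
+    from sklearn.linear_model import LogisticRegression
+    quantifier = PCC(LogisticRegression()).fit(train)
+    prevs_one_shot = quantifier.quantify(X_test)
+    prevs_two_step = quantifier.aggregate(quantifier.classify(X_test))  # equivalent
+    return prevs_one_shot, prevs_two_step
+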
[docs]class AggregativeCrispQuantifier(AggregativeQuantifier, ABC):
+    """
+    Abstract class for quantification methods that base their estimations on the aggregation of crisp decisions
+    as returned by a hard classifier. Aggregative crisp quantifiers thus extend Aggregative
+    Quantifiers by implementing specifications about crisp predictions.
+    """
+
+    def _classifier_method(self):
+        """
+        Name of the method that must be used for issuing label predictions. For crisp quantifiers, the method
+        is 'predict', which returns an array of shape `(n_instances,)` of label predictions.
+
+        :return: the string "predict", i.e., the standard method name for scikit-learn hard predictions
+        """
+        return 'predict'
+ + +
[docs]class AggregativeSoftQuantifier(AggregativeQuantifier, ABC):
+    """
+    Abstract class for quantification methods that base their estimations on the aggregation of posterior
+    probabilities as returned by a probabilistic classifier.
+    Aggregative soft quantifiers thus extend Aggregative Quantifiers by implementing specifications
+    about soft predictions.
+    """
+
+    def _classifier_method(self):
+        """
+        Name of the method that must be used for issuing label predictions. For probabilistic quantifiers, the method
+        is 'predict_proba', which returns an array of shape `(n_instances, n_classes,)` with posterior
+        probabilities.
+
+        :return: the string "predict_proba", i.e., the standard method name for scikit-learn soft predictions
+        """
+        return 'predict_proba'
+
+    def _check_classifier(self, adapt_if_necessary=False):
+        """
+        Guarantees that the underlying classifier implements the method indicated by :meth:`_classifier_method`.
+        In case it does not, the classifier is calibrated (by means of the Platt's calibration method implemented by
+        scikit-learn in CalibratedClassifierCV, with cv=5). This calibration is only allowed if `adapt_if_necessary`
+        is set to True. If otherwise (i.e., the classifier is not probabilistic, and `adapt_if_necessary` is set
+        to False), an exception will be raised.
+
+        :param adapt_if_necessary: a hard classifier is turned into a soft classifier if `adapt_if_necessary==True`
+        """
+        if not hasattr(self.classifier, self._classifier_method()):
+            if adapt_if_necessary:
+                print(f'warning: The learner {self.classifier.__class__.__name__} does not seem to be '
+                      f'probabilistic. The learner will be calibrated (using CalibratedClassifierCV).')
+                self.classifier = CalibratedClassifierCV(self.classifier, cv=5)
+            else:
+                raise AssertionError(f'error: The learner {self.classifier.__class__.__name__} does not '
+                                     f'seem to be probabilistic. The learner cannot be calibrated since '
+                                     f'fit_classifier is set to False')
+ + +
[docs]class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier): + + @property + def pos_label(self): + return self.classifier.classes_[1] + + @property + def neg_label(self): + return self.classifier.classes_[0] + +
[docs] def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): + self._check_binary(data, self.__class__.__name__) + return super().fit(data, fit_classifier, val_split)
+ + +# Methods +# ------------------------------------ +
[docs]class CC(AggregativeCrispQuantifier):
+    """
+    The most basic quantification method: it classifies all instances and counts how many have been
+    attributed to each of the classes in order to compute class prevalence estimates.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    """
+
+    def __init__(self, classifier: BaseEstimator):
+        self.classifier = classifier
+
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Nothing to do here! + + :param classif_predictions: this is actually None + """ + pass
+ +
[docs] def aggregate(self, classif_predictions: np.ndarray): + """ + Computes class prevalence estimates by counting the prevalence of each of the predicted labels. + + :param classif_predictions: array-like with label predictions + :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. + """ + return F.prevalence_from_labels(classif_predictions, self.classes_)
+ + +
[docs]class ACC(AggregativeCrispQuantifier):
+    """
+    `Adjusted Classify & Count <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_,
+    the "adjusted" variant of :class:`CC`, which corrects the predictions of CC
+    according to the `misclassification rates`.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
+        be extracted from the training set; or as an integer (default 5), indicating that the predictions
+        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
+        for `k`); or as a collection defining the specific set of data to use for validation.
+        Alternatively, this set can be specified at fit time by indicating the exact set of data
+        on which the predictions are to be generated.
+    :param n_jobs: number of parallel workers
+    :param solver: indicates the method to be used for obtaining the final estimates. The choice
+        'exact' comes down to solving the system of linear equations :math:`Ax=B` where `A` is a
+        matrix containing the class-conditional probabilities of the predictions (e.g., the tpr and fpr in
+        binary) and `B` is the vector of prevalence values estimated via CC, as :math:`x=A^{-1}B`. This solution
+        might not exist for degenerate classifiers, in which case the method defaults to classify and count
+        (i.e., does not attempt any adjustment).
+        Another option is to search for the prevalence vector that minimizes the L2 norm of :math:`|Ax-B|`. The latter
+        is achieved by indicating solver='minimize'. This one generally works better, and is the default parameter.
+        More details about this can be consulted in `Bunse, M. "On Multi-Class Extensions of Adjusted Classify and
+        Count", on proceedings of the 2nd International Workshop on Learning to Quantify: Methods and Applications
+        (LQ 2022), ECML/PKDD 2022, Grenoble (France) <https://lq-2022.github.io/proceedings/CompleteVolume.pdf>`_.
+    """
+
+    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None, solver='minimize'):
+        self.classifier = classifier
+        self.val_split = val_split
+        self.n_jobs = qp._get_njobs(n_jobs)
+        self.solver = solver
+
+    def _check_init_parameters(self):
+        assert self.solver in ['exact', 'minimize'], "unknown solver; valid ones are 'exact', 'minimize'"
+
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Estimates the misclassification rates. + + :param classif_predictions: classifier predictions with true labels + """ + pred_labels, true_labels = classif_predictions.Xy + self.cc = CC(self.classifier) + self.Pte_cond_estim_ = self.getPteCondEstim(self.classifier.classes_, true_labels, pred_labels)
+ +
[docs] @classmethod + def getPteCondEstim(cls, classes, y, y_): + # estimate the matrix with entry (i,j) being the estimate of P(hat_yi|yj), that is, the probability that a + # document that belongs to yj ends up being classified as belonging to yi + conf = confusion_matrix(y, y_, labels=classes).T + conf = conf.astype(float) + class_counts = conf.sum(axis=0) + for i, _ in enumerate(classes): + if class_counts[i] == 0: + conf[i, i] = 1 + else: + conf[:, i] /= class_counts[i] + return conf
+ +
[docs] def aggregate(self, classif_predictions): + prevs_estim = self.cc.aggregate(classif_predictions) + return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim, solver=self.solver)
+ +
[docs] @classmethod
+    def solve_adjustment(cls, PteCondEstim, prevs_estim, solver='exact'):
+        """
+        Solves the linear system :math:`Ax = B` with :math:`A` = `PteCondEstim` and :math:`B` = `prevs_estim`
+
+        :param PteCondEstim: a `np.ndarray` of shape `(n_classes,n_classes,)` with entry `(i,j)` being the estimate
+            of :math:`P(y_i|y_j)`, that is, the probability that an instance that belongs to :math:`y_j` ends up being
+            classified as belonging to :math:`y_i`
+        :param prevs_estim: a `np.ndarray` of shape `(n_classes,)` with the class prevalence estimates
+        :param solver: indicates the method to use for solving the system of linear equations. Valid options are
+            'exact' (tries to solve the system exactly --may fail if the misclassification matrix has rank < n_classes) or
+            'minimize' (minimizes the norm of the residual --a solution always exists)
+        :return: an adjusted `np.ndarray` of shape `(n_classes,)` with the corrected class prevalence estimates
+        """
+
+        A = PteCondEstim
+        B = prevs_estim
+
+        if solver == 'exact':
+            # attempts an exact solution of the linear system (may fail)
+
+            try:
+                adjusted_prevs = np.linalg.solve(A, B)
+                adjusted_prevs = np.clip(adjusted_prevs, 0, 1)
+                adjusted_prevs /= adjusted_prevs.sum()
+            except np.linalg.LinAlgError:
+                adjusted_prevs = prevs_estim  # no way to adjust them!
+
+            return adjusted_prevs
+
+        elif solver == 'minimize':
+            # poses the problem as an optimization one, and tries to minimize the norm of the differences
+
+            def loss(prev):
+                return np.linalg.norm(A @ prev - B)
+
+            return F.optim_minimize(loss, n_classes=A.shape[0])
+ + +
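+
+# --- worked example (editorial addition, not part of the original module) ----------------
+# A hedged numeric sketch of solve_adjustment in the binary case: with tpr=0.8 and fpr=0.2,
+# the misclassification matrix and a CC estimate of [0.68, 0.32] recover the true
+# prevalence [0.8, 0.2] for (negative, positive), since 0.8*0.8 + 0.2*0.2 = 0.68.
+def _example_acc_adjustment():
+    PteCondEstim = np.array([[0.8, 0.2],
+                             [0.2, 0.8]])  # column j holds P(predicted=i | true=j)
+    cc_estimate = np.array([0.68, 0.32])   # prevalence of the predicted labels
+    return ACC.solve_adjustment(PteCondEstim, cc_estimate, solver='exact')  # -> [0.8, 0.2]
+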
[docs]class PCC(AggregativeSoftQuantifier): + """ + `Probabilistic Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_, + the probabilistic variant of CC that relies on the posterior probabilities returned by a probabilistic classifier. + + :param classifier: a sklearn's Estimator that generates a classifier + """ + + def __init__(self, classifier: BaseEstimator): + self.classifier = classifier + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Nothing to do here! + + :param classif_predictions: this is actually None + """ + pass
+ +
[docs] def aggregate(self, classif_posteriors): + return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
+ + +
[docs]class PACC(AggregativeSoftQuantifier):
+    """
+    `Probabilistic Adjusted Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_,
+    the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
+        be extracted from the training set; or as an integer (default 5), indicating that the predictions
+        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
+        for `k`). Alternatively, this set can be specified at fit time by indicating the exact set of data
+        on which the predictions are to be generated.
+    :param n_jobs: number of parallel workers
+    :param solver: indicates the method to be used for obtaining the final estimates. The choice
+        'exact' comes down to solving the system of linear equations :math:`Ax=B` where `A` is a
+        matrix containing the class-conditional probabilities of the predictions (e.g., the tpr and fpr in
+        binary) and `B` is the vector of prevalence values estimated via CC, as :math:`x=A^{-1}B`. This solution
+        might not exist for degenerate classifiers, in which case the method defaults to classify and count
+        (i.e., does not attempt any adjustment).
+        Another option is to search for the prevalence vector that minimizes the L2 norm of :math:`|Ax-B|`. The latter
+        is achieved by indicating solver='minimize'. This one generally works better, and is the default parameter.
+        More details about this can be consulted in `Bunse, M. "On Multi-Class Extensions of Adjusted Classify and
+        Count", on proceedings of the 2nd International Workshop on Learning to Quantify: Methods and Applications
+        (LQ 2022), ECML/PKDD 2022, Grenoble (France) <https://lq-2022.github.io/proceedings/CompleteVolume.pdf>`_.
+
+    """
+
+    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None, solver='minimize'):
+        self.classifier = classifier
+        self.val_split = val_split
+        self.n_jobs = qp._get_njobs(n_jobs)
+        self.solver = solver
+
+    def _check_init_parameters(self):
+        assert self.solver in ['exact', 'minimize'], "unknown solver; valid ones are 'exact', 'minimize'"
+
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Estimates the misclassification rates + + :param classif_predictions: classifier soft predictions with true labels + """ + posteriors, true_labels = classif_predictions.Xy + self.pcc = PCC(self.classifier) + self.Pte_cond_estim_ = self.getPteCondEstim(self.classifier.classes_, true_labels, posteriors)
+ +
[docs] def aggregate(self, classif_posteriors): + prevs_estim = self.pcc.aggregate(classif_posteriors) + return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim, solver=self.solver)
+ +
[docs] @classmethod + def getPteCondEstim(cls, classes, y, y_): + # estimate the matrix with entry (i,j) being the estimate of P(hat_yi|yj), that is, the probability that a + # document that belongs to yj ends up being classified as belonging to yi + n_classes = len(classes) + confusion = np.eye(n_classes) + for i, class_ in enumerate(classes): + idx = y == class_ + if idx.any(): + confusion[i] = y_[idx].mean(axis=0) + + return confusion.T
+ + +
[docs]class EMQ(AggregativeSoftQuantifier):
+    """
+    `Expectation Maximization for Quantification <https://ieeexplore.ieee.org/abstract/document/6789744>`_ (EMQ),
+    aka `Saerens-Latinne-Decaestecker` (SLD) algorithm.
+    EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior
+    probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via
+    maximum-likelihood estimation, in a mutually recursive way, until convergence.
+
+    This implementation also gives access to the heuristics proposed by `Alexandari et al. paper
+    <http://proceedings.mlr.press/v119/alexandari20a.html>`_. These heuristics consist of using, as the training
+    prevalence, an estimate of it obtained via k-fold cross validation (instead of the true training prevalence),
+    and of recalibrating the posterior probabilities of the classifier.
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
+        be extracted from the training set; or as an integer, indicating that the predictions
+        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
+        for `k`, default 5); or as a collection defining the specific set of data to use for validation.
+        Alternatively, this set can be specified at fit time by indicating the exact set of data
+        on which the predictions are to be generated. This hyperparameter is only meant to be used when the
+        heuristics are to be applied, i.e., if a recalibration is required. The default value is None (meaning
+        the recalibration is not required). In case this hyperparameter is set to a value other than None, but
+        the recalibration is not required (recalib=None), a warning message will be raised.
+    :param exact_train_prev: set to True (default) for using the true training prevalence as the initial observation;
+        set to False for computing the training prevalence as an estimate of it, i.e., as the expected
+        value of the posterior probabilities of the training instances.
+    :param recalib: a string indicating the method of recalibration.
+        Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling),
+        "ts" (Temperature Scaling), and "vs" (Vector Scaling). Default is None (no recalibration).
+    :param n_jobs: number of parallel workers. Only used for recalibrating the classifier if `val_split` is set to
+        an integer `k` --the number of folds.
+    """
+
+    MAX_ITER = 1000
+    EPSILON = 1e-4
+
+    def __init__(self, classifier: BaseEstimator, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
+        self.classifier = classifier
+        self.val_split = val_split
+        self.exact_train_prev = exact_train_prev
+        self.recalib = recalib
+        self.n_jobs = n_jobs
[docs] @classmethod + def EMQ_BCTS(cls, classifier: BaseEstimator, n_jobs=None): + """ + Constructs an instance of EMQ using the best configuration found in the `Alexandari et al. paper + <http://proceedings.mlr.press/v119/alexandari20a.html>`_, i.e., one that relies on Bias-Corrected Temperature + Scaling (BCTS) as a recalibration function, and that uses an estimate of the training prevalence instead of + the true training prevalence. + + :param classifier: a sklearn's Estimator that generates a classifier + :param n_jobs: number of parallel workers. + :return: An instance of EMQ with BCTS + """ + return EMQ(classifier, val_split=5, exact_train_prev=False, recalib='bcts', n_jobs=n_jobs)
+
+    def _check_init_parameters(self):
+        if self.val_split is not None:
+            if self.exact_train_prev and self.recalib is None:
+                raise RuntimeWarning(f'The parameter {self.val_split=} was specified for EMQ, while '
+                                     f'{self.exact_train_prev=} and {self.recalib=}; in this case val_split has '
+                                     f'no effect and only adds unnecessary overhead.')
+        else:
+            if self.recalib is not None:
+                print(f'[warning] The parameter {self.recalib=} requires val_split to be different from None. '
+                      f'This parameter will be set to 5. To avoid this warning, set this value to a float value '
+                      f'indicating the proportion of training data to be used as validation, or to an integer '
+                      f'indicating the number of folds for kFCV.')
+                self.val_split=5
+
[docs] def classify(self, instances): + """ + Provides the posterior probabilities for the given instances. If the classifier was required + to be recalibrated, then these posteriors are recalibrated accordingly. + + :param instances: array-like of shape `(n_instances, n_dimensions,)` + :return: np.ndarray of shape `(n_instances, n_classes,)` with posterior probabilities + """ + posteriors = self.classifier.predict_proba(instances) + if hasattr(self, 'calibration_function') and self.calibration_function is not None: + posteriors = self.calibration_function(posteriors) + return posteriors
+ +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + if self.recalib is not None: + P, y = classif_predictions.Xy + if self.recalib == 'nbvs': + calibrator = NoBiasVectorScaling() + elif self.recalib == 'bcts': + calibrator = TempScaling(bias_positions='all') + elif self.recalib == 'ts': + calibrator = TempScaling() + elif self.recalib == 'vs': + calibrator = VectorScaling() + else: + raise ValueError('invalid param argument for recalibration method; available ones are ' + '"nbvs", "bcts", "ts", and "vs".') + + self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True) + + if self.exact_train_prev: + self.train_prevalence = data.prevalence() + else: + train_posteriors = classif_predictions.X + if self.recalib is not None: + train_posteriors = self.calibration_function(train_posteriors) + self.train_prevalence = F.prevalence_from_probabilities(train_posteriors)
+ +
[docs] def aggregate(self, classif_posteriors, epsilon=EPSILON): + priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) + return priors
+ +
[docs] def predict_proba(self, instances, epsilon=EPSILON): + """ + Returns the posterior probabilities updated by the EM algorithm. + + :param instances: np.ndarray of shape `(n_instances, n_dimensions)` + :param epsilon: error tolerance + :return: np.ndarray of shape `(n_instances, n_classes)` + """ + classif_posteriors = self.classify(instances) + priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) + return posteriors
+ +
[docs] @classmethod
+    def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON):
+        """
+        Computes the `Expectation Maximization` routine.
+
+        :param tr_prev: array-like, the training prevalence
+        :param posterior_probabilities: `np.ndarray` of shape `(n_instances, n_classes,)` with the
+            posterior probabilities
+        :param epsilon: float, the maximum difference between the prevalence estimates of two consecutive
+            iterations below which the loop is stopped
+        :return: a tuple with the estimated prevalence values (shape `(n_classes,)`) and
+            the corrected posterior probabilities (shape `(n_instances, n_classes,)`)
+        """
+        Px = posterior_probabilities
+        Ptr = np.copy(tr_prev)
+        qs = np.copy(Ptr)  # qs (the running estimate) is initialized as the training prevalence
+
+        s, converged = 0, False
+        qs_prev_ = None
+        while not converged and s < EMQ.MAX_ITER:
+            # E-step: ps is Ps(y|xi)
+            ps_unnormalized = (qs / Ptr) * Px
+            ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True)
+
+            # M-step:
+            qs = ps.mean(axis=0)
+
+            if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s > 10:
+                converged = True
+
+            qs_prev_ = qs
+            s += 1
+
+        if not converged:
+            print('[warning] the method has reached the maximum number of iterations; it might have not converged')
+
+        return qs, ps
+ + +
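+
+# --- worked example (editorial addition, not part of the original module) -----------------
+# A hedged sketch of the EM routine under prior shift, using synthetic posteriors: training
+# prior is uniform, but 70% of the test items belong to class 0. The plain posterior mean
+# (what PCC would report) is about 0.6, whereas EM moves the estimate towards the true 0.7.
+def _example_emq_em():
+    rng = np.random.default_rng(0)
+    labels = rng.choice(2, size=2000, p=[0.7, 0.3])
+    posteriors = np.full((2000, 2), 0.25)
+    posteriors[np.arange(2000), labels] = 0.75  # a mildly informative classifier
+    qs, ps = EMQ.EM(np.array([0.5, 0.5]), posteriors)
+    return qs  # approaches [0.7, 0.3]
+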
[docs]class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
+    """
+    `Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
+    HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of
+    minimizing the divergence (in terms of the Hellinger Distance) between two distributions of posterior
+    probabilities returned by the classifier. One of the distributions is generated from the unlabelled examples and
+    the other is generated from a validation set. This latter distribution is defined as a mixture of the
+    class-conditional distributions of the posterior probabilities returned for the positive and negative validation
+    examples, respectively. The parameters of the mixture thus represent the estimates of the class prevalence values.
+
+    :param classifier: a sklearn's Estimator that generates a binary classifier
+    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
+        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
+    """
+
+    def __init__(self, classifier: BaseEstimator, val_split=5):
+        self.classifier = classifier
+        self.val_split = val_split
+
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        """
+        Trains the aggregation function of HDy.
+
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, as instances,
+            the posterior probabilities issued by the classifier and, as labels, the true labels
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :return: self
+        """
+        P, y = classif_predictions.Xy
+        Px = P[:, self.pos_label]  # takes only the P(y=+1|x)
+        self.Pxy1 = Px[y == self.pos_label]
+        self.Pxy0 = Px[y == self.neg_label]
+
+        # pre-compute the histogram for positive and negative examples
+        self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
+
+        def hist(P, bins):
+            h = np.histogram(P, bins=bins, range=(0, 1), density=True)[0]
+            return h / h.sum()
+
+        self.Pxy1_density = {bins: hist(self.Pxy1, bins) for bins in self.bins}
+        self.Pxy0_density = {bins: hist(self.Pxy0, bins) for bins in self.bins}
+
+        return self
+ +
[docs] def aggregate(self, classif_posteriors): + # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10, + # and the final estimated a priori probability was taken as the median of these 11 estimates." + # (González-Castro, et al., 2013). + + Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) + + prev_estimations = [] + # for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] + # Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True) + # Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True) + for bins in self.bins: + Pxy0_density = self.Pxy0_density[bins] + Pxy1_density = self.Pxy1_density[bins] + + Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True) + + # the authors proposed to search for the prevalence yielding the best matching as a linear search + # at small steps (modern implementations resort to an optimization procedure, + # see class DistributionMatching) + prev_selected, min_dist = None, None + for prev in F.prevalence_linspace(n_prevalences=101, repeats=1, smooth_limits_epsilon=0.0): + Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density + hdy = F.HellingerDistance(Px_train, Px_test) + if prev_selected is None or hdy < min_dist: + prev_selected, min_dist = prev, hdy + prev_estimations.append(prev_selected) + + class1_prev = np.median(prev_estimations) + return F.as_binary_prevalence(class1_prev)
+ + +
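+
+# --- usage sketch (editorial addition, not part of the original module) -------------------
+# Hedged example for HDy; being a binary method, it requires a binary dataset. `train` and
+# `X_test` are hypothetical placeholders, and the classifier must be probabilistic.
+def _example_hdy(train, X_test):
+    from sklearn.linear_model import LogisticRegression
+    hdy = HDy(LogisticRegression(), val_split=0.3)  # 30% held out for the mixture densities
+    hdy.fit(train)
+    return hdy.quantify(X_test)
+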
[docs]class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
+    """
+    `DyS framework <https://ojs.aaai.org/index.php/AAAI/article/view/4376>`_ (DyS).
+    DyS is a generalization of HDy method, using a Ternary Search in order to find the prevalence that
+    minimizes the distance between distributions.
+    Details for the ternary search were taken from <https://dl.acm.org/doi/pdf/10.1145/3219819.3220059>
+
+    :param classifier: a sklearn's Estimator that generates a binary classifier
+    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
+        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
+    :param n_bins: an int with the number of bins to use to compute the histograms.
+    :param divergence: a str indicating the name of divergence (currently supported ones are "HD" or "topsoe"), or a
+        callable function that computes the divergence between two distributions (two equally sized arrays).
+    :param tol: a float with the tolerance for the ternary search algorithm.
+    :param n_jobs: number of parallel workers.
+    """
+
+    def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
+        self.classifier = classifier
+        self.val_split = val_split
+        self.tol = tol
+        self.divergence = divergence
+        self.n_bins = n_bins
+        self.n_jobs = n_jobs
+
+    def _ternary_search(self, f, left, right, tol):
+        """
+        Find the minimum of the unimodal function f() within [left, right]
+        """
+        while abs(right - left) >= tol:
+            left_third = left + (right - left) / 3
+            right_third = right - (right - left) / 3
+
+            if f(left_third) > f(right_third):
+                left = left_third
+            else:
+                right = right_third
+
+        # left and right are the current bounds; the minimum is between them
+        return (left + right) / 2
+
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + Px, y = classif_predictions.Xy + Px = Px[:, self.pos_label] # takes only the P(y=+1|x) + self.Pxy1 = Px[y == self.pos_label] + self.Pxy0 = Px[y == self.neg_label] + self.Pxy1_density = np.histogram(self.Pxy1, bins=self.n_bins, range=(0, 1), density=True)[0] + self.Pxy0_density = np.histogram(self.Pxy0, bins=self.n_bins, range=(0, 1), density=True)[0] + return self
+ +
[docs] def aggregate(self, classif_posteriors): + Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) + + Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0] + divergence = get_divergence(self.divergence) + + def distribution_distance(prev): + Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density + return divergence(Px_train, Px_test) + + class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol) + return F.as_binary_prevalence(class1_prev)
+ + +
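+
+# --- worked example (editorial addition, not part of the original module) ------------------
+# The ternary search above assumes the divergence is unimodal in the mixture parameter; a
+# quick hedged check on a convex toy function shows the helper recovering the minimizer.
+def _example_ternary_search():
+    dys = DyS(None)  # the classifier is irrelevant for this check
+    return dys._ternary_search(f=lambda p: (p - 0.3) ** 2, left=0, right=1, tol=1e-5)  # ~0.3
+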
[docs]class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
+    """
+    `SMM method <https://ieeexplore.ieee.org/document/9260028>`_ (SMM).
+    SMM is a simplification of matching distribution methods where the representation of the examples
+    is created using the mean instead of a histogram (conceptually equivalent to PACC).
+
+    :param classifier: a sklearn's Estimator that generates a binary classifier.
+    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
+        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
+    """
+
+    def __init__(self, classifier: BaseEstimator, val_split=5):
+        self.classifier = classifier
+        self.val_split = val_split
+
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + Px, y = classif_predictions.Xy + Px = Px[:, self.pos_label] # takes only the P(y=+1|x) + self.Pxy1 = Px[y == self.pos_label] + self.Pxy0 = Px[y == self.neg_label] + self.Pxy1_mean = np.mean(self.Pxy1) # equiv. TPR + self.Pxy0_mean = np.mean(self.Pxy0) # equiv. FPR + return self
+ +
[docs] def aggregate(self, classif_posteriors): + Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) + Px_mean = np.mean(Px) + + class1_prev = (Px_mean - self.Pxy0_mean)/(self.Pxy1_mean - self.Pxy0_mean) + return F.as_binary_prevalence(class1_prev, clip_if_necessary=True)
+ + +
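The aggregation above is a closed-form adjustment: with the validation means playing the roles of TPR and FPR, the positive prevalence solves mean = prev*tpr + (1-prev)*fpr. A numeric sketch (all values invented for illustration):

>>> Pxy1_mean, Pxy0_mean = 0.8, 0.2   # hypothetical validation means (TPR-like, FPR-like)
>>> Px_mean = 0.65                    # hypothetical mean positive posterior on the test sample
>>> (Px_mean - Pxy0_mean) / (Pxy1_mean - Pxy0_mean)   # estimated positive prevalence, ~0.75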
[docs]class DMy(AggregativeSoftQuantifier): + """ + Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior + probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF + as hyperparameters. + + :param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the + validation distribution. + This parameter can be indicated as a real value (between 0 and 1), representing a proportion of + validation data, or as an integer, indicating that the validation distribution should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`, default 5), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + :param nbins: number of bins used to discretize the distributions (default 8) + :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented) + or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger + Distance) + :param cdf: whether to use CDF instead of PDF (default False) + :param search: the strategy for solving the optimization problem that searches for the mixture parameter; this + value is passed as `method` to :meth:`quapy.functional.argmin_prevalence` (default 'optim_minimize') + :param n_jobs: number of parallel workers (default None) + """ + + def __init__(self, classifier, val_split=5, nbins=8, divergence: Union[str, Callable]='HD', + cdf=False, search='optim_minimize', n_jobs=None): + self.classifier = classifier + self.val_split = val_split + self.nbins = nbins + self.divergence = divergence + self.cdf = cdf + self.search = search + self.n_jobs = n_jobs + + # @classmethod + # def HDy(cls, classifier, val_split=5, n_jobs=None): + # from quapy.method.meta import MedianEstimator + # + # hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD') + # hdy = AggregativeMedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs) + # return hdy + + def _get_distributions(self, posteriors): + histograms = [] + post_dims = posteriors.shape[1] + if post_dims == 2: + # in binary quantification we can use only one class, since the other one is its complement + post_dims = 1 + for dim in range(post_dims): + hist = np.histogram(posteriors[:, dim], bins=self.nbins, range=(0, 1))[0] + histograms.append(hist) + + counts = np.vstack(histograms) + distributions = counts/counts.sum(axis=1)[:,np.newaxis] + if self.cdf: + distributions = np.cumsum(distributions, axis=1) + return distributions + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Generates the validation distributions out of the training data. + The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of + channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; then `di=V[i]` + are the distributions obtained from training data labelled with class `i`; while `dij = di[j]` is the discrete + distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]` + is the fraction of instances with a value in the `k`-th bin. + + :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, as instances, the + posterior probabilities issued by the classifier for the validation instances and, as labels, the true labels + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + """ + posteriors, true_labels = classif_predictions.Xy + n_classes = len(self.classifier.classes_) + + self.validation_distribution = qp.util.parallel( + func=self._get_distributions, + args=[posteriors[true_labels==cat] for cat in range(n_classes)], + n_jobs=self.n_jobs, + backend='threading' + )
+ +
[docs] def aggregate(self, posteriors: np.ndarray): + """ + Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution + (the mixture) that best matches the test distribution, in terms of the divergence measure of choice. + In the multiclass case, with `n` the number of classes, the test and mixture distributions contain + `n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed + independently. The matching is computed as an average of the divergence across all channels. + + :param posteriors: posterior probabilities of the instances in the sample + :return: a vector of class prevalence estimates + """ + test_distribution = self._get_distributions(posteriors) + divergence = get_divergence(self.divergence) + n_classes, n_channels, nbins = self.validation_distribution.shape + def loss(prev): + prev = np.expand_dims(prev, axis=0) + mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1) + divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)] + return np.mean(divs) + + return F.argmin_prevalence(loss, n_classes, method=self.search)
+ + + +
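A usage sketch for the multiclass case (hedged: the tweet-sentiment fetcher follows QuaPy's dataset module conventions):

>>> import quapy as qp
>>> from sklearn.linear_model import LogisticRegression
>>> train, test = qp.datasets.fetch_twitter('hcr', pickle=True).train_test
>>> dm = DMy(LogisticRegression(), val_split=5, nbins=8, divergence='HD', cdf=False)
>>> dm.fit(train)
>>> estim_prev = dm.quantify(test.instances)  # one prevalence estimate per class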
[docs]def newELM(svmperf_base=None, loss='01', C=1): + """ + Explicit Loss Minimization (ELM) quantifiers. + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is equivalent to: + + >>> CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param loss: the loss to optimize (see :attr:`quapy.classification.svmperf.SVMperf.valid_losses`) + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + if svmperf_base is None: + svmperf_base = qp.environ['SVMPERF_HOME'] + assert svmperf_base is not None, \ + 'param svmperf_base was not specified, and the variable SVMPERF_HOME has not been set in the environment' + return CC(SVMperf(svmperf_base, loss=loss, C=C))
+ + +
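A sketch of the intended workflow (hedged: the install path is hypothetical; prepare_svmperf.sh documents the actual setup):

>>> import quapy as qp
>>> from quapy.method.aggregative import newELM
>>> qp.environ['SVMPERF_HOME'] = './svm_perf_quantification'  # hypothetical install dir
>>> train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5).train_test
>>> elm = newELM(loss='q', C=1)
>>> elm.fit(train)                    # train: a binary LabelledCollection
>>> estim_prev = elm.quantify(test.instances)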
[docs]def newSVMQ(svmperf_base=None, C=1): + """ + SVM(Q) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the `Q` loss combining a + classification-oriented loss and a quantification-oriented loss, as proposed by + `Barranquero et al. 2015 <https://www.sciencedirect.com/science/article/pii/S003132031400291X>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='q', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='q', C=C)
+ +def newSVMKLD(svmperf_base=None, C=1): + """ + SVM(KLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Kullback-Leibler Divergence + as proposed by `Esuli et al. 2015 <https://dl.acm.org/doi/abs/10.1145/2700406>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='kld', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='kld', C=C) + + +
[docs]def newSVMNKLD(svmperf_base=None, C=1): + """ + SVM(NKLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Kullback-Leibler Divergence + normalized via the logistic function, as proposed by + `Esuli et al. 2015 <https://dl.acm.org/doi/abs/10.1145/2700406>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='nkld', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='nkld', C=C)
+ +
[docs]def newSVMAE(svmperf_base=None, C=1): + """ + SVM(AE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Absolute Error as first used by + `Moreo and Sebastiani, 2021 <https://arxiv.org/abs/2011.02552>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='mae', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='mae', C=C)
+ +
[docs]def newSVMRAE(svmperf_base=None, C=1): + """ + SVM(RAE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Relative Absolute Error as first + used by `Moreo and Sebastiani, 2021 <https://arxiv.org/abs/2011.02552>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='mrae', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='mrae', C=C)
+ + +
[docs]class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier): + """ + Allows any binary quantifier to perform quantification on single-label datasets. + The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the + class prevalences sum up to 1. + This variant was used, along with the :class:`EMQ` quantifier, in + `Gao and Sebastiani, 2016 <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_. + + :param binary_quantifier: a quantifier (binary) that will be employed to work on multiclass models in a + one-vs-all manner + :param n_jobs: number of parallel workers + :param parallel_backend: the parallel backend for joblib (default "multiprocessing"); switching the backend + (e.g., to "threading") is helpful for some quantifiers (e.g., ELM-based ones) that cannot be run with + multiprocessing, since the temp dir they create during fit is removed and is no longer available at + predict time. + """ + + def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='multiprocessing'): + assert isinstance(binary_quantifier, BaseQuantifier), \ + f'{binary_quantifier} does not seem to be a Quantifier' + assert isinstance(binary_quantifier, AggregativeQuantifier), \ + f'{binary_quantifier} does not seem to be of type Aggregative' + self.binary_quantifier = binary_quantifier + self.n_jobs = qp._get_njobs(n_jobs) + self.parallel_backend = parallel_backend + +
[docs] def classify(self, instances): + """ + If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of + instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance + `i` belongs to class `j`. The binary classifications are independent of each other, meaning that an instance + can end up being attributed to 0, 1, or more classes. + If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances + and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the + posterior probability that instance `i` belongs (resp. does not belong) to class `j`. The posterior + probabilities are independent of each other, meaning that, in general, they do not sum up to one. + + :param instances: array-like + :return: `np.ndarray` + """ + + classif_predictions = self._parallel(self._delayed_binary_classification, instances) + if isinstance(self.binary_quantifier, AggregativeSoftQuantifier): + return np.swapaxes(classif_predictions, 0, 1) + else: + return classif_predictions.T
+ +
[docs] def aggregate(self, classif_predictions): + prevalences = self._parallel(self._delayed_binary_aggregate, classif_predictions) + return F.normalize_prevalence(prevalences)
+ + def _delayed_binary_classification(self, c, X): + return self.dict_binary_quantifiers[c].classify(X) + + def _delayed_binary_aggregate(self, c, classif_predictions): + # the estimation for the positive class prevalence + return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
+ + +
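A sketch of the one-vs-all wrapper around a binary quantifier (hedged: dataset and class names follow QuaPy's conventions; HDy is assumed importable from this module):

>>> import quapy as qp
>>> from sklearn.linear_model import LogisticRegression
>>> from quapy.method.aggregative import HDy, OneVsAllAggregative
>>> train, test = qp.datasets.fetch_twitter('semeval16', pickle=True).train_test  # 3 classes
>>> ova = OneVsAllAggregative(HDy(LogisticRegression()), n_jobs=-1)
>>> ova.fit(train)
>>> estim_prev = ova.quantify(test.instances)  # l1-normalized across the binary estimates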
[docs]class AggregativeMedianEstimator(BinaryQuantifier): + """ + This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the + estimates returned by differently (hyper)parameterized base quantifiers. + The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions, + i.e., in cases of binary quantification. + + :param base_quantifier: the base, binary quantifier + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param param_grid: the grid of parameters over which the median will be computed + :param n_jobs: number of parallel workers + """ + def __init__(self, base_quantifier: AggregativeQuantifier, param_grid: dict, random_state=None, n_jobs=None): + self.base_quantifier = base_quantifier + self.param_grid = param_grid + self.random_state = random_state + self.n_jobs = qp._get_njobs(n_jobs) + +
[docs] def get_params(self, deep=True): + return self.base_quantifier.get_params(deep)
+ +
[docs] def set_params(self, **params): + self.base_quantifier.set_params(**params)
+ + def _delayed_fit(self, args): + with qp.util.temp_seed(self.random_state): + params, training = args + model = deepcopy(self.base_quantifier) + model.set_params(**params) + model.fit(training) + return model + + def _delayed_fit_classifier(self, args): + with qp.util.temp_seed(self.random_state): + cls_params, training, kwargs = args + model = deepcopy(self.base_quantifier) + model.set_params(**cls_params) + predictions = model.classifier_fit_predict(training, **kwargs) + return (model, predictions) + + def _delayed_fit_aggregation(self, args): + with qp.util.temp_seed(self.random_state): + ((model, predictions), q_params), training = args + model = deepcopy(model) + model.set_params(**q_params) + model.aggregation_fit(predictions, training) + return model + + +
[docs] def fit(self, training: LabelledCollection, **kwargs): + import itertools + + self._check_binary(training, self.__class__.__name__) + + if isinstance(self.base_quantifier, AggregativeQuantifier): + cls_configs, q_configs = qp.model_selection.group_params(self.param_grid) + + if len(cls_configs) > 1: + models_preds = qp.util.parallel( + self._delayed_fit_classifier, + ((params, training, kwargs) for params in cls_configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False, + backend='threading' + ) + else: + model = self.base_quantifier + model.set_params(**cls_configs[0]) + predictions = model.classifier_fit_predict(training, **kwargs) + models_preds = [(model, predictions)] + + self.models = qp.util.parallel( + self._delayed_fit_aggregation, + ((setup, training) for setup in itertools.product(models_preds, q_configs)), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + backend='threading' + ) + else: + configs = qp.model_selection.expand_grid(self.param_grid) + self.models = qp.util.parallel( + self._delayed_fit, + ((params, training) for params in configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + backend='threading' + ) + return self
+ + def _delayed_predict(self, args): + model, instances = args + return model.quantify(instances) + +
[docs] def quantify(self, instances): + prev_preds = qp.util.parallel( + self._delayed_predict, + ((model, instances) for model in self.models), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + backend='threading' + ) + return np.median(prev_preds, axis=0)
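The commented-out HDy factory in DMy above sketches the intended use of this class; spelled out (a sketch, reusing that grid, with hypothetical train/test splits as in the earlier examples):

>>> import numpy as np
>>> import quapy as qp
>>> from sklearn.linear_model import LogisticRegression
>>> train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
>>> hdy = DMy(LogisticRegression(), val_split=5, divergence='HD', search='linear_search')
>>> hdy_median = AggregativeMedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=-1)
>>> hdy_median.fit(train)
>>> estim_prev = hdy_median.quantify(test.instances)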
+ + +#--------------------------------------------------------------- +# imports +#--------------------------------------------------------------- + +from . import _threshold_optim + +T50 = _threshold_optim.T50 +MAX = _threshold_optim.MAX +X = _threshold_optim.X +MS = _threshold_optim.MS +MS2 = _threshold_optim.MS2 + + +from . import _kdey + +KDEyML = _kdey.KDEyML +KDEyHD = _kdey.KDEyHD +KDEyCS = _kdey.KDEyCS + +#--------------------------------------------------------------- +# aliases +#--------------------------------------------------------------- + +ClassifyAndCount = CC +AdjustedClassifyAndCount = ACC +ProbabilisticClassifyAndCount = PCC +ProbabilisticAdjustedClassifyAndCount = PACC +ExpectationMaximizationQuantifier = EMQ +DistributionMatchingY = DMy +SLD = EMQ +HellingerDistanceY = HDy +MedianSweep = MS +MedianSweep2 = MS2 +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/base.html b/docs/build/html/_modules/quapy/method/base.html new file mode 100644 index 0000000..6288bd1 --- /dev/null +++ b/docs/build/html/_modules/quapy/method/base.html @@ -0,0 +1,212 @@ + + + + + + quapy.method.base — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.method.base

+from abc import ABCMeta, abstractmethod
+from copy import deepcopy
+
+from joblib import Parallel, delayed
+from sklearn.base import BaseEstimator
+
+import quapy as qp
+from quapy.data import LabelledCollection
+import numpy as np
+
+
+# Base Quantifier abstract class
+# ------------------------------------
+
[docs]class BaseQuantifier(BaseEstimator): + """ + Abstract Quantifier. A quantifier is defined as an object of a class that implements the method :meth:`fit` on + :class:`quapy.data.base.LabelledCollection`, the method :meth:`quantify`, and the :meth:`set_params` and + :meth:`get_params` for model selection (see :meth:`quapy.model_selection.GridSearchQ`) + """ + +
[docs] @abstractmethod + def fit(self, data: LabelledCollection): + """ + Trains a quantifier. + + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :return: self + """ + ...
+ +
[docs] @abstractmethod + def quantify(self, instances): + """ + Generate class prevalence estimates for the sample's instances + + :param instances: array-like + :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. + """ + ...
+ + +
[docs]class BinaryQuantifier(BaseQuantifier): + """ + Abstract class of binary quantifiers, i.e., quantifiers estimating class prevalence values for only two classes + (typically, to be interpreted as one class and its complement). + """ + + def _check_binary(self, data: LabelledCollection, quantifier_name): + assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \ + f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'
+ + +
[docs]class OneVsAll: + pass
+ + +
[docs]def newOneVsAll(binary_quantifier, n_jobs=None): + assert isinstance(binary_quantifier, BaseQuantifier), \ + f'{binary_quantifier} does not seem to be a Quantifier' + if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier): + return qp.method.aggregative.OneVsAllAggregative(binary_quantifier, n_jobs) + else: + return OneVsAllGeneric(binary_quantifier, n_jobs)
+ + +
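The factory simply dispatches on the quantifier type; a sketch (SMM is aggregative, so the aggregative variant is returned):

>>> from sklearn.linear_model import LogisticRegression
>>> from quapy.method.base import newOneVsAll
>>> from quapy.method.aggregative import SMM
>>> ova = newOneVsAll(SMM(LogisticRegression()))
>>> type(ova).__name__   # 'OneVsAllAggregative'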
[docs]class OneVsAllGeneric(OneVsAll, BaseQuantifier): + """ + Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary + quantifier for each class, and then l1-normalizes the outputs so that the class prevalence values sum up to 1. + """ + + def __init__(self, binary_quantifier, n_jobs=None): + assert isinstance(binary_quantifier, BaseQuantifier), \ + f'{binary_quantifier} does not seem to be a Quantifier' + if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier): + print('[warning] the quantifier seems to be an instance of qp.method.aggregative.AggregativeQuantifier; ' + f'you might prefer instantiating {qp.method.aggregative.OneVsAllAggregative.__name__}') + self.binary_quantifier = binary_quantifier + self.n_jobs = qp._get_njobs(n_jobs) + +
[docs] def fit(self, data: LabelledCollection, fit_classifier=True): + assert not data.binary, f'{self.__class__.__name__} expects non-binary data' + assert fit_classifier == True, 'fit_classifier must be True' + + self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_} + self._parallel(self._delayed_binary_fit, data) + return self
+ + def _parallel(self, func, *args, **kwargs): + return np.asarray( + Parallel(n_jobs=self.n_jobs, backend='threading')( + delayed(func)(c, *args, **kwargs) for c in self.classes_ + ) + ) + +
[docs] def quantify(self, instances): + prevalences = self._parallel(self._delayed_binary_predict, instances) + return qp.functional.normalize_prevalence(prevalences)
+ + @property + def classes_(self): + return sorted(self.dict_binary_quantifiers.keys()) + + def _delayed_binary_predict(self, c, X): + return self.dict_binary_quantifiers[c].quantify(X)[1] + + def _delayed_binary_fit(self, c, data): + bindata = LabelledCollection(data.instances, data.labels == c, classes=[False, True]) + self.dict_binary_quantifiers[c].fit(bindata)
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/meta.html b/docs/build/html/_modules/quapy/method/meta.html new file mode 100644 index 0000000..ca38440 --- /dev/null +++ b/docs/build/html/_modules/quapy/method/meta.html @@ -0,0 +1,796 @@ + + + + + + quapy.method.meta — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.method.meta

+import itertools
+from copy import deepcopy
+from typing import Union
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import f1_score, make_scorer, accuracy_score
+from sklearn.model_selection import GridSearchCV, cross_val_predict
+from tqdm import tqdm
+
+import quapy as qp
+from quapy import functional as F
+from quapy.data import LabelledCollection
+from quapy.model_selection import GridSearchQ
+from quapy.method.base import BaseQuantifier, BinaryQuantifier
+from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ, AggregativeQuantifier
+
+try:
+    from . import _neural
+except ModuleNotFoundError:
+    _neural = None
+
+
+if _neural:
+    QuaNet = _neural.QuaNetTrainer
+else:
+    QuaNet = "QuaNet is not available due to missing torch package"
+
+
+
[docs]class MedianEstimator2(BinaryQuantifier): + """ + This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the + estimates returned by differently (hyper)parameterized base quantifiers. + The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions, + i.e., in cases of binary quantification. + + :param base_quantifier: the base, binary quantifier + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param param_grid: the grid of parameters over which the median will be computed + :param n_jobs: number of parallel workers + """ + def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None): + self.base_quantifier = base_quantifier + self.param_grid = param_grid + self.random_state = random_state + self.n_jobs = qp._get_njobs(n_jobs) + +
[docs] def get_params(self, deep=True): + return self.base_quantifier.get_params(deep)
+ +
[docs] def set_params(self, **params): + self.base_quantifier.set_params(**params)
+ + def _delayed_fit(self, args): + with qp.util.temp_seed(self.random_state): + params, training = args + model = deepcopy(self.base_quantifier) + model.set_params(**params) + model.fit(training) + return model + +
[docs] def fit(self, training: LabelledCollection): + self._check_binary(training, self.__class__.__name__) + + configs = qp.model_selection.expand_grid(self.param_grid) + self.models = qp.util.parallel( + self._delayed_fit, + ((params, training) for params in configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + return self
+ + def _delayed_predict(self, args): + model, instances = args + return model.quantify(instances) + +
[docs] def quantify(self, instances): + prev_preds = qp.util.parallel( + self._delayed_predict, + ((model, instances) for model in self.models), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + prev_preds = np.asarray(prev_preds) + return np.median(prev_preds, axis=0)
+ + +
[docs]class MedianEstimator(BinaryQuantifier): + """ + This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the + estimates returned by differently (hyper)parameterized base quantifiers. + The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions, + i.e., in cases of binary quantification. + + :param base_quantifier: the base, binary quantifier + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param param_grid: the grid of parameters over which the median will be computed + :param n_jobs: number of parallel workers + """ + def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None): + self.base_quantifier = base_quantifier + self.param_grid = param_grid + self.random_state = random_state + self.n_jobs = qp._get_njobs(n_jobs) + +
[docs] def get_params(self, deep=True): + return self.base_quantifier.get_params(deep)
+ +
[docs] def set_params(self, **params): + self.base_quantifier.set_params(**params)
+ + def _delayed_fit(self, args): + with qp.util.temp_seed(self.random_state): + params, training = args + model = deepcopy(self.base_quantifier) + model.set_params(**params) + model.fit(training) + return model + + def _delayed_fit_classifier(self, args): + with qp.util.temp_seed(self.random_state): + cls_params, training = args + model = deepcopy(self.base_quantifier) + model.set_params(**cls_params) + predictions = model.classifier_fit_predict(training, predict_on=model.val_split) + return (model, predictions) + + def _delayed_fit_aggregation(self, args): + with qp.util.temp_seed(self.random_state): + ((model, predictions), q_params), training = args + model = deepcopy(model) + model.set_params(**q_params) + model.aggregation_fit(predictions, training) + return model + + +
[docs] def fit(self, training: LabelledCollection): + self._check_binary(training, self.__class__.__name__) + + if isinstance(self.base_quantifier, AggregativeQuantifier): + cls_configs, q_configs = qp.model_selection.group_params(self.param_grid) + + if len(cls_configs) > 1: + models_preds = qp.util.parallel( + self._delayed_fit_classifier, + ((params, training) for params in cls_configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False + ) + else: + model = self.base_quantifier + model.set_params(**cls_configs[0]) + predictions = model.classifier_fit_predict(training, predict_on=model.val_split) + models_preds = [(model, predictions)] + + self.models = qp.util.parallel( + self._delayed_fit_aggregation, + ((setup, training) for setup in itertools.product(models_preds, q_configs)), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False + ) + else: + configs = qp.model_selection.expand_grid(self.param_grid) + self.models = qp.util.parallel( + self._delayed_fit, + ((params, training) for params in configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False + ) + return self
+ + def _delayed_predict(self, args): + model, instances = args + return model.quantify(instances) + +
[docs] def quantify(self, instances): + prev_preds = qp.util.parallel( + self._delayed_predict, + ((model, instances) for model in self.models), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False + ) + prev_preds = np.asarray(prev_preds) + return np.median(prev_preds, axis=0)
+ + +
[docs]class Ensemble(BaseQuantifier): + """ + Implementation of the Ensemble methods for quantification described by + `Pérez-Gállego et al., 2017 <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_ + and + `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + The policies implemented include: + + - Average (`policy='ave'`): computes class prevalence estimates as the average of the estimates + returned by the base quantifiers. + - Training Prevalence (`policy='ptr'`): applies a dynamic selection to the ensemble’s members by retaining only + those members such that the class prevalence values in the samples they use as training set are closest to + preliminary class prevalence estimates computed as the average of the estimates of all the members. The final + estimate is recomputed by considering only the selected members. + - Distribution Similarity (`policy='ds'`): performs a dynamic selection of base members by retaining + the members trained on samples whose distribution of posterior probabilities is closest, in terms of the + Hellinger Distance, to the distribution of posterior probabilities in the test sample + - Accuracy (`policy='<valid error name>'`): performs a static selection of the ensemble members by + retaining those that minimize a quantification error measure, which is passed as an argument. + + Example: + + >>> model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='ave', n_jobs=-1) + + :param quantifier: base quantification member of the ensemble + :param size: number of members + :param red_size: number of members to retain after selection (depending on the policy) + :param min_pos: minimum number of positive instances to consider a sample as valid + :param policy: the selection policy; available policies include: `ave` (default), `ptr`, `ds`, and accuracy + (which is instantiated via a valid error name, e.g., `mae`) + :param max_sample_size: maximum number of instances to consider in the samples (set to None + to indicate no limit, default) + :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out + validation split, or a :class:`quapy.data.base.LabelledCollection` (the split itself). + :param n_jobs: number of parallel workers (default None) + :param verbose: set to True (default is False) to get some information in standard output + """ + + VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES + + def __init__(self, + quantifier: BaseQuantifier, + size=50, + red_size=25, + min_pos=5, + policy='ave', + max_sample_size=None, + val_split:Union[qp.data.LabelledCollection, float]=None, + n_jobs=None, + verbose=False): + assert policy in Ensemble.VALID_POLICIES, \ + f'unknown policy={policy}; valid are {Ensemble.VALID_POLICIES}' + assert max_sample_size is None or max_sample_size > 0, \ + 'wrong value for max_sample_size; set it to a positive number or None' + self.base_quantifier = quantifier + self.size = size + self.min_pos = min_pos + self.red_size = red_size + self.policy = policy + self.val_split = val_split + self.n_jobs = qp._get_njobs(n_jobs) + self.post_proba_fn = None + self.verbose = verbose + self.max_sample_size = max_sample_size + + def _sout(self, msg): + if self.verbose: + print('[Ensemble]' + msg) + +
[docs] def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None): + + if self.policy == 'ds' and not data.binary: + raise ValueError('ds policy is only defined for binary quantification, but this dataset is not binary') + + if val_split is None: + val_split = self.val_split + + # randomly chooses the prevalences for each member of the ensemble (preventing classes with less than + # min_pos positive examples) + sample_size = len(data) if self.max_sample_size is None else min(self.max_sample_size, len(data)) + prevs = [_draw_simplex(ndim=data.n_classes, min_val=self.min_pos / sample_size) for _ in range(self.size)] + + posteriors = None + if self.policy == 'ds': + # precompute the training posterior probabilities + posteriors, self.post_proba_fn = self._ds_policy_get_posteriors(data) + + is_static_policy = (self.policy in qp.error.QUANTIFICATION_ERROR_NAMES) + + args = ( + (self.base_quantifier, data, val_split, prev, posteriors, is_static_policy, self.verbose, sample_size) + for prev in prevs + ) + self.ensemble = qp.util.parallel( + _delayed_new_instance, + tqdm(args, desc='fitting ensemble', total=self.size) if self.verbose else args, + asarray=False, + n_jobs=self.n_jobs) + + # static selection policy (the name of a quantification-oriented error function to minimize) + if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES: + self._accuracy_policy(error_name=self.policy) + + self._sout('Fit [Done]') + return self
+ +
[docs] def quantify(self, instances): + predictions = np.asarray( + qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs) + ) + + if self.policy == 'ptr': + predictions = self._ptr_policy(predictions) + elif self.policy == 'ds': + predictions = self._ds_policy(predictions, instances) + + predictions = np.mean(predictions, axis=0) + return F.normalize_prevalence(predictions)
+ +
[docs] def set_params(self, **parameters): + """ + This function should not be used within :class:`quapy.model_selection.GridSearchQ` (it is defined here for + compatibility with the abstract class). + Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or + `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a classifier `l` optimized for + classification (not recommended). + + :param parameters: dictionary + :return: raises an Exception + """ + raise NotImplementedError(f'{self.__class__.__name__} should not be used within GridSearchQ; ' + f'instead, use Ensemble(GridSearchQ(q),...), with q a Quantifier (recommended), ' + f'or Ensemble(Q(GridSearchCV(l))) with Q a quantifier class that has a classifier ' + f'l optimized for classification (not recommended).')
+ +
[docs] def get_params(self, deep=True): + """ + This function should not be used within :class:`quapy.model_selection.GridSearchQ` (it is defined here for + compatibility with the abstract class). + Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or + `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a classifier `l` optimized for + classification (not recommended). + + :param deep: for compatibility with scikit-learn + :return: raises an Exception + """ + + raise NotImplementedError()
+ + def _accuracy_policy(self, error_name): + """ + Selects the red_size best-performing quantifiers in a static way (i.e., dropping all non-selected members). + For each model in the ensemble, the performance is measured in terms of _error_name_ on the quantification of + the samples used for training the rest of the models in the ensemble. + """ + from quapy.evaluation import evaluate_on_samples + error = qp.error.from_name(error_name) + tests = [m[3] for m in self.ensemble] + scores = [] + for i, model in enumerate(self.ensemble): + scores.append(evaluate_on_samples(model[0], tests[:i] + tests[i + 1:], error)) + order = np.argsort(scores) + + self.ensemble = _select_k(self.ensemble, order, k=self.red_size) + + def _ptr_policy(self, predictions): + """ + Selects the predictions made by models that have been trained on samples with a prevalence that is most similar + to a first approximation of the test prevalence as made by all models in the ensemble. + """ + test_prev_estim = predictions.mean(axis=0) + tr_prevs = [m[1] for m in self.ensemble] + ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs] + order = np.argsort(ptr_differences) + return _select_k(predictions, order, k=self.red_size) + + def _ds_policy_get_posteriors(self, data: LabelledCollection): + """ + Some aspects of this method are not fully specified in the original article. The paper says that the + distribution of posterior probabilities from training and test examples is compared by means of the + Hellinger Distance. However, how these posterior probabilities are generated is not specified. In the article, + a Logistic Regressor (LR) is used as the classifier device, and that could be used for this purpose. However, in + general, a Quantifier is not necessarily an instance of Aggregative Probabilistic Quantifiers, and so, that the + quantifier builds on top of a probabilistic classifier cannot be taken for granted. Additionally, it would not + be correct to generate the posterior probabilities for training instances that have participated in training the + classifier that generates them. + + This function thus generates the posterior probabilities for all training documents in a cross-validation way, + using LR with hyperparameters that have previously been optimized via grid search in 5FCV. + + :param data: a LabelledCollection + :return: (P,f,) where P is an ndarray containing the posterior probabilities of the training data, generated via + cross-validation and using an optimized LR, and the function to be used in order to generate posterior + probabilities for test instances. 
+ """ + + X, y = data.Xy + lr_base = LogisticRegression(class_weight='balanced', max_iter=1000) + + param_grid = {'C': np.logspace(-4, 4, 9)} + optim = GridSearchCV(lr_base, param_grid=param_grid, cv=5, n_jobs=self.n_jobs, refit=True).fit(X, y) + + posteriors = cross_val_predict(optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba') + posteriors_generator = optim.best_estimator_.predict_proba + + return posteriors, posteriors_generator + + def _ds_policy(self, predictions, test): + test_posteriors = self.post_proba_fn(test) + test_distribution = get_probability_distribution(test_posteriors) + tr_distributions = [m[2] for m in self.ensemble] + dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions] + order = np.argsort(dist) + return _select_k(predictions, order, k=self.red_size) + + @property + def aggregative(self): + """ + Indicates that the quantifier is not aggregative. + + :return: False + """ + return False + + @property + def probabilistic(self): + """ + Indicates that the quantifier is not probabilistic. + + :return: False + """ + return False
+ + +
[docs]def get_probability_distribution(posterior_probabilities, bins=8): + """ + Gets a histogram out of the posterior probabilities (only for the binary case). + + :param posterior_probabilities: array-like of shape `(n_instances, 2,)` + :param bins: integer + :return: `np.ndarray` with the relative frequencies for each bin (for the positive class only) + """ + assert posterior_probabilities.shape[1] == 2, 'the posterior probabilities do not seem to be for a binary problem' + posterior_probabilities = posterior_probabilities[:, 1] # take the positive posteriors only + distribution, _ = np.histogram(posterior_probabilities, bins=bins, range=(0, 1), density=True) + return distribution
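A quick check of this helper (the expected output follows from density=True: counts divided by n times the bin width, here 4 * 0.25 = 1):

>>> import numpy as np
>>> posteriors = np.asarray([[0.9, 0.1], [0.2, 0.8], [0.35, 0.65], [0.5, 0.5]])
>>> get_probability_distribution(posteriors, bins=4)  # positive posteriors fall in bins 0, 3, 2, 2
array([1., 0., 2., 1.])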
+ + +def _select_k(elements, order, k): + return [elements[idx] for idx in order[:k]] + + +def _delayed_new_instance(args): + base_quantifier, data, val_split, prev, posteriors, keep_samples, verbose, sample_size = args + if verbose: + print(f'\tfit-start for prev {F.strprev(prev)}, sample_size={sample_size}') + model = deepcopy(base_quantifier) + + if val_split is not None: + if isinstance(val_split, float): + assert 0 < val_split < 1, 'val_split should be in (0,1)' + data, val_split = data.split_stratified(train_prop=1 - val_split) + + sample_index = data.sampling_index(sample_size, *prev) + sample = data.sampling_from_index(sample_index) + + if val_split is not None: + model.fit(sample, val_split=val_split) + else: + model.fit(sample) + + tr_prevalence = sample.prevalence() + tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None + + if verbose: + print(f'\t--fit-ended for prev {F.strprev(prev)}') + + return (model, tr_prevalence, tr_distribution, sample if keep_samples else None) + + +def _delayed_quantify(args): + quantifier, instances = args + return quantifier[0].quantify(instances) + + +def _draw_simplex(ndim, min_val, max_trials=100): + """ + Returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions + are >= min_val (for min_val>0, this makes the sampling not truly uniform) + + :param ndim: number of dimensions of the simplex + :param min_val: minimum class prevalence allowed. If less than 1/ndim a ValueError will be thrown since + there is no possible solution. + :return: a sample from the ndim-dimensional simplex that is uniform in S(ndim)-R where S(ndim) is the simplex + and R is the simplex subset containing dimensions lower than min_val + """ + if min_val >= 1 / ndim: + raise ValueError(f'no sample can be drawn from the {ndim}-dimensional simplex so that ' + f'all its values are >={min_val} (try with a larger value for min_pos)') + trials = 0 + while True: + u = F.uniform_simplex_sampling(ndim) + if all(u >= min_val): + return u + trials += 1 + if trials >= max_trials: + raise ValueError(f'it looks like finding a random simplex with all its dimensions being ' + f'>= {min_val} is unlikely (it failed after {max_trials} trials)') + + +def _instantiate_ensemble(classifier, base_quantifier_class, param_grid, optim, param_model_sel, **kwargs): + if optim is None: + base_quantifier = base_quantifier_class(classifier) + elif optim in qp.error.CLASSIFICATION_ERROR: + if optim == qp.error.f1e: + scoring = make_scorer(f1_score) + elif optim == qp.error.acce: + scoring = make_scorer(accuracy_score) + classifier = GridSearchCV(classifier, param_grid, scoring=scoring) + base_quantifier = base_quantifier_class(classifier) + else: + base_quantifier = GridSearchQ(base_quantifier_class(classifier), + param_grid=param_grid, + **param_model_sel, + error=optim) + + return Ensemble(base_quantifier, **kwargs) + + +def _check_error(error): + if error is None: + return None + if error in qp.error.QUANTIFICATION_ERROR or error in qp.error.CLASSIFICATION_ERROR: + return error + elif isinstance(error, str): + return qp.error.from_name(error) + else: + raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n' + f'the name of an error function in {qp.error.ERROR_NAMES}') + + +
[docs]def ensembleFactory(classifier, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None, + **kwargs): + """ + Ensemble factory. Provides a unified interface for instantiating ensembles that can be optimized (via model + selection for quantification) for a given evaluation metric using :class:`quapy.model_selection.GridSearchQ`. + If the evaluation metric is classification-oriented + (instead of quantification-oriented), then the optimization will be carried out via sklearn's + `GridSearchCV <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html>`_. + + Example to instantiate an :class:`Ensemble` based on :class:`quapy.method.aggregative.PACC` + in which the base members are optimized for :meth:`quapy.error.mae` via + :class:`quapy.model_selection.GridSearchQ`. The ensemble follows the policy `Accuracy` based + on :meth:`quapy.error.mae` (the same measure being optimized), + meaning that a static selection of members of the ensemble is made based on their performance + in terms of this error. + + >>> param_grid = { + >>> 'C': np.logspace(-3,3,7), + >>> 'class_weight': ['balanced', None] + >>> } + >>> param_mod_sel = { + >>> 'sample_size': 500, + >>> 'protocol': 'app' + >>> } + >>> common={ + >>> 'max_sample_size': 1000, + >>> 'n_jobs': -1, + >>> 'param_grid': param_grid, + >>> 'param_model_sel': param_mod_sel, + >>> } + >>> + >>> ensembleFactory(LogisticRegression(), PACC, optim='mae', policy='mae', **common) + + :param classifier: sklearn's Estimator that generates a classifier + :param base_quantifier_class: a class of quantifiers + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_model_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + if optim is not None: + if param_grid is None: + raise ValueError(f'param_grid is None but optim was requested.') + if param_model_sel is None: + raise ValueError(f'param_model_sel is None but optim was requested.') + error = _check_error(optim) + return _instantiate_ensemble(classifier, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
+ + +
[docs]def ECC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.CC` quantifiers, as used by + `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + + Equivalent to: + + >>> ensembleFactory(classifier, CC, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_mod_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, CC, param_grid, optim, param_mod_sel, **kwargs)
+ + +
[docs]def EACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.ACC` quantifiers, as used by + `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + + Equivalent to: + + >>> ensembleFactory(classifier, ACC, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_mod_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, ACC, param_grid, optim, param_mod_sel, **kwargs)
+ + +
[docs]def EPACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.PACC` quantifiers. + + Equivalent to: + + >>> ensembleFactory(classifier, PACC, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_mod_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, PACC, param_grid, optim, param_mod_sel, **kwargs)
+ + +
[docs]def EHDy(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.HDy` quantifiers, as used by + `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + + Equivalent to: + + >>> ensembleFactory(classifier, HDy, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_mod_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, HDy, param_grid, optim, param_mod_sel, **kwargs)
+ + +
[docs]def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.EMQ` quantifiers. + + Equivalent to: + + >>> ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_mod_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/non_aggregative.html b/docs/build/html/_modules/quapy/method/non_aggregative.html new file mode 100644 index 0000000..aeb5b96 --- /dev/null +++ b/docs/build/html/_modules/quapy/method/non_aggregative.html @@ -0,0 +1,266 @@ + + + + + + quapy.method.non_aggregative — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.method.non_aggregative

+from typing import Union, Callable
+import numpy as np
+
+from quapy.functional import get_divergence
+from quapy.data import LabelledCollection
+from quapy.method.base import BaseQuantifier, BinaryQuantifier
+import quapy.functional as F
+
+
+
[docs]class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier): + """ + The `Maximum Likelihood Prevalence Estimation` (MLPE) method is a lazy method that assumes there is no prior + probability shift between training and test instances (put another way, that the i.i.d. assumption holds). + The estimation of class prevalence values for any test sample is always (i.e., irrespective of the test sample + itself) the class prevalence seen during training. This method is considered to be a lower-bound quantifier that + any quantification method should beat. + """ + + def __init__(self): + self._classes_ = None + +
[docs] def fit(self, data: LabelledCollection): + """ + Computes the training prevalence and stores it. + + :param data: the training sample + :return: self + """ + self.estimated_prevalence = data.prevalence() + return self
+ +
[docs] def quantify(self, instances): + """ + Ignores the input instances and returns, as the class prevalence estimates, the training prevalence. + + :param instances: array-like (ignored) + :return: the class prevalence seen during training + """ + return self.estimated_prevalence
+ + +
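Usage is trivial by design; a sketch (hedged: data loading as in the earlier examples):

>>> import quapy as qp
>>> from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
>>> train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5).train_test
>>> mlpe = MaximumLikelihoodPrevalenceEstimation()
>>> mlpe.fit(train)                # memorizes train.prevalence()
>>> mlpe.quantify(test.instances)  # returns that same vector, irrespective of the sample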
[docs]class DMx(BaseQuantifier): + """ + Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of covariates. + This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters. + + :param nbins: number of bins used to discretize the distributions (default 8) + :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented) + or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger + Distance) + :param cdf: whether to use CDF instead of PDF (default False) + :param search: the strategy for solving the optimization problem that searches for the mixture parameter; this + value is passed as `method` to :meth:`quapy.functional.argmin_prevalence` (default 'optim_minimize') + :param n_jobs: number of parallel workers (default None) + """ + + def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, search='optim_minimize', n_jobs=None): + self.nbins = nbins + self.divergence = divergence + self.cdf = cdf + self.search = search + self.n_jobs = n_jobs + +
[docs]    @classmethod
+    def HDx(cls, n_jobs=None):
+        """
+        `Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
+        HDx is a method for training binary quantifiers that models quantification as the problem of
+        minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
+        histograms of two representations, one for the unlabelled examples, and another generated from the training
+        examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
+        the estimates of the class prevalence values.
+
+        The method computes the matching for each nbins in [10, 20, ..., 110] and reports the median of the estimates.
+        The best prevalence is searched via linear search, from 0 to 1 stepping by 0.01.
+
+        :param n_jobs: number of parallel workers
+        :return: an instance of this class set up to mimic the behaviour of the HDx as originally proposed by
+            González-Castro, Alaiz-Rodríguez, Alegre (2013)
+        """
+        from quapy.method.meta import MedianEstimator
+
+        dmx = DMx(divergence='HD', cdf=False, search='linear_search')
+        nbins = {'nbins': np.linspace(10, 110, 11, dtype=int)}
+        hdx = MedianEstimator(base_quantifier=dmx, param_grid=nbins, n_jobs=n_jobs)
+        return hdx
+ + def __get_distributions(self, X): + + histograms = [] + for feat_idx in range(self.nfeats): + feature = X[:, feat_idx] + feat_range = self.feat_ranges[feat_idx] + hist = np.histogram(feature, bins=self.nbins, range=feat_range)[0] + norm_hist = hist / hist.sum() + histograms.append(norm_hist) + distributions = np.vstack(histograms) + + if self.cdf: + distributions = np.cumsum(distributions, axis=1) + + return distributions + +
[docs] def fit(self, data: LabelledCollection): + """ + Generates the validation distributions out of the training data (covariates). + The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats` + the number of features, and `nbins` the number of bins. + In particular, let `V` be the validation distributions; then `di=V[i]` are the distributions obtained from + training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in + training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin. + + :param data: the training set + """ + X, y = data.Xy + + self.nfeats = X.shape[1] + self.feat_ranges = _get_features_range(X) + + self.validation_distribution = np.asarray( + [self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)] + ) + + return self
+ +
[docs]    def quantify(self, instances):
+        """
+        Searches for the mixture model parameters (the sought prevalence values) that yield a mixture of the
+        validation distributions that best matches the test distribution, in terms of the divergence measure of
+        choice. The matching is computed as the average divergence across all feature-specific discrete distributions.
+
+        :param instances: instances in the sample
+        :return: a vector of class prevalence estimates
+        """
+
+        assert instances.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {instances.shape[1]}'
+
+        test_distribution = self.__get_distributions(instances)
+        divergence = get_divergence(self.divergence)
+        n_classes, n_feats, nbins = self.validation_distribution.shape
+        def loss(prev):
+            prev = np.expand_dims(prev, axis=0)
+            mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_feats, -1)
+            divs = [divergence(test_distribution[feat], mixture_distribution[feat]) for feat in range(n_feats)]
+            return np.mean(divs)
+
+        return F.argmin_prevalence(loss, n_classes, method=self.search)
+ + + +def _get_features_range(X): + feat_ranges = [] + ncols = X.shape[1] + for col_idx in range(ncols): + feature = X[:,col_idx] + feat_ranges.append((np.min(feature), np.max(feature))) + return feat_ranges + + +#--------------------------------------------------------------- +# aliases +#--------------------------------------------------------------- + +DistributionMatchingX = DMx +
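Both string-named divergences and user-supplied callables are accepted. A small sketch using the alias and a custom total-variation divergence; `training_data` (a `LabelledCollection`) and `test_X` (a dense feature matrix) are assumed to be in scope:

import numpy as np
from quapy.method.non_aggregative import DistributionMatchingX  # alias of DMx

# custom divergence: total variation distance between two discrete distributions
def total_variation(p, q):
    return 0.5 * np.abs(p - q).sum()

dmx = DistributionMatchingX(nbins=16, divergence=total_variation, cdf=True)
dmx.fit(training_data)
estim_prev = dmx.quantify(test_X)   # test_X: ndarray of shape (n_instances, n_features)

For the classic binary HDx configuration, the factory `DMx.HDx(n_jobs=-1)` returns a `MedianEstimator` over `nbins` in [10, 20, ..., 110], as described above.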
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/model_selection.html b/docs/build/html/_modules/quapy/model_selection.html
new file mode 100644
index 0000000..172c8f3
--- /dev/null
+++ b/docs/build/html/_modules/quapy/model_selection.html
@@ -0,0 +1,516 @@
+ quapy.model_selection — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.model_selection

+import itertools
+import signal
+from copy import deepcopy
+from enum import Enum
+from typing import Union, Callable
+from functools import wraps
+
+import numpy as np
+from sklearn import clone
+
+import quapy as qp
+from quapy import evaluation
+from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
+from quapy.data.base import LabelledCollection
+from quapy.method.aggregative import BaseQuantifier, AggregativeQuantifier
+from quapy.util import timeout
+from time import time
+
+
+
[docs]class Status(Enum): + SUCCESS = 1 + TIMEOUT = 2 + INVALID = 3 + ERROR = 4
+ + +
[docs]class ConfigStatus: + def __init__(self, params, status, msg=''): + self.params = params + self.status = status + self.msg = msg + + def __str__(self): + return f':params:{self.params} :status:{self.status} ' + self.msg + + def __repr__(self): + return str(self) + +
[docs] def success(self): + return self.status == Status.SUCCESS
+ +
[docs] def failed(self): + return self.status != Status.SUCCESS
+ + +
[docs]class GridSearchQ(BaseQuantifier):
+    """Grid Search optimization targeting a quantification-oriented metric.
+
+    Optimizes the hyperparameters of a quantification method, based on an evaluation method and on an evaluation
+    protocol for quantification.
+
+    :param model: the quantifier to optimize
+    :type model: BaseQuantifier
+    :param param_grid: a dictionary with keys the parameter names and values the list of values to explore
+    :param protocol: a sample generation protocol, an instance of :class:`quapy.protocol.AbstractProtocol`
+    :param error: an error function (callable) or a string indicating the name of an error function (valid ones
+        are those in :class:`quapy.error.QUANTIFICATION_ERROR`)
+    :param refit: whether to refit the model on the whole labelled collection (training+validation) with
+        the best chosen hyperparameter combination; requires the protocol to implement the
+        :class:`quapy.protocol.OnLabelledCollectionProtocol` interface
+    :param timeout: establishes a timer (in seconds) for each of the hyperparameter configurations being tested.
+        Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up
+        being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.
+    :param n_jobs: number of parallel workers; if None, the value is taken from quapy's environment settings
+    :param raise_errors: boolean; if True, an exception is raised as soon as a parameter combination yields any
+        error; if False (default), the combination is marked with an error status and the process goes on.
+        However, if no configuration yields a valid model, then a ValueError exception will be raised.
+    :param verbose: set to True to get information through the stdout
+    """
+
+    def __init__(self,
+                 model: BaseQuantifier,
+                 param_grid: dict,
+                 protocol: AbstractProtocol,
+                 error: Union[Callable, str] = qp.error.mae,
+                 refit=True,
+                 timeout=-1,
+                 n_jobs=None,
+                 raise_errors=False,
+                 verbose=False):
+
+        self.model = model
+        self.param_grid = param_grid
+        self.protocol = protocol
+        self.refit = refit
+        self.timeout = timeout
+        self.n_jobs = qp._get_njobs(n_jobs)
+        self.raise_errors = raise_errors
+        self.verbose = verbose
+        self.__check_error(error)
+        assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
+
+    def _sout(self, msg):
+        if self.verbose:
+            print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')
+
+    def __check_error(self, error):
+        if error in qp.error.QUANTIFICATION_ERROR:
+            self.error = error
+        elif isinstance(error, str):
+            self.error = qp.error.from_name(error)
+        elif hasattr(error, '__call__'):
+            self.error = error
+        else:
+            raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
+                             f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')
+
+    def _prepare_classifier(self, cls_params):
+        model = deepcopy(self.model)
+
+        def job(cls_params):
+            model.set_params(**cls_params)
+            predictions = model.classifier_fit_predict(self._training)
+            return predictions
+
+        predictions, status, took = self._error_handler(job, cls_params)
+        self._sout(f'[classifier fit] hyperparams={cls_params} [took {took:.3f}s]')
+        return model, predictions, status, took
+
+    def _prepare_aggregation(self, args):
+        model, predictions, cls_took, cls_params, q_params = args
+        model = deepcopy(model)
+        params = {**cls_params, **q_params}
+
+        def job(q_params):
+            model.set_params(**q_params)
+            model.aggregation_fit(predictions, self._training)
+            score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
+            return score
+
+        score, status, aggr_took = self._error_handler(job, q_params)
+
self._print_status(params, score, status, aggr_took) + return model, params, score, status, (cls_took+aggr_took) + + def _prepare_nonaggr_model(self, params): + model = deepcopy(self.model) + + def job(params): + model.set_params(**params) + model.fit(self._training) + score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error) + return score + + score, status, took = self._error_handler(job, params) + self._print_status(params, score, status, took) + return model, params, score, status, took + + def _break_down_fit(self): + """ + Decides whether to break down the fit phase in two (classifier-fit followed by aggregation-fit). + In order to do so, some conditions should be met: a) the quantifier is of type aggregative, + b) the set of hyperparameters can be split into two disjoint non-empty groups. + + :return: True if the conditions are met, False otherwise + """ + if not isinstance(self.model, AggregativeQuantifier): + return False + cls_configs, q_configs = group_params(self.param_grid) + if (len(cls_configs) == 1) or (len(q_configs)==1): + return False + return True + + def _compute_scores_aggregative(self, training): + # break down the set of hyperparameters into two: classifier-specific, quantifier-specific + cls_configs, q_configs = group_params(self.param_grid) + + # train all classifiers and get the predictions + self._training = training + cls_outs = qp.util.parallel( + self._prepare_classifier, + cls_configs, + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + + # filter out classifier configurations that yielded any error + success_outs = [] + for (model, predictions, status, took), cls_config in zip(cls_outs, cls_configs): + if status.success(): + success_outs.append((model, predictions, took, cls_config)) + else: + self.error_collector.append(status) + + if len(success_outs) == 0: + raise ValueError('No valid configuration found for the classifier!') + + # explore the quantifier-specific hyperparameters for each valid training configuration + aggr_configs = [(*out, q_config) for out, q_config in itertools.product(success_outs, q_configs)] + aggr_outs = qp.util.parallel( + self._prepare_aggregation, + aggr_configs, + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + + return aggr_outs + + def _compute_scores_nonaggregative(self, training): + configs = expand_grid(self.param_grid) + self._training = training + scores = qp.util.parallel( + self._prepare_nonaggr_model, + configs, + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + return scores + + def _print_status(self, params, score, status, took): + if status.success(): + self._sout(f'hyperparams=[{params}]\t got {self.error.__name__} = {score:.5f} [took {took:.3f}s]') + else: + self._sout(f'error={status}') + +
[docs] def fit(self, training: LabelledCollection): + """ Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing + the error metric. + + :param training: the training set on which to optimize the hyperparameters + :return: self + """ + + if self.refit and not isinstance(self.protocol, OnLabelledCollectionProtocol): + raise RuntimeWarning( + f'"refit" was requested, but the protocol does not implement ' + f'the {OnLabelledCollectionProtocol.__name__} interface' + ) + + tinit = time() + + self.error_collector = [] + + self._sout(f'starting model selection with n_jobs={self.n_jobs}') + if self._break_down_fit(): + results = self._compute_scores_aggregative(training) + else: + results = self._compute_scores_nonaggregative(training) + + self.param_scores_ = {} + self.best_score_ = None + for model, params, score, status, took in results: + if status.success(): + if self.best_score_ is None or score < self.best_score_: + self.best_score_ = score + self.best_params_ = params + self.best_model_ = model + self.param_scores_[str(params)] = score + else: + self.param_scores_[str(params)] = status.status + self.error_collector.append(status) + + tend = time()-tinit + + if self.best_score_ is None: + raise ValueError('no combination of hyperparameters seemed to work') + + self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) ' + f'[took {tend:.4f}s]') + + no_errors = len(self.error_collector) + if no_errors>0: + self._sout(f'warning: {no_errors} errors found') + for err in self.error_collector: + self._sout(f'\t{str(err)}') + + if self.refit: + if isinstance(self.protocol, OnLabelledCollectionProtocol): + tinit = time() + self._sout(f'refitting on the whole development set') + self.best_model_.fit(training + self.protocol.get_labelled_collection()) + tend = time() - tinit + self.refit_time_ = tend + else: + # already checked + raise RuntimeWarning(f'the model cannot be refit on the whole dataset') + + return self
+ +
[docs]    def quantify(self, instances):
+        """Estimate class prevalence values using the best model found after calling the :meth:`fit` method.
+
+        :param instances: sample containing the instances
+        :return: a ndarray of shape `(n_classes)` with class prevalence estimates according to the best model found
+            by the model selection process.
+        """
+        assert hasattr(self, 'best_model_'), 'quantify called before fit'
+        return self.best_model().quantify(instances)
+ +
[docs] def set_params(self, **parameters): + """Sets the hyper-parameters to explore. + + :param parameters: a dictionary with keys the parameter names and values the list of values to explore + """ + self.param_grid = parameters
+ +
[docs] def get_params(self, deep=True): + """Returns the dictionary of hyper-parameters to explore (`param_grid`) + + :param deep: Unused + :return: the dictionary `param_grid` + """ + return self.param_grid
+ +
[docs] def best_model(self): + """ + Returns the best model found after calling the :meth:`fit` method, i.e., the one trained on the combination + of hyper-parameters that minimized the error function. + + :return: a trained quantifier + """ + if hasattr(self, 'best_model_'): + return self.best_model_ + raise ValueError('best_model called before fit')
+ + def _error_handler(self, func, params): + """ + Endorses one job with two returned values: the status, and the time of execution + + :param func: the function to be called + :param params: parameters of the function + :return: `tuple(out, status, time)` where `out` is the function output, + `status` is an enum value from `Status`, and `time` is the time it + took to complete the call + """ + + output = None + + def _handle(status, exception): + if self.raise_errors: + raise exception + else: + return ConfigStatus(params, status) + + try: + with timeout(self.timeout): + tinit = time() + output = func(params) + status = ConfigStatus(params, Status.SUCCESS) + + except TimeoutError as e: + status = _handle(Status.TIMEOUT, e) + + except ValueError as e: + status = _handle(Status.INVALID, e) + + except Exception as e: + status = _handle(Status.ERROR, e) + + took = time() - tinit + return output, status, took
+ + +
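To ground the routine above, here is a minimal usage sketch; the dataset, sample size, and hyperparameter choices are illustrative, not prescriptive:

import quapy as qp
from quapy.method.aggregative import PACC
from quapy.protocol import APP
from sklearn.linear_model import LogisticRegression

qp.environ['SAMPLE_SIZE'] = 100
data = qp.datasets.fetch_reviews('kindle', tfidf=True)
training, validation = data.training.split_stratified(train_prop=0.6)

# keys prefixed with 'classifier__' are routed to the underlying classifier; when both
# classifier- and quantifier-specific keys are present, the fit is broken down in two phases
grid = GridSearchQ(
    model=PACC(LogisticRegression()),
    param_grid={'classifier__C': [0.1, 1, 10]},
    protocol=APP(validation),
    error='mae',
    refit=False,
    verbose=True
).fit(training)
print(f'best: {grid.best_params_} (mae={grid.best_score_:.4f})')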
[docs]def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfolds=3, random_state=0): + """ + Akin to `scikit-learn's cross_val_predict <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html>`_ + but for quantification. + + :param quantifier: a quantifier issuing class prevalence values + :param data: a labelled collection + :param nfolds: number of folds for k-fold cross validation generation + :param random_state: random seed for reproducibility + :return: a vector of class prevalence values + """ + + total_prev = np.zeros(shape=data.n_classes) + + for train, test in data.kFCV(nfolds=nfolds, random_state=random_state): + quantifier.fit(train) + fold_prev = quantifier.quantify(test.X) + rel_size = 1. * len(test) / len(data) + total_prev += fold_prev*rel_size + + return total_prev
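A short usage sketch; `ACC` is just one possible quantifier, and `data` is assumed to be a `LabelledCollection` in scope:

from quapy.method.aggregative import ACC
from sklearn.linear_model import LogisticRegression

# estimates the prevalence of a labelled collection via 3-fold cross-validation
prev = cross_val_predict(ACC(LogisticRegression()), data, nfolds=3)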
+ + +
[docs]def expand_grid(param_grid: dict):
+    """
+    Expands a param_grid dictionary as a list of configurations.
+    Example:
+
+    >>> combinations = expand_grid({'A': [1, 10, 100], 'B': [True, False]})
+    >>> print(combinations)
+    [{'A': 1, 'B': True}, {'A': 1, 'B': False}, {'A': 10, 'B': True}, {'A': 10, 'B': False}, {'A': 100, 'B': True}, {'A': 100, 'B': False}]
+
+    :param param_grid: dictionary with keys representing hyper-parameter names, and values representing the range
+        to explore for that hyper-parameter
+    :return: a list of configurations, i.e., combinations of hyper-parameter assignments in the grid.
+    """
+    params_keys = list(param_grid.keys())
+    params_values = list(param_grid.values())
+    configs = [{k: combs[i] for i, k in enumerate(params_keys)} for combs in itertools.product(*params_values)]
+    return configs
+ + +
[docs]def group_params(param_grid: dict):
+    """
+    Partitions a param_grid dictionary as two lists of configurations, one for the classifier-specific
+    hyper-parameters, and another for the quantifier-specific hyper-parameters
+
+    :param param_grid: dictionary with keys representing hyper-parameter names, and values representing the range
+        to explore for that hyper-parameter
+    :return: two expanded grids of configurations, one for the classifier, another for the quantifier
+    """
+    classifier_params, quantifier_params = {}, {}
+    for key, values in param_grid.items():
+        if key.startswith('classifier__') or key == 'val_split':
+            classifier_params[key] = values
+        else:
+            quantifier_params[key] = values
+
+    classifier_configs = expand_grid(classifier_params)
+    quantifier_configs = expand_grid(quantifier_params)
+
+    return classifier_configs, quantifier_configs
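For instance, the following split is what GridSearchQ relies on to break the fit down into its two phases ('val_split' counts as classifier-side because it affects classifier training):

cls_configs, q_configs = group_params({'classifier__C': [1, 10], 'nbins': [4, 8]})
# cls_configs -> [{'classifier__C': 1}, {'classifier__C': 10}]
# q_configs   -> [{'nbins': 4}, {'nbins': 8}]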
+ +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/plot.html b/docs/build/html/_modules/quapy/plot.html
new file mode 100644
index 0000000..79179a1
--- /dev/null
+++ b/docs/build/html/_modules/quapy/plot.html
@@ -0,0 +1,687 @@
+ quapy.plot — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.plot

+from collections import defaultdict
+import matplotlib.pyplot as plt
+from matplotlib.cm import get_cmap
+import numpy as np
+from matplotlib import cm
+from scipy.stats import ttest_ind_from_stats
+from matplotlib.ticker import ScalarFormatter
+import math
+
+import quapy as qp
+
+plt.rcParams['figure.figsize'] = [10, 6]
+plt.rcParams['figure.dpi'] = 200
+plt.rcParams['font.size'] = 18
+
+
+
+[docs]
+def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True,
+                    train_prev=None, savepath=None, method_order=None):
+    """
+    The diagonal plot displays the predicted prevalence values (along the y-axis) as a function of the true prevalence
+    values (along the x-axis). The optimal quantifier is described by the diagonal (0,0)-(1,1) of the plot (hence the
+    name). It is convenient for binary quantification problems, though it can be used for multiclass problems by
+    indicating which class is to be taken as the positive class. (For multiclass quantification problems, other plots
+    like the :meth:`error_by_drift` might be preferable though).
+
+    :param method_names: array-like with the method names for each experiment
+    :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
+        each experiment
+    :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
+        for each experiment
+    :param pos_class: index of the positive class
+    :param title: the title to be displayed in the plot
+    :param show_std: whether or not to show standard deviations (represented by color bands). This might be inconvenient
+        for cases in which many methods are compared, or when the standard deviations are high (default True)
+    :param legend: whether or not to display the legend (default True)
+    :param train_prev: if indicated (default is None), the training prevalence (for the positive class) is highlighted
+        in the plot. This is convenient when all the experiments have been conducted on the same dataset.
+    :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
+    :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
+        listed in the legend and associated with matplotlib colors).
+    """
+    fig, ax = plt.subplots()
+    ax.set_aspect('equal')
+    ax.grid()
+    ax.plot([0, 1], [0, 1], '--k', label='ideal', zorder=1)
+
+    method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs)
+
+    order = list(zip(method_names, true_prevs, estim_prevs))
+    if method_order is not None:
+        table = {method_name:[true_prev, estim_prev] for method_name, true_prev, estim_prev in order}
+        order = [(method_name, *table[method_name]) for method_name in method_order]
+
+    NUM_COLORS = len(method_names)
+    if NUM_COLORS>10:
+        cm = plt.get_cmap('tab20')
+        ax.set_prop_cycle(color=[cm(1.
* i / NUM_COLORS) for i in range(NUM_COLORS)]) + for method, true_prev, estim_prev in order: + true_prev = true_prev[:,pos_class] + estim_prev = estim_prev[:,pos_class] + + x_ticks = np.unique(true_prev) + x_ticks.sort() + y_ave = np.asarray([estim_prev[true_prev == x].mean() for x in x_ticks]) + y_std = np.asarray([estim_prev[true_prev == x].std() for x in x_ticks]) + + ax.errorbar(x_ticks, y_ave, fmt='-', marker='o', label=method, markersize=3, zorder=2) + if show_std: + ax.fill_between(x_ticks, y_ave - y_std, y_ave + y_std, alpha=0.25) + + if train_prev is not None: + train_prev = train_prev[pos_class] + ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3) + + ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title) + ax.set_ylim(0, 1) + ax.set_xlim(0, 1) + + if legend: + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + # ax.legend(loc='lower center', + # bbox_to_anchor=(1, -0.5), + # ncol=(len(method_names)+1)//2) + + _save_or_show(savepath)
+ + + +
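A typical workflow feeds this function with the outputs of repeated evaluations. A compressed sketch, assuming `qp.evaluation.prediction` returns the pair `(true_prevs, estim_prevs)` for a protocol; the dataset and methods are illustrative:

import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import CC, ACC
from sklearn.linear_model import LogisticRegression

qp.environ['SAMPLE_SIZE'] = 100
data = qp.datasets.fetch_reviews('kindle', tfidf=True)

method_names, true_prevs, estim_prevs = [], [], []
for name, method in [('CC', CC(LogisticRegression())), ('ACC', ACC(LogisticRegression()))]:
    method.fit(data.training)
    true, estim = qp.evaluation.prediction(method, APP(data.test))
    method_names.append(name)
    true_prevs.append(true)
    estim_prevs.append(estim)

qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs,
                        train_prev=data.training.prevalence())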
+[docs] +def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None): + """ + Box-plots displaying the global bias (i.e., signed error computed as the estimated value minus the true value) + for each quantification method with respect to a given positive class. + + :param method_names: array-like with the method names for each experiment + :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for + each experiment + :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) + for each experiment + :param pos_class: index of the positive class + :param title: the title to be displayed in the plot + :param savepath: path where to save the plot. If not indicated (as default), the plot is shown. + """ + + method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs) + + fig, ax = plt.subplots() + ax.grid() + + data, labels = [], [] + for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs): + true_prev = true_prev[:,pos_class] + estim_prev = estim_prev[:,pos_class] + data.append(estim_prev-true_prev) + labels.append(method) + + ax.boxplot(data, labels=labels, patch_artist=False, showmeans=True) + plt.xticks(rotation=45) + ax.set(ylabel='error bias', title=title) + + _save_or_show(savepath)
+ + + +
+[docs]
+def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
+                     vertical_xticks=False, legend=True, savepath=None):
+    """
+    Box-plots displaying the local bias (i.e., signed error computed as the estimated value minus the true value)
+    for different bins of (true) prevalence of the positive class, for each quantification method.
+
+    :param method_names: array-like with the method names for each experiment
+    :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
+        each experiment
+    :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
+        for each experiment
+    :param pos_class: index of the positive class
+    :param title: the title to be displayed in the plot
+    :param nbins: number of bins
+    :param colormap: the matplotlib colormap to use (default cm.tab10)
+    :param vertical_xticks: whether or not to rotate the x-tick labels vertically (default is False)
+    :param legend: whether or not to display the legend (default is True)
+    :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
+    """
+    from pylab import boxplot, plot, setp
+
+    fig, ax = plt.subplots()
+    ax.grid()
+
+    method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs)
+
+    bins = np.linspace(0, 1, nbins+1)
+    binwidth = 1/nbins
+    data = {}
+    for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
+        true_prev = true_prev[:,pos_class]
+        estim_prev = estim_prev[:,pos_class]
+
+        data[method] = []
+        inds = np.digitize(true_prev, bins, right=True)
+        for ind in range(len(bins)):
+            selected = inds==ind
+            data[method].append(estim_prev[selected] - true_prev[selected])
+
+    nmethods = len(method_names)
+    boxwidth = binwidth/(nmethods+4)
+    for i,bin in enumerate(bins[:-1]):
+        boxdata = [data[method][i] for method in method_names]
+        positions = [bin+(i*boxwidth)+2*boxwidth for i,_ in enumerate(method_names)]
+        box = boxplot(boxdata, showmeans=False, positions=positions, widths=boxwidth, sym='+', patch_artist=True)
+        for boxid in range(len(method_names)):
+            c = colormap.colors[boxid%len(colormap.colors)]
+            setp(box['fliers'][boxid], color=c, marker='+', markersize=3., markeredgecolor=c)
+            setp(box['boxes'][boxid], color=c)
+            setp(box['medians'][boxid], color='k')
+
+    major_xticks_positions, minor_xticks_positions = [], []
+    major_xticks_labels, minor_xticks_labels = [], []
+    for i,b in enumerate(bins[:-1]):
+        major_xticks_positions.append(b)
+        minor_xticks_positions.append(b + binwidth / 2)
+        major_xticks_labels.append('')
+        minor_xticks_labels.append(f'[{bins[i]:.2f}-{bins[i + 1]:.2f})')
+    ax.set_xticks(major_xticks_positions)
+    ax.set_xticks(minor_xticks_positions, minor=True)
+    ax.set_xticklabels(major_xticks_labels)
+    ax.set_xticklabels(minor_xticks_labels, minor=True, rotation='vertical' if vertical_xticks else 'horizontal')
+
+    if vertical_xticks:
+        # Pad margins so that markers don't get clipped by the axes
+        plt.margins(0.2)
+        # Tweak spacing to prevent clipping of tick-labels
+        plt.subplots_adjust(bottom=0.15)
+
+    if legend:
+        # adds the legend to the list hs, initialized with the "ideal" quantifier (one that has 0 bias across all
+        # bins, i.e., a line from (0,0) to (1,0)). The other elements are simply labelled dot-plots that are to be
+        # removed (setting set_visible to False for all but the first element) after the legend has been placed
+        hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
+        for colorid in range(len(method_names)):
+            color=colormap.colors[colorid % len(colormap.colors)]
+            h, = plot([0, 0], '-s', markerfacecolor=color, color='k', mec=color, linewidth=1.)
+            hs.append(h)
+        box = ax.get_position()
+        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+        ax.legend(hs, ['ideal']+method_names, loc='center left', bbox_to_anchor=(1, 0.5))
+        [h.set_visible(False) for h in hs[1:]]
+
+    # x-axis and y-axis labels and limits
+    ax.set(xlabel='prevalence', ylabel='error bias', title=title)
+    ax.set_xlim(0, 1)
+
+    _save_or_show(savepath)
+ + + +
+[docs]
+def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
+                   n_bins=20, error_name='ae', show_std=False,
+                   show_density=True,
+                   show_legend=True,
+                   logscale=False,
+                   title='Quantification error as a function of distribution shift',
+                   vlines=None,
+                   method_order=None,
+                   savepath=None):
+    """
+    Plots the error (along the y-axis, as measured in terms of `error_name`) as a function of the train-test shift
+    (along the x-axis, as measured in terms of :meth:`quapy.error.ae`). This plot is useful especially for multiclass
+    problems, in which "diagonal plots" may be cumbersome, and in order to gain understanding about how methods
+    fare in different regions of the prior probability shift spectrum (e.g., in the low-shift regime vs. in the
+    high-shift regime).
+
+    :param method_names: array-like with the method names for each experiment
+    :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
+        each experiment
+    :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
+        for each experiment
+    :param tr_prevs: training prevalence of each experiment
+    :param n_bins: number of bins in which the x-axis (the amount of shift) is to be divided (default is 20)
+    :param error_name: a string representing the name of an error function (as defined in `quapy.error`, default is "ae")
+    :param show_std: whether or not to show standard deviations as color bands (default is False)
+    :param show_density: whether or not to display the distribution of experiments for each bin (default is True)
+    :param show_legend: whether or not to display the legend of the chart (default is True)
+    :param logscale: whether or not to log-scale the y-error measure (default is False)
+    :param title: title of the plot (default is "Quantification error as a function of distribution shift")
+    :param vlines: array-like list of values (default is None). If indicated, highlights some regions of the space
+        using vertical dotted lines.
+    :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
+        listed in the legend and associated with matplotlib colors).
+    :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
+ """ + + fig, ax = plt.subplots() + ax.grid() + + x_error = qp.error.ae + y_error = getattr(qp.error, error_name) + + # get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same + # order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to + # x_error function) and 'y' is the estim-test shift (computed as according to y_error) + data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order) + + if method_order is None: + method_order = method_names + + _set_colors(ax, n_methods=len(method_order)) + + bins = np.linspace(0, 1, n_bins+1) + binwidth = 1 / n_bins + min_x, max_x, min_y, max_y = None, None, None, None + npoints = np.zeros(len(bins), dtype=float) + for method in method_order: + tr_test_drifts = data[method]['x'] + method_drifts = data[method]['y'] + if logscale: + ax.set_yscale("log") + ax.yaxis.set_major_formatter(ScalarFormatter()) + ax.yaxis.get_major_formatter().set_scientific(False) + ax.minorticks_off() + + inds = np.digitize(tr_test_drifts, bins, right=True) + + xs, ys, ystds = [], [], [] + for p,ind in enumerate(range(len(bins))): + selected = inds==ind + if selected.sum() > 0: + xs.append(ind*binwidth-binwidth/2) + ys.append(np.mean(method_drifts[selected])) + ystds.append(np.std(method_drifts[selected])) + npoints[p] += len(method_drifts[selected]) + + xs = np.asarray(xs) + ys = np.asarray(ys) + ystds = np.asarray(ystds) + + min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max() + min_x = min_x_method if min_x is None or min_x_method < min_x else min_x + max_x = max_x_method if max_x is None or max_x_method > max_x else max_x + max_y = max_y_method if max_y is None or max_y_method > max_y else max_y + min_y = min_y_method if min_y is None or min_y_method < min_y else min_y + max_y = max_y_method if max_y is None or max_y_method > max_y else max_y + + ax.errorbar(xs, ys, fmt='-', marker='o', color='w', markersize=8, linewidth=4, zorder=1) + ax.errorbar(xs, ys, fmt='-', marker='o', label=method, markersize=6, linewidth=2, zorder=2) + + if show_std: + ax.fill_between(xs, ys-ystds, ys+ystds, alpha=0.25) + + if show_density: + ax2 = ax.twinx() + densities = npoints/np.sum(npoints) + ax2.bar([ind * binwidth-binwidth/2 for ind in range(len(bins))], + densities, alpha=0.15, color='g', width=binwidth, label='density') + ax2.set_ylim(0,max(densities)) + ax2.spines['right'].set_color('g') + ax2.tick_params(axis='y', colors='g') + + ax.set(xlabel=f'Distribution shift between training set and test sample', + ylabel=f'{error_name.upper()} (true distribution, predicted distribution)', + title=title) + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + if vlines: + for vline in vlines: + ax.axvline(vline, 0, 1, linestyle='--', color='k') + + ax.set_xlim(min_x, max_x) + if logscale: + #nice scale for the logaritmic axis + ax.set_ylim(0,10 ** math.ceil(math.log10(max_y))) + + + if show_legend: + fig.legend(loc='lower center', + bbox_to_anchor=(1, 0.5), + ncol=(len(method_names)+1)//2) + + _save_or_show(savepath)
+ + + +
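Reusing the experiment-collection pattern shown earlier for binary_diagonal, the call is direct; one training-prevalence vector per experiment is required (here assumed identical, since all experiments used the same training set):

tr_prevs = [data.training.prevalence()] * len(method_names)
qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
                       n_bins=10, error_name='ae', show_density=True)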
+[docs]
+def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
+                                 n_bins=20, binning='isomerous',
+                                 x_error='ae', y_error='ae', ttest_alpha=0.005, tail_density_threshold=0.005,
+                                 method_order=None,
+                                 savepath=None):
+    """
+    Displays (only) the top performing methods for different regions of the train-test shift in the form of a broken
+    bar chart, in which each method has bars only for those regions in which either one of the following conditions
+    hold: (i) it is the best method (on average) for the bin, or (ii) it is not statistically significantly different
+    (on average), according to a two-sided t-test on independent samples at confidence `ttest_alpha`.
+    The binning can be made "isometric" (same size), or "isomerous" (same number of experiments -- default). A second
+    plot is displayed on top, that displays the distribution of experiments for each bin (when binning="isometric") or
+    the percentile points of the distribution (when binning="isomerous").
+
+    :param method_names: array-like with the method names for each experiment
+    :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
+        each experiment
+    :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
+        for each experiment
+    :param tr_prevs: training prevalence of each experiment
+    :param n_bins: number of bins in which the x-axis (the amount of shift) is to be divided (default is 20)
+    :param binning: type of binning, either "isomerous" (default) or "isometric"
+    :param x_error: a string representing the name of an error function (as defined in `quapy.error`) to be used for
+        measuring the amount of train-test shift (default is "ae")
+    :param y_error: a string representing the name of an error function (as defined in `quapy.error`) to be used for
+        measuring the amount of error in the prevalence estimations (default is "ae")
+    :param ttest_alpha: the p-value threshold (for a two-sided t-test on independent samples) above which the two
+        means involved are to be considered not statistically significantly different. Default is 0.005, meaning
+        that a `p-value > 0.005` indicates the two methods involved are to be considered similar
+    :param tail_density_threshold: sets a threshold on the density of experiments (over the total number of experiments)
+        below which a bin in the tail (i.e., the right-most ones) will be discarded. This is in order to avoid some
+        bins to be shown for train-test outliers.
+    :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
+        listed in the legend and associated with matplotlib colors).
+    :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
+    :return:
+    """
+    assert binning in ['isomerous', 'isometric'], 'unknown binning type; valid types are "isomerous" and "isometric"'
+
+    x_error = getattr(qp.error, x_error)
+    y_error = getattr(qp.error, y_error)
+
+    # get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
+    # order as in method_order, if specified), and where 'x' are the train-test shifts (computed according to
+    # the x_error function) and 'y' is the estim-test shift (computed according to y_error)
+    data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
+
+    if method_order is None:
+        method_order = method_names
+
+    if binning == 'isomerous':
+        # take bins containing the same amount of examples
+        tr_test_drifts = np.concatenate([data[m]['x'] for m in method_order])
+        bins = np.quantile(tr_test_drifts, q=np.linspace(0, 1, n_bins+1)).flatten()
+    else:
+        # take equidistant bins
+        bins = np.linspace(0, 1, n_bins+1)
+        bins[0] = -0.001
+        bins[-1] += 0.001
+
+    # we use this to keep track of how many datapoints contribute to each bin
+    inds_histogram_global = np.zeros(n_bins, dtype=float)
+    n_methods = len(method_order)
+    buckets = np.zeros(shape=(n_methods, n_bins, 3))
+    for i, method in enumerate(method_order):
+        tr_test_drifts = data[method]['x']
+        method_drifts = data[method]['y']
+
+        inds = np.digitize(tr_test_drifts, bins, right=False)
+        inds_histogram_global += np.histogram(tr_test_drifts, density=False, bins=bins)[0]
+
+        for j in range(len(bins)):
+            selected = inds == j
+            if selected.sum() > 0:
+                buckets[i, j-1, 0] = np.mean(method_drifts[selected])
+                buckets[i, j-1, 1] = np.std(method_drifts[selected])
+                buckets[i, j-1, 2] = selected.sum()
+
+    # cancel last buckets with low density
+    histogram = inds_histogram_global / inds_histogram_global.sum()
+    for tail in reversed(range(len(histogram))):
+        if histogram[tail] < tail_density_threshold:
+            buckets[:, tail, 2] = 0
+        else:
+            break
+
+    salient_methods = set()
+    best_methods = []
+    for bucket in range(buckets.shape[1]):
+        nc = buckets[:, bucket, 2].sum()
+        if nc == 0:
+            best_methods.append([])
+            continue
+
+        order = np.argsort(buckets[:, bucket, 0])
+        rank1 = order[0]
+        best_bucket_methods = [method_order[rank1]]
+        best_mean, best_std, best_nc = buckets[rank1, bucket, :]
+        for method_index in order[1:]:
+            method_mean, method_std, method_nc = buckets[method_index, bucket, :]
+            _, pval = ttest_ind_from_stats(best_mean, best_std, best_nc, method_mean, method_std, method_nc)
+            if pval > ttest_alpha:
+                best_bucket_methods.append(method_order[method_index])
+        best_methods.append(best_bucket_methods)
+        salient_methods.update(best_bucket_methods)
+        print(best_bucket_methods)
+
+    if binning=='isomerous':
+        fig, axes = plt.subplots(2, 1, gridspec_kw={'height_ratios': [0.2, 1]}, figsize=(20, len(salient_methods)))
+    else:
+        fig, axes = plt.subplots(2, 1, gridspec_kw={'height_ratios': [1, 1]}, figsize=(20, len(salient_methods)))
+
+    ax = axes[1]
+    high_from = 0
+    yticks, yticks_method_names = [], []
+    color = get_cmap('Accent').colors
+    vlines = []
+    bar_high = 1
+    for method in [m for m in method_order if m in salient_methods]:
+        broken_paths = []
+        path_start, path_end = None, None
+        for i, best_bucket_methods in enumerate(best_methods):
+            if method in best_bucket_methods:
+                if path_start is None:
+                    path_start = bins[i]
+                    path_end = bins[i+1]-path_start
+                else:
+                    path_end += bins[i+1]-bins[i]
+            else:
+                if path_start is not None:
+                    broken_paths.append(tuple((path_start, path_end)))
+
path_start, path_end = None, None + if path_start is not None: + broken_paths.append(tuple((path_start, path_end))) + + ax.broken_barh(broken_paths, (high_from, bar_high), facecolors=color[len(yticks_method_names)]) + yticks.append(high_from+bar_high/2) + high_from += bar_high + yticks_method_names.append(method) + for path_start, path_end in broken_paths: + vlines.extend([path_start, path_start+path_end]) + + vlines = np.unique(vlines) + vlines = sorted(vlines) + for v in vlines[1:-1]: + ax.axvline(x=v, color='k', linestyle='--') + + ax.set_ylim(0, high_from) + ax.set_xlim(vlines[0], vlines[-1]) + ax.set_xlabel('Distribution shift between training set and sample') + + ax.set_yticks(yticks) + ax.set_yticklabels(yticks_method_names) + + # upper plot (explaining distribution) + ax = axes[0] + if binning == 'isometric': + # show the density for each region + bins[0]=0 + y_pos = [b+(bins[i+1]-b)/2 for i,b in enumerate(bins[:-1]) if histogram[i]>0] + bar_width = [bins[i+1]-bins[i] for i in range(len(bins[:-1])) if histogram[i]>0] + ax.bar(y_pos, [n for n in histogram if n>0], bar_width, align='center', alpha=0.5, color='silver') + ax.set_ylabel('shift\ndistribution', rotation=0, ha='right', va='center') + ax.set_xlim(vlines[0], vlines[-1]) + ax.get_xaxis().set_visible(False) + plt.subplots_adjust(wspace=0, hspace=0.1) + else: + # show the percentiles of the distribution + cumsum = np.cumsum(histogram) + for i in range(len(bins[:-1])): + start, width = bins[i], bins[i+1]-bins[i] + ax.broken_barh([tuple((start, width))], (0, 1), facecolors='whitesmoke' if i%2==0 else 'silver') + if i < len(bins)-2: + ax.text(bins[i+1], 0.5, '$P_{'+f'{int(np.round(cumsum[i]*100))}'+'}$', ha='center') + ax.set_ylim(0, 1) + ax.set_xlim(vlines[0], vlines[-1]) + ax.get_yaxis().set_visible(False) + ax.get_xaxis().set_visible(False) + plt.subplots_adjust(wspace=0, hspace=0) + + _save_or_show(savepath)
+ + + +def _merge(method_names, true_prevs, estim_prevs): + ndims = true_prevs[0].shape[1] + data = defaultdict(lambda: {'true': np.empty(shape=(0, ndims)), 'estim': np.empty(shape=(0, ndims))}) + method_order=[] + for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs): + data[method]['true'] = np.concatenate([data[method]['true'], true_prev]) + data[method]['estim'] = np.concatenate([data[method]['estim'], estim_prev]) + if method not in method_order: + method_order.append(method) + true_prevs_ = [data[m]['true'] for m in method_order] + estim_prevs_ = [data[m]['estim'] for m in method_order] + return method_order, true_prevs_, estim_prevs_ + + +def _set_colors(ax, n_methods): + NUM_COLORS = n_methods + cm = plt.get_cmap('tab20') + ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)]) + + +def _save_or_show(savepath): + # if savepath is specified, then saves the plot in that path; otherwise the plot is shown + if savepath is not None: + qp.util.create_parent_dir(savepath) + # plt.tight_layout() + plt.savefig(savepath, bbox_inches='tight') + else: + plt.show() + + +def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order): + data = defaultdict(lambda: {'x': np.empty(shape=(0)), 'y': np.empty(shape=(0))}) + + if method_order is None: + method_order = [] + + for method, test_prevs_i, estim_prevs_i, tr_prev_i in zip(method_names, true_prevs, estim_prevs, tr_prevs): + tr_prev_i = np.repeat(tr_prev_i.reshape(1, -1), repeats=test_prevs_i.shape[0], axis=0) + + tr_test_drifts = x_error(test_prevs_i, tr_prev_i) + data[method]['x'] = np.concatenate([data[method]['x'], tr_test_drifts]) + + method_drifts = y_error(test_prevs_i, estim_prevs_i) + data[method]['y'] = np.concatenate([data[method]['y'], method_drifts]) + + if method not in method_order: + method_order.append(method) + + return data +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/protocol.html b/docs/build/html/_modules/quapy/protocol.html
new file mode 100644
index 0000000..7d96338
--- /dev/null
+++ b/docs/build/html/_modules/quapy/protocol.html
@@ -0,0 +1,606 @@
+ quapy.protocol — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.protocol

+from copy import deepcopy
+import quapy as qp
+import numpy as np
+import itertools
+from contextlib import ExitStack
+from abc import ABCMeta, abstractmethod
+from quapy.data import LabelledCollection
+import quapy.functional as F
+from os.path import exists
+from glob import glob
+
+
+
[docs]class AbstractProtocol(metaclass=ABCMeta):
+    """
+    Abstract parent class for sample generation protocols.
+    """
+
+    @abstractmethod
+    def __call__(self):
+        """
+        Implements the protocol. Yields one sample at a time along with its prevalence.
+
+        :return: yields a tuple `(sample, prev)` at a time, where `sample` is a set of instances
+            and in which `prev` is an `np.ndarray` with the class prevalence values
+        """
+        ...
+
[docs] def total(self): + """ + Indicates the total number of samples that the protocol generates. + + :return: The number of samples to generate if known, or `None` otherwise. + """ + return None
+ + +
[docs]class IterateProtocol(AbstractProtocol):
+    """
+    A very simple protocol which simply iterates over a list of previously generated samples
+
+    :param samples: a list of :class:`quapy.data.base.LabelledCollection`
+    """
+    def __init__(self, samples: [LabelledCollection]):
+        self.samples = samples
+
+    def __call__(self):
+        """
+        Yields one sample from the initial list at a time
+
+        :return: yields a tuple `(sample, prev)` at a time, where `sample` is a set of instances
+            and in which `prev` is an `np.ndarray` with the class prevalence values
+        """
+        for sample in self.samples:
+            yield sample.Xp
+
[docs] def total(self): + """ + Returns the number of samples in this protocol + + :return: int + """ + return len(self.samples)
+ + +
[docs]class AbstractStochasticSeededProtocol(AbstractProtocol):
+    """
+    An `AbstractStochasticSeededProtocol` is a protocol that generates, via any random procedure (e.g.,
+    via random sampling), sequences of :class:`quapy.data.base.LabelledCollection` samples.
+    The protocol abstraction requires the object to be instantiated with a seed, so that the sequence can be
+    fully replicated. In order to make this functionality possible, the classes extending this abstraction need
+    to implement only two functions: :meth:`samples_parameters`, which generates all the parameters
+    needed for extracting the samples, and :meth:`sample`, which, given some parameters as input,
+    deterministically generates a sample.
+
+    :param random_state: the seed for allowing to replicate any sequence of samples. Default is 0, meaning that
+        the sequence will be consistent every time the protocol is called.
+    """
+
+    _random_state = -1  # means "not set"
+
+    def __init__(self, random_state=0):
+        self.random_state = random_state
+
+    @property
+    def random_state(self):
+        return self._random_state
+
+    @random_state.setter
+    def random_state(self, random_state):
+        self._random_state = random_state
+
[docs] @abstractmethod + def samples_parameters(self): + """ + This function has to return all the necessary parameters to replicate the samples + + :return: a list of parameters, each of which serves to deterministically generate a sample + """ + ...
+ +
[docs] @abstractmethod + def sample(self, params): + """ + Extract one sample determined by the given parameters + + :param params: all the necessary parameters to generate a sample + :return: one sample (the same sample has to be generated for the same parameters) + """ + ...
+ + def __call__(self): + """ + Yields one sample at a time. The type of object returned depends on the `collator` function. The + default behaviour returns tuples of the form `(sample, prevalence)`. + + :return: a tuple `(sample, prevalence)` if return_type='sample_prev', or an instance of + :class:`qp.data.LabelledCollection` if return_type='labelled_collection' + """ + with ExitStack() as stack: + if self.random_state == -1: + raise ValueError('The random seed has never been initialized. ' + 'Set it to None not to impose replicability.') + if self.random_state is not None: + stack.enter_context(qp.util.temp_seed(self.random_state)) + for params in self.samples_parameters(): + yield self.collator(self.sample(params)) + +
[docs] def collator(self, sample, *args): + """ + The collator prepares the sample to accommodate the desired output format before returning the output. + This collator simply returns the sample as it is. Classes inheriting from this abstract class can + implement their custom collators. + + :param sample: the sample to be returned + :param args: additional arguments + :return: the sample adhering to a desired output format (in this case, the sample is returned as it is) + """ + return sample
+ + +
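To illustrate the two-method contract described above, here is a hypothetical minimal subclass (not part of QuaPy) that draws fixed-size uniform samples; it is essentially what the NPP protocol below does, stripped to the bone:

from quapy.data import LabelledCollection
from quapy.protocol import AbstractStochasticSeededProtocol

class UniformProtocol(AbstractStochasticSeededProtocol):
    # hypothetical example for illustration purposes only
    def __init__(self, data: LabelledCollection, sample_size, repeats=10, random_state=0):
        super().__init__(random_state)
        self.data = data
        self.sample_size = sample_size
        self.repeats = repeats

    def samples_parameters(self):
        # one index-set per sample; these parameters fully determine the sequence
        return [self.data.uniform_sampling_index(self.sample_size) for _ in range(self.repeats)]

    def sample(self, index):
        # deterministic given the parameters
        return self.data.sampling_from_index(index)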
[docs]class OnLabelledCollectionProtocol: + """ + Protocols that generate samples from a :class:`qp.data.LabelledCollection` object. + """ + + RETURN_TYPES = ['sample_prev', 'labelled_collection', 'index'] + +
[docs] def get_labelled_collection(self): + """ + Returns the labelled collection on which this protocol acts. + + :return: an object of type :class:`qp.data.LabelledCollection` + """ + return self.data
+ +
[docs] def on_preclassified_instances(self, pre_classifications, in_place=False): + """ + Returns a copy of this protocol that acts on a modified version of the original + :class:`qp.data.LabelledCollection` in which the original instances have been replaced + with the outputs of a classifier for each instance. (This is convenient for speeding-up + the evaluation procedures for many samples, by pre-classifying the instances in advance.) + + :param pre_classifications: the predictions issued by a classifier, typically an array-like + with shape `(n_instances,)` when the classifier is a hard one, or with shape + `(n_instances, n_classes)` when the classifier is a probabilistic one. + :param in_place: whether or not to apply the modification in-place or in a new copy (default). + :return: a copy of this protocol + """ + assert len(pre_classifications) == len(self.data), \ + f'error: the pre-classified data has different shape ' \ + f'(expected {len(self.data)}, found {len(pre_classifications)})' + if in_place: + self.data.instances = pre_classifications + return self + else: + new = deepcopy(self) + return new.on_preclassified_instances(pre_classifications, in_place=True)
+ +
[docs] @classmethod + def get_collator(cls, return_type='sample_prev'): + """ + Returns a collator function, i.e., a function that prepares the yielded data + + :param return_type: either 'sample_prev' (default) if the collator is requested to yield tuples of + `(sample, prevalence)`, or 'labelled_collection' when it is requested to yield instances of + :class:`qp.data.LabelledCollection` + :return: the collator function (a callable function that takes as input an instance of + :class:`qp.data.LabelledCollection`) + """ + assert return_type in cls.RETURN_TYPES, \ + f'unknown return type passed as argument; valid ones are {cls.RETURN_TYPES}' + if return_type=='sample_prev': + return lambda lc:lc.Xp + elif return_type=='labelled_collection': + return lambda lc:lc
+ + +
[docs]class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
+    """
+    Implementation of the artificial prevalence protocol (APP).
+    The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
+    [0, 0.05, 0.1, 0.15, ..., 1], if `n_prevalences=21`), and generating all valid combinations of
+    prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
+    [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
+    combination of prevalence values is indicated by `repeats`.
+
+    :param data: a `LabelledCollection` from which the samples will be drawn
+    :param sample_size: integer, number of instances in each sample; if None (default) then it is taken from
+        qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised.
+    :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
+        grid (default is 21)
+    :param repeats: number of copies for each valid prevalence vector (default is 10)
+    :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
+    :param random_state: allows replicating samples across runs (default 0, meaning that the sequence of samples
+        will be the same every time the protocol is called)
+    :param sanity_check: int, raises an exception warning the user that the number of samples to be generated
+        exceeds this number; set to None for skipping this check
+    :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or
+        to "labelled_collection" to get instead instances of LabelledCollection
+    """
+
+    def __init__(self, data: LabelledCollection, sample_size=None, n_prevalences=21, repeats=10,
+                 smooth_limits_epsilon=0, random_state=0, sanity_check=10000, return_type='sample_prev'):
+        super(APP, self).__init__(random_state)
+        self.data = data
+        self.sample_size = qp._get_sample_size(sample_size)
+        self.n_prevalences = n_prevalences
+        self.repeats = repeats
+        self.smooth_limits_epsilon = smooth_limits_epsilon
+        if not ((isinstance(sanity_check, int) and sanity_check>0) or sanity_check is None):
+            raise ValueError('param "sanity_check" must either be None or a positive integer')
+        if isinstance(sanity_check, int):
+            n = F.num_prevalence_combinations(n_prevpoints=n_prevalences, n_classes=data.n_classes, n_repeats=repeats)
+            if n > sanity_check:
+                raise RuntimeError(
+                    f"Abort: the number of samples that will be generated by {self.__class__.__name__} ({n}) "
+                    f"exceeds the maximum number of allowed samples ({sanity_check = }). Set 'sanity_check' to "
+                    f"None, or to a higher number, for bypassing this check.")
+
+        self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
+
[docs]    def prevalence_grid(self):
+        """
+        Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
+        number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
+        `n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only
+        valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
+        valid vector of prevalence values, `repeat` copies are returned. The vectors are expressed implicitly, i.e.,
+        the last dimension (which is constrained to be 1 minus the sum of the rest, and is therefore redundant) is
+        not returned (note that, quite obviously, in this case the vectors do not sum up to 1). Note that this
+        method is deterministic, i.e., there is no random sampling anywhere.
+
+        :return: a `np.ndarray` of shape `(n, dimensions-1)`, where `n` is the number of valid combinations found
+            in the grid multiplied by `repeat`
+        """
+        dimensions = self.data.n_classes
+        s = F.prevalence_linspace(self.n_prevalences, repeats=1, smooth_limits_epsilon=self.smooth_limits_epsilon)
+        eps = (s[1]-s[0])/2  # handling floating rounding
+        s = [s] * (dimensions - 1)
+        prevs = [p for p in itertools.product(*s, repeat=1) if (sum(p) < (1.+eps))]
+        prevs = np.asarray(prevs).reshape(len(prevs), -1)
+        if self.repeats > 1:
+            prevs = np.repeat(prevs, self.repeats, axis=0)
+        return prevs
+ +
[docs] def samples_parameters(self): + """ + Return all the necessary parameters to replicate the samples as according to the APP protocol. + + :return: a list of indexes that realize the APP sampling + """ + indexes = [] + for prevs in self.prevalence_grid(): + index = self.data.sampling_index(self.sample_size, *prevs) + indexes.append(index) + return indexes
+ +
[docs] def sample(self, index): + """ + Realizes the sample given the index of the instances. + + :param index: indexes of the instances to select + :return: an instance of :class:`qp.data.LabelledCollection` + """ + return self.data.sampling_from_index(index)
+ +
[docs] def total(self): + """ + Returns the number of samples that will be generated + + :return: int + """ + return F.num_prevalence_combinations(self.n_prevalences, self.data.n_classes, self.repeats)
+ + +
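For illustration, a minimal sketch of driving APP end to end; the mock data, sample size, and grid settings below are arbitrary choices made up for this example, not part of the protocol's API:

```python
import numpy as np
import quapy as qp
import quapy.functional as F
from quapy.protocol import APP

# mock 3-class collection, for illustration only
X = np.random.rand(900, 2)
y = np.random.randint(0, 3, size=900)
data = qp.data.LabelledCollection(X, y)

# with n_prevalences=11 (grid step 0.1) and 3 classes, the grid contains
# C(12, 2) = 66 valid prevalence vectors, each sampled `repeats` times
n = F.num_prevalence_combinations(n_prevpoints=11, n_classes=3, n_repeats=1)

prot = APP(data, sample_size=50, n_prevalences=11, repeats=1, random_state=0)
assert prot.total() == n  # 66
for sample, prev in prot():
    ...  # e.g., evaluate a quantifier's estimate against prev
```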
[docs]class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol): + """ + A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing + samples uniformly at random, therefore approximately preserving the natural prevalence of the collection. + + :param data: a `LabelledCollection` from which the samples will be drawn + :param sample_size: integer, the number of instances in each sample; if None (default) then it is taken from + qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised. + :param repeats: the number of samples to generate. Default is 100. + :param random_state: allows replicating samples across runs (default 0, meaning that the sequence of samples + will be the same every time the protocol is called) + :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or + to "labelled_collection" to get instead instances of LabelledCollection + """ + + def __init__(self, data:LabelledCollection, sample_size=None, repeats=100, random_state=0, + return_type='sample_prev'): + super(NPP, self).__init__(random_state) + self.data = data + self.sample_size = qp._get_sample_size(sample_size) + self.repeats = repeats + self.random_state = random_state + self.collator = OnLabelledCollectionProtocol.get_collator(return_type) + +
[docs]    def samples_parameters(self):
+        """
+        Returns all the parameters needed to replicate the samples according to the NPP protocol.
+
+        :return: a list of indexes that realize the NPP sampling
+        """
+        indexes = []
+        for _ in range(self.repeats):
+            index = self.data.uniform_sampling_index(self.sample_size)
+            indexes.append(index)
+        return indexes
+ +
[docs] def sample(self, index): + """ + Realizes the sample given the index of the instances. + + :param index: indexes of the instances to select + :return: an instance of :class:`qp.data.LabelledCollection` + """ + return self.data.sampling_from_index(index)
+ +
[docs]    def total(self):
+        """
+        Returns the number of samples that will be generated (equal to `repeats`)
+
+        :return: int
+        """
+        return self.repeats
+ + +
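A matching sketch for NPP, reusing the mock `data` from the APP sketch above: each sample is drawn uniformly at random, so the per-sample prevalences simply fluctuate around `data.prevalence()`:

```python
from quapy.protocol import NPP

prot = NPP(data, sample_size=50, repeats=10, random_state=0)  # `data` as in the APP sketch
assert prot.total() == 10
for sample, prev in prot():
    print(prev)  # close to data.prevalence(), up to sampling noise
```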
[docs]class UPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
+    """
+    A variant of :class:`APP` that, instead of using a grid of equidistant prevalence values,
+    relies on the Kraemer algorithm for sampling the unit (k-1)-simplex uniformly at random, where
+    k is the number of classes. This protocol covers the entire range of prevalence values in a
+    statistical sense, i.e., unlike APP, there is no guarantee that every prevalence combination
+    is covered exactly; it is preferred in cases in which the number of possible combinations of
+    the APP grid makes that protocol intractable.
+
+    :param data: a `LabelledCollection` from which the samples will be drawn
+    :param sample_size: integer, the number of instances in each sample; if None (default) then it is taken from
+        qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised.
+    :param repeats: the number of samples to generate. Default is 100.
+    :param random_state: allows replicating samples across runs (default 0, meaning that the sequence of samples
+        will be the same every time the protocol is called)
+    :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or
+        to "labelled_collection" to get instead instances of LabelledCollection
+    """
+
+    def __init__(self, data: LabelledCollection, sample_size=None, repeats=100, random_state=0,
+                 return_type='sample_prev'):
+        super(UPP, self).__init__(random_state)
+        self.data = data
+        self.sample_size = qp._get_sample_size(sample_size)
+        self.repeats = repeats
+        self.random_state = random_state
+        self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
[docs]    def samples_parameters(self):
+        """
+        Returns all the parameters needed to replicate the samples according to the UPP protocol.
+
+        :return: a list of indexes that realize the UPP sampling
+        """
+        indexes = []
+        for prevs in F.uniform_simplex_sampling(n_classes=self.data.n_classes, size=self.repeats):
+            index = self.data.sampling_index(self.sample_size, *prevs)
+            indexes.append(index)
+        return indexes
+ +
[docs] def sample(self, index): + """ + Realizes the sample given the index of the instances. + + :param index: indexes of the instances to select + :return: an instance of :class:`qp.data.LabelledCollection` + """ + return self.data.sampling_from_index(index)
+ +
[docs]    def total(self):
+        """
+        Returns the number of samples that will be generated (equal to `repeats`)
+
+        :return: int
+        """
+        return self.repeats
+ + +
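For UPP, the prevalence vectors come from uniform sampling on the simplex rather than from a grid; a short sketch, again reusing the mock `data` from the APP sketch:

```python
import numpy as np
import quapy.functional as F
from quapy.protocol import UPP

# Kraemer sampling: each row is a prevalence vector drawn uniformly from the unit simplex
prevs = F.uniform_simplex_sampling(n_classes=3, size=5)
assert np.allclose(prevs.sum(axis=1), 1)

prot = UPP(data, sample_size=50, repeats=10, random_state=0)  # `data` as in the APP sketch
assert prot.total() == 10  # one sample per drawn prevalence vector
```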
[docs]class DomainMixer(AbstractStochasticSeededProtocol):
+    """
+    Generates mixtures of two domains (A and B) at controlled rates, while preserving the original class prevalence.
+
+    :param domainA: one domain, an object of :class:`qp.data.LabelledCollection`
+    :param domainB: another domain, an object of :class:`qp.data.LabelledCollection`
+    :param sample_size: integer, the number of instances in each sample; if None (default) then it is taken from
+        qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised.
+    :param repeats: int, number of samples to draw for every mixture rate
+    :param prevalence: the prevalence to preserve across the mixtures. If specified, should be an array containing
+        one prevalence value (positive float) for each class and summing up to one. If not specified, the prevalence
+        will be taken from domain A (default).
+    :param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
+        generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself
+    :param random_state: allows replicating samples across runs (default 0, meaning that the sequence of samples
+        will be the same every time the protocol is called)
+    :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or
+        to "labelled_collection" to get instead instances of LabelledCollection
+    """
+
+    def __init__(
+            self,
+            domainA: LabelledCollection,
+            domainB: LabelledCollection,
+            sample_size,
+            repeats=1,
+            prevalence=None,
+            mixture_points=11,
+            random_state=0,
+            return_type='sample_prev'):
+        super(DomainMixer, self).__init__(random_state)
+        self.A = domainA
+        self.B = domainB
+        self.sample_size = qp._get_sample_size(sample_size)
+        self.repeats = repeats
+        if prevalence is None:
+            self.prevalence = domainA.prevalence()
+        else:
+            self.prevalence = np.asarray(prevalence)
+            assert len(self.prevalence) == domainA.n_classes, \
+                f'wrong shape for the vector prevalence (expected {domainA.n_classes})'
+            assert F.check_prevalence_vector(self.prevalence), \
+                f'the prevalence vector is not valid (either it contains values outside [0,1] or does not sum up to 1)'
+        if isinstance(mixture_points, int):
+            self.mixture_points = np.linspace(0, 1, mixture_points)[::-1]
+        else:
+            self.mixture_points = np.asarray(mixture_points)
+            assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
+                'mixture_points datatype not understood (expected int or a sequence of real values in [0,1])'
+        self.random_state = random_state
+        self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
[docs]    def samples_parameters(self):
+        """
+        Returns all the parameters needed to replicate the samples according to this protocol.
+
+        :return: a list of zipped indexes (from A and B) that realize the sampling
+        """
+        indexesA, indexesB = [], []
+        for propA in self.mixture_points:
+            for _ in range(self.repeats):
+                nA = int(np.round(self.sample_size * propA))
+                nB = self.sample_size-nA
+                sampleAidx = self.A.sampling_index(nA, *self.prevalence)
+                sampleBidx = self.B.sampling_index(nB, *self.prevalence)
+                indexesA.append(sampleAidx)
+                indexesB.append(sampleBidx)
+        return list(zip(indexesA, indexesB))
+ +
[docs] def sample(self, indexes): + """ + Realizes the sample given a pair of indexes of the instances from A and B. + + :param indexes: indexes of the instances to select from A and B + :return: an instance of :class:`qp.data.LabelledCollection` + """ + indexesA, indexesB = indexes + sampleA = self.A.sampling_from_index(indexesA) + sampleB = self.B.sampling_from_index(indexesB) + return sampleA+sampleB
+ +
[docs]    def total(self):
+        """
+        Returns the number of samples that will be generated (equal to `repeats * mixture_points`)
+
+        :return: int
+        """
+        return self.repeats * len(self.mixture_points)
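A sketch of DomainMixer with two mock domains sharing the same label space; the two-domain data and the feature shift between them are invented for the example:

```python
import numpy as np
import quapy as qp
from quapy.protocol import DomainMixer

domA = qp.data.LabelledCollection(np.random.rand(500, 2), np.random.randint(0, 3, 500))
domB = qp.data.LabelledCollection(np.random.rand(500, 2) + 1, np.random.randint(0, 3, 500))

# 11 mixture rates: the proportion of A goes 1.0, 0.9, ..., 0.0, with class prevalence held fixed
prot = DomainMixer(domA, domB, sample_size=50, repeats=1, mixture_points=11, random_state=0)
assert prot.total() == 11  # repeats * len(mixture_points)
```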
+ + +# aliases + +ArtificialPrevalenceProtocol = APP +NaturalPrevalenceProtocol = NPP +UniformPrevalenceProtocol = UPP +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_base.html b/docs/build/html/_modules/quapy/tests/test_base.html
new file mode 100644
index 0000000..baf8cfa
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_base.html
@@ -0,0 +1,110 @@
+quapy.tests.test_base — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.tests.test_base

+import pytest
+
+
+[docs] +def test_import(): + import quapy as qp + assert qp.__version__ is not None
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_datasets.html b/docs/build/html/_modules/quapy/tests/test_datasets.html
new file mode 100644
index 0000000..785e535
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_datasets.html
@@ -0,0 +1,178 @@
+quapy.tests.test_datasets — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.tests.test_datasets

+import pytest
+
+from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
+    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_BINARY_DATASETS, LEQUA2022_TASKS, UCI_MULTICLASS_DATASETS,\
+    fetch_reviews, fetch_twitter, fetch_UCIBinaryDataset, fetch_lequa2022, fetch_UCIMulticlassLabelledCollection
+
+
+
+[docs] +@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS) +def test_fetch_reviews(dataset_name): + dataset = fetch_reviews(dataset_name) + print(f'Dataset {dataset_name}') + print('Training set stats') + dataset.training.stats() + print('Test set stats') + dataset.test.stats()
+ + + +
+[docs]
+@pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN)
+def test_fetch_twitter(dataset_name):
+    try:
+        dataset = fetch_twitter(dataset_name)
+    except ValueError as ve:
+        if dataset_name == 'semeval' and ve.args[0].startswith(
+                'dataset "semeval" can only be used for model selection.'):
+            dataset = fetch_twitter(dataset_name, for_model_selection=True)
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')
+    dataset.test.stats()
+ + + +
+[docs]
+@pytest.mark.parametrize('dataset_name', UCI_BINARY_DATASETS)
+def test_fetch_UCIDataset(dataset_name):
+    try:
+        dataset = fetch_UCIBinaryDataset(dataset_name)
+    except FileNotFoundError as fnfe:
+        if dataset_name == 'pageblocks.5' and fnfe.args[0].find(
+                'If this is the first time you attempt to load this dataset') > 0:
+            print('The pageblocks.5 dataset requires some hand processing to be usable; skipping this test.')
+            return
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')
+    dataset.test.stats()
+ + + +
+[docs] +@pytest.mark.parametrize('dataset_name', UCI_MULTICLASS_DATASETS) +def test_fetch_UCIMultiDataset(dataset_name): + dataset = fetch_UCIMulticlassLabelledCollection(dataset_name) + print(f'Dataset {dataset_name}') + print('Training set stats') + dataset.stats() + print('Test set stats')
+ + + +
+[docs] +@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS) +def test_fetch_lequa2022(dataset_name): + train, gen_val, gen_test = fetch_lequa2022(dataset_name) + print(train.stats()) + print('Val:', gen_val.total()) + print('Test:', gen_test.total())
+ +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_evaluation.html b/docs/build/html/_modules/quapy/tests/test_evaluation.html
new file mode 100644
index 0000000..d5603a4
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_evaluation.html
@@ -0,0 +1,195 @@
+quapy.tests.test_evaluation — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.tests.test_evaluation

+import unittest
+
+import numpy as np
+
+import quapy as qp
+from sklearn.linear_model import LogisticRegression
+from time import time
+
+from quapy.error import QUANTIFICATION_ERROR_SINGLE, QUANTIFICATION_ERROR, QUANTIFICATION_ERROR_NAMES, \
+    QUANTIFICATION_ERROR_SINGLE_NAMES
+from quapy.method.aggregative import EMQ, PCC
+from quapy.method.base import BaseQuantifier
+
+
+
+[docs] +class EvalTestCase(unittest.TestCase): +
+[docs] + def test_eval_speedup(self): + + data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True) + train, test = data.training, data.test + + protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_state=1) + + class SlowLR(LogisticRegression): + def predict_proba(self, X): + import time + time.sleep(1) + return super().predict_proba(X) + + emq = EMQ(SlowLR()).fit(train) + + tinit = time() + score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force') + tend_optim = time()-tinit + print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]') + + class NonAggregativeEMQ(BaseQuantifier): + + def __init__(self, cls): + self.emq = EMQ(cls) + + def quantify(self, instances): + return self.emq.quantify(instances) + + def fit(self, data): + self.emq.fit(data) + return self + + emq = NonAggregativeEMQ(SlowLR()).fit(train) + + tinit = time() + score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True) + tend_no_optim = time() - tinit + print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]') + + self.assertEqual(tend_no_optim>(tend_optim/2), True)
+ + +
+[docs] + def test_evaluation_output(self): + + data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True) + train, test = data.training, data.test + + qp.environ['SAMPLE_SIZE']=100 + + protocol = qp.protocol.APP(test, random_state=0) + + q = PCC(LogisticRegression()).fit(train) + + single_errors = list(QUANTIFICATION_ERROR_SINGLE_NAMES) + averaged_errors = ['m'+e for e in single_errors] + single_errors = single_errors + [qp.error.from_name(e) for e in single_errors] + averaged_errors = averaged_errors + [qp.error.from_name(e) for e in averaged_errors] + for error_metric, averaged_error_metric in zip(single_errors, averaged_errors): + score = qp.evaluation.evaluate(q, protocol, error_metric=averaged_error_metric) + self.assertTrue(isinstance(score, float)) + + scores = qp.evaluation.evaluate(q, protocol, error_metric=error_metric) + self.assertTrue(isinstance(scores, np.ndarray)) + + self.assertEqual(scores.mean(), score)
+
+ + + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_hierarchy.html b/docs/build/html/_modules/quapy/tests/test_hierarchy.html
new file mode 100644
index 0000000..793091b
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_hierarchy.html
@@ -0,0 +1,143 @@
+quapy.tests.test_hierarchy — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.tests.test_hierarchy

+import unittest
+from sklearn.linear_model import LogisticRegression
+from quapy.method.aggregative import *
+
+
+
+[docs] +class HierarchyTestCase(unittest.TestCase): + +
+[docs] + def test_aggregative(self): + lr = LogisticRegression() + for m in [CC(lr), PCC(lr), ACC(lr), PACC(lr)]: + self.assertEqual(isinstance(m, AggregativeQuantifier), True)
+ + +
+[docs] + def test_binary(self): + lr = LogisticRegression() + for m in [HDy(lr)]: + self.assertEqual(isinstance(m, BinaryQuantifier), True)
+ + +
+[docs] + def test_probabilistic(self): + lr = LogisticRegression() + for m in [CC(lr), ACC(lr)]: + self.assertEqual(isinstance(m, AggregativeCrispQuantifier), True) + self.assertEqual(isinstance(m, AggregativeSoftQuantifier), False) + for m in [PCC(lr), PACC(lr)]: + self.assertEqual(isinstance(m, AggregativeCrispQuantifier), False) + self.assertEqual(isinstance(m, AggregativeSoftQuantifier), True)
+
+ + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_labelcollection.html b/docs/build/html/_modules/quapy/tests/test_labelcollection.html
new file mode 100644
index 0000000..682aeba
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_labelcollection.html
@@ -0,0 +1,176 @@
+quapy.tests.test_labelcollection — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.tests.test_labelcollection

+import unittest
+import numpy as np
+from scipy.sparse import csr_matrix
+
+import quapy as qp
+
+
+
+[docs] +class LabelCollectionTestCase(unittest.TestCase): +
+[docs] + def test_split(self): + x = np.arange(100) + y = np.random.randint(0,5,100) + data = qp.data.LabelledCollection(x,y) + tr, te = data.split_random(0.7) + check_prev = tr.prevalence()*0.7 + te.prevalence()*0.3 + + self.assertEqual(len(tr), 70) + self.assertEqual(len(te), 30) + self.assertEqual(np.allclose(check_prev, data.prevalence()), True) + self.assertEqual(len(tr+te), len(data))
+ + +
+[docs] + def test_join(self): + x = np.arange(50) + y = np.random.randint(2, 5, 50) + data1 = qp.data.LabelledCollection(x, y) + + x = np.arange(200) + y = np.random.randint(0, 3, 200) + data2 = qp.data.LabelledCollection(x, y) + + x = np.arange(100) + y = np.random.randint(0, 6, 100) + data3 = qp.data.LabelledCollection(x, y) + + combined = qp.data.LabelledCollection.join(data1, data2, data3) + self.assertEqual(len(combined), len(data1)+len(data2)+len(data3)) + self.assertEqual(all(combined.classes_ == np.arange(6)), True) + + x = np.random.rand(10, 3) + y = np.random.randint(0, 1, 10) + data4 = qp.data.LabelledCollection(x, y) + with self.assertRaises(Exception): + combined = qp.data.LabelledCollection.join(data1, data2, data3, data4) + + x = np.random.rand(20, 3) + y = np.random.randint(0, 1, 20) + data5 = qp.data.LabelledCollection(x, y) + combined = qp.data.LabelledCollection.join(data4, data5) + self.assertEqual(len(combined), len(data4)+len(data5)) + + x = np.random.rand(10, 4) + y = np.random.randint(0, 1, 10) + data6 = qp.data.LabelledCollection(x, y) + with self.assertRaises(Exception): + combined = qp.data.LabelledCollection.join(data4, data5, data6) + + data4.instances = csr_matrix(data4.instances) + with self.assertRaises(Exception): + combined = qp.data.LabelledCollection.join(data4, data5) + data5.instances = csr_matrix(data5.instances) + combined = qp.data.LabelledCollection.join(data4, data5) + self.assertEqual(len(combined), len(data4) + len(data5))
+
+ + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_methods.html b/docs/build/html/_modules/quapy/tests/test_methods.html
new file mode 100644
index 0000000..e2b28a9
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_methods.html
@@ -0,0 +1,357 @@
+quapy.tests.test_methods — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.tests.test_methods

+import numpy as np
+import pytest
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import LinearSVC
+
+from quapy import method
+import quapy as qp
+from quapy.model_selection import GridSearchQ
+from quapy.method.base import BinaryQuantifier
+from quapy.data import Dataset, LabelledCollection
+from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS
+from quapy.method.meta import Ensemble
+from quapy.protocol import APP
+from quapy.method.aggregative import DMy
+from quapy.method.meta import MedianEstimator
+
+# datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
+#             pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
+
+tinydatasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True).reduce(), id='tiny_hcr'),
+                pytest.param(qp.datasets.fetch_UCIBinaryDataset('ionosphere').reduce(), id='tiny_ionosphere')]
+
+learners = [LogisticRegression, LinearSVC]
+
+
+
+[docs] +@pytest.mark.parametrize('dataset', tinydatasets) +@pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS) +@pytest.mark.parametrize('learner', learners) +def test_aggregative_methods(dataset: Dataset, aggregative_method, learner): + model = aggregative_method(learner()) + + if isinstance(model, BinaryQuantifier) and not dataset.binary: + print(f'skipping the test of binary model {type(model)} on non-binary dataset {dataset}') + return + + model.fit(dataset.training) + + estim_prevalences = model.quantify(dataset.test.instances) + + true_prevalences = dataset.test.prevalence() + error = qp.error.mae(true_prevalences, estim_prevalences) + + assert type(error) == np.float64
+ + + +
+[docs] +@pytest.mark.parametrize('dataset', tinydatasets) +@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS) +def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method): + model = non_aggregative_method() + + if isinstance(model, BinaryQuantifier) and not dataset.binary: + print(f'skipping the test of binary model {model} on non-binary dataset {dataset}') + return + + model.fit(dataset.training) + + estim_prevalences = model.quantify(dataset.test.instances) + + true_prevalences = dataset.test.prevalence() + error = qp.error.mae(true_prevalences, estim_prevalences) + + assert type(error) == np.float64
+ + + +
+[docs] +@pytest.mark.parametrize('base_method', [method.aggregative.ACC, method.aggregative.PACC]) +@pytest.mark.parametrize('learner', [LogisticRegression]) +@pytest.mark.parametrize('dataset', tinydatasets) +@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES) +def test_ensemble_method(base_method, learner, dataset: Dataset, policy): + + qp.environ['SAMPLE_SIZE'] = 20 + + base_quantifier=base_method(learner()) + + if not dataset.binary and policy=='ds': + print(f'skipping the test of binary policy ds on non-binary dataset {dataset}') + return + + model = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1) + + model.fit(dataset.training) + + estim_prevalences = model.quantify(dataset.test.instances) + + true_prevalences = dataset.test.prevalence() + error = qp.error.mae(true_prevalences, estim_prevalences) + + assert type(error) == np.float64
+ + + +
+[docs] +def test_quanet_method(): + try: + import quapy.classification.neural + except ModuleNotFoundError: + print('skipping QuaNet test due to missing torch package') + return + + qp.environ['SAMPLE_SIZE'] = 100 + + # load the kindle dataset as text, and convert words to numerical indexes + dataset = qp.datasets.fetch_reviews('kindle', pickle=True).reduce(200, 200) + qp.data.preprocessing.index(dataset, min_df=5, inplace=True) + + from quapy.classification.neural import CNNnet + cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) + + from quapy.classification.neural import NeuralClassifierTrainer + learner = NeuralClassifierTrainer(cnn, device='cuda') + + from quapy.method.meta import QuaNet + model = QuaNet(learner, device='cuda') + + if isinstance(model, BinaryQuantifier) and not dataset.binary: + print(f'skipping the test of binary model {model} on non-binary dataset {dataset}') + return + + model.fit(dataset.training) + + estim_prevalences = model.quantify(dataset.test.instances) + + true_prevalences = dataset.test.prevalence() + error = qp.error.mae(true_prevalences, estim_prevalences) + + assert type(error) == np.float64
+ + + +
+[docs] +def test_str_label_names(): + model = qp.method.aggregative.CC(LogisticRegression()) + + dataset = qp.datasets.fetch_reviews('imdb', pickle=True) + dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()), + dataset.test.sampling(1000, 0.25, 0.75)) + qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True) + + np.random.seed(0) + model.fit(dataset.training) + + int_estim_prevalences = model.quantify(dataset.test.instances) + true_prevalences = dataset.test.prevalence() + + error = qp.error.mae(true_prevalences, int_estim_prevalences) + assert type(error) == np.float64 + + dataset_str = Dataset(LabelledCollection(dataset.training.instances, + ['one' if label == 1 else 'zero' for label in dataset.training.labels]), + LabelledCollection(dataset.test.instances, + ['one' if label == 1 else 'zero' for label in dataset.test.labels])) + assert all(dataset_str.training.classes_ == dataset_str.test.classes_), 'wrong indexation' + np.random.seed(0) + model.fit(dataset_str.training) + + str_estim_prevalences = model.quantify(dataset_str.test.instances) + true_prevalences = dataset_str.test.prevalence() + + error = qp.error.mae(true_prevalences, str_estim_prevalences) + assert type(error) == np.float64 + + print(true_prevalences) + print(int_estim_prevalences) + print(str_estim_prevalences) + + np.testing.assert_almost_equal(int_estim_prevalences[1], + str_estim_prevalences[list(model.classes_).index('one')])
+ + +# helper +def __fit_test(quantifier, train, test): + quantifier.fit(train) + test_samples = APP(test) + true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, test_samples) + return qp.error.mae(true_prevs, estim_prevs), estim_prevs + + +
+[docs]
+def test_median_meta():
+    """
+    This test compares the performance of the MedianEstimator against computing the median of the predictions
+    of differently parameterized quantifiers. We use the DistributionMatching base quantifier, and the median is
+    computed across different values of nbins
+    """
+
+    qp.environ['SAMPLE_SIZE'] = 100
+
+    # grid of values
+    nbins_grid = list(range(2, 11))
+
+    dataset = 'kindle'
+    train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test
+    prevs = []
+    errors = []
+    for nbins in nbins_grid:
+        with qp.util.temp_seed(0):
+            q = DMy(LogisticRegression(), nbins=nbins)
+            mae, estim_prevs = __fit_test(q, train, test)
+            prevs.append(estim_prevs)
+            errors.append(mae)
+            print(f'{dataset} DistributionMatching(nbins={nbins}) got MAE {mae:.4f}')
+    prevs = np.asarray(prevs)
+    mae = np.mean(errors)
+    print(f'\tMAE={mae:.4f}')
+
+    q = DMy(LogisticRegression())
+    q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
+    median_mae, prev = __fit_test(q, train, test)
+    print(f'\tMAE={median_mae:.4f}')
+
+    np.testing.assert_almost_equal(np.median(prevs, axis=0), prev)
+    assert median_mae < mae, 'the median-based quantifier provided a higher error...'
+ + + +
+[docs] +def test_median_meta_modsel(): + """ + This test checks the median-meta quantifier with model selection + """ + + qp.environ['SAMPLE_SIZE'] = 100 + + dataset = 'kindle' + train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test + train, val = train.split_stratified(random_state=0) + + nbins_grid = [2, 4, 5, 10, 15] + + q = DMy(LogisticRegression()) + q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) + median_mae, _ = __fit_test(q, train, test) + print(f'\tMAE={median_mae:.4f}') + + q = DMy(LogisticRegression()) + lr_params = {'classifier__C': np.logspace(-1, 1, 3)} + q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) + q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1) + optimized_median_ave, _ = __fit_test(q, train, test) + print(f'\tMAE={optimized_median_ave:.4f}') + + assert optimized_median_ave < median_mae, "the optimized method yielded worse performance..."
+ +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_modsel.html b/docs/build/html/_modules/quapy/tests/test_modsel.html
new file mode 100644
index 0000000..ff1c51c
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_modsel.html
@@ -0,0 +1,225 @@
+quapy.tests.test_modsel — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.tests.test_modsel

+import unittest
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+
+import quapy as qp
+from quapy.method.aggregative import PACC
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import APP
+import time
+
+
+
+[docs] +class ModselTestCase(unittest.TestCase): + +
+[docs] + def test_modsel(self): + + q = PACC(LogisticRegression(random_state=1, max_iter=5000)) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + + param_grid = {'classifier__C': np.logspace(-3,3,7)} + app = APP(validation, sample_size=100, random_state=1) + q = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True + ).fit(training) + print('best params', q.best_params_) + print('best score', q.best_score_) + + self.assertEqual(q.best_params_['classifier__C'], 10.0) + self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0)
+ + +
+[docs] + def test_modsel_parallel(self): + + q = PACC(LogisticRegression(random_state=1, max_iter=5000)) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + # test = data.test + + param_grid = {'classifier__C': np.logspace(-3,3,7)} + app = APP(validation, sample_size=100, random_state=1) + q = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True + ).fit(training) + print('best params', q.best_params_) + print('best score', q.best_score_) + + self.assertEqual(q.best_params_['classifier__C'], 10.0) + self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0)
+ + +
+[docs] + def test_modsel_parallel_speedup(self): + class SlowLR(LogisticRegression): + def fit(self, X, y, sample_weight=None): + time.sleep(1) + return super(SlowLR, self).fit(X, y, sample_weight) + + q = PACC(SlowLR(random_state=1, max_iter=5000)) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + + param_grid = {'classifier__C': np.logspace(-3, 3, 7)} + app = APP(validation, sample_size=100, random_state=1) + + tinit = time.time() + GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True + ).fit(training) + tend_nooptim = time.time()-tinit + + tinit = time.time() + GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True + ).fit(training) + tend_optim = time.time() - tinit + + print(f'parallel training took {tend_optim:.4f}s') + print(f'sequential training took {tend_nooptim:.4f}s') + + self.assertEqual(tend_optim < (0.5*tend_nooptim), True)
+ + +
+[docs] + def test_modsel_timeout(self): + + class SlowLR(LogisticRegression): + def fit(self, X, y, sample_weight=None): + import time + time.sleep(10) + super(SlowLR, self).fit(X, y, sample_weight) + + q = PACC(SlowLR()) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + # test = data.test + + param_grid = {'classifier__C': np.logspace(-3,3,7)} + app = APP(validation, sample_size=100, random_state=1) + q = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True + ) + with self.assertRaises(TimeoutError): + q.fit(training)
+
+ + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_protocols.html b/docs/build/html/_modules/quapy/tests/test_protocols.html
new file mode 100644
index 0000000..65e6d83
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_protocols.html
@@ -0,0 +1,336 @@
+quapy.tests.test_protocols — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.tests.test_protocols

+import unittest
+import numpy as np
+
+import quapy.functional
+from quapy.data import LabelledCollection
+from quapy.protocol import APP, NPP, UPP, DomainMixer, AbstractStochasticSeededProtocol
+
+
+
+[docs] +def mock_labelled_collection(prefix=''): + y = [0] * 250 + [1] * 250 + [2] * 250 + [3] * 250 + X = [prefix + str(i) + '-' + str(yi) for i, yi in enumerate(y)] + return LabelledCollection(X, y, classes=sorted(np.unique(y)))
+ + + +
+[docs] +def samples_to_str(protocol): + samples_str = "" + for instances, prev in protocol(): + samples_str += f'{instances}\t{prev}\n' + return samples_str
+ + + +
+[docs] +class TestProtocols(unittest.TestCase): + +
+[docs] + def test_app_sanity_check(self): + data = mock_labelled_collection() + n_prevpoints = 101 + repeats = 10 + with self.assertRaises(RuntimeError): + p = APP(data, sample_size=5, n_prevalences=n_prevpoints, repeats=repeats, random_state=42) + n_combinations = \ + quapy.functional.num_prevalence_combinations(n_prevpoints, n_classes=data.n_classes, n_repeats=repeats) + p = APP(data, sample_size=5, n_prevalences=n_prevpoints, random_state=42, sanity_check=n_combinations) + p = APP(data, sample_size=5, n_prevalences=n_prevpoints, random_state=42, sanity_check=None)
+ + +
+[docs] + def test_app_replicate(self): + data = mock_labelled_collection() + p = APP(data, sample_size=5, n_prevalences=11, random_state=42) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + p = APP(data, sample_size=5, n_prevalences=11) # <- random_state is by default set to 0 + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2)
+ + +
+[docs] + def test_app_not_replicate(self): + data = mock_labelled_collection() + p = APP(data, sample_size=5, n_prevalences=11, random_state=None) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2) + + p = APP(data, sample_size=5, n_prevalences=11, random_state=42) + samples1 = samples_to_str(p) + p = APP(data, sample_size=5, n_prevalences=11, random_state=0) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2)
+ + +
+[docs]
+    def test_app_number(self):
+        data = mock_labelled_collection()
+        p = APP(data, sample_size=100, n_prevalences=10, repeats=1)
+
+        # surprisingly enough, for some n_prevalences the test fails, notwithstanding
+        # everything is correct. The problem is that in function APP.prevalence_grid()
+        # there is sometimes one rounding error that gets accumulated and
+        # surpasses 1.0 (by a very small float value, 0.0000000000002 or the like),
+        # so these tuples are mistakenly removed... I have tried with np.close, and
+        # other workarounds, but it eventually happens that there is some negative probability
+        # in the sampling function...
+
+        count = 0
+        for _ in p():
+            count+=1
+
+        self.assertEqual(count, p.total())
+ + +
+[docs] + def test_npp_replicate(self): + data = mock_labelled_collection() + p = NPP(data, sample_size=5, repeats=5, random_state=42) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + p = NPP(data, sample_size=5, repeats=5) # <- random_state is by default set to 0 + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2)
+ + +
+[docs] + def test_npp_not_replicate(self): + data = mock_labelled_collection() + p = NPP(data, sample_size=5, repeats=5, random_state=None) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2) + + p = NPP(data, sample_size=5, repeats=5, random_state=42) + samples1 = samples_to_str(p) + p = NPP(data, sample_size=5, repeats=5, random_state=0) + samples2 = samples_to_str(p) + self.assertNotEqual(samples1, samples2)
+ + +
+[docs] + def test_kraemer_replicate(self): + data = mock_labelled_collection() + p = UPP(data, sample_size=5, repeats=10, random_state=42) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + p = UPP(data, sample_size=5, repeats=10) # <- random_state is by default set to 0 + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2)
+ + +
+[docs] + def test_kraemer_not_replicate(self): + data = mock_labelled_collection() + p = UPP(data, sample_size=5, repeats=10, random_state=None) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2)
+ + +
+[docs] + def test_covariate_shift_replicate(self): + dataA = mock_labelled_collection('domA') + dataB = mock_labelled_collection('domB') + p = DomainMixer(dataA, dataB, sample_size=10, mixture_points=11, random_state=1) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + p = DomainMixer(dataA, dataB, sample_size=10, mixture_points=11) # <- random_state is by default set to 0 + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2)
+ + +
+[docs] + def test_covariate_shift_not_replicate(self): + dataA = mock_labelled_collection('domA') + dataB = mock_labelled_collection('domB') + p = DomainMixer(dataA, dataB, sample_size=10, mixture_points=11, random_state=None) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2)
+ + +
+[docs] + def test_no_seed_init(self): + class NoSeedInit(AbstractStochasticSeededProtocol): + def __init__(self): + self.data = mock_labelled_collection() + + def samples_parameters(self): + # return a matrix containing sampling indexes in the rows + return np.random.randint(0, len(self.data), 10*10).reshape(10, 10) + + def sample(self, params): + index = np.unique(params) + return self.data.sampling_from_index(index) + + p = NoSeedInit() + + # this should raise a ValueError, since the class is said to be AbstractStochasticSeededProtocol but the + # random_seed has never been passed to super(NoSeedInit, self).__init__(random_seed) + with self.assertRaises(ValueError): + for sample in p(): + pass + print('done')
+
+ + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_replicability.html b/docs/build/html/_modules/quapy/tests/test_replicability.html
new file mode 100644
index 0000000..4731cce
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_replicability.html
@@ -0,0 +1,225 @@
+quapy.tests.test_replicability — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.tests.test_replicability

+import unittest
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.functional import strprev
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+from quapy.method.aggregative import PACC
+import quapy.functional as F
+
+
+
+[docs] +class MyTestCase(unittest.TestCase): + +
+[docs] + def test_prediction_replicability(self): + + dataset = qp.datasets.fetch_UCIBinaryDataset('yeast') + + with qp.util.temp_seed(0): + lr = LogisticRegression(random_state=0, max_iter=10000) + pacc = PACC(lr) + prev = pacc.fit(dataset.training).quantify(dataset.test.X) + str_prev1 = strprev(prev, prec=5) + + with qp.util.temp_seed(0): + lr = LogisticRegression(random_state=0, max_iter=10000) + pacc = PACC(lr) + prev2 = pacc.fit(dataset.training).quantify(dataset.test.X) + str_prev2 = strprev(prev2, prec=5) + + self.assertEqual(str_prev1, str_prev2) # add assertion here
+ + + +
+[docs]
+    def test_sampling_replicability(self):
+
+        def equal_collections(c1, c2, value=True):
+            self.assertEqual(np.all(c1.Xtr == c2.Xtr), value)
+            self.assertEqual(np.all(c1.ytr == c2.ytr), value)
+            if value:
+                self.assertEqual(np.all(c1.classes_ == c2.classes_), value)
+
+        X = list(map(str, range(100)))
+        y = np.random.randint(0, 2, 100)
+        data = LabelledCollection(instances=X, labels=y)
+
+        sample1 = data.sampling(50)
+        sample2 = data.sampling(50)
+        equal_collections(sample1, sample2, False)
+
+        sample1 = data.sampling(50, random_state=0)
+        sample2 = data.sampling(50, random_state=0)
+        equal_collections(sample1, sample2, True)
+
+        sample1 = data.sampling(50, *[0.7, 0.3], random_state=0)
+        sample2 = data.sampling(50, *[0.7, 0.3], random_state=0)
+        equal_collections(sample1, sample2, True)
+
+        with qp.util.temp_seed(0):
+            sample1 = data.sampling(50, *[0.7, 0.3])
+        with qp.util.temp_seed(0):
+            sample2 = data.sampling(50, *[0.7, 0.3])
+        equal_collections(sample1, sample2, True)
+
+        sample1 = data.sampling(50, *[0.7, 0.3], random_state=0)
+        sample2 = data.sampling(50, *[0.7, 0.3], random_state=0)
+        equal_collections(sample1, sample2, True)
+
+        sample1_tr, sample1_te = data.split_stratified(train_prop=0.7, random_state=0)
+        sample2_tr, sample2_te = data.split_stratified(train_prop=0.7, random_state=0)
+        equal_collections(sample1_tr, sample2_tr, True)
+        equal_collections(sample1_te, sample2_te, True)
+
+        with qp.util.temp_seed(0):
+            sample1_tr, sample1_te = data.split_stratified(train_prop=0.7)
+        with qp.util.temp_seed(0):
+            sample2_tr, sample2_te = data.split_stratified(train_prop=0.7)
+        equal_collections(sample1_tr, sample2_tr, True)
+        equal_collections(sample1_te, sample2_te, True)
+ + + +
+[docs] + def test_parallel_replicability(self): + + train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').train_test + + test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0]) + + with qp.util.temp_seed(10): + pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) + pacc.fit(train, val_split=0.5) + prev1 = F.strprev(pacc.quantify(test.instances)) + + with qp.util.temp_seed(0): + pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) + pacc.fit(train, val_split=0.5) + prev2 = F.strprev(pacc.quantify(test.instances)) + + with qp.util.temp_seed(0): + pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) + pacc.fit(train, val_split=0.5) + prev3 = F.strprev(pacc.quantify(test.instances)) + + print(prev1) + print(prev2) + print(prev3) + + self.assertNotEqual(prev1, prev2) + self.assertEqual(prev2, prev3)
+
+ + + + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/util.html b/docs/build/html/_modules/quapy/util.html
new file mode 100644
index 0000000..25532bd
--- /dev/null
+++ b/docs/build/html/_modules/quapy/util.html
@@ -0,0 +1,402 @@
+quapy.util — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.util

+import contextlib
+import itertools
+import multiprocessing
+import os
+import pickle
+import urllib
+from pathlib import Path
+from contextlib import ExitStack
+import quapy as qp
+
+import numpy as np
+from joblib import Parallel, delayed
+from time import time
+import signal
+
+
+def _get_parallel_slices(n_tasks, n_jobs):
+    if n_jobs == -1:
+        n_jobs = multiprocessing.cpu_count()
+    batch = int(n_tasks / n_jobs)
+    remainder = n_tasks % n_jobs
+    return [slice(job * batch, (job + 1) * batch + (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)]
+
+
+
[docs]def map_parallel(func, args, n_jobs):
+    """
+    Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
+    func is applied in two parallel processes to args[0:49] and to args[49:99]. func is a function
+    that already works with a list of arguments.
+
+    :param func: function to be parallelized
+    :param args: array-like of arguments to be passed to the function in different parallel calls
+    :param n_jobs: the number of workers
+    """
+    args = np.asarray(args)
+    slices = _get_parallel_slices(len(args), n_jobs)
+    results = Parallel(n_jobs=n_jobs)(
+        delayed(func)(args[slice_i]) for slice_i in slices
+    )
+    return list(itertools.chain.from_iterable(results))
+ + +
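A toy sketch of map_parallel (the function name and arguments are invented for illustration); the key point is that `func` receives a whole slice of `args`, not a single item, so it must itself operate on a list:

```python
from quapy.util import map_parallel

def increment_all(values):  # receives a whole slice of the arguments
    return [v + 1 for v in values]

out = map_parallel(func=increment_all, args=list(range(99)), n_jobs=2)
assert out == list(range(1, 100))
```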
[docs]def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
+    """
+    A wrapper of multiprocessing:
+
+    >>> Parallel(n_jobs=n_jobs)(
+    >>>      delayed(func)(args_i) for args_i in args
+    >>> )
+
+    that takes the `quapy.environ` variable as input silently.
+    Seeds the child processes to ensure reproducibility when n_jobs>1.
+
+    :param func: callable
+    :param args: args of func
+    :param n_jobs: the number of workers
+    :param seed: the numeric seed
+    :param asarray: set to True to return a np.ndarray instead of a list
+    :param backend: indicates the backend used for handling parallel work
+    """
+    def func_dec(environ, seed, *args):
+        qp.environ = environ.copy()
+        qp.environ['N_JOBS'] = 1
+        # set a context with a temporary seed to ensure results are reproducible in parallel
+        with ExitStack() as stack:
+            if seed is not None:
+                stack.enter_context(qp.util.temp_seed(seed))
+            return func(*args)
+
+    out = Parallel(n_jobs=n_jobs, backend=backend)(
+        delayed(func_dec)(qp.environ, None if seed is None else seed+i, args_i) for i, args_i in enumerate(args)
+    )
+    if asarray:
+        out = np.asarray(out)
+    return out
+ + +
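A sketch of the seeded variant (mock function and arguments, for illustration only): each worker is seeded with `seed+i`, so the whole run is reproducible even with n_jobs>1:

```python
import numpy as np
from quapy.util import parallel

def scaled_draw(scale):  # depends on np.random, hence on the per-worker seed
    return np.random.rand() * scale

out1 = parallel(scaled_draw, args=[1, 2, 3], n_jobs=2, seed=0)
out2 = parallel(scaled_draw, args=[1, 2, 3], n_jobs=2, seed=0)
assert np.allclose(out1, out2)  # same seeds -> same results
```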
[docs]@contextlib.contextmanager
+def temp_seed(random_state):
+    """
+    Can be used in a "with" context to set a temporary seed without modifying numpy's outer random state. E.g.:
+
+    >>> with temp_seed(random_seed):
+    >>>   pass # do any computation depending on np.random functionality
+
+    :param random_state: the seed to set within the "with" context
+    """
+    if random_state is not None:
+        state = np.random.get_state()
+        # save the seed just in case it is needed (for instance, for setting the seed of child processes)
+        qp.environ['_R_SEED'] = random_state
+        np.random.seed(random_state)
+    try:
+        yield
+    finally:
+        if random_state is not None:
+            np.random.set_state(state)
+ + +
[docs]def download_file(url, archive_filename): + """ + Downloads a file from a url + + :param url: the url + :param archive_filename: destination filename + """ + def progress(blocknum, bs, size): + total_sz_mb = '%.2f MB' % (size / 1e6) + current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) + print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') + print("Downloading %s" % url) + urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) + print("")
+ + +
[docs]def download_file_if_not_exists(url, archive_filename):
+    """
+    Downloads a file (using :meth:`download_file`) if it does not yet exist.
+
+    :param url: the url
+    :param archive_filename: destination filename
+    """
+    if os.path.exists(archive_filename):
+        return
+    create_if_not_exist(os.path.dirname(archive_filename))
+    download_file(url, archive_filename)
+ + +
[docs]def create_if_not_exist(path): + """ + An alias to `os.makedirs(path, exist_ok=True)` that also returns the path. This is useful in cases like, e.g.: + + >>> path = create_if_not_exist(os.path.join(dir, subdir, anotherdir)) + + :param path: path to create + :return: the path itself + """ + os.makedirs(path, exist_ok=True) + return path
+ + +
[docs]def get_quapy_home():
+    """
+    Gets the home directory of QuaPy, i.e., the directory where QuaPy saves permanent data, such as downloaded
+    datasets. This directory is `~/quapy_data`
+
+    :return: a string representing the path
+    """
+    home = os.path.join(str(Path.home()), 'quapy_data')
+    os.makedirs(home, exist_ok=True)
+    return home
+ + +
[docs]def create_parent_dir(path): + """ + Creates the parent dir (if any) of a given path, if not exists. E.g., for `./path/to/file.txt`, the path `./path/to` + is created. + + :param path: the path + """ + parentdir = Path(path).parent + if parentdir: + os.makedirs(parentdir, exist_ok=True)
+ + +
[docs]def save_text_file(path, text):
+    """
+    Saves a text file to disk, given its full path, and creates the parent directory if missing.
+
+    :param path: path where to save the file.
+    :param text: text to save.
+    """
+    create_parent_dir(path)
+    with open(path, 'wt') as fout:  # note: the file must be opened at `path` (not `text`)
+        fout.write(text)
+ + +
[docs]def pickled_resource(pickle_path:str, generation_func:callable, *args): + """ + Allows for fast reuse of resources that are generated only once by calling generation_func(\\*args). The next times + this function is invoked, it loads the pickled resource. Example: + + >>> def some_array(n): # a mock resource created with one parameter (`n`) + >>> return np.random.rand(n) + >>> pickled_resource('./my_array.pkl', some_array, 10) # the resource does not exist: it is created by calling some_array(10) + >>> pickled_resource('./my_array.pkl', some_array, 10) # the resource exists; it is loaded from './my_array.pkl' + + :param pickle_path: the path where to save (first time) and load (next times) the resource + :param generation_func: the function that generates the resource, in case it does not exist in pickle_path + :param args: any arg that generation_func uses for generating the resources + :return: the resource + """ + if pickle_path is None: + return generation_func(*args) + else: + if os.path.exists(pickle_path): + return pickle.load(open(pickle_path, 'rb')) + else: + instance = generation_func(*args) + os.makedirs(str(Path(pickle_path).parent), exist_ok=True) + pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL) + return instance
+ + +def _check_sample_size(sample_size): + if sample_size is None: + assert qp.environ['SAMPLE_SIZE'] is not None, \ + 'error: sample_size set to None, and cannot be resolved from the environment' + sample_size = qp.environ['SAMPLE_SIZE'] + assert isinstance(sample_size, int) and sample_size > 0, \ + 'error: sample_size is not a positive integer' + return sample_size + + +
[docs]class EarlyStop:
+    """
+    A class implementing the early-stopping condition typically used for training neural networks.
+
+    >>> earlystop = EarlyStop(patience=2, lower_is_better=True)
+    >>> earlystop(0.9, epoch=0)
+    >>> earlystop(0.7, epoch=1)
+    >>> earlystop.IMPROVED  # is True
+    >>> earlystop(1.0, epoch=2)
+    >>> earlystop.STOP  # is False (patience=1)
+    >>> earlystop(1.0, epoch=3)
+    >>> earlystop.STOP  # is True (patience=0)
+    >>> earlystop.best_epoch  # is 1
+    >>> earlystop.best_score  # is 0.7
+
+    :param patience: the number of (consecutive) times that a monitored evaluation metric (typically obtained in a
+        held-out validation split) can be found to be worse than the best one obtained so far, before flagging the
+        stopping condition. An instance of this class is `callable`, and is to be used as shown above.
+    :param lower_is_better: if True (default) the metric is to be minimized.
+    :ivar best_score: keeps track of the best value seen so far
+    :ivar best_epoch: keeps track of the epoch in which the best score was set
+    :ivar STOP: flag (boolean) indicating the stopping condition
+    :ivar IMPROVED: flag (boolean) indicating whether there was an improvement in the last call
+    """
+
+    def __init__(self, patience, lower_is_better=True):
+
+        self.PATIENCE_LIMIT = patience
+        self.better = lambda a,b: a<b if lower_is_better else a>b
+        self.patience = patience
+        self.best_score = None
+        self.best_epoch = None
+        self.STOP = False
+        self.IMPROVED = False
+
+    def __call__(self, watch_score, epoch):
+        """
+        Commits the new score found in epoch `epoch`. If the score improves over the best score found so far, then
+        the patience counter gets reset. Otherwise, the patience counter is decreased, and in case it reaches 0,
+        the flag STOP becomes True.
+
+        :param watch_score: the new score
+        :param epoch: the current epoch
+        """
+        self.IMPROVED = (self.best_score is None or self.better(watch_score, self.best_score))
+        if self.IMPROVED:
+            self.best_score = watch_score
+            self.best_epoch = epoch
+            self.patience = self.PATIENCE_LIMIT
+        else:
+            self.patience -= 1
+            if self.patience <= 0:
+                self.STOP = True
+ + +
[docs]@contextlib.contextmanager
+def timeout(seconds):
+    """
+    Opens a context that raises a TimeoutError if its body has not completed after a given number of seconds
+
+    >>> def func(start_msg, end_msg):
+    >>>     print(start_msg)
+    >>>     sleep(2)
+    >>>     print(end_msg)
+    >>>
+    >>> with timeout(1):
+    >>>     func('begin function', 'end function')
+    >>> Out[]
+    >>> begin function
+    >>> TimeoutError
+
+    :param seconds: number of seconds, set to <=0 to ignore the timer
+    """
+    if seconds > 0:
+        def handler(signum, frame):
+            raise TimeoutError()
+
+        signal.signal(signal.SIGALRM, handler)
+        signal.alarm(seconds)
+
+    yield
+
+    if seconds > 0:
+        signal.alarm(0)
+ + + + \ No newline at end of file diff --git a/docs/build/html/_sources/Datasets.md.txt b/docs/build/html/_sources/Datasets.md.txt deleted file mode 100644 index d5e7563..0000000 --- a/docs/build/html/_sources/Datasets.md.txt +++ /dev/null @@ -1,356 +0,0 @@ -# Datasets - -QuaPy makes available several datasets that have been used in -quantification literature, as well as an interface to allow -anyone import their custom datasets. - -A _Dataset_ object in QuaPy is roughly a pair of _LabelledCollection_ objects, -one playing the role of the training set, another the test set. -_LabelledCollection_ is a data class consisting of the (iterable) -instances and labels. This class handles most of the sampling functionality in QuaPy. -Take a look at the following code: - -```python -import quapy as qp -import quapy.functional as F - -instances = [ - '1st positive document', '2nd positive document', - 'the only negative document', - '1st neutral document', '2nd neutral document', '3rd neutral document' -] -labels = [2, 2, 0, 1, 1, 1] - -data = qp.data.LabelledCollection(instances, labels) -print(F.strprev(data.prevalence(), prec=2)) -``` - -Output the class prevalences (showing 2 digit precision): -``` -[0.17, 0.50, 0.33] -``` - -One can easily produce new samples at desired class prevalence values: - -```python -sample_size = 10 -prev = [0.4, 0.1, 0.5] -sample = data.sampling(sample_size, *prev) - -print('instances:', sample.instances) -print('labels:', sample.labels) -print('prevalence:', F.strprev(sample.prevalence(), prec=2)) -``` - -Which outputs: -``` -instances: ['the only negative document' '2nd positive document' - '2nd positive document' '2nd neutral document' '1st positive document' - 'the only negative document' 'the only negative document' - 'the only negative document' '2nd positive document' - '1st positive document'] -labels: [0 2 2 1 2 0 0 0 2 2] -prevalence: [0.40, 0.10, 0.50] -``` - -Samples can be made consistent across different runs (e.g., to test -different methods on the same exact samples) by sampling and retaining -the indexes, that can then be used to generate the sample: - -```python -index = data.sampling_index(sample_size, *prev) -for method in methods: - sample = data.sampling_from_index(index) - ... -``` - -However, generating samples for evaluation purposes is tackled in QuaPy -by means of the evaluation protocols (see the dedicated entries in the Wiki -for [evaluation](https://github.com/HLT-ISTI/QuaPy/wiki/Evaluation) and -[protocols](https://github.com/HLT-ISTI/QuaPy/wiki/Protocols)). - - -## Reviews Datasets - -Three datasets of reviews about Kindle devices, Harry Potter's series, and -the well-known IMDb movie reviews can be fetched using a unified interface. -For example: - -```python -import quapy as qp -data = qp.datasets.fetch_reviews('kindle') -``` - -These datasets have been used in: -``` -Esuli, A., Moreo, A., & Sebastiani, F. (2018, October). -A recurrent neural network for sentiment quantification. -In Proceedings of the 27th ACM International Conference on -Information and Knowledge Management (pp. 1775-1778). 
-```
-
-The list of review ids is available in:
-
-```python
-qp.datasets.REVIEWS_SENTIMENT_DATASETS
-```
-
-Some statistics of the available datasets are summarized below:
-
-| Dataset | classes | train size | test size | train prev | test prev | type |
-|---|:---:|:---:|:---:|:---:|:---:|---|
-| hp | 2 | 9533 | 18399 | [0.018, 0.982] | [0.065, 0.935] | text |
-| kindle | 2 | 3821 | 21591 | [0.081, 0.919] | [0.063, 0.937] | text |
-| imdb | 2 | 25000 | 25000 | [0.500, 0.500] | [0.500, 0.500] | text |
-
-
-## Twitter Sentiment Datasets
-
-QuaPy provides 11 Twitter datasets for sentiment analysis.
-The raw text is not accessible; the documents are made available
-in tf-idf format. Each dataset presents two splits: a train/val
-split for model selection purposes, and a train+val/test split
-for model evaluation. The following code exemplifies how to load
-a Twitter dataset for model selection.
-
-```python
-import quapy as qp
-data = qp.datasets.fetch_twitter('gasp', for_model_selection=True)
-```
-
-The datasets were used in:
-
-```
-Gao, W., & Sebastiani, F. (2015, August).
-Tweet sentiment: From classification to quantification.
-In 2015 IEEE/ACM International Conference on Advances in
-Social Networks Analysis and Mining (ASONAM) (pp. 97-104). IEEE.
-```
-
-Three of the datasets (semeval13, semeval14, and semeval15) share the
-same training set (semeval), meaning that the training split one would get
-when requesting any of them is the same. The dataset "semeval" can only
-be requested with "for_model_selection=True".
-The lists of the Twitter datasets' ids can be consulted in:
-
-```python
-# a list of 11 dataset ids that can be used for model selection or model evaluation
-qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
-
-# 9 dataset ids in which "semeval13", "semeval14", and "semeval15" are replaced with "semeval"
-qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
-```
-
-Some details can be found below:
-
-| Dataset | classes | train size | test size | features | train prev | test prev | type |
-|---|:---:|:---:|:---:|:---:|:---:|:---:|---|
-| gasp | 3 | 8788 | 3765 | 694582 | [0.421, 0.496, 0.082] | [0.407, 0.507, 0.086] | sparse |
-| hcr | 3 | 1594 | 798 | 222046 | [0.546, 0.211, 0.243] | [0.640, 0.167, 0.193] | sparse |
-| omd | 3 | 1839 | 787 | 199151 | [0.463, 0.271, 0.266] | [0.437, 0.283, 0.280] | sparse |
-| sanders | 3 | 2155 | 923 | 229399 | [0.161, 0.691, 0.148] | [0.164, 0.688, 0.148] | sparse |
-| semeval13 | 3 | 11338 | 3813 | 1215742 | [0.159, 0.470, 0.372] | [0.158, 0.430, 0.412] | sparse |
-| semeval14 | 3 | 11338 | 1853 | 1215742 | [0.159, 0.470, 0.372] | [0.109, 0.361, 0.530] | sparse |
-| semeval15 | 3 | 11338 | 2390 | 1215742 | [0.159, 0.470, 0.372] | [0.153, 0.413, 0.434] | sparse |
-| semeval16 | 3 | 8000 | 2000 | 889504 | [0.157, 0.351, 0.492] | [0.163, 0.341, 0.497] | sparse |
-| sst | 3 | 2971 | 1271 | 376132 | [0.261, 0.452, 0.288] | [0.207, 0.481, 0.312] | sparse |
-| wa | 3 | 2184 | 936 | 248563 | [0.305, 0.414, 0.281] | [0.282, 0.446, 0.272] | sparse |
-| wb | 3 | 4259 | 1823 | 404333 | [0.270, 0.392, 0.337] | [0.274, 0.392, 0.335] | sparse |
-
-
-## UCI Machine Learning
-
-A set of 32 datasets from the [UCI Machine Learning repository](https://archive.ics.uci.edu/ml/datasets.php)
-used in:
-
-```
-Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
-Using ensembles for problems with characterizable changes
-in data distribution: A case study on quantification.
-Information Fusion, 34, 87-100.
-```
-
-The list does not exactly coincide with that used in Pérez-Gállego et al. 2017,
-since we were unable to find the datasets with ids "diabetes" and "phoneme".
-
-These datasets can be loaded by calling, e.g.:
-
-```python
-import quapy as qp
-data = qp.datasets.fetch_UCIDataset('yeast', verbose=True)
-```
-
-This call will return a _Dataset_ object in which the training and
-test splits are randomly drawn, in a stratified manner, from the whole
-collection at 70% and 30%, respectively. The _verbose=True_ option indicates
-that the dataset description should be printed to standard output.
-The original data is not split,
-and some papers submit the entire collection to a kFCV validation.
-In order to accommodate these practices, one can first instantiate
-the entire collection, and then create a generator that returns one
-training+test dataset at a time, following a kFCV protocol:
-
-```python
-import quapy as qp
-collection = qp.datasets.fetch_UCILabelledCollection("yeast")
-for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
-    ...
-```
-
-The above code allows conducting a 2x5FCV evaluation on the "yeast" dataset.
-
-All datasets come in numerical form (dense matrices); some statistics
-are summarized below.
-
-| Dataset | classes | instances | features | prev | type |
-|---|:---:|:---:|:---:|:---:|---|
-| acute.a | 2 | 120 | 6 | [0.508, 0.492] | dense |
-| acute.b | 2 | 120 | 6 | [0.583, 0.417] | dense |
-| balance.1 | 2 | 625 | 4 | [0.539, 0.461] | dense |
-| balance.2 | 2 | 625 | 4 | [0.922, 0.078] | dense |
-| balance.3 | 2 | 625 | 4 | [0.539, 0.461] | dense |
-| breast-cancer | 2 | 683 | 9 | [0.350, 0.650] | dense |
-| cmc.1 | 2 | 1473 | 9 | [0.573, 0.427] | dense |
-| cmc.2 | 2 | 1473 | 9 | [0.774, 0.226] | dense |
-| cmc.3 | 2 | 1473 | 9 | [0.653, 0.347] | dense |
-| ctg.1 | 2 | 2126 | 22 | [0.222, 0.778] | dense |
-| ctg.2 | 2 | 2126 | 22 | [0.861, 0.139] | dense |
-| ctg.3 | 2 | 2126 | 22 | [0.917, 0.083] | dense |
-| german | 2 | 1000 | 24 | [0.300, 0.700] | dense |
-| haberman | 2 | 306 | 3 | [0.735, 0.265] | dense |
-| ionosphere | 2 | 351 | 34 | [0.641, 0.359] | dense |
-| iris.1 | 2 | 150 | 4 | [0.667, 0.333] | dense |
-| iris.2 | 2 | 150 | 4 | [0.667, 0.333] | dense |
-| iris.3 | 2 | 150 | 4 | [0.667, 0.333] | dense |
-| mammographic | 2 | 830 | 5 | [0.514, 0.486] | dense |
-| pageblocks.5 | 2 | 5473 | 10 | [0.979, 0.021] | dense |
-| semeion | 2 | 1593 | 256 | [0.901, 0.099] | dense |
-| sonar | 2 | 208 | 60 | [0.534, 0.466] | dense |
-| spambase | 2 | 4601 | 57 | [0.606, 0.394] | dense |
-| spectf | 2 | 267 | 44 | [0.794, 0.206] | dense |
-| tictactoe | 2 | 958 | 9 | [0.653, 0.347] | dense |
-| transfusion | 2 | 748 | 4 | [0.762, 0.238] | dense |
-| wdbc | 2 | 569 | 30 | [0.627, 0.373] | dense |
-| wine.1 | 2 | 178 | 13 | [0.669, 0.331] | dense |
-| wine.2 | 2 | 178 | 13 | [0.601, 0.399] | dense |
-| wine.3 | 2 | 178 | 13 | [0.730, 0.270] | dense |
-| wine-q-red | 2 | 1599 | 11 | [0.465, 0.535] | dense |
-| wine-q-white | 2 | 4898 | 11 | [0.335, 0.665] | dense |
-| yeast | 2 | 1484 | 8 | [0.711, 0.289] | dense |
-
-### Issues:
-All datasets will be downloaded automatically the first time they are requested, and
-stored in the _quapy_data_ folder for faster subsequent reuse.
-However, some datasets require special actions that at the moment are not fully
-automated.
-
-* Datasets with ids "ctg.1", "ctg.2", and "ctg.3" (_Cardiotocography Data Set_) load
-an Excel file, which requires the user to install the _xlrd_ Python module in order
-to open it.
-* The dataset with id "pageblocks.5" (_Page Blocks Classification (5)_) needs to
-open a "unix compressed file" (extension .Z), which cannot be opened directly with
-standard Python packages like gzip or zip; this file has to be uncompressed manually
-using OS-dependent software. Information on how to do so is printed the first
-time the dataset is invoked.
-
-## LeQua Datasets
-
-QuaPy also provides the datasets used for the LeQua competition.
-In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification
-problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide
-raw documents instead.
-Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B
-are multiclass quantification problems consisting of estimating the class prevalence
-values of 28 different merchandise products.
-
-Every task consists of a training set, a set of validation samples (for model selection)
-and a set of test samples (for evaluation). QuaPy returns this data as a LabelledCollection
-(training) and two generation protocols (for validation and test samples), as follows:
-
-```python
-training, val_generator, test_generator = fetch_lequa2022(task=task)
-```
-
-See `lequa2022_experiments.py` in the examples folder for further details on how to
-carry out experiments using these datasets.
-
-The datasets are downloaded only once, and stored for fast reuse.
-
-Some statistics are summarized below:
-
-| Dataset | classes | train size | validation samples | test samples | docs by sample | type |
-|---------|:-------:|:----------:|:------------------:|:------------:|:----------------:|:--------:|
-| T1A | 2 | 5000 | 1000 | 5000 | 250 | vector |
-| T1B | 28 | 20000 | 1000 | 5000 | 1000 | vector |
-| T2A | 2 | 5000 | 1000 | 5000 | 250 | text |
-| T2B | 28 | 20000 | 1000 | 5000 | 1000 | text |
-
-For further details on the datasets, we refer to the original
-[paper](https://ceur-ws.org/Vol-3180/paper-146.pdf):
-
-```
-Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
-A Detailed Overview of LeQua@CLEF 2022: Learning to Quantify.
-```
-
-## Adding Custom Datasets
-
-QuaPy provides data loaders for simple formats dealing with
-text, following the format:
-
-```
-class-id \t first document's pre-processed text \n
-class-id \t second document's pre-processed text \n
-...
-```
-
-and sparse representations of the form:
-
-```
-{-1, 0, or +1} col(int):val(float) col(int):val(float) ... \n
-...
-```
-
-The code in charge of loading a LabelledCollection is:
-
-```python
-@classmethod
-def load(cls, path:str, loader_func:callable):
-    return LabelledCollection(*loader_func(path))
-```
-
-indicating that any _loader_func_ (e.g., a user-defined one) that
-returns valid arguments for initializing a _LabelledCollection_ object makes it
-possible to load any collection. In particular, the _LabelledCollection_ receives as
-arguments the instances (as an iterable) and the labels (as an iterable) and,
-additionally, the number of classes can be specified (it would otherwise be
-inferred from the labels, but that requires at least one example of every
-class to be present in the collection).
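-
-For illustration, a minimal _loader_func_ for the simple text format shown
-above could look as follows (a sketch; the function name and the parsing
-details are ours, not part of QuaPy):
-
-```python
-def simple_text_loader(path):
-    # each line reads: class-id \t pre-processed text
-    instances, labels = [], []
-    with open(path, 'rt') as fin:
-        for line in fin:
-            label, text = line.strip().split('\t', 1)
-            instances.append(text)
-            labels.append(int(label))
-    return instances, labels
-
-# hypothetical path, for illustration only
-data = qp.data.LabelledCollection.load('my_data/train.dat', simple_text_loader)
-```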
-
-The same _loader_func_ can be passed to a Dataset, along with two
-paths, in order to create a training and test pair of _LabelledCollection_ objects,
-e.g.:
-
-```python
-import quapy as qp
-
-train_path = '../my_data/train.dat'
-test_path = '../my_data/test.dat'
-
-def my_custom_loader(path):
-    with open(path, 'rb') as fin:
-        ...
-    return instances, labels
-
-data = qp.data.Dataset.load(train_path, test_path, my_custom_loader)
-```
-
-### Data Processing
-
-QuaPy implements a number of preprocessing functions in the package _qp.data.preprocessing_, including:
-
-* _text2tfidf_: tfidf vectorization
-* _reduce_columns_: reducing the number of columns based on term frequency
-* _standardize_: transforms the column values into z-scores (i.e., subtracts the mean and normalizes by the standard deviation, so
-that the column values have zero mean and unit variance)
-* _index_: transforms textual tokens into lists of numeric ids
diff --git a/docs/build/html/_sources/Installation.rst.txt b/docs/build/html/_sources/Installation.rst.txt
deleted file mode 100644
index 0eaabd6..0000000
--- a/docs/build/html/_sources/Installation.rst.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-Installation
-------------
-
-QuaPy can be easily installed via `pip`
-
-::
-
-    pip install quapy
-
-See `pip page `_ for older versions.
-
-Requirements
-************
-
-* scikit-learn, numpy, scipy
-* pytorch (for QuaNet)
-* svmperf patched for quantification (see below)
-* joblib
-* tqdm
-* pandas, xlrd
-* matplotlib
-
-
-SVM-perf with quantification-oriented losses
-********************************************
-
-In order to run experiments involving SVM(Q), SVM(KLD), SVM(NKLD),
-SVM(AE), or SVM(RAE), you have to first download the
-`svmperf `_
-package, apply the patch
-`svm-perf-quantification-ext.patch `_,
-and compile the sources.
-The script
-`prepare_svmperf.sh `_
-does the whole job. Simply run:
-
-::
-
-    ./prepare_svmperf.sh
-
-
-The resulting directory `./svm_perf_quantification` contains the
-patched version of `svmperf` with quantification-oriented losses.
-
-The
-`svm-perf-quantification-ext.patch `_
-is an extension of the patch made available by
-`Esuli et al. 2015 `_
-that allows SVMperf to optimize for
-the `Q` measure as proposed by
-`Barranquero et al. 2015 `_
-and for the `KLD` and `NKLD` as proposed by
-`Esuli et al. 2015 `_
-for quantification.
-This patch extends the former by also allowing SVMperf to optimize for
-`AE` and `RAE`.
\ No newline at end of file
diff --git a/docs/build/html/_sources/Methods.md.txt b/docs/build/html/_sources/Methods.md.txt
deleted file mode 100644
index 7060a0a..0000000
--- a/docs/build/html/_sources/Methods.md.txt
+++ /dev/null
@@ -1,438 +0,0 @@
-# Quantification Methods
-
-Quantification methods can be categorized as belonging to the
-_aggregative_ and _non-aggregative_ groups.
-Most methods included in QuaPy at the moment are of type _aggregative_
-(though we plan to add many more methods in the near future), i.e.,
-methods characterized by the fact that
-quantification is performed as an aggregation function of the individual
-classification predictions.
-
-Any quantifier in QuaPy should extend the class _BaseQuantifier_,
-and implement some abstract methods:
-```python
-    @abstractmethod
-    def fit(self, data: LabelledCollection): ...
-
-    @abstractmethod
-    def quantify(self, instances): ...
-```
-The meaning of those functions should be familiar to those
-accustomed to working with scikit-learn, since the class structure of QuaPy
-is directly inspired by scikit-learn's _Estimators_. Functions
-_fit_ and _quantify_ are used to train the model and to provide
-class estimations (the reason why
-scikit-learn's structure has not been adopted _as is_ in QuaPy is
-that scikit-learn's _predict_ function is expected to return
-one output for each input element --e.g., a predicted label for each
-instance in a sample-- while in quantification the output for a sample
-is one single array of class prevalences).
-Quantifiers also extend from scikit-learn's `BaseEstimator`, in order
-to simplify the use of _set_params_ and _get_params_ used in
-[model selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection).
-
-## Aggregative Methods
-
-All quantification methods are implemented as part of the
-_qp.method_ package. In particular, _aggregative_ methods are defined in
-_qp.method.aggregative_, and extend _AggregativeQuantifier(BaseQuantifier)_.
-The methods that any _aggregative_ quantifier must implement are:
-
-```python
-    @abstractmethod
-    def fit(self, data: LabelledCollection, fit_learner=True): ...
-
-    @abstractmethod
-    def aggregate(self, classif_predictions:np.ndarray): ...
-```
-
-since, as mentioned before, aggregative methods base their prediction on the
-individual predictions of a classifier. Indeed, a default implementation
-of _BaseQuantifier.quantify_ is already provided, which looks like:
-
-```python
-    def quantify(self, instances):
-        classif_predictions = self.classify(instances)
-        return self.aggregate(classif_predictions)
-```
-Aggregative quantifiers are expected to maintain a classifier (which is
-accessed through the _@property_ _classifier_). This classifier is
-given as input to the quantifier, and can either be already fit
-on external data (in which case the _fit_learner_ argument should
-be set to False) or be fit by the quantifier's fit method (the default).
-
-Another class of _aggregative_ methods are the _probabilistic_
-aggregative methods, which should inherit from the abstract class
-_AggregativeProbabilisticQuantifier(AggregativeQuantifier)_.
-The particularity of _probabilistic_ aggregative methods (w.r.t.
-non-probabilistic ones) is that the default quantifier is defined
-in terms of the posterior probabilities returned by a probabilistic
-classifier, and not by the crisp decisions of a hard classifier.
-In any case, the interface _classify(instances)_ remains unchanged.
-
-One advantage of _aggregative_ methods (either probabilistic or not)
-is that the evaluation according to any sampling procedure (e.g.,
-the [artificial sampling protocol](https://github.com/HLT-ISTI/QuaPy/wiki/Evaluation))
-can be carried out very efficiently, since the entire set can be pre-classified
-once, and the quantification estimations for different samples can directly
-reuse these predictions, without the need to classify each element every time.
-QuaPy leverages this property to speed up any procedure having to do with
-quantification over samples, as is customarily done in model selection or
-in evaluation.
-
-### The Classify & Count variants
-
-QuaPy implements the four CC variants, i.e.:
-
-* _CC_ (Classify & Count), the simplest aggregative quantifier; one that
-simply relies on the label predictions of a classifier to deliver class estimates.
-* _ACC_ (Adjusted Classify & Count), the adjusted variant of CC.
-* _PCC_ (Probabilistic Classify & Count), the probabilistic variant of CC that
-relies on the soft estimations (or posterior probabilities) returned by a (probabilistic) classifier.
-* _PACC_ (Probabilistic Adjusted Classify & Count), the adjusted variant of PCC.
-
-The following code serves as a complete example using CC equipped
-with a SVM as the classifier:
-
-```python
-import quapy as qp
-import quapy.functional as F
-from sklearn.svm import LinearSVC
-
-training, test = qp.datasets.fetch_twitter('hcr', pickle=True).train_test
-
-# instantiate a classifier learner, in this case a SVM
-svm = LinearSVC()
-
-# instantiate a Classify & Count with the SVM
-# (an alias is available in qp.method.aggregative.ClassifyAndCount)
-model = qp.method.aggregative.CC(svm)
-model.fit(training)
-estim_prevalence = model.quantify(test.instances)
-```
-
-The same code could be used to instantiate an ACC, by simply replacing
-the instantiation of the model with:
-```python
-model = qp.method.aggregative.ACC(svm)
-```
-Note that the adjusted variants (ACC and PACC) rely on some parameters
-for performing the adjustment (e.g., the
-_true positive rate_ and the _false positive rate_ in case of
-binary classification) that are estimated on a validation split
-of the labelled set. For this purpose, the __init__ method of
-ACC defines an additional parameter, _val_split_, which by
-default is set to 0.4, meaning that 40% of the labelled data
-will be used for estimating the parameters for adjusting the
-predictions. This parameter can also be set to an integer,
-indicating that the parameters should be estimated by means of
-_k_-fold cross-validation, where the integer indicates the
-number _k_ of folds. Finally, _val_split_ can be set to a
-specific held-out validation set (i.e., an instance of _LabelledCollection_).
-
-The specification of _val_split_ can be
-postponed to the invocation of the fit method (if _val_split_ was also
-set in the constructor, the one specified at fit time prevails),
-e.g.:
-
-```python
-model = qp.method.aggregative.ACC(svm)
-# perform 5-fold cross validation for estimating ACC's parameters
-# (overrides the default val_split=0.4 in the constructor)
-model.fit(training, val_split=5)
-```
-
-The following code illustrates the case in which PCC is used:
-
-```python
-model = qp.method.aggregative.PCC(svm)
-model.fit(training)
-estim_prevalence = model.quantify(test.instances)
-print('classifier:', model.classifier)
-```
-In this case, QuaPy will print:
-```
-The learner LinearSVC does not seem to be probabilistic. The learner will be calibrated.
-classifier: CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)
-```
-The first output indicates that the learner (_LinearSVC_ in this case)
-is not a probabilistic classifier (i.e., it does not implement the
-_predict_proba_ method) and so, the classifier will be converted to
-a probabilistic one through [calibration](https://scikit-learn.org/stable/modules/calibration.html).
-As a result, the classifier that is printed in the second line points
-to a _CalibratedClassifierCV_ instance. Note that calibration can only
-be applied to hard classifiers when _fit_learner=True_; an exception
-will be raised otherwise.
-
-Lastly, everything we said about ACC and PCC
-applies to PACC as well.
-
-
-### Expectation Maximization (EMQ)
-
-The Expectation Maximization Quantifier (EMQ), also known as
-SLD, is available at _qp.method.aggregative.EMQ_ or via the
-alias _qp.method.aggregative.ExpectationMaximizationQuantifier_.
-The method is described in:
-
-_Saerens, M., Latinne, P., and Decaestecker, C. (2002). Adjusting the outputs of a classifier
-to new a priori probabilities: A simple procedure. Neural Computation, 14(1):21–41._
-
-EMQ works with a probabilistic classifier (if the classifier
-given as input is a hard one, a calibration will be attempted).
-Although this method was originally proposed for improving the
-posterior probabilities of a probabilistic classifier, and not
-for improving the estimation of prior probabilities, EMQ almost
-always ranks among the most effective quantifiers in the
-experiments we have carried out.
-
-An example of use can be found below:
-
-```python
-import quapy as qp
-from sklearn.linear_model import LogisticRegression
-
-dataset = qp.datasets.fetch_twitter('hcr', pickle=True)
-
-model = qp.method.aggregative.EMQ(LogisticRegression())
-model.fit(dataset.training)
-estim_prevalence = model.quantify(dataset.test.instances)
-```
-
-_New in v0.1.7_: EMQ now accepts two new parameters in the construction method, namely
-_exact_train_prev_, which allows using the true training prevalence as the departing
-prevalence estimation (the default behaviour), or an approximation of it as
-suggested by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html)
-(by setting _exact_train_prev=False_).
-The other parameter is _recalib_, which allows indicating a calibration method among those
-proposed by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html),
-including Bias-Corrected Temperature Scaling, Vector Scaling, etc.
-See the API documentation for further details.
-
-
-### Hellinger Distance y (HDy)
-
-Implementation of the method based on the Hellinger Distance y (HDy) proposed by
-[González-Castro, V., Alaiz-Rodrı́guez, R., and Alegre, E. (2013). Class distribution
-estimation based on the Hellinger distance. Information Sciences, 218:146–164.](https://www.sciencedirect.com/science/article/pii/S0020025512004069)
-
-It is implemented in _qp.method.aggregative.HDy_ (also accessible
-through the alias _qp.method.aggregative.HellingerDistanceY_).
-This method works with a probabilistic classifier (hard classifiers
-can be used as well and will be calibrated) and requires a validation
-set to estimate the parameters of the mixture model. Just like
-ACC and PACC, this quantifier receives a _val_split_ argument
-in the constructor (or in the fit method, in which case the previous
-value is overridden) that can either be a float indicating the proportion
-of training data to be taken as the validation set (in a random
-stratified split), or a validation set (i.e., an instance of
-_LabelledCollection_) itself.
-
-HDy was proposed as a binary quantification method and the implementation
-provided in QuaPy accepts only binary datasets.
-
-The following code shows an example of use:
-```python
-import quapy as qp
-from sklearn.linear_model import LogisticRegression
-
-# load a binary dataset
-dataset = qp.datasets.fetch_reviews('hp', pickle=True)
-qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
-
-model = qp.method.aggregative.HDy(LogisticRegression())
-model.fit(dataset.training)
-estim_prevalence = model.quantify(dataset.test.instances)
-```
-
-_New in v0.1.7:_ QuaPy now provides an implementation of the generalized
-"Distribution Matching" approaches for multiclass, inspired by the framework
-of [Firat (2016)](https://arxiv.org/abs/1606.00868). One can instantiate
-a variant of HDy for multiclass quantification as follows:
-
-```python
-multiclassHDy = qp.method.aggregative.DistributionMatching(classifier=LogisticRegression(), divergence='HD', cdf=False)
-```
-
-_New in v0.1.7:_ QuaPy now provides an implementation of the "DyS"
-framework proposed by [Maletzke et al (2020)](https://ojs.aaai.org/index.php/AAAI/article/view/4376)
-and the "SMM" method proposed by [Hassan et al (2019)](https://ieeexplore.ieee.org/document/9260028)
-(thanks to _Pablo González_ for the contributions!)
-
-### Threshold Optimization methods
-
-_New in v0.1.7:_ QuaPy now implements Forman's threshold optimization methods;
-see, e.g., [(Forman 2006)](https://dl.acm.org/doi/abs/10.1145/1150402.1150423)
-and [(Forman 2008)](https://link.springer.com/article/10.1007/s10618-008-0097-y).
-These include: T50, MAX, X, Median Sweep (MS), and its variant MS2.
-
-### Explicit Loss Minimization
-
-Explicit Loss Minimization (ELM) represents a family of methods
-based on structured output learning, i.e., quantifiers relying on
-classifiers that have been optimized targeting a
-quantification-oriented evaluation measure.
-The original methods are implemented in QuaPy as classify & count (CC)
-quantifiers that use Joachims' [SVMperf](https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html)
-as the underlying classifier, properly set to optimize for the desired loss.
-
-In QuaPy, this can be achieved by calling the functions:
-
-* _newSVMQ_: returns the quantification method called SVM(Q) that optimizes for the metric _Q_ defined
-in [_Barranquero, J., Díez, J., and del Coz, J. J. (2015). Quantification-oriented learning based
-on reliable classifiers. Pattern Recognition, 48(2):591–604._](https://www.sciencedirect.com/science/article/pii/S003132031400291X)
-* _newSVMKLD_ and _newSVMNKLD_: return the quantification methods called SVM(KLD) and SVM(nKLD), standing for
-Kullback-Leibler Divergence and Normalized Kullback-Leibler Divergence, as proposed in [_Esuli, A. and Sebastiani, F. (2015).
-Optimizing text quantifiers for multivariate loss functions.
-ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27._](https://dl.acm.org/doi/abs/10.1145/2700406)
-* _newSVMAE_ and _newSVMRAE_: return the quantification methods called SVM(AE) and SVM(RAE), which optimize for the (Mean) Absolute Error and for the
-(Mean) Relative Absolute Error, respectively, as first used by
-[_Moreo, A. and Sebastiani, F. (2021). Tweet sentiment quantification: An experimental re-evaluation. PLOS ONE 17 (9), 1-23._](https://arxiv.org/abs/2011.02552)
-
-The last two methods (SVM(AE) and SVM(RAE)) have been implemented in
-QuaPy in order to make available ELM variants for what nowadays
-are considered the most well-behaved evaluation metrics in quantification.
-
-In order to make these models work, you would need to run the script
-_prepare_svmperf.sh_ (distributed along with QuaPy), which
-downloads _SVMperf_'s source code, applies a patch that
-implements the quantification-oriented losses, and compiles the
-sources.
-
-If you want to add any custom loss, you would need to modify
-the source code of _SVMperf_ in order to implement it, and
-assign a valid loss code to it. Then you must re-compile
-the whole thing and instantiate the quantifier in QuaPy
-as follows:
-
-```python
-# you can either set the path to your custom svm_perf_quantification implementation
-# in the environment variable, or as an argument to the constructor of ELM
-qp.environ['SVMPERF_HOME'] = './path/to/svm_perf_quantification'
-
-# assign an alias to your custom loss and the id you have assigned to it
-svmperf = qp.classification.svmperf.SVMperf
-svmperf.valid_losses['mycustomloss'] = 28
-
-# instantiate the ELM method indicating the loss
-model = qp.method.aggregative.ELM(loss='mycustomloss')
-```
-
-All ELM methods are binary quantifiers since they rely on _SVMperf_, which
-currently supports only binary classification.
-ELM variants (any binary quantifier in general) can be extended
-to operate in single-label scenarios trivially by adopting a
-"one-vs-all" strategy (as, e.g., in
-[_Gao, W. and Sebastiani, F. (2016). From classification to quantification in tweet sentiment
-analysis. Social Network Analysis and Mining, 6(19):1–22_](https://link.springer.com/article/10.1007/s13278-016-0327-z)).
-In QuaPy this is possible by using the _OneVsAll_ class.
-
-There are two ways of instantiating this class: _OneVsAllGeneric_, which works for
-any quantifier, and _OneVsAllAggregative_, which is optimized for aggregative quantifiers.
-In general, you can simply use the _getOneVsAll_ function and QuaPy will choose
-the more convenient of the two.
-
-```python
-import quapy as qp
-from quapy.method.aggregative import SVMQ
-
-# load a single-label dataset (this one contains 3 classes)
-dataset = qp.datasets.fetch_twitter('hcr', pickle=True)
-
-# let qp know where svmperf is
-qp.environ['SVMPERF_HOME'] = '../svm_perf_quantification'
-
-model = getOneVsAll(SVMQ(), n_jobs=-1)  # run them in parallel
-model.fit(dataset.training)
-estim_prevalence = model.quantify(dataset.test.instances)
-```
-
-Check the examples _[explicit_loss_minimization.py](../examples/explicit_loss_minimization.py)_
-and [one_vs_all.py](../examples/one_vs_all.py) for more details.
-
-## Meta Models
-
-By _meta_ models we mean quantification methods that are defined on top of other
-quantification methods, and that thus do not squarely belong to either the aggregative or
-the non-aggregative group (indeed, _meta_ models could use quantifiers from any of those
-groups).
-_Meta_ models are implemented in the _qp.method.meta_ module.
-
-### Ensembles
-
-QuaPy implements (some of) the variants proposed in:
-
-* [_Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
-Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
-Information Fusion, 34, 87-100._](https://www.sciencedirect.com/science/article/pii/S1566253516300628)
-* [_Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
-Dynamic ensemble selection for quantification tasks.
-Information Fusion, 45, 1-15._](https://www.sciencedirect.com/science/article/pii/S1566253517303652)
-
-The following code shows how to instantiate an Ensemble of 30 _Adjusted Classify & Count_ (ACC)
-quantifiers operating with a _Logistic Regressor_ (LR) as the base classifier, and using the
-_average_ as the aggregation policy (see the original article for further details).
-The last parameter indicates that all processors should be used for parallelization.
-
-```python
-import quapy as qp
-from quapy.method.aggregative import ACC
-from quapy.method.meta import Ensemble
-from sklearn.linear_model import LogisticRegression
-
-dataset = qp.datasets.fetch_UCIDataset('haberman')
-
-model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='ave', n_jobs=-1)
-model.fit(dataset.training)
-estim_prevalence = model.quantify(dataset.test.instances)
-```
-
-Other aggregation policies implemented in QuaPy include:
-* 'ptr' for applying a dynamic selection based on the training prevalence of the ensemble's members
-* 'ds' for applying a dynamic selection based on the Hellinger Distance
-* _any valid quantification measure_ (e.g., 'mse') for performing a static selection based on
-the performance estimated for each member of the ensemble in terms of that evaluation metric.
-
-When using any of the above options, it is important to set the _red_size_ parameter, which
-specifies the number of members to retain.
-
-Please check the [model selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection)
-wiki if you want to optimize the hyperparameters of the ensemble for classification or quantification.
-
-### The QuaNet neural network
-
-QuaPy offers an implementation of QuaNet, a deep learning model presented in:
-
-[_Esuli, A., Moreo, A., & Sebastiani, F. (2018, October).
-A recurrent neural network for sentiment quantification.
-In Proceedings of the 27th ACM International Conference on
-Information and Knowledge Management (pp. 1775-1778)._](https://dl.acm.org/doi/abs/10.1145/3269206.3269287)
-
-This model requires _torch_ to be installed.
-QuaNet also requires a classifier that can provide embedded representations
-of the inputs.
-In the original paper, QuaNet was tested using an LSTM as the base classifier.
-In the following example, we show an instantiation of QuaNet that instead uses a CNN as a probabilistic classifier, taking its last layer representation as the document embedding:
-
-```python
-import quapy as qp
-from quapy.method.meta import QuaNet
-from quapy.classification.neural import NeuralClassifierTrainer, CNNnet
-
-# use samples of 100 elements
-qp.environ['SAMPLE_SIZE'] = 100
-
-# load the kindle dataset as text, and convert words to numerical indexes
-dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
-qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
-
-# the text classifier is a CNN trained by NeuralClassifierTrainer
-cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
-learner = NeuralClassifierTrainer(cnn, device='cuda')
-
-# train QuaNet
-model = QuaNet(learner, device='cuda')
-model.fit(dataset.training)
-estim_prevalence = model.quantify(dataset.test.instances)
-```
-
diff --git a/docs/build/html/_sources/index.rst.txt b/docs/build/html/_sources/index.rst.txt
deleted file mode 100644
index bf17bc7..0000000
--- a/docs/build/html/_sources/index.rst.txt
+++ /dev/null
@@ -1,92 +0,0 @@
-.. QuaPy documentation master file, created by
-   sphinx-quickstart on Tue Nov 9 11:31:32 2021.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-Welcome to QuaPy's documentation!
-=================================
-
-QuaPy is an open source framework for Quantification (a.k.a. Supervised Prevalence Estimation)
-written in Python.
-
-Introduction
-------------
-
-QuaPy is rooted in the concept of the data sample, and provides implementations of the
-most important concepts in the quantification literature, such as the main
-quantification baselines, many advanced quantification methods,
-quantification-oriented model selection, and many evaluation measures and protocols
-used for evaluating quantification methods.
-QuaPy also integrates commonly used datasets and offers visualization tools for facilitating the analysis and
-interpretation of results.
-
-A quick example:
-****************
-
-The following script fetches a Twitter dataset, then trains and evaluates an
-`Adjusted Classify & Count` model in terms of the `Mean Absolute Error` (MAE)
-between the class prevalences estimated for the test set and the true prevalences
-of the test set.
-
-::
-
-    import quapy as qp
-    from sklearn.linear_model import LogisticRegression
-
-    dataset = qp.datasets.fetch_twitter('semeval16')
-
-    # create an "Adjusted Classify & Count" quantifier
-    model = qp.method.aggregative.ACC(LogisticRegression())
-    model.fit(dataset.training)
-
-    estim_prevalences = model.quantify(dataset.test.instances)
-    true_prevalences = dataset.test.prevalence()
-
-    error = qp.error.mae(true_prevalences, estim_prevalences)
-
-    print(f'Mean Absolute Error (MAE)={error:.3f}')
-
-
-Quantification is useful in scenarios of prior probability shift. In other
-words, we would not be interested in estimating the class prevalences of the test set if
-the IID assumption held, as this prevalence would simply coincide with the
-class prevalence of the training set. For this reason, any quantification model
-should be tested across samples characterized by different class prevalences.
-QuaPy implements sampling procedures and evaluation protocols that automate this endeavour.
-See the :doc:`Evaluation` page for detailed examples.
-
-Features
-********
-
-* Implementation of the most popular quantification methods (Classify-&-Count variants, Expectation-Maximization, SVM-based variants for quantification, HDy, QuaNet, and Ensembles).
-* Versatile functionality for performing evaluation based on artificial sampling protocols.
-* Implementation of the most commonly used evaluation metrics (e.g., MAE, MRAE, MSE, NKLD, etc.).
-* Popular datasets for Quantification (textual and numeric) available, including:
-    * 32 UCI Machine Learning datasets.
-    * 11 Twitter Sentiment datasets.
-    * 3 Reviews Sentiment datasets.
-    * 4 tasks from the LeQua competition (_new in v0.1.7!_)
-* Native support for binary and single-label quantification scenarios.
-* Model selection functionality targeting quantification-oriented losses.
-* Visualization tools for analysing results.
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Contents:
-
-   Installation
-   Datasets
-   Evaluation
-   Protocols
-   Methods
-   Model-Selection
-   Plotting
-   API Developers documentation
-
-
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs/build/html/_sources/quapy.method.rst.txt b/docs/build/html/_sources/quapy.method.rst.txt
deleted file mode 100644
index cfda57f..0000000
--- a/docs/build/html/_sources/quapy.method.rst.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-:tocdepth: 2
-
-quapy.method package
-====================
-
-Submodules
-----------
-
-quapy.method.aggregative
-------------------------
-
-.. automodule:: quapy.method.aggregative
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-quapy.method.base
------------------
-
-..
automodule:: quapy.method.base - :members: - :undoc-members: - :show-inheritance: - -quapy.method.meta ------------------ - -.. automodule:: quapy.method.meta - :members: - :undoc-members: - :show-inheritance: - -quapy.method.neural -------------------- - -.. automodule:: quapy.method.neural - :members: - :undoc-members: - :show-inheritance: - -quapy.method.non\_aggregative ------------------------------ - -.. automodule:: quapy.method.non_aggregative - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: quapy.method - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/build/html/_static/_sphinx_javascript_frameworks_compat.js b/docs/build/html/_static/_sphinx_javascript_frameworks_compat.js new file mode 100644 index 0000000..8549469 --- /dev/null +++ b/docs/build/html/_static/_sphinx_javascript_frameworks_compat.js @@ -0,0 +1,134 @@ +/* + * _sphinx_javascript_frameworks_compat.js + * ~~~~~~~~~~ + * + * Compatability shim for jQuery and underscores.js. + * + * WILL BE REMOVED IN Sphinx 6.0 + * xref RemovedInSphinx60Warning + * + */ + +/** + * select a different prefix for underscore + */ +$u = _.noConflict(); + + +/** + * small helper function to urldecode strings + * + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL + */ +jQuery.urldecode = function(x) { + if (!x) { + return x + } + return decodeURIComponent(x.replace(/\+/g, ' ')); +}; + +/** + * small helper function to urlencode strings + */ +jQuery.urlencode = encodeURIComponent; + +/** + * This function returns the parsed url parameters of the + * current request. Multiple values per key are supported, + * it will always return arrays of strings for the value parts. + */ +jQuery.getQueryParameters = function(s) { + if (typeof s === 'undefined') + s = document.location.search; + var parts = s.substr(s.indexOf('?') + 1).split('&'); + var result = {}; + for (var i = 0; i < parts.length; i++) { + var tmp = parts[i].split('=', 2); + var key = jQuery.urldecode(tmp[0]); + var value = jQuery.urldecode(tmp[1]); + if (key in result) + result[key].push(value); + else + result[key] = [value]; + } + return result; +}; + +/** + * highlight a given string on a jquery object by wrapping it in + * span elements with the given class name. 
+ */ +jQuery.fn.highlightText = function(text, className) { + function highlight(node, addItems) { + if (node.nodeType === 3) { + var val = node.nodeValue; + var pos = val.toLowerCase().indexOf(text); + if (pos >= 0 && + !jQuery(node.parentNode).hasClass(className) && + !jQuery(node.parentNode).hasClass("nohighlight")) { + var span; + var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.className = className; + } + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + node.parentNode.insertBefore(span, node.parentNode.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling)); + node.nodeValue = val.substr(0, pos); + if (isInSVG) { + var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); + var bbox = node.parentElement.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute('class', className); + addItems.push({ + "parent": node.parentNode, + "target": rect}); + } + } + } + else if (!jQuery(node).is("button, select, textarea")) { + jQuery.each(node.childNodes, function() { + highlight(this, addItems); + }); + } + } + var addItems = []; + var result = this.each(function() { + highlight(this, addItems); + }); + for (var i = 0; i < addItems.length; ++i) { + jQuery(addItems[i].parent).before(addItems[i].target); + } + return result; +}; + +/* + * backward compatibility for jQuery.browser + * This will be supported until firefox bug is fixed. + */ +if (!jQuery.browser) { + jQuery.uaMatch = function(ua) { + ua = ua.toLowerCase(); + + var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || + /(webkit)[ \/]([\w.]+)/.exec(ua) || + /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || + /(msie) ([\w.]+)/.exec(ua) || + ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || + []; + + return { + browser: match[ 1 ] || "", + version: match[ 2 ] || "0" + }; + }; + jQuery.browser = {}; + jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; +} diff --git a/docs/build/html/_static/background_b01.png b/docs/build/html/_static/background_b01.png deleted file mode 100644 index 353f26d..0000000 Binary files a/docs/build/html/_static/background_b01.png and /dev/null differ diff --git a/docs/build/html/_static/basic.css b/docs/build/html/_static/basic.css deleted file mode 100644 index 096e3f6..0000000 --- a/docs/build/html/_static/basic.css +++ /dev/null @@ -1,900 +0,0 @@ -/* - * basic.css - * ~~~~~~~~~ - * - * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * - */ - -/* -- main layout ----------------------------------------------------------- */ - -div.clearer { - clear: both; -} - -div.section::after { - display: block; - content: ''; - clear: left; -} - -/* -- relbar ---------------------------------------------------------------- */ - -div.related { - width: 100%; - font-size: 90%; -} - -div.related h3 { - display: none; -} - -div.related ul { - margin: 0; - padding: 0 0 0 10px; - list-style: none; -} - -div.related li { - display: inline; -} - -div.related li.right { - float: right; - margin-right: 5px; -} - -/* -- sidebar --------------------------------------------------------------- */ - -div.sphinxsidebarwrapper { - padding: 10px 5px 0 10px; -} - -div.sphinxsidebar { - float: left; - width: 210px; - margin-left: -100%; - font-size: 90%; - word-wrap: break-word; - overflow-wrap : break-word; -} - -div.sphinxsidebar ul { - list-style: none; -} - -div.sphinxsidebar ul ul, -div.sphinxsidebar ul.want-points { - margin-left: 20px; - list-style: square; -} - -div.sphinxsidebar ul ul { - margin-top: 0; - margin-bottom: 0; -} - -div.sphinxsidebar form { - margin-top: 10px; -} - -div.sphinxsidebar input { - border: 1px solid #98dbcc; - font-family: sans-serif; - font-size: 1em; -} - -div.sphinxsidebar #searchbox form.search { - overflow: hidden; -} - -div.sphinxsidebar #searchbox input[type="text"] { - float: left; - width: 80%; - padding: 0.25em; - box-sizing: border-box; -} - -div.sphinxsidebar #searchbox input[type="submit"] { - float: left; - width: 20%; - border-left: none; - padding: 0.25em; - box-sizing: border-box; -} - - -img { - border: 0; - max-width: 100%; -} - -/* -- search page ----------------------------------------------------------- */ - -ul.search { - margin: 10px 0 0 20px; - padding: 0; -} - -ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; -} - -ul.search li a { - font-weight: bold; -} - -ul.search li p.context { - color: #888; - margin: 2px 0 0 30px; - text-align: left; -} - -ul.keywordmatches li.goodmatch a { - font-weight: bold; -} - -/* -- index page ------------------------------------------------------------ */ - -table.contentstable { - width: 90%; - margin-left: auto; - margin-right: auto; -} - -table.contentstable p.biglink { - line-height: 150%; -} - -a.biglink { - font-size: 1.3em; -} - -span.linkdescr { - font-style: italic; - padding-top: 5px; - font-size: 90%; -} - -/* -- general index --------------------------------------------------------- */ - -table.indextable { - width: 100%; -} - -table.indextable td { - text-align: left; - vertical-align: top; -} - -table.indextable ul { - margin-top: 0; - margin-bottom: 0; - list-style-type: none; -} - -table.indextable > tbody > tr > td > ul { - padding-left: 0em; -} - -table.indextable tr.pcap { - height: 10px; -} - -table.indextable tr.cap { - margin-top: 10px; - background-color: #f2f2f2; -} - -img.toggler { - margin-right: 3px; - margin-top: 3px; - cursor: pointer; -} - -div.modindex-jumpbox { - border-top: 1px solid #ddd; - border-bottom: 1px solid #ddd; - margin: 1em 0 1em 0; - padding: 0.4em; -} - -div.genindex-jumpbox { - border-top: 1px solid #ddd; - border-bottom: 1px solid #ddd; - margin: 1em 0 1em 0; - padding: 0.4em; -} - -/* -- domain module index --------------------------------------------------- */ - -table.modindextable td { - padding: 2px; - border-collapse: collapse; -} - -/* -- general body styles --------------------------------------------------- */ - 
-div.body { - min-width: 360px; - max-width: 800px; -} - -div.body p, div.body dd, div.body li, div.body blockquote { - -moz-hyphens: auto; - -ms-hyphens: auto; - -webkit-hyphens: auto; - hyphens: auto; -} - -a.headerlink { - visibility: hidden; -} - -h1:hover > a.headerlink, -h2:hover > a.headerlink, -h3:hover > a.headerlink, -h4:hover > a.headerlink, -h5:hover > a.headerlink, -h6:hover > a.headerlink, -dt:hover > a.headerlink, -caption:hover > a.headerlink, -p.caption:hover > a.headerlink, -div.code-block-caption:hover > a.headerlink { - visibility: visible; -} - -div.body p.caption { - text-align: inherit; -} - -div.body td { - text-align: left; -} - -.first { - margin-top: 0 !important; -} - -p.rubric { - margin-top: 30px; - font-weight: bold; -} - -img.align-left, figure.align-left, .figure.align-left, object.align-left { - clear: left; - float: left; - margin-right: 1em; -} - -img.align-right, figure.align-right, .figure.align-right, object.align-right { - clear: right; - float: right; - margin-left: 1em; -} - -img.align-center, figure.align-center, .figure.align-center, object.align-center { - display: block; - margin-left: auto; - margin-right: auto; -} - -img.align-default, figure.align-default, .figure.align-default { - display: block; - margin-left: auto; - margin-right: auto; -} - -.align-left { - text-align: left; -} - -.align-center { - text-align: center; -} - -.align-default { - text-align: center; -} - -.align-right { - text-align: right; -} - -/* -- sidebars -------------------------------------------------------------- */ - -div.sidebar, -aside.sidebar { - margin: 0 0 0.5em 1em; - border: 1px solid #ddb; - padding: 7px; - background-color: #ffe; - width: 40%; - float: right; - clear: right; - overflow-x: auto; -} - -p.sidebar-title { - font-weight: bold; -} -nav.contents, -aside.topic, -div.admonition, div.topic, blockquote { - clear: left; -} - -/* -- topics ---------------------------------------------------------------- */ -nav.contents, -aside.topic, -div.topic { - border: 1px solid #ccc; - padding: 7px; - margin: 10px 0 10px 0; -} - -p.topic-title { - font-size: 1.1em; - font-weight: bold; - margin-top: 10px; -} - -/* -- admonitions ----------------------------------------------------------- */ - -div.admonition { - margin-top: 10px; - margin-bottom: 10px; - padding: 7px; -} - -div.admonition dt { - font-weight: bold; -} - -p.admonition-title { - margin: 0px 10px 5px 0px; - font-weight: bold; -} - -div.body p.centered { - text-align: center; - margin-top: 25px; -} - -/* -- content of sidebars/topics/admonitions -------------------------------- */ - -div.sidebar > :last-child, -aside.sidebar > :last-child, -nav.contents > :last-child, -aside.topic > :last-child, -div.topic > :last-child, -div.admonition > :last-child { - margin-bottom: 0; -} - -div.sidebar::after, -aside.sidebar::after, -nav.contents::after, -aside.topic::after, -div.topic::after, -div.admonition::after, -blockquote::after { - display: block; - content: ''; - clear: both; -} - -/* -- tables ---------------------------------------------------------------- */ - -table.docutils { - margin-top: 10px; - margin-bottom: 10px; - border: 0; - border-collapse: collapse; -} - -table.align-center { - margin-left: auto; - margin-right: auto; -} - -table.align-default { - margin-left: auto; - margin-right: auto; -} - -table caption span.caption-number { - font-style: italic; -} - -table caption span.caption-text { -} - -table.docutils td, table.docutils th { - padding: 1px 8px 1px 5px; - border-top: 0; - 
border-left: 0; - border-right: 0; - border-bottom: 1px solid #aaa; -} - -th { - text-align: left; - padding-right: 5px; -} - -table.citation { - border-left: solid 1px gray; - margin-left: 1px; -} - -table.citation td { - border-bottom: none; -} - -th > :first-child, -td > :first-child { - margin-top: 0px; -} - -th > :last-child, -td > :last-child { - margin-bottom: 0px; -} - -/* -- figures --------------------------------------------------------------- */ - -div.figure, figure { - margin: 0.5em; - padding: 0.5em; -} - -div.figure p.caption, figcaption { - padding: 0.3em; -} - -div.figure p.caption span.caption-number, -figcaption span.caption-number { - font-style: italic; -} - -div.figure p.caption span.caption-text, -figcaption span.caption-text { -} - -/* -- field list styles ----------------------------------------------------- */ - -table.field-list td, table.field-list th { - border: 0 !important; -} - -.field-list ul { - margin: 0; - padding-left: 1em; -} - -.field-list p { - margin: 0; -} - -.field-name { - -moz-hyphens: manual; - -ms-hyphens: manual; - -webkit-hyphens: manual; - hyphens: manual; -} - -/* -- hlist styles ---------------------------------------------------------- */ - -table.hlist { - margin: 1em 0; -} - -table.hlist td { - vertical-align: top; -} - -/* -- object description styles --------------------------------------------- */ - -.sig { - font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; -} - -.sig-name, code.descname { - background-color: transparent; - font-weight: bold; -} - -.sig-name { - font-size: 1.1em; -} - -code.descname { - font-size: 1.2em; -} - -.sig-prename, code.descclassname { - background-color: transparent; -} - -.optional { - font-size: 1.3em; -} - -.sig-paren { - font-size: larger; -} - -.sig-param.n { - font-style: italic; -} - -/* C++ specific styling */ - -.sig-inline.c-texpr, -.sig-inline.cpp-texpr { - font-family: unset; -} - -.sig.c .k, .sig.c .kt, -.sig.cpp .k, .sig.cpp .kt { - color: #0033B3; -} - -.sig.c .m, -.sig.cpp .m { - color: #1750EB; -} - -.sig.c .s, .sig.c .sc, -.sig.cpp .s, .sig.cpp .sc { - color: #067D17; -} - - -/* -- other body styles ----------------------------------------------------- */ - -ol.arabic { - list-style: decimal; -} - -ol.loweralpha { - list-style: lower-alpha; -} - -ol.upperalpha { - list-style: upper-alpha; -} - -ol.lowerroman { - list-style: lower-roman; -} - -ol.upperroman { - list-style: upper-roman; -} - -:not(li) > ol > li:first-child > :first-child, -:not(li) > ul > li:first-child > :first-child { - margin-top: 0px; -} - -:not(li) > ol > li:last-child > :last-child, -:not(li) > ul > li:last-child > :last-child { - margin-bottom: 0px; -} - -ol.simple ol p, -ol.simple ul p, -ul.simple ol p, -ul.simple ul p { - margin-top: 0; -} - -ol.simple > li:not(:first-child) > p, -ul.simple > li:not(:first-child) > p { - margin-top: 0; -} - -ol.simple p, -ul.simple p { - margin-bottom: 0; -} -aside.footnote > span, -div.citation > span { - float: left; -} -aside.footnote > span:last-of-type, -div.citation > span:last-of-type { - padding-right: 0.5em; -} -aside.footnote > p { - margin-left: 2em; -} -div.citation > p { - margin-left: 4em; -} -aside.footnote > p:last-of-type, -div.citation > p:last-of-type { - margin-bottom: 0em; -} -aside.footnote > p:last-of-type:after, -div.citation > p:last-of-type:after { - content: ""; - clear: both; -} - -dl.field-list { - display: grid; - grid-template-columns: fit-content(30%) auto; -} - -dl.field-list > dt { - font-weight: bold; - 
word-break: break-word; - padding-left: 0.5em; - padding-right: 5px; -} - -dl.field-list > dd { - padding-left: 0.5em; - margin-top: 0em; - margin-left: 0em; - margin-bottom: 0em; -} - -dl { - margin-bottom: 15px; -} - -dd > :first-child { - margin-top: 0px; -} - -dd ul, dd table { - margin-bottom: 10px; -} - -dd { - margin-top: 3px; - margin-bottom: 10px; - margin-left: 30px; -} - -dl > dd:last-child, -dl > dd:last-child > :last-child { - margin-bottom: 0; -} - -dt:target, span.highlighted { - background-color: #fbe54e; -} - -rect.highlighted { - fill: #fbe54e; -} - -dl.glossary dt { - font-weight: bold; - font-size: 1.1em; -} - -.versionmodified { - font-style: italic; -} - -.system-message { - background-color: #fda; - padding: 5px; - border: 3px solid red; -} - -.footnote:target { - background-color: #ffa; -} - -.line-block { - display: block; - margin-top: 1em; - margin-bottom: 1em; -} - -.line-block .line-block { - margin-top: 0; - margin-bottom: 0; - margin-left: 1.5em; -} - -.guilabel, .menuselection { - font-family: sans-serif; -} - -.accelerator { - text-decoration: underline; -} - -.classifier { - font-style: oblique; -} - -.classifier:before { - font-style: normal; - margin: 0 0.5em; - content: ":"; - display: inline-block; -} - -abbr, acronym { - border-bottom: dotted 1px; - cursor: help; -} - -/* -- code displays --------------------------------------------------------- */ - -pre { - overflow: auto; - overflow-y: hidden; /* fixes display issues on Chrome browsers */ -} - -pre, div[class*="highlight-"] { - clear: both; -} - -span.pre { - -moz-hyphens: none; - -ms-hyphens: none; - -webkit-hyphens: none; - hyphens: none; - white-space: nowrap; -} - -div[class*="highlight-"] { - margin: 1em 0; -} - -td.linenos pre { - border: 0; - background-color: transparent; - color: #aaa; -} - -table.highlighttable { - display: block; -} - -table.highlighttable tbody { - display: block; -} - -table.highlighttable tr { - display: flex; -} - -table.highlighttable td { - margin: 0; - padding: 0; -} - -table.highlighttable td.linenos { - padding-right: 0.5em; -} - -table.highlighttable td.code { - flex: 1; - overflow: hidden; -} - -.highlight .hll { - display: block; -} - -div.highlight pre, -table.highlighttable pre { - margin: 0; -} - -div.code-block-caption + div { - margin-top: 0; -} - -div.code-block-caption { - margin-top: 1em; - padding: 2px 5px; - font-size: small; -} - -div.code-block-caption code { - background-color: transparent; -} - -table.highlighttable td.linenos, -span.linenos, -div.highlight span.gp { /* gp: Generic.Prompt */ - user-select: none; - -webkit-user-select: text; /* Safari fallback only */ - -webkit-user-select: none; /* Chrome/Safari */ - -moz-user-select: none; /* Firefox */ - -ms-user-select: none; /* IE10+ */ -} - -div.code-block-caption span.caption-number { - padding: 0.1em 0.3em; - font-style: italic; -} - -div.code-block-caption span.caption-text { -} - -div.literal-block-wrapper { - margin: 1em 0; -} - -code.xref, a code { - background-color: transparent; - font-weight: bold; -} - -h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { - background-color: transparent; -} - -.viewcode-link { - float: right; -} - -.viewcode-back { - float: right; - font-family: sans-serif; -} - -div.viewcode-block:target { - margin: -1px -10px; - padding: 0 10px; -} - -/* -- math display ---------------------------------------------------------- */ - -img.math { - vertical-align: middle; -} - -div.body div.math p { - text-align: center; -} - -span.eqno { - float: right; -} - 
-span.eqno a.headerlink { - position: absolute; - z-index: 1; -} - -div.math:hover a.headerlink { - visibility: visible; -} - -/* -- printout stylesheet --------------------------------------------------- */ - -@media print { - div.document, - div.documentwrapper, - div.bodywrapper { - margin: 0 !important; - width: 100%; - } - - div.sphinxsidebar, - div.related, - div.footer, - #top-link { - display: none; - } -} \ No newline at end of file diff --git a/docs/build/html/_static/bizstyle.css b/docs/build/html/_static/bizstyle.css deleted file mode 100644 index ec32aa0..0000000 --- a/docs/build/html/_static/bizstyle.css +++ /dev/null @@ -1,508 +0,0 @@ -/* - * bizstyle.css_t - * ~~~~~~~~~~~~~~ - * - * Sphinx stylesheet -- business style theme. - * - * :copyright: Copyright 2011-2014 by Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * - */ - -@import url("basic.css"); - -/* -- page layout ----------------------------------------------------------- */ - -body { - font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', - 'Verdana', sans-serif; - font-size: 14px; - letter-spacing: -0.01em; - line-height: 150%; - text-align: center; - background-color: white; - background-image: url(background_b01.png); - color: black; - padding: 0; - border-right: 1px solid #336699; - border-left: 1px solid #336699; - - margin: 0px 40px 0px 40px; -} - -div.document { - background-color: white; - text-align: left; - background-repeat: repeat-x; - - -moz-box-shadow: 2px 2px 5px #000; - -webkit-box-shadow: 2px 2px 5px #000; -} - -div.documentwrapper { - float: left; - width: 100%; -} - -div.bodywrapper { - margin: 0 0 0 240px; - border-left: 1px solid #ccc; -} - -div.body { - margin: 0; - padding: 0.5em 20px 20px 20px; -} -div.bodywrapper { - margin: 0 0 0 calc(210px + 30px); -} - -div.related { - font-size: 1em; - - -moz-box-shadow: 2px 2px 5px #000; - -webkit-box-shadow: 2px 2px 5px #000; -} - -div.related ul { - background-color: #336699; - height: 100%; - overflow: hidden; - border-top: 1px solid #ddd; - border-bottom: 1px solid #ddd; -} - -div.related ul li { - color: white; - margin: 0; - padding: 0; - height: 2em; - float: left; -} - -div.related ul li.right { - float: right; - margin-right: 5px; -} - -div.related ul li a { - margin: 0; - padding: 0 5px 0 5px; - line-height: 1.75em; - color: #fff; -} - -div.related ul li a:hover { - color: #fff; - text-decoration: underline; -} - -div.sphinxsidebarwrapper { - padding: 0; -} - -div.sphinxsidebar { - padding: 0.5em 12px 12px 12px; - width: 210px; - font-size: 1em; - text-align: left; -} - -div.sphinxsidebar h3, div.sphinxsidebar h4 { - margin: 1em 0 0.5em 0; - font-size: 1em; - padding: 0.1em 0 0.1em 0.5em; - color: white; - border: 1px solid #336699; - background-color: #336699; -} - -div.sphinxsidebar h3 a { - color: white; -} - -div.sphinxsidebar ul { - padding-left: 1.5em; - margin-top: 7px; - padding: 0; - line-height: 130%; -} - -div.sphinxsidebar ul ul { - margin-left: 20px; -} - -div.sphinxsidebar input { - border: 1px solid #336699; -} - -div.footer { - background-color: white; - color: #336699; - padding: 3px 8px 3px 0; - clear: both; - font-size: 0.8em; - text-align: right; - border-bottom: 1px solid #336699; - - -moz-box-shadow: 2px 2px 5px #000; - -webkit-box-shadow: 2px 2px 5px #000; -} - -div.footer a { - color: #336699; - text-decoration: underline; -} - -/* -- body styles ----------------------------------------------------------- */ - -p { - margin: 0.8em 0 0.5em 0; -} - -a { - color: #336699; - text-decoration: 
none; -} - -a:hover { - color: #336699; - text-decoration: underline; -} - -div.body a { - text-decoration: underline; -} - -h1, h2, h3 { - color: #336699; -} - -h1 { - margin: 0; - padding: 0.7em 0 0.3em 0; - font-size: 1.5em; -} - -h2 { - margin: 1.3em 0 0.2em 0; - font-size: 1.35em; - padding-bottom: .5em; - border-bottom: 1px solid #336699; -} - -h3 { - margin: 1em 0 -0.3em 0; - font-size: 1.2em; - padding-bottom: .3em; - border-bottom: 1px solid #CCCCCC; -} - -div.body h1 a, div.body h2 a, div.body h3 a, -div.body h4 a, div.body h5 a, div.body h6 a { - color: black!important; -} - -h1 a.anchor, h2 a.anchor, h3 a.anchor, -h4 a.anchor, h5 a.anchor, h6 a.anchor { - display: none; - margin: 0 0 0 0.3em; - padding: 0 0.2em 0 0.2em; - color: #aaa!important; -} - -h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor, -h5:hover a.anchor, h6:hover a.anchor { - display: inline; -} - -h1 a.anchor:hover, h2 a.anchor:hover, h3 a.anchor:hover, h4 a.anchor:hover, -h5 a.anchor:hover, h6 a.anchor:hover { - color: #777; - background-color: #eee; -} - -a.headerlink { - color: #c60f0f!important; - font-size: 1em; - margin-left: 6px; - padding: 0 4px 0 4px; - text-decoration: none!important; -} - -a.headerlink:hover { - background-color: #ccc; - color: white!important; -} - -cite, code, tt { - font-family: 'Consolas', 'Deja Vu Sans Mono', - 'Bitstream Vera Sans Mono', monospace; - font-size: 0.95em; - letter-spacing: 0.01em; -} - -code { - background-color: #F2F2F2; - border-bottom: 1px solid #ddd; - color: #333; -} - -code.descname, code.descclassname, code.xref { - border: 0; -} - -hr { - border: 1px solid #abc; - margin: 2em; -} - -a code { - border: 0; - color: #CA7900; -} - -a code:hover { - color: #2491CF; -} - -pre { - background-color: transparent !important; - font-family: 'Consolas', 'Deja Vu Sans Mono', - 'Bitstream Vera Sans Mono', monospace; - font-size: 0.95em; - letter-spacing: 0.015em; - line-height: 120%; - padding: 0.5em; - border-right: 5px solid #ccc; - border-left: 5px solid #ccc; -} - -pre a { - color: inherit; - text-decoration: underline; -} - -td.linenos pre { - padding: 0.5em 0; -} - -div.quotebar { - background-color: #f8f8f8; - max-width: 250px; - float: right; - padding: 2px 7px; - border: 1px solid #ccc; -} -nav.contents, -aside.topic, - -div.topic { - background-color: #f8f8f8; -} - -table { - border-collapse: collapse; - margin: 0 -0.5em 0 -0.5em; -} - -table td, table th { - padding: 0.2em 0.5em 0.2em 0.5em; -} - -div.admonition { - font-size: 0.9em; - margin: 1em 0 1em 0; - border: 3px solid #cccccc; - background-color: #f7f7f7; - padding: 0; -} - -div.admonition p { - margin: 0.5em 1em 0.5em 1em; - padding: 0; -} - -div.admonition li p { - margin-left: 0; -} - -div.admonition pre, div.warning pre { - margin: 0; -} - -div.highlight { - margin: 0.4em 1em; -} - -div.admonition p.admonition-title { - margin: 0; - padding: 0.1em 0 0.1em 0.5em; - color: white; - border-bottom: 3px solid #cccccc; - font-weight: bold; - background-color: #165e83; -} - -div.danger { border: 3px solid #f0908d; background-color: #f0cfa0; } -div.error { border: 3px solid #f0908d; background-color: #ede4cd; } -div.warning { border: 3px solid #f8b862; background-color: #f0cfa0; } -div.caution { border: 3px solid #f8b862; background-color: #ede4cd; } -div.attention { border: 3px solid #f8b862; background-color: #f3f3f3; } -div.important { border: 3px solid #f0cfa0; background-color: #ede4cd; } -div.note { border: 3px solid #f0cfa0; background-color: #f3f3f3; } -div.hint { border: 
3px solid #bed2c3; background-color: #f3f3f3; } -div.tip { border: 3px solid #bed2c3; background-color: #f3f3f3; } - -div.danger p.admonition-title, div.error p.admonition-title { - background-color: #b7282e; - border-bottom: 3px solid #f0908d; -} - -div.caution p.admonition-title, -div.warning p.admonition-title, -div.attention p.admonition-title { - background-color: #f19072; - border-bottom: 3px solid #f8b862; -} - -div.note p.admonition-title, div.important p.admonition-title { - background-color: #f8b862; - border-bottom: 3px solid #f0cfa0; -} - -div.hint p.admonition-title, div.tip p.admonition-title { - background-color: #7ebea5; - border-bottom: 3px solid #bed2c3; -} - -div.admonition ul, div.admonition ol, -div.warning ul, div.warning ol { - margin: 0.1em 0.5em 0.5em 3em; - padding: 0; -} - -div.versioninfo { - margin: 1em 0 0 0; - border: 1px solid #ccc; - background-color: #DDEAF0; - padding: 8px; - line-height: 1.3em; - font-size: 0.9em; -} - -.viewcode-back { - font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', - 'Verdana', sans-serif; -} - -div.viewcode-block:target { - background-color: #f4debf; - border-top: 1px solid #ac9; - border-bottom: 1px solid #ac9; -} - -p.versionchanged span.versionmodified { - font-size: 0.9em; - margin-right: 0.2em; - padding: 0.1em; - background-color: #DCE6A0; -} - -dl.field-list > dt { - color: white; - background-color: #82A0BE; -} - -dl.field-list > dd { - background-color: #f7f7f7; -} - -/* -- table styles ---------------------------------------------------------- */ - -table.docutils { - margin: 1em 0; - padding: 0; - border: 1px solid white; - background-color: #f7f7f7; -} - -table.docutils td, table.docutils th { - padding: 1px 8px 1px 5px; - border-top: 0; - border-left: 0; - border-right: 1px solid white; - border-bottom: 1px solid white; -} - -table.docutils td p { - margin-top: 0; - margin-bottom: 0.3em; -} - -table.field-list td, table.field-list th { - border: 0 !important; - word-break: break-word; -} - -table.footnote td, table.footnote th { - border: 0 !important; -} - -th { - color: white; - text-align: left; - padding-right: 5px; - background-color: #82A0BE; -} - -div.literal-block-wrapper div.code-block-caption { - background-color: #EEE; - border-style: solid; - border-color: #CCC; - border-width: 1px 5px; -} - -/* WIDE DESKTOP STYLE */ -@media only screen and (min-width: 1176px) { -body { - margin: 0 40px 0 40px; -} -} - -/* TABLET STYLE */ -@media only screen and (min-width: 768px) and (max-width: 991px) { -body { - margin: 0 40px 0 40px; -} -} - -/* MOBILE LAYOUT (PORTRAIT/320px) */ -@media only screen and (max-width: 767px) { -body { - margin: 0; -} -div.bodywrapper { - margin: 0; - width: 100%; - border: none; -} -div.sphinxsidebar { - display: none; -} -} - -/* MOBILE LAYOUT (LANDSCAPE/480px) */ -@media only screen and (min-width: 480px) and (max-width: 767px) { -body { - margin: 0 20px 0 20px; -} -} - -/* RETINA OVERRIDES */ -@media -only screen and (-webkit-min-device-pixel-ratio: 2), -only screen and (min-device-pixel-ratio: 2) { -} - -/* -- end ------------------------------------------------------------------- */ \ No newline at end of file diff --git a/docs/build/html/_static/bizstyle.js b/docs/build/html/_static/bizstyle.js deleted file mode 100644 index 4d5d01d..0000000 --- a/docs/build/html/_static/bizstyle.js +++ /dev/null @@ -1,30 +0,0 @@ -// -// bizstyle.js -// ~~~~~~~~~~~ -// -// Sphinx javascript -- for bizstyle theme. 
-// -// This theme was created by referring to 'sphinxdoc' -// -// :copyright: Copyright 2012-2014 by Sphinx team, see AUTHORS. -// :license: BSD, see LICENSE for details. -// -const initialiseBizStyle = () => { - if (navigator.userAgent.indexOf("iPhone") > 0 || navigator.userAgent.indexOf("Android") > 0) { - document.querySelector("li.nav-item-0 a").innerText = "Top" - } - const truncator = item => {if (item.textContent.length > 20) { - item.title = item.innerText - item.innerText = item.innerText.substr(0, 17) + "..." - } - } - document.querySelectorAll("div.related:first ul li:not(.right) a").slice(1).forEach(truncator); - document.querySelectorAll("div.related:last ul li:not(.right) a").slice(1).forEach(truncator); -} - -window.addEventListener("resize", - () => (document.querySelector("li.nav-item-0 a").innerText = (window.innerWidth <= 776) ? "Top" : "QuaPy 0.1.7 documentation") -) - -if (document.readyState !== "loading") initialiseBizStyle() -else document.addEventListener("DOMContentLoaded", initialiseBizStyle) \ No newline at end of file diff --git a/docs/build/html/_static/contents.png b/docs/build/html/_static/contents.png new file mode 100644 index 0000000..6c59aa1 Binary files /dev/null and b/docs/build/html/_static/contents.png differ diff --git a/docs/build/html/_static/css/badge_only.css b/docs/build/html/_static/css/badge_only.css new file mode 100644 index 0000000..c718cee --- /dev/null +++ b/docs/build/html/_static/css/badge_only.css @@ -0,0 +1 @@ +.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions 
.rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} \ No newline at end of file diff --git a/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff b/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff new file mode 100644 index 0000000..6cb6000 Binary files /dev/null and b/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff differ diff --git a/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 b/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 new file mode 100644 index 0000000..7059e23 Binary files /dev/null and b/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 differ diff --git a/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff b/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff new file mode 100644 index 0000000..f815f63 Binary files /dev/null and b/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff differ diff --git a/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 b/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 new file mode 100644 index 0000000..f2c76e5 Binary files /dev/null and b/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 differ diff --git a/docs/build/html/_static/css/fonts/fontawesome-webfont.eot b/docs/build/html/_static/css/fonts/fontawesome-webfont.eot new file mode 100644 index 0000000..e9f60ca Binary files /dev/null and b/docs/build/html/_static/css/fonts/fontawesome-webfont.eot differ diff --git a/docs/build/html/_static/css/fonts/fontawesome-webfont.svg b/docs/build/html/_static/css/fonts/fontawesome-webfont.svg new file mode 100644 index 0000000..855c845 --- /dev/null +++ b/docs/build/html/_static/css/fonts/fontawesome-webfont.svg @@ -0,0 +1,2671 @@ + + + + +Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016 + By ,,, +Copyright Dave Gandy 2016. All rights reserved. 
+ [~2,660 added lines of SVG &lt;glyph&gt; definitions for the FontAwesome icon set; the glyph path data is not recoverable from this view] diff --git a/docs/build/html/_static/css/fonts/fontawesome-webfont.ttf b/docs/build/html/_static/css/fonts/fontawesome-webfont.ttf new file mode 100644 index 0000000..35acda2 Binary files /dev/null and b/docs/build/html/_static/css/fonts/fontawesome-webfont.ttf differ diff --git a/docs/build/html/_static/css/fonts/fontawesome-webfont.woff b/docs/build/html/_static/css/fonts/fontawesome-webfont.woff new file mode 100644 index 0000000..400014a Binary files /dev/null and b/docs/build/html/_static/css/fonts/fontawesome-webfont.woff differ diff --git a/docs/build/html/_static/css/fonts/fontawesome-webfont.woff2 b/docs/build/html/_static/css/fonts/fontawesome-webfont.woff2 new file mode 100644 index 0000000..4d13fc6 Binary files /dev/null and b/docs/build/html/_static/css/fonts/fontawesome-webfont.woff2 differ diff --git a/docs/build/html/_static/css/fonts/lato-bold-italic.woff b/docs/build/html/_static/css/fonts/lato-bold-italic.woff new file mode 100644 index 0000000..88ad05b Binary files /dev/null and b/docs/build/html/_static/css/fonts/lato-bold-italic.woff differ diff --git a/docs/build/html/_static/css/fonts/lato-bold-italic.woff2 b/docs/build/html/_static/css/fonts/lato-bold-italic.woff2 new file mode 100644 index 0000000..c4e3d80 Binary files /dev/null and b/docs/build/html/_static/css/fonts/lato-bold-italic.woff2 differ diff --git a/docs/build/html/_static/css/fonts/lato-bold.woff b/docs/build/html/_static/css/fonts/lato-bold.woff new file mode 100644 index 0000000..c6dff51 Binary files /dev/null and b/docs/build/html/_static/css/fonts/lato-bold.woff differ diff --git a/docs/build/html/_static/css/fonts/lato-bold.woff2 b/docs/build/html/_static/css/fonts/lato-bold.woff2 new file mode 100644 index 0000000..bb19504 Binary files /dev/null and b/docs/build/html/_static/css/fonts/lato-bold.woff2 differ diff --git a/docs/build/html/_static/css/fonts/lato-normal-italic.woff b/docs/build/html/_static/css/fonts/lato-normal-italic.woff new file mode 100644 index 0000000..76114bc Binary files /dev/null and b/docs/build/html/_static/css/fonts/lato-normal-italic.woff differ diff --git
a/docs/build/html/_static/css/fonts/lato-normal-italic.woff2 b/docs/build/html/_static/css/fonts/lato-normal-italic.woff2 new file mode 100644 index 0000000..3404f37 Binary files /dev/null and b/docs/build/html/_static/css/fonts/lato-normal-italic.woff2 differ diff --git a/docs/build/html/_static/css/fonts/lato-normal.woff b/docs/build/html/_static/css/fonts/lato-normal.woff new file mode 100644 index 0000000..ae1307f Binary files /dev/null and b/docs/build/html/_static/css/fonts/lato-normal.woff differ diff --git a/docs/build/html/_static/css/fonts/lato-normal.woff2 b/docs/build/html/_static/css/fonts/lato-normal.woff2 new file mode 100644 index 0000000..3bf9843 Binary files /dev/null and b/docs/build/html/_static/css/fonts/lato-normal.woff2 differ diff --git a/docs/build/html/_static/css/theme.css b/docs/build/html/_static/css/theme.css new file mode 100644 index 0000000..19a446a --- /dev/null +++ b/docs/build/html/_static/css/theme.css @@ -0,0 +1,4 @@ +html{box-sizing:border-box}*,:after,:before{box-sizing:inherit}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}[hidden],audio:not([controls]){display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:active,a:hover{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:700}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;text-decoration:none}ins,mark{color:#000}mark{background:#ff0;font-style:italic;font-weight:700}.rst-content code,.rst-content tt,code,kbd,pre,samp{font-family:monospace,serif;_font-family:courier new,monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:after,q:before{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}dl,ol,ul{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure,form{margin:0}label{cursor:pointer}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type=button],input[type=reset],input[type=submit]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type=search]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}textarea{resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none!important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media 
print{body,html,section{background:none!important}*{box-shadow:none!important;text-shadow:none!important;filter:none!important;-ms-filter:none!important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="#"]:after,a[href^="javascript:"]:after{content:""}blockquote,pre{page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}img{max-width:100%!important}@page{margin:.5cm}.rst-content .toctree-wrapper>p.caption,h2,h3,p{orphans:3;widows:3}.rst-content .toctree-wrapper>p.caption,h2,h3{page-break-after:avoid}}.btn,.fa:before,.icon:before,.rst-content .admonition,.rst-content .admonition-title:before,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .code-block-caption .headerlink:before,.rst-content .danger,.rst-content .eqno .headerlink:before,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-alert,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before,input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week],select,textarea{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}/*! 
+ * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome + * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) + */@font-face{font-family:FontAwesome;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713);src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix&v=4.7.0) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#fontawesomeregular) format("svg");font-weight:400;font-style:normal}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{display:inline-block;font:normal normal normal 14px/1 FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:.08em solid #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa-pull-left.icon,.fa.fa-pull-left,.rst-content .code-block-caption .fa-pull-left.headerlink,.rst-content .eqno .fa-pull-left.headerlink,.rst-content .fa-pull-left.admonition-title,.rst-content code.download span.fa-pull-left:first-child,.rst-content dl dt .fa-pull-left.headerlink,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content p .fa-pull-left.headerlink,.rst-content table>caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.wy-menu-vertical li.current>a button.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-left.toctree-expand,.wy-menu-vertical li button.fa-pull-left.toctree-expand{margin-right:.3em}.fa-pull-right.icon,.fa.fa-pull-right,.rst-content .code-block-caption .fa-pull-right.headerlink,.rst-content .eqno .fa-pull-right.headerlink,.rst-content .fa-pull-right.admonition-title,.rst-content code.download span.fa-pull-right:first-child,.rst-content dl dt .fa-pull-right.headerlink,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content h5 .fa-pull-right.headerlink,.rst-content h6 
.fa-pull-right.headerlink,.rst-content p .fa-pull-right.headerlink,.rst-content table>caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.wy-menu-vertical li.current>a button.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-right.toctree-expand,.wy-menu-vertical li button.fa-pull-right.toctree-expand{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.pull-left.icon,.rst-content .code-block-caption .pull-left.headerlink,.rst-content .eqno .pull-left.headerlink,.rst-content .pull-left.admonition-title,.rst-content code.download span.pull-left:first-child,.rst-content dl dt .pull-left.headerlink,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content p .pull-left.headerlink,.rst-content table>caption .pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.wy-menu-vertical li.current>a button.pull-left.toctree-expand,.wy-menu-vertical li.on a button.pull-left.toctree-expand,.wy-menu-vertical li button.pull-left.toctree-expand{margin-right:.3em}.fa.pull-right,.pull-right.icon,.rst-content .code-block-caption .pull-right.headerlink,.rst-content .eqno .pull-right.headerlink,.rst-content .pull-right.admonition-title,.rst-content code.download span.pull-right:first-child,.rst-content dl dt .pull-right.headerlink,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content p .pull-right.headerlink,.rst-content table>caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.wy-menu-vertical li.current>a button.pull-right.toctree-expand,.wy-menu-vertical li.on a button.pull-right.toctree-expand,.wy-menu-vertical li button.pull-right.toctree-expand{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s linear infinite;animation:fa-spin 2s linear infinite}.fa-pulse{-webkit-animation:fa-spin 1s steps(8) infinite;animation:fa-spin 1s steps(8) infinite}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scaleX(-1);-ms-transform:scaleX(-1);transform:scaleX(-1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scaleY(-1);-ms-transform:scaleY(-1);transform:scaleY(-1)}:root .fa-flip-horizontal,:root .fa-flip-vertical,:root .fa-rotate-90,:root .fa-rotate-180,:root 
.fa-rotate-270{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-close:before,.fa-remove:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-cog:before,.fa-gear:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{content:""}.fa-arrow-circle-o-down:before{content:""}.fa-arrow-circle-o-up:before{content:""}.fa-inbox:before{content:""}.fa-play-circle-o:before{content:""}.fa-repeat:before,.fa-rotate-right:before{content:""}.fa-refresh:before{content:""}.fa-list-alt:before{content:""}.fa-lock:before{content:""}.fa-flag:before{content:""}.fa-headphones:before{content:""}.fa-volume-off:before{content:""}.fa-volume-down:before{content:""}.fa-volume-up:before{content:""}.fa-qrcode:before{content:""}.fa-barcode:before{content:""}.fa-tag:before{content:""}.fa-tags:before{content:""}.fa-book:before,.icon-book:before{content:""}.fa-bookmark:before{content:""}.fa-print:before{content:""}.fa-camera:before{content:""}.fa-font:before{content:""}.fa-bold:before{content:""}.fa-italic:before{content:""}.fa-text-height:before{content:""}.fa-text-width:before{content:""}.fa-align-left:before{content:""}.fa-align-center:before{content:""}.fa-align-right:before{content:""}.fa-align-justify:before{content:""}.fa-list:before{content:""}.fa-dedent:before,.fa-outdent:before{content:""}.fa-indent:before{content:""}.fa-video-camera:before{content:""}.fa-image:before,.fa-photo:before,.fa-picture-o:before{content:""}.fa-pencil:before{content:""}.fa-map-marker:before{content:""}.fa-adjust:before{content:""}.fa-tint:before{content:""}.fa-edit:before,.fa-pencil-square-o:before{content:""}.fa-share-square-o:before{content:""}.fa-check-square-o:before{content:""}.fa-arrows:before{content:""}.fa-step-backward:before{content:""}.fa-fast-backward:before{content:""}.fa-backward:before{content:""}.fa-play:before{content:""}.fa-pause:before{content:""}.fa-stop:before{content:""}.fa-forward:before{content:""}.fa-fast-forward:before{content:""}.fa-step-forward:before{content:""}.fa-eject:before{content:""}.fa-chevron-left:before{content:""}.fa-chevron-right:before{content:""}.fa-plus-circle:before{content:""}.fa-minus-circle:before{content:""}.fa-times-circle:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before{content:""}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success 
.wy-input-context:before{content:""}.fa-question-circle:before{content:""}.fa-info-circle:before{content:""}.fa-crosshairs:before{content:""}.fa-times-circle-o:before{content:""}.fa-check-circle-o:before{content:""}.fa-ban:before{content:""}.fa-arrow-left:before{content:""}.fa-arrow-right:before{content:""}.fa-arrow-up:before{content:""}.fa-arrow-down:before{content:""}.fa-mail-forward:before,.fa-share:before{content:""}.fa-expand:before{content:""}.fa-compress:before{content:""}.fa-plus:before{content:""}.fa-minus:before{content:""}.fa-asterisk:before{content:""}.fa-exclamation-circle:before,.rst-content .admonition-title:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before{content:""}.fa-gift:before{content:""}.fa-leaf:before{content:""}.fa-fire:before,.icon-fire:before{content:""}.fa-eye:before{content:""}.fa-eye-slash:before{content:""}.fa-exclamation-triangle:before,.fa-warning:before{content:""}.fa-plane:before{content:""}.fa-calendar:before{content:""}.fa-random:before{content:""}.fa-comment:before{content:""}.fa-magnet:before{content:""}.fa-chevron-up:before{content:""}.fa-chevron-down:before{content:""}.fa-retweet:before{content:""}.fa-shopping-cart:before{content:""}.fa-folder:before{content:""}.fa-folder-open:before{content:""}.fa-arrows-v:before{content:""}.fa-arrows-h:before{content:""}.fa-bar-chart-o:before,.fa-bar-chart:before{content:""}.fa-twitter-square:before{content:""}.fa-facebook-square:before{content:""}.fa-camera-retro:before{content:""}.fa-key:before{content:""}.fa-cogs:before,.fa-gears:before{content:""}.fa-comments:before{content:""}.fa-thumbs-o-up:before{content:""}.fa-thumbs-o-down:before{content:""}.fa-star-half:before{content:""}.fa-heart-o:before{content:""}.fa-sign-out:before{content:""}.fa-linkedin-square:before{content:""}.fa-thumb-tack:before{content:""}.fa-external-link:before{content:""}.fa-sign-in:before{content:""}.fa-trophy:before{content:""}.fa-github-square:before{content:""}.fa-upload:before{content:""}.fa-lemon-o:before{content:""}.fa-phone:before{content:""}.fa-square-o:before{content:""}.fa-bookmark-o:before{content:""}.fa-phone-square:before{content:""}.fa-twitter:before{content:""}.fa-facebook-f:before,.fa-facebook:before{content:""}.fa-github:before,.icon-github:before{content:""}.fa-unlock:before{content:""}.fa-credit-card:before{content:""}.fa-feed:before,.fa-rss:before{content:""}.fa-hdd-o:before{content:""}.fa-bullhorn:before{content:""}.fa-bell:before{content:""}.fa-certificate:before{content:""}.fa-hand-o-right:before{content:""}.fa-hand-o-left:before{content:""}.fa-hand-o-up:before{content:""}.fa-hand-o-down:before{content:""}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:""}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:""}.fa-arrow-circle-up:before{content:""}.fa-arrow-circle-down:before{content:""}.fa-globe:before{content:""}.fa-wrench:before{content:""}.fa-tasks:before{content:""}.fa-filter:before{content:""}.fa-briefcase:before{content:""}.fa-arrows-alt:before{content:""}.fa-group:before,.fa-users:before{content:""}.fa-chain:before,.fa-link:before,.icon-link:before{content:""}.fa-cloud:before{content:""}.fa-flask:before{content:""}.fa-cut:before,.fa-scissors:before{content:""}.fa-copy:before,.fa-files-o:before{content:""}.fa-paperclip:before{content:""}.fa-floppy-o:before,.fa-save:before{content:""}.fa
-square:before{content:""}.fa-bars:before,.fa-navicon:before,.fa-reorder:before{content:""}.fa-list-ul:before{content:""}.fa-list-ol:before{content:""}.fa-strikethrough:before{content:""}.fa-underline:before{content:""}.fa-table:before{content:""}.fa-magic:before{content:""}.fa-truck:before{content:""}.fa-pinterest:before{content:""}.fa-pinterest-square:before{content:""}.fa-google-plus-square:before{content:""}.fa-google-plus:before{content:""}.fa-money:before{content:""}.fa-caret-down:before,.icon-caret-down:before,.wy-dropdown .caret:before{content:""}.fa-caret-up:before{content:""}.fa-caret-left:before{content:""}.fa-caret-right:before{content:""}.fa-columns:before{content:""}.fa-sort:before,.fa-unsorted:before{content:""}.fa-sort-desc:before,.fa-sort-down:before{content:""}.fa-sort-asc:before,.fa-sort-up:before{content:""}.fa-envelope:before{content:""}.fa-linkedin:before{content:""}.fa-rotate-left:before,.fa-undo:before{content:""}.fa-gavel:before,.fa-legal:before{content:""}.fa-dashboard:before,.fa-tachometer:before{content:""}.fa-comment-o:before{content:""}.fa-comments-o:before{content:""}.fa-bolt:before,.fa-flash:before{content:""}.fa-sitemap:before{content:""}.fa-umbrella:before{content:""}.fa-clipboard:before,.fa-paste:before{content:""}.fa-lightbulb-o:before{content:""}.fa-exchange:before{content:""}.fa-cloud-download:before{content:""}.fa-cloud-upload:before{content:""}.fa-user-md:before{content:""}.fa-stethoscope:before{content:""}.fa-suitcase:before{content:""}.fa-bell-o:before{content:""}.fa-coffee:before{content:""}.fa-cutlery:before{content:""}.fa-file-text-o:before{content:""}.fa-building-o:before{content:""}.fa-hospital-o:before{content:""}.fa-ambulance:before{content:""}.fa-medkit:before{content:""}.fa-fighter-jet:before{content:""}.fa-beer:before{content:""}.fa-h-square:before{content:""}.fa-plus-square:before{content:""}.fa-angle-double-left:before{content:""}.fa-angle-double-right:before{content:""}.fa-angle-double-up:before{content:""}.fa-angle-double-down:before{content:""}.fa-angle-left:before{content:""}.fa-angle-right:before{content:""}.fa-angle-up:before{content:""}.fa-angle-down:before{content:""}.fa-desktop:before{content:""}.fa-laptop:before{content:""}.fa-tablet:before{content:""}.fa-mobile-phone:before,.fa-mobile:before{content:""}.fa-circle-o:before{content:""}.fa-quote-left:before{content:""}.fa-quote-right:before{content:""}.fa-spinner:before{content:""}.fa-circle:before{content:""}.fa-mail-reply:before,.fa-reply:before{content:""}.fa-github-alt:before{content:""}.fa-folder-o:before{content:""}.fa-folder-open-o:before{content:""}.fa-smile-o:before{content:""}.fa-frown-o:before{content:""}.fa-meh-o:before{content:""}.fa-gamepad:before{content:""}.fa-keyboard-o:before{content:""}.fa-flag-o:before{content:""}.fa-flag-checkered:before{content:""}.fa-terminal:before{content:""}.fa-code:before{content:""}.fa-mail-reply-all:before,.fa-reply-all:before{content:""}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:""}.fa-location-arrow:before{content:""}.fa-crop:before{content:""}.fa-code-fork:before{content:""}.fa-chain-broken:before,.fa-unlink:before{content:""}.fa-question:before{content:""}.fa-info:before{content:""}.fa-exclamation:before{content:""}.fa-superscript:before{content:""}.fa-subscript:before{content:""}.fa-eraser:before{content:""}.fa-puzzle-piece:before{content:""}.fa-microphone:before{content:""}.fa-microphone-slash:
before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:""}.fa-minus-square-o:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-caret-square-o-down:before,.fa-toggle-down:before{content:""}.fa-caret-square-o-up:before,.fa-toggle-up:before{content:""}.fa-caret-square-o-right:before,.fa-toggle-right:before{content:""}.fa-eur:before,.fa-euro:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-inr:before,.fa-rupee:before{content:""}.fa-cny:before,.fa-jpy:before,.fa-rmb:before,.fa-yen:before{content:""}.fa-rouble:before,.fa-rub:before,.fa-ruble:before{content:""}.fa-krw:before,.fa-won:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-apple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{content:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-caret-square-o-left:before,.fa-toggle-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-
vimeo-square:before{content:""}.fa-try:before,.fa-turkish-lira:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li button.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-bank:before,.fa-institution:before,.fa-university:before{content:""}.fa-graduation-cap:before,.fa-mortar-board:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-square:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{content:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-image-o:before,.fa-file-photo-o:before,.fa-file-picture-o:before{content:""}.fa-file-archive-o:before,.fa-file-zip-o:before{content:""}.fa-file-audio-o:before,.fa-file-sound-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-ring:before,.fa-life-saver:before,.fa-support:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-rebel:before,.fa-resistance:before{content:""}.fa-empire:before,.fa-ge:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-hacker-news:before,.fa-y-combinator-square:before,.fa-yc-square:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-paper-plane:before,.fa-send:before{content:""}.fa-paper-plane-o:before,.fa-send-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-futbol-o:before,.fa-soccer-ball-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{content:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discover:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-b
ell-slash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-ils:before,.fa-shekel:before,.fa-sheqel:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-mars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.fa-user-times:before{content:""}.fa-bed:before,.fa-hotel:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-y-combinator:before,.fa-yc:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery-full:before,.fa-battery:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-paper-o:before,.fa-hand-stop-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-register
[… minified CSS omitted: the remainder of this hunk is sphinx_rtd_theme's generated theme.css (bundled Font Awesome 4.7 icon classes plus the Read the Docs theme styles) — checked-in documentation build output, not hand-authored content …]
.rst-current-version:after{clear:both}.rst-content .code-block-caption .rst-versions .rst-current-version .headerlink,.rst-content .eqno .rst-versions .rst-current-version .headerlink,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-content p .rst-versions .rst-current-version .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .icon,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-versions .rst-current-version .rst-content .code-block-caption .headerlink,.rst-versions .rst-current-version .rst-content .eqno .headerlink,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-versions .rst-current-version .rst-content h4 .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-versions .rst-current-version .rst-content p .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-versions .rst-current-version .wy-menu-vertical li button.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version button.toctree-expand{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and 
(max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content .toctree-wrapper>p.caption,.rst-content h1,.rst-content h2,.rst-content h3,.rst-content h4,.rst-content h5,.rst-content h6{margin-bottom:24px}.rst-content img{max-width:100%;height:auto}.rst-content div.figure,.rst-content figure{margin-bottom:24px}.rst-content div.figure .caption-text,.rst-content figure .caption-text{font-style:italic}.rst-content div.figure p:last-child.caption,.rst-content figure p:last-child.caption{margin-bottom:0}.rst-content div.figure.align-center,.rst-content figure.align-center{text-align:center}.rst-content .section>a>img,.rst-content .section>img,.rst-content section>a>img,.rst-content section>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links a.reference.external:after{font-family:FontAwesome;content:"\f08e";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;display:block;overflow:auto}.rst-content div[class^=highlight],.rst-content pre.literal-block{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px}.rst-content div[class^=highlight] div[class^=highlight],.rst-content pre.literal-block div[class^=highlight]{padding:0;border:none;margin:0}.rst-content div[class^=highlight] td.code{width:100%}.rst-content .linenodiv pre{border-right:1px solid #e6e9ea;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^=highlight] pre{white-space:pre;margin:0;padding:12px;display:block;overflow:auto}.rst-content div[class^=highlight] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content .linenodiv pre,.rst-content div[class^=highlight] pre,.rst-content pre.literal-block{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:12px;line-height:1.4}.rst-content div.highlight .gp,.rst-content div.highlight span.linenos{user-select:none;pointer-events:none}.rst-content div.highlight span.linenos{display:inline-block;padding-left:0;padding-right:12px;margin-right:12px;border-right:1px solid #e6e9ea}.rst-content .code-block-caption{font-style:italic;font-size:85%;line-height:1;padding:1em 0;text-align:center}@media print{.rst-content .codeblock,.rst-content div[class^=highlight],.rst-content div[class^=highlight] pre{white-space:pre-wrap}}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning{clear:both}.rst-content .admonition-todo .last,.rst-content .admonition-todo>:last-child,.rst-content .admonition .last,.rst-content .admonition>:last-child,.rst-content .attention .last,.rst-content .attention>:last-child,.rst-content .caution .last,.rst-content .caution>:last-child,.rst-content .danger .last,.rst-content .danger>:last-child,.rst-content .error .last,.rst-content .error>:last-child,.rst-content .hint .last,.rst-content .hint>:last-child,.rst-content .important .last,.rst-content .important>:last-child,.rst-content .note .last,.rst-content .note>:last-child,.rst-content .seealso 
.last,.rst-content .seealso>:last-child,.rst-content .tip .last,.rst-content .tip>:last-child,.rst-content .warning .last,.rst-content .warning>:last-child{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent!important;border-color:rgba(0,0,0,.1)!important}.rst-content .section ol.loweralpha,.rst-content .section ol.loweralpha>li,.rst-content .toctree-wrapper ol.loweralpha,.rst-content .toctree-wrapper ol.loweralpha>li,.rst-content section ol.loweralpha,.rst-content section ol.loweralpha>li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha>li,.rst-content .toctree-wrapper ol.upperalpha,.rst-content .toctree-wrapper ol.upperalpha>li,.rst-content section ol.upperalpha,.rst-content section ol.upperalpha>li{list-style:upper-alpha}.rst-content .section ol li>*,.rst-content .section ul li>*,.rst-content .toctree-wrapper ol li>*,.rst-content .toctree-wrapper ul li>*,.rst-content section ol li>*,.rst-content section ul li>*{margin-top:12px;margin-bottom:12px}.rst-content .section ol li>:first-child,.rst-content .section ul li>:first-child,.rst-content .toctree-wrapper ol li>:first-child,.rst-content .toctree-wrapper ul li>:first-child,.rst-content section ol li>:first-child,.rst-content section ul li>:first-child{margin-top:0}.rst-content .section ol li>p,.rst-content .section ol li>p:last-child,.rst-content .section ul li>p,.rst-content .section ul li>p:last-child,.rst-content .toctree-wrapper ol li>p,.rst-content .toctree-wrapper ol li>p:last-child,.rst-content .toctree-wrapper ul li>p,.rst-content .toctree-wrapper ul li>p:last-child,.rst-content section ol li>p,.rst-content section ol li>p:last-child,.rst-content section ul li>p,.rst-content section ul li>p:last-child{margin-bottom:12px}.rst-content .section ol li>p:only-child,.rst-content .section ol li>p:only-child:last-child,.rst-content .section ul li>p:only-child,.rst-content .section ul li>p:only-child:last-child,.rst-content .toctree-wrapper ol li>p:only-child,.rst-content .toctree-wrapper ol li>p:only-child:last-child,.rst-content .toctree-wrapper ul li>p:only-child,.rst-content .toctree-wrapper ul li>p:only-child:last-child,.rst-content section ol li>p:only-child,.rst-content section ol li>p:only-child:last-child,.rst-content section ul li>p:only-child,.rst-content section ul li>p:only-child:last-child{margin-bottom:0}.rst-content .section ol li>ol,.rst-content .section ol li>ul,.rst-content .section ul li>ol,.rst-content .section ul li>ul,.rst-content .toctree-wrapper ol li>ol,.rst-content .toctree-wrapper ol li>ul,.rst-content .toctree-wrapper ul li>ol,.rst-content .toctree-wrapper ul li>ul,.rst-content section ol li>ol,.rst-content section ol li>ul,.rst-content section ul li>ol,.rst-content section ul li>ul{margin-bottom:12px}.rst-content .section ol.simple li>*,.rst-content .section ol.simple li ol,.rst-content .section ol.simple li ul,.rst-content .section ul.simple li>*,.rst-content .section ul.simple li ol,.rst-content .section ul.simple li ul,.rst-content .toctree-wrapper ol.simple li>*,.rst-content .toctree-wrapper ol.simple li ol,.rst-content .toctree-wrapper ol.simple li ul,.rst-content .toctree-wrapper ul.simple li>*,.rst-content .toctree-wrapper ul.simple li ol,.rst-content .toctree-wrapper ul.simple li ul,.rst-content section ol.simple li>*,.rst-content section ol.simple li ol,.rst-content section ol.simple li 
ul,.rst-content section ul.simple li>*,.rst-content section ul.simple li ol,.rst-content section ul.simple li ul{margin-top:0;margin-bottom:0}.rst-content .line-block{margin-left:0;margin-bottom:24px;line-height:24px}.rst-content .line-block .line-block{margin-left:24px;margin-bottom:0}.rst-content .topic-title{font-weight:700;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0 0 24px 24px}.rst-content .align-left{float:left;margin:0 24px 24px 0}.rst-content .align-center{margin:auto}.rst-content .align-center:not(table){display:block}.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink{opacity:0;font-size:14px;font-family:FontAwesome;margin-left:.5em}.rst-content .code-block-caption .headerlink:focus,.rst-content .code-block-caption:hover .headerlink,.rst-content .eqno .headerlink:focus,.rst-content .eqno:hover .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink:focus,.rst-content .toctree-wrapper>p.caption:hover .headerlink,.rst-content dl dt .headerlink:focus,.rst-content dl dt:hover .headerlink,.rst-content h1 .headerlink:focus,.rst-content h1:hover .headerlink,.rst-content h2 .headerlink:focus,.rst-content h2:hover .headerlink,.rst-content h3 .headerlink:focus,.rst-content h3:hover .headerlink,.rst-content h4 .headerlink:focus,.rst-content h4:hover .headerlink,.rst-content h5 .headerlink:focus,.rst-content h5:hover .headerlink,.rst-content h6 .headerlink:focus,.rst-content h6:hover .headerlink,.rst-content p.caption .headerlink:focus,.rst-content p.caption:hover .headerlink,.rst-content p .headerlink:focus,.rst-content p:hover .headerlink,.rst-content table>caption .headerlink:focus,.rst-content table>caption:hover .headerlink{opacity:1}.rst-content p a{overflow-wrap:anywhere}.rst-content .wy-table td p,.rst-content .wy-table td ul,.rst-content .wy-table th p,.rst-content .wy-table th ul,.rst-content table.docutils td p,.rst-content table.docutils td ul,.rst-content table.docutils th p,.rst-content table.docutils th ul,.rst-content table.field-list td p,.rst-content table.field-list td ul,.rst-content table.field-list th p,.rst-content table.field-list th ul{font-size:inherit}.rst-content .btn:focus{outline:2px solid}.rst-content table>caption .headerlink:after{font-size:12px}.rst-content .centered{text-align:center}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:1px solid #e1e4e5}.rst-content .sidebar dl,.rst-content .sidebar p,.rst-content .sidebar ul{font-size:90%}.rst-content .sidebar .last,.rst-content .sidebar>:last-child{margin-bottom:0}.rst-content .sidebar .sidebar-title{display:block;font-family:Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif;font-weight:700;background:#e1e4e5;padding:6px 12px;margin:-24px -24px 24px;font-size:100%}.rst-content .highlighted{background:#f1c40f;box-shadow:0 0 0 2px #f1c40f;display:inline;font-weight:700}.rst-content .citation-reference,.rst-content .footnote-reference{vertical-align:baseline;position:relative;top:-.4em;line-height:0;font-size:90%}.rst-content .citation-reference>span.fn-bracket,.rst-content 
.footnote-reference>span.fn-bracket{display:none}.rst-content .hlist{width:100%}.rst-content dl dt span.classifier:before{content:" : "}.rst-content dl dt span.classifier-delimiter{display:none!important}html.writer-html4 .rst-content table.docutils.citation,html.writer-html4 .rst-content table.docutils.footnote{background:none;border:none}html.writer-html4 .rst-content table.docutils.citation td,html.writer-html4 .rst-content table.docutils.citation tr,html.writer-html4 .rst-content table.docutils.footnote td,html.writer-html4 .rst-content table.docutils.footnote tr{border:none;background-color:transparent!important;white-space:normal}html.writer-html4 .rst-content table.docutils.citation td.label,html.writer-html4 .rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{display:grid;grid-template-columns:auto minmax(80%,95%)}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{display:inline-grid;grid-template-columns:max-content auto}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{display:grid;grid-template-columns:auto auto minmax(.65rem,auto) minmax(40%,95%)}html.writer-html5 .rst-content aside.citation>span.label,html.writer-html5 .rst-content aside.footnote>span.label,html.writer-html5 .rst-content div.citation>span.label{grid-column-start:1;grid-column-end:2}html.writer-html5 .rst-content aside.citation>span.backrefs,html.writer-html5 .rst-content aside.footnote>span.backrefs,html.writer-html5 .rst-content div.citation>span.backrefs{grid-column-start:2;grid-column-end:3;grid-row-start:1;grid-row-end:3}html.writer-html5 .rst-content aside.citation>p,html.writer-html5 .rst-content aside.footnote>p,html.writer-html5 .rst-content div.citation>p{grid-column-start:4;grid-column-end:5}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{margin-bottom:24px}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{padding-left:1rem}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dd,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dd,html.writer-html5 .rst-content dl.footnote>dt{margin-bottom:0}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{font-size:.9rem}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.footnote>dt{margin:0 .5rem .5rem 0;line-height:1.2rem;word-break:break-all;font-weight:400}html.writer-html5 .rst-content dl.citation>dt>span.brackets:before,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:before{content:"["}html.writer-html5 .rst-content dl.citation>dt>span.brackets:after,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:after{content:"]"}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a,html.writer-html5 
.rst-content dl.footnote>dt>span.fn-backref>a{word-break:keep-all}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a:not(:first-child):before,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.footnote>dd{margin:0 0 .5rem;line-height:1.2rem}html.writer-html5 .rst-content dl.citation>dd p,html.writer-html5 .rst-content dl.footnote>dd p{font-size:.9rem}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{padding-left:1rem;padding-right:1rem;font-size:.9rem;line-height:1.2rem}html.writer-html5 .rst-content aside.citation p,html.writer-html5 .rst-content aside.footnote p,html.writer-html5 .rst-content div.citation p{font-size:.9rem;line-height:1.2rem;margin-bottom:12px}html.writer-html5 .rst-content aside.citation span.backrefs,html.writer-html5 .rst-content aside.footnote span.backrefs,html.writer-html5 .rst-content div.citation span.backrefs{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content aside.citation span.backrefs>a,html.writer-html5 .rst-content aside.footnote span.backrefs>a,html.writer-html5 .rst-content div.citation span.backrefs>a{word-break:keep-all}html.writer-html5 .rst-content aside.citation span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content aside.footnote span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content div.citation span.backrefs>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content aside.citation span.label,html.writer-html5 .rst-content aside.footnote span.label,html.writer-html5 .rst-content div.citation span.label{line-height:1.2rem}html.writer-html5 .rst-content aside.citation-list,html.writer-html5 .rst-content aside.footnote-list,html.writer-html5 .rst-content div.citation-list{margin-bottom:24px}html.writer-html5 .rst-content dl.option-list kbd{font-size:.9rem}.rst-content table.docutils.footnote,html.writer-html4 .rst-content table.docutils.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content aside.footnote-list aside.footnote,html.writer-html5 .rst-content div.citation-list>div.citation,html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{color:grey}.rst-content table.docutils.footnote code,.rst-content table.docutils.footnote tt,html.writer-html4 .rst-content table.docutils.citation code,html.writer-html4 .rst-content table.docutils.citation tt,html.writer-html5 .rst-content aside.footnote-list aside.footnote code,html.writer-html5 .rst-content aside.footnote-list aside.footnote tt,html.writer-html5 .rst-content aside.footnote code,html.writer-html5 .rst-content aside.footnote tt,html.writer-html5 .rst-content div.citation-list>div.citation code,html.writer-html5 .rst-content div.citation-list>div.citation tt,html.writer-html5 .rst-content dl.citation code,html.writer-html5 .rst-content dl.citation tt,html.writer-html5 .rst-content dl.footnote code,html.writer-html5 .rst-content dl.footnote tt{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content .wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content 
.wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}html.writer-html5 .rst-content table.docutils th{border:1px solid #e1e4e5}html.writer-html5 .rst-content table.docutils td>p,html.writer-html5 .rst-content table.docutils th>p{line-height:1rem;margin-bottom:0;font-size:.9rem}.rst-content table.docutils td .last,.rst-content table.docutils td .last>:last-child{margin-bottom:0}.rst-content table.field-list,.rst-content table.field-list td{border:none}.rst-content table.field-list td p{line-height:inherit}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content code,.rst-content tt{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;padding:2px 5px}.rst-content code big,.rst-content code em,.rst-content tt big,.rst-content tt em{font-size:100%!important;line-height:normal}.rst-content code.literal,.rst-content tt.literal{color:#e74c3c;white-space:normal}.rst-content code.xref,.rst-content tt.xref,a .rst-content code,a .rst-content tt{font-weight:700;color:#404040;overflow-wrap:normal}.rst-content kbd,.rst-content pre,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace}.rst-content a code,.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:700;margin-bottom:12px}.rst-content dl ol,.rst-content dl p,.rst-content dl table,.rst-content dl ul{margin-bottom:12px}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}.rst-content dl dd>ol:last-child,.rst-content dl dd>p:last-child,.rst-content dl dd>table:last-child,.rst-content dl dd>ul:last-child{margin-bottom:0}html.writer-html4 .rst-content dl:not(.docutils),html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple){margin-bottom:24px}html.writer-html4 .rst-content dl:not(.docutils)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:3px solid #6ab0de;padding:6px;position:relative}html.writer-html4 .rst-content dl:not(.docutils)>dt:before,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:before{color:#6ab0de}html.writer-html4 .rst-content dl:not(.docutils)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{margin-bottom:6px;border:none;border-left:3px solid #ccc;background:#f0f0f0;color:#555}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink,html.writer-html5 .rst-content 
dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils)>dt:first-child,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:first-child{margin-top:0}html.writer-html4 .rst-content dl:not(.docutils) code.descclassname,html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descclassname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{background-color:transparent;border:none;padding:0;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .optional,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .property,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .property{display:inline-block;padding-right:8px;max-width:100%}html.writer-html4 .rst-content dl:not(.docutils) .k,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .k{font-style:italic}html.writer-html4 .rst-content dl:not(.docutils) .descclassname,html.writer-html4 .rst-content dl:not(.docutils) .descname,html.writer-html4 .rst-content dl:not(.docutils) .sig-name,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .sig-name{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#000}.rst-content .viewcode-back,.rst-content .viewcode-link{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:700}.rst-content 
code.download,.rst-content tt.download{background:inherit;padding:inherit;font-weight:400;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content code.download span:first-child,.rst-content tt.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{margin-right:4px}.rst-content .guilabel,.rst-content .menuselection{font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content .guilabel,.rst-content .menuselection{border:1px solid #7fbbe3;background:#e7f2fa}.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>.kbd,.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>kbd{color:inherit;font-size:80%;background-color:#fff;border:1px solid #a6a6a6;border-radius:4px;box-shadow:0 2px grey;padding:2.4px 6px;margin:auto 0}.rst-content .versionmodified{font-style:italic}@media screen and (max-width:480px){.rst-content .sidebar{width:100%}}span[id*=MathJax-Span]{color:#404040}.math{text-align:center}@font-face{font-family:Lato;src:url(fonts/lato-normal.woff2?bd03a2cc277bbbc338d464e679fe9942) format("woff2"),url(fonts/lato-normal.woff?27bd77b9162d388cb8d4c4217c7c5e2a) format("woff");font-weight:400;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold.woff2?cccb897485813c7c256901dbca54ecf2) format("woff2"),url(fonts/lato-bold.woff?d878b6c29b10beca227e9eef4246111b) format("woff");font-weight:700;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold-italic.woff2?0b6bb6725576b072c5d0b02ecdd1900d) format("woff2"),url(fonts/lato-bold-italic.woff?9c7e4e9eb485b4a121c760e61bc3707c) format("woff");font-weight:700;font-style:italic;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-normal-italic.woff2?4eb103b4d12be57cb1d040ed5e162e9d) format("woff2"),url(fonts/lato-normal-italic.woff?f28f2d6482446544ef1ea1ccc6dd5892) format("woff");font-weight:400;font-style:italic;font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:400;src:url(fonts/Roboto-Slab-Regular.woff2?7abf5b8d04d26a2cafea937019bca958) format("woff2"),url(fonts/Roboto-Slab-Regular.woff?c1be9284088d487c5e3ff0a10a92e58c) format("woff");font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:700;src:url(fonts/Roboto-Slab-Bold.woff2?9984f4a9bda09be08e83f2506954adbe) format("woff2"),url(fonts/Roboto-Slab-Bold.woff?bed5564a116b05148e3b3bea6fb1162a) format("woff");font-display:block} \ No newline at end of file diff --git a/docs/build/html/_static/css3-mediaqueries.js b/docs/build/html/_static/css3-mediaqueries.js deleted file mode 100644 index 59735f5..0000000 --- a/docs/build/html/_static/css3-mediaqueries.js +++ /dev/null @@ -1 +0,0 @@ -if(typeof Object.create!=="function"){Object.create=function(e){function t(){}t.prototype=e;return new t}}var ua={toString:function(){return navigator.userAgent},test:function(e){return this.toString().toLowerCase().indexOf(e.toLowerCase())>-1}};ua.version=(ua.toString().toLowerCase().match(/[\s\S]+(?:rv|it|ra|ie)[\/: ]([\d.]+)/)||[])[1];ua.webkit=ua.test("webkit");ua.gecko=ua.test("gecko")&&!ua.webkit;ua.opera=ua.test("opera");ua.ie=ua.test("msie")&&!ua.opera;ua.ie6=ua.ie&&document.compatMode&&typeof document.documentElement.style.maxHeight==="undefined";ua.ie7=ua.ie&&document.documentElement&&typeof document.documentElement.style.maxHeight!=="undefined"&&typeof 
diff --git a/docs/build/html/_static/css3-mediaqueries_src.js b/docs/build/html/_static/css3-mediaqueries_src.js
deleted file mode 100644
index 7878620..0000000
--- a/docs/build/html/_static/css3-mediaqueries_src.js
+++ /dev/null
@@ -1,1104 +0,0 @@
-/*
-css3-mediaqueries.js - CSS Helper and CSS3 Media Queries Enabler
-
-author: Wouter van der Graaf
-version: 1.0 (20110330)
-license: MIT
-website: http://code.google.com/p/css3-mediaqueries-js/
-
-W3C spec: http://www.w3.org/TR/css3-mediaqueries/
-
-Note: use of embedded […]
-[… the remaining ~1,090 deleted lines of the un-minified polyfill source are garbled in the extracted text, interleaved with a fragment of HTML5 Shiv 3.7.3 "print" source whose own diff header was lost …]
diff --git a/docs/build/html/_static/js/html5shiv.min.js b/docs/build/html/_static/js/html5shiv.min.js
new file mode 100644
index 0000000..cd1c674
--- /dev/null
+++ b/docs/build/html/_static/js/html5shiv.min.js
@@ -0,0 +1,4 @@
+/**
+* @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed
+*/
+[… one minified line of HTML5 Shiv; garbled in the extracted text …]
diff --git a/docs/build/html/_static/js/theme.js b/docs/build/html/_static/js/theme.js
new file mode 100644
index 0000000..1fddb6e
--- /dev/null
+++ b/docs/build/html/_static/js/theme.js
@@ -0,0 +1 @@
+[… one minified line: the sphinx_rtd_theme navigation script (ThemeNav/StickyNav: sticky sidebar scrolling, hashchange/scroll/resize handlers, toctree expand buttons, responsive table wrappers) …]
[… the tail of the minified navigation script is garbled here (the HTML strings passed to wrap() and prepend() were stripped by extraction), as are the opening lines of the next hunk: a deleted, un-minified Porter-stemmer script used by the Sphinx search page, whose diff header and first lines were also lost …]
-  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
-  var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
-  var s_v = "^(" + C + ")?" + v; // vowel in stem
-
-  this.stemWord = function (w) {
-    var stem;
-    var suffix;
-    var firstch;
-    var origword = w;
-
-    if (w.length < 3)
-      return w;
-
-    var re;
-    var re2;
-    var re3;
-    var re4;
-
-    firstch = w.substr(0,1);
-    if (firstch == "y")
-      w = firstch.toUpperCase() + w.substr(1);
-
-    // Step 1a
-    re = /^(.+?)(ss|i)es$/;
-    re2 = /^(.+?)([^s])s$/;
-
-    if (re.test(w))
-      w = w.replace(re,"$1$2");
-    else if (re2.test(w))
-      w = w.replace(re2,"$1$2");
-
-    // Step 1b
-    re = /^(.+?)eed$/;
-    re2 = /^(.+?)(ed|ing)$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      re = new RegExp(mgr0);
-      if (re.test(fp[1])) {
-        re = /.$/;
-        w = w.replace(re,"");
-      }
-    }
-    else if (re2.test(w)) {
-      var fp = re2.exec(w);
-      stem = fp[1];
-      re2 = new RegExp(s_v);
-      if (re2.test(stem)) {
-        w = stem;
-        re2 = /(at|bl|iz)$/;
-        re3 = new RegExp("([^aeiouylsz])\\1$");
-        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
-        if (re2.test(w))
-          w = w + "e";
-        else if (re3.test(w)) {
-          re = /.$/;
-          w = w.replace(re,"");
-        }
-        else if (re4.test(w))
-          w = w + "e";
-      }
-    }
-
-    // Step 1c
-    re = /^(.+?)y$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      re = new RegExp(s_v);
-      if (re.test(stem))
-        w = stem + "i";
-    }
-
-    // Step 2
-    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      suffix = fp[2];
-      re = new RegExp(mgr0);
-      if (re.test(stem))
-        w = stem + step2list[suffix];
-    }
-
-    // Step 3
-    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      suffix = fp[2];
-      re = new RegExp(mgr0);
-      if (re.test(stem))
-        w = stem + step3list[suffix];
-    }
-
-    // Step 4
-    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
-    re2 = /^(.+?)(s|t)(ion)$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      re = new RegExp(mgr1);
-      if (re.test(stem))
-        w = stem;
-    }
-    else if (re2.test(w)) {
-      var fp = re2.exec(w);
-      stem = fp[1] + fp[2];
-      re2 = new RegExp(mgr1);
-      if (re2.test(stem))
-        w = stem;
-    }
-
-    // Step 5
-    re = /^(.+?)e$/;
-    if (re.test(w)) {
-      var fp = re.exec(w);
-      stem = fp[1];
-      re = new RegExp(mgr1);
-      re2 = new RegExp(meq1);
-      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
-      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
-        w = stem;
-    }
-    re = /ll$/;
-    re2 = new RegExp(mgr1);
-    if (re.test(w) && re2.test(w)) {
-      re = /.$/;
-      w = w.replace(re,"");
-    }
-
-    // and turn initial Y back to y
-    if (firstch == "y")
-      w = firstch.toLowerCase() + w.substr(1);
-    return w;
-  }
-}
-
diff --git a/docs/build/html/_static/minus.png b/docs/build/html/_static/minus.png
deleted file mode 100644
index d96755f..0000000
Binary files a/docs/build/html/_static/minus.png and /dev/null differ
diff --git a/docs/build/html/_static/navigation.png b/docs/build/html/_static/navigation.png
new file mode 100644
index 0000000..fda6cd2
Binary files /dev/null and b/docs/build/html/_static/navigation.png differ
diff --git a/docs/build/html/_static/plus.png b/docs/build/html/_static/plus.png
deleted file mode 100644
index 7107cec..0000000
Binary files a/docs/build/html/_static/plus.png and /dev/null differ
diff --git a/docs/build/html/_static/pygments.css b/docs/build/html/_static/pygments.css
deleted file mode 100644
index 7a18115..0000000
--- a/docs/build/html/_static/pygments.css
+++ /dev/null
@@ -1,74 +0,0 @@
-pre { line-height: 125%; }
-td.linenos .normal { color: #666666; background-color: transparent; padding-left: 5px; padding-right: 5px; }
-span.linenos { color: #666666; background-color: transparent; padding-left: 5px; padding-right: 5px; }
-td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
-span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
-.highlight .hll { background-color: #ffffcc }
-.highlight { background: #f0f0f0; }
-.highlight .c { color: #60a0b0; font-style: italic } /* Comment */
-.highlight .err { border: 1px solid #FF0000 } /* Error */
-.highlight .k { color: #007020; font-weight: bold } /* Keyword */
-.highlight .o { color: #666666 } /* Operator */
-.highlight .ch { color: #60a0b0; font-style: italic } /* Comment.Hashbang */
-.highlight .cm { color: #60a0b0; font-style: italic } /* Comment.Multiline */
-.highlight .cp { color: #007020 } /* Comment.Preproc */
-.highlight .cpf { color: #60a0b0; font-style: italic } /* Comment.PreprocFile */
-.highlight .c1 { color: #60a0b0; font-style: italic } /* Comment.Single */
-.highlight .cs { color: #60a0b0; background-color: #fff0f0 } /* Comment.Special */
-.highlight .gd { color: #A00000 } /* Generic.Deleted */
-.highlight .ge { font-style: italic } /* Generic.Emph */
-.highlight .gr { color: #FF0000 } /* Generic.Error */
-.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
-.highlight .gi { color: #00A000 } /* Generic.Inserted */
-.highlight .go { color: #888888 } /* Generic.Output */
-.highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */
-.highlight .gs { font-weight: bold } /* Generic.Strong */
-.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
-.highlight .gt { color: #0044DD } /* Generic.Traceback */
-.highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */
-.highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */
-.highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */
-.highlight .kp { color: #007020 } /* Keyword.Pseudo */
-.highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */
-.highlight .kt { color: #902000 } /* Keyword.Type */
-.highlight .m { color: #40a070 } /* Literal.Number */
-.highlight .s { color: #4070a0 } /* Literal.String */
-.highlight .na { color: #4070a0 } /* Name.Attribute */
-.highlight .nb { color: #007020 } /* Name.Builtin */
-.highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */
-.highlight .no { color: #60add5 } /* Name.Constant */
-.highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */
-.highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */
-.highlight .ne { color: #007020 } /* Name.Exception */
-.highlight .nf { color: #06287e } /* Name.Function */
-.highlight .nl { color: #002070; font-weight: bold } /* Name.Label */
-.highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */
-.highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */
-.highlight .nv { color: #bb60d5 } /* Name.Variable */
-.highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */
-.highlight .w { color: #bbbbbb } /* Text.Whitespace */
-.highlight .mb { color: #40a070 } /* Literal.Number.Bin */
-.highlight .mf { color: #40a070 } /* Literal.Number.Float */
-.highlight .mh { color: #40a070 } /* Literal.Number.Hex */
-.highlight .mi { color: #40a070 } /* Literal.Number.Integer */
-.highlight .mo { color: #40a070 } /* Literal.Number.Oct */
-.highlight .sa
{ color: #4070a0 } /* Literal.String.Affix */ -.highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ -.highlight .sc { color: #4070a0 } /* Literal.String.Char */ -.highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */ -.highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ -.highlight .s2 { color: #4070a0 } /* Literal.String.Double */ -.highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ -.highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ -.highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ -.highlight .sx { color: #c65d09 } /* Literal.String.Other */ -.highlight .sr { color: #235388 } /* Literal.String.Regex */ -.highlight .s1 { color: #4070a0 } /* Literal.String.Single */ -.highlight .ss { color: #517918 } /* Literal.String.Symbol */ -.highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ -.highlight .fm { color: #06287e } /* Name.Function.Magic */ -.highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ -.highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ -.highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ -.highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */ -.highlight .il { color: #40a070 } /* Literal.Number.Integer.Long */ \ No newline at end of file diff --git a/docs/build/html/_static/searchtools.js b/docs/build/html/_static/searchtools.js deleted file mode 100644 index e89e34d..0000000 --- a/docs/build/html/_static/searchtools.js +++ /dev/null @@ -1,566 +0,0 @@ -/* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * - * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * - */ -"use strict"; - -/** - * Simple result scoring code. - */ -if (typeof Scorer === "undefined") { - var Scorer = { - // Implement the following function to further tweak the score for each result - // The function takes a result array [docname, title, anchor, descr, score, filename] - // and returns the new score. - /* - score: result => { - const [docname, title, anchor, descr, score, filename] = result - return score - }, - */ - - // query matches the full name of an object - objNameMatch: 11, - // or matches in the last dotted part of the object name - objPartialMatch: 6, - // Additive scores depending on the priority of the object - objPrio: { - 0: 15, // used to be importantResults - 1: 5, // used to be objectResults - 2: -5, // used to be unimportantResults - }, - // Used when the priority is not in the mapping. 
- objPrioDefault: 0, - - // query found in title - title: 15, - partialTitle: 7, - // query found in terms - term: 5, - partialTerm: 2, - }; -} - -const _removeChildren = (element) => { - while (element && element.lastChild) element.removeChild(element.lastChild); -}; - -/** - * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping - */ -const _escapeRegExp = (string) => - string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string - -const _displayItem = (item, searchTerms) => { - const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; - const docUrlRoot = DOCUMENTATION_OPTIONS.URL_ROOT; - const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; - const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; - const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; - - const [docName, title, anchor, descr, score, _filename] = item; - - let listItem = document.createElement("li"); - let requestUrl; - let linkUrl; - if (docBuilder === "dirhtml") { - // dirhtml builder - let dirname = docName + "/"; - if (dirname.match(/\/index\/$/)) - dirname = dirname.substring(0, dirname.length - 6); - else if (dirname === "index/") dirname = ""; - requestUrl = docUrlRoot + dirname; - linkUrl = requestUrl; - } else { - // normal html builders - requestUrl = docUrlRoot + docName + docFileSuffix; - linkUrl = docName + docLinkSuffix; - } - let linkEl = listItem.appendChild(document.createElement("a")); - linkEl.href = linkUrl + anchor; - linkEl.dataset.score = score; - linkEl.innerHTML = title; - if (descr) - listItem.appendChild(document.createElement("span")).innerHTML = - " (" + descr + ")"; - else if (showSearchSummary) - fetch(requestUrl) - .then((responseData) => responseData.text()) - .then((data) => { - if (data) - listItem.appendChild( - Search.makeSearchSummary(data, searchTerms) - ); - }); - Search.output.appendChild(listItem); -}; -const _finishSearch = (resultCount) => { - Search.stopPulse(); - Search.title.innerText = _("Search Results"); - if (!resultCount) - Search.status.innerText = Documentation.gettext( - "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." - ); - else - Search.status.innerText = _( - `Search finished, found ${resultCount} page(s) matching the search query.` - ); -}; -const _displayNextItem = ( - results, - resultCount, - searchTerms -) => { - // results left, load the summary and display it - // this is intended to be dynamic (don't sub resultsCount) - if (results.length) { - _displayItem(results.pop(), searchTerms); - setTimeout( - () => _displayNextItem(results, resultCount, searchTerms), - 5 - ); - } - // search finished, update title and status message - else _finishSearch(resultCount); -}; - -/** - * Default splitQuery function. Can be overridden in ``sphinx.search`` with a - * custom function per language. - * - * The regular expression works by splitting the string on consecutive characters - * that are not Unicode letters, numbers, underscores, or emoji characters. - * This is the same as ``\W+`` in Python, preserving the surrogate pair area. 
- */ -if (typeof splitQuery === "undefined") { - var splitQuery = (query) => query - .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu) - .filter(term => term) // remove remaining empty strings -} - -/** - * Search Module - */ -const Search = { - _index: null, - _queued_query: null, - _pulse_status: -1, - - htmlToText: (htmlString) => { - const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); - htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() }); - const docContent = htmlElement.querySelector('[role="main"]'); - if (docContent !== undefined) return docContent.textContent; - console.warn( - "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template." - ); - return ""; - }, - - init: () => { - const query = new URLSearchParams(window.location.search).get("q"); - document - .querySelectorAll('input[name="q"]') - .forEach((el) => (el.value = query)); - if (query) Search.performSearch(query); - }, - - loadIndex: (url) => - (document.body.appendChild(document.createElement("script")).src = url), - - setIndex: (index) => { - Search._index = index; - if (Search._queued_query !== null) { - const query = Search._queued_query; - Search._queued_query = null; - Search.query(query); - } - }, - - hasIndex: () => Search._index !== null, - - deferQuery: (query) => (Search._queued_query = query), - - stopPulse: () => (Search._pulse_status = -1), - - startPulse: () => { - if (Search._pulse_status >= 0) return; - - const pulse = () => { - Search._pulse_status = (Search._pulse_status + 1) % 4; - Search.dots.innerText = ".".repeat(Search._pulse_status); - if (Search._pulse_status >= 0) window.setTimeout(pulse, 500); - }; - pulse(); - }, - - /** - * perform a search for something (or wait until index is loaded) - */ - performSearch: (query) => { - // create the required interface elements - const searchText = document.createElement("h2"); - searchText.textContent = _("Searching"); - const searchSummary = document.createElement("p"); - searchSummary.classList.add("search-summary"); - searchSummary.innerText = ""; - const searchList = document.createElement("ul"); - searchList.classList.add("search"); - - const out = document.getElementById("search-results"); - Search.title = out.appendChild(searchText); - Search.dots = Search.title.appendChild(document.createElement("span")); - Search.status = out.appendChild(searchSummary); - Search.output = out.appendChild(searchList); - - const searchProgress = document.getElementById("search-progress"); - // Some themes don't use the search progress node - if (searchProgress) { - searchProgress.innerText = _("Preparing search..."); - } - Search.startPulse(); - - // index already loaded, the browser was quick! 
- if (Search.hasIndex()) Search.query(query); - else Search.deferQuery(query); - }, - - /** - * execute search (requires search index to be loaded) - */ - query: (query) => { - const filenames = Search._index.filenames; - const docNames = Search._index.docnames; - const titles = Search._index.titles; - const allTitles = Search._index.alltitles; - const indexEntries = Search._index.indexentries; - - // stem the search terms and add them to the correct list - const stemmer = new Stemmer(); - const searchTerms = new Set(); - const excludedTerms = new Set(); - const highlightTerms = new Set(); - const objectTerms = new Set(splitQuery(query.toLowerCase().trim())); - splitQuery(query.trim()).forEach((queryTerm) => { - const queryTermLower = queryTerm.toLowerCase(); - - // maybe skip this "word" - // stopwords array is from language_data.js - if ( - stopwords.indexOf(queryTermLower) !== -1 || - queryTerm.match(/^\d+$/) - ) - return; - - // stem the word - let word = stemmer.stemWord(queryTermLower); - // select the correct list - if (word[0] === "-") excludedTerms.add(word.substr(1)); - else { - searchTerms.add(word); - highlightTerms.add(queryTermLower); - } - }); - - if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js - localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" ")) - } - - // console.debug("SEARCH: searching for:"); - // console.info("required: ", [...searchTerms]); - // console.info("excluded: ", [...excludedTerms]); - - // array of [docname, title, anchor, descr, score, filename] - let results = []; - _removeChildren(document.getElementById("search-progress")); - - const queryLower = query.toLowerCase(); - for (const [title, foundTitles] of Object.entries(allTitles)) { - if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) { - for (const [file, id] of foundTitles) { - let score = Math.round(100 * queryLower.length / title.length) - results.push([ - docNames[file], - titles[file] !== title ? `${titles[file]} > ${title}` : title, - id !== null ? "#" + id : "", - null, - score, - filenames[file], - ]); - } - } - } - - // search for explicit entries in index directives - for (const [entry, foundEntries] of Object.entries(indexEntries)) { - if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { - for (const [file, id] of foundEntries) { - let score = Math.round(100 * queryLower.length / entry.length) - results.push([ - docNames[file], - titles[file], - id ? "#" + id : "", - null, - score, - filenames[file], - ]); - } - } - } - - // lookup as object - objectTerms.forEach((term) => - results.push(...Search.performObjectSearch(term, objectTerms)) - ); - - // lookup as search terms in fulltext - results.push(...Search.performTermsSearch(searchTerms, excludedTerms)); - - // let the scorer override scores with a custom scoring function - if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item))); - - // now sort the results by score (in opposite order of appearance, since the - // display function below uses pop() to retrieve items) and then - // alphabetically - results.sort((a, b) => { - const leftScore = a[4]; - const rightScore = b[4]; - if (leftScore === rightScore) { - // same score: sort alphabetically - const leftTitle = a[1].toLowerCase(); - const rightTitle = b[1].toLowerCase(); - if (leftTitle === rightTitle) return 0; - return leftTitle > rightTitle ? -1 : 1; // inverted is intentional - } - return leftScore > rightScore ? 
1 : -1; - }); - - // remove duplicate search results - // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept - let seen = new Set(); - results = results.reverse().reduce((acc, result) => { - let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(','); - if (!seen.has(resultStr)) { - acc.push(result); - seen.add(resultStr); - } - return acc; - }, []); - - results = results.reverse(); - - // for debugging - //Search.lastresults = results.slice(); // a copy - // console.info("search results:", Search.lastresults); - - // print the results - _displayNextItem(results, results.length, searchTerms); - }, - - /** - * search for object names - */ - performObjectSearch: (object, objectTerms) => { - const filenames = Search._index.filenames; - const docNames = Search._index.docnames; - const objects = Search._index.objects; - const objNames = Search._index.objnames; - const titles = Search._index.titles; - - const results = []; - - const objectSearchCallback = (prefix, match) => { - const name = match[4] - const fullname = (prefix ? prefix + "." : "") + name; - const fullnameLower = fullname.toLowerCase(); - if (fullnameLower.indexOf(object) < 0) return; - - let score = 0; - const parts = fullnameLower.split("."); - - // check for different match types: exact matches of full name or - // "last name" (i.e. last dotted part) - if (fullnameLower === object || parts.slice(-1)[0] === object) - score += Scorer.objNameMatch; - else if (parts.slice(-1)[0].indexOf(object) > -1) - score += Scorer.objPartialMatch; // matches in last name - - const objName = objNames[match[1]][2]; - const title = titles[match[0]]; - - // If more than one term searched for, we require other words to be - // found in the name/title/description - const otherTerms = new Set(objectTerms); - otherTerms.delete(object); - if (otherTerms.size > 0) { - const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase(); - if ( - [...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0) - ) - return; - } - - let anchor = match[3]; - if (anchor === "") anchor = fullname; - else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname; - - const descr = objName + _(", in ") + title; - - // add custom score for some objects according to scorer - if (Scorer.objPrio.hasOwnProperty(match[2])) - score += Scorer.objPrio[match[2]]; - else score += Scorer.objPrioDefault; - - results.push([ - docNames[match[0]], - fullname, - "#" + anchor, - descr, - score, - filenames[match[0]], - ]); - }; - Object.keys(objects).forEach((prefix) => - objects[prefix].forEach((array) => - objectSearchCallback(prefix, array) - ) - ); - return results; - }, - - /** - * search for full-text terms in the index - */ - performTermsSearch: (searchTerms, excludedTerms) => { - // prepare search - const terms = Search._index.terms; - const titleTerms = Search._index.titleterms; - const filenames = Search._index.filenames; - const docNames = Search._index.docnames; - const titles = Search._index.titles; - - const scoreMap = new Map(); - const fileMap = new Map(); - - // perform the search on the required terms - searchTerms.forEach((word) => { - const files = []; - const arr = [ - { files: terms[word], score: Scorer.term }, - { files: titleTerms[word], score: Scorer.title }, - ]; - // add support for partial matches - if (word.length > 2) { - const escapedWord = _escapeRegExp(word); - Object.keys(terms).forEach((term) => { - if (term.match(escapedWord) && !terms[word]) - arr.push({ 
files: terms[term], score: Scorer.partialTerm }); - }); - Object.keys(titleTerms).forEach((term) => { - if (term.match(escapedWord) && !titleTerms[word]) - arr.push({ files: titleTerms[word], score: Scorer.partialTitle }); - }); - } - - // no match but word was a required one - if (arr.every((record) => record.files === undefined)) return; - - // found search word in contents - arr.forEach((record) => { - if (record.files === undefined) return; - - let recordFiles = record.files; - if (recordFiles.length === undefined) recordFiles = [recordFiles]; - files.push(...recordFiles); - - // set score for the word in each file - recordFiles.forEach((file) => { - if (!scoreMap.has(file)) scoreMap.set(file, {}); - scoreMap.get(file)[word] = record.score; - }); - }); - - // create the mapping - files.forEach((file) => { - if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1) - fileMap.get(file).push(word); - else fileMap.set(file, [word]); - }); - }); - - // now check if the files don't contain excluded terms - const results = []; - for (const [file, wordList] of fileMap) { - // check if all requirements are matched - - // as search terms with length < 3 are discarded - const filteredTermCount = [...searchTerms].filter( - (term) => term.length > 2 - ).length; - if ( - wordList.length !== searchTerms.size && - wordList.length !== filteredTermCount - ) - continue; - - // ensure that none of the excluded terms is in the search result - if ( - [...excludedTerms].some( - (term) => - terms[term] === file || - titleTerms[term] === file || - (terms[term] || []).includes(file) || - (titleTerms[term] || []).includes(file) - ) - ) - break; - - // select one (max) score for the file. - const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w])); - // add result to the result list - results.push([ - docNames[file], - titles[file], - "", - null, - score, - filenames[file], - ]); - } - return results; - }, - - /** - * helper function to return a node containing the - * search summary for a given text. keywords is a list - * of stemmed words. - */ - makeSearchSummary: (htmlText, keywords) => { - const text = Search.htmlToText(htmlText); - if (text === "") return null; - - const textLower = text.toLowerCase(); - const actualStartPosition = [...keywords] - .map((k) => textLower.indexOf(k.toLowerCase())) - .filter((i) => i > -1) - .slice(-1)[0]; - const startWithContext = Math.max(actualStartPosition - 120, 0); - - const top = startWithContext === 0 ? "" : "..."; - const tail = startWithContext + 240 < text.length ? "..." : ""; - - let summary = document.createElement("p"); - summary.classList.add("context"); - summary.textContent = top + text.substr(startWithContext, 240).trim() + tail; - - return summary; - }, -}; - -_ready(Search.init); diff --git a/docs/build/html/_static/sphinx_highlight.js b/docs/build/html/_static/sphinx_highlight.js new file mode 100644 index 0000000..aae669d --- /dev/null +++ b/docs/build/html/_static/sphinx_highlight.js @@ -0,0 +1,144 @@ +/* Highlighting utilities for Sphinx HTML documentation. */ +"use strict"; + +const SPHINX_HIGHLIGHT_ENABLED = true + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. 
+ */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + parent.insertBefore( + span, + parent.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. + */ +const SphinxHighlight = { + + /** + * highlight the search words provided in localstorage in the text + */ + highlightSearchWords: () => { + if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight + + // get and clear terms from localstorage + const url = new URL(window.location); + const highlight = + localStorage.getItem("sphinx_highlight_terms") + || url.searchParams.get("highlight") + || ""; + localStorage.removeItem("sphinx_highlight_terms") + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + + // get individual terms from highlight string + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? 
divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '<p class="highlight-link">' + + '<a href="javascript:SphinxHighlight.hideSearchWords()">' + + _("Hide Search Matches") + + "</a></p>" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(SphinxHighlight.highlightSearchWords); +_ready(SphinxHighlight.initEscapeListener); diff --git a/docs/build/html/_static/sphinxdoc.css b/docs/build/html/_static/sphinxdoc.css new file mode 100644 index 0000000..b03830b --- /dev/null +++ b/docs/build/html/_static/sphinxdoc.css @@ -0,0 +1,354 @@ +/* + * sphinxdoc.css_t + * ~~~~~~~~~~~~~~~ + * + * Sphinx stylesheet -- sphinxdoc theme. Originally created by + * Armin Ronacher for Werkzeug. + * + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details.
+ * + */ + +@import url("basic.css"); + +/* -- page layout ----------------------------------------------------------- */ + +body { + font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', + 'Verdana', sans-serif; + font-size: 14px; + letter-spacing: -0.01em; + line-height: 150%; + text-align: center; + background-color: #BFD1D4; + color: black; + padding: 0; + border: 1px solid #aaa; + + margin: 0px 80px 0px 80px; + min-width: 740px; +} + +div.document { + background-color: white; + text-align: left; + background-image: url(contents.png); + background-repeat: repeat-x; +} + +div.documentwrapper { + float: left; + width: 100%; +} + +div.bodywrapper { + margin: 0 calc(230px + 10px) 0 0; + border-right: 1px solid #ccc; +} + +div.body { + margin: 0; + padding: 0.5em 20px 20px 20px; +} + +div.related { + font-size: 1em; +} + +div.related ul { + background-image: url(navigation.png); + height: 2em; + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; +} + +div.related ul li { + margin: 0; + padding: 0; + height: 2em; + float: left; +} + +div.related ul li.right { + float: right; + margin-right: 5px; +} + +div.related ul li a { + margin: 0; + padding: 0 5px 0 5px; + line-height: 1.75em; + color: #EE9816; +} + +div.related ul li a:hover { + color: #3CA8E7; +} + +div.sphinxsidebarwrapper { + padding: 0; +} + +div.sphinxsidebar { + padding: 0.5em 15px 15px 0; + width: calc(230px - 20px); + float: right; + font-size: 1em; + text-align: left; +} + +div.sphinxsidebar h3, div.sphinxsidebar h4 { + margin: 1em 0 0.5em 0; + font-size: 1em; + padding: 0.1em 0 0.1em 0.5em; + color: white; + border: 1px solid #86989B; + background-color: #AFC1C4; +} + +div.sphinxsidebar h3 a { + color: white; +} + +div.sphinxsidebar ul { + padding-left: 1.5em; + margin-top: 7px; + padding: 0; + line-height: 130%; +} + +div.sphinxsidebar ul ul { + margin-left: 20px; +} + +div.footer { + background-color: #E3EFF1; + color: #86989B; + padding: 3px 8px 3px 0; + clear: both; + font-size: 0.8em; + text-align: right; +} + +div.footer a { + color: #86989B; + text-decoration: underline; +} + +/* -- body styles ----------------------------------------------------------- */ + +p { + margin: 0.8em 0 0.5em 0; +} + +a { + color: #CA7900; + text-decoration: none; +} + +a:hover { + color: #2491CF; +} + +a:visited { + color: #551A8B; +} + +div.body a { + text-decoration: underline; +} + +h1 { + margin: 0; + padding: 0.7em 0 0.3em 0; + font-size: 1.5em; + color: #11557C; +} + +h2 { + margin: 1.3em 0 0.2em 0; + font-size: 1.35em; + padding: 0; +} + +h3 { + margin: 1em 0 -0.3em 0; + font-size: 1.2em; +} + +div.body h1 a, div.body h2 a, div.body h3 a, div.body h4 a, div.body h5 a, div.body h6 a { + color: black!important; +} + +h1 a.anchor, h2 a.anchor, h3 a.anchor, h4 a.anchor, h5 a.anchor, h6 a.anchor { + display: none; + margin: 0 0 0 0.3em; + padding: 0 0.2em 0 0.2em; + color: #aaa!important; +} + +h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor, +h5:hover a.anchor, h6:hover a.anchor { + display: inline; +} + +h1 a.anchor:hover, h2 a.anchor:hover, h3 a.anchor:hover, h4 a.anchor:hover, +h5 a.anchor:hover, h6 a.anchor:hover { + color: #777; + background-color: #eee; +} + +a.headerlink { + color: #c60f0f!important; + font-size: 1em; + margin-left: 6px; + padding: 0 4px 0 4px; + text-decoration: none!important; +} + +a.headerlink:hover { + background-color: #ccc; + color: white!important; +} + +cite, code, code { + font-family: 'Consolas', 'Deja Vu Sans Mono', + 'Bitstream Vera Sans Mono', monospace; 
+ font-size: 0.95em; + letter-spacing: 0.01em; +} + +code { + background-color: #f2f2f2; + border-bottom: 1px solid #ddd; + color: #333; +} + +code.descname, code.descclassname, code.xref { + border: 0; +} + +hr { + border: 1px solid #abc; + margin: 2em; +} + +a code { + border: 0; + color: #CA7900; +} + +a code:hover { + color: #2491CF; +} + +pre { + font-family: 'Consolas', 'Deja Vu Sans Mono', + 'Bitstream Vera Sans Mono', monospace; + font-size: 0.95em; + letter-spacing: 0.015em; + line-height: 120%; + padding: 0.5em; + border: 1px solid #ccc; +} + +pre a { + color: inherit; + text-decoration: underline; +} + +td.linenos pre { + padding: 0.5em 0; +} + +div.quotebar { + background-color: #f8f8f8; + max-width: 250px; + float: right; + padding: 2px 7px; + border: 1px solid #ccc; +} + +nav.contents, +aside.topic, +div.topic { + background-color: #f8f8f8; +} + +table { + border-collapse: collapse; + margin: 0 -0.5em 0 -0.5em; +} + +table td, table th { + padding: 0.2em 0.5em 0.2em 0.5em; +} + +div.admonition, div.warning { + font-size: 0.9em; + margin: 1em 0 1em 0; + border: 1px solid #86989B; + background-color: #f7f7f7; + padding: 0; +} + +div.admonition p, div.warning p { + margin: 0.5em 1em 0.5em 1em; + padding: 0; +} + +div.admonition pre, div.warning pre { + margin: 0.4em 1em 0.4em 1em; +} + +div.admonition p.admonition-title, +div.warning p.admonition-title { + margin: 0; + padding: 0.1em 0 0.1em 0.5em; + color: white; + border-bottom: 1px solid #86989B; + font-weight: bold; + background-color: #AFC1C4; +} + +div.warning { + border: 1px solid #940000; +} + +div.warning p.admonition-title { + background-color: #CF0000; + border-bottom-color: #940000; +} + +div.admonition ul, div.admonition ol, +div.warning ul, div.warning ol { + margin: 0.1em 0.5em 0.5em 3em; + padding: 0; +} + +div.versioninfo { + margin: 1em 0 0 0; + border: 1px solid #ccc; + background-color: #DDEAF0; + padding: 8px; + line-height: 1.3em; + font-size: 0.9em; +} + +.viewcode-back { + font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', + 'Verdana', sans-serif; +} + +div.viewcode-block:target { + background-color: #f4debf; + border-top: 1px solid #ac9; + border-bottom: 1px solid #ac9; +} + +div.code-block-caption { + background-color: #ddd; + color: #222; + border: 1px solid #ccc; +} \ No newline at end of file diff --git a/docs/build/html/_static/underscore-1.13.1.js b/docs/build/html/_static/underscore-1.13.1.js deleted file mode 100644 index ffd77af..0000000 --- a/docs/build/html/_static/underscore-1.13.1.js +++ /dev/null @@ -1,2042 +0,0 @@ -(function (global, factory) { - typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() : - typeof define === 'function' && define.amd ? define('underscore', factory) : - (global = typeof globalThis !== 'undefined' ? globalThis : global || self, (function () { - var current = global._; - var exports = global._ = factory(); - exports.noConflict = function () { global._ = current; return exports; }; - }())); -}(this, (function () { - // Underscore.js 1.13.1 - // https://underscorejs.org - // (c) 2009-2021 Jeremy Ashkenas, Julian Gonggrijp, and DocumentCloud and Investigative Reporters & Editors - // Underscore may be freely distributed under the MIT license. - - // Current version. - var VERSION = '1.13.1'; - - // Establish the root object, `window` (`self`) in the browser, `global` - // on the server, or `this` in some virtual machines. We use `self` - // instead of `window` for `WebWorker` support. 
- var root = typeof self == 'object' && self.self === self && self || - typeof global == 'object' && global.global === global && global || - Function('return this')() || - {}; - - // Save bytes in the minified (but not gzipped) version: - var ArrayProto = Array.prototype, ObjProto = Object.prototype; - var SymbolProto = typeof Symbol !== 'undefined' ? Symbol.prototype : null; - - // Create quick reference variables for speed access to core prototypes. - var push = ArrayProto.push, - slice = ArrayProto.slice, - toString = ObjProto.toString, - hasOwnProperty = ObjProto.hasOwnProperty; - - // Modern feature detection. - var supportsArrayBuffer = typeof ArrayBuffer !== 'undefined', - supportsDataView = typeof DataView !== 'undefined'; - - // All **ECMAScript 5+** native function implementations that we hope to use - // are declared here. - var nativeIsArray = Array.isArray, - nativeKeys = Object.keys, - nativeCreate = Object.create, - nativeIsView = supportsArrayBuffer && ArrayBuffer.isView; - - // Create references to these builtin functions because we override them. - var _isNaN = isNaN, - _isFinite = isFinite; - - // Keys in IE < 9 that won't be iterated by `for key in ...` and thus missed. - var hasEnumBug = !{toString: null}.propertyIsEnumerable('toString'); - var nonEnumerableProps = ['valueOf', 'isPrototypeOf', 'toString', - 'propertyIsEnumerable', 'hasOwnProperty', 'toLocaleString']; - - // The largest integer that can be represented exactly. - var MAX_ARRAY_INDEX = Math.pow(2, 53) - 1; - - // Some functions take a variable number of arguments, or a few expected - // arguments at the beginning and then a variable number of values to operate - // on. This helper accumulates all remaining arguments past the function’s - // argument length (or an explicit `startIndex`), into an array that becomes - // the last argument. Similar to ES6’s "rest parameter". - function restArguments(func, startIndex) { - startIndex = startIndex == null ? func.length - 1 : +startIndex; - return function() { - var length = Math.max(arguments.length - startIndex, 0), - rest = Array(length), - index = 0; - for (; index < length; index++) { - rest[index] = arguments[index + startIndex]; - } - switch (startIndex) { - case 0: return func.call(this, rest); - case 1: return func.call(this, arguments[0], rest); - case 2: return func.call(this, arguments[0], arguments[1], rest); - } - var args = Array(startIndex + 1); - for (index = 0; index < startIndex; index++) { - args[index] = arguments[index]; - } - args[startIndex] = rest; - return func.apply(this, args); - }; - } - - // Is a given variable an object? - function isObject(obj) { - var type = typeof obj; - return type === 'function' || type === 'object' && !!obj; - } - - // Is a given value equal to null? - function isNull(obj) { - return obj === null; - } - - // Is a given variable undefined? - function isUndefined(obj) { - return obj === void 0; - } - - // Is a given value a boolean? - function isBoolean(obj) { - return obj === true || obj === false || toString.call(obj) === '[object Boolean]'; - } - - // Is a given value a DOM element? - function isElement(obj) { - return !!(obj && obj.nodeType === 1); - } - - // Internal function for creating a `toString`-based type tester. 
- function tagTester(name) { - var tag = '[object ' + name + ']'; - return function(obj) { - return toString.call(obj) === tag; - }; - } - - var isString = tagTester('String'); - - var isNumber = tagTester('Number'); - - var isDate = tagTester('Date'); - - var isRegExp = tagTester('RegExp'); - - var isError = tagTester('Error'); - - var isSymbol = tagTester('Symbol'); - - var isArrayBuffer = tagTester('ArrayBuffer'); - - var isFunction = tagTester('Function'); - - // Optimize `isFunction` if appropriate. Work around some `typeof` bugs in old - // v8, IE 11 (#1621), Safari 8 (#1929), and PhantomJS (#2236). - var nodelist = root.document && root.document.childNodes; - if (typeof /./ != 'function' && typeof Int8Array != 'object' && typeof nodelist != 'function') { - isFunction = function(obj) { - return typeof obj == 'function' || false; - }; - } - - var isFunction$1 = isFunction; - - var hasObjectTag = tagTester('Object'); - - // In IE 10 - Edge 13, `DataView` has string tag `'[object Object]'`. - // In IE 11, the most common among them, this problem also applies to - // `Map`, `WeakMap` and `Set`. - var hasStringTagBug = ( - supportsDataView && hasObjectTag(new DataView(new ArrayBuffer(8))) - ), - isIE11 = (typeof Map !== 'undefined' && hasObjectTag(new Map)); - - var isDataView = tagTester('DataView'); - - // In IE 10 - Edge 13, we need a different heuristic - // to determine whether an object is a `DataView`. - function ie10IsDataView(obj) { - return obj != null && isFunction$1(obj.getInt8) && isArrayBuffer(obj.buffer); - } - - var isDataView$1 = (hasStringTagBug ? ie10IsDataView : isDataView); - - // Is a given value an array? - // Delegates to ECMA5's native `Array.isArray`. - var isArray = nativeIsArray || tagTester('Array'); - - // Internal function to check whether `key` is an own property name of `obj`. - function has$1(obj, key) { - return obj != null && hasOwnProperty.call(obj, key); - } - - var isArguments = tagTester('Arguments'); - - // Define a fallback version of the method in browsers (ahem, IE < 9), where - // there isn't any inspectable "Arguments" type. - (function() { - if (!isArguments(arguments)) { - isArguments = function(obj) { - return has$1(obj, 'callee'); - }; - } - }()); - - var isArguments$1 = isArguments; - - // Is a given object a finite number? - function isFinite$1(obj) { - return !isSymbol(obj) && _isFinite(obj) && !isNaN(parseFloat(obj)); - } - - // Is the given value `NaN`? - function isNaN$1(obj) { - return isNumber(obj) && _isNaN(obj); - } - - // Predicate-generating function. Often useful outside of Underscore. - function constant(value) { - return function() { - return value; - }; - } - - // Common internal logic for `isArrayLike` and `isBufferLike`. - function createSizePropertyCheck(getSizeProperty) { - return function(collection) { - var sizeProperty = getSizeProperty(collection); - return typeof sizeProperty == 'number' && sizeProperty >= 0 && sizeProperty <= MAX_ARRAY_INDEX; - } - } - - // Internal helper to generate a function to obtain property `key` from `obj`. - function shallowProperty(key) { - return function(obj) { - return obj == null ? void 0 : obj[key]; - }; - } - - // Internal helper to obtain the `byteLength` property of an object. - var getByteLength = shallowProperty('byteLength'); - - // Internal helper to determine whether we should spend extensive checks against - // `ArrayBuffer` et al. - var isBufferLike = createSizePropertyCheck(getByteLength); - - // Is a given value a typed array? 
- var typedArrayPattern = /\[object ((I|Ui)nt(8|16|32)|Float(32|64)|Uint8Clamped|Big(I|Ui)nt64)Array\]/; - function isTypedArray(obj) { - // `ArrayBuffer.isView` is the most future-proof, so use it when available. - // Otherwise, fall back on the above regular expression. - return nativeIsView ? (nativeIsView(obj) && !isDataView$1(obj)) : - isBufferLike(obj) && typedArrayPattern.test(toString.call(obj)); - } - - var isTypedArray$1 = supportsArrayBuffer ? isTypedArray : constant(false); - - // Internal helper to obtain the `length` property of an object. - var getLength = shallowProperty('length'); - - // Internal helper to create a simple lookup structure. - // `collectNonEnumProps` used to depend on `_.contains`, but this led to - // circular imports. `emulatedSet` is a one-off solution that only works for - // arrays of strings. - function emulatedSet(keys) { - var hash = {}; - for (var l = keys.length, i = 0; i < l; ++i) hash[keys[i]] = true; - return { - contains: function(key) { return hash[key]; }, - push: function(key) { - hash[key] = true; - return keys.push(key); - } - }; - } - - // Internal helper. Checks `keys` for the presence of keys in IE < 9 that won't - // be iterated by `for key in ...` and thus missed. Extends `keys` in place if - // needed. - function collectNonEnumProps(obj, keys) { - keys = emulatedSet(keys); - var nonEnumIdx = nonEnumerableProps.length; - var constructor = obj.constructor; - var proto = isFunction$1(constructor) && constructor.prototype || ObjProto; - - // Constructor is a special case. - var prop = 'constructor'; - if (has$1(obj, prop) && !keys.contains(prop)) keys.push(prop); - - while (nonEnumIdx--) { - prop = nonEnumerableProps[nonEnumIdx]; - if (prop in obj && obj[prop] !== proto[prop] && !keys.contains(prop)) { - keys.push(prop); - } - } - } - - // Retrieve the names of an object's own properties. - // Delegates to **ECMAScript 5**'s native `Object.keys`. - function keys(obj) { - if (!isObject(obj)) return []; - if (nativeKeys) return nativeKeys(obj); - var keys = []; - for (var key in obj) if (has$1(obj, key)) keys.push(key); - // Ahem, IE < 9. - if (hasEnumBug) collectNonEnumProps(obj, keys); - return keys; - } - - // Is a given array, string, or object empty? - // An "empty" object has no enumerable own-properties. - function isEmpty(obj) { - if (obj == null) return true; - // Skip the more expensive `toString`-based type checks if `obj` has no - // `.length`. - var length = getLength(obj); - if (typeof length == 'number' && ( - isArray(obj) || isString(obj) || isArguments$1(obj) - )) return length === 0; - return getLength(keys(obj)) === 0; - } - - // Returns whether an object has a given set of `key:value` pairs. - function isMatch(object, attrs) { - var _keys = keys(attrs), length = _keys.length; - if (object == null) return !length; - var obj = Object(object); - for (var i = 0; i < length; i++) { - var key = _keys[i]; - if (attrs[key] !== obj[key] || !(key in obj)) return false; - } - return true; - } - - // If Underscore is called as a function, it returns a wrapped object that can - // be used OO-style. This wrapper holds altered versions of all functions added - // through `_.mixin`. Wrapped objects may be chained. - function _$1(obj) { - if (obj instanceof _$1) return obj; - if (!(this instanceof _$1)) return new _$1(obj); - this._wrapped = obj; - } - - _$1.VERSION = VERSION; - - // Extracts the result from a wrapped and chained object. 
- _$1.prototype.value = function() { - return this._wrapped; - }; - - // Provide unwrapping proxies for some methods used in engine operations - // such as arithmetic and JSON stringification. - _$1.prototype.valueOf = _$1.prototype.toJSON = _$1.prototype.value; - - _$1.prototype.toString = function() { - return String(this._wrapped); - }; - - // Internal function to wrap or shallow-copy an ArrayBuffer, - // typed array or DataView to a new view, reusing the buffer. - function toBufferView(bufferSource) { - return new Uint8Array( - bufferSource.buffer || bufferSource, - bufferSource.byteOffset || 0, - getByteLength(bufferSource) - ); - } - - // We use this string twice, so give it a name for minification. - var tagDataView = '[object DataView]'; - - // Internal recursive comparison function for `_.isEqual`. - function eq(a, b, aStack, bStack) { - // Identical objects are equal. `0 === -0`, but they aren't identical. - // See the [Harmony `egal` proposal](https://wiki.ecmascript.org/doku.php?id=harmony:egal). - if (a === b) return a !== 0 || 1 / a === 1 / b; - // `null` or `undefined` only equal to itself (strict comparison). - if (a == null || b == null) return false; - // `NaN`s are equivalent, but non-reflexive. - if (a !== a) return b !== b; - // Exhaust primitive checks - var type = typeof a; - if (type !== 'function' && type !== 'object' && typeof b != 'object') return false; - return deepEq(a, b, aStack, bStack); - } - - // Internal recursive comparison function for `_.isEqual`. - function deepEq(a, b, aStack, bStack) { - // Unwrap any wrapped objects. - if (a instanceof _$1) a = a._wrapped; - if (b instanceof _$1) b = b._wrapped; - // Compare `[[Class]]` names. - var className = toString.call(a); - if (className !== toString.call(b)) return false; - // Work around a bug in IE 10 - Edge 13. - if (hasStringTagBug && className == '[object Object]' && isDataView$1(a)) { - if (!isDataView$1(b)) return false; - className = tagDataView; - } - switch (className) { - // These types are compared by value. - case '[object RegExp]': - // RegExps are coerced to strings for comparison (Note: '' + /a/i === '/a/i') - case '[object String]': - // Primitives and their corresponding object wrappers are equivalent; thus, `"5"` is - // equivalent to `new String("5")`. - return '' + a === '' + b; - case '[object Number]': - // `NaN`s are equivalent, but non-reflexive. - // Object(NaN) is equivalent to NaN. - if (+a !== +a) return +b !== +b; - // An `egal` comparison is performed for other numeric values. - return +a === 0 ? 1 / +a === 1 / b : +a === +b; - case '[object Date]': - case '[object Boolean]': - // Coerce dates and booleans to numeric primitive values. Dates are compared by their - // millisecond representations. Note that invalid dates with millisecond representations - // of `NaN` are not equivalent. - return +a === +b; - case '[object Symbol]': - return SymbolProto.valueOf.call(a) === SymbolProto.valueOf.call(b); - case '[object ArrayBuffer]': - case tagDataView: - // Coerce to typed array so we can fall through. 
- return deepEq(toBufferView(a), toBufferView(b), aStack, bStack); - } - - var areArrays = className === '[object Array]'; - if (!areArrays && isTypedArray$1(a)) { - var byteLength = getByteLength(a); - if (byteLength !== getByteLength(b)) return false; - if (a.buffer === b.buffer && a.byteOffset === b.byteOffset) return true; - areArrays = true; - } - if (!areArrays) { - if (typeof a != 'object' || typeof b != 'object') return false; - - // Objects with different constructors are not equivalent, but `Object`s or `Array`s - // from different frames are. - var aCtor = a.constructor, bCtor = b.constructor; - if (aCtor !== bCtor && !(isFunction$1(aCtor) && aCtor instanceof aCtor && - isFunction$1(bCtor) && bCtor instanceof bCtor) - && ('constructor' in a && 'constructor' in b)) { - return false; - } - } - // Assume equality for cyclic structures. The algorithm for detecting cyclic - // structures is adapted from ES 5.1 section 15.12.3, abstract operation `JO`. - - // Initializing stack of traversed objects. - // It's done here since we only need them for objects and arrays comparison. - aStack = aStack || []; - bStack = bStack || []; - var length = aStack.length; - while (length--) { - // Linear search. Performance is inversely proportional to the number of - // unique nested structures. - if (aStack[length] === a) return bStack[length] === b; - } - - // Add the first object to the stack of traversed objects. - aStack.push(a); - bStack.push(b); - - // Recursively compare objects and arrays. - if (areArrays) { - // Compare array lengths to determine if a deep comparison is necessary. - length = a.length; - if (length !== b.length) return false; - // Deep compare the contents, ignoring non-numeric properties. - while (length--) { - if (!eq(a[length], b[length], aStack, bStack)) return false; - } - } else { - // Deep compare objects. - var _keys = keys(a), key; - length = _keys.length; - // Ensure that both objects contain the same number of properties before comparing deep equality. - if (keys(b).length !== length) return false; - while (length--) { - // Deep compare each member - key = _keys[length]; - if (!(has$1(b, key) && eq(a[key], b[key], aStack, bStack))) return false; - } - } - // Remove the first object from the stack of traversed objects. - aStack.pop(); - bStack.pop(); - return true; - } - - // Perform a deep comparison to check if two objects are equal. - function isEqual(a, b) { - return eq(a, b); - } - - // Retrieve all the enumerable property names of an object. - function allKeys(obj) { - if (!isObject(obj)) return []; - var keys = []; - for (var key in obj) keys.push(key); - // Ahem, IE < 9. - if (hasEnumBug) collectNonEnumProps(obj, keys); - return keys; - } - - // Since the regular `Object.prototype.toString` type tests don't work for - // some types in IE 11, we use a fingerprinting heuristic instead, based - // on the methods. It's not great, but it's the best we got. - // The fingerprint method lists are defined below. - function ie11fingerprint(methods) { - var length = getLength(methods); - return function(obj) { - if (obj == null) return false; - // `Map`, `WeakMap` and `Set` have no enumerable keys. - var keys = allKeys(obj); - if (getLength(keys)) return false; - for (var i = 0; i < length; i++) { - if (!isFunction$1(obj[methods[i]])) return false; - } - // If we are testing against `WeakMap`, we need to ensure that - // `obj` doesn't have a `forEach` method in order to distinguish - // it from a regular `Map`. 
- return methods !== weakMapMethods || !isFunction$1(obj[forEachName]); - }; - } - - // In the interest of compact minification, we write - // each string in the fingerprints only once. - var forEachName = 'forEach', - hasName = 'has', - commonInit = ['clear', 'delete'], - mapTail = ['get', hasName, 'set']; - - // `Map`, `WeakMap` and `Set` each have slightly different - // combinations of the above sublists. - var mapMethods = commonInit.concat(forEachName, mapTail), - weakMapMethods = commonInit.concat(mapTail), - setMethods = ['add'].concat(commonInit, forEachName, hasName); - - var isMap = isIE11 ? ie11fingerprint(mapMethods) : tagTester('Map'); - - var isWeakMap = isIE11 ? ie11fingerprint(weakMapMethods) : tagTester('WeakMap'); - - var isSet = isIE11 ? ie11fingerprint(setMethods) : tagTester('Set'); - - var isWeakSet = tagTester('WeakSet'); - - // Retrieve the values of an object's properties. - function values(obj) { - var _keys = keys(obj); - var length = _keys.length; - var values = Array(length); - for (var i = 0; i < length; i++) { - values[i] = obj[_keys[i]]; - } - return values; - } - - // Convert an object into a list of `[key, value]` pairs. - // The opposite of `_.object` with one argument. - function pairs(obj) { - var _keys = keys(obj); - var length = _keys.length; - var pairs = Array(length); - for (var i = 0; i < length; i++) { - pairs[i] = [_keys[i], obj[_keys[i]]]; - } - return pairs; - } - - // Invert the keys and values of an object. The values must be serializable. - function invert(obj) { - var result = {}; - var _keys = keys(obj); - for (var i = 0, length = _keys.length; i < length; i++) { - result[obj[_keys[i]]] = _keys[i]; - } - return result; - } - - // Return a sorted list of the function names available on the object. - function functions(obj) { - var names = []; - for (var key in obj) { - if (isFunction$1(obj[key])) names.push(key); - } - return names.sort(); - } - - // An internal function for creating assigner functions. - function createAssigner(keysFunc, defaults) { - return function(obj) { - var length = arguments.length; - if (defaults) obj = Object(obj); - if (length < 2 || obj == null) return obj; - for (var index = 1; index < length; index++) { - var source = arguments[index], - keys = keysFunc(source), - l = keys.length; - for (var i = 0; i < l; i++) { - var key = keys[i]; - if (!defaults || obj[key] === void 0) obj[key] = source[key]; - } - } - return obj; - }; - } - - // Extend a given object with all the properties in passed-in object(s). - var extend = createAssigner(allKeys); - - // Assigns a given object with all the own properties in the passed-in - // object(s). - // (https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object/assign) - var extendOwn = createAssigner(keys); - - // Fill in a given object with default properties. - var defaults = createAssigner(allKeys, true); - - // Create a naked function reference for surrogate-prototype-swapping. - function ctor() { - return function(){}; - } - - // An internal function for creating a new object that inherits from another. - function baseCreate(prototype) { - if (!isObject(prototype)) return {}; - if (nativeCreate) return nativeCreate(prototype); - var Ctor = ctor(); - Ctor.prototype = prototype; - var result = new Ctor; - Ctor.prototype = null; - return result; - } - - // Creates an object that inherits from the given prototype object. - // If additional properties are provided then they will be added to the - // created object. 
- function create(prototype, props) { - var result = baseCreate(prototype); - if (props) extendOwn(result, props); - return result; - } - - // Create a (shallow-cloned) duplicate of an object. - function clone(obj) { - if (!isObject(obj)) return obj; - return isArray(obj) ? obj.slice() : extend({}, obj); - } - - // Invokes `interceptor` with the `obj` and then returns `obj`. - // The primary purpose of this method is to "tap into" a method chain, in - // order to perform operations on intermediate results within the chain. - function tap(obj, interceptor) { - interceptor(obj); - return obj; - } - - // Normalize a (deep) property `path` to array. - // Like `_.iteratee`, this function can be customized. - function toPath$1(path) { - return isArray(path) ? path : [path]; - } - _$1.toPath = toPath$1; - - // Internal wrapper for `_.toPath` to enable minification. - // Similar to `cb` for `_.iteratee`. - function toPath(path) { - return _$1.toPath(path); - } - - // Internal function to obtain a nested property in `obj` along `path`. - function deepGet(obj, path) { - var length = path.length; - for (var i = 0; i < length; i++) { - if (obj == null) return void 0; - obj = obj[path[i]]; - } - return length ? obj : void 0; - } - - // Get the value of the (deep) property on `path` from `object`. - // If any property in `path` does not exist or if the value is - // `undefined`, return `defaultValue` instead. - // The `path` is normalized through `_.toPath`. - function get(object, path, defaultValue) { - var value = deepGet(object, toPath(path)); - return isUndefined(value) ? defaultValue : value; - } - - // Shortcut function for checking if an object has a given property directly on - // itself (in other words, not on a prototype). Unlike the internal `has` - // function, this public version can also traverse nested properties. - function has(obj, path) { - path = toPath(path); - var length = path.length; - for (var i = 0; i < length; i++) { - var key = path[i]; - if (!has$1(obj, key)) return false; - obj = obj[key]; - } - return !!length; - } - - // Keep the identity function around for default iteratees. - function identity(value) { - return value; - } - - // Returns a predicate for checking whether an object has a given set of - // `key:value` pairs. - function matcher(attrs) { - attrs = extendOwn({}, attrs); - return function(obj) { - return isMatch(obj, attrs); - }; - } - - // Creates a function that, when passed an object, will traverse that object’s - // properties down the given `path`, specified as an array of keys or indices. - function property(path) { - path = toPath(path); - return function(obj) { - return deepGet(obj, path); - }; - } - - // Internal function that returns an efficient (for current engines) version - // of the passed-in callback, to be repeatedly applied in other Underscore - // functions. - function optimizeCb(func, context, argCount) { - if (context === void 0) return func; - switch (argCount == null ? 3 : argCount) { - case 1: return function(value) { - return func.call(context, value); - }; - // The 2-argument case is omitted because we’re not using it. 
- case 3: return function(value, index, collection) { - return func.call(context, value, index, collection); - }; - case 4: return function(accumulator, value, index, collection) { - return func.call(context, accumulator, value, index, collection); - }; - } - return function() { - return func.apply(context, arguments); - }; - } - - // An internal function to generate callbacks that can be applied to each - // element in a collection, returning the desired result — either `_.identity`, - // an arbitrary callback, a property matcher, or a property accessor. - function baseIteratee(value, context, argCount) { - if (value == null) return identity; - if (isFunction$1(value)) return optimizeCb(value, context, argCount); - if (isObject(value) && !isArray(value)) return matcher(value); - return property(value); - } - - // External wrapper for our callback generator. Users may customize - // `_.iteratee` if they want additional predicate/iteratee shorthand styles. - // This abstraction hides the internal-only `argCount` argument. - function iteratee(value, context) { - return baseIteratee(value, context, Infinity); - } - _$1.iteratee = iteratee; - - // The function we call internally to generate a callback. It invokes - // `_.iteratee` if overridden, otherwise `baseIteratee`. - function cb(value, context, argCount) { - if (_$1.iteratee !== iteratee) return _$1.iteratee(value, context); - return baseIteratee(value, context, argCount); - } - - // Returns the results of applying the `iteratee` to each element of `obj`. - // In contrast to `_.map` it returns an object. - function mapObject(obj, iteratee, context) { - iteratee = cb(iteratee, context); - var _keys = keys(obj), - length = _keys.length, - results = {}; - for (var index = 0; index < length; index++) { - var currentKey = _keys[index]; - results[currentKey] = iteratee(obj[currentKey], currentKey, obj); - } - return results; - } - - // Predicate-generating function. Often useful outside of Underscore. - function noop(){} - - // Generates a function for a given object that returns a given property. - function propertyOf(obj) { - if (obj == null) return noop; - return function(path) { - return get(obj, path); - }; - } - - // Run a function **n** times. - function times(n, iteratee, context) { - var accum = Array(Math.max(0, n)); - iteratee = optimizeCb(iteratee, context, 1); - for (var i = 0; i < n; i++) accum[i] = iteratee(i); - return accum; - } - - // Return a random integer between `min` and `max` (inclusive). - function random(min, max) { - if (max == null) { - max = min; - min = 0; - } - return min + Math.floor(Math.random() * (max - min + 1)); - } - - // A (possibly faster) way to get the current timestamp as an integer. - var now = Date.now || function() { - return new Date().getTime(); - }; - - // Internal helper to generate functions for escaping and unescaping strings - // to/from HTML interpolation. - function createEscaper(map) { - var escaper = function(match) { - return map[match]; - }; - // Regexes for identifying a key that needs to be escaped. - var source = '(?:' + keys(map).join('|') + ')'; - var testRegexp = RegExp(source); - var replaceRegexp = RegExp(source, 'g'); - return function(string) { - string = string == null ? '' : '' + string; - return testRegexp.test(string) ? string.replace(replaceRegexp, escaper) : string; - }; - } - - // Internal list of HTML entities for escaping. 
-  var escapeMap = {
-    '&': '&amp;',
-    '<': '&lt;',
-    '>': '&gt;',
-    '"': '&quot;',
-    "'": '&#x27;',
-    '`': '&#x60;'
-  };
-
-  // Function for escaping strings to HTML interpolation.
-  var _escape = createEscaper(escapeMap);
-
-  // Internal list of HTML entities for unescaping.
-  var unescapeMap = invert(escapeMap);
-
-  // Function for unescaping strings from HTML interpolation.
-  var _unescape = createEscaper(unescapeMap);
-
-  // By default, Underscore uses ERB-style template delimiters. Change the
-  // following template settings to use alternative delimiters.
-  var templateSettings = _$1.templateSettings = {
-    evaluate: /<%([\s\S]+?)%>/g,
-    interpolate: /<%=([\s\S]+?)%>/g,
-    escape: /<%-([\s\S]+?)%>/g
-  };
-
-  // When customizing `_.templateSettings`, if you don't want to define an
-  // interpolation, evaluation or escaping regex, we need one that is
-  // guaranteed not to match.
-  var noMatch = /(.)^/;
-
-  // Certain characters need to be escaped so that they can be put into a
-  // string literal.
-  var escapes = {
-    "'": "'",
-    '\\': '\\',
-    '\r': 'r',
-    '\n': 'n',
-    '\u2028': 'u2028',
-    '\u2029': 'u2029'
-  };
-
-  var escapeRegExp = /\\|'|\r|\n|\u2028|\u2029/g;
-
-  function escapeChar(match) {
-    return '\\' + escapes[match];
-  }
-
-  // In order to prevent third-party code injection through
-  // `_.templateSettings.variable`, we test it against the following regular
-  // expression. It is intentionally a bit more liberal than just matching valid
-  // identifiers, but still prevents possible loopholes through defaults or
-  // destructuring assignment.
-  var bareIdentifier = /^\s*(\w|\$)+\s*$/;
-
-  // JavaScript micro-templating, similar to John Resig's implementation.
-  // Underscore templating handles arbitrary delimiters, preserves whitespace,
-  // and correctly escapes quotes within interpolated code.
-  // NB: `oldSettings` only exists for backwards compatibility.
-  function template(text, settings, oldSettings) {
-    if (!settings && oldSettings) settings = oldSettings;
-    settings = defaults({}, settings, _$1.templateSettings);
-
-    // Combine delimiters into one regular expression via alternation.
-    var matcher = RegExp([
-      (settings.escape || noMatch).source,
-      (settings.interpolate || noMatch).source,
-      (settings.evaluate || noMatch).source
-    ].join('|') + '|$', 'g');
-
-    // Compile the template source, escaping string literals appropriately.
-    var index = 0;
-    var source = "__p+='";
-    text.replace(matcher, function(match, escape, interpolate, evaluate, offset) {
-      source += text.slice(index, offset).replace(escapeRegExp, escapeChar);
-      index = offset + match.length;
-
-      if (escape) {
-        source += "'+\n((__t=(" + escape + "))==null?'':_.escape(__t))+\n'";
-      } else if (interpolate) {
-        source += "'+\n((__t=(" + interpolate + "))==null?'':__t)+\n'";
-      } else if (evaluate) {
-        source += "';\n" + evaluate + "\n__p+='";
-      }
-
-      // Adobe VMs need the match returned to produce the correct offset.
-      return match;
-    });
-    source += "';\n";
-
-    var argument = settings.variable;
-    if (argument) {
-      // Insure against third-party code injection. (CVE-2021-23358)
-      if (!bareIdentifier.test(argument)) throw new Error(
-        'variable is not a bare identifier: ' + argument
-      );
-    } else {
-      // If a variable is not specified, place data values in local scope.
- source = 'with(obj||{}){\n' + source + '}\n'; - argument = 'obj'; - } - - source = "var __t,__p='',__j=Array.prototype.join," + - "print=function(){__p+=__j.call(arguments,'');};\n" + - source + 'return __p;\n'; - - var render; - try { - render = new Function(argument, '_', source); - } catch (e) { - e.source = source; - throw e; - } - - var template = function(data) { - return render.call(this, data, _$1); - }; - - // Provide the compiled source as a convenience for precompilation. - template.source = 'function(' + argument + '){\n' + source + '}'; - - return template; - } - - // Traverses the children of `obj` along `path`. If a child is a function, it - // is invoked with its parent as context. Returns the value of the final - // child, or `fallback` if any child is undefined. - function result(obj, path, fallback) { - path = toPath(path); - var length = path.length; - if (!length) { - return isFunction$1(fallback) ? fallback.call(obj) : fallback; - } - for (var i = 0; i < length; i++) { - var prop = obj == null ? void 0 : obj[path[i]]; - if (prop === void 0) { - prop = fallback; - i = length; // Ensure we don't continue iterating. - } - obj = isFunction$1(prop) ? prop.call(obj) : prop; - } - return obj; - } - - // Generate a unique integer id (unique within the entire client session). - // Useful for temporary DOM ids. - var idCounter = 0; - function uniqueId(prefix) { - var id = ++idCounter + ''; - return prefix ? prefix + id : id; - } - - // Start chaining a wrapped Underscore object. - function chain(obj) { - var instance = _$1(obj); - instance._chain = true; - return instance; - } - - // Internal function to execute `sourceFunc` bound to `context` with optional - // `args`. Determines whether to execute a function as a constructor or as a - // normal function. - function executeBound(sourceFunc, boundFunc, context, callingContext, args) { - if (!(callingContext instanceof boundFunc)) return sourceFunc.apply(context, args); - var self = baseCreate(sourceFunc.prototype); - var result = sourceFunc.apply(self, args); - if (isObject(result)) return result; - return self; - } - - // Partially apply a function by creating a version that has had some of its - // arguments pre-filled, without changing its dynamic `this` context. `_` acts - // as a placeholder by default, allowing any combination of arguments to be - // pre-filled. Set `_.partial.placeholder` for a custom placeholder argument. - var partial = restArguments(function(func, boundArgs) { - var placeholder = partial.placeholder; - var bound = function() { - var position = 0, length = boundArgs.length; - var args = Array(length); - for (var i = 0; i < length; i++) { - args[i] = boundArgs[i] === placeholder ? arguments[position++] : boundArgs[i]; - } - while (position < arguments.length) args.push(arguments[position++]); - return executeBound(func, bound, this, this, args); - }; - return bound; - }); - - partial.placeholder = _$1; - - // Create a function bound to a given object (assigning `this`, and arguments, - // optionally). - var bind = restArguments(function(func, context, args) { - if (!isFunction$1(func)) throw new TypeError('Bind must be called on a function'); - var bound = restArguments(function(callArgs) { - return executeBound(func, bound, context, this, args.concat(callArgs)); - }); - return bound; - }); - - // Internal helper for collection methods to determine whether a collection - // should be iterated as an array or as an object. 
- // Related: https://people.mozilla.org/~jorendorff/es6-draft.html#sec-tolength - // Avoids a very nasty iOS 8 JIT bug on ARM-64. #2094 - var isArrayLike = createSizePropertyCheck(getLength); - - // Internal implementation of a recursive `flatten` function. - function flatten$1(input, depth, strict, output) { - output = output || []; - if (!depth && depth !== 0) { - depth = Infinity; - } else if (depth <= 0) { - return output.concat(input); - } - var idx = output.length; - for (var i = 0, length = getLength(input); i < length; i++) { - var value = input[i]; - if (isArrayLike(value) && (isArray(value) || isArguments$1(value))) { - // Flatten current level of array or arguments object. - if (depth > 1) { - flatten$1(value, depth - 1, strict, output); - idx = output.length; - } else { - var j = 0, len = value.length; - while (j < len) output[idx++] = value[j++]; - } - } else if (!strict) { - output[idx++] = value; - } - } - return output; - } - - // Bind a number of an object's methods to that object. Remaining arguments - // are the method names to be bound. Useful for ensuring that all callbacks - // defined on an object belong to it. - var bindAll = restArguments(function(obj, keys) { - keys = flatten$1(keys, false, false); - var index = keys.length; - if (index < 1) throw new Error('bindAll must be passed function names'); - while (index--) { - var key = keys[index]; - obj[key] = bind(obj[key], obj); - } - return obj; - }); - - // Memoize an expensive function by storing its results. - function memoize(func, hasher) { - var memoize = function(key) { - var cache = memoize.cache; - var address = '' + (hasher ? hasher.apply(this, arguments) : key); - if (!has$1(cache, address)) cache[address] = func.apply(this, arguments); - return cache[address]; - }; - memoize.cache = {}; - return memoize; - } - - // Delays a function for the given number of milliseconds, and then calls - // it with the arguments supplied. - var delay = restArguments(function(func, wait, args) { - return setTimeout(function() { - return func.apply(null, args); - }, wait); - }); - - // Defers a function, scheduling it to run after the current call stack has - // cleared. - var defer = partial(delay, _$1, 1); - - // Returns a function, that, when invoked, will only be triggered at most once - // during a given window of time. Normally, the throttled function will run - // as much as it can, without ever going more than once per `wait` duration; - // but if you'd like to disable the execution on the leading edge, pass - // `{leading: false}`. To disable execution on the trailing edge, ditto. - function throttle(func, wait, options) { - var timeout, context, args, result; - var previous = 0; - if (!options) options = {}; - - var later = function() { - previous = options.leading === false ? 
0 : now(); - timeout = null; - result = func.apply(context, args); - if (!timeout) context = args = null; - }; - - var throttled = function() { - var _now = now(); - if (!previous && options.leading === false) previous = _now; - var remaining = wait - (_now - previous); - context = this; - args = arguments; - if (remaining <= 0 || remaining > wait) { - if (timeout) { - clearTimeout(timeout); - timeout = null; - } - previous = _now; - result = func.apply(context, args); - if (!timeout) context = args = null; - } else if (!timeout && options.trailing !== false) { - timeout = setTimeout(later, remaining); - } - return result; - }; - - throttled.cancel = function() { - clearTimeout(timeout); - previous = 0; - timeout = context = args = null; - }; - - return throttled; - } - - // When a sequence of calls of the returned function ends, the argument - // function is triggered. The end of a sequence is defined by the `wait` - // parameter. If `immediate` is passed, the argument function will be - // triggered at the beginning of the sequence instead of at the end. - function debounce(func, wait, immediate) { - var timeout, previous, args, result, context; - - var later = function() { - var passed = now() - previous; - if (wait > passed) { - timeout = setTimeout(later, wait - passed); - } else { - timeout = null; - if (!immediate) result = func.apply(context, args); - // This check is needed because `func` can recursively invoke `debounced`. - if (!timeout) args = context = null; - } - }; - - var debounced = restArguments(function(_args) { - context = this; - args = _args; - previous = now(); - if (!timeout) { - timeout = setTimeout(later, wait); - if (immediate) result = func.apply(context, args); - } - return result; - }); - - debounced.cancel = function() { - clearTimeout(timeout); - timeout = args = context = null; - }; - - return debounced; - } - - // Returns the first function passed as an argument to the second, - // allowing you to adjust arguments, run code before and after, and - // conditionally execute the original function. - function wrap(func, wrapper) { - return partial(wrapper, func); - } - - // Returns a negated version of the passed-in predicate. - function negate(predicate) { - return function() { - return !predicate.apply(this, arguments); - }; - } - - // Returns a function that is the composition of a list of functions, each - // consuming the return value of the function that follows. - function compose() { - var args = arguments; - var start = args.length - 1; - return function() { - var i = start; - var result = args[start].apply(this, arguments); - while (i--) result = args[i].call(this, result); - return result; - }; - } - - // Returns a function that will only be executed on and after the Nth call. - function after(times, func) { - return function() { - if (--times < 1) { - return func.apply(this, arguments); - } - }; - } - - // Returns a function that will only be executed up to (but not including) the - // Nth call. - function before(times, func) { - var memo; - return function() { - if (--times > 0) { - memo = func.apply(this, arguments); - } - if (times <= 1) func = null; - return memo; - }; - } - - // Returns a function that will be executed at most one time, no matter how - // often you call it. Useful for lazy initialization. - var once = partial(before, 2); - - // Returns the first key on an object that passes a truth test. 
- function findKey(obj, predicate, context) { - predicate = cb(predicate, context); - var _keys = keys(obj), key; - for (var i = 0, length = _keys.length; i < length; i++) { - key = _keys[i]; - if (predicate(obj[key], key, obj)) return key; - } - } - - // Internal function to generate `_.findIndex` and `_.findLastIndex`. - function createPredicateIndexFinder(dir) { - return function(array, predicate, context) { - predicate = cb(predicate, context); - var length = getLength(array); - var index = dir > 0 ? 0 : length - 1; - for (; index >= 0 && index < length; index += dir) { - if (predicate(array[index], index, array)) return index; - } - return -1; - }; - } - - // Returns the first index on an array-like that passes a truth test. - var findIndex = createPredicateIndexFinder(1); - - // Returns the last index on an array-like that passes a truth test. - var findLastIndex = createPredicateIndexFinder(-1); - - // Use a comparator function to figure out the smallest index at which - // an object should be inserted so as to maintain order. Uses binary search. - function sortedIndex(array, obj, iteratee, context) { - iteratee = cb(iteratee, context, 1); - var value = iteratee(obj); - var low = 0, high = getLength(array); - while (low < high) { - var mid = Math.floor((low + high) / 2); - if (iteratee(array[mid]) < value) low = mid + 1; else high = mid; - } - return low; - } - - // Internal function to generate the `_.indexOf` and `_.lastIndexOf` functions. - function createIndexFinder(dir, predicateFind, sortedIndex) { - return function(array, item, idx) { - var i = 0, length = getLength(array); - if (typeof idx == 'number') { - if (dir > 0) { - i = idx >= 0 ? idx : Math.max(idx + length, i); - } else { - length = idx >= 0 ? Math.min(idx + 1, length) : idx + length + 1; - } - } else if (sortedIndex && idx && length) { - idx = sortedIndex(array, item); - return array[idx] === item ? idx : -1; - } - if (item !== item) { - idx = predicateFind(slice.call(array, i, length), isNaN$1); - return idx >= 0 ? idx + i : -1; - } - for (idx = dir > 0 ? i : length - 1; idx >= 0 && idx < length; idx += dir) { - if (array[idx] === item) return idx; - } - return -1; - }; - } - - // Return the position of the first occurrence of an item in an array, - // or -1 if the item is not included in the array. - // If the array is large and already in sort order, pass `true` - // for **isSorted** to use binary search. - var indexOf = createIndexFinder(1, findIndex, sortedIndex); - - // Return the position of the last occurrence of an item in an array, - // or -1 if the item is not included in the array. - var lastIndexOf = createIndexFinder(-1, findLastIndex); - - // Return the first value which passes a truth test. - function find(obj, predicate, context) { - var keyFinder = isArrayLike(obj) ? findIndex : findKey; - var key = keyFinder(obj, predicate, context); - if (key !== void 0 && key !== -1) return obj[key]; - } - - // Convenience version of a common use case of `_.find`: getting the first - // object containing specific `key:value` pairs. - function findWhere(obj, attrs) { - return find(obj, matcher(attrs)); - } - - // The cornerstone for collection functions, an `each` - // implementation, aka `forEach`. - // Handles raw objects in addition to array-likes. Treats all - // sparse array-likes as if they were dense. 
- function each(obj, iteratee, context) { - iteratee = optimizeCb(iteratee, context); - var i, length; - if (isArrayLike(obj)) { - for (i = 0, length = obj.length; i < length; i++) { - iteratee(obj[i], i, obj); - } - } else { - var _keys = keys(obj); - for (i = 0, length = _keys.length; i < length; i++) { - iteratee(obj[_keys[i]], _keys[i], obj); - } - } - return obj; - } - - // Return the results of applying the iteratee to each element. - function map(obj, iteratee, context) { - iteratee = cb(iteratee, context); - var _keys = !isArrayLike(obj) && keys(obj), - length = (_keys || obj).length, - results = Array(length); - for (var index = 0; index < length; index++) { - var currentKey = _keys ? _keys[index] : index; - results[index] = iteratee(obj[currentKey], currentKey, obj); - } - return results; - } - - // Internal helper to create a reducing function, iterating left or right. - function createReduce(dir) { - // Wrap code that reassigns argument variables in a separate function than - // the one that accesses `arguments.length` to avoid a perf hit. (#1991) - var reducer = function(obj, iteratee, memo, initial) { - var _keys = !isArrayLike(obj) && keys(obj), - length = (_keys || obj).length, - index = dir > 0 ? 0 : length - 1; - if (!initial) { - memo = obj[_keys ? _keys[index] : index]; - index += dir; - } - for (; index >= 0 && index < length; index += dir) { - var currentKey = _keys ? _keys[index] : index; - memo = iteratee(memo, obj[currentKey], currentKey, obj); - } - return memo; - }; - - return function(obj, iteratee, memo, context) { - var initial = arguments.length >= 3; - return reducer(obj, optimizeCb(iteratee, context, 4), memo, initial); - }; - } - - // **Reduce** builds up a single result from a list of values, aka `inject`, - // or `foldl`. - var reduce = createReduce(1); - - // The right-associative version of reduce, also known as `foldr`. - var reduceRight = createReduce(-1); - - // Return all the elements that pass a truth test. - function filter(obj, predicate, context) { - var results = []; - predicate = cb(predicate, context); - each(obj, function(value, index, list) { - if (predicate(value, index, list)) results.push(value); - }); - return results; - } - - // Return all the elements for which a truth test fails. - function reject(obj, predicate, context) { - return filter(obj, negate(cb(predicate)), context); - } - - // Determine whether all of the elements pass a truth test. - function every(obj, predicate, context) { - predicate = cb(predicate, context); - var _keys = !isArrayLike(obj) && keys(obj), - length = (_keys || obj).length; - for (var index = 0; index < length; index++) { - var currentKey = _keys ? _keys[index] : index; - if (!predicate(obj[currentKey], currentKey, obj)) return false; - } - return true; - } - - // Determine if at least one element in the object passes a truth test. - function some(obj, predicate, context) { - predicate = cb(predicate, context); - var _keys = !isArrayLike(obj) && keys(obj), - length = (_keys || obj).length; - for (var index = 0; index < length; index++) { - var currentKey = _keys ? _keys[index] : index; - if (predicate(obj[currentKey], currentKey, obj)) return true; - } - return false; - } - - // Determine if the array or object contains a given item (using `===`). 
- function contains(obj, item, fromIndex, guard) { - if (!isArrayLike(obj)) obj = values(obj); - if (typeof fromIndex != 'number' || guard) fromIndex = 0; - return indexOf(obj, item, fromIndex) >= 0; - } - - // Invoke a method (with arguments) on every item in a collection. - var invoke = restArguments(function(obj, path, args) { - var contextPath, func; - if (isFunction$1(path)) { - func = path; - } else { - path = toPath(path); - contextPath = path.slice(0, -1); - path = path[path.length - 1]; - } - return map(obj, function(context) { - var method = func; - if (!method) { - if (contextPath && contextPath.length) { - context = deepGet(context, contextPath); - } - if (context == null) return void 0; - method = context[path]; - } - return method == null ? method : method.apply(context, args); - }); - }); - - // Convenience version of a common use case of `_.map`: fetching a property. - function pluck(obj, key) { - return map(obj, property(key)); - } - - // Convenience version of a common use case of `_.filter`: selecting only - // objects containing specific `key:value` pairs. - function where(obj, attrs) { - return filter(obj, matcher(attrs)); - } - - // Return the maximum element (or element-based computation). - function max(obj, iteratee, context) { - var result = -Infinity, lastComputed = -Infinity, - value, computed; - if (iteratee == null || typeof iteratee == 'number' && typeof obj[0] != 'object' && obj != null) { - obj = isArrayLike(obj) ? obj : values(obj); - for (var i = 0, length = obj.length; i < length; i++) { - value = obj[i]; - if (value != null && value > result) { - result = value; - } - } - } else { - iteratee = cb(iteratee, context); - each(obj, function(v, index, list) { - computed = iteratee(v, index, list); - if (computed > lastComputed || computed === -Infinity && result === -Infinity) { - result = v; - lastComputed = computed; - } - }); - } - return result; - } - - // Return the minimum element (or element-based computation). - function min(obj, iteratee, context) { - var result = Infinity, lastComputed = Infinity, - value, computed; - if (iteratee == null || typeof iteratee == 'number' && typeof obj[0] != 'object' && obj != null) { - obj = isArrayLike(obj) ? obj : values(obj); - for (var i = 0, length = obj.length; i < length; i++) { - value = obj[i]; - if (value != null && value < result) { - result = value; - } - } - } else { - iteratee = cb(iteratee, context); - each(obj, function(v, index, list) { - computed = iteratee(v, index, list); - if (computed < lastComputed || computed === Infinity && result === Infinity) { - result = v; - lastComputed = computed; - } - }); - } - return result; - } - - // Sample **n** random values from a collection using the modern version of the - // [Fisher-Yates shuffle](https://en.wikipedia.org/wiki/Fisher–Yates_shuffle). - // If **n** is not specified, returns a single random element. - // The internal `guard` argument allows it to work with `_.map`. - function sample(obj, n, guard) { - if (n == null || guard) { - if (!isArrayLike(obj)) obj = values(obj); - return obj[random(obj.length - 1)]; - } - var sample = isArrayLike(obj) ? clone(obj) : values(obj); - var length = getLength(sample); - n = Math.max(Math.min(n, length), 0); - var last = length - 1; - for (var index = 0; index < n; index++) { - var rand = random(index, last); - var temp = sample[index]; - sample[index] = sample[rand]; - sample[rand] = temp; - } - return sample.slice(0, n); - } - - // Shuffle a collection. 
- function shuffle(obj) { - return sample(obj, Infinity); - } - - // Sort the object's values by a criterion produced by an iteratee. - function sortBy(obj, iteratee, context) { - var index = 0; - iteratee = cb(iteratee, context); - return pluck(map(obj, function(value, key, list) { - return { - value: value, - index: index++, - criteria: iteratee(value, key, list) - }; - }).sort(function(left, right) { - var a = left.criteria; - var b = right.criteria; - if (a !== b) { - if (a > b || a === void 0) return 1; - if (a < b || b === void 0) return -1; - } - return left.index - right.index; - }), 'value'); - } - - // An internal function used for aggregate "group by" operations. - function group(behavior, partition) { - return function(obj, iteratee, context) { - var result = partition ? [[], []] : {}; - iteratee = cb(iteratee, context); - each(obj, function(value, index) { - var key = iteratee(value, index, obj); - behavior(result, value, key); - }); - return result; - }; - } - - // Groups the object's values by a criterion. Pass either a string attribute - // to group by, or a function that returns the criterion. - var groupBy = group(function(result, value, key) { - if (has$1(result, key)) result[key].push(value); else result[key] = [value]; - }); - - // Indexes the object's values by a criterion, similar to `_.groupBy`, but for - // when you know that your index values will be unique. - var indexBy = group(function(result, value, key) { - result[key] = value; - }); - - // Counts instances of an object that group by a certain criterion. Pass - // either a string attribute to count by, or a function that returns the - // criterion. - var countBy = group(function(result, value, key) { - if (has$1(result, key)) result[key]++; else result[key] = 1; - }); - - // Split a collection into two arrays: one whose elements all pass the given - // truth test, and one whose elements all do not pass the truth test. - var partition = group(function(result, value, pass) { - result[pass ? 0 : 1].push(value); - }, true); - - // Safely create a real, live array from anything iterable. - var reStrSymbol = /[^\ud800-\udfff]|[\ud800-\udbff][\udc00-\udfff]|[\ud800-\udfff]/g; - function toArray(obj) { - if (!obj) return []; - if (isArray(obj)) return slice.call(obj); - if (isString(obj)) { - // Keep surrogate pair characters together. - return obj.match(reStrSymbol); - } - if (isArrayLike(obj)) return map(obj, identity); - return values(obj); - } - - // Return the number of elements in a collection. - function size(obj) { - if (obj == null) return 0; - return isArrayLike(obj) ? obj.length : keys(obj).length; - } - - // Internal `_.pick` helper function to determine whether `key` is an enumerable - // property name of `obj`. - function keyInObj(value, key, obj) { - return key in obj; - } - - // Return a copy of the object only containing the allowed properties. - var pick = restArguments(function(obj, keys) { - var result = {}, iteratee = keys[0]; - if (obj == null) return result; - if (isFunction$1(iteratee)) { - if (keys.length > 1) iteratee = optimizeCb(iteratee, keys[1]); - keys = allKeys(obj); - } else { - iteratee = keyInObj; - keys = flatten$1(keys, false, false); - obj = Object(obj); - } - for (var i = 0, length = keys.length; i < length; i++) { - var key = keys[i]; - var value = obj[key]; - if (iteratee(value, key, obj)) result[key] = value; - } - return result; - }); - - // Return a copy of the object without the disallowed properties. 
- var omit = restArguments(function(obj, keys) { - var iteratee = keys[0], context; - if (isFunction$1(iteratee)) { - iteratee = negate(iteratee); - if (keys.length > 1) context = keys[1]; - } else { - keys = map(flatten$1(keys, false, false), String); - iteratee = function(value, key) { - return !contains(keys, key); - }; - } - return pick(obj, iteratee, context); - }); - - // Returns everything but the last entry of the array. Especially useful on - // the arguments object. Passing **n** will return all the values in - // the array, excluding the last N. - function initial(array, n, guard) { - return slice.call(array, 0, Math.max(0, array.length - (n == null || guard ? 1 : n))); - } - - // Get the first element of an array. Passing **n** will return the first N - // values in the array. The **guard** check allows it to work with `_.map`. - function first(array, n, guard) { - if (array == null || array.length < 1) return n == null || guard ? void 0 : []; - if (n == null || guard) return array[0]; - return initial(array, array.length - n); - } - - // Returns everything but the first entry of the `array`. Especially useful on - // the `arguments` object. Passing an **n** will return the rest N values in the - // `array`. - function rest(array, n, guard) { - return slice.call(array, n == null || guard ? 1 : n); - } - - // Get the last element of an array. Passing **n** will return the last N - // values in the array. - function last(array, n, guard) { - if (array == null || array.length < 1) return n == null || guard ? void 0 : []; - if (n == null || guard) return array[array.length - 1]; - return rest(array, Math.max(0, array.length - n)); - } - - // Trim out all falsy values from an array. - function compact(array) { - return filter(array, Boolean); - } - - // Flatten out an array, either recursively (by default), or up to `depth`. - // Passing `true` or `false` as `depth` means `1` or `Infinity`, respectively. - function flatten(array, depth) { - return flatten$1(array, depth, false); - } - - // Take the difference between one array and a number of other arrays. - // Only the elements present in just the first array will remain. - var difference = restArguments(function(array, rest) { - rest = flatten$1(rest, true, true); - return filter(array, function(value){ - return !contains(rest, value); - }); - }); - - // Return a version of the array that does not contain the specified value(s). - var without = restArguments(function(array, otherArrays) { - return difference(array, otherArrays); - }); - - // Produce a duplicate-free version of the array. If the array has already - // been sorted, you have the option of using a faster algorithm. - // The faster algorithm will not work with an iteratee if the iteratee - // is not a one-to-one function, so providing an iteratee will disable - // the faster algorithm. - function uniq(array, isSorted, iteratee, context) { - if (!isBoolean(isSorted)) { - context = iteratee; - iteratee = isSorted; - isSorted = false; - } - if (iteratee != null) iteratee = cb(iteratee, context); - var result = []; - var seen = []; - for (var i = 0, length = getLength(array); i < length; i++) { - var value = array[i], - computed = iteratee ? 
iteratee(value, i, array) : value; - if (isSorted && !iteratee) { - if (!i || seen !== computed) result.push(value); - seen = computed; - } else if (iteratee) { - if (!contains(seen, computed)) { - seen.push(computed); - result.push(value); - } - } else if (!contains(result, value)) { - result.push(value); - } - } - return result; - } - - // Produce an array that contains the union: each distinct element from all of - // the passed-in arrays. - var union = restArguments(function(arrays) { - return uniq(flatten$1(arrays, true, true)); - }); - - // Produce an array that contains every item shared between all the - // passed-in arrays. - function intersection(array) { - var result = []; - var argsLength = arguments.length; - for (var i = 0, length = getLength(array); i < length; i++) { - var item = array[i]; - if (contains(result, item)) continue; - var j; - for (j = 1; j < argsLength; j++) { - if (!contains(arguments[j], item)) break; - } - if (j === argsLength) result.push(item); - } - return result; - } - - // Complement of zip. Unzip accepts an array of arrays and groups - // each array's elements on shared indices. - function unzip(array) { - var length = array && max(array, getLength).length || 0; - var result = Array(length); - - for (var index = 0; index < length; index++) { - result[index] = pluck(array, index); - } - return result; - } - - // Zip together multiple lists into a single array -- elements that share - // an index go together. - var zip = restArguments(unzip); - - // Converts lists into objects. Pass either a single array of `[key, value]` - // pairs, or two parallel arrays of the same length -- one of keys, and one of - // the corresponding values. Passing by pairs is the reverse of `_.pairs`. - function object(list, values) { - var result = {}; - for (var i = 0, length = getLength(list); i < length; i++) { - if (values) { - result[list[i]] = values[i]; - } else { - result[list[i][0]] = list[i][1]; - } - } - return result; - } - - // Generate an integer Array containing an arithmetic progression. A port of - // the native Python `range()` function. See - // [the Python documentation](https://docs.python.org/library/functions.html#range). - function range(start, stop, step) { - if (stop == null) { - stop = start || 0; - start = 0; - } - if (!step) { - step = stop < start ? -1 : 1; - } - - var length = Math.max(Math.ceil((stop - start) / step), 0); - var range = Array(length); - - for (var idx = 0; idx < length; idx++, start += step) { - range[idx] = start; - } - - return range; - } - - // Chunk a single array into multiple arrays, each containing `count` or fewer - // items. - function chunk(array, count) { - if (count == null || count < 1) return []; - var result = []; - var i = 0, length = array.length; - while (i < length) { - result.push(slice.call(array, i, i += count)); - } - return result; - } - - // Helper function to continue chaining intermediate results. - function chainResult(instance, obj) { - return instance._chain ? _$1(obj).chain() : obj; - } - - // Add your own custom functions to the Underscore object. - function mixin(obj) { - each(functions(obj), function(name) { - var func = _$1[name] = obj[name]; - _$1.prototype[name] = function() { - var args = [this._wrapped]; - push.apply(args, arguments); - return chainResult(this, func.apply(_$1, args)); - }; - }); - return _$1; - } - - // Add all mutator `Array` functions to the wrapper. 
- each(['pop', 'push', 'reverse', 'shift', 'sort', 'splice', 'unshift'], function(name) { - var method = ArrayProto[name]; - _$1.prototype[name] = function() { - var obj = this._wrapped; - if (obj != null) { - method.apply(obj, arguments); - if ((name === 'shift' || name === 'splice') && obj.length === 0) { - delete obj[0]; - } - } - return chainResult(this, obj); - }; - }); - - // Add all accessor `Array` functions to the wrapper. - each(['concat', 'join', 'slice'], function(name) { - var method = ArrayProto[name]; - _$1.prototype[name] = function() { - var obj = this._wrapped; - if (obj != null) obj = method.apply(obj, arguments); - return chainResult(this, obj); - }; - }); - - // Named Exports - - var allExports = { - __proto__: null, - VERSION: VERSION, - restArguments: restArguments, - isObject: isObject, - isNull: isNull, - isUndefined: isUndefined, - isBoolean: isBoolean, - isElement: isElement, - isString: isString, - isNumber: isNumber, - isDate: isDate, - isRegExp: isRegExp, - isError: isError, - isSymbol: isSymbol, - isArrayBuffer: isArrayBuffer, - isDataView: isDataView$1, - isArray: isArray, - isFunction: isFunction$1, - isArguments: isArguments$1, - isFinite: isFinite$1, - isNaN: isNaN$1, - isTypedArray: isTypedArray$1, - isEmpty: isEmpty, - isMatch: isMatch, - isEqual: isEqual, - isMap: isMap, - isWeakMap: isWeakMap, - isSet: isSet, - isWeakSet: isWeakSet, - keys: keys, - allKeys: allKeys, - values: values, - pairs: pairs, - invert: invert, - functions: functions, - methods: functions, - extend: extend, - extendOwn: extendOwn, - assign: extendOwn, - defaults: defaults, - create: create, - clone: clone, - tap: tap, - get: get, - has: has, - mapObject: mapObject, - identity: identity, - constant: constant, - noop: noop, - toPath: toPath$1, - property: property, - propertyOf: propertyOf, - matcher: matcher, - matches: matcher, - times: times, - random: random, - now: now, - escape: _escape, - unescape: _unescape, - templateSettings: templateSettings, - template: template, - result: result, - uniqueId: uniqueId, - chain: chain, - iteratee: iteratee, - partial: partial, - bind: bind, - bindAll: bindAll, - memoize: memoize, - delay: delay, - defer: defer, - throttle: throttle, - debounce: debounce, - wrap: wrap, - negate: negate, - compose: compose, - after: after, - before: before, - once: once, - findKey: findKey, - findIndex: findIndex, - findLastIndex: findLastIndex, - sortedIndex: sortedIndex, - indexOf: indexOf, - lastIndexOf: lastIndexOf, - find: find, - detect: find, - findWhere: findWhere, - each: each, - forEach: each, - map: map, - collect: map, - reduce: reduce, - foldl: reduce, - inject: reduce, - reduceRight: reduceRight, - foldr: reduceRight, - filter: filter, - select: filter, - reject: reject, - every: every, - all: every, - some: some, - any: some, - contains: contains, - includes: contains, - include: contains, - invoke: invoke, - pluck: pluck, - where: where, - max: max, - min: min, - shuffle: shuffle, - sample: sample, - sortBy: sortBy, - groupBy: groupBy, - indexBy: indexBy, - countBy: countBy, - partition: partition, - toArray: toArray, - size: size, - pick: pick, - omit: omit, - first: first, - head: first, - take: first, - initial: initial, - last: last, - rest: rest, - tail: rest, - drop: rest, - compact: compact, - flatten: flatten, - without: without, - uniq: uniq, - unique: uniq, - union: union, - intersection: intersection, - difference: difference, - unzip: unzip, - transpose: unzip, - zip: zip, - object: object, - range: range, - chunk: chunk, - 
mixin: mixin,
-    'default': _$1
-  };
-
-  // Default Export
-
-  // Add all of the Underscore functions to the wrapper object.
-  var _ = mixin(allExports);
-  // Legacy Node.js API.
-  _._ = _;
-
-  return _;
-
-})));
-//# sourceMappingURL=underscore-umd.js.map
diff --git a/docs/build/html/_static/underscore.js b/docs/build/html/_static/underscore.js
deleted file mode 100644
index cf177d4..0000000
--- a/docs/build/html/_static/underscore.js
+++ /dev/null
@@ -1,6 +0,0 @@
-!function(n,r){"object"==typeof exports&&"undefined"!=typeof module?module.exports=r():"function"==typeof define&&define.amd?define("underscore",r):(n="undefined"!=typeof globalThis?globalThis:n||self,function(){var t=n._,e=n._=r();e.noConflict=function(){return n._=t,e}}())}(this,(function(){
-// Underscore.js 1.13.1
-// https://underscorejs.org
-// (c) 2009-2021 Jeremy Ashkenas, Julian Gonggrijp, and DocumentCloud and Investigative Reporters & Editors
-// Underscore may be freely distributed under the MIT license.
-[minified Underscore.js 1.13.1 bundle (a single long line): mangled during text extraction and not reproduced here]
[diff header lost in extraction; a newly added Sphinx page titled "API" follows]
+API — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation
+[Sphinx/Read the Docs page scaffolding omitted: navigation sidebar, search box, and footer markup]
+API
+  quapy — QuaPy module for quantification
+© Copyright 2024, Alejandro Moreo. Built with Sphinx using a theme provided by Read the Docs.
\ No newline at end of file
diff --git a/docs/build/html/generated/quapy.html b/docs/build/html/generated/quapy.html
new file mode 100644
index 0000000..61ce026
--- /dev/null
+++ b/docs/build/html/generated/quapy.html
@@ -0,0 +1,106 @@
+quapy — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation
+[Sphinx/Read the Docs page scaffolding omitted]
+quapy
+  QuaPy module for quantification
+© Copyright 2024, Alejandro Moreo. Built with Sphinx using a theme provided by Read the Docs.
\ No newline at end of file
diff --git a/docs/build/html/genindex.html b/docs/build/html/genindex.html
deleted file mode 100644
index 5c2055f..0000000
--- a/docs/build/html/genindex.html
+++ /dev/null
@@ -1,1208 +0,0 @@
-Index — QuaPy 0.1.7 documentation
-Index
-
-[per-letter index listings A–Y omitted: the individual entry anchors were stripped in extraction]
-Indexed modules: quapy.data.reader, quapy.error, quapy.evaluation, quapy.functional,
-quapy.method, quapy.method.aggregative, quapy.method.base, quapy.method.meta,
-quapy.method.neural, quapy.method.non_aggregative, quapy.model_selection, quapy.plot,
-quapy.protocol, quapy.util
\ No newline at end of file
diff --git a/docs/build/html/index.html b/docs/build/html/index.html
deleted file mode 100644
index 14faf1f..0000000
--- a/docs/build/html/index.html
+++ /dev/null
@@ -1,245 +0,0 @@
-Welcome to QuaPy’s documentation! — QuaPy 0.1.7 documentation
-[page scaffolding omitted]
-Welcome to QuaPy’s documentation!
-QuaPy is an open source framework for Quantification (a.k.a. Supervised Prevalence Estimation) written in Python.
-
-Introduction
-
-QuaPy is built around the concept of the data sample, and provides implementations of the most important concepts in the quantification literature: well-known quantification baselines, many advanced quantification methods, quantification-oriented model selection, and a wide range of evaluation measures and protocols for evaluating quantification methods. QuaPy also integrates commonly used datasets and offers visualization tools that facilitate the analysis and interpretation of results.
-
-A quick example:
-
-The following script fetches a Twitter dataset, then trains and evaluates an Adjusted Classify & Count model in terms of the Mean Absolute Error (MAE) between the class prevalences estimated for the test set and its true prevalences.
-
-import quapy as qp
-from sklearn.linear_model import LogisticRegression
-
-dataset = qp.datasets.fetch_twitter('semeval16')
-
-# create an "Adjusted Classify & Count" quantifier
-model = qp.method.aggregative.ACC(LogisticRegression())
-model.fit(dataset.training)
-
-estim_prevalences = model.quantify(dataset.test.instances)
-true_prevalences  = dataset.test.prevalence()
-
-error = qp.error.mae(true_prevalences, estim_prevalences)
-
-print(f'Mean Absolute Error (MAE)={error:.3f}')
-
-
-

Quantification is useful in scenarios of prior probability shift. In other words, we would not be interested in estimating the class prevalences of the test set if the IID assumption held, since those prevalences would simply coincide with the class prevalences of the training set. For this reason, any quantification model should be tested across samples characterized by different class prevalences. QuaPy implements sampling procedures and evaluation protocols that automate this endeavour, as sketched below. See the Evaluation section for detailed examples.
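A minimal sketch of such a protocol-based evaluation (assuming the qp.protocol and qp.evaluation APIs introduced in v0.1.7, and reusing the dataset and model objects from the quick example above; exact signatures are indicative, not authoritative):

from quapy.protocol import APP

qp.environ['SAMPLE_SIZE'] = 100

# APP (artificial prevalence protocol) draws many test samples whose class
# prevalences sweep a grid of values, so the quantifier is stress-tested
# under prior probability shift instead of on a single test distribution
protocol = APP(dataset.test)

# mean MAE over all generated samples
mae = qp.evaluation.evaluate(model, protocol=protocol, error_metric='mae')
print(f'MAE under the artificial-prevalence protocol: {mae:.3f}')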

-
-
-

Features

-
    -
  • Implementation of most popular quantification methods (Classify-&-Count variants, Expectation-Maximization, SVM-based variants for quantification, HDy, QuaNet, and Ensembles).

  • -
  • Versatile functionality for performing evaluation based on artificial sampling protocols.

  • -
  • Implementation of most commonly used evaluation metrics (e.g., MAE, MRAE, MSE, NKLD, etc.).

  • -
  • -
    Popular datasets for Quantification (textual and numeric) available (a fetching sketch follows this list), including:
      -
    • 32 UCI Machine Learning datasets.

    • -
    • 11 Twitter Sentiment datasets.

    • -
    • 3 Reviews Sentiment datasets.

    • -
    • 4 tasks from LeQua competition (_new in v0.1.7!_)

    • -
    -
    -
    -
  • -
  • Native support for binary and single-label quantification scenarios.

  • -
  • Model selection functionality targeting quantification-oriented losses.

  • -
  • Visualization tools for analysing results.

  • -
- -
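A hedged sketch fetching one dataset from each family above (fetch_reviews and fetch_UCIDataset are assumed from the qp.datasets API, by analogy with fetch_twitter in the quick example; the dataset names are illustrative):

import quapy as qp

twitter = qp.datasets.fetch_twitter('semeval16')    # one of the 11 Twitter Sentiment datasets
reviews = qp.datasets.fetch_reviews('kindle')       # one of the 3 Reviews Sentiment datasets
uci     = qp.datasets.fetch_UCIDataset('yeast')     # one of the 32 UCI datasets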
-
-Indices and tables
-
-[genindex / module index / search links omitted]
\ No newline at end of file
diff --git a/docs/build/html/modules.html b/docs/build/html/modules.html
deleted file mode 100644
index dcad8b8..0000000
--- a/docs/build/html/modules.html
+++ /dev/null
@@ -1,138 +0,0 @@
-quapy — QuaPy 0.1.7 documentation
-[deleted page scaffolding omitted; the page held the top-level module table of contents]
\ No newline at end of file
diff --git a/docs/build/html/objects.inv b/docs/build/html/objects.inv
deleted file mode 100644
index 438922f..0000000
Binary files a/docs/build/html/objects.inv and /dev/null differ
diff --git a/docs/build/html/py-modindex.html b/docs/build/html/py-modindex.html
deleted file mode 100644
index f314385..0000000
--- a/docs/build/html/py-modindex.html
+++ /dev/null
@@ -1,222 +0,0 @@
-Python Module Index — QuaPy 0.1.7 documentation
-[page scaffolding omitted]

-Python Module Index
-q
-  quapy
-    quapy.classification
-    quapy.classification.calibration
-    quapy.classification.methods
-    quapy.classification.neural
-    quapy.classification.svmperf
-    quapy.data
-    quapy.data.base
-    quapy.data.datasets
-    quapy.data.preprocessing
-    quapy.data.reader
-    quapy.error
-    quapy.evaluation
-    quapy.functional
-    quapy.method
-    quapy.method.aggregative
-    quapy.method.base
-    quapy.method.meta
-    quapy.method.neural
-    quapy.method.non_aggregative
-    quapy.model_selection
-    quapy.plot
-    quapy.protocol
-    quapy.util
-[page scaffolding omitted]
\ No newline at end of file
diff --git a/docs/build/html/quapy.benchmarking.html b/docs/build/html/quapy.benchmarking.html
new file mode 100644
index 0000000..ab3831f
--- /dev/null
+++ b/docs/build/html/quapy.benchmarking.html
@@ -0,0 +1,119 @@
+quapy.benchmarking package — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation
+[page scaffolding omitted]
+quapy.benchmarking package
+Submodules
+quapy.benchmarking.typical module
+quapy.benchmarking.typical.wrap_cls_params(params)
+Module contents
+© Copyright 2024, Alejandro Moreo. Built with Sphinx using a theme provided by Read the Docs.
\ No newline at end of file
diff --git a/docs/build/html/quapy.classification.html b/docs/build/html/quapy.classification.html
deleted file mode 100644
index d382736..0000000
--- a/docs/build/html/quapy.classification.html
+++ /dev/null
@@ -1,973 +0,0 @@
-quapy.classification package — QuaPy 0.1.7 documentation
-[page scaffolding omitted]
quapy.classification package

-
-

Submodules

-
-
-

quapy.classification.calibration

-
-

New in version 0.1.7.

-
-
-
-class quapy.classification.calibration.BCTSCalibration(classifier, val_split=5, n_jobs=None, verbose=False)
-

Bases: RecalibratedProbabilisticClassifierBase

-

Applies the Bias-Corrected Temperature Scaling (BCTS) calibration method from abstention.calibration, as defined in the paper by Alexandari et al. (2020). A usage sketch follows this class description.

-
-
Parameters:
-
    -
  • classifier – a scikit-learn probabilistic classifier

  • -
  • val_split – an integer k, to perform k-fold cross-validation (kFCV) for obtaining the posterior probabilities, or a float p in (0,1), to obtain the posteriors on a stratified validation split containing a proportion p of the training instances (the rest is used for training). In any case, the classifier is retrained on the whole training set afterwards. Default value is 5.

  • -
  • n_jobs – indicate the number of parallel workers (only when val_split is an integer)

  • -
  • verbose – whether or not to display information in the standard output

  • -
-
-
-
- -
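A minimal usage sketch for this class (not part of the generated page; it assumes a QuaPy installation with the abstention dependency, and uses only the constructor arguments and methods documented here):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from quapy.classification.calibration import BCTSCalibration

X, y = make_classification(n_samples=1000, random_state=0)

# val_split=5: the posteriors used to learn the calibration map come from
# 5-fold cross-validation; the classifier is then retrained on all of X, y
calibrated = BCTSCalibration(LogisticRegression(), val_split=5, n_jobs=-1)
calibrated.fit(X, y)

posteriors = calibrated.predict_proba(X)   # shape (n_samples, n_classes)
labels = calibrated.predict(X)             # shape (n_samples,)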
-
-class quapy.classification.calibration.NBVSCalibration(classifier, val_split=5, n_jobs=None, verbose=False)
-

Bases: RecalibratedProbabilisticClassifierBase

-

Applies the No-Bias Vector Scaling (NBVS) calibration method from abstention.calibration, as defined in the paper by Alexandari et al. (2020).

-
-
Parameters:
-
    -
  • classifier – a scikit-learn probabilistic classifier

  • -
  • val_split – an integer k, to perform k-fold cross-validation (kFCV) for obtaining the posterior probabilities, or a float p in (0,1), to obtain the posteriors on a stratified validation split containing a proportion p of the training instances (the rest is used for training). In any case, the classifier is retrained on the whole training set afterwards. Default value is 5.

  • -
  • n_jobs – indicate the number of parallel workers (only when val_split is an integer)

  • -
  • verbose – whether or not to display information in the standard output

  • -
-
-
-
- -
-
-class quapy.classification.calibration.RecalibratedProbabilisticClassifier
-

Bases: object

-

Abstract class for the (re)calibration methods from abstention.calibration, as defined in Alexandari, A., Kundaje, A., & Shrikumar, A. (2020, November). Maximum likelihood with bias-corrected calibration is hard-to-beat at label shift adaptation. In International Conference on Machine Learning (pp. 222-232). PMLR.

-
- -
-
-class quapy.classification.calibration.RecalibratedProbabilisticClassifierBase(classifier, calibrator, val_split=5, n_jobs=None, verbose=False)
-

Bases: BaseEstimator, RecalibratedProbabilisticClassifier

-

Applies a (re)calibration method from abstention.calibration, as defined in the Alexandari et al. (2020) paper.

-
-
Parameters:
-
    -
  • classifier – a scikit-learn probabilistic classifier

  • -
  • calibrator – the calibration object (an instance of abstention.calibration.CalibratorFactory)

  • -
  • val_split – indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p -in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the -training instances (the rest is used for training). In any case, the classifier is retrained in the whole -training set afterwards. Default value is 5.

  • -
  • n_jobs – indicate the number of parallel workers (only when val_split is an integer); default=None

  • -
  • verbose – whether or not to display information in the standard output

  • -
-
-
-
-
-property classes_
-

Returns the classes on which the classifier has been trained

-
-
Returns:
-

array-like of shape (n_classes)

-
-
-
- -
-
-fit(X, y)
-

Fits the calibration for the probabilistic classifier.

-
-
Parameters:
-
    -
  • X – array-like of shape (n_samples, n_features) with the data instances

  • -
  • y – array-like of shape (n_samples,) with the class labels

  • -
-
-
Returns:
-

self

-
-
-
- -
-
-fit_cv(X, y)
-

Fits the calibration in a cross-validation manner, i.e., it generates posterior probabilities for all -training instances via cross-validation, and then retrains the classifier on all training instances. -The posterior probabilities thus generated are used for calibrating the outputs of the classifier.

-
-
Parameters:
-
    -
  • X – array-like of shape (n_samples, n_features) with the data instances

  • -
  • y – array-like of shape (n_samples,) with the class labels

  • -
-
-
Returns:
-

self

-
-
-
- -
-
-fit_tr_val(X, y)
-

Fits the calibration in a train/val-split manner, i.e., it partitions the training instances into a training and a validation set, uses the training samples to learn a classifier, and then uses that classifier to generate posterior probabilities for the held-out validation data. These posteriors are used to calibrate the classifier. The classifier is not retrained on the whole dataset.

-
-
Parameters:
-
    -
  • X – array-like of shape (n_samples, n_features) with the data instances

  • -
  • y – array-like of shape (n_samples,) with the class labels

  • -
-
-
Returns:
-

self

-
-
-
- -
-
-predict(X)
-

Predicts class labels for the data instances in X

-
-
Parameters:
-

X – array-like of shape (n_samples, n_features) with the data instances

-
-
Returns:
-

array-like of shape (n_samples,) with the class label predictions

-
-
-
- -
-
-predict_proba(X)
-

Generates posterior probabilities for the data instances in X

-
-
Parameters:
-

X – array-like of shape (n_samples, n_features) with the data instances

-
-
Returns:
-

array-like of shape (n_samples, n_classes) with posterior probabilities

-
-
-
- -
- -
-
-class quapy.classification.calibration.TSCalibration(classifier, val_split=5, n_jobs=None, verbose=False)
-

Bases: RecalibratedProbabilisticClassifierBase

-

Applies the Temperature Scaling (TS) calibration method from abstention.calibration, as defined in the Alexandari et al. (2020) paper.

-
-
Parameters:
-
    -
  • classifier – a scikit-learn probabilistic classifier

  • -
  • val_split – indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the training instances (the rest is used for training). In any case, the classifier is retrained on the whole training set afterwards. Default value is 5.

  • -
  • n_jobs – indicate the number of parallel workers (only when val_split is an integer)

  • -
  • verbose – whether or not to display information in the standard output

  • -
-
-
-
- -
-
-class quapy.classification.calibration.VSCalibration(classifier, val_split=5, n_jobs=None, verbose=False)
-

Bases: RecalibratedProbabilisticClassifierBase

-

Applies the Vector Scaling (VS) calibration method from abstention.calibration, as defined in the Alexandari et al. (2020) paper.

-
-
Parameters:
-
    -
  • classifier – a scikit-learn probabilistic classifier

  • -
  • val_split – indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the training instances (the rest is used for training). In any case, the classifier is retrained on the whole training set afterwards. Default value is 5.

  • -
  • n_jobs – indicate the number of parallel workers (only when val_split is an integer)

  • -
  • verbose – whether or not to display information in the standard output

  • -
-
-
-
- -
-
-

quapy.classification.methods

-
-
-class quapy.classification.methods.LowRankLogisticRegression(n_components=100, **kwargs)
-

Bases: BaseEstimator

-

An example of a classification method (i.e., an object that implements fit, predict, and predict_proba) -that also generates embedded inputs (i.e., that implements transform), as those required for -quapy.method.neural.QuaNet. This is a mock method to allow for easily instantiating -quapy.method.neural.QuaNet on array-like real-valued instances. -The transformation consists of applying sklearn.decomposition.TruncatedSVD -while classification is performed using sklearn.linear_model.LogisticRegression on the low-rank space.

-
-
Parameters:
-
    -
  • n_components – the number of principal components to retain

  • -
  • kwargs – parameters for the -Logistic Regression classifier

  • -
-
-
-
-
-fit(X, y)
-

Fit the model according to the given training data. The fit consists of -fitting TruncatedSVD and then LogisticRegression on the low-rank representation.

-
-
Parameters:
-
    -
  • X – array-like of shape (n_samples, n_features) with the instances

  • -
  • y – array-like of shape (n_samples,) with the class labels

  • -
-
-
Returns:
-

self

-
-
-
- -
-
-get_params(deep=True)
-

Get hyper-parameters for this estimator.

-
-
Parameters:
-

deep – compatibility with sklearn

-
-
Returns:
-

a dictionary with parameter names mapped to their values

-
-
-
- -
-
-predict(X)
-

Predicts labels for the instances X embedded into the low-rank space.

-
-
Parameters:
-

X – array-like of shape (n_samples, n_features) instances to classify

-
-
Returns:
-

a numpy array of length n containing the label predictions, where n is the number of -instances in X

-
-
-
- -
-
-predict_proba(X)
-

Predicts posterior probabilities for the instances X embedded into the low-rank space.

-
-
Parameters:
-

X – array-like of shape (n_samples, n_features) instances to classify

-
-
Returns:
-

array-like of shape (n_samples, n_classes) with the posterior probabilities

-
-
-
- -
-
-set_params(**params)
-

Set the parameters of this estimator.

-
-
Parameters:
-

parameters – a **kwargs dictionary with the estimator parameters for -Logistic Regression -and eventually also n_components for TruncatedSVD

-
-
-
- -
-
-transform(X)
-

Returns the low-rank approximation of X with n_components dimensions, or X unaltered if -n_components >= X.shape[1].

-
-
Parameters:
-

X – array-like of shape (n_samples, n_features) instances to embed

-
-
Returns:
-

array-like of shape (n_samples, n_components) with the embedded instances

-
-
-
- -
- -
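A minimal sketch of the intended usage (the dense synthetic data below is assumed only for illustration):

>>> import numpy as np
>>> from quapy.classification.methods import LowRankLogisticRegression
>>> rng = np.random.default_rng(0)
>>> X, y = rng.random((500, 100)), rng.integers(0, 2, size=500)
>>> cls = LowRankLogisticRegression(n_components=50).fit(X, y)  # 100 features -> 50 components
>>> Z = cls.transform(X)  # shape (500, 50): the embedded inputs QuaNet consumes
>>> P = cls.predict_proba(X)  # shape (500, 2): posterior probabilities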
-
-

quapy.classification.neural

-
-
-class quapy.classification.neural.CNNnet(vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, kernel_heights=[3, 5, 7], stride=1, padding=0, drop_p=0.5)
-

Bases: TextClassifierNet

-

An implementation of quapy.classification.neural.TextClassifierNet based on -Convolutional Neural Networks.

-
-
Parameters:
-
    -
  • vocabulary_size – the size of the vocabulary

  • -
  • n_classes – number of target classes

  • -
  • embedding_size – the dimensionality of the word embeddings space (default 100)

  • -
  • hidden_size – the dimensionality of the hidden space (default 256)

  • -
  • repr_size – the dimensionality of the document embeddings space (default 100)

  • -
  • kernel_heights – list of kernel lengths (default [3,5,7]), i.e., the number of -consecutive tokens that each kernel covers

  • -
  • stride – convolutional stride (default 1)

  • -
  • padding – convolutional padding (default 0)

  • -
  • drop_p – drop probability for dropout (default 0.5)

  • -
-
-
-
-
-document_embedding(input)
-

Embeds documents (i.e., performs the forward pass up to the -next-to-last layer).

-
-
Parameters:
-

input – a batch of instances, typically generated by a torch’s DataLoader -instance (see quapy.classification.neural.TorchDataset)

-
-
Returns:
-

a torch tensor of shape (n_samples, n_dimensions), where -n_samples is the number of documents, and n_dimensions is the -dimensionality of the embedding

-
-
-
- -
-
-get_params()
-

Get hyper-parameters for this estimator

-
-
Returns:
-

a dictionary with parameter names mapped to their values

-
-
-
- -
-
-training: bool
-
- -
-
-property vocabulary_size
-

Return the size of the vocabulary

-
-
Returns:
-

integer

-
-
-
- -
- -
-
-class quapy.classification.neural.LSTMnet(vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, lstm_class_nlayers=1, drop_p=0.5)
-

Bases: TextClassifierNet

-

An implementation of quapy.classification.neural.TextClassifierNet based on -Long Short Term Memory networks.

-
-
Parameters:
-
    -
  • vocabulary_size – the size of the vocabulary

  • -
  • n_classes – number of target classes

  • -
  • embedding_size – the dimensionality of the word embeddings space (default 100)

  • -
  • hidden_size – the dimensionality of the hidden space (default 256)

  • -
  • repr_size – the dimensionality of the document embeddings space (default 100)

  • -
  • lstm_class_nlayers – number of LSTM layers (default 1)

  • -
  • drop_p – drop probability for dropout (default 0.5)

  • -
-
-
-
-
-document_embedding(x)
-

Embeds documents (i.e., performs the forward pass up to the -next-to-last layer).

-
-
Parameters:
-

x – a batch of instances, typically generated by a torch’s DataLoader -instance (see quapy.classification.neural.TorchDataset)

-
-
Returns:
-

a torch tensor of shape (n_samples, n_dimensions), where -n_samples is the number of documents, and n_dimensions is the -dimensionality of the embedding

-
-
-
- -
-
-get_params()
-

Get hyper-parameters for this estimator

-
-
Returns:
-

a dictionary with parameter names mapped to their values

-
-
-
- -
-
-training: bool
-
- -
-
-property vocabulary_size
-

Return the size of the vocabulary

-
-
Returns:
-

integer

-
-
-
- -
- -
-
-class quapy.classification.neural.NeuralClassifierTrainer(net: TextClassifierNet, lr=0.001, weight_decay=0, patience=10, epochs=200, batch_size=64, batch_size_test=512, padding_length=300, device='cuda', checkpointpath='../checkpoint/classifier_net.dat')
-

Bases: object

-

Trains a neural network for text classification.

-
-
Parameters:
-
    -
  • net – an instance of TextClassifierNet implementing the forward pass

  • -
  • lr – learning rate (default 1e-3)

  • -
  • weight_decay – weight decay (default 0)

  • -
  • patience – number of epochs that do not show any improvement in validation -to wait before applying early stop (default 10)

  • -
  • epochs – maximum number of training epochs (default 200)

  • -
  • batch_size – batch size for training (default 64)

  • -
  • batch_size_test – batch size for test (default 512)

  • -
  • padding_length – maximum number of tokens to consider in a document (default 300)

  • -
  • device – specify 'cpu' or 'cuda' (default) for enabling gpu

  • -
  • checkpointpath – where to store the parameters of the best model found so far -according to the evaluation in the held-out validation split (default ‘../checkpoint/classifier_net.dat’)

  • -
-
-
-
-
-property device
-

Gets the device in which the network is allocated

-
-
Returns:
-

device

-
-
-
- -
-
-fit(instances, labels, val_split=0.3)
-

Fits the model according to the given training data.

-
-
Parameters:
-
    -
  • instances – list of lists of indexed tokens

  • -
  • labels – array-like of shape (n_samples, n_classes) with the class labels

  • -
  • val_split – proportion of training documents to be taken as the validation set (default 0.3)

  • -
-
-
Returns:
-

-
-
-
- -
-
-get_params()
-

Get hyper-parameters for this estimator

-
-
Returns:
-

a dictionary with parameter names mapped to their values

-
-
-
- -
-
-predict(instances)
-

Predicts labels for the instances

-
-
Parameters:
-

instances – list of lists of indexed tokens

-
-
Returns:
-

a numpy array of length n containing the label predictions, where n is the number of -instances in X

-
-
-
- -
-
-predict_proba(instances)
-

Predicts posterior probabilities for the instances

-
-
Parameters:
-

instances – list of lists of indexed tokens

-
-
Returns:
-

array-like of shape (n_samples, n_classes) with the posterior probabilities

-
-
-
- -
-
-reset_net_params(vocab_size, n_classes)
-

Reinitialize the network parameters

-
-
Parameters:
-
    -
  • vocab_size – the size of the vocabulary

  • -
  • n_classes – the number of target classes

  • -
-
-
-
- -
-
-set_params(**params)
-

Set the parameters of this trainer and of the learner it is training. In the current version, parameter names for the trainer and the learner must be disjoint.

-
-
Parameters:
-

params – a **kwargs dictionary with the parameters

-
-
-
- -
-
-transform(instances)
-

Returns the embeddings of the instances

-
-
Parameters:
-

instances – list of lists of indexed tokens

-
-
Returns:
-

array-like of shape (n_samples, embed_size) with the embedded instances, -where embed_size is defined by the classification network

-
-
-
- -
- -
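A hedged sketch of the training loop (requires PyTorch; the random token ids below merely stand in for real indexed documents, e.g., those produced by quapy.data.preprocessing.IndexTransformer):

>>> import numpy as np
>>> from quapy.classification.neural import CNNnet, NeuralClassifierTrainer
>>> rng = np.random.default_rng(0)
>>> tokens = [rng.integers(0, 5000, size=50).tolist() for _ in range(200)]  # fake indexed docs
>>> labels = rng.integers(0, 2, size=200)
>>> net = CNNnet(vocabulary_size=5000, n_classes=2)
>>> trainer = NeuralClassifierTrainer(net, lr=1e-3, device='cpu', epochs=5)
>>> trainer.fit(tokens, labels, val_split=0.3)
>>> embeddings = trainer.transform(tokens)  # document embeddings of size repr_size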
-
-class quapy.classification.neural.TextClassifierNet
-

Bases: Module

-

Abstract Text classifier (torch.nn.Module)

-
-
-dimensions()
-

Gets the number of dimensions of the embedding space

-
-
Returns:
-

integer

-
-
-
- -
-
-abstract document_embedding(x)
-

Embeds documents (i.e., performs the forward pass up to the -next-to-last layer).

-
-
Parameters:
-

x – a batch of instances, typically generated by a torch’s DataLoader -instance (see quapy.classification.neural.TorchDataset)

-
-
Returns:
-

a torch tensor of shape (n_samples, n_dimensions), where -n_samples is the number of documents, and n_dimensions is the -dimensionality of the embedding

-
-
-
- -
-
-forward(x)
-

Performs the forward pass.

-
-
Parameters:
-

x – a batch of instances, typically generated by a torch’s DataLoader -instance (see quapy.classification.neural.TorchDataset)

-
-
Returns:
-

a tensor of shape (n_instances, n_classes) with the decision scores -for each of the instances and classes

-
-
-
- -
-
-abstract get_params()
-

Get hyper-parameters for this estimator

-
-
Returns:
-

a dictionary with parameter names mapped to their values

-
-
-
- -
-
-predict_proba(x)
-

Predicts posterior probabilities for the instances in x

-
-
Parameters:
-

x – a torch tensor of indexed tokens with shape (n_instances, pad_length), where n_instances is the number of instances in the batch, and pad_length is the length of the padding in the batch

-
-
Returns:
-

array-like of shape (n_samples, n_classes) with the posterior probabilities

-
-
-
- -
-
-training: bool
-
- -
-
-property vocabulary_size
-

Return the size of the vocabulary

-
-
Returns:
-

integer

-
-
-
- -
-
-xavier_uniform()
-

Performs Xavier initialization of the network parameters

-
- -
- -
-
-class quapy.classification.neural.TorchDataset(instances, labels=None)
-

Bases: Dataset

-

Transforms labelled instances into a Torch’s torch.utils.data.DataLoader object

-
-
Parameters:
-
    -
  • instances – list of lists of indexed tokens

  • -
  • labels – array-like of shape (n_samples, n_classes) with the class labels

  • -
-
-
-
-
-asDataloader(batch_size, shuffle, pad_length, device)
-

Converts the labelled collection into a Torch DataLoader with dynamic padding for -the batch

-
-
Parameters:
-
    -
  • batch_size – batch size

  • -
  • shuffle – whether or not to shuffle instances

  • -
  • pad_length – the maximum length for the list of tokens (dynamic padding is applied, meaning that if the longest document in the batch is shorter than pad_length, then the batch is padded up to that length, and not to pad_length)

  • -
  • device – whether to allocate tensors in cpu or in cuda

  • -
-
-
Returns:
-

a torch.utils.data.DataLoader object

-
-
-
- -
- -
-
-

quapy.classification.svmperf

-
-
-class quapy.classification.svmperf.SVMperf(svmperf_base, C=0.01, verbose=False, loss='01', host_folder=None)
-

Bases: BaseEstimator, ClassifierMixin

-

A wrapper for the SVM-perf package by Thorsten Joachims. -When using losses for quantification, the source code has to be patched. See -the installation documentation -for further details.

-

References:

-
-
-
-
Parameters:
-
    -
  • svmperf_base – path to directory containing the binary files svm_perf_learn and svm_perf_classify

  • -
  • C – trade-off between training error and margin (default 0.01)

  • -
  • verbose – set to True to print svm-perf std outputs

  • -
  • loss – the loss to optimize for. Available losses are “01”, “f1”, “kld”, “nkld”, “q”, “qacc”, “qf1”, “qgm”, “mae”, “mrae”.

  • -
  • host_folder – directory where to store the trained model; set to None (default) for using a tmp directory -(temporal directories are automatically deleted)

  • -
-
-
-
-
-decision_function(X, y=None)
-

Evaluate the decision function for the samples in X.

-
-
Parameters:
-
    -
  • X – array-like of shape (n_samples, n_features) containing the instances to classify

  • -
  • y – unused

  • -
-
-
Returns:
-

array-like of shape (n_samples,) containing the decision scores of the instances

-
-
-
- -
-
-fit(X, y)
-

Trains the SVM for the multivariate performance loss

-
-
Parameters:
-
    -
  • X – training instances

  • -
  • y – a binary vector of labels

  • -
-
-
Returns:
-

self

-
-
-
- -
-
-predict(X)
-

Predicts labels for the instances X

-
-
Parameters:
-

X – array-like of shape (n_samples, n_features) instances to classify

-
-
Returns:
-

a numpy array of length n containing the label predictions, where n is the number of -instances in X

-
-
-
- -
-
-valid_losses = {'01': 0, 'f1': 1, 'kld': 12, 'mae': 26, 'mrae': 27, 'nkld': 13, 'q': 22, 'qacc': 23, 'qf1': 24, 'qgm': 25}
-
- -
- -
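A usage sketch (hypothetical paths and data: svmperf_base must point to a directory with the patched svm_perf_learn and svm_perf_classify binaries, and Xtr, ytr, Xte stand for a binary dataset that is assumed to be given):

>>> from quapy.classification.svmperf import SVMperf
>>> svm = SVMperf(svmperf_base='./svm_perf_quantification', C=0.01, loss='kld')
>>> svm.fit(Xtr, ytr)  # ytr must be a binary vector of labels
>>> scores = svm.decision_function(Xte)  # decision scores, shape (n_samples,)
>>> labels = svm.predict(Xte)  # binary label predictions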
-
-

Module contents

-
-
\ No newline at end of file
diff --git a/docs/build/html/quapy.data.html b/docs/build/html/quapy.data.html
deleted file mode 100644
index ef262fd..0000000
--- a/docs/build/html/quapy.data.html
+++ /dev/null
@@ -1,1118 +0,0 @@
- quapy.data package — QuaPy 0.1.7 documentation

quapy.data package

-
-

Submodules

-
-
-

quapy.data.base

-
-
-class quapy.data.base.Dataset(training: LabelledCollection, test: LabelledCollection, vocabulary: Optional[dict] = None, name='')
-

Bases: object

-

Abstraction of training and test LabelledCollection objects.

-
-
Parameters:
-
    -
  • training – a LabelledCollection instance

  • -
  • test – a LabelledCollection instance

  • -
  • vocabulary – if indicated, is a dictionary of the terms used in this textual dataset

  • -
  • name – a string representing the name of the dataset

  • -
-
-
-
-
-classmethod SplitStratified(collection: LabelledCollection, train_size=0.6)
-

Generates a Dataset from a stratified split of a LabelledCollection instance. -See LabelledCollection.split_stratified()

-
-
Parameters:
-
    -
  • collectionLabelledCollection

  • -
  • train_size – the proportion of training documents (the rest conforms the test split)

  • -
-
-
Returns:
-

an instance of Dataset

-
-
-
- -
-
-property binary
-

Returns True if the training collection is labelled according to two classes

-
-
Returns:
-

boolean

-
-
-
- -
-
-property classes_
-

The classes according to which the training collection is labelled

-
-
Returns:
-

The classes according to which the training collection is labelled

-
-
-
- -
-
-classmethod kFCV(data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0)
-

Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around -LabelledCollection.kFCV() that returns Dataset instances made of training and test folds.

-
-
Parameters:
-
    -
  • nfolds – integer (default 5), the number of folds to generate

  • -
  • nrepeats – integer (default 1), the number of rounds of k-fold cross validation to run

  • -
  • random_state – integer (default 0), guarantees that the folds generated are reproducible

  • -
-
-
Returns:
-

yields nfolds * nrepeats folds for k-fold cross validation as instances of Dataset

-
-
-
- -
-
-classmethod load(train_path, test_path, loader_func: callable, classes=None, **loader_kwargs)
-

Loads a training and a test labelled set of data and converts them into a Dataset instance. The function in charge of reading the instances must be specified. This function can be a custom one, or any of the reading functions defined in quapy.data.reader module.

-
-
Parameters:
-
    -
  • train_path – string, the path to the file containing the training instances

  • -
  • test_path – string, the path to the file containing the test instances

  • -
  • loader_func – a custom function that implements the data loader and returns a tuple with instances and -labels

  • -
  • classes – array-like, the classes according to which the instances are labelled

  • -
  • loader_kwargs – any argument that the loader_func function needs in order to read the instances. -See LabelledCollection.load() for further details.

  • -
-
-
Returns:
-

a Dataset object

-
-
-
- -
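For instance (a sketch with hypothetical file names, assuming each line follows the <0 or 1> <document> format read by quapy.data.reader.from_text()):

>>> from quapy.data.base import Dataset
>>> from quapy.data.reader import from_text
>>> dataset = Dataset.load('train.txt', 'test.txt', loader_func=from_text)
>>> train, test = dataset.train_test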
-
-property n_classes
-

The number of classes according to which the training collection is labelled

-
-
Returns:
-

integer

-
-
-
- -
-
-reduce(n_train=100, n_test=100)
-

Reduce the number of instances in place for quick experiments. Preserves the prevalence of each set.

-
-
Parameters:
-
    -
  • n_train – number of training documents to keep (default 100)

  • -
  • n_test – number of test documents to keep (default 100)

  • -
-
-
Returns:
-

self

-
-
-
- -
-
-stats(show=True)
-

Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:

-
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
->>> data.stats()
->>> Dataset=kindle #tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]
-
-
-
-
Parameters:
-

show – if set to True (default), prints the stats in standard output

-
-
Returns:
-

a dictionary containing some stats of this collection for the training and test collections. The keys -are train and test, and point to dedicated dictionaries of stats, for each collection, with keys -#instances (the number of instances), type (the type representing the instances), -#features (the number of features, if the instances are in array-like format), #classes (the classes of -the collection), prevs (the prevalence values for each class)

-
-
-
- -
-
-property train_test
-

Alias to self.training and self.test

-
-
Returns:
-

the training and test collections

-
-
-
-
- -
-
-property vocabulary_size
-

If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary

-
-
Returns:
-

integer

-
-
-
- -
- -
-
-class quapy.data.base.LabelledCollection(instances, labels, classes=None)
-

Bases: object

-

A LabelledCollection is a set of objects, each with a label attached. This class implements several sampling routines and other utilities.

-
-
Parameters:
-
    -
  • instances – array-like (np.ndarray, list, or csr_matrix are supported)

  • -
  • labels – array-like with the same length of instances

  • -
  • classes – optional, list of classes from which labels are taken. If not specified, the classes are inferred -from the labels. The classes must be indicated in cases in which some of the labels might have no examples -(i.e., a prevalence of 0)

  • -
-
-
-
-
-property X
-

An alias to self.instances

-
-
Returns:
-

self.instances

-
-
-
- -
-
-property Xp
-

Gets the instances and the true prevalence. This is useful when implementing evaluation protocols from -a LabelledCollection object.

-
-
Returns:
-

a tuple (instances, prevalence) from this collection

-
-
-
- -
-
-property Xy
-

Gets the instances and labels. This is useful when working with sklearn estimators, e.g.:

-
>>> svm = LinearSVC().fit(*my_collection.Xy)
-
-
-
-
Returns:
-

a tuple (instances, labels) from this collection

-
-
-
- -
-
-property binary
-

Returns True if the number of classes is 2

-
-
Returns:
-

boolean

-
-
-
- -
-
-counts()
-

Returns the number of instances for each of the classes in the codeframe.

-
-
Returns:
-

a np.ndarray of shape (n_classes) with the number of instances of each class, in the same order -as listed by self.classes_

-
-
-
- -
-
-classmethod join(*args: Iterable[LabelledCollection])
-

Returns a new LabelledCollection as the union of the collections given in input.

-
-
Parameters:
-

args – instances of LabelledCollection

-
-
Returns:
-

a LabelledCollection representing the union of both collections

-
-
-
- -
-
-kFCV(nfolds=5, nrepeats=1, random_state=None)
-

Generator of stratified folds to be used in k-fold cross validation.

-
-
Parameters:
-
    -
  • nfolds – integer (default 5), the number of folds to generate

  • -
  • nrepeats – integer (default 1), the number of rounds of k-fold cross validation to run

  • -
  • random_state – integer (default None); set to an integer to guarantee that the folds generated are reproducible

  • -
-
-
Returns:
-

yields nfolds * nrepeats folds for k-fold cross validation

-
-
-
- -
-
-classmethod load(path: str, loader_func: callable, classes=None, **loader_kwargs)
-

Loads a labelled set of data and converts it into a LabelledCollection instance. The function in charge of reading the instances must be specified. This function can be a custom one, or any of the reading functions defined in quapy.data.reader module.

-
-
Parameters:
-
    -
  • path – string, the path to the file containing the labelled instances

  • -
  • loader_func – a custom function that implements the data loader and returns a tuple with instances and -labels

  • -
  • classes – array-like, the classes according to which the instances are labelled

  • -
  • loader_kwargs – any argument that the loader_func function needs in order to read the instances, i.e., -these arguments are used to call loader_func(path, **loader_kwargs)

  • -
-
-
Returns:
-

a LabelledCollection object

-
-
-
- -
-
-property n_classes
-

The number of classes

-
-
Returns:
-

integer

-
-
-
- -
-
-property p
-

An alias to self.prevalence()

-
-
Returns:
-

self.prevalence()

-
-
-
- -
-
-prevalence()
-

Returns the prevalence, or relative frequency, of the classes in the codeframe.

-
-
Returns:
-

a np.ndarray of shape (n_classes) with the relative frequencies of each class, in the same order -as listed by self.classes_

-
-
-
- -
-
-sampling(size, *prevs, shuffle=True, random_state=None)
-

Return a random sample (an instance of LabelledCollection) of desired size and desired prevalence values. For each class, the sampling is drawn with replacement if the requested prevalence is larger than the actual prevalence of the class, or without replacement otherwise.

-
-
Parameters:
-
    -
  • size – integer, the requested size

  • -
  • prevs – the prevalence for each class; the prevalence value for the last class can be left empty since it is constrained. E.g., for binary collections, only the prevalence p for the first class (as listed in self.classes_) can be specified, while the other class takes prevalence value 1-p

  • -
  • shuffle – if set to True (default), shuffles the index before returning it

  • -
  • random_state – seed for reproducing sampling

  • -
-
-
Returns:
-

an instance of LabelledCollection with length == size and prevalence close to prevs (or -prevalence == prevs if the exact prevalence values can be met as proportions of instances)

-
-
-
- -
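For example (a minimal sketch on synthetic data):

>>> import numpy as np
>>> from quapy.data.base import LabelledCollection
>>> rng = np.random.default_rng(0)
>>> data = LabelledCollection(rng.random((1000, 10)), rng.integers(0, 2, size=1000))
>>> sample = data.sampling(200, 0.3, random_state=0)  # 30% prevalence for the first class
>>> sample.prevalence()  # approximately [0.3, 0.7]; the last class is constrained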
-
-sampling_from_index(index)
-

Returns an instance of LabelledCollection whose elements are sampled from this collection using the -index.

-
-
Parameters:
-

index – np.ndarray

-
-
Returns:
-

an instance of LabelledCollection

-
-
-
- -
-
-sampling_index(size, *prevs, shuffle=True, random_state=None)
-

Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the -prevalence values are not specified, then returns the index of a uniform sampling. -For each class, the sampling is drawn with replacement if the requested prevalence is larger than -the actual prevalence of the class, or without replacement otherwise.

-
-
Parameters:
-
    -
  • size – integer, the requested size

  • -
  • prevs – the prevalence for each class; the prevalence value for the last class can be left empty since it is constrained. E.g., for binary collections, only the prevalence p for the first class (as listed in self.classes_) can be specified, while the other class takes prevalence value 1-p

  • -
  • shuffle – if set to True (default), shuffles the index before returning it

  • -
  • random_state – seed for reproducing sampling

  • -
-
-
Returns:
-

a np.ndarray of shape (size) with the indexes

-
-
-
- -
-
-split_random(train_prop=0.6, random_state=None)
-

Returns two instances of LabelledCollection split randomly from this collection, at desired -proportion.

-
-
Parameters:
-
    -
  • train_prop – the proportion of elements to include in the left-most returned collection (typically used -as the training collection). The rest of elements are included in the right-most returned collection -(typically used as a test collection).

  • -
  • random_state – if specified, guarantees reproducibility of the split.

  • -
-
-
Returns:
-

two instances of LabelledCollection, the first one with train_prop elements, and the -second one with 1-train_prop elements

-
-
-
- -
-
-split_stratified(train_prop=0.6, random_state=None)
-

Returns two instances of LabelledCollection split with stratification from this collection, at desired -proportion.

-
-
Parameters:
-
    -
  • train_prop – the proportion of elements to include in the left-most returned collection (typically used -as the training collection). The rest of elements are included in the right-most returned collection -(typically used as a test collection).

  • -
  • random_state – if specified, guarantees reproducibility of the split.

  • -
-
-
Returns:
-

two instances of LabelledCollection, the first one with train_prop elements, and the -second one with 1-train_prop elements

-
-
-
- -
-
-stats(show=True)
-

Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:

-
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
->>> data.training.stats()
->>> #instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]
-
-
-
-
Parameters:
-

show – if set to True (default), prints the stats in standard output

-
-
Returns:
-

a dictionary containing some stats of this collection. Keys include #instances (the number of -instances), type (the type representing the instances), #features (the number of features, if the -instances are in array-like format), #classes (the classes of the collection), prevs (the prevalence -values for each class)

-
-
-
- -
-
-uniform_sampling(size, random_state=None)
-

Returns a uniform sample (an instance of LabelledCollection) of desired size. The sampling is drawn -with replacement if the requested size is greater than the number of instances, or without replacement -otherwise.

-
-
Parameters:
-
    -
  • size – integer, the requested size

  • -
  • random_state – if specified, guarantees reproducibility of the split.

  • -
-
-
Returns:
-

an instance of LabelledCollection with length == size

-
-
-
- -
-
-uniform_sampling_index(size, random_state=None)
-

Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn -with replacement if the requested size is greater than the number of instances, or without replacement -otherwise.

-
-
Parameters:
-
    -
  • size – integer, the size of the uniform sample

  • -
  • random_state – if specified, guarantees reproducibility of the split.

  • -
-
-
Returns:
-

a np.ndarray of shape (size) with the indexes

-
-
-
- -
-
-property y
-

An alias to self.labels

-
-
Returns:
-

self.labels

-
-
-
- -
- -
-
-

quapy.data.datasets

-
-
-quapy.data.datasets.fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) Dataset
-

Loads a UCI dataset as an instance of quapy.data.base.Dataset, as used in Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. Information Fusion, 34, 87-100, and Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). Dynamic ensemble selection for quantification tasks. Information Fusion, 45, 1-15. The datasets do not come with a predefined train-test split (see fetch_UCILabelledCollection() for further information on how to use these collections), and so a train-test split is generated at the desired proportion. The list of valid dataset names can be accessed in quapy.data.datasets.UCI_DATASETS

-
-
Parameters:
-
    -
  • dataset_name – a dataset name

  • -
  • data_home – specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quapy_data/ directory)

  • -
  • test_split – proportion of documents to be included in the test set. The rest conforms the training set

  • -
  • verbose – set to True (default is False) to get information (from the UCI ML repository) about the datasets

  • -
-
-
Returns:
-

a quapy.data.base.Dataset instance

-
-
-
- -
-
-quapy.data.datasets.fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) → LabelledCollection
-

Loads a UCI collection as an instance of quapy.data.base.LabelledCollection, as used in -Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). -Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. -Information Fusion, 34, 87-100. -and -Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). -Dynamic ensemble selection for quantification tasks. -Information Fusion, 45, 1-15.. -The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation -protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation. -This can be reproduced by using quapy.data.base.Dataset.kFCV(), e.g.:

-
>>> import quapy as qp
->>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
->>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
->>>     ...
-
-
-

The list of valid dataset names can be accessed in quapy.data.datasets.UCI_DATASETS

-
-
Parameters:
-
    -
  • dataset_name – a dataset name

  • -
  • data_home – specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quapy_data/ directory)

  • -
  • verbose – set to True (default is False) to get information (from the UCI ML repository) about the datasets

  • -
-
-
Returns:
-

a quapy.data.base.LabelledCollection instance

-
-
-
- -
-
-quapy.data.datasets.fetch_lequa2022(task, data_home=None)
-

Loads the official datasets provided for the LeQua competition. In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead. Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B are multiclass quantification problems consisting of estimating the class prevalence values of 28 different merchandise products. We refer to Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022). A Detailed Overview of LeQua@CLEF 2022: Learning to Quantify, for a detailed description of the tasks and datasets.

-

The datasets are downloaded only once, and stored for fast reuse.

-

See lequa2022_experiments.py provided in the example folder, that can serve as a guide on how to use these -datasets.

-
-
Parameters:
-
    -
  • task – a string representing the task name; valid ones are T1A, T1B, T2A, and T2B

  • -
  • data_home – specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quapy_data/ directory)

  • -
-
-
Returns:
-

a tuple (train, val_gen, test_gen) where train is an instance of -quapy.data.base.LabelledCollection, val_gen and test_gen are instances of -quapy.protocol.SamplesFromDir, i.e., are sampling protocols that return a series of samples -labelled by prevalence.

-
-
-
- -
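For example (downloads the data on first use):

>>> import quapy as qp
>>> train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1A')
>>> train.stats()  # prints #instances, #features, #classes, prevs of the training set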
-
-quapy.data.datasets.fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) Dataset
-

Loads a Reviews dataset as a Dataset instance, as used in -Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.” -Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.. -The list of valid dataset names can be accessed in quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS

-
-
Parameters:
-
    -
  • dataset_name – the name of the dataset: valid ones are ‘hp’, ‘kindle’, ‘imdb’

  • -
  • tfidf – set to True to transform the raw documents into tfidf weighted matrices

  • -
  • min_df – minimum number of documents that should contain a term in order for the term to be kept (ignored if tfidf==False)

  • -
  • data_home – specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quapy_data/ directory)

  • -
  • pickle – set to True to pickle the Dataset object the first time it is generated, in order to allow for faster subsequent invocations

  • -
-
-
Returns:
-

a quapy.data.base.Dataset instance

-
-
-
- -
-
-quapy.data.datasets.fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) Dataset
-

Loads a Twitter dataset as a quapy.data.base.Dataset instance, as used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. Social Network Analysis and Mining 6(19), 1–22 (2016). Note that the datasets 'semeval13', 'semeval14', and 'semeval15' share the same training set. The list of valid dataset names corresponding to training sets can be accessed in quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN, while the test sets can be accessed in quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST

-
-
Parameters:
-
    -
  • dataset_name – the name of the dataset: valid ones are ‘gasp’, ‘hcr’, ‘omd’, ‘sanders’, ‘semeval13’, -‘semeval14’, ‘semeval15’, ‘semeval16’, ‘sst’, ‘wa’, ‘wb’

  • -
  • for_model_selection – if True, then returns the train split as the training set and the devel split -as the test set; if False, then returns the train+devel split as the training set and the test set as the -test set

  • -
  • min_df – minimum number of documents that should contain a term in order for the term to be kept

  • -
  • data_home – specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quapy_data/ directory)

  • -
  • pickle – set to True to pickle the Dataset object the first time it is generated, in order to allow for faster subsequent invocations

  • -
-
-
Returns:
-

a quapy.data.base.Dataset instance

-
-
-
- -
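For example (downloads the data on first use):

>>> import quapy as qp
>>> data = qp.datasets.fetch_twitter('hcr', min_df=5, pickle=True)
>>> train, test = data.train_test
>>> train.prevalence(), test.prevalence()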
-
-quapy.data.datasets.warn(*args, **kwargs)
-
- -
-
-

quapy.data.preprocessing

-
-
-class quapy.data.preprocessing.IndexTransformer(**kwargs)
-

Bases: object

-

This class implements a sklearn-style transformer that indexes text as numerical ids for the tokens it contains, as would be generated by sklearn's CountVectorizer

-
-
Parameters:
-

kwargs

keyword arguments passed to sklearn's CountVectorizer

-

-
-
-
-
-add_word(word, id=None, nogaps=True)
-

Adds a new token (regardless of whether it has been found in the text or not), with dedicated id. -Useful to define special tokens for codifying unknown words, or padding tokens.

-
-
Parameters:
-
    -
  • word – string, surface form of the token

  • -
  • id – integer, numerical value to assign to the token (leave as None for indicating the next valid id, -default)

  • -
  • nogaps – if set to True (default) asserts that the id indicated leads to no numerical gaps with -precedent ids stored so far

  • -
-
-
Returns:
-

integer, the numerical id for the new token

-
-
-
- -
-
-fit(X)
-

Fits the transformer, i.e., decides on the vocabulary, given a list of strings.

-
-
Parameters:
-

X – a list of strings

-
-
Returns:
-

self

-
-
-
- -
-
-fit_transform(X, n_jobs=None)
-

Fits the transform on X and transforms it.

-
-
Parameters:
-
    -
  • X – a list of strings

  • -
  • n_jobs – the number of parallel workers to carry out this task

  • -
-
-
Returns:
-

a np.ndarray of numerical ids

-
-
-
- -
-
-transform(X, n_jobs=None)
-

Transforms the strings in X as lists of numerical ids

-
-
Parameters:
-
    -
  • X – a list of strings

  • -
  • n_jobs – the number of parallel workers to carry out this task

  • -
-
-
Returns:
-

a np.ndarray of numerical ids

-
-
-
- -
-
-vocabulary_size()
-

Gets the length of the vocabulary according to which the document tokens have been indexed

-
-
Returns:
-

integer

-
-
-
- -
- -
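A minimal sketch (min_df=1 is passed through to CountVectorizer):

>>> from quapy.data.preprocessing import IndexTransformer
>>> texts = ['a first document', 'a second document', 'yet another document']
>>> indexer = IndexTransformer(min_df=1)
>>> indexed = indexer.fit_transform(texts)  # lists of numerical token ids
>>> indexer.vocabulary_size()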
-
-quapy.data.preprocessing.index(dataset: Dataset, min_df=5, inplace=False, **kwargs)
-

Indexes the tokens of a textual quapy.data.base.Dataset of string documents. -To index a document means to replace each different token by a unique numerical index. -Rare words (i.e., words occurring less than min_df times) are replaced by a special token UNK

-
-
Parameters:
-
    -
  • dataset – a quapy.data.base.Dataset object where the instances of training and test documents -are lists of str

  • -
  • min_df – minimum number of occurrences below which the term is replaced by a UNK index

  • -
  • inplace – whether or not to apply the transformation inplace (True), or to a new copy (False, default)

  • -
  • kwargs – the rest of parameters of the transformation (as for sklearn’s -CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>_)

  • -
-
-
Returns:
-

a new quapy.data.base.Dataset (if inplace=False) or a reference to the current -quapy.data.base.Dataset (inplace=True) consisting of lists of integer values representing indices.

-
-
-
- -
-
-quapy.data.preprocessing.reduce_columns(dataset: Dataset, min_df=5, inplace=False)
-

Reduces the dimensionality of the instances, represented as a csr_matrix (or any subtype of -scipy.sparse.spmatrix), of training and test documents by removing the columns of words which are not present -in at least min_df instances in the training set

-
-
Parameters:
-
    -
  • dataset – a quapy.data.base.Dataset in which instances are represented in sparse format (any -subtype of scipy.sparse.spmatrix)

  • -
  • min_df – integer, minimum number of instances below which the columns are removed

  • -
  • inplace – whether or not to apply the transformation inplace (True), or to a new copy (False, default)

  • -
-
-
Returns:
-

a new quapy.data.base.Dataset (if inplace=False) or a reference to the current -quapy.data.base.Dataset (inplace=True) where the dimensions corresponding to infrequent terms -in the training set have been removed

-
-
-
- -
-
-quapy.data.preprocessing.standardize(dataset: Dataset, inplace=False)
-

Standardizes the real-valued columns of a quapy.data.base.Dataset. -Standardization, aka z-scoring, of a variable X comes down to subtracting the average and normalizing by the -standard deviation.

-
-
Parameters:
-
  • dataset – a quapy.data.base.Dataset object with real-valued instances
  • inplace – whether or not to apply the transformation inplace (True), or to a new copy (False, default)
-
Returns:
-

an instance of quapy.data.base.Dataset

-
-
-
- -
-
-quapy.data.preprocessing.text2tfidf(dataset: Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs)
-

Transforms a quapy.data.base.Dataset of textual instances into a quapy.data.base.Dataset of -tfidf weighted sparse vectors

-
-
Parameters:
-
    -
  • dataset – a quapy.data.base.Dataset where the instances of training and test collections are -lists of str

  • -
  • min_df – minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)

  • -
  • sublinear_tf – whether or not to apply the log scaling to the tf counters (default True)

  • -
  • inplace – whether or not to apply the transformation inplace (True), or to a new copy (False, default)

  • -
  • kwargs – the rest of parameters of the transformation (as for sklearn’s -TfidfVectorizer)

  • -
-
-
Returns:
-

a new quapy.data.base.Dataset in csr_matrix format (if inplace=False) or a reference to the -current Dataset (if inplace=True) where the instances are stored in a csr_matrix of real-valued tfidf scores

-
-
-
- -
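For example (a sketch; fetch_reviews() downloads the raw documents on first use):

>>> import quapy as qp
>>> from quapy.data.preprocessing import text2tfidf
>>> data = qp.datasets.fetch_reviews('kindle')  # raw text instances
>>> data = text2tfidf(data, min_df=5, sublinear_tf=True)
>>> data.training.instances  # now a csr_matrix of tfidf scores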
-
-

quapy.data.reader

-
-
-quapy.data.reader.binarize(y, pos_class)
-

Binarizes a categorical array-like collection of labels towards the positive class pos_class. E.g.,:

-
>>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)
->>> array([0, 1, 0, 0, 0, 0])
-
-
-
-
Parameters:
-
    -
  • y – array-like of labels

  • -
  • pos_class – integer, the positive class

  • -
-
-
Returns:
-

a binary np.ndarray, in which value 1 corresponds to positions in which y had the pos_class label, and 0 otherwise

-
-
-
- -
-
-quapy.data.reader.from_csv(path, encoding='utf-8')
-

Reads a csv file in which columns are separated by ‘,’. -File format <label>,<feat1>,<feat2>,…,<featn>

-
-
Parameters:
-
    -
  • path – path to the csv file

  • -
  • encoding – the text encoding used to open the file

  • -
-
-
Returns:
-

a np.ndarray for the labels and a ndarray (float) for the covariates

-
-
-
- -
-
-quapy.data.reader.from_sparse(path)
-

Reads a labelled collection of real-valued instances expressed in sparse format. File format: <-1 or 0 or 1> followed by whitespace-separated col(int):val(float) pairs

-
-
Parameters:
-

path – path to the labelled collection

-
-
Returns:
-

a csr_matrix containing the instances (rows), and a ndarray containing the labels

-
-
-
- -
-
-quapy.data.reader.from_text(path, encoding='utf-8', verbose=1, class2int=True)
-

Reads a labelled collection of documents. File format: <0 or 1> <document>

-
-
Parameters:
-
    -
  • path – path to the labelled collection

  • -
  • encoding – the text encoding used to open the file

  • -
  • verbose – if >0 (default) shows some progress information in standard output

  • -
-
-
Returns:
-

a list of sentences, and a list of labels

-
-
-
- -
-
-quapy.data.reader.reindex_labels(y)
-

Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes. -E.g.:

-
>>> reindex_labels(['B', 'B', 'A', 'C'])
->>> (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1'))
-
-
-
-
Parameters:
-

y – the list or array of original labels

-
-
Returns:
-

a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.

-
-
-
- -
-
-

Module contents

-
-
\ No newline at end of file
diff --git a/docs/build/html/quapy.html b/docs/build/html/quapy.html
deleted file mode 100644
index a40c323..0000000
--- a/docs/build/html/quapy.html
+++ /dev/null
@@ -1,1951 +0,0 @@
- quapy package — QuaPy 0.1.7 documentation

quapy package

-
-

Submodules

-
-
-

quapy.error

-

Implementation of error measures used for quantification

-
-
-quapy.error.absolute_error(prevs, prevs_hat)
-
-
Computes the absolute error between the two prevalence vectors.

Absolute error between two prevalence vectors \(p\) and \(\hat{p}\) is computed as -\(AE(p,\hat{p})=\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}|\hat{p}(y)-p(y)|\), -where \(\mathcal{Y}\) are the classes of interest.

-
-
-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_classes,) with the true prevalence values

  • -
  • prevs_hat – array-like of shape (n_classes,) with the predicted prevalence values

  • -
-
-
Returns:
-

absolute error

-
-
-
- -
-
-quapy.error.acc_error(y_true, y_pred)
-

Computes the error in terms of 1-accuracy. The accuracy is computed as -\(\frac{tp+tn}{tp+fp+fn+tn}\), with tp, fp, fn, and tn standing -for true positives, false positives, false negatives, and true negatives, -respectively

-
-
Parameters:
-
    -
  • y_true – array-like of true labels

  • -
  • y_pred – array-like of predicted labels

  • -
-
-
Returns:
-

1-accuracy

-
-
-
- -
-
-quapy.error.acce(y_true, y_pred)
-

Computes the error in terms of 1-accuracy. The accuracy is computed as -\(\frac{tp+tn}{tp+fp+fn+tn}\), with tp, fp, fn, and tn standing -for true positives, false positives, false negatives, and true negatives, -respectively

-
-
Parameters:
-
    -
  • y_true – array-like of true labels

  • -
  • y_pred – array-like of predicted labels

  • -
-
-
Returns:
-

1-accuracy

-
-
-
- -
-
-quapy.error.ae(prevs, prevs_hat)
-
-
Computes the absolute error between the two prevalence vectors.

Absolute error between two prevalence vectors \(p\) and \(\hat{p}\) is computed as -\(AE(p,\hat{p})=\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}|\hat{p}(y)-p(y)|\), -where \(\mathcal{Y}\) are the classes of interest.

-
-
-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_classes,) with the true prevalence values

  • -
  • prevs_hat – array-like of shape (n_classes,) with the predicted prevalence values

  • -
-
-
Returns:
-

absolute error

-
-
-
- -
-
-quapy.error.f1_error(y_true, y_pred)
-

F1 error: simply computes the error in terms of macro \(F_1\), i.e., -\(1-F_1^M\), where \(F_1\) is the harmonic mean of precision and recall, -defined as \(\frac{2tp}{2tp+fp+fn}\), with tp, fp, and fn standing -for true positives, false positives, and false negatives, respectively. -Macro averaging means the \(F_1\) is computed for each category independently, -and then averaged.

-
-
Parameters:
-
    -
  • y_true – array-like of true labels

  • -
  • y_pred – array-like of predicted labels

  • -
-
-
Returns:
-

\(1-F_1^M\)

-
-
-
- -
-
-quapy.error.f1e(y_true, y_pred)
-

F1 error: simply computes the error in terms of macro \(F_1\), i.e., -\(1-F_1^M\), where \(F_1\) is the harmonic mean of precision and recall, -defined as \(\frac{2tp}{2tp+fp+fn}\), with tp, fp, and fn standing -for true positives, false positives, and false negatives, respectively. -Macro averaging means the \(F_1\) is computed for each category independently, -and then averaged.

-
-
Parameters:
-
    -
  • y_true – array-like of true labels

  • -
  • y_pred – array-like of predicted labels

  • -
-
-
Returns:
-

\(1-F_1^M\)

-
-
-
- -
-
-quapy.error.from_name(err_name)
-

Gets an error function from its name. E.g., from_name(“mae”) -will return function quapy.error.mae()

-
-
Parameters:
-

err_name – string, the error name

-
-
Returns:
-

a callable implementing the requested error

-
-
-
- -
-
-quapy.error.kld(prevs, prevs_hat, eps=None)
-
-
Computes the Kullback-Leibler divergence between the two prevalence distributions.

Kullback-Leibler divergence between two prevalence distributions \(p\) and \(\hat{p}\) -is computed as -\(KLD(p,\hat{p})=D_{KL}(p||\hat{p})= -\sum_{y\in \mathcal{Y}} p(y)\log\frac{p(y)}{\hat{p}(y)}\), -where \(\mathcal{Y}\) are the classes of interest. -The distributions are smoothed using the eps factor (see quapy.error.smooth()).

-
-
-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_classes,) with the true prevalence values

  • -
  • prevs_hat – array-like of shape (n_classes,) with the predicted prevalence values

  • -
  • eps – smoothing factor. KLD is not defined in cases in which the distributions contain -zeros; eps is typically set to be \(\frac{1}{2T}\), with \(T\) the sample size. -If eps=None, the sample size will be taken from the environment variable SAMPLE_SIZE -(which has thus to be set beforehand).

  • -
-
-
Returns:
-

Kullback-Leibler divergence between the two distributions

-
-
-
- -
-
-quapy.error.mae(prevs, prevs_hat)
-

Computes the mean absolute error (see quapy.error.ae()) across the sample pairs.

-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_samples, n_classes,) with the true prevalence values

  • -
  • prevs_hat – array-like of shape (n_samples, n_classes,) with the predicted -prevalence values

  • -
-
-
Returns:
-

mean absolute error

-
-
-
- -
-
-quapy.error.mean_absolute_error(prevs, prevs_hat)
-

Computes the mean absolute error (see quapy.error.ae()) across the sample pairs.

-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_samples, n_classes,) with the true prevalence values

  • -
  • prevs_hat – array-like of shape (n_samples, n_classes,) with the predicted -prevalence values

  • -
-
-
Returns:
-

mean absolute error

-
-
-
- -
-
-quapy.error.mean_relative_absolute_error(prevs, prevs_hat, eps=None)
-

Computes the mean relative absolute error (see quapy.error.rae()) across -the sample pairs. The distributions are smoothed using the eps factor (see -quapy.error.smooth()).

-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_samples, n_classes,) with the true -prevalence values

  • -
  • prevs_hat – array-like of shape (n_samples, n_classes,) with the predicted -prevalence values

  • -
  • eps – smoothing factor. mrae is not defined in cases in which the true -distribution contains zeros; eps is typically set to be \(\frac{1}{2T}\), -with \(T\) the sample size. If eps=None, the sample size will be taken from -the environment variable SAMPLE_SIZE (which has thus to be set beforehand).

  • -
-
-
Returns:
-

mean relative absolute error

-
-
-
- -
-
-quapy.error.mkld(prevs, prevs_hat, eps=None)
-

Computes the mean Kullback-Leibler divergence (see quapy.error.kld()) across the -sample pairs. The distributions are smoothed using the eps factor -(see quapy.error.smooth()).

-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_samples, n_classes,) with the true -prevalence values

  • -
  • prevs_hat – array-like of shape (n_samples, n_classes,) with the predicted -prevalence values

  • -
  • eps – smoothing factor. KLD is not defined in cases in which the distributions contain -zeros; eps is typically set to be \(\frac{1}{2T}\), with \(T\) the sample size. -If eps=None, the sample size will be taken from the environment variable SAMPLE_SIZE -(which has thus to be set beforehand).

  • -
-
-
Returns:
-

mean Kullback-Leibler distribution

-
-
-
- -
-
-quapy.error.mnkld(prevs, prevs_hat, eps=None)
-

Computes the mean Normalized Kullback-Leibler divergence (see quapy.error.nkld()) -across the sample pairs. The distributions are smoothed using the eps factor -(see quapy.error.smooth()).

-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_samples, n_classes,) with the true prevalence values

  • -
  • prevs_hat – array-like of shape (n_samples, n_classes,) with the predicted -prevalence values

  • -
  • eps – smoothing factor. NKLD is not defined in cases in which the distributions contain -zeros; eps is typically set to be \(\frac{1}{2T}\), with \(T\) the sample size. -If eps=None, the sample size will be taken from the environment variable SAMPLE_SIZE -(which has thus to be set beforehand).

  • -
-
-
Returns:
-

mean Normalized Kullback-Leibler divergence

-
-
-
- -
-
-quapy.error.mrae(prevs, prevs_hat, eps=None)
-

Computes the mean relative absolute error (see quapy.error.rae()) across -the sample pairs. The distributions are smoothed using the eps factor (see -quapy.error.smooth()).

-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_samples, n_classes,) with the true -prevalence values

  • -
  • prevs_hat – array-like of shape (n_samples, n_classes,) with the predicted -prevalence values

  • -
  • eps – smoothing factor. mrae is not defined in cases in which the true -distribution contains zeros; eps is typically set to be \(\frac{1}{2T}\), -with \(T\) the sample size. If eps=None, the sample size will be taken from -the environment variable SAMPLE_SIZE (which has thus to be set beforehand).

  • -
-
-
Returns:
-

mean relative absolute error

-
-
-
- -
-
-quapy.error.mse(prevs, prevs_hat)
-

Computes the mean squared error (see quapy.error.se()) across the sample pairs.

-
-
Parameters:
-
    -
  • prevs – array-like of shape (n_samples, n_classes,) with the -true prevalence values

  • -
  • prevs_hat – array-like of shape (n_samples, n_classes,) with the -predicted prevalence values

  • -
-
-
Returns:
-

mean squared error

-
-
-
- -
-
quapy.error.nkld(prevs, prevs_hat, eps=None)

Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions.

The Normalized Kullback-Leibler divergence between two prevalence distributions \(p\) and \(\hat{p}\) is computed as

\[NKLD(p,\hat{p}) = 2\frac{e^{KLD(p,\hat{p})}}{e^{KLD(p,\hat{p})}+1}-1,\]

where \(\mathcal{Y}\) are the classes of interest. The distributions are smoothed using the eps factor (see quapy.error.smooth()).

Parameters:
  • prevs – array-like of shape (n_classes,) with the true prevalence values
  • prevs_hat – array-like of shape (n_classes,) with the predicted prevalence values
  • eps – smoothing factor. NKLD is not defined in cases in which the distributions contain zeros; eps is typically set to be \(\frac{1}{2T}\), with \(T\) the sample size. If eps=None, the sample size will be taken from the environment variable SAMPLE_SIZE (which has thus to be set beforehand).

Returns:
  Normalized Kullback-Leibler divergence between the two distributions

quapy.error.rae(prevs, prevs_hat, eps=None)

Computes the relative absolute error between the two prevalence vectors.

The relative absolute error between two prevalence vectors \(p\) and \(\hat{p}\) is computed as \(RAE(p,\hat{p})=\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}\frac{|\hat{p}(y)-p(y)|}{p(y)}\), where \(\mathcal{Y}\) are the classes of interest. The distributions are smoothed using the eps factor (see quapy.error.smooth()).

Parameters:
  • prevs – array-like of shape (n_classes,) with the true prevalence values
  • prevs_hat – array-like of shape (n_classes,) with the predicted prevalence values
  • eps – smoothing factor. rae is not defined in cases in which the true distribution contains zeros; eps is typically set to be \(\frac{1}{2T}\), with \(T\) the sample size. If eps=None, the sample size will be taken from the environment variable SAMPLE_SIZE (which has thus to be set beforehand).

Returns:
  relative absolute error

quapy.error.relative_absolute_error(prevs, prevs_hat, eps=None)

Computes the relative absolute error between the two prevalence vectors.

The relative absolute error between two prevalence vectors \(p\) and \(\hat{p}\) is computed as \(RAE(p,\hat{p})=\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}\frac{|\hat{p}(y)-p(y)|}{p(y)}\), where \(\mathcal{Y}\) are the classes of interest. The distributions are smoothed using the eps factor (see quapy.error.smooth()).

Parameters:
  • prevs – array-like of shape (n_classes,) with the true prevalence values
  • prevs_hat – array-like of shape (n_classes,) with the predicted prevalence values
  • eps – smoothing factor. rae is not defined in cases in which the true distribution contains zeros; eps is typically set to be \(\frac{1}{2T}\), with \(T\) the sample size. If eps=None, the sample size will be taken from the environment variable SAMPLE_SIZE (which has thus to be set beforehand).

Returns:
  relative absolute error

quapy.error.se(prevs, prevs_hat)

Computes the squared error between the two prevalence vectors.

The squared error between two prevalence vectors \(p\) and \(\hat{p}\) is computed as \(SE(p,\hat{p})=\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}(\hat{p}(y)-p(y))^2\), where \(\mathcal{Y}\) are the classes of interest.

Parameters:
  • prevs – array-like of shape (n_classes,) with the true prevalence values
  • prevs_hat – array-like of shape (n_classes,) with the predicted prevalence values

Returns:
  squared error
-
quapy.error.smooth(prevs, eps)

Smooths a prevalence distribution with \(\epsilon\) (eps) as: \(\underline{p}(y)=\frac{\epsilon+p(y)}{\epsilon|\mathcal{Y}|+\displaystyle\sum_{y\in \mathcal{Y}}p(y)}\)

Parameters:
  • prevs – array-like of shape (n_classes,) with the true prevalence values
  • eps – smoothing factor

Returns:
  array-like of shape (n_classes,) with the smoothed distribution
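For illustration, a minimal sketch (assuming a sample of size T=100) that smooths a degenerate prevalence vector and checks the result against a direct transcription of the formula above:

>>> import numpy as np
>>> from quapy.error import smooth
>>> prevs = np.array([0., 1.])     # contains a zero; problematic for rae or kld
>>> eps = 1. / (2 * 100)           # the 1/(2T) heuristic with T=100
>>> smoothed = smooth(prevs, eps)
>>> manual = (eps + prevs) / (eps * len(prevs) + prevs.sum())
>>> np.isclose(smoothed, manual).all()   # expected: True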

quapy.evaluation

quapy.evaluation.evaluate(model: BaseQuantifier, protocol: AbstractProtocol, error_metric: Union[str, Callable], aggr_speedup: Union[str, bool] = 'auto', verbose=False)

Evaluates a quantification model according to a specific sample generation protocol and in terms of one evaluation metric (error).

Parameters:
  • model – a quantifier, instance of quapy.method.base.BaseQuantifier
  • protocol – quapy.protocol.AbstractProtocol; if this object is also an instance of quapy.protocol.OnLabelledCollectionProtocol, then the aggregation speed-up can be run. This is the protocol in charge of generating the samples on which the model is evaluated.
  • error_metric – a string representing the name of an error function in qp.error (e.g., 'mae'), or a callable function implementing the error function itself
  • aggr_speedup – whether or not to apply the speed-up. Set to "force" for applying it even if the number of instances in the original collection on which the protocol acts is larger than the number of instances in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is convenient or not. Set to False to deactivate.
  • verbose – boolean, whether or not to show information in stdout

Returns:
  if the error metric is not averaged (e.g., 'ae', 'rae'), returns an array of shape (n_samples,) with the error scores for each sample; if the error metric is averaged (e.g., 'mae', 'mrae') then returns a single float
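A usage sketch (assuming train and test are pre-existing quapy.data.base.LabelledCollection objects, and following the fit(data) interface documented in this reference):

>>> import quapy as qp
>>> from quapy.method.aggregative import CC
>>> from sklearn.linear_model import LogisticRegression
>>> model = CC(LogisticRegression()).fit(train)
>>> protocol = qp.protocol.APP(test, sample_size=100)
>>> mae = qp.evaluation.evaluate(model, protocol=protocol, error_metric='mae')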
quapy.evaluation.evaluate_on_samples(model: BaseQuantifier, samples: Iterable[LabelledCollection], error_metric: Union[str, Callable], verbose=False)

Evaluates a quantification model on a given set of samples and in terms of one evaluation metric (error).

Parameters:
  • model – a quantifier, instance of quapy.method.base.BaseQuantifier
  • samples – a list of samples on which the quantifier is to be evaluated
  • error_metric – a string representing the name of an error function in qp.error (e.g., 'mae'), or a callable function implementing the error function itself
  • verbose – boolean, whether or not to show information in stdout

Returns:
  if the error metric is not averaged (e.g., 'ae', 'rae'), returns an array of shape (n_samples,) with the error scores for each sample; if the error metric is averaged (e.g., 'mae', 'mrae') then returns a single float

quapy.evaluation.evaluation_report(model: BaseQuantifier, protocol: AbstractProtocol, error_metrics: Iterable[Union[str, Callable]] = 'mae', aggr_speedup: Union[str, bool] = 'auto', verbose=False)

Generates a report (a pandas DataFrame) containing information on the evaluation of the model according to a specific protocol and in terms of one or more evaluation metrics (errors).

Parameters:
  • model – a quantifier, instance of quapy.method.base.BaseQuantifier
  • protocol – quapy.protocol.AbstractProtocol; if this object is also an instance of quapy.protocol.OnLabelledCollectionProtocol, then the aggregation speed-up can be run. This is the protocol in charge of generating the samples on which the model is evaluated.
  • error_metrics – a string, or list of strings, representing the name(s) of an error function in qp.error (e.g., 'mae', the default value), or a callable function, or a list of callable functions, implementing the error function itself
  • aggr_speedup – whether or not to apply the speed-up. Set to "force" for applying it even if the number of instances in the original collection on which the protocol acts is larger than the number of instances in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is convenient or not. Set to False to deactivate.
  • verbose – boolean, whether or not to show information in stdout

Returns:
  a pandas DataFrame containing the columns 'true-prev' (the true prevalence of each sample), 'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics have been indicated, each displaying the score in terms of that metric for every sample

quapy.evaluation.prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup: Union[str, bool] = 'auto', verbose=False)

Uses a quantification model to generate predictions for the samples generated via a specific protocol. This function is central to all evaluation processes, and is endowed with an optimization to speed up the prediction for protocols that generate samples from a large collection. The optimization applies to aggregative quantifiers only, and to OnLabelledCollectionProtocol protocols, and comes down to generating the classification predictions once and for all, and then generating samples over the classification predictions (instead of over the raw instances), so that the classifier is never called again. This behaviour is obtained by setting aggr_speedup to 'auto' or True, and is only carried out if the overall process is convenient in terms of computations (e.g., if the number of classification predictions needed for the original collection exceeds the number of classification predictions needed for all samples, then the optimization is not undertaken).

Parameters:
  • model – a quantifier, instance of quapy.method.base.BaseQuantifier
  • protocol – quapy.protocol.AbstractProtocol; if this object is also an instance of quapy.protocol.OnLabelledCollectionProtocol, then the aggregation speed-up can be run. This is the protocol in charge of generating the samples for which the model has to issue class prevalence predictions.
  • aggr_speedup – whether or not to apply the speed-up. Set to "force" for applying it even if the number of instances in the original collection on which the protocol acts is larger than the number of instances in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is convenient or not. Set to False to deactivate.
  • verbose – boolean, whether or not to show information in stdout

Returns:
  a tuple (true_prevs, estim_prevs) in which each element in the tuple is an array of shape (n_samples, n_classes) containing the true, or predicted, prevalence values for each sample

quapy.protocol

New in version 0.1.7.

class quapy.protocol.APP(data: LabelledCollection, sample_size=None, n_prevalences=21, repeats=10, smooth_limits_epsilon=0, random_state=0, sanity_check=10000, return_type='sample_prev')

Bases: AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol

Implementation of the artificial prevalence protocol (APP). The APP consists of exploring a grid of prevalence values containing n_prevalences points (e.g., [0, 0.05, 0.1, 0.15, ..., 1], if n_prevalences=21), and generating all valid combinations of prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ..., [1, 0, 0] prevalence values of size sample_size will be yielded). The number of samples for each valid combination of prevalence values is indicated by repeats. See the usage sketch after this class for an example.

Parameters:
  • data – a LabelledCollection from which the samples will be drawn
  • sample_size – integer, number of instances in each sample; if None (default) then it is taken from qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised.
  • n_prevalences – the number of equidistant prevalence points to extract from the [0,1] interval for the grid (default is 21)
  • repeats – number of copies for each valid prevalence vector (default is 10)
  • smooth_limits_epsilon – the quantity to add and subtract to the limits 0 and 1
  • random_state – allows replicating samples across runs (default 0, meaning that the sequence of samples will be the same every time the protocol is called)
  • sanity_check – int; raises an exception warning the user that the number of samples to be generated exceeds this number; set to None to skip this check
  • return_type – set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or to "labelled_collection" to get instead instances of LabelledCollection

prevalence_grid()

Generates vectors of prevalence values from an exhaustive grid of prevalence values. The number of prevalence values explored for each dimension depends on n_prevalences, so that, if, for example, n_prevalences=11, then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each valid vector of prevalence values, repeats copies are returned. The vector of prevalence values can be implicit (by setting return_constrained_dim=False), meaning that the last dimension (which is constrained to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1). Note that this method is deterministic, i.e., there is no random sampling anywhere.

Returns:
  a np.ndarray of shape (n, dimensions) if return_constrained_dim=True, or of shape (n, dimensions-1) if return_constrained_dim=False, where n is the number of valid combinations found in the grid multiplied by repeats

sample(index)

Realizes the sample given the index of the instances.

Parameters:
  index – indexes of the instances to select

Returns:
  an instance of qp.data.LabelledCollection

samples_parameters()

Returns all the necessary parameters to replicate the samples according to the APP protocol.

Returns:
  a list of indexes that realize the APP sampling

total()

Returns the number of samples that will be generated.

Returns:
  int
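A usage sketch (assuming data is a pre-existing quapy.data.base.LabelledCollection; the numbers are illustrative):

>>> from quapy.protocol import APP
>>> prot = APP(data, sample_size=100, n_prevalences=11, repeats=1, random_state=0)
>>> prot.total()                    # number of samples the protocol will generate
>>> grid = prot.prevalence_grid()   # the exhaustive grid of valid prevalence vectors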
class quapy.protocol.AbstractProtocol

Bases: object

Abstract parent class for sample generation protocols.

total()

Indicates the total number of samples that the protocol generates.

Returns:
  the number of samples to generate if known, or None otherwise

class quapy.protocol.AbstractStochasticSeededProtocol(random_state=0)

Bases: AbstractProtocol

An AbstractStochasticSeededProtocol is a protocol that generates, via any random procedure (e.g., via random sampling), sequences of quapy.data.base.LabelledCollection samples. The protocol abstraction enforces the object to be instantiated using a seed, so that the sequence can be fully replicated. In order to make this functionality possible, the classes extending this abstraction need to implement only two functions: samples_parameters(), which generates all the parameters needed for extracting the samples, and sample(), which, given some parameters as input, deterministically generates a sample.

Parameters:
  random_state – the seed for allowing to replicate any sequence of samples. Default is 0, meaning that the sequence will be consistent every time the protocol is called.

collator(sample, *args)

The collator prepares the sample to accommodate the desired output format before returning the output. This collator simply returns the sample as it is. Classes inheriting from this abstract class can implement their custom collators.

Parameters:
  • sample – the sample to be returned
  • args – additional arguments

Returns:
  the sample adhering to a desired output format (in this case, the sample is returned as it is)

property random_state

abstract sample(params)

Extracts one sample determined by the given parameters.

Parameters:
  params – all the necessary parameters to generate a sample

Returns:
  one sample (the same sample has to be generated for the same parameters)

abstract samples_parameters()

This function has to return all the necessary parameters to replicate the samples.

Returns:
  a list of parameters, each of which serves to deterministically generate a sample

quapy.protocol.ArtificialPrevalenceProtocol

alias of APP
class quapy.protocol.DomainMixer(domainA: LabelledCollection, domainB: LabelledCollection, sample_size, repeats=1, prevalence=None, mixture_points=11, random_state=0, return_type='sample_prev')

Bases: AbstractStochasticSeededProtocol

Generates mixtures of two domains (A and B) at controlled rates, while preserving the original class prevalence.

Parameters:
  • domainA – one domain, an object of qp.data.LabelledCollection
  • domainB – another domain, an object of qp.data.LabelledCollection
  • sample_size – integer, the number of instances in each sample; if None (default) then it is taken from qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised.
  • repeats – int, number of samples to draw for every mixture rate
  • prevalence – the prevalence to preserve along the mixtures. If specified, should be an array containing one prevalence value (positive float) for each class and summing up to one. If not specified, the prevalence will be taken from domain A (default).
  • mixture_points – an integer indicating the number of points to take from a linear scale (e.g., 21 will generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself
  • random_state – allows replicating samples across runs (default 0, meaning that the sequence of samples will be the same every time the protocol is called)

sample(indexes)

Realizes the sample given a pair of indexes of the instances from A and B.

Parameters:
  indexes – indexes of the instances to select from A and B

Returns:
  an instance of qp.data.LabelledCollection

samples_parameters()

Returns all the necessary parameters to replicate the samples according to this protocol.

Returns:
  a list of zipped indexes (from A and B) that realize the sampling

total()

Returns the number of samples that will be generated (equal to repeats * mixture_points).

Returns:
  int
class quapy.protocol.IterateProtocol(samples: [LabelledCollection])

Bases: AbstractProtocol

A very simple protocol which simply iterates over a list of previously generated samples.

Parameters:
  samples – a list of quapy.data.base.LabelledCollection

total()

Returns the number of samples in this protocol.

Returns:
  int
class quapy.protocol.NPP(data: LabelledCollection, sample_size=None, repeats=100, random_state=0, return_type='sample_prev')

Bases: AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol

A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.

Parameters:
  • data – a LabelledCollection from which the samples will be drawn
  • sample_size – integer, the number of instances in each sample; if None (default) then it is taken from qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised.
  • repeats – the number of samples to generate. Default is 100.
  • random_state – allows replicating samples across runs (default 0, meaning that the sequence of samples will be the same every time the protocol is called)
  • return_type – set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or to "labelled_collection" to get instead instances of LabelledCollection

sample(index)

Realizes the sample given the index of the instances.

Parameters:
  index – indexes of the instances to select

Returns:
  an instance of qp.data.LabelledCollection

samples_parameters()

Returns all the necessary parameters to replicate the samples according to the NPP protocol.

Returns:
  a list of indexes that realize the NPP sampling

total()

Returns the number of samples that will be generated (equal to repeats).

Returns:
  int

quapy.protocol.NaturalPrevalenceProtocol

alias of NPP
class quapy.protocol.OnLabelledCollectionProtocol

Bases: object

Protocols that generate samples from a qp.data.LabelledCollection object.

RETURN_TYPES = ['sample_prev', 'labelled_collection', 'index']

classmethod get_collator(return_type='sample_prev')

Returns a collator function, i.e., a function that prepares the yielded data.

Parameters:
  return_type – either 'sample_prev' (default) if the collator is requested to yield tuples of (sample, prevalence), or 'labelled_collection' when it is requested to yield instances of qp.data.LabelledCollection

Returns:
  the collator function (a callable that takes as input an instance of qp.data.LabelledCollection)

get_labelled_collection()

Returns the labelled collection on which this protocol acts.

Returns:
  an object of type qp.data.LabelledCollection

on_preclassified_instances(pre_classifications, in_place=False)

Returns a copy of this protocol that acts on a modified version of the original qp.data.LabelledCollection in which the original instances have been replaced with the outputs of a classifier for each instance. (This is convenient for speeding up the evaluation procedures for many samples, by pre-classifying the instances in advance.)

Parameters:
  • pre_classifications – the predictions issued by a classifier, typically an array-like with shape (n_instances,) when the classifier is a hard one, or with shape (n_instances, n_classes) when the classifier is a probabilistic one
  • in_place – whether to apply the modification in-place or in a new copy (default)

Returns:
  a copy of this protocol
class quapy.protocol.UPP(data: LabelledCollection, sample_size=None, repeats=100, random_state=0, return_type='sample_prev')

Bases: AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol

A variant of APP that, instead of using a grid of equidistant prevalence values, relies on the Kraemer algorithm for sampling the unit (k-1)-simplex uniformly at random, with k the number of classes. This protocol covers the entire range of prevalence values in a statistical sense, i.e., unlike APP there is no guarantee that it is covered precisely equally for all classes, but it is preferred in cases in which the number of possible combinations of the grid values of APP makes this endeavour intractable. See the usage sketch after this class for an example.

Parameters:
  • data – a LabelledCollection from which the samples will be drawn
  • sample_size – integer, the number of instances in each sample; if None (default) then it is taken from qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised.
  • repeats – the number of samples to generate. Default is 100.
  • random_state – allows replicating samples across runs (default 0, meaning that the sequence of samples will be the same every time the protocol is called)
  • return_type – set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or to "labelled_collection" to get instead instances of LabelledCollection

sample(index)

Realizes the sample given the index of the instances.

Parameters:
  index – indexes of the instances to select

Returns:
  an instance of qp.data.LabelledCollection

samples_parameters()

Returns all the necessary parameters to replicate the samples according to the UPP protocol.

Returns:
  a list of indexes that realize the UPP sampling

total()

Returns the number of samples that will be generated (equal to repeats).

Returns:
  int
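A usage sketch (assuming model is an already-fit quantifier and test is a pre-existing quapy.data.base.LabelledCollection):

>>> import quapy as qp
>>> from quapy.protocol import UPP
>>> prot = UPP(test, sample_size=100, repeats=500, random_state=0)
>>> report = qp.evaluation.evaluation_report(model, protocol=prot, error_metrics=['mae', 'mrae'])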
quapy.protocol.UniformPrevalenceProtocol

alias of UPP

quapy.functional

quapy.functional.HellingerDistance(P, Q)

Computes the Hellinger Distance (HD) between (discretized) distributions P and Q. The HD for two discrete distributions of k bins is defined as:

\[HD(P,Q) = \frac{ 1 }{ \sqrt{ 2 } } \sqrt{ \sum_{i=1}^k ( \sqrt{p_i} - \sqrt{q_i} )^2 }\]

Parameters:
  • P – real-valued array-like of shape (k,) representing a discrete distribution
  • Q – real-valued array-like of shape (k,) representing a discrete distribution

Returns:
  float
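The following minimal check (illustrative values) transcribes the formula directly in NumPy and compares it against the library function:

>>> import numpy as np
>>> from quapy.functional import HellingerDistance
>>> P, Q = np.array([.1, .6, .3]), np.array([.3, .3, .4])
>>> hd = np.sqrt(((np.sqrt(P) - np.sqrt(Q)) ** 2).sum()) / np.sqrt(2)
>>> np.isclose(hd, HellingerDistance(P, Q))   # expected: True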
quapy.functional.TopsoeDistance(P, Q, epsilon=1e-20)

Computes the Topsoe distance between two (discretized) distributions P and Q. The Topsoe distance for two discrete distributions of k bins is defined as:

\[Topsoe(P,Q) = \sum_{i=1}^k \left( p_i \log\left(\frac{ 2 p_i + \epsilon }{ p_i+q_i+\epsilon }\right) + q_i \log\left(\frac{ 2 q_i + \epsilon }{ p_i+q_i+\epsilon }\right) \right)\]

Parameters:
  • P – real-valued array-like of shape (k,) representing a discrete distribution
  • Q – real-valued array-like of shape (k,) representing a discrete distribution

Returns:
  float
quapy.functional.adjusted_quantification(prevalence_estim, tpr, fpr, clip=True)

Implements the adjustment of ACC and PACC for the binary case. The adjustment for a prevalence estimate of the positive class p comes down to computing:

\[ACC(p) = \frac{ p - fpr }{ tpr - fpr }\]

Parameters:
  • prevalence_estim – float, the estimated value for the positive class
  • tpr – float, the true positive rate of the classifier
  • fpr – float, the false positive rate of the classifier
  • clip – set to True (default) to clip values that might exceed the range [0,1]

Returns:
  float, the adjusted count
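Worked example (values chosen for illustration): for an estimated positive prevalence p=0.6 issued by a classifier with tpr=0.9 and fpr=0.2, the adjusted count is (0.6-0.2)/(0.9-0.2) ≈ 0.571:

>>> from quapy.functional import adjusted_quantification
>>> adjusted_quantification(0.6, tpr=0.9, fpr=0.2)   # approx. 0.5714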
quapy.functional.check_prevalence_vector(p, raise_exception=False, toleranze=1e-08)

Checks that p is a valid prevalence vector, i.e., that it contains values in [0,1] and that the values sum up to 1.

Parameters:
  p – the prevalence vector to check

Returns:
  True if p is valid, False otherwise
quapy.functional.get_nprevpoints_approximation(combinations_budget: int, n_classes: int, n_repeats: int = 1)

Searches for the largest number of (equidistant) prevalence points to define for each of the n_classes classes so that the number of valid prevalence values generated as combinations of prevalence points (points in an n_classes-dimensional simplex) does not exceed combinations_budget.

Parameters:
  • combinations_budget – integer, maximum number of combinations allowed
  • n_classes – integer, number of classes
  • n_repeats – integer, number of repetitions for each prevalence combination

Returns:
  the largest number of prevalence points that generates fewer than combinations_budget valid prevalences
quapy.functional.normalize_prevalence(prevalences)

Normalizes a vector or matrix of prevalence values. The normalization consists of applying an L1 normalization in cases in which the prevalence values are not all zeros, and of converting the prevalence values into 1/n_classes in cases in which all values are zero.

Parameters:
  prevalences – array-like of shape (n_classes,) or of shape (n_samples, n_classes,) with prevalence values

Returns:
  a normalized vector or matrix of prevalence values
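Example (following the behavior described above):

>>> import numpy as np
>>> from quapy.functional import normalize_prevalence
>>> normalize_prevalence(np.array([2., 1., 1.]))   # L1 normalization: [0.5, 0.25, 0.25]
>>> normalize_prevalence(np.array([0., 0., 0.]))   # all-zeros: uniform [1/3, 1/3, 1/3]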
quapy.functional.num_prevalence_combinations(n_prevpoints: int, n_classes: int, n_repeats: int = 1)

Computes the number of valid prevalence combinations in the n_classes-dimensional simplex if n_prevpoints equally distant prevalence values are generated and n_repeats repetitions are requested. The computation comes down to calculating:

\[\binom{N+C-1}{C-1} \times r\]

where N is n_prevpoints-1, i.e., the number of probability mass blocks to allocate, C is the number of classes, and r is n_repeats. This solution comes from the Stars and Bars problem.

Parameters:
  • n_classes – integer, number of classes
  • n_prevpoints – integer, number of prevalence points
  • n_repeats – integer, number of repetitions for each prevalence combination

Returns:
  the number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the number of possible combinations is 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
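As a sanity check, the formula can be reproduced with the standard library: for n_prevpoints=21 and n_classes=3, N=20 and C=3, so the count is binom(22, 2) = 231:

>>> from math import comb
>>> from quapy.functional import num_prevalence_combinations
>>> comb(21 - 1 + 3 - 1, 3 - 1)   # 231
>>> num_prevalence_combinations(n_prevpoints=21, n_classes=3, n_repeats=1)   # 231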
quapy.functional.prevalence_from_labels(labels, classes)

Computes the prevalence values from a vector of labels.

Parameters:
  • labels – array-like of shape (n_instances,) with the label of each instance
  • classes – the class labels. This is needed in order to correctly compute the prevalence vector even when some classes have no examples.

Returns:
  an ndarray of shape (len(classes),) with the class prevalence values
quapy.functional.prevalence_from_probabilities(posteriors, binarize: bool = False)

Returns a vector of prevalence values from a matrix of posterior probabilities.

Parameters:
  • posteriors – array-like of shape (n_instances, n_classes,) with posterior probabilities for each class
  • binarize – set to True (default is False) for computing the prevalence values on crisp decisions (i.e., converting the vectors of posterior probabilities into class indices, by taking the argmax)

Returns:
  array of shape (n_classes,) containing the prevalence values
quapy.functional.prevalence_linspace(n_prevalences=21, repeats=1, smooth_limits_epsilon=0.01)

Produces an array of uniformly separated prevalence values. By default, produces an array of 21 prevalence values, with step 0.05 and with the limits smoothed, i.e.: [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]

Parameters:
  • n_prevalences – the number of prevalence values to sample from the [0,1] interval (default 21)
  • repeats – number of times each prevalence is to be repeated (default 1)
  • smooth_limits_epsilon – the quantity to add and subtract to the limits 0 and 1

Returns:
  an array of uniformly separated prevalence values
quapy.functional.strprev(prevalences, prec=3)

Returns a string representation of a prevalence vector. E.g.,

>>> strprev([1/3, 2/3], prec=2)
'[0.33, 0.67]'

Parameters:
  • prevalences – a vector of prevalence values
  • prec – float precision

Returns:
  string
quapy.functional.uniform_prevalence_sampling(n_classes, size=1)

Implements the Kraemer algorithm for sampling uniformly at random from the unit simplex. This implementation is adapted from this post on Computer Science Stack Exchange: <https://cs.stackexchange.com/questions/3227/uniform-sampling-from-a-simplex>. A sketch of the algorithm is given below.

Parameters:
  • n_classes – integer, number of classes (dimensionality of the simplex)
  • size – number of samples to return

Returns:
  np.ndarray of shape (size, n_classes,) if size>1, or of shape (n_classes,) otherwise
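A minimal sketch of the Kraemer algorithm itself (an illustrative reimplementation, not the library code): draw k-1 uniform cut points in [0,1], sort them, and return the lengths of the resulting segments, which sum to 1 and are uniformly distributed on the simplex:

>>> import numpy as np
>>> def kraemer(n_classes, rng=np.random.default_rng(0)):
>>>     cuts = np.sort(rng.uniform(size=n_classes - 1))     # k-1 sorted cut points
>>>     return np.diff(np.concatenate(([0.], cuts, [1.])))  # segment lengths sum to 1
>>> kraemer(3)   # one random point in the 2-simplex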
quapy.functional.uniform_simplex_sampling(n_classes, size=1)

Implements the Kraemer algorithm for sampling uniformly at random from the unit simplex. This implementation is adapted from this post on Computer Science Stack Exchange: <https://cs.stackexchange.com/questions/3227/uniform-sampling-from-a-simplex>.

Parameters:
  • n_classes – integer, number of classes (dimensionality of the simplex)
  • size – number of samples to return

Returns:
  np.ndarray of shape (size, n_classes,) if size>1, or of shape (n_classes,) otherwise

quapy.model_selection

class quapy.model_selection.GridSearchQ(model: ~quapy.method.base.BaseQuantifier, param_grid: dict, protocol: ~quapy.protocol.AbstractProtocol, error: ~typing.Union[~typing.Callable, str] = <function mae>, refit=True, timeout=-1, n_jobs=None, verbose=False)

Bases: BaseQuantifier

Grid search optimization targeting a quantification-oriented metric.

Optimizes the hyperparameters of a quantification method, based on an evaluation method and on an evaluation protocol for quantification. See the usage sketch after this class for an example.

Parameters:
  • model (BaseQuantifier) – the quantifier to optimize
  • param_grid – a dictionary with keys the parameter names and values the lists of values to explore
  • protocol – a sample generation protocol, an instance of quapy.protocol.AbstractProtocol
  • error – an error function (callable) or a string indicating the name of an error function (valid ones are those in quapy.error.QUANTIFICATION_ERROR)
  • refit – whether or not to refit the model on the whole labelled collection (training+validation) with the best chosen hyperparameter combination. Ignored if protocol='gen'
  • timeout – establishes a timer (in seconds) for each of the hyperparameter configurations being tested. Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.
  • verbose – set to True to get information through the stdout

best_model()

Returns the best model found after calling the fit() method, i.e., the one trained on the combination of hyperparameters that minimized the error function.

Returns:
  a trained quantifier

fit(training: LabelledCollection)

Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing the error metric.

Parameters:
  training – the training set on which to optimize the hyperparameters

Returns:
  self

get_params(deep=True)

Returns the dictionary of hyperparameters to explore (param_grid).

Parameters:
  deep – unused

Returns:
  the dictionary param_grid

quantify(instances)

Estimates class prevalence values using the best model found after calling the fit() method.

Parameters:
  instances – sample containing the instances

Returns:
  a ndarray of shape (n_classes,) with class prevalence estimates according to the best model found by the model selection process

set_params(**parameters)

Sets the hyperparameters to explore.

Parameters:
  parameters – a dictionary with keys the parameter names and values the lists of values to explore
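A usage sketch (assuming train and val are pre-existing quapy.data.base.LabelledCollection objects; the 'classifier__C' key assumes the nested sklearn-style parameter naming for the wrapped classifier):

>>> import quapy as qp
>>> from sklearn.linear_model import LogisticRegression
>>> from quapy.method.aggregative import ACC
>>> search = qp.model_selection.GridSearchQ(
>>>     model=ACC(LogisticRegression()),
>>>     param_grid={'classifier__C': [0.1, 1., 10.]},
>>>     protocol=qp.protocol.APP(val, sample_size=100),
>>>     error='mae', refit=True, verbose=True).fit(train)
>>> best = search.best_model()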
quapy.model_selection.cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfolds=3, random_state=0)

Akin to scikit-learn's cross_val_predict, but for quantification.

Parameters:
  • quantifier – a quantifier issuing class prevalence values
  • data – a labelled collection
  • nfolds – number of folds for k-fold cross validation generation
  • random_state – random seed for reproducibility

Returns:
  a vector of class prevalence values

quapy.plot

quapy.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=<matplotlib.colors.ListedColormap object>, vertical_xticks=False, legend=True, savepath=None)

Box plots displaying the local bias (i.e., signed error computed as the estimated value minus the true value) for different bins of (true) prevalence of the positive class, for each quantification method.

Parameters:
  • method_names – array-like with the method names for each experiment
  • true_prevs – array-like with the true prevalence values (each being an ndarray with n_classes components) for each experiment
  • estim_prevs – array-like with the estimated prevalence values (each being an ndarray with n_classes components) for each experiment
  • pos_class – index of the positive class
  • title – the title to be displayed in the plot
  • nbins – number of bins
  • colormap – the matplotlib colormap to use (default cm.tab10)
  • vertical_xticks – whether or not to add a secondary grid (default is False)
  • legend – whether or not to display the legend (default is True)
  • savepath – path where to save the plot. If not indicated (default), the plot is shown.
quapy.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None)

Box plots displaying the global bias (i.e., signed error computed as the estimated value minus the true value) for each quantification method with respect to a given positive class.

Parameters:
  • method_names – array-like with the method names for each experiment
  • true_prevs – array-like with the true prevalence values (each being an ndarray with n_classes components) for each experiment
  • estim_prevs – array-like with the estimated prevalence values (each being an ndarray with n_classes components) for each experiment
  • pos_class – index of the positive class
  • title – the title to be displayed in the plot
  • savepath – path where to save the plot. If not indicated (default), the plot is shown.
quapy.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True, train_prev=None, savepath=None, method_order=None)

The diagonal plot displays the predicted prevalence values (along the y-axis) as a function of the true prevalence values (along the x-axis). The optimal quantifier is described by the diagonal (0,0)-(1,1) of the plot (hence the name). It is convenient for binary quantification problems, though it can be used for multiclass problems by indicating which class is to be taken as the positive class. (For multiclass quantification problems, other plots like error_by_drift() might be preferable, though.) See the usage sketch after this entry for an example.

Parameters:
  • method_names – array-like with the method names for each experiment
  • true_prevs – array-like with the true prevalence values (each being an ndarray with n_classes components) for each experiment
  • estim_prevs – array-like with the estimated prevalence values (each being an ndarray with n_classes components) for each experiment
  • pos_class – index of the positive class
  • title – the title to be displayed in the plot
  • show_std – whether or not to show standard deviations (represented by color bands). This might be inconvenient for cases in which many methods are compared, or when the standard deviations are high (default True)
  • legend – whether or not to display the legend (default True)
  • train_prev – if indicated (default is None), the training prevalence (for the positive class) is highlighted in the plot. This is convenient when all the experiments have been conducted on the same dataset.
  • savepath – path where to save the plot. If not indicated (default), the plot is shown.
  • method_order – if indicated (default is None), imposes the order in which the methods are processed (i.e., listed in the legend and associated with matplotlib colors)
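A usage sketch (assuming method_names, true_prevs and estim_prevs are aligned lists with one entry per experiment, e.g., as collected via quapy.evaluation.prediction() for each method):

>>> from quapy.plot import binary_diagonal
>>> binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1,
>>>                 title='Diagonal plot', savepath='diagonal.png')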
quapy.plot.brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, binning='isomerous', x_error='ae', y_error='ae', ttest_alpha=0.005, tail_density_threshold=0.005, method_order=None, savepath=None)

Displays (only) the top performing methods for different regions of the train-test shift in the form of a broken bar chart, in which each method has bars only for those regions in which either of the following conditions holds: (i) it is the best method (on average) for the bin, or (ii) it is not statistically significantly different (on average) according to a two-sided t-test on independent samples at confidence ttest_alpha. The binning can be made "isometric" (same size), or "isomerous" (same number of experiments – default). A second plot is displayed on top, showing the distribution of experiments for each bin (when binning="isometric") or the percentile points of the distribution (when binning="isomerous").

Parameters:
  • method_names – array-like with the method names for each experiment
  • true_prevs – array-like with the true prevalence values (each being an ndarray with n_classes components) for each experiment
  • estim_prevs – array-like with the estimated prevalence values (each being an ndarray with n_classes components) for each experiment
  • tr_prevs – training prevalence of each experiment
  • n_bins – number of bins in which the range of shift values is to be divided (default is 20)
  • binning – type of binning, either "isomerous" (default) or "isometric"
  • x_error – a string representing the name of an error function (as defined in quapy.error) to be used for measuring the amount of train-test shift (default is "ae")
  • y_error – a string representing the name of an error function (as defined in quapy.error) to be used for measuring the amount of error in the prevalence estimations (default is "ae")
  • ttest_alpha – the confidence interval above which a p-value (two-sided t-test on independent samples) is to be considered as an indicator that the two means are not statistically significantly different. Default is 0.005, meaning that a p-value > 0.005 indicates that the two methods involved are to be considered similar
  • tail_density_threshold – sets a threshold on the density of experiments (over the total number of experiments) below which a bin in the tail (i.e., the right-most ones) will be discarded. This is in order to avoid some bins being shown for train-test outliers.
  • method_order – if indicated (default is None), imposes the order in which the methods are processed (i.e., listed in the legend and associated with matplotlib colors)
  • savepath – path where to save the plot. If not indicated (default), the plot is shown.
quapy.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=False, show_density=True, show_legend=True, logscale=False, title='Quantification error as a function of distribution shift', vlines=None, method_order=None, savepath=None)

Plots the error (along the y-axis, as measured in terms of error_name) as a function of the train-test shift (along the x-axis, as measured in terms of quapy.error.ae()). This plot is especially useful for multiclass problems, in which "diagonal plots" may be cumbersome, and in order to gain an understanding of how methods fare in different regions of the prior probability shift spectrum (e.g., in the low-shift regime vs. in the high-shift regime).

Parameters:
  • method_names – array-like with the method names for each experiment
  • true_prevs – array-like with the true prevalence values (each being an ndarray with n_classes components) for each experiment
  • estim_prevs – array-like with the estimated prevalence values (each being an ndarray with n_classes components) for each experiment
  • tr_prevs – training prevalence of each experiment
  • n_bins – number of bins in which the range of shift values is to be divided (default is 20)
  • error_name – a string representing the name of an error function (as defined in quapy.error, default is "ae")
  • show_std – whether or not to show standard deviations as color bands (default is False)
  • show_density – whether or not to display the distribution of experiments for each bin (default is True)
  • show_legend – whether or not to display the legend of the chart (default is True)
  • logscale – whether or not to log-scale the error measure (default is False)
  • title – title of the plot (default is "Quantification error as a function of distribution shift")
  • vlines – array-like list of values (default is None). If indicated, highlights some regions of the space using vertical dotted lines.
  • method_order – if indicated (default is None), imposes the order in which the methods are processed (i.e., listed in the legend and associated with matplotlib colors)
  • savepath – path where to save the plot. If not indicated (default), the plot is shown.

quapy.util

class quapy.util.EarlyStop(patience, lower_is_better=True)

Bases: object

A class implementing the early-stopping condition typically used for training neural networks. An instance of this class is callable, and is to be used as follows:

>>> earlystop = EarlyStop(patience=2, lower_is_better=True)
>>> earlystop(0.9, epoch=0)
>>> earlystop(0.7, epoch=1)
>>> earlystop.IMPROVED  # is True
>>> earlystop(1.0, epoch=2)
>>> earlystop.STOP  # is False (patience=1)
>>> earlystop(1.0, epoch=3)
>>> earlystop.STOP  # is True (patience=0)
>>> earlystop.best_epoch  # is 1
>>> earlystop.best_score  # is 0.7

Parameters:
  • patience – the number of (consecutive) times that a monitored evaluation metric (typically obtained on a held-out validation split) can be found to be worse than the best one obtained so far, before flagging the stopping condition
  • lower_is_better – if True (default) the metric is to be minimized

Variables:
  • best_score – keeps track of the best value seen so far
  • best_epoch – keeps track of the epoch in which the best score was set
  • STOP – flag (boolean) indicating the stopping condition
  • IMPROVED – flag (boolean) indicating whether there was an improvement in the last call
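A typical usage pattern in a training loop (validate() is a hypothetical function returning a validation loss):

>>> from quapy.util import EarlyStop
>>> earlystop = EarlyStop(patience=5, lower_is_better=True)
>>> for epoch in range(100):
>>>     val_loss = validate()       # hypothetical held-out evaluation
>>>     earlystop(val_loss, epoch)
>>>     if earlystop.IMPROVED:
>>>         pass                    # e.g., checkpoint the best model so far
>>>     if earlystop.STOP:
>>>         break
>>> earlystop.best_epoch, earlystop.best_score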
quapy.util.create_if_not_exist(path)

An alias to os.makedirs(path, exist_ok=True) that also returns the path. This is useful in cases like, e.g.:

>>> path = create_if_not_exist(os.path.join(dir, subdir, anotherdir))

Parameters:
  path – path to create

Returns:
  the path itself
quapy.util.create_parent_dir(path)

Creates the parent dir (if any) of a given path, if it does not exist. E.g., for ./path/to/file.txt, the path ./path/to is created.

Parameters:
  path – the path

quapy.util.download_file(url, archive_filename)

Downloads a file from a url.

Parameters:
  • url – the url
  • archive_filename – destination filename

quapy.util.download_file_if_not_exists(url, archive_filename)

Downloads a file (using download_file()) if it does not exist already.

Parameters:
  • url – the url
  • archive_filename – destination filename
quapy.util.get_quapy_home()

Gets the home directory of QuaPy, i.e., the directory where QuaPy saves permanent data, such as downloaded datasets. This directory is ~/quapy_data

Returns:
  a string representing the path

quapy.util.map_parallel(func, args, n_jobs)

Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then func is applied in two parallel processes to args[0:50] and to args[50:99]. func is a function that already works with a list of arguments.

Parameters:
  • func – function to be parallelized
  • args – array-like of arguments to be passed to the function in different parallel calls
  • n_jobs – the number of workers
quapy.util.parallel(func, args, n_jobs, seed=None)

A wrapper of multiprocessing:

>>> Parallel(n_jobs=n_jobs)(
>>>      delayed(func)(args_i) for args_i in args
>>> )

that takes the quapy.environ variable as input silently. Seeds the child processes to ensure reproducibility when n_jobs>1.
quapy.util.pickled_resource(pickle_path: str, generation_func: callable, *args)

Allows for fast reuse of resources that are generated only once by calling generation_func(*args). The next times this function is invoked, it loads the pickled resource. Example:

>>> def some_array(n):  # a mock resource created with one parameter (`n`)
>>>     return np.random.rand(n)
>>> pickled_resource('./my_array.pkl', some_array, 10)  # the resource does not exist: it is created by calling some_array(10)
>>> pickled_resource('./my_array.pkl', some_array, 10)  # the resource exists; it is loaded from './my_array.pkl'

Parameters:
  • pickle_path – the path where to save (the first time) and load (the next times) the resource
  • generation_func – the function that generates the resource, in case it does not exist in pickle_path
  • args – any args that generation_func uses for generating the resource

Returns:
  the resource
quapy.util.save_text_file(path, text)

Saves a text file to disk, given its full path, and creates the parent directory if missing.

Parameters:
  • path – path where to save the file
  • text – text to save

quapy.util.temp_seed(random_state)

Can be used in a "with" context to set a temporary seed without modifying the outer numpy's current state. E.g.:

>>> with temp_seed(random_seed):
>>>     pass  # do any computation depending on np.random functionality

Parameters:
  random_state – the seed to set within the "with" context
Subpackages

Module contents

QuaPy module for quantification
diff --git a/docs/build/html/quapy.method.html b/docs/build/html/quapy.method.html
deleted file mode 100644
index 7c3689c..0000000
--- a/docs/build/html/quapy.method.html
+++ /dev/null

quapy.method package — QuaPy 0.1.7 documentation
quapy.method package

Submodules

quapy.method.aggregative

class quapy.method.aggregative.ACC(classifier: BaseEstimator, val_split=0.4, n_jobs=None)

Bases: AggregativeQuantifier

Adjusted Classify & Count, the "adjusted" variant of CC, that corrects the predictions of CC according to the misclassification rates.

Parameters:
  • classifier – a sklearn's Estimator that generates a classifier
  • val_split – indicates the proportion of data to be used as a stratified held-out validation set in which the misclassification rates are to be estimated. This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of validation data, or as an integer, indicating that the misclassification rates should be estimated via k-fold cross validation (this integer stands for the number of folds k), or as a quapy.data.base.LabelledCollection (the split itself).
aggregate(classif_predictions)

Implements the aggregation of label predictions.

Parameters:
  classif_predictions – np.ndarray of label predictions

Returns:
  np.ndarray of shape (n_classes,) with class prevalence estimates

classify(data)

Provides the label predictions for the given instances. The predictions should respect the format expected by aggregate(), i.e., posterior probabilities for probabilistic quantifiers, or crisp predictions for non-probabilistic quantifiers.

Parameters:
  instances – array-like

Returns:
  np.ndarray of shape (n_instances,) with label predictions

fit(data: LabelledCollection, fit_classifier=True, val_split: Optional[Union[float, int, LabelledCollection]] = None)

Trains an ACC quantifier.

Parameters:
  • data – the training set
  • fit_classifier – set to False to bypass the training (the learner is assumed to be already fit)
  • val_split – either a float in (0,1) indicating the proportion of training instances to use for validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection indicating the validation set itself, or an int indicating the number k of folds to be used in k-fold cross validation to estimate the parameters

Returns:
  self
classmethod getPteCondEstim(classes, y, y_)

classmethod solve_adjustment(PteCondEstim, prevs_estim)

Solves the linear system \(Ax = B\) with \(A\) = PteCondEstim and \(B\) = prevs_estim.

Parameters:
  • PteCondEstim – a np.ndarray of shape (n_classes, n_classes,) with entry (i,j) being the estimate of \(P(y_i|y_j)\), that is, the probability that an instance that belongs to \(y_j\) ends up being classified as belonging to \(y_i\)
  • prevs_estim – a np.ndarray of shape (n_classes,) with the class prevalence estimates

Returns:
  an adjusted np.ndarray of shape (n_classes,) with the corrected class prevalence estimates
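For intuition, a minimal check with hand-made values (a binary case with tpr=0.9 and fpr=0.2; the numbers are illustrative):

>>> import numpy as np
>>> from quapy.method.aggregative import ACC
>>> PteCondEstim = np.array([[0.8, 0.1],    # P(classified as y_i | true class y_j);
>>>                          [0.2, 0.9]])   # columns sum to 1
>>> prevs_estim = np.array([0.48, 0.52])    # CC estimates to be corrected
>>> ACC.solve_adjustment(PteCondEstim, prevs_estim)   # approx. [0.543, 0.457]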
quapy.method.aggregative.AdjustedClassifyAndCount

alias of ACC

class quapy.method.aggregative.AggregativeProbabilisticQuantifier

Bases: AggregativeQuantifier

Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities as returned by a probabilistic classifier. Aggregative probabilistic quantifiers thus extend aggregative quantifiers by implementing a _posterior_probabilities_ method returning values in [0,1] – the posterior probabilities.

classify(instances)

Provides the label predictions for the given instances. The predictions should respect the format expected by aggregate(), i.e., posterior probabilities for probabilistic quantifiers, or crisp predictions for non-probabilistic quantifiers.

Parameters:
  instances – array-like

Returns:
  np.ndarray of shape (n_instances,) with label predictions
-
class quapy.method.aggregative.AggregativeQuantifier

Bases: BaseQuantifier

Abstract class for quantification methods that base their estimations on the aggregation of
classification results. Aggregative quantifiers thus implement a classify() method and maintain
a classifier attribute. Subclasses of this abstract class must implement the method aggregate(),
which computes the aggregation of label predictions. The method quantify() comes with a default
implementation based on classify() and aggregate().

abstract aggregate(classif_predictions: ndarray)

Implements the aggregation of label predictions.

Parameters:

classif_predictions – np.ndarray of label predictions

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

property classes_

Class labels, in the same order in which class prevalence values are to be computed.
This default implementation returns the class labels of the learner.

Returns:

array-like

property classifier

Gives access to the classifier.

Returns:

the classifier (typically an sklearn Estimator)

classify(instances)

Provides the label predictions for the given instances. The predictions should respect the format
expected by aggregate(), i.e., posterior probabilities for probabilistic quantifiers, or crisp
predictions for non-probabilistic quantifiers.

Parameters:

instances – array-like

Returns:

np.ndarray of shape (n_instances,) with label predictions

abstract fit(data: LabelledCollection, fit_classifier=True)

Trains the aggregative quantifier.

Parameters:
  • data – a quapy.data.base.LabelledCollection consisting of the training data
  • fit_classifier – whether or not to train the learner (default is True). Set to False if the
    learner has been trained outside the quantifier.

Returns:

self

quantify(instances)

Generates class prevalence estimates for the sample's instances by aggregating the label
predictions generated by the classifier.

Parameters:

instances – array-like

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

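Since quantify() is described above as a default composition of classify() and aggregate(), the
following minimal sketch illustrates that composition; the class name is hypothetical and this is
not the library's actual source code:

>>> # illustrative sketch only: the default quantify() chains classify() and aggregate()
>>> class MySketchQuantifier(AggregativeQuantifier):        # hypothetical subclass
>>>     def quantify(self, instances):
>>>         classif_predictions = self.classify(instances)  # crisp labels or posteriors
>>>         return self.aggregate(classif_predictions)      # np.ndarray of shape (n_classes,)
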
class quapy.method.aggregative.CC(classifier: BaseEstimator)

Bases: AggregativeQuantifier

The most basic quantification method: it simply classifies all instances and counts how many have
been attributed to each of the classes in order to compute class prevalence estimates.

Parameters:

classifier – a sklearn Estimator that generates a classifier

aggregate(classif_predictions: ndarray)

Computes class prevalence estimates by counting the prevalence of each of the predicted labels.

Parameters:

classif_predictions – array-like with label predictions

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

fit(data: LabelledCollection, fit_classifier=True)

Trains the Classify & Count method, unless fit_classifier is False, in which case the classifier
is assumed to be already fit and there is nothing else to do.

Parameters:
  • data – a quapy.data.base.LabelledCollection consisting of the training data
  • fit_classifier – whether or not to train the learner (default is True)

Returns:

self

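A minimal usage sketch follows; it assumes the fetch_reviews loader used elsewhere in these docs
(the tfidf and min_df arguments are assumptions about that loader's signature):

>>> import quapy as qp
>>> from quapy.method.aggregative import CC
>>> from sklearn.linear_model import LogisticRegression
>>>
>>> dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
>>> model = CC(LogisticRegression())
>>> model.fit(dataset.training)
>>> estim_prevalence = model.quantify(dataset.test.instances)
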
quapy.method.aggregative.ClassifyAndCount

alias of CC

class quapy.method.aggregative.DistributionMatching(classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable] = 'HD', cdf=False, n_jobs=None)

Bases: AggregativeProbabilisticQuantifier

Generic distribution-matching quantifier for binary or multiclass quantification. This
implementation exposes the number of bins, the divergence, and the possibility to work on CDFs
as hyperparameters.

Parameters:
  • classifier – a sklearn Estimator that generates a probabilistic classifier
  • val_split – indicates the proportion of data to be used as a stratified held-out validation set
    with which to model the validation distribution. This parameter can be indicated as a real
    value (between 0 and 1, default 0.4) representing a proportion of validation data, as an
    integer (indicating that the validation distribution should be estimated via k-fold
    cross-validation, with this integer standing for the number of folds k), or as a
    quapy.data.base.LabelledCollection (the split itself).
  • nbins – number of bins used to discretize the distributions (default 8)
  • divergence – a string representing a divergence measure (currently, "HD" and "topsoe" are
    implemented) or a callable function taking two ndarrays of the same dimension as input
    (default "HD", meaning Hellinger Distance)
  • cdf – whether or not to use the CDF instead of the PDF (default False)
  • n_jobs – number of parallel workers (default None)

aggregate(posteriors: ndarray)

Searches for the mixture-model parameter (the sought prevalence values) that yields a validation
distribution (the mixture) that best matches the test distribution, in terms of the divergence
measure of choice. In the multiclass case, with n the number of classes, the test and mixture
distributions contain n channels (proper distributions of binned posterior probabilities), on
which the divergence is computed independently. The matching is computed as the average of the
divergence across all channels.

Parameters:

posteriors – posterior probabilities of the instances in the sample

Returns:

a vector of class prevalence estimates

fit(data: LabelledCollection, fit_classifier=True, val_split: Optional[Union[float, LabelledCollection]] = None)

Trains the classifier (if requested) and generates the validation distributions out of the
training data. The validation distributions have shape (n, ch, nbins), with n the number of
classes, ch the number of channels, and nbins the number of bins. In particular, let V be the
validation distributions; di = V[i] are the distributions obtained from training data labelled
with class i; dij = di[j] is the discrete distribution of posterior probabilities P(Y=j|X=x) for
training data labelled with class i; and dij[k] is the fraction of instances whose value falls in
the k-th bin.

Parameters:
  • data – the training set
  • fit_classifier – set to False to bypass the training (the learner is assumed to be already fit)
  • val_split – either a float in (0,1) indicating the proportion of training instances to use for
    validation (e.g., 0.3 for using 30% of the training set as validation data), a
    LabelledCollection indicating the validation set itself, or an int indicating the number k of
    folds to be used in k-fold cross-validation to estimate the parameters

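A minimal construction sketch, reusing the dataset loaded in the CC example above:

>>> from quapy.method.aggregative import DistributionMatching
>>> from sklearn.linear_model import LogisticRegression
>>>
>>> # 5-fold cross-validation for the validation distributions, Hellinger Distance on PDFs
>>> dm = DistributionMatching(LogisticRegression(), val_split=5, nbins=8, divergence='HD', cdf=False)
>>> dm.fit(dataset.training)
>>> estim_prevalence = dm.quantify(dataset.test.instances)
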
class quapy.method.aggregative.DyS(classifier: BaseEstimator, val_split=0.4, n_bins=8, divergence: Union[str, Callable] = 'HD', tol=1e-05)

Bases: AggregativeProbabilisticQuantifier, BinaryQuantifier

DyS framework. DyS is a generalization of the HDy method that uses a ternary search to find the
prevalence value that minimizes the distance between distributions. Details of the ternary search
are taken from <https://dl.acm.org/doi/pdf/10.1145/3219819.3220059>.

Parameters:
  • classifier – a sklearn Estimator that generates a binary classifier
  • val_split – a float in range (0,1) indicating the proportion of data to be used as a stratified
    held-out validation distribution, or a quapy.data.base.LabelledCollection (the split itself)
  • n_bins – an int with the number of bins used to compute the histograms
  • divergence – a str indicating the name of the divergence (currently supported ones are "HD" and
    "topsoe"), or a callable function that computes the divergence between two distributions (two
    equally sized arrays)
  • tol – a float with the tolerance for the ternary search algorithm

aggregate(classif_posteriors)

Implements the aggregation of label predictions.

Parameters:

classif_posteriors – np.ndarray of posterior probabilities

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

fit(data: LabelledCollection, fit_classifier=True, val_split: Optional[Union[float, LabelledCollection]] = None)

Trains the aggregative quantifier.

Parameters:
  • data – a quapy.data.base.LabelledCollection consisting of the training data
  • fit_classifier – whether or not to train the learner (default is True). Set to False if the
    learner has been trained outside the quantifier.

Returns:

self

class quapy.method.aggregative.EMQ(classifier: BaseEstimator, exact_train_prev=True, recalib=None)

Bases: AggregativeProbabilisticQuantifier

Expectation Maximization for Quantification (EMQ), aka the Saerens-Latinne-Decaestecker (SLD)
algorithm. EMQ uses the well-known Expectation Maximization algorithm to iteratively update the
posterior probabilities generated by a probabilistic classifier and the class prevalence estimates
obtained via maximum-likelihood estimation, in a mutually recursive way, until convergence.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • exact_train_prev – set to True (default) to use the true training prevalence as the initial
    observation; set to False to compute the training prevalence as an estimate, akin to PCC,
    i.e., as the expected value of the posterior probabilities of the training instances, as
    suggested by Alexandari et al.
  • recalib – a string indicating the recalibration method. Available choices include "nbvs"
    (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling), "ts" (Temperature
    Scaling), and "vs" (Vector Scaling). The default value is None, indicating no recalibration.

classmethod EM(tr_prev, posterior_probabilities, epsilon=0.0001)

Computes the Expectation Maximization routine.

Parameters:
  • tr_prev – array-like, the training prevalence
  • posterior_probabilities – np.ndarray of shape (n_instances, n_classes) with the posterior
    probabilities
  • epsilon – float, the minimum difference between the prevalence estimates of two consecutive
    iterations below which the loop stops

Returns:

a tuple with the estimated prevalence values (shape (n_classes,)) and the corrected posterior
probabilities (shape (n_instances, n_classes))

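The mutually recursive update described above can be sketched as follows; this is an illustrative
re-derivation of one EM iteration, not the library's source code:

>>> # illustrative sketch of one EM iteration (not the library source)
>>> import numpy as np
>>> def em_step(qs, tr_prev, posteriors):
>>>     # E-step: reweight the posteriors by the ratio of current to training prevalence
>>>     ps = posteriors * (qs / tr_prev)
>>>     ps /= ps.sum(axis=1, keepdims=True)
>>>     # M-step: re-estimate the prevalence as the expected value of the corrected posteriors
>>>     return ps.mean(axis=0), ps
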
EPSILON = 0.0001

MAX_ITER = 1000

aggregate(classif_posteriors, epsilon=0.0001)

Implements the aggregation of label predictions.

Parameters:

classif_posteriors – np.ndarray of posterior probabilities

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

fit(data: LabelledCollection, fit_classifier=True)

Trains the aggregative quantifier.

Parameters:
  • data – a quapy.data.base.LabelledCollection consisting of the training data
  • fit_classifier – whether or not to train the learner (default is True). Set to False if the
    learner has been trained outside the quantifier.

Returns:

self

predict_proba(instances, epsilon=0.0001)

quapy.method.aggregative.ExpectationMaximizationQuantifier

alias of EMQ

class quapy.method.aggregative.HDy(classifier: BaseEstimator, val_split=0.4)

Bases: AggregativeProbabilisticQuantifier, BinaryQuantifier

Hellinger Distance y (HDy). HDy is a probabilistic method for training binary quantifiers that
models quantification as the problem of minimizing the divergence (in terms of the Hellinger
Distance) between two distributions of posterior probabilities returned by the classifier. One of
the distributions is generated from the unlabelled examples and the other is generated from a
validation set. The latter distribution is defined as a mixture of the class-conditional
distributions of the posterior probabilities returned for the positive and negative validation
examples, respectively. The parameters of the mixture thus represent the estimates of the class
prevalence values.

Parameters:
  • classifier – a sklearn Estimator that generates a binary classifier
  • val_split – a float in range (0,1) indicating the proportion of data to be used as a stratified
    held-out validation distribution, or a quapy.data.base.LabelledCollection (the split itself)

aggregate(classif_posteriors)

Implements the aggregation of label predictions.

Parameters:

classif_posteriors – np.ndarray of posterior probabilities

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

fit(data: LabelledCollection, fit_classifier=True, val_split: Optional[Union[float, LabelledCollection]] = None)

Trains an HDy quantifier.

Parameters:
  • data – the training set
  • fit_classifier – set to False to bypass the training (the learner is assumed to be already fit)
  • val_split – either a float in (0,1) indicating the proportion of training instances to use for
    validation (e.g., 0.3 for using 30% of the training set as validation data), or a
    quapy.data.base.LabelledCollection indicating the validation set itself

Returns:

self

quapy.method.aggregative.HellingerDistanceY

alias of HDy

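A minimal usage sketch; binary_dataset stands for any binary dataset loaded as in the examples
above:

>>> from quapy.method.aggregative import HDy
>>> from sklearn.linear_model import LogisticRegression
>>>
>>> hdy = HDy(LogisticRegression(), val_split=0.4)
>>> hdy.fit(binary_dataset.training)
>>> estim_prevalence = hdy.quantify(binary_dataset.test.instances)
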
class quapy.method.aggregative.MAX(classifier: BaseEstimator, val_split=0.4)

Bases: ThresholdOptimization

Threshold Optimization variant for ACC, as proposed by Forman 2006 and Forman 2008, that looks for
the threshold that maximizes tpr-fpr. The goal is to bring improved stability to the denominator
of the adjustment.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • val_split – indicates the proportion of data to be used as a stratified held-out validation set
    on which the misclassification rates are to be estimated. This parameter can be indicated as a
    real value (between 0 and 1, default 0.4) representing a proportion of validation data, as an
    integer (indicating that the misclassification rates should be estimated via k-fold
    cross-validation, with this integer standing for the number of folds k), or as a
    quapy.data.base.LabelledCollection (the split itself).

class quapy.method.aggregative.MS(classifier: BaseEstimator, val_split=0.4)

Bases: ThresholdOptimization

Median Sweep. Threshold Optimization variant for ACC, as proposed by Forman 2006 and Forman 2008,
that generates class prevalence estimates for all decision thresholds and returns the median of
them all. The goal is to bring improved stability to the denominator of the adjustment.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • val_split – indicates the proportion of data to be used as a stratified held-out validation set
    on which the misclassification rates are to be estimated. This parameter can be indicated as a
    real value (between 0 and 1, default 0.4) representing a proportion of validation data, as an
    integer (indicating that the misclassification rates should be estimated via k-fold
    cross-validation, with this integer standing for the number of folds k), or as a
    quapy.data.base.LabelledCollection (the split itself).

class quapy.method.aggregative.MS2(classifier: BaseEstimator, val_split=0.4)

Bases: MS

Median Sweep 2. Threshold Optimization variant for ACC, as proposed by Forman 2006 and Forman
2008, that generates class prevalence estimates for all decision thresholds and returns the
median of the estimates computed for those cases in which tpr-fpr > 0.25. The goal is to bring
improved stability to the denominator of the adjustment.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • val_split – indicates the proportion of data to be used as a stratified held-out validation set
    on which the misclassification rates are to be estimated. This parameter can be indicated as a
    real value (between 0 and 1, default 0.4) representing a proportion of validation data, as an
    integer (indicating that the misclassification rates should be estimated via k-fold
    cross-validation, with this integer standing for the number of folds k), or as a
    quapy.data.base.LabelledCollection (the split itself).

quapy.method.aggregative.MedianSweep

alias of MS

quapy.method.aggregative.MedianSweep2

alias of MS2

class quapy.method.aggregative.OneVsAllAggregative(binary_quantifier, n_jobs=None, parallel_backend='multiprocessing')

Bases: OneVsAllGeneric, AggregativeQuantifier

Allows any binary quantifier to perform quantification on single-label datasets. The method
maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the
class prevalence values sum up to 1. This variant was used, along with the EMQ quantifier, in
Gao and Sebastiani, 2016.

Parameters:
  • binary_quantifier – a (binary) quantifier that will be employed to work on a multiclass model
    in a one-vs-all manner
  • n_jobs – number of parallel workers
  • parallel_backend – the parallel backend for joblib (default "multiprocessing"); this is helpful
    for some quantifiers (e.g., ELM-based ones) that cannot be run with multiprocessing, since the
    temp dir they create during fit is removed and no longer available at predict time

aggregate(classif_predictions)

Implements the aggregation of label predictions.

Parameters:

classif_predictions – np.ndarray of label predictions

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

classify(instances)

If the base quantifier is not probabilistic, returns a matrix of shape (n,m), with n the number
of instances and m the number of classes. The entry (i,j) is a binary value indicating whether
instance i belongs to class j. The binary classifications are independent of each other, meaning
that an instance can end up being attributed to 0, 1, or more classes. If the base quantifier is
probabilistic, returns a matrix of shape (n,m,2), with n the number of instances and m the number
of classes. The entry (i,j,1) (resp. (i,j,0)) is a value in [0,1] indicating the posterior
probability that instance i belongs (resp. does not belong) to class j. The posterior
probabilities are independent of each other, meaning that, in general, they do not sum up to one.

Parameters:

instances – array-like

Returns:

np.ndarray

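A minimal sketch lifting a binary quantifier to the multiclass case; multiclass_dataset stands
for any single-label dataset loaded as in the examples above:

>>> from quapy.method.aggregative import HDy, OneVsAllAggregative
>>> from sklearn.linear_model import LogisticRegression
>>>
>>> ova = OneVsAllAggregative(HDy(LogisticRegression()), n_jobs=-1)
>>> ova.fit(multiclass_dataset.training)
>>> estim_prevalence = ova.quantify(multiclass_dataset.test.instances)
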
class quapy.method.aggregative.PACC(classifier: BaseEstimator, val_split=0.4, n_jobs=None)

Bases: AggregativeProbabilisticQuantifier

Probabilistic Adjusted Classify & Count: the probabilistic variant of ACC that relies on the
posterior probabilities returned by a probabilistic classifier.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • val_split – indicates the proportion of data to be used as a stratified held-out validation set
    on which the misclassification rates are to be estimated. This parameter can be indicated as a
    real value (between 0 and 1, default 0.4) representing a proportion of validation data, as an
    integer (indicating that the misclassification rates should be estimated via k-fold
    cross-validation, with this integer standing for the number of folds k), or as a
    quapy.data.base.LabelledCollection (the split itself).
  • n_jobs – number of parallel workers

aggregate(classif_posteriors)

Implements the aggregation of label predictions.

Parameters:

classif_posteriors – np.ndarray of posterior probabilities

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

classify(data)

Provides the label predictions for the given instances. The predictions should respect the format
expected by aggregate(), i.e., posterior probabilities for probabilistic quantifiers, or crisp
predictions for non-probabilistic quantifiers.

Parameters:

data – array-like of instances

Returns:

np.ndarray of shape (n_instances,) with label predictions

fit(data: LabelledCollection, fit_classifier=True, val_split: Optional[Union[float, int, LabelledCollection]] = None)

Trains a PACC quantifier.

Parameters:
  • data – the training set
  • fit_classifier – set to False to bypass the training (the learner is assumed to be already fit)
  • val_split – either a float in (0,1) indicating the proportion of training instances to use for
    validation (e.g., 0.3 for using 30% of the training set as validation data), a
    LabelledCollection indicating the validation set itself, or an int indicating the number k of
    folds to be used in k-fold cross-validation to estimate the parameters

Returns:

self

classmethod getPteCondEstim(classes, y, y_)

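A minimal usage sketch, reusing the dataset loaded in the CC example above:

>>> from quapy.method.aggregative import PACC
>>> from sklearn.linear_model import LogisticRegression
>>>
>>> # estimate the misclassification rates via 5-fold cross-validation
>>> pacc = PACC(LogisticRegression(), val_split=5)
>>> pacc.fit(dataset.training)
>>> estim_prevalence = pacc.quantify(dataset.test.instances)
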
class quapy.method.aggregative.PCC(classifier: BaseEstimator)

Bases: AggregativeProbabilisticQuantifier

Probabilistic Classify & Count: the probabilistic variant of CC that relies on the posterior
probabilities returned by a probabilistic classifier.

Parameters:

classifier – a sklearn Estimator that generates a classifier

aggregate(classif_posteriors)

Implements the aggregation of label predictions.

Parameters:

classif_posteriors – np.ndarray of posterior probabilities

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

fit(data: LabelledCollection, fit_classifier=True)

Trains the aggregative quantifier.

Parameters:
  • data – a quapy.data.base.LabelledCollection consisting of the training data
  • fit_classifier – whether or not to train the learner (default is True). Set to False if the
    learner has been trained outside the quantifier.

Returns:

self

quapy.method.aggregative.ProbabilisticAdjustedClassifyAndCount

alias of PACC

quapy.method.aggregative.ProbabilisticClassifyAndCount

alias of PCC

quapy.method.aggregative.SLD

alias of EMQ

class quapy.method.aggregative.SMM(classifier: BaseEstimator, val_split=0.4)

Bases: AggregativeProbabilisticQuantifier, BinaryQuantifier

SMM method. SMM is a simplification of matching-distribution methods in which the representation
of the examples is created using the mean instead of a histogram.

Parameters:
  • classifier – a sklearn Estimator that generates a binary classifier
  • val_split – a float in range (0,1) indicating the proportion of data to be used as a stratified
    held-out validation distribution, or a quapy.data.base.LabelledCollection (the split itself)

aggregate(classif_posteriors)

Implements the aggregation of label predictions.

Parameters:

classif_posteriors – np.ndarray of posterior probabilities

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

fit(data: LabelledCollection, fit_classifier=True, val_split: Optional[Union[float, LabelledCollection]] = None)

Trains the aggregative quantifier.

Parameters:
  • data – a quapy.data.base.LabelledCollection consisting of the training data
  • fit_classifier – whether or not to train the learner (default is True). Set to False if the
    learner has been trained outside the quantifier.

Returns:

self

class quapy.method.aggregative.T50(classifier: BaseEstimator, val_split=0.4)

Bases: ThresholdOptimization

Threshold Optimization variant for ACC, as proposed by Forman 2006 and Forman 2008, that looks
for the threshold that makes tpr closest to 0.5. The goal is to bring improved stability to the
denominator of the adjustment.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • val_split – indicates the proportion of data to be used as a stratified held-out validation set
    on which the misclassification rates are to be estimated. This parameter can be indicated as a
    real value (between 0 and 1, default 0.4) representing a proportion of validation data, as an
    integer (indicating that the misclassification rates should be estimated via k-fold
    cross-validation, with this integer standing for the number of folds k), or as a
    quapy.data.base.LabelledCollection (the split itself).

class quapy.method.aggregative.ThresholdOptimization(classifier: BaseEstimator, val_split=0.4, n_jobs=None)

Bases: AggregativeQuantifier, BinaryQuantifier

Abstract class of the Threshold Optimization variants for ACC, as proposed by Forman 2006 and
Forman 2008. The goal is to bring improved stability to the denominator of the adjustment. The
different variants are based on different heuristics for choosing a decision threshold that would
allow for more true positives and many more false positives, on the grounds that this would
deliver larger denominators.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • val_split – indicates the proportion of data to be used as a stratified held-out validation set
    on which the misclassification rates are to be estimated. This parameter can be indicated as a
    real value (between 0 and 1, default 0.4) representing a proportion of validation data, as an
    integer (indicating that the misclassification rates should be estimated via k-fold
    cross-validation, with this integer standing for the number of folds k), or as a
    quapy.data.base.LabelledCollection (the split itself).

aggregate(classif_predictions)

Implements the aggregation of label predictions.

Parameters:

classif_predictions – np.ndarray of label predictions

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

fit(data: LabelledCollection, fit_classifier=True, val_split: Optional[Union[float, int, LabelledCollection]] = None)

Trains the aggregative quantifier.

Parameters:
  • data – a quapy.data.base.LabelledCollection consisting of the training data
  • fit_classifier – whether or not to train the learner (default is True). Set to False if the
    learner has been trained outside the quantifier.

Returns:

self

class quapy.method.aggregative.X(classifier: BaseEstimator, val_split=0.4)

Bases: ThresholdOptimization

Threshold Optimization variant for ACC, as proposed by Forman 2006 and Forman 2008, that looks
for the threshold that yields tpr = 1-fpr. The goal is to bring improved stability to the
denominator of the adjustment.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • val_split – indicates the proportion of data to be used as a stratified held-out validation set
    on which the misclassification rates are to be estimated. This parameter can be indicated as a
    real value (between 0 and 1, default 0.4) representing a proportion of validation data, as an
    integer (indicating that the misclassification rates should be estimated via k-fold
    cross-validation, with this integer standing for the number of folds k), or as a
    quapy.data.base.LabelledCollection (the split itself).

quapy.method.aggregative.cross_generate_predictions(data, classifier, val_split, probabilistic, fit_classifier, n_jobs)

quapy.method.aggregative.newELM(svmperf_base=None, loss='01', C=1)

Explicit Loss Minimization (ELM) quantifiers. Quantifiers based on ELM represent a family of
methods based on structured output learning; these quantifiers rely on classifiers that have been
optimized using a quantification-oriented loss measure. This implementation relies on Joachims'
SVMperf structured output learning algorithm, which has to be installed and patched for the
purpose (see the installation script). This function is equivalent to:

>>> CC(SVMperf(svmperf_base, loss, C))

Parameters:
  • svmperf_base – path to the folder containing the binary files of SVMperf; if set to None
    (default), this path will be obtained from qp.environ['SVMPERF_HOME']
  • loss – the loss to optimize (see quapy.classification.svmperf.SVMperf.valid_losses)
  • C – trade-off between training error and margin (default 1)

Returns:

an instance of CC set to work with SVMperf (with loss and C set properly) as the underlying
classifier

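A minimal usage sketch; the SVMPERF_HOME path below is a placeholder for wherever the patched
SVMperf binaries live, and binary_dataset stands for any binary dataset loaded as in the examples
above:

>>> import quapy as qp
>>> from quapy.method.aggregative import newELM
>>>
>>> qp.environ['SVMPERF_HOME'] = './svm_perf_quantification'   # placeholder path
>>> svm_quantifier = newELM(loss='q', C=1)
>>> svm_quantifier.fit(binary_dataset.training)
>>> estim_prevalence = svm_quantifier.quantify(binary_dataset.test.instances)
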
quapy.method.aggregative.newSVMAE(svmperf_base=None, C=1)

SVM(AE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Absolute Error,
as first used by Moreo and Sebastiani, 2021. Equivalent to:

>>> CC(SVMperf(svmperf_base, loss='mae', C=C))

Quantifiers based on ELM represent a family of methods based on structured output learning; these
quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
measure. This implementation relies on Joachims' SVMperf structured output learning algorithm,
which has to be installed and patched for the purpose (see the installation script). This function
is a wrapper around CC(SVMperf(svmperf_base, loss, C)).

Parameters:
  • svmperf_base – path to the folder containing the binary files of SVMperf; if set to None
    (default), this path will be obtained from qp.environ['SVMPERF_HOME']
  • C – trade-off between training error and margin (default 1)

Returns:

an instance of CC set to work with SVMperf (with loss and C set properly) as the underlying
classifier

quapy.method.aggregative.newSVMKLD(svmperf_base=None, C=1)

SVM(KLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the
Kullback-Leibler Divergence normalized via the logistic function, as proposed by Esuli et al.
2015. Equivalent to:

>>> CC(SVMperf(svmperf_base, loss='nkld', C=C))

Quantifiers based on ELM represent a family of methods based on structured output learning; these
quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
measure. This implementation relies on Joachims' SVMperf structured output learning algorithm,
which has to be installed and patched for the purpose (see the installation script). This function
is a wrapper around CC(SVMperf(svmperf_base, loss, C)).

Parameters:
  • svmperf_base – path to the folder containing the binary files of SVMperf; if set to None
    (default), this path will be obtained from qp.environ['SVMPERF_HOME']
  • C – trade-off between training error and margin (default 1)

Returns:

an instance of CC set to work with SVMperf (with loss and C set properly) as the underlying
classifier

quapy.method.aggregative.newSVMQ(svmperf_base=None, C=1)

SVM(Q) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Q loss, which
combines a classification-oriented loss and a quantification-oriented loss, as proposed by
Barranquero et al. 2015. Equivalent to:

>>> CC(SVMperf(svmperf_base, loss='q', C=C))

Quantifiers based on ELM represent a family of methods based on structured output learning; these
quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
measure. This implementation relies on Joachims' SVMperf structured output learning algorithm,
which has to be installed and patched for the purpose (see the installation script). This function
is a wrapper around CC(SVMperf(svmperf_base, loss, C)).

Parameters:
  • svmperf_base – path to the folder containing the binary files of SVMperf; if set to None
    (default), this path will be obtained from qp.environ['SVMPERF_HOME']
  • C – trade-off between training error and margin (default 1)

Returns:

an instance of CC set to work with SVMperf (with loss and C set properly) as the underlying
classifier

quapy.method.aggregative.newSVMRAE(svmperf_base=None, C=1)

SVM(RAE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Relative
Absolute Error, as first used by Moreo and Sebastiani, 2021. Equivalent to:

>>> CC(SVMperf(svmperf_base, loss='mrae', C=C))

Quantifiers based on ELM represent a family of methods based on structured output learning; these
quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
measure. This implementation relies on Joachims' SVMperf structured output learning algorithm,
which has to be installed and patched for the purpose (see the installation script). This function
is a wrapper around CC(SVMperf(svmperf_base, loss, C)).

Parameters:
  • svmperf_base – path to the folder containing the binary files of SVMperf; if set to None
    (default), this path will be obtained from qp.environ['SVMPERF_HOME']
  • C – trade-off between training error and margin (default 1)

Returns:

an instance of CC set to work with SVMperf (with loss and C set properly) as the underlying
classifier

quapy.method.base

class quapy.method.base.BaseQuantifier

Bases: BaseEstimator

Abstract quantifier. A quantifier is defined as an object of a class that implements the method
fit() on quapy.data.base.LabelledCollection, the method quantify(), and the methods set_params()
and get_params() for model selection (see quapy.model_selection.GridSearchQ).

abstract fit(data: LabelledCollection)

Trains a quantifier.

Parameters:

data – a quapy.data.base.LabelledCollection consisting of the training data

Returns:

self

abstract quantify(instances)

Generates class prevalence estimates for the sample's instances.

Parameters:

instances – array-like

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

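Given only this abstract interface, a minimal custom quantifier can be sketched as follows; the
class name is hypothetical, and data.prevalence() is assumed to return the training prevalence:

>>> from quapy.data.base import LabelledCollection
>>> from quapy.method.base import BaseQuantifier
>>>
>>> class TrainPrevalenceQuantifier(BaseQuantifier):   # hypothetical example class
>>>     def fit(self, data: LabelledCollection):
>>>         self.prev_ = data.prevalence()             # memorize the training prevalence
>>>         return self
>>>     def quantify(self, instances):
>>>         return self.prev_                          # ignore the test instances
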
class quapy.method.base.BinaryQuantifier

Bases: BaseQuantifier

Abstract class of binary quantifiers, i.e., quantifiers estimating class prevalence values for
only two classes (typically, to be interpreted as one class and its complement).

class quapy.method.base.OneVsAll

Bases: object

class quapy.method.base.OneVsAllGeneric(binary_quantifier, n_jobs=None)

Bases: OneVsAll, BaseQuantifier

Allows any binary quantifier to perform quantification on single-label datasets. The method
maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the
class prevalence values sum up to 1.

property classes_

fit(data: LabelledCollection, fit_classifier=True)

Trains a quantifier.

Parameters:

data – a quapy.data.base.LabelledCollection consisting of the training data

Returns:

self

quantify(instances)

Generates class prevalence estimates for the sample's instances.

Parameters:

instances – array-like

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

quapy.method.base.newOneVsAll(binary_quantifier, n_jobs=None)

quapy.method.meta

quapy.method.meta.EACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs)

Implements an ensemble of quapy.method.aggregative.ACC quantifiers, as used by
Pérez-Gállego et al., 2019.

Equivalent to:

>>> ensembleFactory(classifier, ACC, param_grid, optim, param_mod_sel, **kwargs)

See ensembleFactory() for further details.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • param_grid – a dictionary with the grid of parameters to optimize for
  • optim – a valid quantification or classification error, or a string name of it
  • param_mod_sel – a dictionary containing any keyword argument to pass to
    quapy.model_selection.GridSearchQ
  • kwargs – kwargs for the class Ensemble

Returns:

an instance of Ensemble

quapy.method.meta.ECC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs)

Implements an ensemble of quapy.method.aggregative.CC quantifiers, as used by
Pérez-Gállego et al., 2019.

Equivalent to:

>>> ensembleFactory(classifier, CC, param_grid, optim, param_mod_sel, **kwargs)

See ensembleFactory() for further details.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • param_grid – a dictionary with the grid of parameters to optimize for
  • optim – a valid quantification or classification error, or a string name of it
  • param_mod_sel – a dictionary containing any keyword argument to pass to
    quapy.model_selection.GridSearchQ
  • kwargs – kwargs for the class Ensemble

Returns:

an instance of Ensemble

quapy.method.meta.EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs)

Implements an ensemble of quapy.method.aggregative.EMQ quantifiers.

Equivalent to:

>>> ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)

See ensembleFactory() for further details.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • param_grid – a dictionary with the grid of parameters to optimize for
  • optim – a valid quantification or classification error, or a string name of it
  • param_mod_sel – a dictionary containing any keyword argument to pass to
    quapy.model_selection.GridSearchQ
  • kwargs – kwargs for the class Ensemble

Returns:

an instance of Ensemble

quapy.method.meta.EHDy(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs)

Implements an ensemble of quapy.method.aggregative.HDy quantifiers, as used by
Pérez-Gállego et al., 2019.

Equivalent to:

>>> ensembleFactory(classifier, HDy, param_grid, optim, param_mod_sel, **kwargs)

See ensembleFactory() for further details.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • param_grid – a dictionary with the grid of parameters to optimize for
  • optim – a valid quantification or classification error, or a string name of it
  • param_mod_sel – a dictionary containing any keyword argument to pass to
    quapy.model_selection.GridSearchQ
  • kwargs – kwargs for the class Ensemble

Returns:

an instance of Ensemble

quapy.method.meta.EPACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs)

Implements an ensemble of quapy.method.aggregative.PACC quantifiers.

Equivalent to:

>>> ensembleFactory(classifier, PACC, param_grid, optim, param_mod_sel, **kwargs)

See ensembleFactory() for further details.

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • param_grid – a dictionary with the grid of parameters to optimize for
  • optim – a valid quantification or classification error, or a string name of it
  • param_mod_sel – a dictionary containing any keyword argument to pass to
    quapy.model_selection.GridSearchQ
  • kwargs – kwargs for the class Ensemble

Returns:

an instance of Ensemble

class quapy.method.meta.Ensemble(quantifier: BaseQuantifier, size=50, red_size=25, min_pos=5, policy='ave', max_sample_size=None, val_split: Optional[Union[float, LabelledCollection]] = None, n_jobs=None, verbose=False)

Bases: BaseQuantifier

Implementation of the ensemble methods for quantification described by
Pérez-Gállego et al., 2017 and Pérez-Gállego et al., 2019.

VALID_POLICIES = {'ave', 'ds', 'mae', 'mkld', 'mnkld', 'mrae', 'mse', 'ptr'}

The policies implemented include:

  • Average (policy='ave'): computes class prevalence estimates as the average of the estimates
    returned by the base quantifiers.
  • Training Prevalence (policy='ptr'): applies a dynamic selection to the ensemble's members by
    retaining only those members such that the class prevalence values in the samples they use as
    training sets are closest to preliminary class prevalence estimates computed as the average of
    the estimates of all the members. The final estimate is recomputed by considering only the
    selected members.
  • Distribution Similarity (policy='ds'): performs a dynamic selection of base members by
    retaining the members trained on samples whose distribution of posterior probabilities is
    closest, in terms of the Hellinger Distance, to the distribution of posterior probabilities in
    the test sample.
  • Accuracy (policy='<valid error name>'): performs a static selection of the ensemble members by
    retaining those that minimize a quantification error measure, which is passed as an argument.

Example:

>>> model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='ave', n_jobs=-1)

Parameters:
  • quantifier – base quantification member of the ensemble
  • size – number of members
  • red_size – number of members to retain after selection (depending on the policy)
  • min_pos – minimum number of positive instances to consider a sample as valid
  • policy – the selection policy; available policies include: ave (default), ptr, ds, and accuracy
    (which is instantiated via a valid error name, e.g., mae)
  • max_sample_size – maximum number of instances to consider in the samples (set to None, the
    default, to indicate no limit)
  • val_split – a float in range (0,1) indicating the proportion of data to be used as a stratified
    held-out validation split, or a quapy.data.base.LabelledCollection (the split itself)
  • n_jobs – number of parallel workers (default 1)
  • verbose – set to True (default is False) to get some information on standard output

property aggregative

Indicates that the quantifier is not aggregative.

Returns:

False

fit(data: LabelledCollection, val_split: Optional[Union[float, LabelledCollection]] = None)

Trains a quantifier.

Parameters:

data – a quapy.data.base.LabelledCollection consisting of the training data

Returns:

self

get_params(deep=True)

This function should not be used within quapy.model_selection.GridSearchQ (it is here for
compatibility with the abstract class). Instead, use Ensemble(GridSearchQ(q), ...), with q a
quantifier (recommended), or Ensemble(Q(GridSearchCV(l))), with Q a quantifier class that has a
classifier l optimized for classification (not recommended).

Parameters:

deep – for compatibility with scikit-learn

Returns:

raises an Exception

property probabilistic

Indicates that the quantifier is not probabilistic.

Returns:

False

quantify(instances)

Generates class prevalence estimates for the sample's instances.

Parameters:

instances – array-like

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

set_params(**parameters)

This function should not be used within quapy.model_selection.GridSearchQ (it is here for
compatibility with the abstract class). Instead, use Ensemble(GridSearchQ(q), ...), with q a
quantifier (recommended), or Ensemble(Q(GridSearchCV(l))), with Q a quantifier class that has a
classifier l optimized for classification (not recommended).

Parameters:

parameters – dictionary

Returns:

raises an Exception

quapy.method.meta.ensembleFactory(classifier, base_quantifier_class, param_grid=None, optim=None, param_model_sel: Optional[dict] = None, **kwargs)

Ensemble factory. Provides a unified interface for instantiating ensembles that can be optimized
(via model selection for quantification) for a given evaluation metric using
quapy.model_selection.GridSearchQ. If the evaluation metric is classification-oriented (instead
of quantification-oriented), then the optimization will be carried out via sklearn's GridSearchCV.

The following example instantiates an Ensemble based on quapy.method.aggregative.PACC in which
the base members are optimized for quapy.error.mae() via quapy.model_selection.GridSearchQ. The
ensemble follows the Accuracy policy based on quapy.error.mae() (the same measure being
optimized), meaning that a static selection of ensemble members is made based on their
performance in terms of this error.

>>> param_grid = {
>>>     'C': np.logspace(-3,3,7),
>>>     'class_weight': ['balanced', None]
>>> }
>>> param_mod_sel = {
>>>     'sample_size': 500,
>>>     'protocol': 'app'
>>> }
>>> common={
>>>     'max_sample_size': 1000,
>>>     'n_jobs': -1,
>>>     'param_grid': param_grid,
>>>     'param_mod_sel': param_mod_sel,
>>> }
>>>
>>> ensembleFactory(LogisticRegression(), PACC, optim='mae', policy='mae', **common)

Parameters:
  • classifier – a sklearn Estimator that generates a classifier
  • base_quantifier_class – a class of quantifiers
  • param_grid – a dictionary with the grid of parameters to optimize for
  • optim – a valid quantification or classification error, or a string name of it
  • param_model_sel – a dictionary containing any keyword argument to pass to
    quapy.model_selection.GridSearchQ
  • kwargs – kwargs for the class Ensemble

Returns:

an instance of Ensemble

quapy.method.meta.get_probability_distribution(posterior_probabilities, bins=8)

Gets a histogram out of the posterior probabilities (only for the binary case).

Parameters:
  • posterior_probabilities – array-like of shape (n_instances, 2)
  • bins – integer

Returns:

np.ndarray with the relative frequencies for each bin (for the positive class only)

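A small sketch of what this helper computes:

>>> import numpy as np
>>> from quapy.method.meta import get_probability_distribution
>>>
>>> posteriors = np.asarray([[0.9, 0.1], [0.4, 0.6], [0.2, 0.8]])
>>> hist = get_probability_distribution(posteriors, bins=4)
>>> # hist holds the relative frequency of positive-class posteriors falling in each of 4 bins
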
quapy.method.neural

class quapy.method.neural.QuaNetModule(doc_embedding_size, n_classes, stats_size, lstm_hidden_size=64, lstm_nlayers=1, ff_layers=[1024, 512], bidirectional=True, qdrop_p=0.5, order_by=0)

Bases: Module

Implements the QuaNet forward pass. See QuaNetTrainer for training QuaNet.

Parameters:
  • doc_embedding_size – integer, the dimensionality of the document embeddings
  • n_classes – integer, number of classes
  • stats_size – integer, number of statistics estimated by simple quantification methods
  • lstm_hidden_size – integer, hidden dimensionality of the LSTM cell
  • lstm_nlayers – integer, number of LSTM layers
  • ff_layers – list of integers, dimensions of the densely-connected FF layers on top of the
    quantification embedding
  • bidirectional – boolean, whether or not to use a bidirectional LSTM
  • qdrop_p – float, dropout probability
  • order_by – integer, class by which the document embeddings are to be sorted

property device

forward(doc_embeddings, doc_posteriors, statistics)

Defines the computation performed at every call. Should be overridden by all subclasses.

Note: although the recipe for the forward pass needs to be defined within this function, one
should call the Module instance afterwards instead of calling this function directly, since the
former takes care of running the registered hooks while the latter silently ignores them.

training: bool

class quapy.method.neural.QuaNetTrainer(classifier, sample_size=None, n_epochs=100, tr_iter_per_poch=500, va_iter_per_poch=100, lr=0.001, lstm_hidden_size=64, lstm_nlayers=1, ff_layers=[1024, 512], bidirectional=True, qdrop_p=0.5, patience=10, checkpointdir='../checkpoint', checkpointname=None, device='cuda')

Bases: BaseQuantifier

Implementation of QuaNet, a neural network for quantification. This implementation uses PyTorch
and can take advantage of a GPU for speeding up the training phase.

Example:

>>> import quapy as qp
>>> from quapy.method.meta import QuaNet
>>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet
>>>
>>> # use samples of 100 elements
>>> qp.environ['SAMPLE_SIZE'] = 100
>>>
>>> # load the kindle dataset as text, and convert words to numerical indexes
>>> dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
>>> qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
>>>
>>> # the text classifier is a CNN trained by NeuralClassifierTrainer
>>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
>>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
>>>
>>> # train QuaNet (QuaNet is an alias of QuaNetTrainer)
>>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda')
>>> model.fit(dataset.training)
>>> estim_prevalence = model.quantify(dataset.test.instances)

Parameters:
  • classifier – an object implementing fit (i.e., that can be trained on labelled data),
    predict_proba (i.e., that can generate posterior probabilities for unlabelled examples), and
    transform (i.e., that can generate embedded representations of the unlabelled instances)
  • sample_size – integer, the sample size; default is None, meaning that the sample size is taken
    from qp.environ['SAMPLE_SIZE']
  • n_epochs – integer, maximum number of training epochs
  • tr_iter_per_poch – integer, number of training iterations before considering an epoch complete
  • va_iter_per_poch – integer, number of validation iterations to perform after each epoch
  • lr – float, the learning rate
  • lstm_hidden_size – integer, hidden dimensionality of the LSTM cells
  • lstm_nlayers – integer, number of LSTM layers
  • ff_layers – list of integers, dimensions of the densely-connected FF layers on top of the
    quantification embedding
  • bidirectional – boolean, indicates whether the LSTM is bidirectional or not
  • qdrop_p – float, dropout probability
  • patience – integer, number of epochs showing no improvement in the validation set before
    stopping the training phase (early stopping)
  • checkpointdir – string, a path where to store the models' checkpoints
  • checkpointname – string (optional), the name of the model's checkpoint
  • device – string, either "cpu" or "cuda"

property classes_

clean_checkpoint()

Removes the checkpoint.

clean_checkpoint_dir()

Removes anything contained in the checkpoint directory.

fit(data: LabelledCollection, fit_classifier=True)

Trains QuaNet.

Parameters:
  • data – the training data on which to train QuaNet. If fit_classifier=True, the data will be
    split 40/40/20 for training the classifier, training QuaNet, and validating QuaNet,
    respectively. If fit_classifier=False, the data will be split 66/34 for training QuaNet and
    validating it, respectively.
  • fit_classifier – if True, trains the classifier on a split containing 40% of the data

Returns:

self

get_params(deep=True)

Gets the parameters for this estimator.

Parameters:

deep (bool, default=True) – if True, will return the parameters for this estimator and for the
contained subobjects that are estimators

Returns:

params – parameter names mapped to their values

Return type:

dict

quantify(instances)

Generates class prevalence estimates for the sample's instances.

Parameters:

instances – array-like

Returns:

np.ndarray of shape (n_classes,) with class prevalence estimates.

set_params(**parameters)

Sets the parameters of this estimator.

The method works on simple estimators as well as on nested objects (such as Pipeline). The
latter have parameters of the form <component>__<parameter>, so that it is possible to update
each component of a nested object.

Parameters:

**params (dict) – estimator parameters

Returns:

self – estimator instance

Return type:

estimator instance

quapy.method.neural.mae_loss(output, target)

Torch-like wrapper for the Mean Absolute Error.

Parameters:
  • output – predictions
  • target – ground-truth values

Returns:

mean absolute error loss

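A minimal sketch of how the wrapper might be used, assuming torch tensors of matching shape:

>>> import torch
>>> from quapy.method.neural import mae_loss
>>>
>>> output = torch.tensor([[0.2, 0.8]])   # predicted prevalences
>>> target = torch.tensor([[0.3, 0.7]])   # true prevalences
>>> loss = mae_loss(output, target)       # a scalar tensor suitable for backpropagation
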
quapy.method.non_aggregative

class quapy.method.non_aggregative.MaximumLikelihoodPrevalenceEstimation

Bases: BaseQuantifier

The Maximum Likelihood Prevalence Estimation (MLPE) method is a lazy method that assumes there is
no prior probability shift between training and test instances (put another way, that the i.i.d.
assumption holds). The estimation of class prevalence values for any test sample is always (i.e.,
irrespective of the test sample itself) the class prevalence seen during training. This method is
considered a lower bound that any reasonable quantification method should beat.

fit(data: LabelledCollection)

Computes the training prevalence and stores it.

Parameters:

data – the training sample

Returns:

self

quantify(instances)

Ignores the input instances and returns, as the class prevalence estimates, the training
prevalence.

Parameters:

instances – array-like (ignored)

Returns:

the class prevalence seen during training

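A minimal usage sketch, reusing the dataset loaded in the examples above:

>>> from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
>>>
>>> mlpe = MaximumLikelihoodPrevalenceEstimation()
>>> mlpe.fit(dataset.training)
>>> estim_prevalence = mlpe.quantify(dataset.test.instances)  # the training prevalence
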
Module contents

diff --git a/docs/build/html/search.html b/docs/build/html/search.html
deleted file mode 100644
index 480e246..0000000

diff --git a/docs/build/html/searchindex.js b/docs/build/html/searchindex.js
deleted file mode 100644
index 6e202ab..0000000
11, 14], "train_path": [0, 13], "my_data": 0, "dat": [0, 12], "test_path": [0, 13], "my_custom_load": 0, "rb": 0, "fin": 0, "preprocess": [0, 5, 11, 14], "includ": [0, 2, 3, 5, 6, 7, 8, 9, 13, 14], "text2tfidf": [0, 5, 13], "tfidf": [0, 6, 7, 8, 13], "vector": [0, 5, 8, 11, 12, 13, 14], "reduce_column": [0, 13], "reduc": [0, 1, 13], "column": [0, 1, 11, 13], "base": [0, 5, 9, 11, 12], "term": [0, 5, 6, 7, 9, 11, 12, 13, 14], "frequenc": [0, 13, 14], "transform": [0, 12, 13, 14], "valu": [0, 1, 5, 6, 8, 11, 12, 13, 14], "score": [0, 1, 6, 11, 12, 13], "subtract": [0, 11, 13], "normal": [0, 1, 5, 11, 13, 14], "deviat": [0, 7, 8, 11, 13], "so": [0, 5, 7, 8, 11, 12, 13, 14], "zero": [0, 11], "unit": [0, 9, 11], "varianc": [0, 7], "textual": [0, 9, 13], "token": [0, 12, 13], "appeal": 1, "tool": [1, 9], "scenario": [1, 5, 6, 7, 9], "dataset": [1, 3, 5, 6, 7, 8, 9, 11, 12, 14], "shift": [1, 6, 8, 9, 11, 12, 14], "particularli": 1, "prior": [1, 5, 6, 7, 8, 9, 11, 14], "probabl": [1, 5, 6, 7, 8, 9, 11, 12, 14], "That": [1, 6], "interest": [1, 7, 8, 9, 11], "estim": [0, 1, 5, 7, 9, 11, 12, 13, 14], "aris": 1, "under": [1, 8], "belief": 1, "those": [1, 5, 6, 7, 11, 12, 14], "might": [1, 11, 13], "ones": [1, 5, 7, 11, 13, 14], "observ": [1, 14], "dure": [1, 7, 14], "other": [1, 5, 7, 9, 11, 13, 14], "word": [1, 5, 9, 12, 13, 14], "simpli": [1, 2, 4, 5, 6, 7, 9, 11, 14], "predictor": 1, "assum": [1, 9, 14], "unlik": [1, 11], "machin": [1, 6, 9, 12], "learn": [1, 4, 5, 6, 9, 11, 12, 13, 14], "govern": 1, "iid": [1, 7, 9], "assumpt": [1, 7, 9], "brief": [0, 1, 13], "dedic": [0, 1, 13], "explain": [1, 7], "here": [1, 14], "mae": [1, 6, 8, 9, 11, 12, 14], "absolut": [1, 5, 7, 9, 11, 14], "mrae": [1, 9, 11, 12, 14], "rel": [1, 5, 11, 13, 14], "mse": [1, 5, 9, 11, 14], "squar": [1, 5, 11], "mkld": [1, 11, 14], "kullback": [1, 5, 11, 14], "leibler": [1, 5, 11, 14], "diverg": [1, 5, 11, 14], "mnkld": [1, 11, 14], "ae": [1, 2, 4, 5, 7, 11], "rae": [1, 2, 4, 5, 11], "se": [1, 11], "kld": [1, 2, 4, 5, 11, 12, 14], "nkld": [1, 2, 4, 5, 9, 11, 12, 14], "individu": [1, 5], "without": [1, 5, 11, 13], "averag": [1, 5, 11, 13, 14], "acc": [1, 5, 7, 8, 9, 11, 14], "accuraci": [1, 7, 11, 14], "f1e": [1, 11], "f1": [1, 11, 12], "true_prev": [1, 7, 11], "prevs_hat": [1, 11], "ndarrai": [1, 5, 11, 13, 14], "contain": [1, 2, 4, 5, 7, 8, 11, 12, 13, 14], "smooth": [1, 11], "stabil": [1, 14], "third": [1, 7], "ep": [1, 11], "none": [1, 6, 8, 11, 12, 13, 14], "paramet": [1, 5, 6, 8, 11, 12, 13, 14], "epsilon": [1, 11, 14], "tradition": 1, "2t": [1, 11], "past": 1, "either": [1, 5, 11, 14], "environ": [1, 5, 6, 7, 8, 11, 14], "variabl": [1, 5, 7, 11, 13], "onc": [0, 1, 5, 6, 7, 8, 11, 13], "ommit": [], "thereaft": 1, "recommend": [1, 7, 14], "np": [1, 5, 6, 7, 8, 11, 13, 14], "asarrai": 1, "let": [1, 5, 6, 11, 14], "estim_prev": [1, 7, 11], "ae_": [], "3f": [1, 9], "200": [1, 12], "600": 1, "914": 1, "final": [1, 5, 7, 14], "possibl": [1, 5, 8, 11, 14], "string": [1, 11, 13, 14], "error_funct": 1, "from_nam": [1, 11], "accord": [5, 6, 11, 12, 13, 14], "fix": 8, "cover": [8, 11, 12], "full": [8, 11], "contrast": [], "natur": [9, 11], "despit": [], "introduc": [], "approxim": [5, 7, 11, 12], "preserv": [7, 11, 13], "procol": [], "equal": [8, 11, 14], "distant": [8, 11], "interv": [7, 8, 11], "n_prevpoint": [8, 11], "determin": [6, 7, 8, 11], "constrain": [7, 8, 11, 13], "obtain": [8, 11, 12, 14], "66": [8, 14], "given": [1, 5, 6, 8, 11, 12, 13, 14], "num_prevalence_combin": [8, 11], "21": [5, 7, 8, 11], "n_class": [5, 8, 11, 12, 
13, 14], "n_repeat": [8, 11], "1771": 8, "note": [1, 5, 7, 8, 11, 13], "last": [5, 7, 8, 11, 12, 13], "typic": [1, 6, 7, 8, 11, 12, 13, 14], "singl": [1, 5, 8, 9, 11, 14], "higher": [7, 8], "comput": [1, 5, 7, 8, 11, 14], "perform": [1, 5, 6, 7, 8, 9, 11, 12, 14], "signific": 8, "instead": [0, 1, 5, 6, 8, 11, 13, 14], "work": [5, 7, 8, 11, 13, 14], "wai": [5, 8, 14], "around": [8, 13, 14], "maximum": [8, 11, 12, 14], "budg": 8, "close": [8, 13], "than": [1, 6, 7, 8, 11, 12, 13], "budget": 8, "achiev": [5, 6, 7, 8], "get_nprevpoints_approxim": [8, 11], "5000": [0, 1, 7, 8], "4960": 8, "cost": [], "sometim": 8, "cumbersom": 8, "control": [6, 8, 11], "overal": 11, "experi": [0, 4, 5, 6, 7, 11, 13], "rather": 6, "By": [5, 11], "avoid": 11, "lead": 13, "closer": [], "surpass": [], "script": [2, 4, 5, 9, 14], "pacc": [5, 7, 11, 14], "reli": [5, 8, 11, 14], "logist": [5, 12, 14], "regressor": 5, "classifi": [6, 7, 9, 11, 12, 14], "variou": 7, "metric": [5, 6, 9, 11, 14], "sklearn": [5, 6, 7, 8, 9, 12, 13, 14], "linear_model": [5, 6, 8, 9, 12], "logisticregress": [5, 6, 8, 9, 12, 14], "data": [5, 6, 7, 9, 11, 12, 14], "min_df": [5, 6, 7, 8, 13, 14], "inplac": [5, 13, 14], "lr": [5, 12, 14], "aggreg": [1, 6, 7, 8, 9, 11], "fit": [5, 6, 7, 8, 9, 11, 12, 13, 14], "df": [], "artificial_sampling_report": [], "mani": [1, 5, 6, 7, 8, 9, 11, 14], "extract": [11, 13], "categori": 11, "n_repetit": [], "n_job": [5, 6, 8, 11, 12, 13, 14], "parallel": [5, 6, 11, 12, 13, 14], "worker": [11, 12, 13, 14], "cpu": [12, 14], "random_se": 11, "42": [], "random": [5, 7, 8, 11, 13], "seed": [8, 11, 13], "replic": [8, 11], "error_metr": [1, 6, 8, 11], "line": [5, 11], "result": [1, 2, 4, 5, 7, 9, 14], "report": [1, 11], "panda": [1, 4, 11], "datafram": [1, 11], "displai": [1, 7, 8, 11, 12], "just": [5, 8], "clearer": [], "shown": [7, 11], "convert": [5, 11, 12, 13, 14], "repres": [5, 7, 11, 13, 14], "decim": [], "default": [5, 8, 11, 12, 13, 14], "pd": 1, "set_opt": 1, "expand_frame_repr": 1, "fals": [1, 5, 7, 11, 12, 13, 14], "map": [1, 12, 14], "000": [], "000e": [], "091": 1, "909": 1, "009": [], "048": [], "426e": [], "04": [], "837": [], "037": [], "114": [], "633e": [], "03": [], "7": [5, 6, 7, 8, 9, 11, 12, 14], "717": [], "017": [], "041": [], "383e": [], "366": [], "634": [], "034": [], "070": [], "412e": [], "459": [], "541": [], "387e": [], "565": [], "435": [], "035": 1, "073": [], "535e": [], "654": [], "346": [], "046": [], "108": [], "701e": [], "725": [], "275": [], "075": [], "235": [], "515e": [], "02": [], "858": [], "142": [], "042": [], "229": [], "740e": [], "945": [], "055": [], "27": [5, 12], "357": [], "219e": [], "578": [], "dtype": [1, 13], "float64": 1, "artificial_sampling_ev": [], "artificial_sampling_predict": [], "arrai": [5, 7, 11, 12, 13, 14], "pip": 4, "older": 4, "version": [2, 4, 11, 12], "scikit": [4, 5, 6, 11, 12, 13, 14], "numpi": [4, 6, 8, 11, 12], "scipi": [4, 13], "pytorch": [4, 14], "quanet": [4, 9, 12, 14], "svmperf": [2, 3, 4, 5, 11, 14], "patch": [2, 4, 5, 12, 14], "joblib": [4, 14], "tqdm": 4, "matplotlib": [4, 11], "involv": [4, 7, 11], "you": [4, 5], "appli": [1, 2, 4, 5, 6, 7, 11, 12, 13, 14], "ext": [2, 4], "compil": [2, 4, 5], "sourc": [2, 4, 5, 9, 12], "prepare_svmperf": [2, 4, 5], "sh": [2, 4, 5], "job": [2, 4], "directori": [2, 4, 11, 12, 13, 14], "svm_perf_quantif": [2, 4, 5], "optim": [1, 2, 4, 6, 11, 12, 14], "measur": [2, 4, 5, 6, 7, 9, 11, 14], "propos": [2, 4, 5, 8, 14], "barranquero": [2, 4, 5, 12, 14], "extend": [2, 4, 5, 11, 14], "former": [4, 14], 
"categor": [5, 13], "belong": [5, 6, 14], "non": [5, 14], "group": 5, "though": [5, 11], "plan": 5, "add": [5, 6, 11, 13], "more": [1, 2, 5, 7, 8, 11, 14], "futur": 5, "character": [1, 5, 9], "fact": [5, 7], "product": [0, 5, 13], "quantifi": [0, 1, 5, 6, 7, 8, 9, 11, 13, 14], "shoud": 5, "basequantifi": [5, 11, 14], "abstract": [5, 11, 12, 13, 14], "abstractmethod": 5, "self": [5, 6, 11, 12, 13, 14], "set_param": [5, 11, 12, 14], "get_param": [5, 11, 12, 14], "deep": [5, 11, 12, 14], "familiar": 5, "structur": [5, 14], "inspir": 5, "reason": [5, 7, 8, 9], "why": 5, "ha": [1, 5, 6, 7, 8, 11, 12, 13, 14], "adopt": [5, 6, 13], "respond": 5, "predict": [1, 5, 7, 11, 12, 14], "input": [5, 7, 11, 12, 13, 14], "element": [5, 11, 13, 14], "while": [0, 5, 7, 12, 13, 14], "selector": 5, "process": [1, 6, 11], "hyperparamet": [5, 8, 11, 14], "search": [6, 9, 11, 14], "part": [5, 13], "aggregativequantifi": [1, 5, 14], "must": [5, 13, 14], "fit_learn": 5, "classif_predict": [5, 14], "mention": 5, "befor": [5, 11, 12, 13, 14], "inde": [5, 8], "alreadi": [1, 5, 11, 14], "preclassifi": [], "maintain": [5, 14], "through": [5, 11], "properti": [5, 11, 12, 13, 14], "learner": [5, 6, 12, 14], "extern": 5, "probabilist": [5, 11, 12, 14], "inherit": [5, 8, 11], "aggregativeprobabilisticquantifi": [5, 14], "posterior": [5, 11, 12, 14], "crisp": [1, 5, 11, 14], "decis": [5, 11, 12, 14], "hard": [5, 11, 12], "classif_posterior": 14, "posterior_prob": 14, "advantag": [5, 8, 14], "procedur": [1, 5, 9, 11], "veri": [5, 7, 11], "effici": 5, "everi": [0, 1, 5, 6, 8, 11, 14], "leverag": 5, "speed": [1, 5, 11, 14], "up": [1, 5, 11, 12, 14], "over": [5, 6, 11], "customarili": [5, 6], "done": [5, 6], "four": 5, "cc": [5, 7, 14], "simplest": 5, "deliv": [5, 6, 14], "adjust": [5, 9, 11, 14], "pcc": [5, 7, 14], "soft": [1, 5], "serv": [5, 11, 13], "complet": [5, 7, 14], "equip": [5, 7], "svm": [2, 5, 7, 9, 12, 13, 14], "linearsvc": [5, 7, 13], "pickl": [5, 11, 13, 14], "alia": [5, 11, 13, 14], "classifyandcount": [5, 14], "estim_preval": [5, 9, 14], "rate": [5, 11, 12, 14], "binari": [0, 5, 7, 9, 11, 12, 13, 14], "init": 5, "addit": [5, 11], "val_split": [5, 6, 12, 14], "integ": [5, 11, 12, 13, 14], "k": [5, 9, 11, 12, 13, 14], "fold": [5, 11, 13, 14], "cross": [5, 11, 12, 13, 14], "specif": [1, 5, 6, 8, 11], "held": [5, 6, 11, 12, 14], "out": [0, 1, 5, 6, 7, 11, 12, 13, 14], "postpon": [1, 5], "constructor": 5, "prevail": 5, "overrid": 5, "illustr": [3, 5, 6, 7], "seem": 5, "calibr": [5, 11], "calibratedclassifiercv": 5, "base_estim": 5, "cv": [5, 6], "predict_proba": [5, 12, 14], "As": [5, 6], "calibratedclassifi": 5, "except": [5, 11, 14], "rais": [5, 11, 14], "lastli": 5, "everyth": 5, "said": 5, "aboud": 5, "sld": [5, 14], "expectationmaximizationquantifi": [5, 14], "describ": [5, 11, 14], "saeren": [5, 14], "m": [5, 11, 14], "latinn": [5, 14], "decaesteck": [5, 14], "c": [5, 6, 11, 12, 13, 14], "2002": 5, "priori": 5, "14": 5, "41": 5, "attempt": 5, "although": [5, 6, 7, 8, 14], "improv": [5, 11, 12, 14], "rank": [5, 12], "almost": 5, "alwai": [5, 7, 14], "among": 5, "effect": 5, "carri": [0, 1, 5, 11, 13, 14], "gonz\u00e1lez": 5, "castro": 5, "v": [5, 11, 12, 14], "alaiz": 5, "rodr\u0131": 5, "guez": 5, "alegr": 5, "2013": 5, "scienc": 5, "218": 5, "146": 5, "It": [1, 5, 6, 7, 11], "allia": 5, "hellingerdistancei": [5, 14], "mixtur": [5, 11, 14], "previou": 5, "overridden": [5, 14], "proport": [5, 6, 12, 13, 14], "taken": [5, 11, 12, 13, 14], "itself": [5, 11, 14], "accept": 5, "elm": [2, 5, 14], "famili": [5, 14], 
"target": [5, 7, 9, 11, 12, 14], "orient": [2, 5, 9, 11, 14], "joachim": [5, 12, 14], "svmq": 5, "d\u00edez": 5, "reliabl": 5, "pattern": 5, "recognit": 5, "48": 5, "591": 5, "604": 5, "svmkld": [], "multivari": [5, 12], "transact": 5, "discoveri": 5, "articl": [5, 6], "svmnkld": [], "svmae": [], "error": [5, 6, 9, 10, 12, 14], "svmrae": [], "what": 5, "nowadai": 5, "consid": [5, 7, 8, 11, 12, 13, 14], "behav": [5, 7, 8], "If": [5, 7, 11, 13, 14], "want": [5, 6], "custom": [5, 8, 9, 11, 13], "modifi": [5, 11], "assign": [5, 13], "Then": 5, "re": [5, 6, 12, 13], "thing": [5, 8], "your": 5, "svmperf_hom": [5, 14], "valid_loss": [5, 12, 14], "mycustomloss": 5, "28": [0, 1, 5, 13], "current": [5, 11, 12, 13, 14], "support": [5, 9, 13, 14], "oper": 5, "trivial": 5, "strategi": [5, 6], "2016": [5, 13, 14], "sentiment": [5, 9, 13], "19": [5, 13], "onevsal": [5, 14], "know": [5, 6], "where": [5, 7, 11, 12, 13, 14], "top": [5, 11, 14], "thu": [1, 5, 6, 7, 11, 12, 14], "nor": 5, "castano": [5, 13], "2019": [5, 13, 14], "dynam": [5, 12, 13, 14], "task": [0, 5, 6, 9, 13], "45": [5, 7, 13], "15": [5, 11, 13], "polici": [5, 14], "processor": 5, "av": [5, 14], "ptr": [5, 14], "member": [5, 14], "d": [5, 14], "static": [5, 14], "red_siz": [5, 14], "pleas": 5, "check": [5, 11], "offer": [5, 9], "torch": [5, 12, 14], "embed": [5, 12, 14], "lstm": [5, 12, 14], "cnn": [5, 14], "its": [5, 6, 8, 11, 12, 14], "layer": [5, 12, 14], "neuralclassifiertrain": [5, 12, 14], "cnnnet": [5, 12, 14], "vocabulary_s": [5, 12, 13, 14], "cuda": [5, 12, 14], "supervis": [6, 9], "strongli": [6, 7], "good": [6, 7], "choic": [1, 6, 14], "hyper": [6, 11, 12], "wherebi": 6, "chosen": [1, 6, 11], "pick": 6, "best": [6, 11, 12, 14], "being": [1, 6, 8, 11, 14], "criteria": 6, "solv": [6, 14], "assess": 6, "own": 6, "right": [6, 11, 13], "impos": [6, 11], "aim": [6, 7], "appropri": 6, "configur": [6, 11], "design": 6, "long": [6, 12], "regard": 6, "next": [6, 11, 12, 13], "section": [6, 8], "argu": 6, "alejandro": 6, "fabrizio": 6, "count": [6, 7, 9, 11, 13, 14], "arxiv": [], "preprint": [], "2011": [], "02552": [], "2020": [5, 12], "varieti": 6, "exhibit": [6, 7, 8], "degre": 6, "model_select": [6, 8, 10, 14], "gridsearchq": [6, 8, 11, 14], "grid": [6, 8, 11, 14], "explor": [6, 11], "portion": [], "param_grid": [6, 8, 11, 14], "logspac": [6, 8, 14], "class_weight": [6, 7, 14], "eval_budget": [], "refit": [6, 11], "retrain": [6, 12], "goe": 6, "end": [6, 11, 14], "best_params_": 6, "best_model_": 6, "101": [], "5f": 6, "system": [1, 6, 14], "start": 6, "hyperparam": 6, "0001": 14, "got": [6, 14], "24987": [], "48135": [], "001": [6, 12, 14], "24866": [], "100000": [], "43676": [], "finish": [1, 6], "param": [6, 11, 12, 14], "19982": [], "develop": [6, 9], "1010": [], "5005": [], "54it": [], "20342": [], "altern": [1, 6], "computation": 6, "costli": 6, "try": 6, "theoret": 6, "suboptim": 6, "opt": 6, "gridsearchcv": [6, 14], "10000": 11, "5379": [], "55it": [], "41734": [], "wors": [7, 11], "larg": 11, "between": [7, 9, 11, 12, 14], "modal": [], "turn": [], "better": [], "nonetheless": [], "happen": 7, "basic": [7, 14], "help": [1, 7, 14], "analys": [7, 9], "outcom": 7, "main": [3, 7, 8], "method_nam": [7, 11], "name": [5, 7, 11, 12, 13, 14], "shape": [7, 11, 12, 13, 14], "correspond": [1, 7, 13], "matrix": [7, 11, 14], "appear": 7, "occur": [7, 13], "merg": 7, "emq": [7, 14], "55": 7, "showcas": 7, "wide": [1, 7, 8], "variant": [7, 9, 11, 14], "linear": [7, 11, 14], "review": [7, 9, 13], "step": [7, 11], "05": [7, 11, 14], "gen_data": 
7, "base_classifi": 7, "yield": [7, 8, 11, 13, 14], "tr_prev": [7, 11, 14], "append": 7, "__class__": [], "__name__": [], "insight": 7, "view": 7, "y": [7, 11, 12, 13, 14], "axi": [7, 11], "against": [6, 7], "x": [1, 5, 7, 11, 12, 13, 14], "unfortun": 7, "limit": [7, 8, 11, 14], "binary_diagon": [7, 11], "train_prev": [7, 11], "savepath": [7, 11], "bin_diag": 7, "png": 7, "save": [7, 11], "pdf": [7, 14], "cyan": 7, "dot": [7, 11], "color": [7, 11], "band": [7, 11], "hidden": [7, 12, 14], "show_std": [7, 11], "unadjust": 7, "bias": 7, "toward": [7, 13], "seen": [7, 11, 14], "evinc": 7, "box": [7, 11], "binary_bias_glob": [7, 11], "bin_bia": 7, "unbias": 7, "center": 7, "tend": 7, "overestim": 7, "high": [7, 11], "lower": [7, 14], "again": [7, 11], "accordingli": 7, "20": [1, 7, 11, 14], "90": [7, 11], "rewrit": 7, "method_data": 7, "training_preval": 7, "linspac": 7, "training_s": 7, "suffic": 7, "latex": [], "syntax": [], "_": [7, 9, 11, 13], "now": [5, 7, 8], "clearli": 7, "binary_bias_bin": [7, 11], "broken": [7, 11], "down": [1, 7, 8, 11, 13], "bin": [6, 7, 11, 14], "To": [7, 13], "nbin": [6, 7, 11, 14], "isometr": [7, 11], "subinterv": 7, "interestingli": 7, "enough": 7, "seemingli": 7, "tendenc": 7, "low": [6, 7, 11, 12], "underestim": 7, "beyond": 7, "67": [7, 11], "curios": 7, "pretti": 7, "discuss": 7, "analyz": 7, "compar": [7, 11], "both": [7, 13], "irrespect": [1, 7, 14], "harder": 7, "interpret": [7, 9, 14], "error_by_drift": [7, 11], "error_nam": [7, 11], "n_bin": [7, 11, 14], "err_drift": 7, "whenev": [7, 11], "clear": 7, "lowest": 7, "difficult": 7, "rememb": 7, "solid": 7, "comparison": [7, 8], "detriment": 7, "visual": [1, 7, 9], "hide": 7, "framework": [5, 9, 14], "written": 9, "root": 9, "concept": [3, 9], "baselin": 9, "integr": 9, "commonli": [8, 9], "facilit": 9, "twitter": [9, 13], "true_preval": 9, "hold": [9, 11, 14], "endeavour": [9, 11], "popular": [8, 9], "expect": [8, 9, 14], "maxim": [9, 14], "hdy": [9, 14], "versatil": 9, "etc": [5, 9], "uci": [9, 13], "nativ": 9, "loss": [9, 12, 14], "perf": [2, 9, 12, 14], "ad": [8, 9], "meta": [9, 11], "plot": [3, 9, 10], "diagon": [9, 11], "bia": [5, 9, 11, 12, 14], "drift": 9, "api": [5, 9], "subpackag": 10, "submodul": 10, "util": [10, 12, 13], "content": 10, "bctscalibr": 12, "nbvscalibr": 12, "recalibratedprobabilisticclassifi": 12, "recalibratedprobabilisticclassifierbas": 12, "classes_": [12, 13, 14], "fit_cv": 12, "fit_tr_val": 12, "tscalibr": 12, "vscalibr": 12, "lowranklogisticregress": 12, "document_embed": 12, "lstmnet": 12, "reset_net_param": 12, "textclassifiernet": 12, "dimens": [11, 12, 13, 14], "forward": [12, 14], "xavier_uniform": 12, "torchdataset": 12, "asdataload": 12, "decision_funct": 12, "splitstratifi": 13, "stat": 13, "train_test": [5, 6, 7, 8, 13], "xp": 13, "xy": 13, "split_random": 13, "split_stratifi": [6, 8, 13], "uniform_sampl": 13, "uniform_sampling_index": 13, "fetch_lequa2022": [0, 13], "warn": [11, 13], "indextransform": 13, "add_word": 13, "fit_transform": 13, "reader": 11, "binar": [11, 13], "from_csv": 13, "from_spars": 13, "from_text": 13, "reindex_label": 13, "getptecondestim": 14, "solve_adjust": 14, "adjustedclassifyandcount": 14, "distributionmatch": [5, 6, 14], "dy": [5, 14], "em": 14, "max_it": 14, "explicitlossminimis": [], "max": [5, 14], "ms2": [5, 14], "mediansweep": 14, "mediansweep2": 14, "probabilisticadjustedclassifyandcount": 14, "probabilisticclassifyandcount": 14, "smm": [5, 14], "t50": [5, 14], "thresholdoptim": 14, "cross_generate_predict": 14, 
"cross_generate_predictions_depr": [], "binaryquantifi": 14, "onevsallgener": [5, 14], "eacc": 14, "ecc": 14, "eemq": 14, "ehdi": 14, "epacc": 14, "valid_polici": 14, "ensemblefactori": 14, "get_probability_distribut": 14, "quanetmodul": 14, "quanettrain": 14, "clean_checkpoint": 14, "clean_checkpoint_dir": 14, "mae_loss": 14, "non_aggreg": 11, "maximumlikelihoodprevalenceestim": 14, "absolute_error": 11, "hat": 11, "frac": 11, "mathcal": 11, "sum_": 11, "acc_error": 11, "y_true": 11, "y_pred": 11, "tp": 11, "tn": 11, "fp": 11, "fn": 11, "stand": [5, 11, 14], "f1_error": 11, "macro": 11, "f_1": 11, "harmon": 11, "recal": 11, "2tp": 11, "independ": [11, 14], "err_nam": 11, "p_hat": [], "d_": 11, "kl": 11, "log": [11, 13], "factor": 11, "beforehand": 11, "n_sampl": [11, 12], "mean_absolute_error": 11, "mean_relative_absolute_error": 11, "relative_absolute_error": 11, "underlin": 11, "displaystyl": 11, "abstractprotocol": [8, 11], "union": [11, 13, 14], "aggr_speedup": [1, 11], "auto": [1, 11], "evaluation_report": [1, 11], "app": [6, 7, 8, 11, 14], "repeat": [7, 8, 11], "smooth_limits_epsilon": 11, "random_st": [7, 8, 11, 13], "return_typ": [8, 11], "sample_prev": [8, 11], "abstractstochasticseededprotocol": [8, 11], "onlabelledcollectionprotocol": [1, 8, 11], "95": 11, "copi": [11, 13], "quantiti": 11, "labelled_collect": [8, 11], "prevalence_grid": 11, "exhaust": 11, "sum": [11, 14], "implicit": 11, "return_constrained_dim": 11, "rest": [11, 12, 13], "quit": 11, "obvious": 11, "determinist": 11, "anywher": 11, "multipli": 11, "necessari": 11, "samples_paramet": 11, "total": [8, 11], "parent": 11, "sequenc": [8, 11], "enforc": 11, "collat": 11, "arg": [11, 13], "domainmix": 11, "domaina": 11, "domainb": 11, "mixture_point": 11, "domain": 11, "scale": [5, 11, 12, 14], "npp": [8, 11], "draw": 11, "uniformli": [8, 11], "therefor": 11, "get_col": 11, "get_labelled_collect": 11, "on_preclassified_inst": 11, "pre_classif": 11, "in_plac": 11, "usimplexpp": [], "kraemer": [8, 11], "algorithm": [8, 11, 14], "sens": 11, "guarante": [8, 11, 13], "prefer": [1, 8, 11], "intract": 11, "hellingerdist": 11, "hellingh": 11, "distanc": [11, 14], "hd": [5, 11, 14], "discret": [11, 14], "sqrt": 11, "p_i": 11, "q_i": 11, "real": [11, 12, 13, 14], "topsoedist": 11, "1e": [11, 12, 14], "topso": [11, 14], "adjusted_quantif": 11, "prevalence_estim": 11, "tpr": [11, 14], "fpr": [11, 14], "clip": 11, "exce": 11, "check_prevalence_vector": 11, "raise_except": 11, "toleranz": 11, "08": 11, "combinations_budget": 11, "largest": 11, "dimension": [11, 12, 13, 14], "repetit": 11, "less": [11, 13], "normalize_preval": 11, "l1": [11, 14], "calcul": 11, "binom": 11, "mass": 11, "alloc": [11, 12], "solut": 11, "star": 11, "bar": 11, "prevalence_from_label": 11, "n_instanc": [11, 12, 14], "correctli": 11, "even": 11, "len": 11, "prevalence_from_prob": 11, "bool": [11, 12, 14], "argmax": 11, "prevalence_linspac": 11, "01": [6, 11, 12, 14], "separ": [11, 13], "99": 11, "uniform_prevalence_sampl": 11, "adapt": [11, 12], "post": 11, "http": [11, 13, 14], "stackexchang": 11, "com": 11, "question": 11, "3227": 11, "uniform": [9, 11, 13], "uniform_simplex_sampl": 11, "dict": [11, 13, 14], "timeout": 11, "dictionari": [11, 12, 13, 14], "kei": [11, 13], "quantification_error": 11, "whether": [11, 12, 13, 14], "ignor": [11, 13, 14], "gen": 11, "establish": 11, "timer": 11, "longer": [11, 14], "timeouterror": 11, "bound": [11, 14], "stdout": 11, "best_model": 11, "after": [11, 14], "minim": [11, 14], "routin": [11, 13, 14], "unus": [11, 
12], "contanin": 11, "cross_val_predict": 11, "akin": [11, 14], "issu": 11, "reproduc": [11, 13], "pos_class": [11, 13], "titl": 11, "colormap": 11, "listedcolormap": 11, "vertical_xtick": 11, "legend": 11, "local": 11, "sign": 11, "minu": 11, "classs": 11, "compon": [11, 12, 14], "cm": 11, "tab10": 11, "secondari": 11, "global": 11, "method_ord": 11, "henc": [11, 13], "conveni": [1, 5, 8, 11], "multiclass": [0, 5, 8, 11, 13, 14], "inconveni": 11, "leyend": 11, "hightlight": 11, "associ": 11, "brokenbar_supremacy_by_drift": 11, "isomer": 11, "x_error": 11, "y_error": 11, "ttest_alpha": 11, "005": 11, "tail_density_threshold": 11, "region": 11, "chart": 11, "condit": [8, 11, 14], "ii": 11, "significantli": 11, "side": 11, "confid": 11, "percentil": 11, "divid": 11, "amount": [1, 8, 11], "similar": [11, 14], "threshold": [11, 14], "densiti": 11, "tail": 11, "discard": 11, "outlier": 11, "show_dens": 11, "show_legend": 11, "logscal": 11, "vline": 11, "especi": 11, "mai": 11, "cumberson": 11, "gain": 11, "understand": 11, "fare": 11, "regim": 11, "highlight": 11, "vertic": 11, "earlystop": 11, "patienc": [11, 12, 14], "lower_is_bett": 11, "earli": [11, 12, 14], "stop": [11, 12, 14], "epoch": [11, 12, 14], "best_epoch": 11, "best_scor": 11, "consecut": [11, 12, 14], "monitor": 11, "obtaind": 11, "far": [11, 12, 13], "flag": 11, "keep": [11, 13], "track": 11, "boolean": [11, 13, 14], "create_if_not_exist": 11, "makedir": 11, "exist_ok": 11, "join": [11, 13], "dir": [11, 14], "subdir": 11, "anotherdir": 11, "create_parent_dir": 11, "exist": [8, 11], "txt": 11, "download_fil": 11, "url": 11, "archive_filenam": 11, "destin": 11, "filenam": 11, "download_file_if_not_exist": 11, "dowload": 11, "get_quapy_hom": 11, "home": [11, 13], "perman": 11, "map_parallel": 11, "func": 11, "slice": 11, "item": 11, "wrapper": [11, 12, 13, 14], "multiprocess": [11, 14], "delai": 11, "args_i": 11, "silent": [11, 14], "child": 11, "ensur": 11, "pickled_resourc": 11, "pickle_path": 11, "generation_func": 11, "fast": [0, 11, 13], "resourc": 11, "some_arrai": 11, "mock": [11, 12], "rand": 11, "my_arrai": 11, "pkl": 11, "save_text_fil": 11, "disk": 11, "miss": 11, "temp_se": 11, "context": 11, "tempor": [11, 12], "outer": 11, "state": 11, "within": [11, 14], "get_njob": [], "correct": [5, 12, 14], "temperatur": [5, 12, 14], "bct": [12, 14], "abstent": 12, "alexandari": [5, 12, 14], "afterward": [12, 14], "No": [12, 14], "nbv": [12, 14], "baseestim": [5, 12, 14], "calibratorfactori": 12, "n_compon": 12, "kwarg": [12, 13, 14], "decomposit": 12, "truncatedsvd": 12, "princip": 12, "regress": 12, "n_featur": 12, "length": [12, 13], "eventu": [12, 13], "unalt": 12, "emb": 12, "embedding_s": 12, "hidden_s": 12, "repr_siz": 12, "kernel_height": 12, "stride": 12, "pad": [12, 13], "drop_p": 12, "convolut": 12, "vocabulari": [12, 13], "kernel": 12, "drop": 12, "dropout": [12, 14], "batch": 12, "dataload": 12, "tensor": 12, "n_dimens": 12, "lstm_class_nlay": 12, "short": 12, "memori": 12, "net": 12, "weight_decai": 12, "batch_siz": 12, "64": [6, 12, 14], "batch_size_test": 12, "512": [12, 14], "padding_length": 12, "checkpointpath": 12, "checkpoint": [12, 14], "classifier_net": 12, "weight": [12, 13], "decai": 12, "wait": 12, "enabl": 12, "gpu": [12, 14], "vocab_s": 12, "reiniti": 12, "trainer": 12, "disjoint": 12, "embed_s": 12, "nn": 12, "pad_length": 12, "xavier": 12, "shuffl": [12, 13], "longest": 12, "shorter": 12, "svmperf_bas": [12, 14], "classifiermixin": 12, "thorsten": 12, "refer": [0, 12, 13], "svm_perf_learn": 12, 
"svm_perf_classifi": 12, "trade": [12, 14], "off": [12, 14], "margin": [12, 14], "std": 12, "qacc": 12, "qf1": 12, "qgm": 12, "12": 12, "26": 12, "23": [5, 12], "train_siz": 13, "conform": 13, "round": 13, "loader_kwarg": 13, "read": 13, "tupl": [8, 11, 13, 14], "tr": 13, "te": 13, "csr": 13, "csr_matrix": 13, "4403": 13, "my_collect": 13, "codefram": 13, "larger": [11, 13, 14], "actual": [13, 14], "empti": 13, "met": 13, "whose": [13, 14], "train_prop": [6, 8, 13], "left": [11, 13], "stratif": 13, "greater": 13, "dataset_nam": 13, "data_hom": 13, "test_split": 13, "predefin": 13, "uci_dataset": 13, "dump": 13, "leav": 13, "quay_data": 13, "ml": 13, "5fcvx2": 13, "x2": 13, "offici": 13, "lequa": [8, 9, 13], "competit": [0, 9, 13], "t1a": [0, 13], "t1b": [0, 13], "t2a": [0, 13], "t2b": [0, 13], "raw": [0, 11, 13], "merchandis": [0, 13], "sperduti": [0, 13], "2022": [0, 8, 13], "overview": [0, 13], "clef": [0, 13], "lequa2022_experi": [0, 13], "py": [0, 5, 8, 13], "guid": 13, "val_gen": 13, "test_gen": 13, "samplesfromdir": 13, "minimun": 13, "kept": 13, "subsequ": 13, "mining6": 13, "devel": 13, "style": 13, "countvector": 13, "keyword": [13, 14], "nogap": 13, "regardless": 13, "codifi": 13, "unknown": 13, "surfac": 13, "assert": 13, "gap": 13, "preced": 13, "decid": [8, 11, 13], "uniqu": 13, "rare": 13, "unk": 13, "minimum": [13, 14], "occurr": 13, "org": [13, 14], "stabl": 13, "feature_extract": 13, "html": 13, "subtyp": 13, "spmatrix": 13, "remov": [13, 14], "infrequ": 13, "aka": [13, 14], "sublinear_tf": 13, "scall": 13, "counter": 13, "tfidfvector": 13, "whcih": 13, "had": 13, "encod": 13, "utf": 13, "csv": 13, "feat1": 13, "feat2": 13, "featn": 13, "covari": 13, "express": 13, "row": [1, 13], "class2int": 13, "collet": 13, "fomart": 13, "progress": 13, "sentenc": 13, "classnam": 13, "u1": 13, "misclassif": 14, "n_classes_": [], "fit_classifi": 14, "bypass": 14, "y_": 14, "ptecondestim": 14, "prevs_estim": 14, "ax": 14, "entri": [0, 1, 14], "y_i": 14, "y_j": 14, "_posterior_probabilities_": 14, "attribut": 14, "subclass": 14, "give": [8, 14], "outsid": 14, "unless": 14, "noth": 14, "els": 14, "cdf": [5, 14], "match": [5, 14], "helling": 14, "sought": 14, "channel": 14, "proper": 14, "ch": 14, "di": 14, "dij": 14, "fraction": 14, "th": 14, "tol": 14, "ternari": 14, "dl": 14, "doi": 14, "1145": 14, "3219819": 14, "3220059": 14, "histogram": 14, "toler": 14, "explicit": 14, "exact_train_prev": [5, 14], "recalib": [5, 14], "updat": 14, "likelihood": [12, 14], "mutual": 14, "recurs": 14, "until": 14, "converg": 14, "suggest": [5, 14], "recalibr": 14, "reach": 14, "loop": 14, "cumul": 14, "unlabel": 14, "latter": 14, "forman": [5, 8, 14], "2006": [5, 14], "2008": [5, 14], "goal": 14, "bring": 14, "denomin": 14, "median": [5, 14], "sweep": [5, 14], "binary_quantifi": 14, "prevel": 14, "emploi": 14, "resp": 14, "subobject": 14, "nest": 14, "pipelin": 14, "__": 14, "simplif": 14, "2021": [5, 6, 14], "equival": 14, "cosest": 14, "heurist": [1, 14], "choos": [5, 14], "ground": 14, "complement": 14, "param_mod_sel": 14, "param_model_sel": 14, "min_po": 14, "max_sample_s": 14, "closest": 14, "preliminari": 14, "recomput": 14, "compat": [12, 14], "l": 14, "base_quantifier_class": 14, "factori": 14, "common": 14, "doc_embedding_s": 14, "stats_siz": 14, "lstm_hidden_s": 14, "lstm_nlayer": 14, "ff_layer": 14, "1024": 14, "bidirect": 14, "qdrop_p": 14, "order_bi": 14, "cell": 14, "connect": 14, "ff": 14, "sort": 14, "doc_embed": 14, "doc_posterior": 14, "recip": 14, "care": 14, "regist": 14, "hook": 14, 
"n_epoch": 14, "tr_iter_per_poch": 14, "va_iter_per_poch": 14, "checkpointdir": 14, "checkpointnam": 14, "phase": 14, "anyth": 14, "truth": 14, "mlpe": 14, "lazi": 14, "put": 14, "assumpion": 14, "beat": [12, 14], "estimant": 14, "kundaj": 12, "shrikumar": 12, "novemb": 12, "232": 12, "pmlr": 12, "outpu": [], "partit": 12, "ight": [], "valueerror": 11, "attach": 13, "mix": [], "onevsallaggreg": [5, 14], "parallel_backend": 14, "loki": 14, "backend": 14, "cannot": 14, "temp": 14, "getonevsal": 5, "realiz": 11, "prepar": 11, "act": 11, "modif": 11, "place": [11, 13], "host_fold": 12, "tmp": 12, "delet": 12, "newelm": 14, "underli": [5, 6, 14], "newsvma": [5, 14], "newsvmkld": [5, 14], "newsvmq": [5, 14], "newsvmra": [5, 14], "newonevsal": 14, "onlabelledcollect": [], "forc": [1, 11], "deactiv": [1, 11], "evaluate_on_sampl": 11, "central": 11, "endow": 11, "never": [8, 11], "behaviour": [1, 5, 8, 11], "undertaken": 11, "artificialprevalenceprotocol": 11, "iterateprotocol": 11, "previous": 11, "naturalprevalenceprotocol": 11, "upp": [9, 11], "uniformprevalenceprotocol": 11, "n_train": 13, "n_test": 13, "quick": 13, "omit": 1, "procotol": 1, "vari": [1, 8], "u": 1, "prot": 1, "our": [1, 8], "evaluatio": 1, "4f": [1, 8], "often": 1, "account": 1, "rise": [1, 8], "straightforward": 1, "308": 1, "692": 1, "314": 1, "686": 1, "005649": 1, "013182": 1, "000074": 1, "896": 1, "013145": 1, "069323": 1, "000985": 1, "848": 1, "152": 1, "809": 1, "191": 1, "039063": 1, "149806": 1, "005175": 1, "016": 1, "984": 1, "033": 1, "967": 1, "017236": 1, "487529": 1, "005298": 1, "728": 1, "751": 1, "249": 1, "022769": 1, "057146": 1, "001350": 1, "4995": 1, "72": 1, "698": 1, "302": 1, "021752": 1, "053631": 1, "001133": 1, "4996": 1, "868": 1, "132": 1, "888": 1, "112": 1, "020490": 1, "088230": 1, "001985": 1, "4997": 1, "292": 1, "708": 1, "298": 1, "702": 1, "006149": 1, "014788": 1, "000090": 1, "4998": 1, "76": 1, "220": 1, "780": 1, "019950": 1, "054309": 1, "001127": 1, "4999": 1, "948": 1, "052": 1, "965": 1, "016941": 1, "165776": 1, "003538": 1, "023588": 1, "108779": 1, "003631": 1, "exit": 1, "smaller": 1, "1m": 1, "convers": 1, "precomput": 1, "execut": 1, "lot": 1, "welcom": 3, "behind": 3, "simplifi": 5, "remain": 5, "unchang": 5, "v0": [5, 8, 9], "construct": 5, "depart": 5, "approach": [5, 8], "firat": 5, "mutliclasshdi": 5, "maletzk": 5, "hassan": 5, "thank": 5, "pablo": 5, "contribut": 5, "newsvmnkld": 5, "experiment": 5, "plo": 5, "ONE": 5, "There": 5, "explicit_loss_minim": 5, "one_vs_al": 5, "robustli": 8, "presenc": 8, "confront": [6, 8], "stochast": 8, "fair": 8, "radom_st": 8, "technic": 8, "explan": 8, "custom_protocol": 8, "subject": 8, "2005": 8, "usag": 8, "classifier__c": [6, 8], "equial": 8, "val_app": 8, "increas": 8, "rapidli": 8, "becom": 8, "impract": 8, "legitim": 8, "drawback": 8, "elect": 8, "yet": 8, "burden": 8, "incur": 8, "idea": 8, "deprec": 8, "due": 8, "capabl": 8, "md": 2, "_new": 9, "adher": 11, "ecir": 6, "retriev": 6, "91": 6, "devot": 6, "mark": 6, "prefix": 6, "classifier__": 6, "16": 6, "mae_scor": 6, "04021": 6, "took": 6, "1356": 6, "04286": 6, "2139": 6, "04888": 6, "2491": 6, "05163": 6, "5372": 6, "02445": 6, "9056": 6, "02234": 6, "3114": 6, "03102": 6, "conceptu": 6, "flaw": 6, "hand": 6, "surrog": 6, "train_sampl": 7, "tackl": 0, "val_gener": 0, "test_gener": 0, "doc": 0, "250": 0, "20000": 0, "math": 11, "2frac": 11, "sanity_check": 11, "skip": 11}, "objects": {"": [[11, 0, 0, "-", "quapy"]], "quapy": [[12, 0, 0, "-", "classification"], [13, 0, 0, 
"-", "data"], [11, 0, 0, "-", "error"], [11, 0, 0, "-", "evaluation"], [11, 0, 0, "-", "functional"], [14, 0, 0, "-", "method"], [11, 0, 0, "-", "model_selection"], [11, 0, 0, "-", "plot"], [11, 0, 0, "-", "protocol"], [11, 0, 0, "-", "util"]], "quapy.classification": [[12, 0, 0, "-", "calibration"], [12, 0, 0, "-", "methods"], [12, 0, 0, "-", "neural"], [12, 0, 0, "-", "svmperf"]], "quapy.classification.calibration": [[12, 1, 1, "", "BCTSCalibration"], [12, 1, 1, "", "NBVSCalibration"], [12, 1, 1, "", "RecalibratedProbabilisticClassifier"], [12, 1, 1, "", "RecalibratedProbabilisticClassifierBase"], [12, 1, 1, "", "TSCalibration"], [12, 1, 1, "", "VSCalibration"]], "quapy.classification.calibration.RecalibratedProbabilisticClassifierBase": [[12, 2, 1, "", "classes_"], [12, 3, 1, "", "fit"], [12, 3, 1, "", "fit_cv"], [12, 3, 1, "", "fit_tr_val"], [12, 3, 1, "", "predict"], [12, 3, 1, "", "predict_proba"]], "quapy.classification.methods": [[12, 1, 1, "", "LowRankLogisticRegression"]], "quapy.classification.methods.LowRankLogisticRegression": [[12, 3, 1, "", "fit"], [12, 3, 1, "", "get_params"], [12, 3, 1, "", "predict"], [12, 3, 1, "", "predict_proba"], [12, 3, 1, "", "set_params"], [12, 3, 1, "", "transform"]], "quapy.classification.neural": [[12, 1, 1, "", "CNNnet"], [12, 1, 1, "", "LSTMnet"], [12, 1, 1, "", "NeuralClassifierTrainer"], [12, 1, 1, "", "TextClassifierNet"], [12, 1, 1, "", "TorchDataset"]], "quapy.classification.neural.CNNnet": [[12, 3, 1, "", "document_embedding"], [12, 3, 1, "", "get_params"], [12, 4, 1, "", "training"], [12, 2, 1, "", "vocabulary_size"]], "quapy.classification.neural.LSTMnet": [[12, 3, 1, "", "document_embedding"], [12, 3, 1, "", "get_params"], [12, 4, 1, "", "training"], [12, 2, 1, "", "vocabulary_size"]], "quapy.classification.neural.NeuralClassifierTrainer": [[12, 2, 1, "", "device"], [12, 3, 1, "", "fit"], [12, 3, 1, "", "get_params"], [12, 3, 1, "", "predict"], [12, 3, 1, "", "predict_proba"], [12, 3, 1, "", "reset_net_params"], [12, 3, 1, "", "set_params"], [12, 3, 1, "", "transform"]], "quapy.classification.neural.TextClassifierNet": [[12, 3, 1, "", "dimensions"], [12, 3, 1, "", "document_embedding"], [12, 3, 1, "", "forward"], [12, 3, 1, "", "get_params"], [12, 3, 1, "", "predict_proba"], [12, 4, 1, "", "training"], [12, 2, 1, "", "vocabulary_size"], [12, 3, 1, "", "xavier_uniform"]], "quapy.classification.neural.TorchDataset": [[12, 3, 1, "", "asDataloader"]], "quapy.classification.svmperf": [[12, 1, 1, "", "SVMperf"]], "quapy.classification.svmperf.SVMperf": [[12, 3, 1, "", "decision_function"], [12, 3, 1, "", "fit"], [12, 3, 1, "", "predict"], [12, 4, 1, "", "valid_losses"]], "quapy.data": [[13, 0, 0, "-", "base"], [13, 0, 0, "-", "datasets"], [13, 0, 0, "-", "preprocessing"], [13, 0, 0, "-", "reader"]], "quapy.data.base": [[13, 1, 1, "", "Dataset"], [13, 1, 1, "", "LabelledCollection"]], "quapy.data.base.Dataset": [[13, 3, 1, "", "SplitStratified"], [13, 2, 1, "", "binary"], [13, 2, 1, "", "classes_"], [13, 3, 1, "", "kFCV"], [13, 3, 1, "", "load"], [13, 2, 1, "", "n_classes"], [13, 3, 1, "", "reduce"], [13, 3, 1, "", "stats"], [13, 2, 1, "", "train_test"], [13, 2, 1, "", "vocabulary_size"]], "quapy.data.base.LabelledCollection": [[13, 2, 1, "", "X"], [13, 2, 1, "", "Xp"], [13, 2, 1, "", "Xy"], [13, 2, 1, "", "binary"], [13, 3, 1, "", "counts"], [13, 3, 1, "", "join"], [13, 3, 1, "", "kFCV"], [13, 3, 1, "", "load"], [13, 2, 1, "", "n_classes"], [13, 2, 1, "", "p"], [13, 3, 1, "", "prevalence"], [13, 3, 1, "", "sampling"], [13, 3, 1, "", 
"sampling_from_index"], [13, 3, 1, "", "sampling_index"], [13, 3, 1, "", "split_random"], [13, 3, 1, "", "split_stratified"], [13, 3, 1, "", "stats"], [13, 3, 1, "", "uniform_sampling"], [13, 3, 1, "", "uniform_sampling_index"], [13, 2, 1, "", "y"]], "quapy.data.datasets": [[13, 5, 1, "", "fetch_UCIDataset"], [13, 5, 1, "", "fetch_UCILabelledCollection"], [13, 5, 1, "", "fetch_lequa2022"], [13, 5, 1, "", "fetch_reviews"], [13, 5, 1, "", "fetch_twitter"], [13, 5, 1, "", "warn"]], "quapy.data.preprocessing": [[13, 1, 1, "", "IndexTransformer"], [13, 5, 1, "", "index"], [13, 5, 1, "", "reduce_columns"], [13, 5, 1, "", "standardize"], [13, 5, 1, "", "text2tfidf"]], "quapy.data.preprocessing.IndexTransformer": [[13, 3, 1, "", "add_word"], [13, 3, 1, "", "fit"], [13, 3, 1, "", "fit_transform"], [13, 3, 1, "", "transform"], [13, 3, 1, "", "vocabulary_size"]], "quapy.data.reader": [[13, 5, 1, "", "binarize"], [13, 5, 1, "", "from_csv"], [13, 5, 1, "", "from_sparse"], [13, 5, 1, "", "from_text"], [13, 5, 1, "", "reindex_labels"]], "quapy.error": [[11, 5, 1, "", "absolute_error"], [11, 5, 1, "", "acc_error"], [11, 5, 1, "", "acce"], [11, 5, 1, "", "ae"], [11, 5, 1, "", "f1_error"], [11, 5, 1, "", "f1e"], [11, 5, 1, "", "from_name"], [11, 5, 1, "", "kld"], [11, 5, 1, "", "mae"], [11, 5, 1, "", "mean_absolute_error"], [11, 5, 1, "", "mean_relative_absolute_error"], [11, 5, 1, "", "mkld"], [11, 5, 1, "", "mnkld"], [11, 5, 1, "", "mrae"], [11, 5, 1, "", "mse"], [11, 5, 1, "", "nkld"], [11, 5, 1, "", "rae"], [11, 5, 1, "", "relative_absolute_error"], [11, 5, 1, "", "se"], [11, 5, 1, "", "smooth"]], "quapy.evaluation": [[11, 5, 1, "", "evaluate"], [11, 5, 1, "", "evaluate_on_samples"], [11, 5, 1, "", "evaluation_report"], [11, 5, 1, "", "prediction"]], "quapy.functional": [[11, 5, 1, "", "HellingerDistance"], [11, 5, 1, "", "TopsoeDistance"], [11, 5, 1, "", "adjusted_quantification"], [11, 5, 1, "", "check_prevalence_vector"], [11, 5, 1, "", "get_nprevpoints_approximation"], [11, 5, 1, "", "normalize_prevalence"], [11, 5, 1, "", "num_prevalence_combinations"], [11, 5, 1, "", "prevalence_from_labels"], [11, 5, 1, "", "prevalence_from_probabilities"], [11, 5, 1, "", "prevalence_linspace"], [11, 5, 1, "", "strprev"], [11, 5, 1, "", "uniform_prevalence_sampling"], [11, 5, 1, "", "uniform_simplex_sampling"]], "quapy.method": [[14, 0, 0, "-", "aggregative"], [14, 0, 0, "-", "base"], [14, 0, 0, "-", "meta"], [14, 0, 0, "-", "neural"], [14, 0, 0, "-", "non_aggregative"]], "quapy.method.aggregative": [[14, 1, 1, "", "ACC"], [14, 4, 1, "", "AdjustedClassifyAndCount"], [14, 1, 1, "", "AggregativeProbabilisticQuantifier"], [14, 1, 1, "", "AggregativeQuantifier"], [14, 1, 1, "", "CC"], [14, 4, 1, "", "ClassifyAndCount"], [14, 1, 1, "", "DistributionMatching"], [14, 1, 1, "", "DyS"], [14, 1, 1, "", "EMQ"], [14, 4, 1, "", "ExpectationMaximizationQuantifier"], [14, 1, 1, "", "HDy"], [14, 4, 1, "", "HellingerDistanceY"], [14, 1, 1, "", "MAX"], [14, 1, 1, "", "MS"], [14, 1, 1, "", "MS2"], [14, 4, 1, "", "MedianSweep"], [14, 4, 1, "", "MedianSweep2"], [14, 1, 1, "", "OneVsAllAggregative"], [14, 1, 1, "", "PACC"], [14, 1, 1, "", "PCC"], [14, 4, 1, "", "ProbabilisticAdjustedClassifyAndCount"], [14, 4, 1, "", "ProbabilisticClassifyAndCount"], [14, 4, 1, "", "SLD"], [14, 1, 1, "", "SMM"], [14, 1, 1, "", "T50"], [14, 1, 1, "", "ThresholdOptimization"], [14, 1, 1, "", "X"], [14, 5, 1, "", "cross_generate_predictions"], [14, 5, 1, "", "newELM"], [14, 5, 1, "", "newSVMAE"], [14, 5, 1, "", "newSVMKLD"], [14, 5, 1, "", "newSVMQ"], 
[14, 5, 1, "", "newSVMRAE"]], "quapy.method.aggregative.ACC": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "classify"], [14, 3, 1, "", "fit"], [14, 3, 1, "", "getPteCondEstim"], [14, 3, 1, "", "solve_adjustment"]], "quapy.method.aggregative.AggregativeProbabilisticQuantifier": [[14, 3, 1, "", "classify"]], "quapy.method.aggregative.AggregativeQuantifier": [[14, 3, 1, "", "aggregate"], [14, 2, 1, "", "classes_"], [14, 2, 1, "", "classifier"], [14, 3, 1, "", "classify"], [14, 3, 1, "", "fit"], [14, 3, 1, "", "quantify"]], "quapy.method.aggregative.CC": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "fit"]], "quapy.method.aggregative.DistributionMatching": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "fit"]], "quapy.method.aggregative.DyS": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "fit"]], "quapy.method.aggregative.EMQ": [[14, 3, 1, "", "EM"], [14, 4, 1, "", "EPSILON"], [14, 4, 1, "", "MAX_ITER"], [14, 3, 1, "", "aggregate"], [14, 3, 1, "", "fit"], [14, 3, 1, "", "predict_proba"]], "quapy.method.aggregative.HDy": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "fit"]], "quapy.method.aggregative.OneVsAllAggregative": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "classify"]], "quapy.method.aggregative.PACC": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "classify"], [14, 3, 1, "", "fit"], [14, 3, 1, "", "getPteCondEstim"]], "quapy.method.aggregative.PCC": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "fit"]], "quapy.method.aggregative.SMM": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "fit"]], "quapy.method.aggregative.ThresholdOptimization": [[14, 3, 1, "", "aggregate"], [14, 3, 1, "", "fit"]], "quapy.method.base": [[14, 1, 1, "", "BaseQuantifier"], [14, 1, 1, "", "BinaryQuantifier"], [14, 1, 1, "", "OneVsAll"], [14, 1, 1, "", "OneVsAllGeneric"], [14, 5, 1, "", "newOneVsAll"]], "quapy.method.base.BaseQuantifier": [[14, 3, 1, "", "fit"], [14, 3, 1, "", "quantify"]], "quapy.method.base.OneVsAllGeneric": [[14, 2, 1, "", "classes_"], [14, 3, 1, "", "fit"], [14, 3, 1, "", "quantify"]], "quapy.method.meta": [[14, 5, 1, "", "EACC"], [14, 5, 1, "", "ECC"], [14, 5, 1, "", "EEMQ"], [14, 5, 1, "", "EHDy"], [14, 5, 1, "", "EPACC"], [14, 1, 1, "", "Ensemble"], [14, 5, 1, "", "ensembleFactory"], [14, 5, 1, "", "get_probability_distribution"]], "quapy.method.meta.Ensemble": [[14, 4, 1, "", "VALID_POLICIES"], [14, 2, 1, "", "aggregative"], [14, 3, 1, "", "fit"], [14, 3, 1, "", "get_params"], [14, 2, 1, "", "probabilistic"], [14, 3, 1, "", "quantify"], [14, 3, 1, "", "set_params"]], "quapy.method.neural": [[14, 1, 1, "", "QuaNetModule"], [14, 1, 1, "", "QuaNetTrainer"], [14, 5, 1, "", "mae_loss"]], "quapy.method.neural.QuaNetModule": [[14, 2, 1, "", "device"], [14, 3, 1, "", "forward"], [14, 4, 1, "", "training"]], "quapy.method.neural.QuaNetTrainer": [[14, 2, 1, "", "classes_"], [14, 3, 1, "", "clean_checkpoint"], [14, 3, 1, "", "clean_checkpoint_dir"], [14, 3, 1, "", "fit"], [14, 3, 1, "", "get_params"], [14, 3, 1, "", "quantify"], [14, 3, 1, "", "set_params"]], "quapy.method.non_aggregative": [[14, 1, 1, "", "MaximumLikelihoodPrevalenceEstimation"]], "quapy.method.non_aggregative.MaximumLikelihoodPrevalenceEstimation": [[14, 3, 1, "", "fit"], [14, 3, 1, "", "quantify"]], "quapy.model_selection": [[11, 1, 1, "", "GridSearchQ"], [11, 5, 1, "", "cross_val_predict"]], "quapy.model_selection.GridSearchQ": [[11, 3, 1, "", "best_model"], [11, 3, 1, "", "fit"], [11, 3, 1, "", "get_params"], [11, 3, 1, "", "quantify"], [11, 3, 1, "", "set_params"]], "quapy.plot": [[11, 5, 1, "", "binary_bias_bins"], [11, 5, 1, "", 
"binary_bias_global"], [11, 5, 1, "", "binary_diagonal"], [11, 5, 1, "", "brokenbar_supremacy_by_drift"], [11, 5, 1, "", "error_by_drift"]], "quapy.protocol": [[11, 1, 1, "", "APP"], [11, 1, 1, "", "AbstractProtocol"], [11, 1, 1, "", "AbstractStochasticSeededProtocol"], [11, 4, 1, "", "ArtificialPrevalenceProtocol"], [11, 1, 1, "", "DomainMixer"], [11, 1, 1, "", "IterateProtocol"], [11, 1, 1, "", "NPP"], [11, 4, 1, "", "NaturalPrevalenceProtocol"], [11, 1, 1, "", "OnLabelledCollectionProtocol"], [11, 1, 1, "", "UPP"], [11, 4, 1, "", "UniformPrevalenceProtocol"]], "quapy.protocol.APP": [[11, 3, 1, "", "prevalence_grid"], [11, 3, 1, "", "sample"], [11, 3, 1, "", "samples_parameters"], [11, 3, 1, "", "total"]], "quapy.protocol.AbstractProtocol": [[11, 3, 1, "", "total"]], "quapy.protocol.AbstractStochasticSeededProtocol": [[11, 3, 1, "", "collator"], [11, 2, 1, "", "random_state"], [11, 3, 1, "", "sample"], [11, 3, 1, "", "samples_parameters"]], "quapy.protocol.DomainMixer": [[11, 3, 1, "", "sample"], [11, 3, 1, "", "samples_parameters"], [11, 3, 1, "", "total"]], "quapy.protocol.IterateProtocol": [[11, 3, 1, "", "total"]], "quapy.protocol.NPP": [[11, 3, 1, "", "sample"], [11, 3, 1, "", "samples_parameters"], [11, 3, 1, "", "total"]], "quapy.protocol.OnLabelledCollectionProtocol": [[11, 4, 1, "", "RETURN_TYPES"], [11, 3, 1, "", "get_collator"], [11, 3, 1, "", "get_labelled_collection"], [11, 3, 1, "", "on_preclassified_instances"]], "quapy.protocol.UPP": [[11, 3, 1, "", "sample"], [11, 3, 1, "", "samples_parameters"], [11, 3, 1, "", "total"]], "quapy.util": [[11, 1, 1, "", "EarlyStop"], [11, 5, 1, "", "create_if_not_exist"], [11, 5, 1, "", "create_parent_dir"], [11, 5, 1, "", "download_file"], [11, 5, 1, "", "download_file_if_not_exists"], [11, 5, 1, "", "get_quapy_home"], [11, 5, 1, "", "map_parallel"], [11, 5, 1, "", "parallel"], [11, 5, 1, "", "pickled_resource"], [11, 5, 1, "", "save_text_file"], [11, 5, 1, "", "temp_seed"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:property", "3": "py:method", "4": "py:attribute", "5": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "property", "Python property"], "3": ["py", "method", "Python method"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "function", "Python function"]}, "titleterms": {"dataset": [0, 13], "review": 0, "twitter": 0, "sentiment": 0, "uci": 0, "machin": 0, "learn": 0, "issu": 0, "ad": 0, "custom": 0, "data": [0, 13], "process": 0, "evalu": [1, 11], "error": [1, 7, 11], "measur": 1, "protocol": [1, 8, 11], "instal": 4, "requir": 4, "svm": 4, "perf": 4, "quantif": [4, 5, 6, 7], "orient": [4, 6], "loss": [2, 4, 5, 6], "method": [5, 12, 14], "aggreg": [5, 14], "The": 5, "classifi": 5, "count": 5, "variant": 5, "expect": 5, "maxim": 5, "emq": 5, "helling": 5, "distanc": 5, "y": 5, "hdy": 5, "explicit": [2, 5], "minim": [2, 5], "meta": [5, 14], "model": [5, 6], "ensembl": 5, "quanet": 5, "neural": [5, 12, 14], "network": 5, "select": 6, "target": 6, "classif": [6, 12], "plot": [7, 11], "diagon": 7, "bia": 7, "drift": 7, "welcom": 9, "quapi": [9, 10, 11, 12, 13, 14], "": 9, "document": 9, "introduct": 9, "A": 9, "quick": 9, "exampl": 9, "featur": 9, "content": [9, 11, 12, 13, 14], "indic": 9, "tabl": 9, "packag": [11, 12, 13, 14], "subpackag": 11, "submodul": [11, 12, 13, 14], "function": 11, "model_select": 11, "util": 11, "modul": [11, 12, 13, 14], "calibr": 12, "svmperf": 12, "base": [13, 14], "preprocess": 13, "reader": 13, 
"non_aggreg": 14, "threshold": 5, "optim": 5, "artifici": 8, "preval": 8, "sampl": 8, "from": 8, "unit": 8, "simplex": 8, "uniform": 8, "upp": 8, "natur": 8, "other": 8, "lequa": 0}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 57}, "alltitles": {"Installation": [[4, "installation"]], "Requirements": [[4, "requirements"]], "SVM-perf with quantification-oriented losses": [[4, "svm-perf-with-quantification-oriented-losses"]], "quapy": [[10, "quapy"]], "Welcome to QuaPy\u2019s documentation!": [[9, "welcome-to-quapy-s-documentation"]], "Introduction": [[9, "introduction"]], "A quick example:": [[9, "a-quick-example"]], "Features": [[9, "features"]], "Contents:": [[9, null]], "Indices and tables": [[9, "indices-and-tables"]], "Datasets": [[0, "datasets"]], "Reviews Datasets": [[0, "reviews-datasets"]], "Twitter Sentiment Datasets": [[0, "twitter-sentiment-datasets"]], "UCI Machine Learning": [[0, "uci-machine-learning"]], "Issues:": [[0, "issues"]], "LeQua Datasets": [[0, "lequa-datasets"]], "Adding Custom Datasets": [[0, "adding-custom-datasets"]], "Data Processing": [[0, "data-processing"]], "Evaluation": [[1, "evaluation"]], "Error Measures": [[1, "error-measures"]], "Evaluation Protocols": [[1, "evaluation-protocols"]], "Explicit Loss Minimization": [[2, "explicit-loss-minimization"], [5, "explicit-loss-minimization"]], "Quantification Methods": [[5, "quantification-methods"]], "Aggregative Methods": [[5, "aggregative-methods"]], "The Classify & Count variants": [[5, "the-classify-count-variants"]], "Expectation Maximization (EMQ)": [[5, "expectation-maximization-emq"]], "Hellinger Distance y (HDy)": [[5, "hellinger-distance-y-hdy"]], "Threshold Optimization methods": [[5, "threshold-optimization-methods"]], "Meta Models": [[5, "meta-models"]], "Ensembles": [[5, "ensembles"]], "The QuaNet neural network": [[5, "the-quanet-neural-network"]], "Model Selection": [[6, "model-selection"]], "Targeting a Quantification-oriented loss": [[6, "targeting-a-quantification-oriented-loss"]], "Targeting a Classification-oriented loss": [[6, "targeting-a-classification-oriented-loss"]], "Plotting": [[7, "plotting"]], "Diagonal Plot": [[7, "diagonal-plot"]], "Quantification bias": [[7, "quantification-bias"]], "Error by Drift": [[7, "error-by-drift"]], "Protocols": [[8, "protocols"]], "Artificial-Prevalence Protocol": [[8, "artificial-prevalence-protocol"]], "Sampling from the unit-simplex, the Uniform-Prevalence Protocol (UPP)": [[8, "sampling-from-the-unit-simplex-the-uniform-prevalence-protocol-upp"]], "Natural-Prevalence Protocol": [[8, "natural-prevalence-protocol"]], "Other protocols": [[8, "other-protocols"]], "Submodules": [[12, "submodules"], [13, "submodules"], [14, "submodules"], [11, "submodules"]], "Module contents": [[12, "module-quapy.classification"], [13, "module-quapy.data"], [14, "module-quapy.method"], [11, "module-quapy"]], "quapy.classification package": [[12, "quapy-classification-package"]], "quapy.classification.calibration": [[12, "quapy-classification-calibration"]], "quapy.classification.methods": [[12, "module-quapy.classification.methods"]], "quapy.classification.neural": [[12, "module-quapy.classification.neural"]], "quapy.classification.svmperf": [[12, "module-quapy.classification.svmperf"]], "quapy.data package": [[13, 
"quapy-data-package"]], "quapy.data.base": [[13, "module-quapy.data.base"]], "quapy.data.datasets": [[13, "module-quapy.data.datasets"]], "quapy.data.preprocessing": [[13, "module-quapy.data.preprocessing"]], "quapy.data.reader": [[13, "module-quapy.data.reader"]], "quapy.method package": [[14, "quapy-method-package"]], "quapy.method.aggregative": [[14, "module-quapy.method.aggregative"]], "quapy.method.base": [[14, "module-quapy.method.base"]], "quapy.method.meta": [[14, "module-quapy.method.meta"]], "quapy.method.neural": [[14, "module-quapy.method.neural"]], "quapy.method.non_aggregative": [[14, "module-quapy.method.non_aggregative"]], "quapy package": [[11, "quapy-package"]], "quapy.error": [[11, "module-quapy.error"]], "quapy.evaluation": [[11, "module-quapy.evaluation"]], "quapy.protocol": [[11, "quapy-protocol"]], "quapy.functional": [[11, "module-quapy.functional"]], "quapy.model_selection": [[11, "module-quapy.model_selection"]], "quapy.plot": [[11, "module-quapy.plot"]], "quapy.util": [[11, "module-quapy.util"]], "Subpackages": [[11, "subpackages"]]}, "indexentries": {"app (class in quapy.protocol)": [[11, "quapy.protocol.APP"]], "abstractprotocol (class in quapy.protocol)": [[11, "quapy.protocol.AbstractProtocol"]], "abstractstochasticseededprotocol (class in quapy.protocol)": [[11, "quapy.protocol.AbstractStochasticSeededProtocol"]], "artificialprevalenceprotocol (in module quapy.protocol)": [[11, "quapy.protocol.ArtificialPrevalenceProtocol"]], "domainmixer (class in quapy.protocol)": [[11, "quapy.protocol.DomainMixer"]], "earlystop (class in quapy.util)": [[11, "quapy.util.EarlyStop"]], "gridsearchq (class in quapy.model_selection)": [[11, "quapy.model_selection.GridSearchQ"]], "hellingerdistance() (in module quapy.functional)": [[11, "quapy.functional.HellingerDistance"]], "iterateprotocol (class in quapy.protocol)": [[11, "quapy.protocol.IterateProtocol"]], "npp (class in quapy.protocol)": [[11, "quapy.protocol.NPP"]], "naturalprevalenceprotocol (in module quapy.protocol)": [[11, "quapy.protocol.NaturalPrevalenceProtocol"]], "onlabelledcollectionprotocol (class in quapy.protocol)": [[11, "quapy.protocol.OnLabelledCollectionProtocol"]], "return_types (quapy.protocol.onlabelledcollectionprotocol attribute)": [[11, "quapy.protocol.OnLabelledCollectionProtocol.RETURN_TYPES"]], "topsoedistance() (in module quapy.functional)": [[11, "quapy.functional.TopsoeDistance"]], "upp (class in quapy.protocol)": [[11, "quapy.protocol.UPP"]], "uniformprevalenceprotocol (in module quapy.protocol)": [[11, "quapy.protocol.UniformPrevalenceProtocol"]], "absolute_error() (in module quapy.error)": [[11, "quapy.error.absolute_error"]], "acc_error() (in module quapy.error)": [[11, "quapy.error.acc_error"]], "acce() (in module quapy.error)": [[11, "quapy.error.acce"]], "adjusted_quantification() (in module quapy.functional)": [[11, "quapy.functional.adjusted_quantification"]], "ae() (in module quapy.error)": [[11, "quapy.error.ae"]], "best_model() (quapy.model_selection.gridsearchq method)": [[11, "quapy.model_selection.GridSearchQ.best_model"]], "binary_bias_bins() (in module quapy.plot)": [[11, "quapy.plot.binary_bias_bins"]], "binary_bias_global() (in module quapy.plot)": [[11, "quapy.plot.binary_bias_global"]], "binary_diagonal() (in module quapy.plot)": [[11, "quapy.plot.binary_diagonal"]], "brokenbar_supremacy_by_drift() (in module quapy.plot)": [[11, "quapy.plot.brokenbar_supremacy_by_drift"]], "check_prevalence_vector() (in module quapy.functional)": [[11, 
"quapy.functional.check_prevalence_vector"]], "collator() (quapy.protocol.abstractstochasticseededprotocol method)": [[11, "quapy.protocol.AbstractStochasticSeededProtocol.collator"]], "create_if_not_exist() (in module quapy.util)": [[11, "quapy.util.create_if_not_exist"]], "create_parent_dir() (in module quapy.util)": [[11, "quapy.util.create_parent_dir"]], "cross_val_predict() (in module quapy.model_selection)": [[11, "quapy.model_selection.cross_val_predict"]], "download_file() (in module quapy.util)": [[11, "quapy.util.download_file"]], "download_file_if_not_exists() (in module quapy.util)": [[11, "quapy.util.download_file_if_not_exists"]], "error_by_drift() (in module quapy.plot)": [[11, "quapy.plot.error_by_drift"]], "evaluate() (in module quapy.evaluation)": [[11, "quapy.evaluation.evaluate"]], "evaluate_on_samples() (in module quapy.evaluation)": [[11, "quapy.evaluation.evaluate_on_samples"]], "evaluation_report() (in module quapy.evaluation)": [[11, "quapy.evaluation.evaluation_report"]], "f1_error() (in module quapy.error)": [[11, "quapy.error.f1_error"]], "f1e() (in module quapy.error)": [[11, "quapy.error.f1e"]], "fit() (quapy.model_selection.gridsearchq method)": [[11, "quapy.model_selection.GridSearchQ.fit"]], "from_name() (in module quapy.error)": [[11, "quapy.error.from_name"]], "get_collator() (quapy.protocol.onlabelledcollectionprotocol class method)": [[11, "quapy.protocol.OnLabelledCollectionProtocol.get_collator"]], "get_labelled_collection() (quapy.protocol.onlabelledcollectionprotocol method)": [[11, "quapy.protocol.OnLabelledCollectionProtocol.get_labelled_collection"]], "get_nprevpoints_approximation() (in module quapy.functional)": [[11, "quapy.functional.get_nprevpoints_approximation"]], "get_params() (quapy.model_selection.gridsearchq method)": [[11, "quapy.model_selection.GridSearchQ.get_params"]], "get_quapy_home() (in module quapy.util)": [[11, "quapy.util.get_quapy_home"]], "kld() (in module quapy.error)": [[11, "quapy.error.kld"]], "mae() (in module quapy.error)": [[11, "quapy.error.mae"]], "map_parallel() (in module quapy.util)": [[11, "quapy.util.map_parallel"]], "mean_absolute_error() (in module quapy.error)": [[11, "quapy.error.mean_absolute_error"]], "mean_relative_absolute_error() (in module quapy.error)": [[11, "quapy.error.mean_relative_absolute_error"]], "mkld() (in module quapy.error)": [[11, "quapy.error.mkld"]], "mnkld() (in module quapy.error)": [[11, "quapy.error.mnkld"]], "module": [[11, "module-quapy"], [11, "module-quapy.error"], [11, "module-quapy.evaluation"], [11, "module-quapy.functional"], [11, "module-quapy.model_selection"], [11, "module-quapy.plot"], [11, "module-quapy.protocol"], [11, "module-quapy.util"]], "mrae() (in module quapy.error)": [[11, "quapy.error.mrae"]], "mse() (in module quapy.error)": [[11, "quapy.error.mse"]], "nkld() (in module quapy.error)": [[11, "quapy.error.nkld"]], "normalize_prevalence() (in module quapy.functional)": [[11, "quapy.functional.normalize_prevalence"]], "num_prevalence_combinations() (in module quapy.functional)": [[11, "quapy.functional.num_prevalence_combinations"]], "on_preclassified_instances() (quapy.protocol.onlabelledcollectionprotocol method)": [[11, "quapy.protocol.OnLabelledCollectionProtocol.on_preclassified_instances"]], "parallel() (in module quapy.util)": [[11, "quapy.util.parallel"]], "pickled_resource() (in module quapy.util)": [[11, "quapy.util.pickled_resource"]], "prediction() (in module quapy.evaluation)": [[11, "quapy.evaluation.prediction"]], "prevalence_from_labels() (in 
module quapy.functional)": [[11, "quapy.functional.prevalence_from_labels"]], "prevalence_from_probabilities() (in module quapy.functional)": [[11, "quapy.functional.prevalence_from_probabilities"]], "prevalence_grid() (quapy.protocol.app method)": [[11, "quapy.protocol.APP.prevalence_grid"]], "prevalence_linspace() (in module quapy.functional)": [[11, "quapy.functional.prevalence_linspace"]], "quantify() (quapy.model_selection.gridsearchq method)": [[11, "quapy.model_selection.GridSearchQ.quantify"]], "quapy": [[11, "module-quapy"]], "quapy.error": [[11, "module-quapy.error"]], "quapy.evaluation": [[11, "module-quapy.evaluation"]], "quapy.functional": [[11, "module-quapy.functional"]], "quapy.model_selection": [[11, "module-quapy.model_selection"]], "quapy.plot": [[11, "module-quapy.plot"]], "quapy.protocol": [[11, "module-quapy.protocol"]], "quapy.util": [[11, "module-quapy.util"]], "rae() (in module quapy.error)": [[11, "quapy.error.rae"]], "random_state (quapy.protocol.abstractstochasticseededprotocol property)": [[11, "quapy.protocol.AbstractStochasticSeededProtocol.random_state"]], "relative_absolute_error() (in module quapy.error)": [[11, "quapy.error.relative_absolute_error"]], "sample() (quapy.protocol.app method)": [[11, "quapy.protocol.APP.sample"]], "sample() (quapy.protocol.abstractstochasticseededprotocol method)": [[11, "quapy.protocol.AbstractStochasticSeededProtocol.sample"]], "sample() (quapy.protocol.domainmixer method)": [[11, "quapy.protocol.DomainMixer.sample"]], "sample() (quapy.protocol.npp method)": [[11, "quapy.protocol.NPP.sample"]], "sample() (quapy.protocol.upp method)": [[11, "quapy.protocol.UPP.sample"]], "samples_parameters() (quapy.protocol.app method)": [[11, "quapy.protocol.APP.samples_parameters"]], "samples_parameters() (quapy.protocol.abstractstochasticseededprotocol method)": [[11, "quapy.protocol.AbstractStochasticSeededProtocol.samples_parameters"]], "samples_parameters() (quapy.protocol.domainmixer method)": [[11, "quapy.protocol.DomainMixer.samples_parameters"]], "samples_parameters() (quapy.protocol.npp method)": [[11, "quapy.protocol.NPP.samples_parameters"]], "samples_parameters() (quapy.protocol.upp method)": [[11, "quapy.protocol.UPP.samples_parameters"]], "save_text_file() (in module quapy.util)": [[11, "quapy.util.save_text_file"]], "se() (in module quapy.error)": [[11, "quapy.error.se"]], "set_params() (quapy.model_selection.gridsearchq method)": [[11, "quapy.model_selection.GridSearchQ.set_params"]], "smooth() (in module quapy.error)": [[11, "quapy.error.smooth"]], "strprev() (in module quapy.functional)": [[11, "quapy.functional.strprev"]], "temp_seed() (in module quapy.util)": [[11, "quapy.util.temp_seed"]], "total() (quapy.protocol.app method)": [[11, "quapy.protocol.APP.total"]], "total() (quapy.protocol.abstractprotocol method)": [[11, "quapy.protocol.AbstractProtocol.total"]], "total() (quapy.protocol.domainmixer method)": [[11, "quapy.protocol.DomainMixer.total"]], "total() (quapy.protocol.iterateprotocol method)": [[11, "quapy.protocol.IterateProtocol.total"]], "total() (quapy.protocol.npp method)": [[11, "quapy.protocol.NPP.total"]], "total() (quapy.protocol.upp method)": [[11, "quapy.protocol.UPP.total"]], "uniform_prevalence_sampling() (in module quapy.functional)": [[11, "quapy.functional.uniform_prevalence_sampling"]], "uniform_simplex_sampling() (in module quapy.functional)": [[11, "quapy.functional.uniform_simplex_sampling"]]}}) \ No newline at end of file diff --git a/docs/index.html b/docs/index.html deleted file mode 
100644 index 0707108..0000000 --- a/docs/index.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/docs/leeme.txt b/docs/leeme.txt new file mode 100644 index 0000000..cabe457 --- /dev/null +++ b/docs/leeme.txt @@ -0,0 +1,10 @@
+To include the modules in the docs, run
+
+sphinx-apidoc -o docs/source/ quapy/ -P
+
+This imports everything under quapy/ (including the _ files, thanks to -P) into source and creates an rst file for each module.
+
+The -P option does not seem to work, though; the modules have to be added by hand to quapy.method.rst
+
+Then, simply
+make html \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd diff --git a/docs/source/.gitignore b/docs/source/.gitignore new file mode 100644 index 0000000..dab1498 --- /dev/null +++ b/docs/source/.gitignore @@ -0,0 +1 @@ +!*.png \ No newline at end of file diff --git a/docs/source/EUfooter.png b/docs/source/EUfooter.png new file mode 100644 index 0000000..0898c74 Binary files /dev/null and b/docs/source/EUfooter.png differ diff --git a/SoBigData.png b/docs/source/SoBigData.png similarity index 100% rename from SoBigData.png rename to docs/source/SoBigData.png diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..689cc6e --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,73 @@ +# Configuration file for the Sphinx documentation builder.
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information +
+import pathlib
+import sys
+from os.path import join
+quapy_path = join(pathlib.Path(__file__).parents[2].resolve().as_posix(), 'quapy')
+wiki_path = join(pathlib.Path(__file__).parents[0].resolve().as_posix(), 'wiki')
+source_path = pathlib.Path(__file__).parents[2].resolve().as_posix()
+print(f'quapy path={quapy_path}')
+print(f'quapy source path={source_path}')
+sys.path.insert(0, quapy_path)
+sys.path.insert(0, wiki_path)
+sys.path.insert(0, source_path)
+
+print(sys.path)
+
+
+project = 'QuaPy: A Python-based open-source framework for quantification'
+copyright = '2024, Alejandro Moreo'
+author = 'Alejandro Moreo'
+
+
+
+import quapy
+
+release = quapy.__version__
+
+# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration +
+extensions = [
+    'sphinx.ext.autosectionlabel',
+    'sphinx.ext.duration',
+    'sphinx.ext.doctest',
+    'sphinx.ext.autodoc',
+    'sphinx.ext.autosummary',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.intersphinx',
+    'myst_parser',
+]
+
+autosectionlabel_prefix_document = True
+
+source_suffix = ['.rst', '.md']
+
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+
+# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output +
+html_theme = 'sphinx_rtd_theme'
+# html_theme = 'furo'
+# needs to be installed first: pip install furo (not working...)
+# html_static_path = ['_static']
+
+# intersphinx configuration
+intersphinx_mapping = {
+    "sklearn": ("https://scikit-learn.org/stable/", None),
+}
+ diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000..d52093e --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,103 @@ +```{toctree} +:hidden: + +self +``` +
+# Quickstart
+
+QuaPy is an open-source framework for quantification (a.k.a. supervised prevalence estimation, or learning to quantify) written in Python.
+
+QuaPy is based on the concept of "data sample", and provides implementations of the most important aspects of the quantification workflow, such as (baseline and advanced) quantification methods, quantification-oriented model selection mechanisms, evaluation measures, and evaluation protocols used for evaluating quantification methods. QuaPy also makes available commonly used datasets, and offers visualization tools for facilitating the analysis and interpretation of the experimental results.
+
+QuaPy is hosted on GitHub at [https://github.com/HLT-ISTI/QuaPy](https://github.com/HLT-ISTI/QuaPy).
+
+## Installation
+
+```sh
+pip install quapy
+```
+
+## Usage
+
+The following script fetches the UCI binary dataset "yeast"; it then trains, applies, and evaluates a quantifier based on the *Adjusted Classify & Count* quantification method, using, as the evaluation measure, the *Mean Absolute Error* (MAE) between the predicted and the true class prevalence values of the test set:
+
+```python
+import quapy as qp
+
+training, test = qp.datasets.fetch_UCIBinaryDataset("yeast").train_test
+
+# create an "Adjusted Classify & Count" quantifier
+model = qp.method.aggregative.ACC()
+Xtr, ytr = training.Xy
+model.fit(Xtr, ytr)
+
+estim_prevalence = model.predict(test.X)
+true_prevalence = test.prevalence()
+
+error = qp.error.mae(true_prevalence, estim_prevalence)
+print(f'Mean Absolute Error (MAE)={error:.3f}')
+```
+
+Quantification is useful in scenarios characterized by prior probability shift. In other words, there would be little interest in estimating the class prevalence values of the test set if the IID assumption could be expected to hold, since the test prevalence would then be roughly equivalent to the class prevalence of the training set. For this reason, any quantification model should be tested across many samples, including ones characterized by class prevalence values very different from those found in the training set. QuaPy implements sampling procedures and evaluation protocols that automate this workflow. See the [manuals](./manuals) for detailed examples.
+
+## Manuals
+
+The following manuals illustrate several aspects of QuaPy through examples:
+
+```{toctree}
+:maxdepth: 3
+
+manuals
+```
+
+```{toctree}
+:hidden:
+
+API
+```
+
+## Features
+
+* Implementation of many popular quantification methods (Classify-&-Count and its variants, Expectation Maximization, +quantification methods based on structured output learning, HDy, QuaNet, quantification ensembles, among others).
+* Versatile functionality for performing evaluation based on sampling generation protocols (e.g., APP, NPP, etc.).
+* Implementation of most commonly used evaluation metrics (e.g., AE, RAE, NAE, NRAE, SE, KLD, NKLD, etc.).
+* Datasets frequently used in quantification (textual and numeric), including:
+    * 32 UCI Machine Learning datasets.
+    * 11 Twitter quantification-by-sentiment datasets.
+    * 3 product reviews quantification-by-sentiment datasets.
+    * 4 tasks from the LeQua 2022 competition and 4 tasks from the LeQua 2024 competition.
+    * The IFCB dataset for plankton quantification.
+* Native support for binary and single-label multiclass quantification scenarios.
+* Model selection functionality that minimizes quantification-oriented loss functions.
+* Visualization tools for analysing the experimental results.
+
+## Citing QuaPy
+
+If you find QuaPy useful (and we hope you will), please consider citing the original paper in your research.
+
+```bibtex
+@inproceedings{moreo2021quapy,
+  title={QuaPy: a python-based framework for quantification},
+  author={Moreo, Alejandro and Esuli, Andrea and Sebastiani, Fabrizio},
+  booktitle={Proceedings of the 30th ACM International Conference on Information \& Knowledge Management},
+  pages={4534--4543},
+  year={2021}
+}
+```
+
+## Contributing
+
+In case you want to contribute improvements to QuaPy, please open a pull request against the "devel" branch.
+
+## Acknowledgments
+
+```{image} SoBigData.png
+:width: 250px
+:alt: SoBigData++
+```
+
+This work has been supported by the QuaDaSh project +_"Finanziato dall'Unione europea---Next Generation EU, +Missione 4 Componente 2 CUP B53D23026250001"_.
diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..cc5b4dc --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,41 @@ +.. QuaPy: A Python-based open-source framework for quantification documentation master file, created by + sphinx-quickstart on Wed Feb 7 16:26:46 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. +
+Welcome to QuaPy's documentation!
+==========================================================================================
+
+QuaPy is a Python-based open-source framework for quantification.
+
+This document contains the API of the modules included in QuaPy.
+
+Installation
+------------
+
+`pip install quapy`
+
+GitHub
+------------
+
+QuaPy is hosted on GitHub at `https://github.com/HLT-ISTI/QuaPy <https://github.com/HLT-ISTI/QuaPy>`_
+
+
+.. toctree::
+	:maxdepth: 2
+	:caption: Contents:
+
+Contents
+--------
+
+.. toctree::
+
+	modules
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search` diff --git a/docs/source/manuals.rst b/docs/source/manuals.rst new file mode 100644 index 0000000..a426786 --- /dev/null +++ b/docs/source/manuals.rst @@ -0,0 +1,14 @@
+Manuals
+=======
+
+.. toctree::
+	:maxdepth: 2
+	:numbered:
+
+	manuals/datasets
+	manuals/evaluation
+	manuals/explicit-loss-minimization
+	manuals/methods
+	manuals/model-selection
+	manuals/plotting
+	manuals/protocols diff --git a/docs/source/manuals/datasets.md b/docs/source/manuals/datasets.md new file mode 100644 index 0000000..b7d8827 --- /dev/null +++ b/docs/source/manuals/datasets.md @@ -0,0 +1,529 @@
+# Datasets
+
+QuaPy makes available several datasets that have been used in +the quantification literature, as well as an interface that allows +anyone to import their own custom datasets.
+
+A _Dataset_ object in QuaPy is roughly a pair of _LabelledCollection_ objects, +one playing the role of the training set, the other the test set. +_LabelledCollection_ is a data class consisting of the (iterable) +instances and labels. This class handles most of the sampling functionality in QuaPy.
+Take a look at the following code:
+
+```python
+
+import quapy as qp
+import quapy.functional as F
+
+instances = [
+    '1st positive document', '2nd positive document',
+    'the only negative document',
+    '1st neutral document', '2nd neutral document', '3rd neutral document'
+]
+labels = [2, 2, 0, 1, 1, 1]
+
+data = qp.data.LabelledCollection(instances, labels)
+print(F.strprev(data.prevalence(), prec=2))
+
+```
+
+which outputs the class prevalences (with 2-digit precision):
+```
+[0.17, 0.50, 0.33]
+```
+
+One can easily produce new samples at desired class prevalence values:
+
+```python
+sample_size = 10
+prev = [0.4, 0.1, 0.5]
+sample = data.sampling(sample_size, *prev)
+
+print('instances:', sample.instances)
+print('labels:', sample.labels)
+print('prevalence:', F.strprev(sample.prevalence(), prec=2))
+```
+
+which outputs:
+```
+instances: ['the only negative document' '2nd positive document'
+ '2nd positive document' '2nd neutral document' '1st positive document'
+ 'the only negative document' 'the only negative document'
+ 'the only negative document' '2nd positive document'
+ '1st positive document']
+labels: [0 2 2 1 2 0 0 0 2 2]
+prevalence: [0.40, 0.10, 0.50]
+```
+
+Samples can be made consistent across different runs (e.g., to test +different methods on the same exact samples) by sampling and retaining +the indexes, which can then be used to generate the sample:
+
+```python
+index = data.sampling_index(sample_size, *prev)
+for method in methods:
+    sample = data.sampling_from_index(index)
+    ...
+```
+
+However, generating samples for evaluation purposes is tackled in QuaPy +by means of the evaluation protocols (see the dedicated entries in the manuals +for [evaluation](./evaluation) and [protocols](./protocols)).
+
+
+## Reviews Datasets
+
+Three datasets of reviews, about Kindle devices, the Harry Potter series, and +the well-known IMDb movie reviews, can be fetched using a unified interface. +For example:
+
+```python
+import quapy as qp
+data = qp.datasets.fetch_reviews('kindle')
+```
+
+These datasets have been used in:
+```
+Esuli, A., Moreo, A., & Sebastiani, F. (2018, October).
+A recurrent neural network for sentiment quantification.
+In Proceedings of the 27th ACM International Conference on
+Information and Knowledge Management (pp. 1775-1778).
+```
+
+The list of review ids is available in:
+
+```python
+qp.datasets.REVIEWS_SENTIMENT_DATASETS
+```
+
+Some statistics of the available datasets are summarized below:
+
+| Dataset | classes | train size | test size | train prev | test prev | type |
+|---|:---:|:---:|:---:|:---:|:---:|---|
+| hp | 2 | 9533 | 18399 | \[0.018, 0.982\] | \[0.065, 0.935\] | text |
+| kindle | 2 | 3821 | 21591 | \[0.081, 0.919\] | \[0.063, 0.937\] | text |
+| imdb | 2 | 25000 | 25000 | \[0.500, 0.500\] | \[0.500, 0.500\] | text |
+
+## Twitter Sentiment Datasets
+
+11 Twitter datasets for sentiment analysis. +The raw text is not accessible; the documents were made available +in tf-idf format. Each dataset presents two splits: a train/val +split for model selection purposes, and a train+val/test split +for model evaluation. The following code exemplifies how to load +a Twitter dataset for model selection.
+
+```python
+import quapy as qp
+data = qp.datasets.fetch_twitter('gasp', for_model_selection=True)
+```
+
+The datasets were used in:
+
+```
+Gao, W., & Sebastiani, F. (2015, August).
+Tweet sentiment: From classification to quantification.
+In 2015 IEEE/ACM International Conference on Advances in
+Social Networks Analysis and Mining (ASONAM) (pp. 97-104). IEEE.
+```
+
+Three of the datasets (semeval13, semeval14, and semeval15) share the +same training set (semeval), meaning that the training split one would get +when requesting any of them is the same. The dataset "semeval" can only +be requested with "for_model_selection=True". +The lists of the Twitter dataset ids can be consulted in:
+
+```python
+# a list of 11 dataset ids that can be used for model selection or model evaluation
+qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
+
+# 9 dataset ids in which "semeval13", "semeval14", and "semeval15" are replaced with "semeval"
+qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
+```
+
+Some details can be found below:
+
+| Dataset | classes | train size | test size | features | train prev | test prev | type |
+|---|:---:|:---:|:---:|:---:|:---:|:---:|---|
+| gasp | 3 | 8788 | 3765 | 694582 | [0.421, 0.496, 0.082] | [0.407, 0.507, 0.086] | sparse |
+| hcr | 3 | 1594 | 798 | 222046 | [0.546, 0.211, 0.243] | [0.640, 0.167, 0.193] | sparse |
+| omd | 3 | 1839 | 787 | 199151 | [0.463, 0.271, 0.266] | [0.437, 0.283, 0.280] | sparse |
+| sanders | 3 | 2155 | 923 | 229399 | [0.161, 0.691, 0.148] | [0.164, 0.688, 0.148] | sparse |
+| semeval13 | 3 | 11338 | 3813 | 1215742 | [0.159, 0.470, 0.372] | [0.158, 0.430, 0.412] | sparse |
+| semeval14 | 3 | 11338 | 1853 | 1215742 | [0.159, 0.470, 0.372] | [0.109, 0.361, 0.530] | sparse |
+| semeval15 | 3 | 11338 | 2390 | 1215742 | [0.159, 0.470, 0.372] | [0.153, 0.413, 0.434] | sparse |
+| semeval16 | 3 | 8000 | 2000 | 889504 | [0.157, 0.351, 0.492] | [0.163, 0.341, 0.497] | sparse |
+| sst | 3 | 2971 | 1271 | 376132 | [0.261, 0.452, 0.288] | [0.207, 0.481, 0.312] | sparse |
+| wa | 3 | 2184 | 936 | 248563 | [0.305, 0.414, 0.281] | [0.282, 0.446, 0.272] | sparse |
+| wb | 3 | 4259 | 1823 | 404333 | [0.270, 0.392, 0.337] | [0.274, 0.392, 0.335] | sparse |
+
+
+## UCI Machine Learning
+
+### Binary datasets
+
+A set of 32 datasets from the [UCI Machine Learning repository](https://archive.ics.uci.edu/ml/datasets.php) +used in:
+
+```
+Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
+Using ensembles for problems with characterizable changes
+in data distribution: A case study on quantification.
+Information Fusion, 34, 87-100.
+```
+
+The list does not exactly coincide with that used in Pérez-Gállego et al. 2017, +since we were unable to find the datasets with ids "diabetes" and "phoneme".
+
+These datasets can be loaded by calling, e.g.:
+
+```python
+import quapy as qp
+
+data = qp.datasets.fetch_UCIBinaryDataset('yeast', verbose=True)
+```
+
+This call will return a _Dataset_ object in which the training and +test splits are randomly drawn, in a stratified manner, from the whole +collection, at 70% and 30%, respectively. The _verbose=True_ option indicates +that the dataset description should be printed to standard output. +The original data is not split, +and some papers submit the entire collection to a kFCV validation. +In order to accommodate these practices, one can first instantiate +the entire collection, and then create a generator that will return one +training+test dataset at a time, following a kFCV protocol:
+
+```python
+import quapy as qp
+
+collection = qp.datasets.fetch_UCIBinaryLabelledCollection("yeast")
+for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
+    ...
+```
+
+The above code carries out a 2x5FCV evaluation on the "yeast" dataset.
+
+All datasets come in numerical form (dense matrices); some statistics +are summarized below.
+
+| Dataset | classes | instances | features | prev | type |
+|---|:---:|:---:|:---:|:---:|---|
+| acute.a | 2 | 120 | 6 | [0.508, 0.492] | dense |
+| acute.b | 2 | 120 | 6 | [0.583, 0.417] | dense |
+| balance.1 | 2 | 625 | 4 | [0.539, 0.461] | dense |
+| balance.2 | 2 | 625 | 4 | [0.922, 0.078] | dense |
+| balance.3 | 2 | 625 | 4 | [0.539, 0.461] | dense |
+| breast-cancer | 2 | 683 | 9 | [0.350, 0.650] | dense |
+| cmc.1 | 2 | 1473 | 9 | [0.573, 0.427] | dense |
+| cmc.2 | 2 | 1473 | 9 | [0.774, 0.226] | dense |
+| cmc.3 | 2 | 1473 | 9 | [0.653, 0.347] | dense |
+| ctg.1 | 2 | 2126 | 21 | [0.222, 0.778] | dense |
+| ctg.2 | 2 | 2126 | 21 | [0.861, 0.139] | dense |
+| ctg.3 | 2 | 2126 | 21 | [0.917, 0.083] | dense |
+| german | 2 | 1000 | 24 | [0.300, 0.700] | dense |
+| haberman | 2 | 306 | 3 | [0.735, 0.265] | dense |
+| ionosphere | 2 | 351 | 34 | [0.641, 0.359] | dense |
+| iris.1 | 2 | 150 | 4 | [0.667, 0.333] | dense |
+| iris.2 | 2 | 150 | 4 | [0.667, 0.333] | dense |
+| iris.3 | 2 | 150 | 4 | [0.667, 0.333] | dense |
+| mammographic | 2 | 830 | 5 | [0.514, 0.486] | dense |
+| pageblocks.5 | 2 | 5473 | 10 | [0.979, 0.021] | dense |
+| semeion | 2 | 1593 | 256 | [0.901, 0.099] | dense |
+| sonar | 2 | 208 | 60 | [0.534, 0.466] | dense |
+| spambase | 2 | 4601 | 57 | [0.606, 0.394] | dense |
+| spectf | 2 | 267 | 44 | [0.794, 0.206] | dense |
+| tictactoe | 2 | 958 | 9 | [0.653, 0.347] | dense |
+| transfusion | 2 | 748 | 4 | [0.762, 0.238] | dense |
+| wdbc | 2 | 569 | 30 | [0.627, 0.373] | dense |
+| wine.1 | 2 | 178 | 13 | [0.669, 0.331] | dense |
+| wine.2 | 2 | 178 | 13 | [0.601, 0.399] | dense |
+| wine.3 | 2 | 178 | 13 | [0.730, 0.270] | dense |
+| wine-q-red | 2 | 1599 | 11 | [0.465, 0.535] | dense |
+| wine-q-white | 2 | 4898 | 11 | [0.335, 0.665] | dense |
+| yeast | 2 | 1484 | 8 | [0.711, 0.289] | dense |
+
+#### Notes:
+All datasets will be downloaded automatically the first time they are requested, and +stored in the _quapy_data_ folder for faster reuse.
+
+However, notice that it is a good idea to ignore the following datasets:
+* _acute.a_ and _acute.b_: these are very easy, and many classifiers would score 100% accuracy
+* _balance.2_: this is extremely difficult; probably there is some problem with this dataset, +since the errors it tends to produce are orders of magnitude greater than for other datasets, +and this has a disproportionate impact on the average performance.
+
+### Multiclass datasets
+
+A collection of 24 multiclass datasets from the [UCI Machine Learning repository](https://archive.ics.uci.edu/ml/datasets.php). +Some of the datasets were first used in [this paper](https://arxiv.org/abs/2401.00490) and can be instantiated as follows:
+
+```python
+import quapy as qp
+data = qp.datasets.fetch_UCIMulticlassLabelledCollection('dry-bean', verbose=True)
+```
+
+A dataset can be instantiated while filtering out classes with fewer than a minimum number of instances, using the `min_class_support` parameter +(default: `100`), as follows:
+
+```python
+import quapy as qp
+data = qp.datasets.fetch_UCIMulticlassLabelledCollection('dry-bean', min_class_support=50, verbose=True)
+```
+
+There are no pre-defined train-test partitions for these datasets, but you can easily create your own with the +`split_stratified` method, e.g., `data.split_stratified()`.
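+For instance, a minimal sketch (the `train_prop` value is illustrative; it controls the fraction of instances retained for training):
+
+```python
+# create a stratified train/test partition out of a LabelledCollection
+train, test = data.split_stratified(train_prop=0.7)
+```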
This can also be achieved using the method `fetch_UCIMulticlassDataset`, +as shown below:
+
+```python
+data = qp.datasets.fetch_UCIMulticlassDataset('dry-bean', min_test_split=0.4, verbose=True)
+train, test = data.train_test
+```
+
+This method tries to respect the `min_test_split` value while generating the train-test partition, but the resulting training set +will not be bigger than `max_train_instances`, which defaults to `25000`. A bigger value can be passed as a parameter:
+
+```python
+data = qp.datasets.fetch_UCIMulticlassDataset('dry-bean', min_test_split=0.4, max_train_instances=30000, verbose=True)
+train, test = data.train_test
+```
+
+The datasets correspond to a subset of the datasets that can be retrieved from the UCI platform using the following filters:
+* datasets for classification
+* more than 2 classes
+* containing at least 1,000 instances
+* can be imported using the Python API.
+
+Some statistics about these datasets are displayed below:
+
+| **Dataset** | **classes** | **instances** | **features** | **prevs** | **type** |
+|:------------|:-----------:|:-------------:|:------------:|:----------|:--------:|
+| dry-bean | 7 | 13611 | 16 | [0.097, 0.038, 0.120, 0.261, 0.142, 0.149, 0.194] | dense |
+| wine-quality | 5 | 6462 | 11 | [0.033, 0.331, 0.439, 0.167, 0.030] | dense |
+| academic-success | 3 | 4424 | 36 | [0.321, 0.179, 0.499] | dense |
+| digits | 10 | 5620 | 64 | [0.099, 0.102, 0.099, 0.102, 0.101, 0.099, 0.099, 0.101, 0.099, 0.100] | dense |
+| letter | 26 | 20000 | 16 | [0.039, 0.038, 0.037, 0.040, 0.038, 0.039, 0.039, 0.037, 0.038, 0.037, 0.037, 0.038, 0.040, 0.039, 0.038, 0.040, 0.039, 0.038, 0.037, 0.040, 0.041, 0.038, 0.038, 0.039, 0.039, 0.037] | dense |
+| abalone | 11 | 3842 | 9 | [0.030, 0.067, 0.102, 0.148, 0.179, 0.165, 0.127, 0.069, 0.053, 0.033, 0.027] | dense |
+| obesity | 7 | 2111 | 23 | [0.129, 0.136, 0.166, 0.141, 0.153, 0.137, 0.137] | dense |
+| nursery | 4 | 12958 | 19 | [0.333, 0.329, 0.312, 0.025] | dense |
+| yeast | 4 | 1299 | 8 | [0.356, 0.125, 0.188, 0.330] | dense |
+| hand_digits | 10 | 10992 | 16 | [0.104, 0.104, 0.104, 0.096, 0.104, 0.096, 0.096, 0.104, 0.096, 0.096] | dense |
+| satellite | 6 | 6435 | 36 | [0.238, 0.109, 0.211, 0.097, 0.110, 0.234] | dense |
+| shuttle | 4 | 57927 | 7 | [0.787, 0.003, 0.154, 0.056] | dense |
+| cmc | 3 | 1473 | 9 | [0.427, 0.226, 0.347] | dense |
+| isolet | 26 | 7797 | 617 | [0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038, 0.038] | dense |
+| waveform-v1 | 3 | 5000 | 21 | [0.331, 0.329, 0.339] | dense |
+| molecular | 3 | 3190 | 227 | [0.240, 0.241, 0.519] | dense |
+| poker_hand | 8 | 1024985 | 10 | [0.501, 0.423, 0.048, 0.021, 0.004, 0.002, 0.001, 0.000] | dense |
+| connect-4 | 3 | 67557 | 84 | [0.095, 0.246, 0.658] | dense |
+| mhr | 3 | 1014 | 6 | [0.268, 0.400, 0.331] | dense |
+| chess | 15 | 27870 | 20 | [0.100, 0.051, 0.102, 0.078, 0.017, 0.007, 0.163, 0.061, 0.025, 0.021, 0.014, 0.071, 0.150, 0.129, 0.009] | dense |
+| page_block | 3 | 5357 | 10 | [0.917, 0.061, 0.021] | dense |
+| phishing | 3 | 1353 | 9 | [0.519, 0.076, 0.405] | dense |
+| image_seg | 7 | 2310 | 19 | [0.143, 0.143, 0.143, 0.143, 0.143, 0.143, 0.143] | dense |
+| hcv | 4 | 1385 | 28 | [0.243, 0.240, 0.256, 0.261] | dense |
+
+Values shown above refer to datasets obtained through `fetch_UCIMulticlassLabelledCollection` using all default parameters.
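+Such statistics can be recomputed for any of these datasets. The following is a minimal sketch (using 'dry-bean' for illustration, and assuming all default parameters, as in the table above):
+
+```python
+import quapy as qp
+import quapy.functional as F
+
+# recompute the per-dataset statistics reported in the table
+data = qp.datasets.fetch_UCIMulticlassLabelledCollection('dry-bean')
+print(f'classes: {data.n_classes}')
+print(f'instances: {len(data)}')
+print(f'features: {data.X.shape[1]}')
+print(f'prevs: {F.strprev(data.prevalence(), prec=3)}')
+```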
+
+## LeQua 2022 Datasets
+
+QuaPy also provides the datasets used for the LeQua 2022 competition. +In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification +problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide +raw documents instead. +Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B +are multiclass quantification problems consisting of estimating the class prevalence +values of 28 different merchandise products.
+
+Every task consists of a training set, a set of validation samples (for model selection) +and a set of test samples (for evaluation). QuaPy returns this data as a LabelledCollection +(training) and two generation protocols (for validation and test samples), as follows:
+
+```python
+training, val_generator, test_generator = qp.datasets.fetch_lequa2022(task=task)
+```
+
+See the script `5a.lequa2022_experiments.py` in the examples folder for further details on how to +carry out experiments using these datasets.
+
+The datasets are downloaded only once, and stored for fast reuse.
+
+Some statistics are summarized below:
+
+| Dataset | classes | train size | validation samples | test samples | docs by sample | type |
+|---------|:-------:|:----------:|:------------------:|:------------:|:----------------:|:--------:|
+| T1A | 2 | 5000 | 1000 | 5000 | 250 | vector |
+| T1B | 28 | 20000 | 1000 | 5000 | 1000 | vector |
+| T2A | 2 | 5000 | 1000 | 5000 | 250 | text |
+| T2B | 28 | 20000 | 1000 | 5000 | 1000 | text |
+
+For further details on the datasets, we refer to the original +[paper](https://ceur-ws.org/Vol-3180/paper-146.pdf):
+
+```
+Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
+A Detailed Overview of LeQua@CLEF 2022: Learning to Quantify.
+```
+
+## LeQua 2024 Datasets
+
+QuaPy also provides the datasets used for the [LeQua 2024 competition](https://lequa2024.github.io/). +In brief, there are 4 tasks:
+* T1: binary quantification (by sentiment)
+* T2: multiclass quantification (28 classes, merchandise products)
+* T3: ordinal quantification (5-star sentiment ratings)
+* T4: binary sentiment quantification under a combination of covariate shift and prior shift
+
+In all cases, the covariate space has 256 dimensions (extracted using the `ELECTRA-Small` model).
+
+Every task consists of a training set, a set of validation samples (for model selection) +and a set of test samples (for evaluation). QuaPy returns this data as a LabelledCollection +(training bags) and sampling generation protocols (for validation and test bags). +T3 also offers the possibility to obtain a series of training bags (in the form of a +sampling generation protocol) instead of one single training bag. Use it as follows:
+
+```python
+training, val_generator, test_generator = qp.datasets.fetch_lequa2024(task=task)
+```
+
+See the script `5b.lequa2024_experiments.py` in the examples folder for further details on how to +carry out experiments using these datasets.
+
+The datasets are downloaded only once, and stored for fast reuse.
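+Since the validation and test bags come in the form of sampling generation protocols, they can be consumed by simply iterating over the protocol, which yields one bag at a time together with its true prevalence vector. A minimal sketch (the task name 'T1' is illustrative):
+
+```python
+import quapy as qp
+
+training, val_generator, test_generator = qp.datasets.fetch_lequa2024(task='T1')
+
+# each iteration yields the instances of one bag and its true prevalence vector
+for sample_instances, true_prev in val_generator():
+    ...  # e.g., quantify the bag and compare the estimate against true_prev
+```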
+
+Some statistics are summarized below:
+
+| Dataset | classes | train size | validation samples | test samples | docs by sample | type |
+|---------|:-------:|:-----------:|:------------------:|:------------:|:--------------:|:--------:|
+| T1 | 2 | 5000 | 1000 | 5000 | 250 | vector |
+| T2 | 28 | 20000 | 1000 | 5000 | 1000 | vector |
+| T3 | 5 | 100 samples | 1000 | 5000 | 200 | vector |
+| T4 | 2 | 5000 | 1000 | 5000 | 250 | vector |
+
+For further details on the datasets or the competition, we refer to +[the official site](https://lequa2024.github.io/data/) and +[the overview paper](http://nmis.isti.cnr.it/sebastiani/Publications/LQ2024.pdf).
+
+```
+Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2024).
+An Overview of LeQua 2024, the 2nd International Data Challenge on Learning to Quantify,
+Proceedings of the 4th International Workshop on Learning to Quantify (LQ 2024),
+ECML-PKDD 2024, Vilnius, Lithuania.
+```
+
+
+## IFCB Plankton dataset
+
+IFCB is a dataset of plankton species in water samples, hosted on Zenodo. +The dataset is based on the data publicly available at the WHOI-Plankton repository, +and the scripts used for the processing are available at P. González's repository.
+
+This dataset comes with precomputed features for testing quantification algorithms.
+
+Some statistics:
+
+| | **Training** | **Validation** | **Test** |
+|-----------------|:------------:|:--------------:|:--------:|
+| samples | 200 | 86 | 678 |
+| total instances | 584474 | 246916 | 2626429 |
+| mean per sample | 2922.3 | 2871.1 | 3873.8 |
+| min per sample | 266 | 59 | 33 |
+| max per sample | 6645 | 7375 | 9112 |
+
+The number of features is 512, while the number of classes is 50. +In terms of prevalence, the mean is 0.020, the minimum is 0, and the maximum is 0.978.
+
+The dataset can be loaded for model selection (`for_model_selection=True`, thus returning the training and validation) +or for testing (`for_model_selection=False`, thus returning the training+validation and the test).
+
+Additionally, the training can be interpreted as a list (a generator) of samples (`single_sample_train=False`) +or as a single training set (`single_sample_train=True`).
+
+Example:
+
+```python
+train, val_gen = qp.datasets.fetch_IFCB(for_model_selection=True, single_sample_train=True)
+# ... model selection
+
+train, test_gen = qp.datasets.fetch_IFCB(for_model_selection=False, single_sample_train=True)
+# ... train and evaluation
+```
+
+See also [Automatic plankton quantification using deep features. +P González, A Castaño, EE Peacock, J Díez, JJ Del Coz, HM Sosik. +Journal of Plankton Research 41 (4), 449-463](https://par.nsf.gov/servlets/purl/10172325).
+
+
+
+## Adding Custom Datasets
+
+It is straightforward to import your own datasets into QuaPy. +In what follows, there are some code snippets for doing so; see also the example +[3.custom_collection.py](https://github.com/HLT-ISTI/QuaPy/blob/master/examples/3.custom_collection.py).
+
+QuaPy provides data loaders for simple formats dealing with +text; for example, use `qp.data.reader.from_text` for the following format:
+
+```
+class-id \t first document's pre-processed text \n
+class-id \t second document's pre-processed text \n
+...
+```
+
+or `qp.data.reader.from_sparse` for sparse representations of the form:
+
+```
+{-1, 0, or +1} col(int):val(float) col(int):val(float) ... \n
+...
+```
+
+Both functions return a tuple `X, y` containing the instances and the corresponding +labels, respectively.
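+For example, a file complying with the first format could be loaded as follows (a minimal sketch; the path is illustrative):
+
+```python
+import quapy as qp
+
+# load a file in the tab-separated format described above
+X, y = qp.data.reader.from_text('my_data/train.dat')
+data = qp.data.LabelledCollection(X, y)
+```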
+
+The code in charge of loading a LabelledCollection is:
+
+```python
+@classmethod
+def load(cls, path:str, loader_func:callable):
+    return LabelledCollection(*loader_func(path))
+```
+
+indicating that any `loader_func` (e.g., `from_text`, `from_sparse`, `from_csv`, or a user-defined one) which +returns valid arguments for initializing a _LabelledCollection_ object can be used +to load any collection. More specifically, the _LabelledCollection_ receives as +arguments the _instances_ (iterable) and the _labels_ (iterable) and, +optionally, the number of classes (it would be +inferred from the labels if not indicated, but this requires at least one +positive example for +all classes to be present in the collection).
+
+The same _loader_func_ can be passed to a Dataset, along with two +paths, in order to create a training and test pair of _LabelledCollection_, +e.g.:
+
+```python
+import quapy as qp
+
+train_path = '../my_data/train.dat'
+test_path = '../my_data/test.dat'
+
+def my_custom_loader(path, **custom_kwargs):
+    with open(path, 'rb') as fin:
+        ...
+    return instances, labels
+
+data = qp.data.Dataset.load(train_path, test_path, my_custom_loader, **custom_kwargs)
+```
+
+### Data Processing
+
+QuaPy implements a number of preprocessing functions in the package `qp.data.preprocessing`, including:
+
+* _text2tfidf_: tfidf vectorization
+* _reduce_columns_: reducing the number of columns based on term frequency
+* _standardize_: transforms the column values into z-scores (i.e., subtracts the mean and normalizes by the standard deviation, so +that the column values have zero mean and unit variance).
+* _index_: transforms textual tokens into lists of numeric ids
+
+These functions are applied to `Dataset` objects, and offer the possibility to apply the transformation +inline (thus modifying the original dataset), or to return a modified copy. \ No newline at end of file diff --git a/docs/build/html/_sources/Evaluation.md.txt b/docs/source/manuals/evaluation.md similarity index 83% rename from docs/build/html/_sources/Evaluation.md.txt rename to docs/source/manuals/evaluation.md index a0175d2..aba7068 100644 --- a/docs/build/html/_sources/Evaluation.md.txt +++ b/docs/source/manuals/evaluation.md @@ -14,19 +14,9 @@ which are implemented in QuaPy and explained here. ## Error Measures -The module quapy.error implements the following error measures for quantification: -* _mae_: mean absolute error -* _mrae_: mean relative absolute error -* _mse_: mean squared error -* _mkld_: mean Kullback-Leibler Divergence -* _mnkld_: mean normalized Kullback-Leibler Divergence +The module quapy.error implements the most popular error measures for quantification, e.g., mean absolute error (_mae_), mean relative absolute error (_mrae_), among others. For each such measure (e.g., _mrae_) there are corresponding functions (e.g., _rae_) that do not average the results across samples. -Functions _ae_, _rae_, _se_, _kld_, and _nkld_ are also available, -which return the individual errors (i.e., without averaging the whole). - -Some errors of classification are also available: -* _acce_: accuracy error (1-accuracy) -* _f1e_: F-1 score error (1-F1 score) +Some errors of classification are also available, e.g., accuracy error (_acce_) or F-1 error (_f1e_).
The error functions implement the following interface, e.g.: @@ -56,18 +46,18 @@ e.g.: ```python qp.environ['SAMPLE_SIZE'] = 100 # once for all -true_prev = np.asarray([0.5, 0.3, 0.2]) # let's assume 3 classes -estim_prev = np.asarray([0.1, 0.3, 0.6]) +true_prev = [0.5, 0.3, 0.2] # let's assume 3 classes +estim_prev = [0.1, 0.3, 0.6] error = qp.error.mrae(true_prev, estim_prev) print(f'mrae({true_prev}, {estim_prev}) = {error:.3f}') ``` will print: ``` -mrae([0.500, 0.300, 0.200], [0.100, 0.300, 0.600]) = 0.914 +mrae([0.5, 0.3, 0.2], [0.1, 0.3, 0.6]) = 0.914 ``` -Finally, it is possible to instantiate QuaPy's quantification +It is also possible to instantiate QuaPy's quantification error functions from strings using, e.g.: ```python @@ -82,8 +72,8 @@ one specific _sample generation protocol_ to generate many samples, typically characterized by widely varying amounts of _shift_ with respect to the original distribution, that are then used to evaluate the performance of a (trained) quantifier. -These protocols are explained in more detail in a dedicated [entry -in the wiki](Protocols.md). For the moment being, let us assume we already have +These protocols are explained in more detail in a dedicated [manual](./protocols.md). +For the time being, let us assume we already have chosen and instantiated one specific such protocol, which we here simply call _prot_. Let us also assume our model is called _quantifier_ and that our evaluation measure of choice is @@ -95,7 +85,7 @@ print(f'MAE = {mae:.4f}') ``` It is often desirable to evaluate our system using more than one -single evaluatio measure. In this case, it is convenient to generate +single evaluation measure. In this case, it is convenient to generate a _report_. A report in QuaPy is a dataframe accounting for all the true prevalence values with their corresponding prevalence values as estimated by the quantifier, along with the error each has given @@ -114,7 +104,7 @@ report['estim-prev'] = report['estim-prev'].map(F.strprev) print(report) print('Averaged values:') -print(report.mean()) +print(report.mean(numeric_only=True)) This will produce an output like: @@ -151,11 +141,14 @@ true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, protocol=prot) All the evaluation functions implement specific optimizations for speeding up the evaluation of aggregative quantifiers (i.e., of instances of _AggregativeQuantifier_). + The optimization comes down to generating classification predictions (either crisp or soft) only once for the entire test set, and then applying the sampling procedure to the predictions, instead of generating samples of instances and then computing the classification predictions every time. This is only possible when the protocol -is an instance of _OnLabelledCollectionProtocol_. The optimization is only +is an instance of _OnLabelledCollectionProtocol_. + +The optimization is only carried out when the number of classification predictions thus generated would be smaller than the number of predictions required for the entire protocol; e.g., if the original dataset contains 1M instances, but the protocol is such that it would @@ -166,4 +159,4 @@ precompute all the predictions irrespectively of the number of instances and num Finally, this can be deactivated by setting _aggr_speedup=False_. Note that this optimization is not only applied for the final evaluation, but also for the internal evaluations carried out during _model selection_. Since these are typically many, the heuristic can help reduce the -execution time a lot. \ No newline at end of file +execution time significantly. \ No newline at end of file diff --git a/docs/source/manuals/explicit-loss-minimization.md b/docs/source/manuals/explicit-loss-minimization.md new file mode 100644 index 0000000..f80c434 --- /dev/null +++ b/docs/source/manuals/explicit-loss-minimization.md @@ -0,0 +1,26 @@
+# Explicit Loss Minimization
+
+QuaPy makes available several Explicit Loss Minimization (ELM) methods, including +SVM(Q), SVM(KLD), SVM(NKLD), SVM(AE), and SVM(RAE). +These methods require first downloading the +[svmperf](http://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html) +package, applying the patch +[svm-perf-quantification-ext.patch](https://github.com/HLT-ISTI/QuaPy/blob/master/svm-perf-quantification-ext.patch), and compiling the sources. +The script [prepare_svmperf.sh](https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh) takes care of the whole process. Simply run:
+
+```
+./prepare_svmperf.sh
+```
+
+The resulting directory `svm_perf_quantification/` contains the +patched version of _svmperf_ with quantification-oriented losses.
+
+The [svm-perf-quantification-ext.patch](https://github.com/HLT-ISTI/QuaPy/blob/master/svm-perf-quantification-ext.patch) is an extension of the patch made available by +[Esuli et al. 2015](https://dl.acm.org/doi/abs/10.1145/2700406?casa_token=8D2fHsGCVn0AAAAA:ZfThYOvrzWxMGfZYlQW_y8Cagg-o_l6X_PcF09mdETQ4Tu7jK98mxFbGSXp9ZSO14JkUIYuDGFG0) +that allows SVMperf to optimize for +the _Q_ measure as proposed by [Barranquero et al. 2015](https://www.sciencedirect.com/science/article/abs/pii/S003132031400291X) +and for the _KLD_ and _NKLD_ measures as proposed by [Esuli et al. 2015](https://dl.acm.org/doi/abs/10.1145/2700406?casa_token=8D2fHsGCVn0AAAAA:ZfThYOvrzWxMGfZYlQW_y8Cagg-o_l6X_PcF09mdETQ4Tu7jK98mxFbGSXp9ZSO14JkUIYuDGFG0). +This patch extends the above one by also allowing SVMperf to optimize for +_AE_ and _RAE_. +See the [methods](./methods) manual for more details and code examples.
+ diff --git a/docs/source/manuals/methods.md b/docs/source/manuals/methods.md new file mode 100644 index 0000000..93aa1dd --- /dev/null +++ b/docs/source/manuals/methods.md @@ -0,0 +1,627 @@
+# Quantification Methods
+
+Quantification methods can be categorized as belonging to +`aggregative`, `non-aggregative`, and `meta-learning` groups. +Most methods included in QuaPy at the moment are of type `aggregative` +(though we plan to add many more methods in the near future), i.e., +they are methods characterized by the fact that +quantification is performed as an aggregation function of the individual +products of classification.
+
+Any quantifier in QuaPy should extend the class `BaseQuantifier`, +and implement some abstract methods:
+```python
+    @abstractmethod
+    def fit(self, X, y): ...
+
+    @abstractmethod
+    def predict(self, X): ...
+```
+The meaning of those functions should be familiar to anyone +used to working with scikit-learn, since the class structure of QuaPy +is directly inspired by scikit-learn's _Estimators_. Functions +`fit` and `predict` (for which there is an alias `quantify`) +are used to train the model and to produce +class prevalence estimates. +Quantifiers also extend from scikit-learn's `BaseEstimator`, in order +to simplify the use of `set_params` and `get_params` used in +[model selection](./model-selection).
+
+## Aggregative Methods
+
+All quantification methods are implemented as part of the +`qp.method` package.
In particular, `aggregative` methods are defined in +`qp.method.aggregative`, and extend `AggregativeQuantifier(BaseQuantifier)`. +The methods that any `aggregative` quantifier must implement are:
+
+```python
+    @abstractmethod
+    def aggregation_fit(self, classif_predictions, labels): ...
+
+    @abstractmethod
+    def aggregate(self, classif_predictions): ...
+```
+
+The argument `classif_predictions` is whatever the method `classify` returns. +QuaPy comes with default implementations that cover the most common cases, but you can +override `classify` in case your method requires further or different information to work.
+
+These two functions replace the `fit` and `predict` methods, which +come with default implementations. For instance, the `fit` function is +provided and amounts to:
+
+```python
+    def fit(self, X, y):
+        self._check_init_parameters()
+        classif_predictions, labels = self.classifier_fit_predict(X, y)
+        self.aggregation_fit(classif_predictions, labels)
+        return self
+```
+
+Note that this function fits the classifier and generates the predictions. This is assumed +to be a routine common to all aggregative quantifiers, and is provided by QuaPy. What remains +is to define the `aggregation_fit` function, which takes as input the classifier predictions +and the original training data (the latter is typically unused). The classifier predictions +can be:
+- confidence scores: quantifiers inheriting directly from `AggregativeQuantifier`
+- crisp predictions: quantifiers inheriting from `AggregativeCrispQuantifier`
+- posterior probabilities: quantifiers inheriting from `AggregativeSoftQuantifier`
+- _anything_: custom quantifiers overriding the `classify` method
+
+Note also that the `fit` method calls `_check_init_parameters`; this function is meant to be +overridden (if needed) and allows the method to quickly raise an exception based on any inconsistency +found in the `__init__` arguments, thus avoiding a late failure after training the classifier and generating +the predictions.
+
+Similarly, the function `predict` (alias `quantify`) is provided, and amounts to:
+
+```python
+def predict(self, X):
+    classif_predictions = self.classify(X)
+    return self.aggregate(classif_predictions)
+```
+
+in which only the function `aggregate` is required to be overridden in most cases.
+
+Aggregative quantifiers are expected to maintain a classifier (which is +accessed through the `@property` `classifier`). This classifier is +given as input to the quantifier, and will be trained by the quantifier's fit (default). +Alternatively, the classifier may already be fit on external data; in this case, the `fit_classifier` +argument in the `__init__` should be set to False (see [4.using_pretrained_classifier.py](https://github.com/HLT-ISTI/QuaPy/blob/master/examples/4.using_pretrained_classifier.py) +for a full code example).
+
+The above patterns (in training: (i) fit the classifier, then (ii) fit the aggregation; +in test: (i) classify, then (ii) aggregate) allow QuaPy to optimize many internal procedures, +on the grounds that steps (i) are slower than steps (ii). +In particular, the model selection routine takes advantage of this two-step process +and generates classifiers only for the valid combinations of hyperparameters of the +classifier, and then _clones_ these classifiers and explores the combinations +of hyperparameters that are specific to the quantifier (this can result in huge +time savings).
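+To make the two-step pattern concrete, the following is a minimal sketch (not part of QuaPy) of a custom CC-like aggregative quantifier that only defines the two abstract steps; the class name `MyCC` is illustrative:
+
+```python
+import quapy.functional as F
+from quapy.method.aggregative import AggregativeCrispQuantifier
+
+class MyCC(AggregativeCrispQuantifier):
+    """Counts the fraction of crisp predictions assigned to each class."""
+
+    def __init__(self, classifier):
+        self.classifier = classifier
+
+    def aggregation_fit(self, classif_predictions, labels):
+        pass  # plain counting requires no aggregation parameters
+
+    def aggregate(self, classif_predictions):
+        # prevalence = relative frequency of each predicted label
+        return F.prevalence_from_labels(classif_predictions, self.classifier.classes_)
+```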
+Concerning the inference phase, this two-step process allows the evaluation of many +standard protocols (e.g., the [artificial sampling protocol](./evaluation)) to be +carried out very efficiently. The reason is that the entire set can be pre-classified +once, and the quantification estimations for different samples can directly +reuse these predictions, without having to classify each element every time. +QuaPy leverages this property to speed up any procedure having to do with +quantification over samples, as is customarily done in model selection or +in evaluation.
+
+### The Classify & Count variants
+
+QuaPy implements the four CC variants, i.e.:
+
+* _CC_ (Classify & Count), the simplest aggregative quantifier; one that + simply relies on the label predictions of a classifier to deliver class estimates.
+* _ACC_ (Adjusted Classify & Count), the adjusted variant of CC.
+* _PCC_ (Probabilistic Classify & Count), the probabilistic variant of CC that +relies on the soft estimations (or posterior probabilities) returned by a (probabilistic) classifier.
+* _PACC_ (Probabilistic Adjusted Classify & Count), the adjusted variant of PCC.
+
+The following code serves as a complete example using CC equipped +with an SVM as the classifier:
+
+```python
+import quapy as qp
+import quapy.functional as F
+from sklearn.svm import LinearSVC
+
+training, test = qp.datasets.fetch_twitter('hcr', pickle=True).train_test
+Xtr, ytr = training.Xy
+
+# instantiate a classifier learner, in this case an SVM
+svm = LinearSVC()
+
+# instantiate a Classify & Count with the SVM
+# (an alias is available in qp.method.aggregative.ClassifyAndCount)
+model = qp.method.aggregative.CC(svm)
+model.fit(Xtr, ytr)
+estim_prevalence = model.predict(test.instances)
+```
+
+The same code could be used to instantiate an ACC, by simply replacing +the instantiation of the model with:
+```python
+model = qp.method.aggregative.ACC(svm)
+```
+Note that the adjusted variants (ACC and PACC) need to estimate +some parameters for performing the adjustment (e.g., the +_true positive rate_ and the _false positive rate_ in the case of +binary classification); these are estimated on a validation split +of the labelled set. To this aim, the `__init__` method of +ACC defines an additional parameter, `val_split`. If this parameter +is set to a float in [0,1] representing a fraction (e.g., 0.4), +then that fraction of labelled data (e.g., 40%) +will be used for estimating the parameters for adjusting the +predictions. This parameter can also be set to an integer, +indicating that the parameters should be estimated by means of +_k_-fold cross-validation, for which the integer indicates the +number _k_ of folds (the default value is 5). Finally, `val_split` can be set to a +specific held-out validation set (i.e., a tuple `(X,y)`).
+
+The following code illustrates the case in which PCC is used:
+
+```python
+model = qp.method.aggregative.PCC(svm)
+model.fit(Xtr, ytr)
+estim_prevalence = model.predict(test.X)
+print('classifier:', model.classifier)
+```
+In this case, QuaPy will print:
+```
+The learner LinearSVC does not seem to be probabilistic. The learner will be calibrated.
+classifier: CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)
+```
+The first output indicates that the learner (`LinearSVC` in this case) +is not a probabilistic classifier (i.e., it does not implement the +`predict_proba` method) and so, the classifier will be converted to +a probabilistic one through [calibration](https://scikit-learn.org/stable/modules/calibration.html). +As a result, the classifier that is printed in the second line points +to a `CalibratedClassifierCV` instance. Note that calibration can only +be applied to hard classifiers if `fit_classifier=True`; an exception +will be raised otherwise.
+
+Lastly, everything we said about ACC and PCC +applies to PACC as well.
+
+_New in v0.1.9_: quantifiers ACC and PACC now have three additional arguments: `method`, `solver` and `norm`:
+
+* Argument `method` specifies how to solve, for `p`, the linear system `q = Mp` (where `q` is the vector of unadjusted counts for the +test sample, `M` contains the class-conditional unadjusted counts --i.e., the misclassification rates-- and `p` is the +sought prevalence vector):
+    * option `"inversion"`: attempts to invert matrix `M`, thus solving `Minv q = p`. In degenerate cases, this + inversion may not exist. In such cases, the method defaults to returning `q` (the unadjusted counts)
+    * option `"invariant-ratio"` uses the invariant ratio estimator system proposed in Remark 5 of +[Vaz, A.F., Izbicki F. and Stern, R.B. "Quantification Under Prior Probability Shift: the Ratio Estimator +and its Extensions", in Journal of Machine Learning Research 20 (2019)](https://jmlr.csail.mit.edu/papers/volume20/18-456/18-456.pdf).
+
+* Argument `solver` specifies how to solve the linear system.
+    * `"exact-raise"` solves the system of linear equations and raises an exception if the system is not solvable
+    * `"exact-cc"` returns the original unadjusted count if the system is not solvable
+    * `"minimize"` minimizes the L2 norm of `|Mp-q|`. This one generally works better, and is the + default option. More details about this can be consulted in + [Bunse, M. "On Multi-Class Extensions of Adjusted Classify and Count", + on proceedings of the 2nd International Workshop on Learning to Quantify: Methods and Applications (LQ 2022), + ECML/PKDD 2022, Grenoble (France)](https://lq-2022.github.io/proceedings/CompleteVolume.pdf).
+
+* Argument `norm` specifies how to normalize the estimate `p` when the vector lies outside of the probability simplex. +Options are:
+    * `"clip"`, which clips the values to the range `[0, 1]` and then L1-normalizes the vector
+    * `"mapsimplex"`, which projects the results onto the probability simplex, as proposed by Vaz et al. in + [Remark 5 of Vaz, et al. (2019)](https://jmlr.csail.mit.edu/papers/volume20/18-456/18-456.pdf). This implementation + relies on [Mathieu Blondel's `projection_simplex_sort`](https://gist.github.com/mblondel/6f3b7aaad90606b98f71)
+    * `"condsoftmax"`, which applies softmax normalization only if the prevalence vector lies outside of the probability simplex.
+
+
+#### BayesianCC
+
+The `BayesianCC` is a variant of ACC introduced in +[Ziegler, A. and Czyż, P. "Bayesian quantification with black-box estimators", arXiv (2023)](https://arxiv.org/abs/2302.09159), +which models the probabilities `q = Mp` using latent random variables with weak Bayesian priors, rather than +plug-in probability estimates. In particular, it uses Markov Chain Monte Carlo sampling to find the values of +`p` compatible with the observed quantities.
+
+
+#### BayesianCC
+
+The `BayesianCC` is a variant of ACC introduced in
+[Ziegler, A. and Czyż, P. "Bayesian quantification with black-box estimators", arXiv (2023)](https://arxiv.org/abs/2302.09159),
+which models the probabilities `q = Mp` using latent random variables with weak Bayesian priors, rather than
+plug-in probability estimates. In particular, it uses Markov Chain Monte Carlo sampling to find the values of
+`p` compatible with the observed quantities.
+The `aggregate` method returns the posterior mean, and the `get_prevalence_samples` method can be used to quantify
+the uncertainty around the `p` estimates (conditional on the observed data and the trained classifier);
+the method is suitable for problems in which the matrix `M` is nearly non-invertible.
+
+Note that this quantification method requires `val_split` to be a `float`, as well as the installation of additional dependencies (`$ pip install quapy[bayes]`) needed to run Markov chain Monte Carlo sampling. Markov Chain Monte Carlo is slower than matrix inversion methods, but is guaranteed to sample proper probability vectors, so no clipping strategies are required.
+An example presenting how to run the method and use posterior samples is available in `examples/bayesian_quantification.py`.
+
+### Expectation Maximization (EMQ)
+
+The Expectation Maximization Quantifier (EMQ), also known as
+SLD, is available at `qp.method.aggregative.EMQ` or via the
+alias `qp.method.aggregative.ExpectationMaximizationQuantifier`.
+The method is described in:
+
+_Saerens, M., Latinne, P., and Decaestecker, C. (2002). Adjusting the outputs of a classifier
+to new a priori probabilities: A simple procedure. Neural Computation, 14(1):21–41._
+
+EMQ works with a probabilistic classifier (if the classifier
+given as input is a hard one, a calibration will be attempted).
+Although this method was originally proposed for improving the
+posterior probabilities of a probabilistic classifier, and not
+for improving the estimation of prior probabilities, EMQ almost
+always ranks among the most effective quantifiers in the
+experiments we have carried out.
+
+An example of use can be found below:
+
+```python
+import quapy as qp
+from sklearn.linear_model import LogisticRegression
+
+train, test = qp.datasets.fetch_twitter('hcr', pickle=True).train_test
+
+model = qp.method.aggregative.EMQ(LogisticRegression())
+model.fit(*train.Xy)
+estim_prevalence = model.predict(test.X)
+```
+
+EMQ accepts additional parameters in the constructor:
+* `exact_train_prev`: set to True for using the true training prevalence as the starting
+prevalence estimate (default behaviour), or to False for using an approximation of it as
+suggested by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html)
+* `calib`: allows one to indicate a calibration method, among those
+proposed by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html),
+including Bias-Corrected Temperature Scaling
+(`bcts`), Vector Scaling (`vs`), No-Bias Vector Scaling (`nbvs`),
+or Temperature Scaling (`ts`); default is `None` (no calibration).
+* `on_calib_error`: indicates the policy to follow in case the calibrator fails at runtime.
+    Options include `raise` (default), in which case an exception is raised; and `backup`, in which
+    case the calibrator is silently skipped.
+
+You can use the class method `EMQ_BCTS` to effortlessly instantiate EMQ with the best-performing
+heuristics found by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html). See the API documentation for further details.
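+
+For instance, a minimal sketch combining the parameters documented above:
+
+```python
+model = qp.method.aggregative.EMQ(
+    LogisticRegression(),
+    exact_train_prev=False,    # start from Alexandari et al.'s approximation
+    calib='bcts',              # apply Bias-Corrected Temperature Scaling
+    on_calib_error='backup'    # silently skip calibration if it fails at runtime
+)
+```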
+
+
+### Hellinger Distance y (HDy)
+
+Implementation of the method based on the Hellinger Distance y (HDy) proposed by
+[González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
+estimation based on the Hellinger distance. Information Sciences, 218:146-164.](https://www.sciencedirect.com/science/article/pii/S0020025512004069)
+
+It is implemented in `qp.method.aggregative.HDy` (also accessible
+through the alias `qp.method.aggregative.HellingerDistanceY`).
+This method works with a probabilistic classifier (hard classifiers
+can be used as well and will be calibrated) and requires a validation
+set to estimate the parameters of the mixture model. Just like
+ACC and PACC, this quantifier receives a `val_split` argument
+in the constructor that can either be a float indicating the proportion
+of training data to be taken as the validation set (in a random
+stratified split), or the validation set itself (i.e., a tuple
+`(X,y)`).
+
+HDy was proposed for binary problems, and the implementation
+provided in QuaPy accepts only binary datasets.
+
+The following code shows an example of use:
+
+```python
+import quapy as qp
+from sklearn.linear_model import LogisticRegression
+
+# load a binary dataset
+dataset = qp.datasets.fetch_reviews('hp', pickle=True)
+qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
+
+model = qp.method.aggregative.HDy(LogisticRegression())
+model.fit(*dataset.training.Xy)
+estim_prevalence = model.predict(dataset.test.X)
+```
+
+QuaPy also provides an implementation of the generalized
+"Distribution Matching" approaches for multiclass, inspired by the framework
+of [Firat (2016)](https://arxiv.org/abs/1606.00868). One can instantiate
+a variant of HDy for multiclass quantification as follows:
+
+```python
+multiclassHDy = qp.method.aggregative.DMy(classifier=LogisticRegression(), divergence='HD', cdf=False)
+```
+
+QuaPy also provides an implementation of the "DyS"
+framework proposed by [Maletzke et al. (2019)](https://ojs.aaai.org/index.php/AAAI/article/view/4376)
+and the "SMM" method proposed by [Hassan et al. (2019)](https://ieeexplore.ieee.org/document/9260028)
+(thanks to _Pablo González_ for the contributions!).
+
+### Threshold Optimization methods
+
+QuaPy implements Forman's threshold optimization methods;
+see, e.g., [(Forman 2006)](https://dl.acm.org/doi/abs/10.1145/1150402.1150423)
+and [(Forman 2008)](https://link.springer.com/article/10.1007/s10618-008-0097-y).
+These include: `T50`, `MAX`, `X`, Median Sweep (`MS`), and its variant `MS2`.
+
+These methods are binary-only and implement different heuristics for
+improving the stability of the denominator of the ACC adjustment (`tpr-fpr`).
+They are called "threshold" methods because these heuristics amount to
+different choices of the underlying classifier's decision threshold.
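+
+For instance, a minimal sketch (assuming, as per the API documentation, that these
+classes are exposed in `qp.method.aggregative`; `Xtr`, `ytr` and `test` stand for any
+binary training and test data):
+
+```python
+from sklearn.linear_model import LogisticRegression
+
+model = qp.method.aggregative.MS2(LogisticRegression())
+model.fit(Xtr, ytr)
+estim_prevalence = model.predict(test.X)
+```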
+
+### Explicit Loss Minimization
+
+Explicit Loss Minimization (ELM) represents a family of methods
+based on structured output learning, i.e., quantifiers relying on
+classifiers that have been optimized targeting a
+quantification-oriented evaluation measure.
+The original methods are implemented in QuaPy as classify & count (CC)
+quantifiers that use Joachims' [SVMperf](https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html)
+as the underlying classifier, properly set to optimize for the desired loss.
+
+In QuaPy, this is achieved by calling the functions:
+
+* `newSVMQ`: returns the quantification method called SVM(Q) that optimizes for the metric _Q_ defined
+in [_Barranquero, J., Díez, J., and del Coz, J. J. (2015). Quantification-oriented learning based
+on reliable classifiers. Pattern Recognition, 48(2):591–604._](https://www.sciencedirect.com/science/article/pii/S003132031400291X)
+* `newSVMKLD` and `newSVMNKLD`: return the quantification methods SVM(KLD) and SVM(nKLD), standing for
+    Kullback-Leibler Divergence and Normalized Kullback-Leibler Divergence, as proposed in [_Esuli, A. and Sebastiani, F. (2015).
+    Optimizing text quantifiers for multivariate loss functions.
+    ACM Transactions on Knowledge Discovery from Data, 9(4):Article 27._](https://dl.acm.org/doi/abs/10.1145/2700406)
+* `newSVMAE` and `newSVMRAE`: return the quantification methods SVM(AE) and SVM(RAE), which optimize for the (Mean) Absolute Error and for the
+    (Mean) Relative Absolute Error, respectively, as first used by
+    [_Moreo, A. and Sebastiani, F. (2022). Tweet sentiment quantification: An experimental re-evaluation. PLOS ONE 17 (9), 1-23._](https://arxiv.org/abs/2011.02552)
+
+The last two methods (SVM(AE) and SVM(RAE)) have been implemented in
+QuaPy in order to make available ELM variants for what nowadays
+are considered the most well-behaved evaluation metrics in quantification.
+
+In order to make these models work, you would need to run the script
+`prepare_svmperf.sh` (distributed along with QuaPy) that
+downloads `SVMperf`'s source code, applies a patch that
+implements the quantification-oriented losses, and compiles the
+sources.
+
+If you want to add any custom loss, you would need to modify
+the source code of `SVMperf` in order to implement it, and
+assign a valid loss code to it. Then you must re-compile
+the whole thing and instantiate the quantifier in QuaPy
+as follows:
+
+```python
+# you can either set the path to your custom svm_perf_quantification implementation
+# in the environment variable, or as an argument to the constructor of ELM
+qp.environ['SVMPERF_HOME'] = './path/to/svm_perf_quantification'
+
+# assign an alias to your custom loss and the id you have assigned to it
+svmperf = qp.classification.svmperf.SVMperf
+svmperf.valid_losses['mycustomloss'] = 28
+
+# instantiate the ELM method indicating the loss
+model = qp.method.aggregative.ELM(loss='mycustomloss')
+```
+
+All ELM methods are binary quantifiers, since they rely on `SVMperf`, which
+currently supports only binary classification.
+ELM variants (and any binary quantifier in general) can be trivially extended
+to operate in single-label scenarios by adopting a
+"one-vs-all" strategy (as, e.g., in
+[_Gao, W. and Sebastiani, F. (2016). From classification to quantification in tweet sentiment
+analysis. Social Network Analysis and Mining, 6(19):1–22_](https://link.springer.com/article/10.1007/s13278-016-0327-z)).
+In QuaPy this is possible by using the `OneVsAll` class.
+
+There are two ways of instantiating this class: `OneVsAllGeneric`, which works for
+any quantifier, and `OneVsAllAggregative`, which is optimized for aggregative quantifiers.
+In general, you can simply use the `newOneVsAll` function and QuaPy will choose
+the more convenient of the two.
+
+```python
+import quapy as qp
+from quapy.method.aggregative import SVMQ
+from quapy.method.base import newOneVsAll  # location of newOneVsAll may vary across versions
+
+# load a single-label dataset (this one contains 3 classes)
+dataset = qp.datasets.fetch_twitter('hcr', pickle=True)
+
+# let qp know where svmperf is
+qp.environ['SVMPERF_HOME'] = '../svm_perf_quantification'
+
+model = newOneVsAll(SVMQ(), n_jobs=-1)  # run the binary quantifiers in parallel
+model.fit(*dataset.training.Xy)
+estim_prevalence = model.predict(dataset.test.instances)
+```
+
+Check the examples on [explicit loss minimization](https://github.com/HLT-ISTI/QuaPy/blob/devel/examples/17.explicit_loss_minimization.py)
+and on [one versus all quantification](https://github.com/HLT-ISTI/QuaPy/blob/devel/examples/10.one_vs_all.py) for more details.
+**Note** that the _one versus all_ approach is considered inappropriate under prior probability shift.
+
+### Kernel Density Estimation methods (KDEy)
+
+QuaPy provides implementations of the three variants
+of KDE-based methods proposed in
+_[Moreo, A., González, P. and del Coz, J.J..
+Kernel Density Estimation for Multiclass Quantification.
+Machine Learning. Vol 114 (92), 2025](https://link.springer.com/article/10.1007/s10994-024-06726-5)_
+(a [preprint](https://arxiv.org/abs/2401.00490) is available online).
+The variants differ in the divergence metric to be minimized:
+
+- KDEy-HD: minimizes the (squared) Hellinger Distance and solves the problem via a Monte Carlo approach
+- KDEy-CS: minimizes the Cauchy-Schwarz divergence and solves the problem via a closed-form solution
+- KDEy-ML: minimizes the Kullback-Leibler divergence and solves the problem via maximum likelihood
+
+These methods are specifically devised for multiclass problems (although they can tackle
+binary problems too).
+
+All KDE-based methods depend on the hyperparameter `bandwidth` of the kernel. Typical values
+that can be explored in model selection lie in the range [0.01, 0.25]. Previous experiments reveal that
+the methods' performance varies smoothly under small variations of this hyperparameter.
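+
+As a minimal sketch (assuming the three variants are exposed as `KDEyML`, `KDEyHD`
+and `KDEyCS`, as listed in the API documentation of `quapy.method._kdey`):
+
+```python
+from sklearn.linear_model import LogisticRegression
+
+# KDEy-ML with a Gaussian kernel of bandwidth 0.1; KDEyHD and KDEyCS are analogous
+model = qp.method.aggregative.KDEyML(LogisticRegression(), bandwidth=0.1)
+```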
+
+
+## Composable Methods
+
+The `quapy.method.composable` module integrates [qunfold](https://github.com/mirkobunse/qunfold), which allows the composition
+of quantification methods from loss functions and feature transformations (thanks to Mirko Bunse for the integration!).
+
+Any composed method solves a linear system of equations by minimizing the loss after transforming the data. Methods of this kind include ACC, PACC, HDx, HDy, and many other well-known methods, as well as an unlimited number of re-combinations of their building blocks.
+
+### Installation
+
+```sh
+pip install --upgrade pip setuptools wheel
+pip install "jax[cpu]"
+pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5"
+```
+
+**Note:** since version 0.2.0, QuaPy is only compatible with qunfold >=0.1.5.
+
+### Basics
+
+The composition of a method is implemented through the [](quapy.method.composable.ComposableQuantifier) class. Its documentation also features an example to get you started in composing your own methods.
+
+```python
+from sklearn.ensemble import RandomForestClassifier
+from quapy.method.composable import (
+    ComposableQuantifier,
+    TikhonovRegularized,
+    LeastSquaresLoss,
+    ClassRepresentation,
+)
+
+ComposableQuantifier(  # ordinal ACC, as proposed by Bunse et al., 2022
+    TikhonovRegularized(LeastSquaresLoss(), 0.01),
+    ClassRepresentation(RandomForestClassifier(oob_score=True))
+)
+```
+
+More exhaustive examples of method compositions, including hyper-parameter optimization, can be found in [the example directory](https://github.com/HLT-ISTI/QuaPy/tree/master/examples).
+
+To implement your own loss functions and feature representations, follow the corresponding manual of the [qunfold package](https://github.com/mirkobunse/qunfold), which provides the back-end of QuaPy's composable module.
+
+### Loss functions
+
+- [](quapy.method.composable.LeastSquaresLoss)
+- [](quapy.method.composable.EnergyLoss)
+- [](quapy.method.composable.HellingerSurrogateLoss)
+- [](quapy.method.composable.BlobelLoss)
+- [](quapy.method.composable.CombinedLoss)
+
+```{hint}
+You can use the [](quapy.method.composable.CombinedLoss) to create arbitrary, weighted sums of losses and regularizers.
+```
+
+### Regularization functions
+
+- [](quapy.method.composable.TikhonovRegularized)
+- [](quapy.method.composable.TikhonovRegularization)
+
+### Feature transformations
+
+- [](quapy.method.composable.ClassRepresentation)
+- [](quapy.method.composable.DistanceRepresentation)
+- [](quapy.method.composable.HistogramRepresentation)
+- [](quapy.method.composable.EnergyKernelRepresentation)
+- [](quapy.method.composable.GaussianKernelRepresentation)
+- [](quapy.method.composable.LaplacianKernelRepresentation)
+- [](quapy.method.composable.GaussianRFFKernelRepresentation)
+
+```{hint}
+The [](quapy.method.composable.ClassRepresentation) requires the classifier to have a property `oob_score==True` and to produce a property `oob_decision_function` during fitting. In [scikit-learn](https://scikit-learn.org/), this requirement is fulfilled by any bagging classifier, such as random forests. Any other classifier needs to be cross-validated through the [](quapy.method.composable.CVClassifier).
+```
+
+
+## Meta Models
+
+By _meta_ models we mean quantification methods that are defined on top of other
+quantification methods, and that thus belong neither to the aggregative nor
+to the non-aggregative group (indeed, _meta_ models could use quantifiers from any of those
+groups).
+_Meta_ models are implemented in the `qp.method.meta` module.
+
+### Ensembles
+
+QuaPy implements (some of) the variants proposed in:
+
+* [_Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
+Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
+Information Fusion, 34, 87-100._](https://www.sciencedirect.com/science/article/pii/S1566253516300628)
+* [_Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
+    Dynamic ensemble selection for quantification tasks.
+    Information Fusion, 45, 1-15._](https://www.sciencedirect.com/science/article/pii/S1566253517303652)
+
+The following code shows how to instantiate an Ensemble of 30 _Adjusted Classify & Count_ (ACC)
+quantifiers operating with a _Logistic Regressor_ (LR) as the base classifier, and using the
+_average_ as the aggregation policy (see the original article for further details).
+The last parameter indicates that all processors should be used for parallelization.
+
+```python
+import quapy as qp
+from quapy.method.aggregative import ACC
+from quapy.method.meta import Ensemble
+from sklearn.linear_model import LogisticRegression
+
+dataset = qp.datasets.fetch_UCIBinaryDataset('haberman')
+train, test = dataset.train_test
+
+model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='ave', n_jobs=-1)
+model.fit(*train.Xy)
+estim_prevalence = model.predict(test.X)
+```
+
+Other aggregation policies implemented in QuaPy include:
+* 'ptr' for applying a dynamic selection based on the training prevalence of the ensemble's members
+* 'ds' for applying a dynamic selection based on the Hellinger Distance
+* _any valid quantification measure_ (e.g., 'mse') for performing a static selection based on
+the performance estimated for each member of the ensemble in terms of that evaluation metric.
+
+When using any of the above options, it is important to set the `red_size` parameter, which
+informs of the number of members to retain.
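+
+For instance, a minimal sketch (the values of `policy` and `red_size` are illustrative):
+
+```python
+# static selection: retain the 10 members that score best in terms of MSE
+model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='mse', red_size=10, n_jobs=-1)
+```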
+
+Please check the [model selection manual](./model-selection) if you want to optimize the hyperparameters of the ensemble for classification or quantification.
+
+### The QuaNet neural network
+
+QuaPy offers an implementation of QuaNet, a deep learning model presented in:
+
+[_Esuli, A., Moreo, A., & Sebastiani, F. (2018, October).
+A recurrent neural network for sentiment quantification.
+In Proceedings of the 27th ACM International Conference on
+Information and Knowledge Management (pp. 1775-1778)._](https://dl.acm.org/doi/abs/10.1145/3269206.3269287)
+
+This model requires `torch` to be installed.
+QuaNet also requires a classifier that can provide embedded representations
+of the inputs.
+In the original paper, QuaNet was tested using an LSTM as the base classifier.
+In the following example, we show an instantiation of QuaNet that instead uses a CNN as a probabilistic classifier, taking its last layer representation as the document embedding:
+
+```python
+import quapy as qp
+from quapy.method.meta import QuaNet
+from quapy.classification.neural import NeuralClassifierTrainer, CNNnet
+
+# use samples of 100 elements
+qp.environ['SAMPLE_SIZE'] = 100
+
+# load the kindle dataset as text, and convert words to numerical indexes
+dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
+qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
+
+# the text classifier is a CNN trained by NeuralClassifierTrainer
+cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
+learner = NeuralClassifierTrainer(cnn, device='cuda')
+
+# train QuaNet
+model = QuaNet(learner, device='cuda')
+model.fit(*dataset.training.Xy)
+estim_prevalence = model.predict(dataset.test.X)
+```
+
+## Confidence Regions for Class Prevalence Estimation
+
+_(New in v0.2.0!)_ Some quantification methods go beyond providing a single point estimate of class prevalence values and also produce confidence regions, which characterize the uncertainty around the point estimate. In QuaPy, two such methods are currently implemented:
+
+* Aggregative Bootstrap: The Aggregative Bootstrap method extends any aggregative quantifier by generating confidence regions for class prevalence estimates through bootstrapping. The method is described in the paper [Moreo, A., Salvati, N.
+    An Efficient Method for Deriving Confidence Intervals in Aggregative Quantification.
+    Learning to Quantify: Methods and Applications (LQ 2025), co-located at ECML-PKDD 2025.
+    pp 12-33, Porto (Portugal)](https://lq-2025.github.io/proceedings/CompleteVolume.pdf). Key features of this method include:
+
+    * Optimized Computation: The bootstrap is applied to pre-classified instances, which significantly speeds up training and inference.
+    During training, bootstrap repetitions are performed only after training the classifier once; these repetitions are used to train multiple aggregation functions.
+    During inference, bootstrap is applied over pre-classified test instances.
+    * General Applicability: Aggregative Bootstrap can be applied to any aggregative quantifier.
+    For further information, check the [example](https://github.com/HLT-ISTI/QuaPy/tree/master/examples/16.confidence_regions.py) provided.
+
+* BayesianCC: a Bayesian variant of the Adjusted Classify & Count (ACC) quantifier; see more details in the [example](https://github.com/HLT-ISTI/QuaPy/tree/master/examples/14.bayesian_quantification.py) provided.
+
+Confidence regions are constructed around a point estimate, which is typically computed as the mean value of a set of samples.
+The confidence region can be instantiated in three ways:
+* Confidence intervals: standard confidence intervals generated for each class independently (_method="intervals"_).
+* Confidence ellipse in the simplex: an ellipse constructed around the mean point; the ellipse lies on the simplex and takes
+    into account possible inter-class dependencies in the data (_method="ellipse"_).
+* Confidence ellipse in the Centered-Log Ratio (CLR) space: the underlying assumption of the ellipse is that the components are
+    normally distributed. However, we know that elements of the simplex have an inner structure. A better approach is to first
+    transform the components into an unconstrained space (the CLR), and then construct the ellipse in that space (_method="ellipse-clr"_).
\ No newline at end of file
diff --git a/docs/build/html/_sources/Model-Selection.md.txt b/docs/source/manuals/model-selection.md
similarity index 88%
rename from docs/build/html/_sources/Model-Selection.md.txt
rename to docs/source/manuals/model-selection.md
index 1df9107..6470ebf 100644
--- a/docs/build/html/_sources/Model-Selection.md.txt
+++ b/docs/source/manuals/model-selection.md
@@ -33,18 +33,18 @@ of scenarios exhibiting different degrees of prior probability shift.
 The class
 _qp.model_selection.GridSearchQ_
 implements a grid-search exploration over the space of
-hyper-parameter combinations that [evaluates](https://github.com/HLT-ISTI/QuaPy/wiki/Evaluation)
+hyper-parameter combinations that [evaluates](./evaluation)
 each combination of hyper-parameters by means of a given quantification-oriented
 error metric (e.g., any of the error functions implemented
 in _qp.error_) and according to a
-[sampling generation protocol](https://github.com/HLT-ISTI/QuaPy/wiki/Protocols).
+[sampling generation protocol](./protocols).
 
 The following is an example (also included in the examples folder) of model selection for quantification:
 
 ```python
 import quapy as qp
 from quapy.protocol import APP
-from quapy.method.aggregative import DistributionMatching
+from quapy.method.aggregative import DMy
 from sklearn.linear_model import LogisticRegression
 import numpy as np
 
@@ -52,7 +52,7 @@ import numpy as np
 In this example, we show how to perform model selection on a DistributionMatching quantifier.
""" -model = DistributionMatching(LogisticRegression()) +model = DMy(LogisticRegression()) qp.environ['SAMPLE_SIZE'] = 100 qp.environ['N_JOBS'] = -1 # explore hyper-parameters in parallel @@ -76,7 +76,7 @@ protocol = APP(validation) # in order to let the quantifier know this hyper-parameter belongs to its underlying # classifier. param_grid = { - 'classifier__C': np.logspace(-3,3,7), + 'classifier__C': np.logspace(-3, 3, 7), 'nbins': [8, 16, 32, 64], } @@ -85,9 +85,9 @@ model = qp.model_selection.GridSearchQ( param_grid=param_grid, protocol=protocol, error='mae', # the error to optimize is the MAE (a quantification-oriented loss) - refit=True, # retrain on the whole labelled set once done + refit=True, # retrain on the whole labelled set once done verbose=True # show information as the process goes on -).fit(training) +).fit(*training.Xy) print(f'model selection ended: best hyper-parameters={model.best_params_}') model = model.best_model_ @@ -114,11 +114,6 @@ model selection ended: best hyper-parameters={'classifier__C': 100.0, 'nbins': 3 MAE=0.03102 ``` -The parameter _val_split_ can alternatively be used to indicate -a validation set (i.e., an instance of _LabelledCollection_) instead -of a proportion. This could be useful if one wants to have control -on the specific data split to be used across different model selection -experiments. ## Targeting a Classification-oriented loss @@ -138,7 +133,7 @@ learner = GridSearchCV( LogisticRegression(), param_grid={'C': np.logspace(-4, 5, 10), 'class_weight': ['balanced', None]}, cv=5) -model = DistributionMatching(learner).fit(dataset.training) +model = DistributionMatching(learner).fit(*dataset.train.Xy) ``` However, this is conceptually flawed, since the model should be diff --git a/docs/build/html/_images/bin_bias.png b/docs/source/manuals/plots/bin_bias.png similarity index 100% rename from docs/build/html/_images/bin_bias.png rename to docs/source/manuals/plots/bin_bias.png diff --git a/docs/build/html/_images/bin_bias_bin_cc.png b/docs/source/manuals/plots/bin_bias_bin_cc.png similarity index 100% rename from docs/build/html/_images/bin_bias_bin_cc.png rename to docs/source/manuals/plots/bin_bias_bin_cc.png diff --git a/docs/build/html/_images/bin_bias_cc.png b/docs/source/manuals/plots/bin_bias_cc.png similarity index 100% rename from docs/build/html/_images/bin_bias_cc.png rename to docs/source/manuals/plots/bin_bias_cc.png diff --git a/docs/build/html/_images/bin_diag.png b/docs/source/manuals/plots/bin_diag.png similarity index 100% rename from docs/build/html/_images/bin_diag.png rename to docs/source/manuals/plots/bin_diag.png diff --git a/docs/build/html/_images/bin_diag_cc.png b/docs/source/manuals/plots/bin_diag_cc.png similarity index 100% rename from docs/build/html/_images/bin_diag_cc.png rename to docs/source/manuals/plots/bin_diag_cc.png diff --git a/docs/build/html/_images/err_drift.png b/docs/source/manuals/plots/err_drift.png similarity index 100% rename from docs/build/html/_images/err_drift.png rename to docs/source/manuals/plots/err_drift.png diff --git a/docs/build/html/_sources/Plotting.md.txt b/docs/source/manuals/plotting.md similarity index 94% rename from docs/build/html/_sources/Plotting.md.txt rename to docs/source/manuals/plotting.md index 99f3f7e..67f9f16 100644 --- a/docs/build/html/_sources/Plotting.md.txt +++ b/docs/source/manuals/plotting.md @@ -2,6 +2,9 @@ The module _qp.plot_ implements some basic plotting functions that can help analyse the performance of a quantification method. 
+See the provided +[code example](https://github.com/HLT-ISTI/QuaPy/blob/master/examples/13.plotting.py) +for a full example. All plotting functions receive as inputs the outcomes of some experiments and include, for each experiment, @@ -43,7 +46,7 @@ quantification methods across different scenarios showcasing the accuracy of the quantifier in predicting class prevalences for a wide range of prior distributions. This can easily be achieved by means of the -[artificial sampling protocol](https://github.com/HLT-ISTI/QuaPy/wiki/Protocols) +[artificial sampling protocol](./protocols) that is implemented in QuaPy. The following code shows how to perform one simple experiment @@ -77,7 +80,7 @@ def gen_data(): method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], [] for method_name, model in models(): - model.fit(train) + model.fit(*train.Xy) true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0)) method_names.append(method_name) @@ -113,7 +116,7 @@ are '.png' or '.pdf'). If this path is not provided, then the plot will be shown but not saved. The resulting plot should look like: -![diagonal plot on Kindle](./wiki_examples/selected_plots/bin_diag.png) +![diagonal plot on Kindle](./plots/bin_diag.png) Note that in this case, we are also indicating the training prevalence, which is plotted in the diagonal a as cyan dot. @@ -138,7 +141,7 @@ qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./pl and should look like: -![bias plot on Kindle](./wiki_examples/selected_plots/bin_bias.png) +![bias plot on Kindle](./plots/bin_bias.png) The box plots show some interesting facts: * all methods are biased towards the training prevalence but specially @@ -171,7 +174,7 @@ def gen_data(): training_size = 5000 # since the problem is binary, it suffices to specify the negative prevalence, since the positive is constrained train_sample = train.sampling(training_size, 1-training_prevalence) - model.fit(train_sample) + model.fit(*train_sample.Xy) true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0)) method_name = 'CC$_{'+f'{int(100*training_prevalence)}' + '\%}$' method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence())) @@ -181,7 +184,7 @@ def gen_data(): and the plot should now look like: -![bias plot on IMDb](./wiki_examples/selected_plots/bin_bias_cc.png) +![bias plot on IMDb](./plots/bin_bias_cc.png) which clearly shows a negative bias for CC variants trained on data containing more negatives (i.e., < 50%) and positive biases @@ -195,7 +198,7 @@ To this aim, an argument _nbins_ is passed which indicates how many isometric subintervals to take. For example the following plot is produced for _nbins=3_: -![bias plot on IMDb](./wiki_examples/selected_plots/bin_bias_bin_cc.png) +![bias plot on IMDb](./plots/bin_bias_bin_cc.png) Interestingly enough, the seemingly unbiased estimator (CC at 50%) happens to display a positive bias (or a tendency to overestimate) in cases of low prevalence @@ -205,7 +208,7 @@ and a negative bias (or a tendency to underestimate) in cases of high prevalence Out of curiosity, the diagonal plot for this experiment looks like: -![diag plot on IMDb](./wiki_examples/selected_plots/bin_diag_cc.png) +![diag plot on IMDb](./plots/bin_diag_cc.png) showing pretty clearly the dependency of CC on the prior probabilities of the labeled set it was trained on. 
@@ -234,7 +237,7 @@ qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
 				error_name='ae', n_bins=10, savepath='./plots/err_drift.png')
 ```
 
-![diag plot on IMDb](./wiki_examples/selected_plots/err_drift.png)
+![diag plot on IMDb](./plots/err_drift.png)
 
 Note that all methods work reasonably well in cases of low prevalence
 drift (i.e., any CC-variant is a good quantifier whenever the IID
diff --git a/docs/source/manuals/protocols.md b/docs/source/manuals/protocols.md
new file mode 100644
index 0000000..17bc41a
--- /dev/null
+++ b/docs/source/manuals/protocols.md
@@ -0,0 +1,176 @@
+# Protocols
+
+Quantification methods are expected to behave robustly in the presence of
+shift. For this reason, quantification methods need to be confronted with
+samples exhibiting widely varying amounts of shift.
+_Protocols_ implement specific ways for generating such samples.
+
+In QuaPy, a protocol is an instance of _AbstractProtocol_ implementing a
+_call_ method that returns a generator yielding a tuple _(sample, prev)_
+every time. The protocol can also implement the function _total()_ informing
+of the total number of samples that the protocol generates.
+
+Protocols can inherit from _AbstractStochasticSeededProtocol_, the class of
+protocols that generate samples stochastically, but that can be set with
+a seed in order to allow for replicating the exact same samples. This is important
+for evaluation purposes, since we typically require that all our methods be evaluated
+on the exact same test samples in order to allow for a fair comparison.
+Indeed, the seed is set by default to 0, since this is the most commonly
+desired behaviour. Indicate _random_state=None_ to allow different sequences of samples to be
+generated every time the protocol is invoked.
+
+Protocols that also inherit from _OnLabelledCollectionProtocol_ are such that
+samples are generated from a _LabelledCollection_ object (e.g., a test collection,
+or a validation collection). These protocols also allow for generating sequences of
+_LabelledCollection_ instead of _(sample, prev)_ by indicating
+_return_type='labelled_collection'_ instead of the default value _return_type='sample_prev'_.
+
+For a more technical explanation of _AbstractStochasticSeededProtocol_ and
+_OnLabelledCollectionProtocol_, see the "custom_protocol.py" provided in the
+example folder.
+
+QuaPy provides implementations of the most popular sample generation protocols
+used in the literature. This is the subject of the following sections.
+
+
+## Artificial-Prevalence Protocol
+
+The "artificial-sampling protocol" (APP) proposed by
+[Forman (2005)](https://link.springer.com/chapter/10.1007/11564096_55)
+is likely the most popular protocol used for quantification evaluation.
+In APP, a test set is used to generate samples at
+desired prevalence values covering the full spectrum.
+
+In APP, the user specifies the number
+of (equally spaced) points to be generated from the interval [0,1];
+in QuaPy this is achieved by setting _n_prevpoints_.
+For example, if _n_prevpoints=11_ then, for each class, the prevalence values
+[0., 0.1, 0.2, ..., 1.] will be used. This means that, for two classes,
+the number of different prevalence values will be 11 (since, once the prevalence
+of one class is determined, the other one is constrained). For 3 classes,
+the number of valid combinations can be obtained as 11 + 10 + ... + 1 = 66.
+In general, the number of valid combinations that will be produced for a given
+value of n_prevpoints can be consulted by invoking
+_num_prevalence_combinations_, e.g.:
+
+```python
+import quapy.functional as F
+n_prevpoints = 21
+n_classes = 4
+n = F.num_prevalence_combinations(n_prevpoints, n_classes, n_repeats=1)
+```
+
+In this example, _n=1771_. Note the last argument, _n_repeats_, that
+informs of the number of samples that will be generated for any
+valid combination (typical values are, e.g., 1 for a single sample,
+or 10 or higher for computing standard deviations or performing statistical
+significance tests).
+
+One can instead work the other way around, i.e., one could decide on a
+maximum budget of evaluations and get the number of prevalence points that
+will give rise to a number of evaluations close to, but not higher than,
+this budget. This can be achieved with the function
+_get_nprevpoints_approximation_, e.g.:
+
+```python
+budget = 5000
+n_prevpoints = F.get_nprevpoints_approximation(budget, n_classes, n_repeats=1)
+n = F.num_prevalence_combinations(n_prevpoints, n_classes, n_repeats=1)
+print(f'by setting n_prevpoints={n_prevpoints} the number of evaluations for {n_classes} classes will be {n}')
+```
+this will produce the following output:
+```
+by setting n_prevpoints=30 the number of evaluations for 4 classes will be 4960
+```
+
+The following code shows an example of usage of APP for model selection
+and evaluation:
+
+```python
+import quapy as qp
+from quapy.method.aggregative import ACC
+from quapy.protocol import APP
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+qp.environ['SAMPLE_SIZE'] = 100
+qp.environ['N_JOBS'] = -1
+
+# define an instance of our custom quantifier
+quantifier = ACC(LogisticRegression())
+
+# load the IMDb dataset
+train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
+
+# model selection
+train, val = train.split_stratified(train_prop=0.75)
+Xtr, ytr = train.Xy
+quantifier = qp.model_selection.GridSearchQ(
+    quantifier,
+    param_grid={'classifier__C': np.logspace(-2, 2, 5)},
+    protocol=APP(val)  # <- this is the protocol we use for generating validation samples
+).fit(Xtr, ytr)
+
+# default values are n_prevalences=21, repeats=10, random_state=0; this is equivalent to:
+# val_app = APP(val, n_prevalences=21, repeats=10, random_state=0)
+# quantifier = GridSearchQ(quantifier, param_grid, protocol=val_app).fit(Xtr, ytr)
+
+# evaluation with APP
+mae = qp.evaluation.evaluate(quantifier, protocol=APP(test), error_metric='mae')
+print(f'MAE = {mae:.4f}')
+```
+
+Note that APP is an instance of _AbstractStochasticSeededProtocol_ and that the
+_random_state_ is by default set to 0, meaning that all the generated validation
+samples will be consistent for all the combinations of hyperparameters being tested.
+Note also that the _sample_size_ is not indicated when instantiating the protocol;
+in such cases QuaPy takes the value of _qp.environ['SAMPLE_SIZE']_.
+
+This protocol is useful for testing a quantifier under conditions of
+_prior probability shift_.
+
+## Sampling from the unit-simplex, the Uniform-Prevalence Protocol (UPP)
+
+Generating all possible combinations from a grid of prevalence values (APP) in
+multiclass is cumbersome, and when the number of classes increases it rapidly
+becomes impractical.
In some cases, it is preferable to generate a fixed number
+of samples displaying prevalence values that are uniformly drawn from the unit-simplex,
+that is, so that every legitimate distribution is equally likely. The main drawback
+of this approach is that we are not guaranteed that all classes have been tested
+in the entire range of prevalence values. The main advantage is that every possible
+prevalence value can be drawn (this was not possible with standard APP, since values
+not included in the grid are never tested). Yet another advantage is that we can
+control the computational burden every evaluation incurs, by deciding in advance
+the number of samples to generate.
+
+The UPP protocol implements this idea by relying on the Kraemer algorithm
+for sampling from the unit-simplex as many vectors of prevalence values as indicated
+in the _repeats_ parameter. UPP can be instantiated as:
+
+```python
+protocol = qp.protocol.UPP(test, repeats=100)
+```
+
+This is the most convenient protocol for datasets
+containing many classes (see, e.g.,
+[LeQua (2022)](https://ceur-ws.org/Vol-3180/paper-146.pdf)),
+and is useful for testing a quantifier under conditions of
+_prior probability shift_.
+
+
+## Natural-Prevalence Protocol
+
+The "natural-prevalence protocol" (NPP) comes down to generating samples drawn
+uniformly at random from the original labelled collection. This protocol has
+sometimes been used in the literature, although it is now considered deprecated,
+due to its limited capability to generate interesting amounts of shift.
+All other things being equal, this protocol can be used just like APP or UPP,
+and is instantiated via:
+
+```python
+protocol = qp.protocol.NPP(test, repeats=100)
+```
+
+## Other protocols
+
+Additional protocols will be added to the `qp.protocol` module in future releases.
\ No newline at end of file
diff --git a/docs/build/html/_sources/modules.rst.txt b/docs/source/modules.rst
similarity index 100%
rename from docs/build/html/_sources/modules.rst.txt
rename to docs/source/modules.rst
diff --git a/docs/build/html/_sources/quapy.classification.rst.txt b/docs/source/quapy.classification.rst
similarity index 67%
rename from docs/build/html/_sources/quapy.classification.rst.txt
rename to docs/source/quapy.classification.rst
index 3d14431..cfc7d9b 100644
--- a/docs/build/html/_sources/quapy.classification.rst.txt
+++ b/docs/source/quapy.classification.rst
@@ -1,38 +1,35 @@
-:tocdepth: 2
-
 quapy.classification package
 ============================
 
 Submodules
 ----------
 
-quapy.classification.calibration
---------------------------------
+quapy.classification.calibration module
+---------------------------------------
 
-.. versionadded:: 0.1.7
 .. automodule:: quapy.classification.calibration
    :members:
    :undoc-members:
    :show-inheritance:
 
-quapy.classification.methods
-----------------------------
+quapy.classification.methods module
+-----------------------------------
 
 .. automodule:: quapy.classification.methods
    :members:
    :undoc-members:
    :show-inheritance:
 
-quapy.classification.neural
----------------------------
+quapy.classification.neural module
+----------------------------------
 
 .. automodule:: quapy.classification.neural
    :members:
   :undoc-members:
   :show-inheritance:
 
-quapy.classification.svmperf
-----------------------------
+quapy.classification.svmperf module
+-----------------------------------
 
 ..
automodule:: quapy.classification.svmperf :members: diff --git a/docs/build/html/_sources/quapy.data.rst.txt b/docs/source/quapy.data.rst similarity index 75% rename from docs/build/html/_sources/quapy.data.rst.txt rename to docs/source/quapy.data.rst index fda5ff0..cadace6 100644 --- a/docs/build/html/_sources/quapy.data.rst.txt +++ b/docs/source/quapy.data.rst @@ -1,37 +1,36 @@ -:tocdepth: 2 - quapy.data package ================== Submodules ---------- -quapy.data.base ---------------- +quapy.data.base module +---------------------- .. automodule:: quapy.data.base :members: :undoc-members: :show-inheritance: -quapy.data.datasets -------------------- +quapy.data.datasets module +-------------------------- .. automodule:: quapy.data.datasets :members: :undoc-members: :show-inheritance: -quapy.data.preprocessing ------------------------- + +quapy.data.preprocessing module +------------------------------- .. automodule:: quapy.data.preprocessing :members: :undoc-members: :show-inheritance: -quapy.data.reader ------------------ +quapy.data.reader module +------------------------ .. automodule:: quapy.data.reader :members: diff --git a/docs/source/quapy.method.rst b/docs/source/quapy.method.rst new file mode 100644 index 0000000..88fcc7d --- /dev/null +++ b/docs/source/quapy.method.rst @@ -0,0 +1,77 @@ +quapy.method package +==================== + +Submodules +---------- + +quapy.method.aggregative module +------------------------------- + +.. automodule:: quapy.method.aggregative + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: quapy.method._kdey + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: quapy.method._neural + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: quapy.method._threshold_optim + :members: + :undoc-members: + :show-inheritance: + + +quapy.method.base module +------------------------ + +.. automodule:: quapy.method.base + :members: + :undoc-members: + :show-inheritance: + +quapy.method.meta module +------------------------ + +.. automodule:: quapy.method.meta + :members: + :undoc-members: + :show-inheritance: + +quapy.method.non\_aggregative module +------------------------------------ + +.. automodule:: quapy.method.non_aggregative + :members: + :undoc-members: + :show-inheritance: + +quapy.method.composable module +------------------------------ + +.. automodule:: quapy.method.composable + :members: + :undoc-members: + :show-inheritance: + +quapy.method.confidence module +------------------------------ + +.. automodule:: quapy.method.confidence + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: quapy.method + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/build/html/_sources/quapy.rst.txt b/docs/source/quapy.rst similarity index 72% rename from docs/build/html/_sources/quapy.rst.txt rename to docs/source/quapy.rst index e3e1697..af2708b 100644 --- a/docs/build/html/_sources/quapy.rst.txt +++ b/docs/source/quapy.rst @@ -1,79 +1,76 @@ -:tocdepth: 2 - quapy package ============= +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + quapy.classification + quapy.data + quapy.method + + Submodules ---------- -quapy.error ------------ +quapy.error module +------------------ .. automodule:: quapy.error :members: :undoc-members: :show-inheritance: -quapy.evaluation ----------------- +quapy.evaluation module +----------------------- .. 
automodule:: quapy.evaluation
    :members:
    :undoc-members:
    :show-inheritance:
 
-quapy.protocol
---------------
-
-.. versionadded:: 0.1.7
-.. automodule:: quapy.protocol
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-quapy.functional
-----------------
+quapy.functional module
+-----------------------
 
 .. automodule:: quapy.functional
    :members:
    :undoc-members:
    :show-inheritance:
 
-quapy.model\_selection
-----------------------
+quapy.model\_selection module
+-----------------------------
 
 .. automodule:: quapy.model_selection
    :members:
    :undoc-members:
    :show-inheritance:
 
-quapy.plot
-----------
+quapy.plot module
+-----------------
 
 .. automodule:: quapy.plot
    :members:
    :undoc-members:
    :show-inheritance:
 
-quapy.util
-----------
+quapy.protocol module
+---------------------
+
+.. automodule:: quapy.protocol
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+quapy.util module
+-----------------
 
 .. automodule:: quapy.util
    :members:
    :undoc-members:
    :show-inheritance:
 
-Subpackages
------------
-
-.. toctree::
-   :maxdepth: 3
-
-   quapy.classification
-   quapy.data
-   quapy.method
-
-
 Module contents
 ---------------
 
@@ -81,4 +78,3 @@ Module contents
    :members:
    :undoc-members:
    :show-inheritance:
-
diff --git a/examples/0.basics.py b/examples/0.basics.py
new file mode 100644
index 0000000..a891475
--- /dev/null
+++ b/examples/0.basics.py
@@ -0,0 +1,86 @@
+"""
+This is a basic example showcasing some of the important concepts behind quapy.
+First of all, import quapy. You would typically import quapy in the following way
+"""
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+from quapy.method.aggregative import PACC
+
+# let's fetch some dataset to run one experiment
+# datasets are available in the "qp.data.datasets" module (there is a shortcut in qp.datasets)
+
+data = qp.datasets.fetch_reviews('hp')
+
+# The data are in plain text format. You can convert them into tfidf using some utilities available in the
+# qp.data.preprocessing module, e.g.:
+
+data = qp.data.preprocessing.text2tfidf(data, min_df=5)
+
+# you can obtain the same result by specifying tfidf=True in the fetch function:
+# data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)
+
+# data is an object of type Dataset, a very basic collection that contains a "training" and a "test" collection inside.
+train, test = data.train_test
+
+# train and test are instances of LabelledCollection, a class that contains covariates (X) and true labels (y), along
+# with sampling functionality. Here are some examples of usage:
+X, y = train.Xy
+print(f'number of classes {train.n_classes}')
+print(f'class names {train.classes_}')
+
+import quapy.functional as F  # <- this module has some functional utilities, like a string formatter for prevalences
+print(f'training prevalence = {F.strprev(train.prevalence())}')
+
+# let us train one quantifier, for example, PACC, using sklearn's LogisticRegression as the underlying classifier
+classifier = LogisticRegression()
+pacc = PACC(classifier)
+
+print(f'training {pacc}')
+pacc.fit(X, y)
+
+# let's now test our quantifier on the test data (of course, we should not use the test labels y at this point, only X)
+X_test = test.X
+estim_prevalence = pacc.predict(X_test)
+
+print(f'estimated test prevalence = {F.strprev(estim_prevalence)}')
+print(f'true test prevalence = {F.strprev(test.prevalence())}')
+
+# let us use some evaluation metric to check how well our quantifier fared.
+# Error metrics are available in the qp.error module.
+
+mae_error = qp.error.mae(test.prevalence(), estim_prevalence)
+print(f'MAE={mae_error:.4f}')
+
+# In quantification, we typically use an evaluation protocol to test the performance of a quantification method.
+# The reason is that, even though the test set contains many instances, the whole set counts as 1 single datapoint to
+# the quantifier, because quantification targets samples of instances as a whole (while classification, or regression,
+# target instances individually).
+# QuaPy provides some standard protocols in qp.protocol. We will use the artificial prevalence protocol (APP). APP
+# works by generating many test samples, out of our original test collection, characterized by different prevalence
+# values. To do so, a grid of prevalence values is explored, and different samples are generated conditioned on each
+# prevalence vector. This way, the quantifier is stress-tested on a wide range of prevalence values, i.e., under
+# prior probability shift conditions.
+
+# In this case we use "test" and not only "test.X" since the protocol needs to know the class labels in order
+# to generate samples at different prevalences. We will generate samples of 100 instances, from a grid of 21 values,
+# i.e., from a grid = [0.0, 0.05, 0.10, ..., 1.00], and only one sample (repeats) for each combination.
+app = qp.protocol.APP(test, sample_size=100, n_prevalences=21, repeats=1)
+
+# let's print some examples:
+show = 5
+for i, (sample, prev) in enumerate(app()):
+    print(f'sample-{i}: {F.strprev(prev)}')
+    if i+1 == show:
+        break
+
+# we can use the evaluation routine provided in quapy to test our method using a given protocol in terms of
+# one specific error metric
+absolute_errors = qp.evaluation.evaluate(model=pacc, protocol=app, error_metric='ae')
+print(f'MAE = {np.mean(absolute_errors):.4f}+-{np.std(absolute_errors):.4f}')
+
diff --git a/examples/1.model_selection.py b/examples/1.model_selection.py
new file mode 100644
index 0000000..6c96671
--- /dev/null
+++ b/examples/1.model_selection.py
@@ -0,0 +1,84 @@
+import quapy as qp
+from quapy.protocol import UPP
+from quapy.method.aggregative import DMy
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+from time import time
+
+"""
+In this example, we show how to perform model selection on a DistributionMatching quantifier.
+"""
+
+model = DMy()
+
+qp.environ['SAMPLE_SIZE'] = 100
+qp.environ['N_JOBS'] = -1
+
+print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
+      f'to increase/decrease the number of jobs use:\n'
+      f'> N_JOBS=-1 python3 1.model_selection.py\n'
+      f'alternatively, you can set this variable within the script as:\n'
+      f'import quapy as qp\n'
+      f'qp.environ["N_JOBS"]=-1')
+
+training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test
+
+# evaluation in terms of MAE with default hyperparameters
+Xtr, ytr = training.Xy
+model.fit(Xtr, ytr)
+mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+print(f'MAE (non optimized)={mae_score:.5f}')
+
+
+with qp.util.temp_seed(0):
+
+    # The model will be returned by the fit method of GridSearchQ.
+    # Every combination of hyper-parameters will be evaluated by confronting the
+    # quantifier thus configured against a series of samples generated by means
+    # of a sample generation protocol. For this example, we will use the
+    # uniform-prevalence protocol (UPP), which generates samples with prevalence
+    # values drawn uniformly at random from the unit simplex.
+    # We devote 30% of the dataset for this exploration.
+    training, validation = training.split_stratified(train_prop=0.7)
+    protocol = UPP(validation)
+
+    # We will explore a classification-dependent hyper-parameter (e.g., the 'C'
+    # hyper-parameter of LogisticRegression) and a quantification-dependent hyper-parameter
+    # (e.g., the number of bins in a DistributionMatching quantifier).
+    # Classifier-dependent hyper-parameters have to be marked with a prefix "classifier__"
+    # in order to let the quantifier know this hyper-parameter belongs to its underlying
+    # classifier.
+    # We consider 7 values for the classifier and 7 values for the quantifier.
+    # QuaPy is optimized so that only 7 classifiers are trained, and then reused to test the
+    # different configurations of the quantifier. In other words, QuaPy avoids training
+    # the classifier 7x7 times.
+    param_grid = {
+        'classifier__C': np.logspace(-3, 3, 7),
+        'nbins': [2, 3, 4, 5, 10, 15, 20]
+    }
+
+    tinit = time()
+
+    Xtr, ytr = training.Xy
+    model = qp.model_selection.GridSearchQ(
+        model=model,
+        param_grid=param_grid,
+        protocol=protocol,
+        error='mae',   # the error to optimize is the MAE (a quantification-oriented loss)
+        refit=False,   # do not retrain on the whole labelled set once done
+        # raise_errors=False,
+        verbose=True   # show information as the process goes on
+    ).fit(Xtr, ytr)
+
+tend = time()
+
+print(f'model selection ended: best hyper-parameters={model.best_params_}')
+model = model.best_model_
+
+# evaluation in terms of MAE
+# we use the same evaluation protocol (UPP) on the test set
+mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+
+print(f'MAE={mae_score:.5f}')
+print(f'model selection took {tend-tinit:.1f}s')
+
diff --git a/examples/one_vs_all.py b/examples/10.one_vs_all.py
similarity index 84%
rename from examples/one_vs_all.py
rename to examples/10.one_vs_all.py
index 3f5c4ac..ca70662 100644
--- a/examples/one_vs_all.py
+++ b/examples/10.one_vs_all.py
@@ -9,6 +9,11 @@ import numpy as np
 """
 In this example, we will create a quantifier for tweet sentiment analysis considering three classes: negative, neutral, and positive. We will use a one-vs-all approach using a binary quantifier for demonstration purposes.
+
+Caveat: the one-vs-all approach is deemed inadequate under prior probability shift conditions. The reasons
+are discussed in:
+Donyavi, Z., Serapio, A., & Batista, G. (2023). MC-SQ: A highly accurate ensemble for multi-class quantification.
+In: Proceedings of the 2023 SIAM International Conference on Data Mining (SDM), SIAM, pp.
622–630 """ qp.environ['SAMPLE_SIZE'] = 100 @@ -40,11 +45,11 @@ param_grid = { } print('starting model selection') model_selection = GridSearchQ(quantifier, param_grid, protocol=UPP(val), verbose=True, refit=False) -quantifier = model_selection.fit(train_modsel).best_model() +quantifier = model_selection.fit(*train_modsel.Xy).best_model() print('training on the whole training set') train, test = qp.datasets.fetch_twitter('hcr', for_model_selection=False, pickle=True).train_test -quantifier.fit(train) +quantifier.fit(*train.Xy) # evaluation mae = qp.evaluation.evaluate(quantifier, protocol=UPP(test), error_metric='mae') diff --git a/examples/comparing_HDy_HDx.py b/examples/11.comparing_HDy_HDx.py similarity index 80% rename from examples/comparing_HDy_HDx.py rename to examples/11.comparing_HDy_HDx.py index e7a32ef..a95b780 100644 --- a/examples/comparing_HDy_HDx.py +++ b/examples/11.comparing_HDy_HDx.py @@ -23,30 +23,35 @@ qp.environ['SAMPLE_SIZE']=100 df = pd.DataFrame(columns=['method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time']) +datasets = qp.datasets.UCI_BINARY_DATASETS -for dataset_name in tqdm(qp.datasets.UCI_DATASETS, total=len(qp.datasets.UCI_DATASETS)): - if dataset_name in ['acute.a', 'acute.b', 'balance.2', 'iris.1']: continue +for dataset_name in tqdm(datasets, total=len(datasets), desc='datasets processed'): + if dataset_name in ['acute.a', 'acute.b', 'balance.2', 'iris.1']: + # these datasets tend to produce either too good or too bad results... + continue - collection = qp.datasets.fetch_UCILabelledCollection(dataset_name, verbose=False) + collection = qp.datasets.fetch_UCIBinaryLabelledCollection(dataset_name, verbose=False) train, test = collection.split_stratified() + Xtr, ytr = train.Xy + # HDy............................................ tinit = time() - hdy = HDy(LogisticRegression()).fit(train) + hdy = HDy(LogisticRegression()).fit(Xtr, ytr) t_hdy_train = time()-tinit tinit = time() - hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean() + hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean(numeric_only=True) t_hdy_test = time() - tinit df.loc[len(df)] = ['HDy', dataset_name, hdy_report['mae'], hdy_report['mrae'], t_hdy_train, t_hdy_test] # HDx............................................ tinit = time() - hdx = DMx.HDx(n_jobs=-1).fit(train) + hdx = DMx.HDx(n_jobs=-1).fit(Xtr, ytr) t_hdx_train = time() - tinit tinit = time() - hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean() + hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean(numeric_only=True) t_hdx_test = time() - tinit df.loc[len(df)] = ['HDx', dataset_name, hdx_report['mae'], hdx_report['mrae'], t_hdx_train, t_hdx_test] diff --git a/examples/12.custom_protocol.py b/examples/12.custom_protocol.py new file mode 100644 index 0000000..774a0ed --- /dev/null +++ b/examples/12.custom_protocol.py @@ -0,0 +1,91 @@ +import numpy as np +from sklearn.linear_model import LogisticRegression + +import quapy as qp +from quapy.method.aggregative import PACC +from quapy.protocol import AbstractStochasticSeededProtocol +import quapy.functional as F + +""" +In this example, we create a custom protocol. +The protocol generates synthetic samples of a Gaussian mixture model with random mixture parameter +(the sample prevalence). Datapoints are univariate and we consider 2 classes only for simplicity. 
+""" +class GaussianMixProtocol(AbstractStochasticSeededProtocol): + # We need to extend AbstractStochasticSeededProtocol if we want the samples to be replicable + + def __init__(self, mu_1:float, std_1:float, mu_2:float, std_2:float, num_samples, sample_size, random_state=0): + super(GaussianMixProtocol, self).__init__(random_state) # this sets the random state + self.mu_1 = mu_1 + self.std_1 = std_1 + self.mu_2 = mu_2 + self.std_2 = std_2 + self.num_samples = num_samples + self.sample_size = sample_size + + def samples_parameters(self): + # This function is inherited and has to be overriden. + # This function should return all the necessary parameters for producing the samples. + # In this case, we consider returning a vector of seeds (one for each sample) and a vector of + # randomly sampled prevalence values. + # This function will be invoked within a context that sets the seed, so it will always return the + # same parameters. In case you want different outcomes, then simply set random_state=None. + rand_offset = np.random.randint(1000) + sample_seeds = np.random.permutation(self.num_samples*2) + rand_offset + random_prevs = np.random.rand(self.num_samples) + params = np.hstack([sample_seeds.reshape(-1,2), random_prevs.reshape(-1,1)]) + # each row in params contains two seeds (for generating the negatives and the positives, respectively) and + # the prevalence vector + return params + + def sample(self, params): + # the params are two seeds and the positive prevalence of the sample + seed0, seed1, pos_prev = params + num_positives = int(pos_prev * self.sample_size) + num_negatives = self.sample_size - num_positives + with qp.util.temp_seed(int(seed0)): + Xneg = np.random.normal(loc=self.mu_1, scale=self.std_1, size=num_negatives) + with qp.util.temp_seed(int(seed1)): + Xpos = np.random.normal(loc=self.mu_2, scale=self.std_2, size=num_positives) + X = np.concatenate((Xneg,Xpos)) + np.random.shuffle(X) + X = X.reshape(-1,1) + prev = F.as_binary_prevalence(pos_prev) + return X, prev + + def total(self): + # overriding this function will allow some methods display a meaningful progress bar + return self.num_samples + + +mu_1, std_1 = 0, 1 +mu_2, std_2 = 1, 1 + +gm = GaussianMixProtocol(mu_1=mu_1, std_1=std_1, mu_2=mu_2, std_2=std_2, num_samples=10, sample_size=50) + +# let's see if the samples are replicated +for i, (X, prev) in enumerate(gm()): + if i>4: break + print(f'sample-{i}: {F.strprev(prev)}, some covariates={X[:5].flatten()}...') + +print() +for i, (X, prev) in enumerate(gm()): + if i > 4: break + print(f'sample-{i}: {F.strprev(prev)}, some covariates={X[:5].flatten()}...') + +# let's generate some training data +# The samples are replicable, but by setting a temp seed we achieve repicable training as well +with qp.util.temp_seed(0): + Xneg = np.random.normal(loc=mu_1, scale=std_1, size=100) + Xpos = np.random.normal(loc=mu_2, scale=std_2, size=100) + X = np.concatenate([Xneg, Xpos]).reshape(-1,1) + y = [0]*100 + [1]*100 + + pacc = PACC(LogisticRegression()) + pacc.fit(X, y) + + +mae = qp.evaluation.evaluate(pacc, protocol=gm, error_metric='mae', verbose=True) +print(f'PACC MAE={mae:.5f}') + + diff --git a/examples/13.plotting.py b/examples/13.plotting.py new file mode 100644 index 0000000..77230c8 --- /dev/null +++ b/examples/13.plotting.py @@ -0,0 +1,73 @@ +import quapy as qp +import numpy as np + +from protocol import APP +from quapy.method.aggregative import CC, ACC, PCC, PACC +from sklearn.svm import LinearSVC + +qp.environ['SAMPLE_SIZE'] = 500 + + +''' +In this example, we 
show how to create some plots for the analysis of experimental results.
+The main functions are included in qp.plot but first we will generate some basic experimental data
+'''
+
+def gen_data():
+    # this function generates some experimental data to plot
+
+    def base_classifier():
+        return LinearSVC(class_weight='balanced')
+
+    def datasets():
+        # the plots can handle experiments in different datasets
+        yield qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5).train_test
+        # by uncommenting this line, the experiments will be carried out in more than one dataset
+        # yield qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5).train_test
+
+    def models():
+        yield 'CC', CC(base_classifier())
+        yield 'ACC', ACC(base_classifier())
+        yield 'PCC', PCC(base_classifier())
+        yield 'PACC', PACC(base_classifier())
+
+    # these are the main parameters we need to fill for generating the plots;
+    # note that each of these lists must have the same number of elements, since the ith entry of each list regards
+    # an independent experiment
+    method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
+
+    for train, test in datasets():
+        for method_name, model in models():
+            model.fit(*train.Xy)
+            true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
+
+            # gather all the data for this experiment
+            method_names.append(method_name)
+            true_prevs.append(true_prev)
+            estim_prevs.append(estim_prev)
+            tr_prevs.append(train.prevalence())
+
+    return method_names, true_prevs, estim_prevs, tr_prevs
+
+# generate some experimental data
+method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
+# if you want to play around with the different plots and parameters, you might prefer to generate the data only once,
+# so you better replace the above line of code with this one, which pickles the experimental results for faster reuse
+# method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('./plots/data.pickle', gen_data)
+
+# if there is only one training prevalence, we can display it
+only_train_prev = tr_prevs[0] if len(np.unique(tr_prevs, axis=0))==1 else None
+
+# diagonal plot (useful for analyzing the performance of quantifiers on binary data)
+qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs,
+                        train_prev=only_train_prev, savepath='./plots/bin_diag.png')
+
+# bias plot (box plots displaying the bias of each method)
+qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
+
+# error by drift allows plotting the quantification error as a function of the amount of prior probability shift, and
+# is preferable to diagonal plots for multiclass datasets
+qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
+                       error_name='ae', n_bins=10, savepath='./plots/err_drift.png')
+
+# each function returns (fig, ax) objects from matplotlib; use them to customize the plots to your liking
diff --git a/examples/14.bayesian_quantification.py b/examples/14.bayesian_quantification.py
new file mode 100644
index 0000000..21a1be1
--- /dev/null
+++ b/examples/14.bayesian_quantification.py
@@ -0,0 +1,195 @@
+"""
+.. author:: Paweł Czyż
+
+This example shows how to use Bayesian quantification (https://arxiv.org/abs/2302.09159),
+which is suitable for low-data situations and when the uncertainty of the prevalence estimate is of interest.
+
+For this, we will need to install extra dependencies:
+
+```
+$ pip install quapy[bayesian]
+```
+
+Running the script via:
+
+```
+$ python examples/14.bayesian_quantification.py
+```
+
+will produce a plot `bayesian_quantification.pdf`.
+
+Due to a low sample size and the fact that classes 2 and 3 are hard to distinguish,
+it is hard to estimate the proportions accurately, which is visible by looking at the posterior samples,
+showing large uncertainty.
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+import quapy as qp
+
+from sklearn.ensemble import RandomForestClassifier
+
+from quapy.method.aggregative import ACC, PACC
+from quapy.method.confidence import BayesianCC
+from quapy.data import LabelledCollection, Dataset
+
+
+FIGURE_PATH = "bayesian_quantification.pdf"
+
+
+def simulate_data(rng) -> Dataset:
+    """Generates a simulated data set with three classes."""
+
+    # Number of examples of each class in both data sets
+    n_train = [400, 400, 400]
+    n_test = [40, 25, 15]
+
+    # Mean vectors and shared covariance of P(X|Y) distributions
+    mus = [np.zeros(2), np.array([1, 1.5]), np.array([1.5, 1])]
+    cov = np.eye(2)
+
+    def gen_Xy(centers, sizes):
+        X = np.concatenate([rng.multivariate_normal(mu_i, cov, size_i) for mu_i, size_i in zip(centers, sizes)])
+        y = np.concatenate([[i] * n for i, n in enumerate(sizes)])
+        return X, y
+
+    # Generate the features accordingly
+    train = LabelledCollection(*gen_Xy(centers=mus, sizes=n_train))
+    test = LabelledCollection(*gen_Xy(centers=mus, sizes=n_test))
+
+    return Dataset(training=train, test=test)
+
+
+def plot_simulated_data(axs, data: Dataset) -> None:
+    """Plots a simulated data set.
+
+    :param axs: a list of three `plt.Axes` objects, on which the samples will be plotted.
+    :param data: the simulated data set.
+    """
+    train, test = data.train_test
+    xlim = (
+        -0.3 + min(train.X[:, 0].min(), test.X[:, 0].min()),
+        0.3 + max(train.X[:, 0].max(), test.X[:, 0].max())
+    )
+    ylim = (
+        -0.3 + min(train.X[:, 1].min(), test.X[:, 1].min()),
+        0.3 + max(train.X[:, 1].max(), test.X[:, 1].max())
+    )
+
+    for ax in axs:
+        ax.set_xlabel("$X_1$")
+        ax.set_ylabel("$X_2$")
+        ax.set_aspect("equal")
+        ax.set_xlim(*xlim)
+        ax.set_ylim(*ylim)
+        ax.set_xticks([])
+        ax.set_yticks([])
+
+    ax = axs[0]
+    ax.set_title("Training set")
+    for i in range(data.n_classes):
+        ax.scatter(train.X[train.y == i, 0], train.X[train.y == i, 1], c=f"C{i}", s=3, rasterized=True)
+
+    ax = axs[1]
+    ax.set_title("Test set\n(with labels)")
+    for i in range(data.n_classes):
+        ax.scatter(test.X[test.y == i, 0], test.X[test.y == i, 1], c=f"C{i}", s=3, rasterized=True)
+
+    ax = axs[2]
+    ax.set_title("Test set\n(as observed)")
+    ax.scatter(test.X[:, 0], test.X[:, 1], c="C5", s=3, rasterized=True)
+
+
+def plot_true_proportions(ax: plt.Axes, test_prevalence: np.ndarray) -> None:
+    """Plots the true proportions."""
+    n_classes = len(test_prevalence)
+    x_ax = np.arange(n_classes)
+    ax.plot(x_ax, test_prevalence, c="black", linewidth=2, label="True")
+
+    ax.set_xlabel("Class")
+    ax.set_ylabel("Prevalence")
+    ax.set_xticks(x_ax, x_ax + 1)
+    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
+    ax.set_xlim(-0.1, n_classes - 0.9)
+    ax.set_ylim(-0.01, 1.01)
+
+
+def get_random_forest() -> RandomForestClassifier:
+    """An auxiliary factory method to generate a random forest."""
+    return RandomForestClassifier(n_estimators=10, random_state=5)
+
+
+def _get_estimate(estimator_class, training: LabelledCollection, test: np.ndarray) -> np.ndarray:
+    """Auxiliary method for running ACC and PACC."""
+    estimator = estimator_class(get_random_forest())
+    estimator.fit(*training.Xy)
+    return estimator.predict(test)
+
+
+def train_and_plot_bayesian_quantification(ax: plt.Axes, training: LabelledCollection, test: LabelledCollection) -> None:
+    """Fits Bayesian quantification and plots the posterior mean as well as individual posterior samples."""
+    print('training model Bayesian CC...', end='')
+    quantifier = BayesianCC(classifier=get_random_forest())
+    quantifier.fit(*training.Xy)
+
+    # Obtain mean prediction
+    mean_prediction = quantifier.predict(test.X)
+    mae = qp.error.mae(test.prevalence(), mean_prediction)
+    x_ax = np.arange(training.n_classes)
+    ax.plot(x_ax, mean_prediction, c="salmon", linewidth=2, linestyle=":", label="Bayesian")
+
+    # Obtain individual samples
+    samples = quantifier.get_prevalence_samples()
+    for sample in samples[::5, :]:
+        ax.plot(x_ax, sample, c="salmon", alpha=0.1, linewidth=0.3, rasterized=True)
+    print(f'MAE={mae:.4f} [done]')
+
+
+def train_and_plot_acc(ax: plt.Axes, training: LabelledCollection, test: LabelledCollection) -> None:
+    print('training model ACC...', end='')
+    estimate = _get_estimate(ACC, training, test.X)
+    mae = qp.error.mae(test.prevalence(), estimate)
+    ax.plot(np.arange(training.n_classes), estimate, c="darkblue", linewidth=2, linestyle=":", label="ACC")
+    print(f'MAE={mae:.4f} [done]')
+
+
+def train_and_plot_pacc(ax: plt.Axes, training: LabelledCollection, test: LabelledCollection) -> None:
+    print('training model PACC...', end='')
+    estimate = _get_estimate(PACC, training, test.X)
+    mae = qp.error.mae(test.prevalence(), estimate)
+    ax.plot(np.arange(training.n_classes), estimate, c="limegreen", linewidth=2, linestyle=":", label="PACC")
+    print(f'MAE={mae:.4f} [done]')
+
+
+def main() -> None:
+    # --- Simulate data ---
+    print('generating
simulated data')
+    rng = np.random.default_rng(42)
+    data = simulate_data(rng)
+    training, test = data.train_test
+
+    # --- Plot simulated data ---
+    fig, axs = plt.subplots(1, 4, figsize=(13, 3), dpi=300)
+    for ax in axs:
+        ax.spines[['top', 'right']].set_visible(False)
+    plot_simulated_data(axs[:3], data)
+
+    # --- Plot quantification results ---
+    ax = axs[3]
+    plot_true_proportions(ax, test_prevalence=test.prevalence())
+
+    train_and_plot_acc(ax, training=training, test=test)
+    train_and_plot_pacc(ax, training=training, test=test)
+    train_and_plot_bayesian_quantification(ax=ax, training=training, test=test)
+    print('[done]')
+
+    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', frameon=False)
+
+    print(f'saving plot in path {FIGURE_PATH}...', end='')
+    fig.tight_layout()
+    fig.savefig(FIGURE_PATH)
+    print('[done]')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/15.composable_methods.py b/examples/15.composable_methods.py
new file mode 100644
index 0000000..df3b34c
--- /dev/null
+++ b/examples/15.composable_methods.py
@@ -0,0 +1,143 @@
+"""
+This example illustrates the composition of quantification methods from
+arbitrary loss functions and feature representations. It extends the basic
+example on the usage of quapy with this composition.
+
+This example requires the installation of qunfold, the back-end of QuaPy's
+composition module:
+
+    pip install --upgrade pip setuptools wheel
+    pip install "jax[cpu]"
+    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5"
+"""
+
+import numpy as np
+import quapy as qp
+import quapy.functional as F
+
+# First of all, we load the same data as in the basic example.
+
+data = qp.data.preprocessing.text2tfidf(
+    qp.datasets.fetch_reviews("hp"),
+    min_df = 5,
+)
+training, testing = data.train_test
+Xtr, ytr = training.Xy
+
+# We start by recovering PACC from its building blocks, a LeastSquaresLoss and
+# a probabilistic ClassRepresentation. A 5-fold cross-validation is implemented
+# through a CVClassifier.
+
+from quapy.method.composable import (
+    ComposableQuantifier,
+    LeastSquaresLoss,
+    ClassRepresentation,
+    CVClassifier,
+)
+from sklearn.linear_model import LogisticRegression
+
+pacc = ComposableQuantifier(
+    LeastSquaresLoss(),
+    ClassRepresentation(
+        CVClassifier(LogisticRegression(random_state=0), 5),
+        is_probabilistic = True
+    ),
+)
+
+# Let's evaluate this quantifier.
+
+print(f"Evaluating PACC: {pacc}")
+pacc.fit(Xtr, ytr)
+app = qp.protocol.APP(testing, sample_size=100, n_prevalences=21, repeats=1)
+absolute_errors = qp.evaluation.evaluate(
+    model = pacc,
+    protocol = app,
+    error_metric = "ae",
+)
+print(f"MAE = {np.mean(absolute_errors):.4f}+-{np.std(absolute_errors):.4f}")
+
+# We now turn to the composition of novel methods. As an example, we use the
+# (squared) Hellinger distance as a loss function but, unlike HDy, we do not
+# compute any histograms from the output of the classifier.
+
+from quapy.method.composable import HellingerSurrogateLoss
+
+model = ComposableQuantifier(
+    HellingerSurrogateLoss(),  # the loss is different from before
+    ClassRepresentation(  # we use the same representation
+        CVClassifier(LogisticRegression(random_state=0), 5),
+        is_probabilistic = True
+    ),
+)
+
+print(f"Evaluating {model}")
+model.fit(Xtr, ytr)
+absolute_errors = qp.evaluation.evaluate(
+    model = model,
+    protocol = app,  # use the same protocol for evaluation
+    error_metric = "ae",
+)
+print(f"MAE = {np.mean(absolute_errors):.4f}+-{np.std(absolute_errors):.4f}")
+
+# In general, any composed method solves a linear system of equations by
+# minimizing the loss after representing the data. Methods of this kind include
+# ACC, PACC, HDx, HDy, and many other well-known methods, as well as an
+# unlimited number of re-combinations of their building blocks.
+
+# To illustrate hyper-parameter optimization, we now define a method that
+# employs a weighted sum of the LeastSquaresLoss and the
+# HellingerSurrogateLoss. We will consider both the weighting of these losses
+# and the C parameter of the LogisticRegression as hyper-parameters to be
+# optimized.
+
+from quapy.method.composable import CombinedLoss
+
+model = ComposableQuantifier(
+    CombinedLoss(HellingerSurrogateLoss(), LeastSquaresLoss()),
+    ClassRepresentation(
+        CVClassifier(LogisticRegression(random_state=0), 5),
+        is_probabilistic = True
+    ),
+)
+
+# Equivalently, the same method can be built by directly wrapping qunfold's
+# LinearMethod through the QUnfoldWrapper:
+
+from quapy.method.composable import QUnfoldWrapper
+from qunfold import LinearMethod
+
+model = QUnfoldWrapper(LinearMethod(
+    CombinedLoss(HellingerSurrogateLoss(), LeastSquaresLoss()),
+    ClassRepresentation(
+        CVClassifier(LogisticRegression(random_state=0), 5),
+        is_probabilistic = True
+    ),
+))
+
+# The names of the parameters stem from the comparably deep object hierarchy
+# that composable methods define.
+
+param_grid = {
+    "loss__weights": [ (w, 1-w) for w in [.1, .5, .9] ],
+    "representation__classifier__estimator__C": [1e-1, 1e1],
+}
+
+grid_search = qp.model_selection.GridSearchQ(
+    model = model,
+    param_grid = param_grid,
+    protocol = app,  # use the protocol that we used for testing before
+    error = "mae",
+    refit = False,
+    verbose = True,
+).fit(Xtr, ytr)
+print(
+    f"Best hyper-parameters = {grid_search.best_params_}",
+    f"Best MAE = {grid_search.best_score_}",
+    sep = "\n",
+)
+
+# Note that a proper evaluation would still require the best model to be
+# evaluated on a separate test set.
+
+# To implement your own loss functions and feature representations, please
+# follow the corresponding manual of the qunfold package. This package provides
+# the back-end of QuaPy's composable module and is fully compatible with QuaPy.
+#
+# https://mirkobunse.github.io/qunfold/developer-guide.html#custom-implementations
diff --git a/examples/16.KDEy_bandwidth.py b/examples/16.KDEy_bandwidth.py
new file mode 100644
index 0000000..cc81ade
--- /dev/null
+++ b/examples/16.KDEy_bandwidth.py
@@ -0,0 +1,83 @@
+import quapy as qp
+import numpy as np
+from quapy.protocol import UPP
+from quapy.method.aggregative import KDEyML
+import quapy.functional as F
+from time import time
+
+"""
+Let's see one example:
+"""
+
+# load some data
+qp.environ['SAMPLE_SIZE'] = 100
+data = qp.datasets.fetch_UCIMulticlassDataset('molecular')
+training, test = data.train_test
+training, validation = training.split_stratified(train_prop=0.7, random_state=0)
+protocol = UPP(validation)
+
+hyper_C = np.logspace(-3, 3, 7)
+
+model = KDEyML()
+
+with qp.util.temp_seed(0):
+
+    param_grid = {
+        'classifier__C': hyper_C,
+        'bandwidth': np.linspace(0.01, 0.20, 20)  # [0.01, 0.02, 0.03, ..., 0.20]
+    }
+
+    model = qp.model_selection.GridSearchQ(
+        model=model,
+        param_grid=param_grid,
+        protocol=protocol,
+        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
+        refit=False,  # retrain on the whole labelled set once done
+        n_jobs=-1,
+        verbose=True  # show information as the process goes on
+    ).fit(*training.Xy)
+
+best_params = model.best_params_
+took = model.fit_time_
+model = model.best_model_
+print(f'model selection ended: best hyper-parameters={best_params}')
+
+# evaluation in terms of MAE
+# we use the same evaluation protocol (UPP) on the test set
+mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+
+print(f'MAE={mae_score:.5f}')
+print(f'model selection took {took:.1f}s')
+
+
+model = KDEyML(bandwidth='auto')
+
+with qp.util.temp_seed(0):
+
+    param_grid = {
+        'classifier__C': hyper_C,
+    }
+
+    model = qp.model_selection.GridSearchQ(
+        model=model,
+        param_grid=param_grid,
+        protocol=protocol,
+        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
+        refit=False,  # retrain on the whole labelled set once done
+        n_jobs=-1,
+        verbose=True  # show information as the process goes on
+    ).fit(*training.Xy)
+
+best_params = model.best_params_
+took = model.fit_time_
+model = model.best_model_
+bandwidth = model.bandwidth_val
+print(f'model selection ended: best hyper-parameters={best_params} ({bandwidth=})')
+
+# evaluation in terms of MAE
+# we use the same evaluation protocol (UPP) on the test set
+mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+
+print(f'MAE={mae_score:.5f}')
+print(f'model selection took {took:.1f}s')
+
diff --git a/examples/16.confidence_regions.py b/examples/16.confidence_regions.py
new file mode 100644
index 0000000..c8e95dd
--- /dev/null
+++ b/examples/16.confidence_regions.py
@@ -0,0 +1,81 @@
+from quapy.method.confidence import AggregativeBootstrap
+from quapy.method.aggregative import PACC
+import quapy.functional as F
+import quapy as qp
+
+"""
+Just like any other type of estimator, quantifier predictions are affected by error. It is therefore useful to provide,
+along with the point estimate (the class prevalence values), a measure of uncertainty. These typically come in the
+form of confidence regions around the point estimate.
+
+QuaPy implements a method for deriving confidence regions around point estimates of class prevalence based on bootstrap.
+
+The bootstrap method comes down to resampling the population several times, thus generating a series of point estimates.
+QuaPy provides a variant of bootstrap for aggregative quantifiers, which only applies resampling to the pre-classified
+instances.
+
+Let's see one example:
+"""
+
+# load some data
+data = qp.datasets.fetch_UCIMulticlassDataset('molecular')
+train, test = data.train_test
+Xtr, ytr = train.Xy
+
+# by simply wrapping an aggregative quantifier within the AggregativeBootstrap class, we can obtain confidence
+# intervals around the point estimate, in this case, at 95% of confidence
+pacc = AggregativeBootstrap(PACC(), n_test_samples=500, confidence_level=0.95)
+
+with qp.util.temp_seed(0):
+    # we train the quantifier the usual way
+    pacc.fit(Xtr, ytr)
+
+    # let us simulate some shift in the test data
+    random_prevalence = F.uniform_prevalence_sampling(n_classes=test.n_classes)
+    shifted_test = test.sampling(200, *random_prevalence)
+    true_prev = shifted_test.prevalence()
+
+    # by calling "predict_conf", we obtain the point estimate and the confidence intervals around it
+    pred_prev, conf_intervals = pacc.predict_conf(shifted_test.X)
+
+    # conf_intervals is an instance of ConfidenceRegionABC, which provides some useful utilities like:
+    # - coverage: a function which computes the fraction of true values that belong to the confidence region
+    # - simplex_portion: estimates the proportion of the simplex covered by the confidence region (amplitude)
+    # ideally, we are interested in obtaining confidence regions with a high level of coverage and a small amplitude
+
+    # the point estimate is computed as the mean of all bootstrap predictions; let us see the prediction error
+    error = qp.error.ae(true_prev, pred_prev)
+
+    # some useful outputs
+    print(f'train prevalence: {F.strprev(train.prevalence())}')
+    print(f'test prevalence: {F.strprev(true_prev)}')
+    print(f'point-estimate: {F.strprev(pred_prev)}')
+    print(f'absolute error: {error:.3f}')
+    print(f'Is the true value in the confidence region?: {conf_intervals.coverage(true_prev)==1}')
+    print(f'Proportion of simplex covered at confidence level {pacc.confidence_level*100:.1f}%: {conf_intervals.simplex_portion()*100:.2f}%')
+
+"""
+Final remarks:
+There are various ways for performing bootstrap:
+- the population-based approach (default): performs resampling of the test instances
+    e.g., use AggregativeBootstrap(PACC(), n_train_samples=1, n_test_samples=100, confidence_level=0.95)
+- the model-based approach: performs resampling of the training instances, thus training several quantifiers
+    e.g., use AggregativeBootstrap(PACC(), n_train_samples=100, n_test_samples=1, confidence_level=0.95)
+    this implementation avoids retraining the classifier, and performs resampling only to train different aggregation functions
+- the combined approach: a combination of the above
+    e.g., use AggregativeBootstrap(PACC(), n_train_samples=100, n_test_samples=100, confidence_level=0.95)
+    this example will generate 100 x 100 predictions
+
+There are different ways for constructing confidence regions implemented in QuaPy:
+- confidence intervals: the simplest way, and one that typically works well in practice
+    use: AggregativeBootstrap(PACC(), confidence_level=0.95, method='intervals')
+- confidence ellipse in the simplex: creates an ellipse, which lies on the probability simplex, around the point estimate
+    use: AggregativeBootstrap(PACC(), confidence_level=0.95, method='ellipse')
+- confidence ellipse in the Centered-Log Ratio (CLR) space: creates an ellipse in the CLR space (this should be
+    convenient for taking into account the inner structure of the probability
simplex)
+    use: AggregativeBootstrap(PACC(), confidence_level=0.95, method='ellipse-clr')
+
+Other methods that return confidence regions in QuaPy include the BayesianCC method.
+"""
+
+
diff --git a/examples/explicit_loss_minimization.py b/examples/17.explicit_loss_minimization.py
similarity index 92%
rename from examples/explicit_loss_minimization.py
rename to examples/17.explicit_loss_minimization.py
index fcc07f3..b38728d 100644
--- a/examples/explicit_loss_minimization.py
+++ b/examples/17.explicit_loss_minimization.py
@@ -33,7 +33,7 @@ returns an instance of SVM(Q) (i.e., an instance of CC properly set to work with SVMperf).
 Since we want to explore the losses, we will instead use newELM. For this example we will create a quantifier for tweet
 sentiment analysis considering three classes: negative, neutral, and positive. Since SVMperf is a binary classifier,
 our quantifier will be binary as well. We will use a one-vs-all approach to work in multiclass mode.
-For more details about how one-vs-all works, we refer to the example "one_vs_all.py" and to the API documentation.
+For more details about how one-vs-all works, we refer to the example "10.one_vs_all.py" and to the API documentation.
 """
 
 qp.environ['SAMPLE_SIZE'] = 100
@@ -50,7 +50,7 @@ train_modsel, val = qp.datasets.fetch_twitter('hcr', for_model_selection=True, p
 model selection:
 We explore the classifier's loss and the classifier's C hyperparameters.
 Since our model is actually an instance of OneVsAllAggregative, we need to add the prefix "binary_quantifier", and
-since our binary quantifier is an instance of CC, we need to add the prefix "classifier".
+since our binary quantifier is an instance of CC (an aggregative quantifier), we need to add the prefix "classifier".
 """
 param_grid = {
     'binary_quantifier__classifier__loss': ['q', 'kld', 'mae'],  # classifier-dependent hyperparameter
@@ -58,11 +58,11 @@ param_grid = {
 }
 print('starting model selection')
 model_selection = GridSearchQ(quantifier, param_grid, protocol=UPP(val), verbose=True, refit=False)
-quantifier = model_selection.fit(train_modsel).best_model()
+quantifier = model_selection.fit(*train_modsel.Xy).best_model()
 
 print('training on the whole training set')
 train, test = qp.datasets.fetch_twitter('hcr', for_model_selection=False, pickle=True).train_test
-quantifier.fit(train)
+quantifier.fit(*train.Xy)
 
 # evaluation
 mae = qp.evaluation.evaluate(quantifier, protocol=UPP(test), error_metric='mae')
diff --git a/examples/18.ReadMe_for_text_analysis.py b/examples/18.ReadMe_for_text_analysis.py
new file mode 100644
index 0000000..7a70022
--- /dev/null
+++ b/examples/18.ReadMe_for_text_analysis.py
@@ -0,0 +1,60 @@
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectKBest, chi2
+
+import quapy as qp
+from quapy.method.non_aggregative import ReadMe
+import quapy.functional as F
+from sklearn.pipeline import Pipeline
+
+"""
+This example showcases how to use the non-aggregative method ReadMe proposed by Hopkins and King.
+This method is for text analysis, so let us first instantiate a dataset for sentiment quantification (we
+use IMDb for this example). The method is quite computationally expensive, so we will restrict the training
+set to 1000 documents only.
+"""
+reviews = qp.datasets.fetch_reviews('imdb').reduce(n_train=1000, random_state=0)
+
+"""
+We need to convert text to bag-of-words representations.
Actually, ReadMe requires the representations to be
+binary (i.e., storing a 1 whenever a document contains a certain word, or 0 otherwise), so we will not use
+TFIDF weighting. We will also retain the top 1000 most important features according to chi2.
+"""
+encode_0_1 = Pipeline([
+    ('0_1_terms', CountVectorizer(min_df=5, binary=True)),
+    ('feat_sel', SelectKBest(chi2, k=1000))
+])
+train, test = qp.data.preprocessing.instance_transformation(reviews, encode_0_1, inplace=True).train_test
+
+"""
+We now instantiate ReadMe, with prob_model='full' (default behaviour, implementing Hopkins and King's original
+idea). This method consists of estimating Q(Y) by solving:
+
+Q(X) = \sum_i Q(X|Y=i) Q(Y=i)
+
+without resorting to estimating the posteriors Q(Y=i|X), by solving a linear least-squares problem.
+However, since Q(X) and Q(X|Y=i) are matrices of shape (2^K, 1) and (2^K, n), with K the number of features
+and n the number of classes, their calculation becomes intractable. ReadMe instead performs bagging (i.e., it
+samples small sets of features and averages the results) thus reducing K to a few terms. In our example we
+set K (bagging_range) to 20, and the number of bagging_trials to 100.
+
+ReadMe also computes confidence intervals via bootstrap. We set the number of bootstrap trials to 100.
+"""
+readme = ReadMe(prob_model='full', bootstrap_trials=100, bagging_trials=100, bagging_range=20, random_state=0, verbose=True)
+readme.fit(*train.Xy)  # <- there is actually nothing happening here (only bootstrap resampling); the method is "lazy"
+                       #    and postpones most of the calculations to the test phase.
+
+# since the method is slow, we will only test 3 cases with different imbalances
+few_negatives = [0.25, 0.75]
+balanced = [0.5, 0.5]
+few_positives = [0.75, 0.25]
+
+for test_prev in [few_negatives, balanced, few_positives]:
+    sample = reviews.test.sampling(500, *test_prev, random_state=0)  # draw sets of 500 documents with desired prevs
+    prev_estim, conf = readme.predict_conf(sample.X)
+    err = qp.error.mae(sample.prevalence(), prev_estim)
+    print(f'true-prevalence={F.strprev(sample.prevalence())},\n'
+          f'predicted-prevalence={F.strprev(prev_estim)}, with confidence intervals {conf},\n'
+          f'MAE={err:.4f}')
+
+
+
diff --git a/examples/2.custom_quantifier.py b/examples/2.custom_quantifier.py
new file mode 100644
index 0000000..ac6f7b5
--- /dev/null
+++ b/examples/2.custom_quantifier.py
@@ -0,0 +1,155 @@
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.base import BinaryQuantifier, BaseQuantifier
+from quapy.model_selection import GridSearchQ
+from quapy.method.aggregative import AggregativeSoftQuantifier
+from quapy.protocol import APP
+import quapy.functional as F
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from time import time
+
+
+# Define a custom quantifier: for this example, we will consider a new quantification algorithm that uses a
+# logistic regressor for generating posterior probabilities, and then applies a custom threshold value to the
+# posteriors. Since the quantifier internally uses a classifier, it is an aggregative quantifier; and since it
+# relies on posterior probabilities, it is a probabilistic-aggregative quantifier (aka AggregativeSoftQuantifier).
+# Note also that it has an internal hyperparameter (let's say, alpha) which is the decision threshold.
+#
+# Let's also assume the quantifier is binary, for simplicity.
Any quantifier (i.e., any subclass of BaseQuantifier)
+# is required to implement the "fit" and "predict" methods. Aggregative quantifiers are special subtypes of base
+# quantifiers, i.e., are quantifiers that undertake a classification-phase followed by an aggregation-phase. QuaPy
+# already implements most common functionality, and requires the developer to simply implement the "aggregation_fit"
+# and the "aggregate" methods.
+#
+# We are providing two implementations of the same method to illustrate this characteristic of QuaPy. Let us begin
+# with the general case, in which we implement a (base) quantifier
+
+class MyQuantifier(BaseQuantifier):
+
+    def __init__(self, classifier, alpha=0.5):
+        self.alpha = alpha
+        self.classifier = classifier
+
+    # in general, we would need to implement the method fit(self, X, y); this would amount to:
+    def fit(self, X, y):
+        n_classes = F.num_classes_from_labels(y)
+        assert n_classes==2, \
+            'this quantifier is only valid for binary problems [abort]'
+        self.classifier.fit(X, y)
+        return self
+
+    # in general, we would need to implement the method predict(self, X); this would amount to:
+    def predict(self, X):
+        assert hasattr(self.classifier, 'predict_proba'), \
+            'the underlying classifier is not probabilistic! [abort]'
+        posterior_probabilities = self.classifier.predict_proba(X)
+        positive_probabilities = posterior_probabilities[:, 1]
+        crisp_decisions = positive_probabilities > self.alpha
+        pos_prev = crisp_decisions.mean()
+        neg_prev = 1 - pos_prev
+        return np.asarray([neg_prev, pos_prev])
+
+
+# Note that the above implementation contains a lot of boilerplate code. Many parts can be omitted since QuaPy
+# provides implementations for them. Some of these routines (like, for example, training a classifier and generating
+# posterior probabilities) are often carried out in a k-fold cross-validation manner. These, along with many other
+# common routines are already provided by highly-optimized routines in QuaPy. Let's see a much better implementation
+# of the method, now adhering to the AggregativeSoftQuantifier:
+
+class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
+
+    def __init__(self, classifier, alpha=0.5):
+        # aggregative quantifiers have an internal attribute called self.classifier, but this is defined
+        # within the super's init
+        super().__init__(classifier, fit_classifier=True, val_split=None)
+        self.alpha = alpha
+
+    # since this method is of type aggregative, we can simply implement the method aggregation_fit, which
+    # assumes the classifier has already been fitted properly and the predictions for the training set required
+    # to train the aggregation function have been properly generated (i.e., on a validation split, or using a
+    # k-fold cross validation strategy). What remains ahead is to learn an aggregation function. In our case
+    # this amounts to doing... nothing, since our method was pretty basic. BinaryQuantifier also adds some
+    # basic functionality for checking binary consistency.
+    def aggregation_fit(self, classif_predictions, labels):
+        pass
+
+    # since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
+    # only describe what to do with the classifier predictions --which in this case are posterior probabilities
+    # because we are inheriting from the "Soft" subtype).
This comes down to:
+    def aggregate(self, classif_predictions: np.ndarray):
+        # the posterior probabilities have already been generated by the quantify method; we only need to
+        # specify what to do with them
+        positive_probabilities = classif_predictions[:, 1]
+        crisp_decisions = positive_probabilities > self.alpha
+        pos_prev = crisp_decisions.mean()
+        neg_prev = 1-pos_prev
+        return np.asarray([neg_prev, pos_prev])
+
+
+# a small example using these two implementations of our method
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 250
+
+    # load the IMDb dataset
+    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
+    train, val = train.split_stratified(train_prop=0.75)  # let's create a validation set for optimizing hyperparams
+
+    def try_implementation(quantifier):
+        class_name = quantifier.__class__.__name__
+        print(f'\ntesting implementation {class_name}...')
+        # model selection
+        # let us assume we want to explore our hyperparameter alpha along with one hyperparameter of the classifier
+        tinit = time()
+        param_grid = {
+            'alpha': np.linspace(0, 1, 11),  # quantifier-dependent hyperparameter
+            'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
+        }
+        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(*train.Xy)
+        t_modsel = time() - tinit
+        print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)
+
+        # evaluation
+        optimized_model = gridsearch.best_model_
+        mae = qp.evaluation.evaluate(
+            optimized_model,
+            protocol=APP(test, repeats=500, sanity_check=None),  # disable the check, we want to generate many tests!
+            error_metric='mae',
+            verbose=True)
+
+        t_eval = time() - t_modsel - tinit
+        print(f'\tevaluation took {t_eval:.2f}s [MAE = {mae:.4f}]')
+
+    # define an instance of our custom quantifier and test it!
+    quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
+    try_implementation(quantifier)
+
+    # define an instance of our custom quantifier, with the second implementation, and test it!
+    quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
+    try_implementation(quantifier)
+
+    # the output should look like this:
+    """
+    testing implementation MyQuantifier...
+    model selection took 12.86s
+    predicting: 100%|██████████| 105000/105000 [00:22<00:00, 4626.30it/s]
+    evaluation took 22.75s [MAE = 0.0630]
+
+    testing implementation MyAggregativeSoftQuantifier...
+    model selection took 3.10s
+    speeding up the prediction for the aggregative quantifier, total classifications 25000 instead of 26250000
+    predicting: 100%|██████████| 105000/105000 [00:04<00:00, 22779.62it/s]
+    evaluation took 4.66s [MAE = 0.0630]
+    """
+    # Note that the first implementation is much slower, both in terms of grid-search optimization and in terms of
+    # evaluation. The reason is that QuaPy is highly optimized for aggregative quantifiers (by far, the most
+    # popular type of quantification methods), thus significantly speeding up model selection and test routines.
+    # Furthermore, it is simpler to extend an aggregation type since QuaPy implements boilerplate functions for you.
+
+    # Final remarks: this method is only for demonstration purposes and makes little sense in general. The method relies
+    # on a hyperparameter alpha for binarizing the posterior probabilities. A much better way for fulfilling this
+    # goal would be to calibrate the classifier (LogisticRegression is already reasonably well calibrated) and then
+    # simply cut at 0.5.
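+
+    # A minimal sketch of that calibration-based alternative (hypothetical code, not part of the example above;
+    # CalibratedClassifierCV is scikit-learn's standard calibration wrapper):
+    #
+    #   from sklearn.calibration import CalibratedClassifierCV
+    #   calibrated = CalibratedClassifierCV(LogisticRegression())
+    #   quantifier = MyAggregativeSoftQuantifier(calibrated, alpha=0.5)  # 0.5 is now a principled cutoff
+    #   try_implementation(quantifier)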
+
diff --git a/examples/3.custom_collection.py b/examples/3.custom_collection.py
new file mode 100644
index 0000000..13baeef
--- /dev/null
+++ b/examples/3.custom_collection.py
@@ -0,0 +1,103 @@
+import quapy as qp
+from quapy.method.aggregative import PACC
+from quapy.data import LabelledCollection, Dataset
+from quapy.protocol import ArtificialPrevalenceProtocol
+import quapy.functional as F
+import os
+from os.path import join
+
+# While quapy comes with ready-to-use datasets for experimental purposes, you may prefer to run experiments using
+# your own data. Most of quapy's functionality relies on an internal class called LabelledCollection, for fast
+# indexing and sampling, and so this example provides guidance on how to convert your datasets into a LabelledCollection
+# so that all the functionality becomes available. This includes procedures for tuning the hyperparameters of your methods,
+# evaluating the performance using high level sampling protocols, etc.
+
+# Let us assume that we have a binary sentiment dataset of opinions in natural language. We will use the "IMDb"
+# dataset of reviews, which can be downloaded as follows
+URL_TRAIN = 'https://zenodo.org/record/4117827/files/imdb_train.txt'
+URL_TEST = 'https://zenodo.org/record/4117827/files/imdb_test.txt'
+os.makedirs('./reviews', exist_ok=True)
+train_path = join('reviews', 'imdb_train.txt')
+test_path = join('reviews', 'imdb_test.txt')
+qp.util.download_file_if_not_exists(URL_TRAIN, train_path)
+qp.util.download_file_if_not_exists(URL_TEST, test_path)
+
+# these files contain 2 columns separated by a \t:
+# the first one is a binary value (0=negative, 1=positive), and the second is the text
+# All we need to do is implement a function returning the instances and the labels, as follows
+def my_data_loader(path):
+    with open(path, 'rt') as fin:
+        labels, texts = zip(*[line.split('\t') for line in fin.readlines()])
+        labels = list(map(int, labels))  # convert string numbers to int
+        return texts, labels
+
+# check that our function is working properly...
+train_texts, train_labels = my_data_loader(train_path)
+for i, (text, label) in enumerate(zip(train_texts, train_labels)):
+    print(f'#{i}: {label=}\t{text=}')
+    if i>=5:
+        print('...')
+        break
+
+# We can now instantiate a LabelledCollection simply as
+train_lc = LabelledCollection(instances=train_texts, labels=train_labels)
+print('my training collection:', train_lc)
+
+# We can instantiate directly a LabelledCollection using the data loader function,
+# without having to load the data ourselves:
+train_lc = LabelledCollection.load(train_path, loader_func=my_data_loader)
+print('my training collection:', train_lc)
+
+# We can do the same for the test set, or we can instead directly instantiate a Dataset object (this is by and large
+# simply a tuple with training and test LabelledCollections) as follows:
+my_data = Dataset.load(train_path, test_path, loader_func=my_data_loader)
+print('my dataset:', my_data)
+
+# However, since this is a textual dataset, we must vectorize it prior to training any quantification algorithm.
+# We can do this in several ways in quapy. For example, manually...
+# from sklearn.feature_extraction.text import TfidfVectorizer
+# tfidf = TfidfVectorizer(min_df=5)
+# Xtr = tfidf.fit_transform(my_data.training.instances)
+# Xte = tfidf.transform(my_data.test.instances)
+# ...
or using some preprocessing functionality of quapy (recommended):
+my_data_tfidf = qp.data.preprocessing.text2tfidf(my_data, min_df=5)
+
+training, test = my_data_tfidf.train_test
+
+# Once you have loaded your training and test data, you have access to a series of quapy's utilities, e.g.:
+print(f'the training prevalence is {F.strprev(training.prevalence())}')
+print(f'the test prevalence is {F.strprev(test.prevalence())}')
+print(f'let us generate a small balanced training sample:')
+desired_size = 200
+desired_prevalence = [0.5, 0.5]
+small_training_balanced = training.sampling(desired_size, *desired_prevalence, shuffle=True, random_state=0)
+print(small_training_balanced)
+print(f'or generating train/val splits such as: {training.split_stratified(train_prop=0.7)}')
+
+# training
+print('let us train a simple quantifier:...')
+Xtr, ytr = training.Xy
+quantifier = PACC()
+quantifier.fit(Xtr, ytr)  # or: quantifier.fit(*training.Xy)
+
+# test
+print("and use quapy's evaluation functions")
+evaluation_protocol = ArtificialPrevalenceProtocol(
+    data=test,
+    sample_size=200,
+    random_state=0
+)
+
+report = qp.evaluation.evaluation_report(quantifier, protocol=evaluation_protocol, error_metrics=['ae'])
+print(report)
+print(f'mean absolute error across {len(report)} experiments: {report.mean(numeric_only=True)}')
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/4.using_pretrained_classifier.py b/examples/4.using_pretrained_classifier.py
new file mode 100644
index 0000000..5b5ead5
--- /dev/null
+++ b/examples/4.using_pretrained_classifier.py
@@ -0,0 +1,75 @@
+"""
+Aggregative quantifiers use an underlying classifier. Often, one has a pre-trained classifier available, and
+needs to use this classifier as the basis of a quantification system. In such cases, the classifier should not
+be retrained, but only used to issue classifier predictions for the quantifier.
+In this example, we show how to instantiate a quantifier with a pre-trained classifier.
+"""
+from typing import List, Dict
+
+import quapy as qp
+from quapy.method.aggregative import PACC
+from sklearn.base import BaseEstimator, ClassifierMixin
+from transformers import pipeline
+import numpy as np
+import quapy.functional as F
+
+
+# A scikit-learn-style wrapper for a huggingface-based pre-trained transformer for binary sentiment classification
+class HFTextClassifier(BaseEstimator, ClassifierMixin):
+    def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
+        self.pipe = pipeline("sentiment-analysis", model=model_name)
+        self.classes_ = np.asarray([0,1])
+
+    def fit(self, X, y=None):
+        return self
+
+    def _binary_decisions(self, transformer_output: List[Dict]):
+        return np.array([(1 if p['label']=='POSITIVE' else 0) for p in transformer_output], dtype=int)
+
+    def predict(self, X):
+        X = list(map(str, X))
+        preds = self.pipe(X, truncation=True)
+        return self._binary_decisions(preds)
+
+    def predict_proba(self, X):
+        X = list(map(str, X))
+        n_examples = len(X)
+        preds = self.pipe(X, truncation=True)
+        decisions = self._binary_decisions(preds)
+        scores = np.array([p['score'] for p in preds], dtype=float)
+        probas = np.zeros(shape=(len(X), 2), dtype=float)
+        probas[np.arange(n_examples), decisions] = scores
+        # note: for an int array of 0s and 1s, ~decisions yields -1s and -2s which, by negative indexing,
+        # address the complementary column of a 2-column array
+        probas[np.arange(n_examples), ~decisions] = 1-scores
+        return probas
+
+# load a sentiment dataset
+dataset = qp.datasets.fetch_reviews('imdb', tfidf=False)  # raw text
+train, test = dataset.training, dataset.test
+
+# instantiate a pre-trained classifier
+clf = HFTextClassifier()
+
+# Let us fit a quantifier based on our pre-trained classifier.
+# Note that, since the classifier is already fit, we will use the entire training set for
+# learning the aggregation function of the quantifier.
+# To do so, we only need to indicate "fit_classifier"=False, as follows:
+quantifier = PACC(clf, fit_classifier=False)  # Probabilistic Classify & Count using a pre-trained model
+
+print('training PACC...')
+quantifier.fit(*train.Xy)
+
+# let us simulate some shifted test data...
+new_prevalence = [0.75, 0.25]
+shifted_test = test.sampling(500, *new_prevalence, random_state=0)
+
+# and do some evaluation
+print('predicting with PACC...')
+estim_prevalence = quantifier.predict(shifted_test.X)
+
+print('Result:\n'+('='*20))
+print(f'training prevalence: {F.strprev(train.prevalence())}')
+print(f'(shifted) test prevalence: {F.strprev(shifted_test.prevalence())}')
+print(f'estimated prevalence: {F.strprev(estim_prevalence)}')
+
+absolute_error = qp.error.ae(new_prevalence, estim_prevalence)
+print(f'absolute error={absolute_error:.4f}')
\ No newline at end of file
diff --git a/examples/lequa2022_experiments.py b/examples/5a.lequa2022_experiments.py
similarity index 86%
rename from examples/lequa2022_experiments.py
rename to examples/5a.lequa2022_experiments.py
index f3eec55..40632d5 100644
--- a/examples/lequa2022_experiments.py
+++ b/examples/5a.lequa2022_experiments.py
@@ -15,7 +15,7 @@ https://lequa2022.github.io/index (the site of the competition)
 https://ceur-ws.org/Vol-3180/paper-146.pdf (the overview paper)
 """
 
-# there are 4 tasks (T1A, T1B, T2A, T2B)
+# there are 4 tasks (T1A, T1B, T2A, T2B), let us simply consider T1A (binary quantification, vector form)
 task = 'T1A'
 
 # set the sample size in the environment.
The sample size is task-dependent and can be consulted by doing:
@@ -28,18 +28,19 @@ qp.environ['N_JOBS'] = -1
 # of SamplesFromDir, a protocol that simply iterates over pre-generated samples (those provided for the competition)
 # stored in a directory.
 training, val_generator, test_generator = fetch_lequa2022(task=task)
+Xtr, ytr = training.Xy
 
 # define the quantifier
-quantifier = EMQ(classifier=LogisticRegression())
+quantifier = EMQ(classifier=LogisticRegression(), val_split=5)
 
 # model selection
 param_grid = {
     'classifier__C': np.logspace(-3, 3, 7),         # classifier-dependent: inverse of regularization strength
     'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class
-    'recalib': ['bcts', 'platt', None]              # quantifier-dependent: recalibration method (new in v0.1.7)
+    'calib': ['bcts', None]                         # quantifier-dependent: recalibration method (new in v0.1.7)
 }
 model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
-quantifier = model_selection.fit(training)
+quantifier = model_selection.fit(Xtr, ytr)
 
 # evaluation
 report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
@@ -50,4 +51,4 @@ report['estim-prev'] = report['estim-prev'].map(F.strprev)
 print(report)
 
 print('Averaged values:')
-print(report.mean())
+print(report.mean(numeric_only=True))
diff --git a/examples/5b.lequa2024_experiments.py b/examples/5b.lequa2024_experiments.py
new file mode 100644
index 0000000..351fed1
--- /dev/null
+++ b/examples/5b.lequa2024_experiments.py
@@ -0,0 +1,55 @@
+import quapy as qp
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import quapy.functional as F
+from quapy.data.datasets import LEQUA2024_SAMPLE_SIZE, fetch_lequa2024
+from quapy.evaluation import evaluation_report
+from quapy.method.aggregative import KDEyML
+from quapy.model_selection import GridSearchQ
+import pandas as pd
+
+"""
+This example shows how to use the LeQua datasets (new in v0.1.9). For more information about the datasets, and the
+LeQua competition itself, check:
+https://lequa2024.github.io/index (the site of the competition)
+"""
+
+
+# there are 4 tasks: T1 (binary), T2 (multiclass), T3 (ordinal), T4 (binary - covariate & prior shift)
+task = 'T2'
+
+# set the sample size in the environment. The sample size is task-dependent and can be consulted by doing:
+qp.environ['SAMPLE_SIZE'] = LEQUA2024_SAMPLE_SIZE[task]
+qp.environ['N_JOBS'] = -1
+
+# the fetch method returns a training set (an instance of LabelledCollection) and two generators: one for the
+# validation set and another for the test sets. These generators are both instances of classes that extend
+# AbstractProtocol (i.e., classes that implement sampling generation procedures) and, in particular, are instances
+# of SamplesFromDir, a protocol that simply iterates over pre-generated samples (those provided for the competition)
+# stored in a directory.
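+# As a usage sketch (illustrative only, and not needed for the experiment below): protocols are callable
+# and yield (sample, prevalence) pairs, e.g.:
+#
+#   for X, prev in val_generator():
+#       ...  # X holds the sample's instances; prev holds its true prevalence vector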
+training, val_generator, test_generator = fetch_lequa2024(task=task)
+Xtr, ytr = training.Xy
+
+# define the quantifier
+quantifier = KDEyML(classifier=LogisticRegression())
+
+# model selection
+param_grid = {
+    'classifier__C': np.logspace(-3, 3, 7),         # classifier-dependent: inverse of regularization strength
+    'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class
+    'bandwidth': np.linspace(0.01, 0.2, 20)         # quantifier-dependent: bandwidth of the kernel
+}
+
+model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
+quantifier = model_selection.fit(Xtr, ytr)
+
+# evaluation
+report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae'], verbose=True)
+
+# printing results
+pd.set_option('display.expand_frame_repr', False)
+report['estim-prev'] = report['estim-prev'].map(F.strprev)
+print(report)
+
+print('Averaged values:')
+print(report.mean(numeric_only=True))
diff --git a/examples/quanet_example.py b/examples/6.quanet_example.py
similarity index 87%
rename from examples/quanet_example.py
rename to examples/6.quanet_example.py
index 4be3132..bbcad5d 100644
--- a/examples/quanet_example.py
+++ b/examples/6.quanet_example.py
@@ -20,14 +20,13 @@ train, test = dataset.train_test
 # train the text classifier:
 cnn_module = CNNnet(dataset.vocabulary_size, dataset.training.n_classes)
 cnn_classifier = NeuralClassifierTrainer(cnn_module, device='cuda')
-cnn_classifier.fit(*dataset.training.Xy)
 
 # train QuaNet (alternatively, we can set fit_classifier=True and let QuaNet train the classifier)
 quantifier = QuaNet(cnn_classifier, device='cuda')
-quantifier.fit(train, fit_classifier=False)
+quantifier.fit(*train.Xy)
 
 # prediction and evaluation
-estim_prevalence = quantifier.quantify(test.instances)
+estim_prevalence = quantifier.predict(test.instances)
 mae = qp.error.mae(test.prevalence(), estim_prevalence)
 
 print(f'true prevalence: {F.strprev(test.prevalence())}')
diff --git a/examples/uci_experiments.py b/examples/7.uci_binary_experiments.py
similarity index 67%
rename from examples/uci_experiments.py
rename to examples/7.uci_binary_experiments.py
index 2cf5bac..04e07ee 100644
--- a/examples/uci_experiments.py
+++ b/examples/7.uci_binary_experiments.py
@@ -1,4 +1,7 @@
 from copy import deepcopy
+from pathlib import Path
+
+import pandas as pd
 
 import quapy as qp
 from sklearn.calibration import CalibratedClassifierCV
@@ -15,6 +18,18 @@ import itertools
 import argparse
 import torch
 import shutil
+from glob import glob
+
+
+"""
+This example shows how to generate experiments for the UCI ML repository binary datasets following the protocol
+proposed in "Pérez-Gállego, P., Quevedo, J. R., and del Coz, J. J. Using ensembles for problems with characterizable
+changes in data distribution: A case study on quantification. Information Fusion 34 (2017), 87–100."
+
+This example covers the most important steps in the experimentation pipeline, namely, the training and optimization
+of the hyperparameters of different quantifiers, and the evaluation of these quantifiers based on standard
+prevalence sampling protocols aimed at simulating different levels of prior probability shift.
+"""
 
 N_JOBS = -1
@@ -28,13 +43,14 @@ def newLR():
     return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
 
 
-def calibratedLR():
-    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))
-
-
 __C_range = np.logspace(-3, 3, 7)
-lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
-svmperf_params = {'classifier__C': __C_range}
+lr_params = {
+    'classifier__C': __C_range,
+    'classifier__class_weight': [None, 'balanced']
+}
+svmperf_params = {
+    'classifier__C': __C_range
+}
 
 
 def quantification_models():
@@ -45,7 +61,7 @@ def quantification_models():
     yield 'MAX', MAX(newLR()), lr_params
     yield 'MS', MS(newLR()), lr_params
     yield 'MS2', MS2(newLR()), lr_params
-    yield 'sldc', EMQ(newLR(), recalib='platt'), lr_params
+    yield 'sldc', EMQ(newLR()), lr_params
     yield 'svmmae', newSVMAE(), svmperf_params
     yield 'hdy', HDy(newLR()), lr_params
 
@@ -69,6 +85,13 @@ def result_path(path, dataset_name, model_name, run, optim_loss):
     return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl')
 
 
+def parse_result_path(path):
+    *dataset, method, run, metric = Path(path).stem.split('-')
+    dataset = '-'.join(dataset)
+    run = int(run.replace('run',''))
+    return dataset, method, run, metric
+
+
 def is_already_computed(dataset_name, model_name, run, optim_loss):
     return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss))
 
@@ -84,7 +107,7 @@ def run(experiment):
     optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
     if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return
 
-    collection = qp.datasets.fetch_UCILabelledCollection(dataset_name)
+    collection = qp.datasets.fetch_UCIBinaryLabelledCollection(dataset_name)
     for run, data in enumerate(qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1)):
         if is_already_computed(dataset_name, model_name, run=run, optim_loss=optim_loss):
             print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5 already computed.')
@@ -93,8 +116,8 @@ def run(experiment):
         print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')
         # model selection (hyperparameter optimization for a quantification-oriented loss)
         train, test = data.train_test
-        train, val = train.split_stratified()
         if hyperparams is not None:
+            train, val = train.split_stratified()
             model_selection = qp.model_selection.GridSearchQ(
                 deepcopy(model),
                 param_grid=hyperparams,
@@ -102,13 +125,13 @@ def run(experiment):
                 error=optim_loss,
                 refit=True,
                 timeout=60*60,
-                verbose=True
+                verbose=False
             )
-            model_selection.fit(data.training)
+            model_selection.fit(*train.Xy)
             model = model_selection.best_model()
             best_params = model_selection.best_params_
         else:
-            model.fit(data.training)
+            model.fit(*train.Xy)
             best_params = {}
 
         # model evaluation
@@ -116,19 +139,37 @@ def run(experiment):
             model,
             protocol=APP(test, n_prevalences=21, repeats=100)
         )
-        test_true_prevalence = data.test.prevalence()
+        test_true_prevalence = test.prevalence()
 
         evaluate_experiment(true_prevalences, estim_prevalences)
         save_results(dataset_name, model_name, run, optim_loss,
                      true_prevalences, estim_prevalences,
-                     data.training.prevalence(), test_true_prevalence,
+                     train.prevalence(), test_true_prevalence,
                      best_params)
 
 
+def show_results(result_folder):
+    result_data = []
+    for file in glob(os.path.join(result_folder,'*.pkl')):
+        true_prevalences, estim_prevalences, *_ = pickle.load(open(file, 'rb'))
+        dataset, method, run, metric = parse_result_path(file)
+        mae =
qp.error.mae(true_prevalences, estim_prevalences)
+        result_data.append({
+            'dataset': dataset,
+            'method': method,
+            'run': run,
+            metric: mae
+        })
+    df = pd.DataFrame(result_data)
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.expand_frame_repr", False)
+    print(df.pivot_table(index='dataset', columns='method', values=metric))
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Run experiments for Twitter Sentiment Quantification')
-    parser.add_argument('results', metavar='RESULT_PATH', type=str,
-                        help='path to the directory where to store the results')
+    parser.add_argument('--results', metavar='RESULT_PATH', type=str,
+                        help='path to the directory where to store the results', default='./results/uci_binary')
     parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
                         help='path to the directory with svmperf')
     parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint',
@@ -141,7 +182,7 @@ if __name__ == '__main__':
     qp.environ['SVMPERF_HOME'] = args.svmperfpath
 
     optim_losses = ['mae']
-    datasets = qp.datasets.UCI_DATASETS
+    datasets = qp.datasets.UCI_BINARY_DATASETS
 
     models = quantification_models()
     qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)
@@ -150,3 +191,5 @@ if __name__ == '__main__':
         qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=CUDA_N_JOBS)
 
     shutil.rmtree(args.checkpointdir, ignore_errors=True)
+
+    show_results(args.results)
diff --git a/examples/8.uci_multiclass_experiments.py b/examples/8.uci_multiclass_experiments.py
new file mode 100644
index 0000000..06f7ea7
--- /dev/null
+++ b/examples/8.uci_multiclass_experiments.py
@@ -0,0 +1,131 @@
+import os
+from time import time
+from collections import defaultdict
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import UPP
+from pathlib import Path
+
+"""
+This example is the analogous counterpart of example 7, but involving multiclass quantification problems
+using datasets from the UCI ML repository.
+"""
+
+
+SEED = 1
+
+
+def newLR():
+    return LogisticRegression(max_iter=3000)
+
+# typical hyperparameters explored for Logistic Regression
+logreg_grid = {
+    'C': np.logspace(-3, 3, 7),
+    'class_weight': ['balanced', None]
+}
+
+def wrap_hyper(classifier_hyper_grid:dict):
+    return {'classifier__'+k:v for k, v in classifier_hyper_grid.items()}
+
+METHODS = [
+    ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
+    ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
+    ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}),
+]
+
+
+def show_results(result_path):
+    import pandas as pd
+    df = pd.read_csv(result_path+'.csv', sep='\t')
+    pd.set_option('display.max_columns', None)
+    pd.set_option('display.max_rows', None)
+    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True)
+    print(pv)
+
+
+def load_timings(result_path):
+    import pandas as pd
+    timings = defaultdict(lambda: {})
+    if not Path(result_path + '.csv').exists():
+        return timings
+
+    df = pd.read_csv(result_path+'.csv', sep='\t')
+    return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
+
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    result_dir = 'results/uci_multiclass'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    global_result_path = f'{result_dir}/allmethods'
+    timings = load_timings(global_result_path)
+    with open(global_result_path + '.csv', 'wt') as csv:
+        csv.write(f'Method\tDataset\tMAE\tMRAE\tt_train\n')
+
+    for method_name, quantifier, param_grid in METHODS:
+
+        print('Init method', method_name)
+
+        with open(global_result_path + '.csv', 'at') as csv:
+
+            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+
+                print('init', dataset)
+
+                local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe')
+
+                if os.path.exists(local_result_path):
+                    print(f'result file {local_result_path} already exists; skipping')
+                    report = qp.util.load_report(local_result_path)
+
+                else:
+                    with qp.util.temp_seed(SEED):
+
+                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
+
+                        # model selection
+                        train, test = data.train_test
+                        train, val = train.split_stratified(random_state=SEED)
+
+                        protocol = UPP(val, repeats=n_bags_val)
+                        modsel = GridSearchQ(
+                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
+                        )
+
+                        t_init = time()
+                        try:
+                            modsel.fit(*train.Xy)
+
+                            print(f'best params {modsel.best_params_}')
+                            print(f'best score {modsel.best_score_}')
+
+                            quantifier = modsel.best_model()
+                        except Exception:
+                            print('something went wrong...
trying to fit the default model') + quantifier.fit(*train.Xy) + + timings[method_name][dataset] = time() - t_init + + + protocol = UPP(test, repeats=n_bags_test) + report = qp.evaluation.evaluation_report( + quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True + ) + report.to_csv(local_result_path) + + means = report.mean(numeric_only=True) + csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{timings[method_name][dataset]:.3f}\n') + csv.flush() + + show_results(global_result_path) \ No newline at end of file diff --git a/examples/9.ifcb_experiments.py b/examples/9.ifcb_experiments.py new file mode 100644 index 0000000..580be6b --- /dev/null +++ b/examples/9.ifcb_experiments.py @@ -0,0 +1,61 @@ +import numpy as np + +import quapy as qp +from sklearn.linear_model import LogisticRegression + +from quapy.model_selection import GridSearchQ +from quapy.evaluation import evaluation_report + +""" +This example shows a complete experiment using the IFCB Plankton dataset; +see https://hlt-isti.github.io/QuaPy/manuals/datasets.html#ifcb-plankton-dataset + +Note that this dataset can be downloaded in two modes: for model selection or for evaluation. + +See also: +Automatic plankton quantification using deep features +P González, A Castaño, EE Peacock, J Díez, JJ Del Coz, HM Sosik +Journal of Plankton Research 41 (4), 449-463 +""" + + +print('Quantifying the IFCB dataset with PACC\n') + +# model selection +print('loading dataset for model selection...', end='') +train, val_gen = qp.datasets.fetch_IFCB(for_model_selection=True, single_sample_train=True) +print('[done]') +print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}') +print(f'\tvalidation samples={val_gen.total()}') + +print('model selection starts') +quantifier = qp.method.aggregative.PACC(LogisticRegression()) + +mod_sel = GridSearchQ( + quantifier, + param_grid={ + 'classifier__C': np.logspace(-3,3,7), + 'classifier__class_weight': [None, 'balanced'] + }, + protocol=val_gen, + refit=False, + n_jobs=-1, + verbose=True, + raise_errors=True +).fit(*train.Xy) + +print(f'model selection chose hyperparameters: {mod_sel.best_params_}') +quantifier = mod_sel.best_model_ + +print('loading dataset for test...', end='') +train, test_gen = qp.datasets.fetch_IFCB(for_model_selection=False, single_sample_train=True) +print('[done]') +print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}') +print(f'\ttest samples={test_gen.total()}') + +print('training on the whole dataset before test') +quantifier.fit(*train.Xy) + +print('testing...') +report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True) +print(report.mean()) diff --git a/examples/custom_quantifier.py b/examples/custom_quantifier.py deleted file mode 100644 index 31a69cd..0000000 --- a/examples/custom_quantifier.py +++ /dev/null @@ -1,69 +0,0 @@ -import quapy as qp -from quapy.data import LabelledCollection -from quapy.method.base import BinaryQuantifier -from quapy.model_selection import GridSearchQ -from quapy.method.aggregative import AggregativeProbabilisticQuantifier -from quapy.protocol import APP -import numpy as np -from sklearn.linear_model import LogisticRegression - - -# Define a custom quantifier: for this example, we will consider a new quantification algorithm that uses a -# logistic regressor for generating posterior probabilities, and then applies a custom threshold value to the -# posteriors. 
Since the quantifier internally uses a classifier, it is an aggregative quantifier; and since it -# relies on posterior probabilities, it is a probabilistic-aggregative quantifier. Note also it has an -# internal hyperparameter (let say, alpha) which is the decision threshold. Let's also assume the quantifier -# is binary, for simplicity. - -class MyQuantifier(AggregativeProbabilisticQuantifier, BinaryQuantifier): - def __init__(self, classifier, alpha=0.5): - self.alpha = alpha - # aggregative quantifiers have an internal self.classifier attribute - self.classifier = classifier - - def fit(self, data: LabelledCollection, fit_classifier=True): - assert fit_classifier, 'this quantifier needs to fit the classifier!' - self.classifier.fit(*data.Xy) - return self - - # in general, we would need to implement the method quantify(self, instances) but, since this method is of - # type aggregative, we can simply implement the method aggregate, which has the following interface - def aggregate(self, classif_predictions: np.ndarray): - # the posterior probabilities have already been generated by the quantify method; we only need to - # specify what to do with them - positive_probabilities = classif_predictions[:, 1] - crisp_decisions = positive_probabilities > self.alpha - pos_prev = crisp_decisions.mean() - neg_prev = 1-pos_prev - return np.asarray([neg_prev, pos_prev]) - - -if __name__ == '__main__': - - qp.environ['SAMPLE_SIZE'] = 100 - - # define an instance of our custom quantifier - quantifier = MyQuantifier(LogisticRegression(), alpha=0.5) - - # load the IMDb dataset - train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test - - # model selection - # let us assume we want to explore our hyperparameter alpha along with one hyperparameter of the classifier - train, val = train.split_stratified(train_prop=0.75) - param_grid = { - 'alpha': np.linspace(0, 1, 11), # quantifier-dependent hyperparameter - 'classifier__C': np.logspace(-2, 2, 5) # classifier-dependent hyperparameter - } - quantifier = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(train) - - # evaluation - mae = qp.evaluation.evaluate(quantifier, protocol=APP(test), error_metric='mae') - - print(f'MAE = {mae:.4f}') - - # final remarks: this method is only for demonstration purposes and makes little sense in general. The method relies - # on an hyperparameter alpha for binarizing the posterior probabilities. A much better way for fulfilling this - # goal would be to calibrate the classifier (LogisticRegression is already reasonably well calibrated) and then - # simply cut at 0.5. - diff --git a/examples/distributing_samples.py b/examples/distributing_samples.py new file mode 100644 index 0000000..76a9731 --- /dev/null +++ b/examples/distributing_samples.py @@ -0,0 +1,38 @@ +""" +Imagine we want to generate many samples out of a collection, that we want to distribute for others to run their +own experiments in the very same test samples. One naive solution would come down to applying a given protocol to +our collection (say the artificial prevalence protocol on the 'academic-success' UCI dataset), store all those samples +on disk and make them available online. Distributing many such samples is undesirable. +In this example, we generate the indexes that allow anyone to regenerate the samples out of the original collection. 
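+The gist, as carried out below: instantiate the protocol with return_type='index', store the
+output of its samples_parameters() method, and let anyone rebuild the very same samples via
+ProtocolFromIndex. A minimal sketch:
+
+>>> indexes = UPP(test, sample_size=100, repeats=100, random_state=0, return_type='index').samples_parameters()
+>>> protocol = ProtocolFromIndex(data=test, indexes=indexes)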
+""" + +import quapy as qp +from quapy.method.aggregative import PACC +from quapy.protocol import UPP + +data = qp.datasets.fetch_UCIMulticlassDataset('academic-success') +train, test = data.train_test + +# let us train a quantifier to check whether we can actually replicate the results +quantifier = PACC() +quantifier.fit(train) + +# let us simulate our experimental results +protocol = UPP(test, sample_size=100, repeats=100, random_state=0) +our_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae') + +print(f'We have obtained a MAE={our_mae:.3f}') + +# let us distribute the indexes; we specify that we want the indexes, not the samples +protocol = UPP(test, sample_size=100, repeats=100, random_state=0, return_type='index') +indexes = protocol.samples_parameters() + +# Imagine we distribute the indexes; now we show how to replicate our experiments. +from quapy.protocol import ProtocolFromIndex +data = qp.datasets.fetch_UCIMulticlassDataset('academic-success') +train, test = data.train_test +protocol = ProtocolFromIndex(data=test, indexes=indexes) +their_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae') + +print(f'Another lab obtains a MAE={our_mae:.3f}') + diff --git a/examples/ensembles.py b/examples/ensembles.py new file mode 100644 index 0000000..84aeb2c --- /dev/null +++ b/examples/ensembles.py @@ -0,0 +1,56 @@ +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import MultinomialNB +from sklearn.neighbors import KNeighborsClassifier +from statsmodels.sandbox.distributions.genpareto import quant + +import quapy as qp +from quapy.protocol import UPP +from quapy.method.aggregative import PACC, DMy, EMQ, KDEyML +from quapy.method.meta import SCMQ, MCMQ, MCSQ +import warnings +warnings.filterwarnings("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore", category=ConvergenceWarning) + +qp.environ["SAMPLE_SIZE"]=100 + +def train_and_test_model(quantifier, train, test): + quantifier.fit(train) + report = qp.evaluation.evaluation_report(quantifier, UPP(test), error_metrics=['mae', 'mrae']) + print(quantifier.__class__.__name__) + print(report.mean(numeric_only=True)) + + +quantifiers = [ + PACC(), + DMy(), + EMQ(), + KDEyML() +] + +classifier = LogisticRegression() + +dataset_name = qp.datasets.UCI_MULTICLASS_DATASETS[0] +data = qp.datasets.fetch_UCIMulticlassDataset(dataset_name) +train, test = data.train_test + +scmq = SCMQ(classifier, quantifiers) + +train_and_test_model(scmq, train, test) + +# for quantifier in quantifiers: +# train_and_test_model(quantifier, train, test) + +classifiers = [ + LogisticRegression(), + KNeighborsClassifier(), + # MultinomialNB() +] + +mcmq = MCMQ(classifiers, quantifiers) + +train_and_test_model(mcmq, train, test) + +mcsq = MCSQ(classifiers, PACC()) + +train_and_test_model(mcsq, train, test) \ No newline at end of file diff --git a/examples/ifcb_experiments.py b/examples/ifcb_experiments.py deleted file mode 100644 index 4cf9448..0000000 --- a/examples/ifcb_experiments.py +++ /dev/null @@ -1,28 +0,0 @@ -import quapy as qp -from sklearn.linear_model import LogisticRegression -from quapy.evaluation import evaluation_report - - -def newLR(): - return LogisticRegression(n_jobs=-1) - - -quantifiers = [ - ('CC', qp.method.aggregative.CC(newLR())), - ('ACC', qp.method.aggregative.ACC(newLR())), - ('PCC', qp.method.aggregative.PCC(newLR())), - ('PACC', qp.method.aggregative.PACC(newLR())), - ('HDy', 
qp.method.aggregative.DMy(newLR())), - ('EMQ', qp.method.aggregative.EMQ(newLR())) -] - - -for quant_name, quantifier in quantifiers: - print("Experiment with "+quant_name) - - train, test_gen = qp.datasets.fetch_IFCB() - - quantifier.fit(train) - - report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True) - print(report.mean()) diff --git a/examples/lequa2022_experiments_recalib.py b/examples/lequa2022_experiments_recalib.py deleted file mode 100644 index a5a0e05..0000000 --- a/examples/lequa2022_experiments_recalib.py +++ /dev/null @@ -1,63 +0,0 @@ -import numpy as np -from abstention.calibration import NoBiasVectorScaling, VectorScaling, TempScaling -from sklearn.calibration import CalibratedClassifierCV -from sklearn.linear_model import LogisticRegression -import quapy as qp -import quapy.functional as F -from classification.calibration import RecalibratedProbabilisticClassifierBase, NBVSCalibration, \ - BCTSCalibration -from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022 -from evaluation import evaluation_report -from method.aggregative import EMQ -from model_selection import GridSearchQ -import pandas as pd - -for task in ['T1A', 'T1B']: - - # calibration = TempScaling(verbose=False, bias_positions='all') - - qp.environ['SAMPLE_SIZE'] = LEQUA2022_SAMPLE_SIZE[task] - training, val_generator, test_generator = fetch_lequa2022(task=task) - - # define the quantifier - # learner = BCTSCalibration(LogisticRegression(), n_jobs=-1) - # learner = CalibratedClassifierCV(LogisticRegression()) - learner = LogisticRegression() - quantifier = EMQ(classifier=learner) - - # model selection - param_grid = { - 'classifier__C': np.logspace(-3, 3, 7), - 'classifier__class_weight': ['balanced', None], - 'recalib': ['platt', 'ts', 'vs', 'nbvs', 'bcts', None], - 'exact_train_prev': [False, True] - } - model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', n_jobs=-1, refit=False, verbose=True) - quantifier = model_selection.fit(training) - - # evaluation - report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True) - - # import os - # os.makedirs(f'./out', exist_ok=True) - # with open(f'./out/EMQ_{calib}_{task}.txt', 'wt') as foo: - # estim_prev = report['estim-prev'].values - # nclasses = len(estim_prev[0]) - # foo.write(f'id,'+','.join([str(x) for x in range(nclasses)])+'\n') - # for id, prev in enumerate(estim_prev): - # foo.write(f'{id},'+','.join([f'{p:.5f}' for p in prev])+'\n') - # - # #os.makedirs(f'./errors/{task}', exist_ok=True) - # with open(f'./out/EMQ_{calib}_{task}_errors.txt', 'wt') as foo: - # maes, mraes = report['mae'].values, report['mrae'].values - # foo.write(f'id,AE,RAE\n') - # for id, (ae_i, rae_i) in enumerate(zip(maes, mraes)): - # foo.write(f'{id},{ae_i:.5f},{rae_i:.5f}\n') - - # printing results - pd.set_option('display.expand_frame_repr', False) - report['estim-prev'] = report['estim-prev'].map(F.strprev) - print(report) - - print('Averaged values:') - print(report.mean()) diff --git a/examples/model_selection.py b/examples/model_selection.py deleted file mode 100644 index ae7fb6a..0000000 --- a/examples/model_selection.py +++ /dev/null @@ -1,57 +0,0 @@ -import quapy as qp -from quapy.protocol import APP -from quapy.method.aggregative import DMy -from sklearn.linear_model import LogisticRegression -import numpy as np - -""" -In this example, we show how to perform model selection on a DistributionMatching quantifier. 
-""" - -model = DMy(LogisticRegression()) - -qp.environ['SAMPLE_SIZE'] = 100 -qp.environ['N_JOBS'] = -1 - -training, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test - -# The model will be returned by the fit method of GridSearchQ. -# Every combination of hyper-parameters will be evaluated by confronting the -# quantifier thus configured against a series of samples generated by means -# of a sample generation protocol. For this example, we will use the -# artificial-prevalence protocol (APP), that generates samples with prevalence -# values in the entire range of values from a grid (e.g., [0, 0.1, 0.2, ..., 1]). -# We devote 30% of the dataset for this exploration. -training, validation = training.split_stratified(train_prop=0.7) -protocol = APP(validation) - -# We will explore a classification-dependent hyper-parameter (e.g., the 'C' -# hyper-parameter of LogisticRegression) and a quantification-dependent hyper-parameter -# (e.g., the number of bins in a DistributionMatching quantifier. -# Classifier-dependent hyper-parameters have to be marked with a prefix "classifier__" -# in order to let the quantifier know this hyper-parameter belongs to its underlying -# classifier. -param_grid = { - 'classifier__C': np.logspace(-3,3,7), - 'nbins': [8, 16, 32, 64], -} - -model = qp.model_selection.GridSearchQ( - model=model, - param_grid=param_grid, - protocol=protocol, - error='mae', # the error to optimize is the MAE (a quantification-oriented loss) - refit=True, # retrain on the whole labelled set once done - verbose=True # show information as the process goes on -).fit(training) - -print(f'model selection ended: best hyper-parameters={model.best_params_}') -model = model.best_model_ - -# evaluation in terms of MAE -# we use the same evaluation protocol (APP) on the test set -mae_score = qp.evaluation.evaluate(model, protocol=APP(test), error_metric='mae') - -print(f'MAE={mae_score:.5f}') - - diff --git a/experimental_non_aggregative/custom_vectorizers.py b/experimental_non_aggregative/custom_vectorizers.py new file mode 100644 index 0000000..13337b9 --- /dev/null +++ b/experimental_non_aggregative/custom_vectorizers.py @@ -0,0 +1,254 @@ +from scipy.sparse import csc_matrix, csr_matrix +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer +import numpy as np +from joblib import Parallel, delayed +import sklearn +import math +from scipy.stats import t + + +class ContTable: + def __init__(self, tp=0, tn=0, fp=0, fn=0): + self.tp=tp + self.tn=tn + self.fp=fp + self.fn=fn + + def get_d(self): return self.tp + self.tn + self.fp + self.fn + + def get_c(self): return self.tp + self.fn + + def get_not_c(self): return self.tn + self.fp + + def get_f(self): return self.tp + self.fp + + def get_not_f(self): return self.tn + self.fn + + def p_c(self): return (1.0*self.get_c())/self.get_d() + + def p_not_c(self): return 1.0-self.p_c() + + def p_f(self): return (1.0*self.get_f())/self.get_d() + + def p_not_f(self): return 1.0-self.p_f() + + def p_tp(self): return (1.0*self.tp) / self.get_d() + + def p_tn(self): return (1.0*self.tn) / self.get_d() + + def p_fp(self): return (1.0*self.fp) / self.get_d() + + def p_fn(self): return (1.0*self.fn) / self.get_d() + + def tpr(self): + c = 1.0*self.get_c() + return self.tp / c if c > 0.0 else 0.0 + + def fpr(self): + _c = 1.0*self.get_not_c() + return self.fp / _c if _c > 0.0 else 0.0 + + +def __ig_factor(p_tc, p_t, p_c): + den = p_t * p_c + if 
den != 0.0 and p_tc != 0: + return p_tc * math.log(p_tc / den, 2) + else: + return 0.0 + + +def information_gain(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ + __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \ + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) + + +def squared_information_gain(cell): + return information_gain(cell)**2 + + +def posneg_information_gain(cell): + ig = information_gain(cell) + if cell.tpr() < cell.fpr(): + return -ig + else: + return ig + + +def pos_information_gain(cell): + if cell.tpr() < cell.fpr(): + return 0 + else: + return information_gain(cell) + +def pointwise_mutual_information(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + + +def gss(cell): + return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() + + +def chi_square(cell): + den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c() + if den==0.0: return 0.0 + num = gss(cell)**2 + return num / den + + +def conf_interval(xt, n): + if n>30: + z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 + else: + z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 + p = (xt + 0.5 * z2) / (n + z2) + amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) + return p, amplitude + + +def strength(minPosRelFreq, minPos, maxNeg): + if minPos > maxNeg: + return math.log(2.0 * minPosRelFreq, 2.0) + else: + return 0.0 + + +#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) +#however, for some extremely imbalanced dataset caused all documents to be 0 +def conf_weight(cell, cancel_features=False): + c = cell.get_c() + not_c = cell.get_not_c() + tp = cell.tp + fp = cell.fp + + pos_p, pos_amp = conf_interval(tp, c) + neg_p, neg_amp = conf_interval(fp, not_c) + + min_pos = pos_p-pos_amp + max_neg = neg_p+neg_amp + den = (min_pos + max_neg) + minpos_relfreq = min_pos / (den if den != 0 else 1) + + str_tplus = strength(minpos_relfreq, min_pos, max_neg); + + if str_tplus == 0 and not cancel_features: + return 1e-20 + + return str_tplus + + +def get_tsr_matrix(cell_matrix, tsr_score_funtion): + nC = len(cell_matrix) + nF = len(cell_matrix[0]) + tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] + return np.array(tsr_matrix) + + +def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): + tp_ = len(positive_document_indexes & feature_document_indexes) + fp_ = len(feature_document_indexes - positive_document_indexes) + fn_ = len(positive_document_indexes - feature_document_indexes) + tn_ = nD - (tp_ + fp_ + fn_) + return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) + + +def category_tables(feature_sets, category_sets, c, nD, nF): + return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] + + +def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): + """ + Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. 
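+    The result is a np.ndarray of ContTable objects with shape (nC, nF).
+    A hypothetical usage sketch (with X a document-by-term csr_matrix and Y a 0/1 label matrix):
+
+    >>> cells = get_supervised_matrix(X, Y)
+    >>> tsr_scores = get_tsr_matrix(cells, information_gain)
+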
+ Efficiency O(nF x nC x log(S)) where S is the sparse factor + """ + + nD, nF = coocurrence_matrix.shape + nD2, nC = label_matrix.shape + + if nD != nD2: + raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % + (coocurrence_matrix.shape,label_matrix.shape)) + + def nonzero_set(matrix, col): + return set(matrix[:, col].nonzero()[0]) + + if isinstance(coocurrence_matrix, csr_matrix): + coocurrence_matrix = csc_matrix(coocurrence_matrix) + feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] + category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] + cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")( + delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC) + ) + return np.array(cell_matrix) + + +class TSRweighting(BaseEstimator,TransformerMixin): + """ + Supervised Term Weighting function based on any Term Selection Reduction (TSR) function (e.g., information gain, + chi-square, etc.) or, more generally, on any function that could be computed on the 4-cell contingency table for + each category-feature pair. + The supervised_4cell_matrix is a `(n_classes, n_words)` matrix containing the 4-cell contingency tables + for each class-word pair, and can be pre-computed (e.g., during the feature selection phase) and passed as an + argument. + When `n_classes>1`, i.e., in multiclass scenarios, a global_policy is used in order to determine a + single feature-score which informs about its relevance. Accepted policies include "max" (takes the max score + across categories), "ave" and "wave" (take the average, or weighted average, across all categories -- weights + correspond to the class prevalence), and "sum" (which sums all category scores). 
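+
+    A minimal usage sketch (hypothetical names; `fit` runs an internal CountVectorizer, so raw
+    text documents are expected as input):
+
+    >>> tsr = TSRweighting(tsr_function=information_gain, global_policy='max')
+    >>> Xw = tsr.fit_transform(docs_train, y_train)
+    >>> Xw_test = tsr.transform(docs_test)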
+ """ + + def __init__(self, tsr_function, global_policy='max', supervised_4cell_matrix=None, sublinear_tf=True, norm='l2', min_df=3, n_jobs=-1): + if global_policy not in ['max', 'ave', 'wave', 'sum']: raise ValueError('Global policy should be in {"max", "ave", "wave", "sum"}') + self.tsr_function = tsr_function + self.global_policy = global_policy + self.supervised_4cell_matrix = supervised_4cell_matrix + self.sublinear_tf = sublinear_tf + self.norm = norm + self.min_df = min_df + self.n_jobs = n_jobs + + def fit(self, X, y): + self.count_vectorizer = CountVectorizer(min_df=self.min_df) + X = self.count_vectorizer.fit_transform(X) + + self.tf_vectorizer = TfidfTransformer( + norm=None, use_idf=False, smooth_idf=False, sublinear_tf=self.sublinear_tf + ).fit(X) + + if len(y.shape) == 1: + y = np.expand_dims(y, axis=1) + + nD, nC = y.shape + nF = len(self.tf_vectorizer.get_feature_names_out()) + + if self.supervised_4cell_matrix is None: + self.supervised_4cell_matrix = get_supervised_matrix(X, y, n_jobs=self.n_jobs) + else: + if self.supervised_4cell_matrix.shape != (nC, nF): + raise ValueError("Shape of supervised information matrix is inconsistent with X and y") + + tsr_matrix = get_tsr_matrix(self.supervised_4cell_matrix, self.tsr_function) + + if self.global_policy == 'ave': + self.global_tsr_vector = np.average(tsr_matrix, axis=0) + elif self.global_policy == 'wave': + category_prevalences = [sum(y[:,c])*1.0/nD for c in range(nC)] + self.global_tsr_vector = np.average(tsr_matrix, axis=0, weights=category_prevalences) + elif self.global_policy == 'sum': + self.global_tsr_vector = np.sum(tsr_matrix, axis=0) + elif self.global_policy == 'max': + self.global_tsr_vector = np.amax(tsr_matrix, axis=0) + return self + + def fit_transform(self, X, y): + return self.fit(X,y).transform(X) + + def transform(self, X): + if not hasattr(self, 'global_tsr_vector'): raise NameError('TSRweighting: transform method called before fit.') + X = self.count_vectorizer.transform(X) + tf_X = self.tf_vectorizer.transform(X).toarray() + weighted_X = np.multiply(tf_X, self.global_tsr_vector) + if self.norm is not None and self.norm!='none': + weighted_X = sklearn.preprocessing.normalize(weighted_X, norm=self.norm, axis=1, copy=False) + return csr_matrix(weighted_X) diff --git a/experimental_non_aggregative/method_dxs.py b/experimental_non_aggregative/method_dxs.py new file mode 100644 index 0000000..93fb67e --- /dev/null +++ b/experimental_non_aggregative/method_dxs.py @@ -0,0 +1,208 @@ +from scipy.sparse import issparse +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler + +import quapy as qp +from data import LabelledCollection +import numpy as np + +from experimental_non_aggregative.custom_vectorizers import * +from method._kdey import KDEBase +from protocol import APP +from quapy.method.aggregative import HDy, DistributionMatchingY +from quapy.method.base import BaseQuantifier +from scipy import optimize +import pandas as pd +import quapy.functional as F + + +# TODO: explore the bernoulli (term presence/absence) variant +# TODO: explore the multinomial (term frequency) variant +# TODO: explore the multinomial + length normalization variant +# TODO: consolidate the TSR-variant (e.g., using information gain) variant; +# - works better with the idf? +# - works better with length normalization? 
+# - etc + +class DxS(BaseQuantifier): + def __init__(self, vectorizer=None, divergence='topsoe'): + self.vectorizer = vectorizer + self.divergence = divergence + + # def __as_distribution(self, instances): + # return np.asarray(instances.sum(axis=0) / instances.sum()).flatten() + + def __as_distribution(self, instances): + dist = instances.mean(axis=0) + return np.asarray(dist).flatten() + + def fit(self, text_instances, labels): + + classes = np.unique(labels) + + if self.vectorizer is not None: + text_instances = self.vectorizer.fit_transform(text_instances, y=labels) + + distributions = [] + for class_i in classes: + distributions.append(self.__as_distribution(text_instances[labels == class_i])) + + self.validation_distribution = np.asarray(distributions) + + return self + + def predict(self, text_instances): + if self.vectorizer is not None: + text_instances = self.vectorizer.transform(text_instances) + + test_distribution = self.__as_distribution(text_instances) + divergence = qp.functional.get_divergence(self.divergence) + n_classes, n_feats = self.validation_distribution.shape + + def match(prev): + prev = np.expand_dims(prev, axis=0) + mixture_distribution = (prev @ self.validation_distribution).flatten() + return divergence(test_distribution, mixture_distribution) + + # the initial point is set as the uniform distribution + uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + + # solutions are bounded to those contained in the unit-simplex + bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 + r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + return r.x + + + +class KDExML(BaseQuantifier, KDEBase): + + def __init__(self, bandwidth=0.1, standardize=False): + self._check_bandwidth(bandwidth) + self.bandwidth = bandwidth + self.standardize = standardize + + def fit(self, X, y): + classes = sorted(np.unique(y)) + + if self.standardize: + self.scaler = StandardScaler() + X = self.scaler.fit_transform(X) + + if issparse(X): + X = X.toarray() + + self.mix_densities = self.get_mixture_components(X, y, classes, self.bandwidth) + return self + + def predict(self, X): + """ + Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood + of the data (i.e., that minimizes the negative log-likelihood) + + :param X: instances in the sample + :return: a vector of class prevalence estimates + """ + epsilon = 1e-10 + if issparse(X): + X = X.toarray() + n_classes = len(self.mix_densities) + if self.standardize: + X = self.scaler.transform(X) + test_densities = [self.pdf(kde_i, X) for kde_i in self.mix_densities] + + def neg_loglikelihood(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) + + return F.optim_minimize(neg_loglikelihood, n_classes) + + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 250 + qp.environ['N_JOBS'] = -1 + min_df = 10 + # dataset = 'imdb' + repeats = 10 + error = 'mae' + + div = 'topsoe' + + # generates tuples (dataset, method, method_name) + # (the dataset is needed for methods that process the dataset differently) + def gen_methods(): + + for dataset in qp.datasets.REVIEWS_SENTIMENT_DATASETS: + + data = qp.datasets.fetch_reviews(dataset, tfidf=False) + + # bernoulli_vectorizer = 
CountVectorizer(min_df=min_df, binary=True) + # dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer) + # yield data, dxs, 'DxS-Bernoulli' + # + # multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False) + # dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer) + # yield data, dxs, 'DxS-multinomial' + # + # tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None) + # dxs = DxS(divergence=div, vectorizer=tf_vectorizer) + # yield data, dxs, 'DxS-TF' + # + # logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None) + # dxs = DxS(divergence=div, vectorizer=logtf_vectorizer) + # yield data, dxs, 'DxS-logTF' + # + # tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None) + # dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer) + # yield data, dxs, 'DxS-TFIDF' + # + # tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2') + # dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer) + # yield data, dxs, 'DxS-TFIDF-l2' + + tsr_vectorizer = TSRweighting(tsr_function=information_gain, min_df=min_df, norm='l2') + dxs = DxS(divergence=div, vectorizer=tsr_vectorizer) + yield data, dxs, 'DxS-TFTSR-l2' + + data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=min_df) + + kdex = KDExML() + reduction = TruncatedSVD(n_components=100, random_state=0) + red_data = qp.data.preprocessing.instance_transformation(data, transformer=reduction, inplace=False) + yield red_data, kdex, 'KDEx' + + hdy = HDy(LogisticRegression()) + yield data, hdy, 'HDy' + + # dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=5) + # yield data, dm, 'DM-5b' + # + # dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=10) + # yield data, dm, 'DM-10b' + + + + + result_path = 'results.csv' + with open(result_path, 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\n') + for data, quantifier, quant_name in gen_methods(): + quantifier.fit(*data.training.Xy) + report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae','mrae'], verbose=True) + means = report.mean(numeric_only=True) + csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') + + df = pd.read_csv(result_path, sep='\t') + # print(df) + + pv = df.pivot_table(index='Method', columns="Dataset", values=["MAE", "MRAE"]) + print(pv) + + + + diff --git a/logo/LogoQuaDaSh.png b/logo/LogoQuaDaSh.png new file mode 100644 index 0000000..5daff40 Binary files /dev/null and b/logo/LogoQuaDaSh.png differ diff --git a/logo/NextGenerationEU.jpg b/logo/NextGenerationEU.jpg new file mode 100644 index 0000000..c377101 Binary files /dev/null and b/logo/NextGenerationEU.jpg differ diff --git a/prepare_svmperf.sh b/prepare_svmperf.sh index b609f6c..3da8bfe 100755 --- a/prepare_svmperf.sh +++ b/prepare_svmperf.sh @@ -11,13 +11,5 @@ rm $FILE patch -s -p0 < svm-perf-quantification-ext.patch mv svm_perf svm_perf_quantification cd svm_perf_quantification -make - - - - - - - - +make CFLAGS="-O3 -Wall -Wno-unused-result -fcommon" diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt deleted file mode 100644 index 1534038..0000000 --- a/quapy/CHANGE_LOG.txt +++ /dev/null @@ -1,95 +0,0 @@ -Change Log 0.1.8 ----------------- - -- Added HDx and DistributionMatchingX to non-aggregative quantifiers (see also the new example "comparing_HDy_HDx.py") -- New UCI multiclass datasets added (thanks to Pablo González). 
The 5 UCI multiclass datasets are those corresponding - to the following criteria: - - >1000 instances - - >2 classes - - classification datasets - - Python API available -- New IFCB (plankton) dataset added. See fetch_IFCB. -- Added new evaluation measures NAE, NRAE -- Added new meta method "MedianEstimator"; an ensemble of binary base quantifiers that receives as input a dictionary - of hyperparameters that will explore exhaustively, fitting and generating predictions for each combination of - hyperparameters, and that returns, as the prevalence estimates, the median across all predictions. - -Change Log 0.1.7 ----------------- - -- Protocols are now abstracted as instances of AbstractProtocol. There is a new class extending AbstractProtocol called - AbstractStochasticSeededProtocol, which implements a seeding policy to allow replicate the series of samplings. - There are some examples of protocols, APP, NPP, UPP, DomainMixer (experimental). - The idea is to start the sample generation by simply calling the __call__ method. - This change has a great impact in the framework, since many functions in qp.evaluation, qp.model_selection, - and sampling functions in LabelledCollection relied of the old functions. E.g., the functionality of - qp.evaluation.artificial_prevalence_report or qp.evaluation.natural_prevalence_report is now obtained by means of - qp.evaluation.report which takes a protocol as an argument. I have not maintained compatibility with the old - interfaces because I did not really like them. Check the wiki guide and the examples for more details. - -- Exploration of hyperparameters in Model selection can now be run in parallel (there was a n_jobs argument in - QuaPy 0.1.6 but only the evaluation part for one specific hyperparameter was run in parallel). - -- The prediction function has been refactored, so it applies the optimization for aggregative quantifiers (that - consists in pre-classifying all instances, and then only invoking aggregate on the samples) only in cases in - which the total number of classifications would be smaller than the number of classifications with the standard - procedure. The user can now specify "force", "auto", True of False, in order to actively decide for applying it - or not. - -- examples directory created! - -- DyS, Topsoe distance and binary search (thanks to Pablo González) - -- Multi-thread reproducibility via seeding (thanks to Pablo González) - -- n_jobs is now taken from the environment if set to None - -- ACC, PACC, Forman's threshold variants have been parallelized. - -- cross_val_predict (for quantification) added to model_selection: would be nice to allow the user specifies a - test protocol maybe, or None for bypassing it? - -- Bugfix: adding two labelled collections (with +) now checks for consistency in the classes - -- newer versions of numpy raise a warning when accessing types (e.g., np.float). I have replaced all such instances - with the plain python type (e.g., float). - -- new dependency "abstention" (to add to the project requirements and setup). Calibration methods from - https://github.com/kundajelab/abstention added. - -- the internal classifier of aggregative methods is now called "classifier" instead of "learner" - -- when optimizing the hyperparameters of an aggregative quantifier, the classifier's specific hyperparameters - should be marked with a "classifier__" prefix (just like in scikit-learn with estimators), while the quantifier's - specific hyperparameters are named directly. 
For example, PCC(LogisticRegression()) quantifier has hyperparameters - "classifier__C", "classifier__class_weight", etc., instead of "C" and "class_weight" as in v0.1.6. - -- hyperparameters yielding to inconsistent runs raise a ValueError exception, while hyperparameter combinations - yielding to internal errors of surrogate functions are reported and skipped, without stopping the grid search. - -- DistributionMatching methods added. This is a general framework for distribution matching methods that catters for - multiclass quantification. That is to say, one could get a multiclass variant of the (originally binary) HDy - method aligned with the Firat's formulation. - -- internal method properties "binary", "aggregative", and "probabilistic" have been removed; these conditions are - checked via isinstance - -- quantifiers (i.e., classes that inherit from BaseQuantifier) are not forced to implement classes_ or n_classes; - these can be used anyway internally, but the framework will not suppose (nor impose) that a quantifier implements - them - -- qp.evaluation.prediction has been optimized so that, if a quantifier is of type aggregative, and if the evaluation - protocol is of type OnLabelledCollection, then the computation is faster. In this specific case, the predictions - are issued only once and for all, and not for each sample. An exception to this (which is implement also), is - when the number of instances across all samples is anyway smaller than the number of instances in the original - labelled collection; in this case the heuristic is of no help, and is therefore not applied. - -- the distinction between "classify" and "posterior_probabilities" has been removed in Aggregative quantifiers, - so that probabilistic classifiers return posterior probabilities, while non-probabilistic quantifiers - return crisp decisions. - -- OneVsAll fixed. There are now two classes: a generic one OneVsAllGeneric that works with any quantifier (e.g., - any instance of BaseQuantifier), and a subclass of it called OneVsAllAggregative which implements the - classify / aggregate interface. Both are instances of OneVsAll. There is a method getOneVsAll that returns the - best instance based on the type of quantifier. - diff --git a/quapy/__init__.py b/quapy/__init__.py index da534df..a952fbc 100644 --- a/quapy/__init__.py +++ b/quapy/__init__.py @@ -1,17 +1,25 @@ """QuaPy module for quantification""" + from quapy.data import datasets from . import error from . import data from . import functional -# from . import method +from . import method from . import evaluation from . import protocol from . import plot from . import util from . import model_selection from . import classification +import os + +__version__ = '0.2.1' + + +def _default_cls(): + from sklearn.linear_model import LogisticRegression + return LogisticRegression() -__version__ = '0.1.8' environ = { 'SAMPLE_SIZE': None, @@ -20,7 +28,8 @@ environ = { 'PAD_TOKEN': '[PAD]', 'PAD_INDEX': 1, 'SVMPERF_HOME': './svm_perf_quantification', - 'N_JOBS': 1 + 'N_JOBS': int(os.getenv('N_JOBS', 1)), + 'DEFAULT_CLS': _default_cls() } @@ -48,3 +57,21 @@ def _get_sample_size(sample_size): if sample_size is None: raise ValueError('neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified') return sample_size + + +def _get_classifier(classifier): + """ + If `classifier` is None, then it returns `environ['DEFAULT_CLS']`; + if otherwise, returns `classifier`. 
+ + :param classifier: sklearn's estimator or None + :return: sklearn's estimator + """ + if classifier is None: + from sklearn.base import clone + classifier = clone(environ['DEFAULT_CLS']) + if classifier is None: + raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified') + return classifier + + diff --git a/quapy/classification/calibration.py b/quapy/classification/calibration.py index a3f1543..0f5e9f7 100644 --- a/quapy/classification/calibration.py +++ b/quapy/classification/calibration.py @@ -24,7 +24,8 @@ class RecalibratedProbabilisticClassifier: class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabilisticClassifier): """ Applies a (re)calibration method from `abstention.calibration`, as defined in - `Alexandari et al. paper `_: + `Alexandari et al. paper `_. + :param classifier: a scikit-learn probabilistic classifier :param calibrator: the calibration object (an instance of abstention.calibration.CalibratorFactory) @@ -59,7 +60,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabi elif isinstance(k, float): if not (0 < k < 1): raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)') - return self.fit_cv(X, y) + return self.fit_tr_val(X, y) def fit_cv(self, X, y): """ @@ -94,7 +95,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabi self.classifier.fit(Xtr, ytr) posteriors = self.classifier.predict_proba(Xva) nclasses = len(np.unique(yva)) - self.calibrator = self.calibrator(posteriors, np.eye(nclasses)[yva], posterior_supplied=True) + self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[yva], posterior_supplied=True) return self def predict(self, X): diff --git a/quapy/classification/svmperf.py b/quapy/classification/svmperf.py index 6c85084..71f2ac3 100644 --- a/quapy/classification/svmperf.py +++ b/quapy/classification/svmperf.py @@ -33,27 +33,16 @@ class SVMperf(BaseEstimator, ClassifierMixin): valid_losses = {'01':0, 'f1':1, 'kld':12, 'nkld':13, 'q':22, 'qacc':23, 'qf1':24, 'qgm':25, 'mae':26, 'mrae':27} def __init__(self, svmperf_base, C=0.01, verbose=False, loss='01', host_folder=None): - assert exists(svmperf_base), f'path {svmperf_base} does not seem to point to a valid path' + assert exists(svmperf_base), \ + (f'path {svmperf_base} does not seem to point to a valid path;' + f'did you install svm-perf? ' + f'see instructions in https://hlt-isti.github.io/QuaPy/manuals/explicit-loss-minimization.html') self.svmperf_base = svmperf_base self.C = C self.verbose = verbose self.loss = loss self.host_folder = host_folder - # def set_params(self, **parameters): - # """ - # Set the hyper-parameters for svm-perf. 
Currently, only the `C` and `loss` parameters are supported - # - # :param parameters: a `**kwargs` dictionary `{'C': }` - # """ - # assert sorted(list(parameters.keys())) == ['C', 'loss'], \ - # 'currently, only the C and loss parameters are supported' - # self.C = parameters.get('C', self.C) - # self.loss = parameters.get('loss', self.loss) - # - # def get_params(self, deep=True): - # return {'C': self.C, 'loss': self.loss} - def fit(self, X, y): """ Trains the SVM for the multivariate performance loss diff --git a/quapy/data/_ifcb.py b/quapy/data/_ifcb.py index 4eb780d..c18e415 100644 --- a/quapy/data/_ifcb.py +++ b/quapy/data/_ifcb.py @@ -1,16 +1,52 @@ import os import pandas as pd +import math +from typing import Optional +from quapy.data import LabelledCollection from quapy.protocol import AbstractProtocol +from pathlib import Path + + +def get_sample_list(path_dir): + """ + Gets a sample list finding the csv files in a directory + + :param path_dir: directory to look for samples + :return: list of samples + """ + samples = [] + for filename in sorted(os.listdir(path_dir)): + if filename.endswith('.csv'): + samples.append(filename) + return samples + + +def generate_modelselection_split(samples, test_prop=0.3): + """This function generates a train/test partition for model selection + without the use of random numbers so the split is always the same + + :param samples: list of samples + :param test_prop: float, percentage saved for test. Defaults to 0.3. + :return: list of samples to use as train and list of samples to use as test + """ + num_items_to_pick = math.ceil(len(samples) * test_prop) + step_size = math.floor(len(samples) / num_items_to_pick) + test_indices = [i * step_size for i in range(num_items_to_pick)] + test = [samples[i] for i in test_indices] + train = [item for i, item in enumerate(samples) if i not in test_indices] + return train, test + class IFCBTrainSamplesFromDir(AbstractProtocol): - def __init__(self, path_dir:str, classes: list): + def __init__(self, path_dir:str, classes: list, samples: list = None): self.path_dir = path_dir self.classes = classes self.samples = [] - for filename in os.listdir(path_dir): - if filename.endswith('.csv'): - self.samples.append(filename) + if samples is not None: + self.samples = samples + else: + self.samples = get_sample_list(path_dir) def __call__(self): for sample in self.samples: @@ -18,7 +54,7 @@ class IFCBTrainSamplesFromDir(AbstractProtocol): # all columns but the first where we get the class X = s.iloc[:, 1:].to_numpy() y = s.iloc[:, 0].to_numpy() - yield X, y + yield LabelledCollection(X, y, classes=self.classes) def total(self): """ @@ -31,21 +67,33 @@ class IFCBTrainSamplesFromDir(AbstractProtocol): class IFCBTestSamples(AbstractProtocol): - def __init__(self, path_dir:str, test_prevalences_path: str): + def __init__(self, path_dir:str, test_prevalences: Optional[pd.DataFrame]=None, samples: list=None, classes: list=None): self.path_dir = path_dir - self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path)) + self.test_prevalences = test_prevalences + self.classes = classes + if samples is not None: + self.samples = samples + else: + self.samples = get_sample_list(path_dir) def __call__(self): - for _, test_sample in self.test_prevalences.iterrows(): - #Load the sample from disk - X = pd.read_csv(os.path.join(self.path_dir,test_sample['sample']+'.csv')).to_numpy() - prevalences = test_sample.iloc[1:].to_numpy().astype(float) + for test_sample in self.samples: + s = 
pd.read_csv(os.path.join(self.path_dir,test_sample))
+            if self.test_prevalences is not None:
+                X = s
+                # when working with the official test samples, the true prevalence values come in a
+                # dataframe, and the samples carry no labels
+                prevalences = self.test_prevalences.loc[self.test_prevalences['sample']==Path(test_sample).stem].to_numpy()[:,1:].flatten().astype(float)
+            else:
+                X = s.iloc[:, 1:].to_numpy()
+                y = s.iloc[:,0]
+                # in this case, the sample prevalence values are computed from the labels
+                prevalences = y[y.isin(self.classes)].value_counts().reindex(self.classes, fill_value=0).to_numpy()/len(s)
             yield X, prevalences
 
     def total(self):
         """
         Returns the total number of samples that the protocol generates.
 
-        :return: The number of test samples to generate.
+        :return: The number of samples to generate.
         """
-        return len(self.test_prevalences.index)
\ No newline at end of file
+        return len(self.samples)
diff --git a/quapy/data/_lequa2022.py b/quapy/data/_lequa.py
similarity index 71%
rename from quapy/data/_lequa2022.py
rename to quapy/data/_lequa.py
index 449eab6..e162f4c 100644
--- a/quapy/data/_lequa2022.py
+++ b/quapy/data/_lequa.py
@@ -4,6 +4,8 @@ import numpy as np
 import os
 
 from quapy.protocol import AbstractProtocol
+from quapy.data import LabelledCollection
+
 
 DEV_SAMPLES = 1000
 TEST_SAMPLES = 5000
@@ -12,6 +14,13 @@ ERROR_TOL = 1E-3
 
 
 def load_category_map(path):
+    """
+    Loads the category map, i.e., a mapping between numerical label ids and human readable category names.
+
+    :param path: path to the label map file
+    :return: a dictionary cat2code (i.e., cat2code[cat_name] gives access to the category id) and a list code2cat (i.e.,
+        code2cat[cat_id] gives access to the category name)
+    """
     cat2code = {}
     with open(path, 'rt') as fin:
         for line in fin:
@@ -22,6 +31,16 @@ def load_category_map(path):
 
 
 def load_raw_documents(path):
+    """
+    Loads raw documents. In case the sample is unlabelled,
+    the labels returned are None.
+
+    :param path: path to the data sample containing the raw documents
+    :return: a tuple with the documents (np.ndarray of strings of shape `(n,)`) and
+        the labels (a np.ndarray of shape `(n,)` if the sample is labelled,
+        or None if the sample is unlabelled), with `n` the number of instances in the sample
+        (250 for T1A, 1000 for T1B)
+    """
     df = pd.read_csv(path)
     documents = list(df["text"].values)
     labels = None
@@ -30,7 +49,16 @@ def load_raw_documents(path):
     return documents, labels
 
 
-def load_vector_documents(path):
+def load_vector_documents_2022(path):
+    """
+    Loads vectorized documents.
In case the sample is unlabelled, + the labels returned are None + + :param path: path to the data sample containing the raw documents + :return: a tuple with the documents (np.ndarray of shape `(n,256)`) and the labels (a np.ndarray of shape `(n,)` if + the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample + (250 for T1 and T4, 1000 for T2, and 200 for T3) + """ + D = pd.read_csv(path).to_numpy(dtype=float) + labelled = D.shape[1] == 257 + if labelled: + X, y = D[:,1:], D[:,0].astype(int).flatten() + else: + X, y = D, None + return X, y + + class SamplesFromDir(AbstractProtocol): def __init__(self, path_dir:str, ground_truth_path:str, load_fn): @@ -53,6 +100,20 @@ class SamplesFromDir(AbstractProtocol): yield sample, prevalence +class LabelledCollectionsFromDir(AbstractProtocol): + + def __init__(self, path_dir:str, ground_truth_path:str, load_fn): + self.path_dir = path_dir + self.load_fn = load_fn + self.true_prevs = pd.read_csv(ground_truth_path, index_col=0) + + def __call__(self): + for id, prevalence in self.true_prevs.iterrows(): + collection_path = os.path.join(self.path_dir, f'{id}.txt') + lc = LabelledCollection.load(path=collection_path, loader_func=self.load_fn) + yield lc + + class ResultSubmission: def __init__(self): diff --git a/quapy/data/base.py b/quapy/data/base.py index 9cc6441..9bdf135 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold from numpy.random import RandomState from quapy.functional import strprev from quapy.util import temp_seed +import quapy.functional as F class LabelledCollection: @@ -32,16 +33,20 @@ class LabelledCollection: else: self.instances = np.asarray(instances) self.labels = np.asarray(labels) - n_docs = len(self) if classes is None: - self.classes_ = np.unique(self.labels) - self.classes_.sort() + self.classes_ = F.classes_from_labels(self.labels) else: self.classes_ = np.unique(np.asarray(classes)) self.classes_.sort() if len(set(self.labels).difference(set(classes))) > 0: raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})') - self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} + self._index = None + + @property + def index(self): + if not hasattr(self, '_index') or self._index is None: + self._index = {class_: np.arange(len(self))[self.labels == class_] for class_ in self.classes_} + return self._index @classmethod def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs): @@ -95,6 +100,15 @@ class LabelledCollection: """ return len(self.classes_) + @property + def n_instances(self): + """ + The number of instances + + :return: integer + """ + return len(self.labels) + @property def binary(self): """ @@ -108,8 +122,7 @@ class LabelledCollection: """ Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the prevalence values are not specified, then returns the index of a uniform sampling. - For each class, the sampling is drawn with replacement if the requested prevalence is larger than - the actual prevalence of the class, or without replacement otherwise. + For each class, the sampling is drawn with replacement. 
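+        (Drawing with replacement means that an instance can appear more than once in the sample,
+        and hence that prevalence values larger than the actual class proportions can be requested.)
+        A hypothetical example, with `lc` a binary LabelledCollection:
+
+        >>> idx = lc.sampling_index(100, 0.25, 0.75)
+        >>> sample = lc.sampling_from_index(idx)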
:param size: integer, the requested size :param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since @@ -124,7 +137,7 @@ class LabelledCollection: if len(prevs) == self.n_classes - 1: prevs = prevs + (1 - sum(prevs),) assert len(prevs) == self.n_classes, 'unexpected number of prevalences' - assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' + assert np.isclose(sum(prevs), 1), f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' # Decide how many instances should be taken for each class in order to satisfy the requested prevalence # accurately, and the number of instances in the sample (exactly). If int(size * prevs[i]) (which is @@ -153,7 +166,7 @@ class LabelledCollection: for class_, n_requested in n_requests.items(): n_candidates = len(self.index[class_]) index_sample = self.index[class_][ - np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates)) + np.random.choice(n_candidates, size=n_requested, replace=True) ] if n_requested > 0 else [] indexes_sample.append(index_sample) @@ -168,8 +181,7 @@ class LabelledCollection: def uniform_sampling_index(self, size, random_state=None): """ Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn - with replacement if the requested size is greater than the number of instances, or without replacement - otherwise. + with replacement. :param size: integer, the size of the uniform sample :param random_state: if specified, guarantees reproducibility of the split. @@ -179,13 +191,12 @@ class LabelledCollection: ng = RandomState(seed=random_state) else: ng = np.random - return ng.choice(len(self), size, replace=size > len(self)) + return ng.choice(len(self), size, replace=True) def sampling(self, size, *prevs, shuffle=True, random_state=None): """ Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence - values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than - the actual prevalence of the class, or with replacement otherwise. + values. For each class, the sampling is drawn with replacement. :param size: integer, the requested size :param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since @@ -202,8 +213,7 @@ class LabelledCollection: def uniform_sampling(self, size, random_state=None): """ Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn - with replacement if the requested size is greater than the number of instances, or without replacement - otherwise. + with replacement. :param size: integer, the requested size :param random_state: if specified, guarantees reproducibility of the split. 
@@ -236,11 +246,11 @@ class LabelledCollection: :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the second one with `1-train_prop` elements """ - tr_docs, te_docs, tr_labels, te_labels = train_test_split( + tr_X, te_X, tr_y, te_y = train_test_split( self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state ) - training = LabelledCollection(tr_docs, tr_labels, classes=self.classes_) - test = LabelledCollection(te_docs, te_labels, classes=self.classes_) + training = LabelledCollection(tr_X, tr_y, classes=self.classes_) + test = LabelledCollection(te_X, te_y, classes=self.classes_) return training, test def split_random(self, train_prop=0.6, random_state=None): @@ -322,6 +332,15 @@ class LabelledCollection: classes = np.unique(labels).sort() return LabelledCollection(instances, labels, classes=classes) + @property + def classes(self): + """ + Gets an array-like with the classes used in this collection + + :return: array-like + """ + return self.classes_ + @property def Xy(self): """ @@ -418,6 +437,11 @@ class LabelledCollection: test = self.sampling_from_index(test_index) yield train, test + def __repr__(self): + repr=f'<{self.n_instances} instances (dtype={type(self.instances[0])}), ' + repr+=f'n_classes={self.n_classes} {self.classes_}, prevalence={F.strprev(self.prevalence())}>' + return repr + class Dataset: """ @@ -553,7 +577,7 @@ class Dataset: yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})') - def reduce(self, n_train=100, n_test=100): + def reduce(self, n_train=100, n_test=100, random_state=None): """ Reduce the number of instances in place for quick experiments. Preserves the prevalence of each set. @@ -561,6 +585,17 @@ class Dataset: :param n_test: number of test documents to keep (default 100) :return: self """ - self.training = self.training.sampling(n_train, *self.training.prevalence()) - self.test = self.test.sampling(n_test, *self.test.prevalence()) - return self \ No newline at end of file + self.training = self.training.sampling( + n_train, + *self.training.prevalence(), + random_state = random_state + ) + self.test = self.test.sampling( + n_test, + *self.test.prevalence(), + random_state = random_state + ) + return self + + def __repr__(self): + return f'training={self.training}; test={self.test}' \ No newline at end of file diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 32edb78..c08748f 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -1,56 +1,92 @@ -def warn(*args, **kwargs): - pass -import warnings -warnings.warn = warn import os +from contextlib import contextmanager import zipfile from os.path import join import pandas as pd from ucimlrepo import fetch_ucirepo from quapy.data.base import Dataset, LabelledCollection from quapy.data.preprocessing import text2tfidf, reduce_columns +from quapy.data.preprocessing import standardize as standardizer from quapy.data.reader import * from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource +from sklearn.preprocessing import StandardScaler REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb'] -TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders', - 'semeval13', 'semeval14', 'semeval15', 'semeval16', - 'sst', 'wa', 'wb'] -TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders', - 'semeval', 'semeval16', - 'sst', 'wa', 'wb'] -UCI_DATASETS = ['acute.a', 'acute.b', - 'balance.1', 
'balance.2', 'balance.3', - 'breast-cancer', - 'cmc.1', 'cmc.2', 'cmc.3', - 'ctg.1', 'ctg.2', 'ctg.3', - #'diabetes', # <-- I haven't found this one... - 'german', - 'haberman', - 'ionosphere', - 'iris.1', 'iris.2', 'iris.3', - 'mammographic', - 'pageblocks.5', - #'phoneme', # <-- I haven't found this one... - 'semeion', - 'sonar', - 'spambase', - 'spectf', - 'tictactoe', - 'transfusion', - 'wdbc', - 'wine.1', 'wine.2', 'wine.3', - 'wine-q-red', 'wine-q-white', - 'yeast'] -UCI_MULTICLASS_DATASETS = ['dry-bean', - 'wine-quality', - 'academic-success', - 'digits', - 'letter'] +TWITTER_SENTIMENT_DATASETS_TEST = [ + 'gasp', 'hcr', 'omd', 'sanders', + 'semeval13', 'semeval14', 'semeval15', 'semeval16', + 'sst', 'wa', 'wb', +] -LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B'] +TWITTER_SENTIMENT_DATASETS_TRAIN = [ + 'gasp', 'hcr', 'omd', 'sanders', + 'semeval', 'semeval16', + 'sst', 'wa', 'wb', +] + +UCI_BINARY_DATASETS = [ + #'acute.a', 'acute.b', + 'balance.1', + #'balance.2', + 'balance.3', + 'breast-cancer', + 'cmc.1', 'cmc.2', 'cmc.3', + 'ctg.1', 'ctg.2', 'ctg.3', + #'diabetes', # <-- I haven't found this one... + 'german', + 'haberman', + 'ionosphere', + 'iris.1', 'iris.2', 'iris.3', + 'mammographic', + 'pageblocks.5', + #'phoneme', # <-- I haven't found this one... + 'semeion', + 'sonar', + 'spambase', + 'spectf', + 'tictactoe', + 'transfusion', + 'wdbc', + 'wine.1', 'wine.2', 'wine.3', + 'wine-q-red', + 'wine-q-white', + 'yeast', +] + +UCI_MULTICLASS_DATASETS = [ + 'dry-bean', + 'wine-quality', + 'academic-success', + 'digits', + 'letter', + 'abalone', + 'obesity', + 'nursery', + 'yeast', + 'hand_digits', + 'satellite', + 'shuttle', + 'cmc', + 'isolet', + 'waveform-v1', + 'molecular', + 'poker_hand', + 'connect-4', + 'mhr', + 'chess', + 'page_block', + 'phishing', + 'image_seg', + 'hcv', +] + +LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B'] +LEQUA2022_TEXT_TASKS = ['T2A', 'T2B'] +LEQUA2022_TASKS = LEQUA2022_VECTOR_TASKS + LEQUA2022_TEXT_TASKS + +LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4'] _TXA_SAMPLE_SIZE = 250 _TXB_SAMPLE_SIZE = 1000 @@ -66,12 +102,20 @@ LEQUA2022_SAMPLE_SIZE = { 'multiclass': _TXB_SAMPLE_SIZE } +LEQUA2024_SAMPLE_SIZE = { + 'T1': 250, + 'T2': 1000, + 'T3': 200, + 'T4': 250, +} + def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset: """ Loads a Reviews dataset as a Dataset instance, as used in `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification." - Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. `_. + Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. + `_. The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS` :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb' @@ -187,7 +231,7 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom return data -def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: +def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, standardize=True, verbose=False) -> Dataset: """ Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). 
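As an illustrative sketch of the renamed binary loader and its new `standardize` flag (the dataset name is an arbitrary pick from UCI_BINARY_DATASETS; not part of this patch):

>>> import quapy as qp
>>> dataset = qp.datasets.fetch_UCIBinaryDataset('haberman', standardize=True)
>>> train, test = dataset.train_test
>>> train.prevalence()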
@@ -205,14 +249,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quapy_data/ directory) :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param standardize: indicates whether the covariates should be standardized or not (default is True). If requested, + standardization applies after the LabelledCollection is split, that is, the mean and std are computed only on the + training portion of the data. :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets :return: a :class:`quapy.data.base.Dataset` instance """ - data = fetch_UCILabelledCollection(dataset_name, data_home, verbose) - return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, standardize=False, verbose=verbose) + dataset = Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name) + if standardize: + dataset = standardizer(dataset) + return dataset -def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: +def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, standardize=True, verbose=False) -> LabelledCollection: """ Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). @@ -227,8 +277,8 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.: >>> import quapy as qp - >>> collection = qp.datasets.fetch_UCILabelledCollection("yeast") - >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2): + >>> collection = qp.datasets.fetch_UCIBinaryLabelledCollection("yeast") + >>> for data in qp.datasets.Dataset.kFCV(collection, nfolds=5, nrepeats=2): >>> ... The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS` :param dataset_name: a dataset name :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quapy_data/ directory) - :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param standardize: indicates whether the covariates should be standardized or not (default is True). :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets :return: a :class:`quapy.data.base.LabelledCollection` instance """ - - assert dataset_name in UCI_DATASETS, \ - f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \ - f'Valid ones are {UCI_DATASETS}' + assert dataset_name in UCI_BINARY_DATASETS, ( + f"Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. " + f"Valid ones are {UCI_BINARY_DATASETS}" + ) if data_home is None: data_home = get_quapy_home() - dataset_fullname = { - 'acute.a': 'Acute Inflammations (urinary bladder)', - 'acute.b': 'Acute Inflammations (renal pelvis)', - 'balance.1': 'Balance Scale Weight & Distance Database (left)', - 'balance.2': 'Balance Scale Weight & Distance Database (balanced)', - 'balance.3': 'Balance Scale Weight & Distance Database (right)', - 'breast-cancer': 'Breast Cancer Wisconsin (Original)', - 'cmc.1': 'Contraceptive Method Choice (no use)', - 'cmc.2': 'Contraceptive Method Choice (long term)', - 'cmc.3': 'Contraceptive Method Choice (short term)', - 'ctg.1': 'Cardiotocography Data Set (normal)', - 'ctg.2': 'Cardiotocography Data Set (suspect)', - 'ctg.3': 'Cardiotocography Data Set (pathologic)', - 'german': 'Statlog German Credit Data', - 'haberman': "Haberman's Survival Data", - 'ionosphere': 'Johns Hopkins University Ionosphere DB', - 'iris.1': 'Iris Plants Database(x)', - 'iris.2': 'Iris Plants Database(versicolour)', - 'iris.3': 'Iris Plants Database(virginica)', - 'mammographic': 'Mammographic Mass', - 'pageblocks.5': 'Page Blocks Classification (5)', - 'semeion': 'Semeion Handwritten Digit (8)', - 'sonar': 'Sonar, Mines vs. Rocks', - 'spambase': 'Spambase Data Set', - 'spectf': 'SPECTF Heart Data', - 'tictactoe': 'Tic-Tac-Toe Endgame Database', - 'transfusion': 'Blood Transfusion Service Center Data Set', - 'wdbc': 'Wisconsin Diagnostic Breast Cancer', - 'wine.1': 'Wine Recognition Data (1)', - 'wine.2': 'Wine Recognition Data (2)', - 'wine.3': 'Wine Recognition Data (3)', - 'wine-q-red': 'Wine Quality Red (6-10)', - 'wine-q-white': 'Wine Quality White (6-10)', - 'yeast': 'Yeast', + # mapping between dataset names and UCI api ids + identifiers = { + "acute.a": 184, + "acute.b": 184, + "balance.1": 12, + "balance.2": 12, + "balance.3": 12, + "breast-cancer": 15, + "cmc.1": 30, + "cmc.2": 30, + "cmc.3": 30, + # "ctg.1": , # not python importable + # "ctg.2": , # not python importable + # "ctg.3": , # not python importable + # "german": , # not python importable + "haberman": 43, + "ionosphere": 52, + "iris.1": 53, + "iris.2": 53, + "iris.3": 53, + "mammographic": 161, + "pageblocks.5": 78, + # "semeion": , # not python importable + "sonar": 151, + "spambase": 94, + "spectf": 96, + "tictactoe": 101, + "transfusion": 176, + "wdbc": 17, + "wine.1": 109, + "wine.2": 109, + "wine.3": 109, + "wine-q-red": 186, + "wine-q-white": 186, + "yeast": 110, } - # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use - # to download the raw dataset - identifier_map = { - 'acute.a': 'acute', - 'acute.b': 'acute', - 'balance.1': 'balance-scale', - 'balance.2': 'balance-scale', - 'balance.3': 'balance-scale', - 'breast-cancer': 'breast-cancer-wisconsin', - 'cmc.1': 'cmc', - 'cmc.2': 'cmc', - 'cmc.3': 'cmc', - 'ctg.1': '00193', - 'ctg.2': '00193', - 'ctg.3': '00193', - 'german': 'statlog/german', - 'haberman': 'haberman', - 'ionosphere': 'ionosphere', - 'iris.1': 'iris', - 'iris.2': 'iris', - 'iris.3': 'iris', - 'mammographic': 'mammographic-masses', - 'pageblocks.5': 'page-blocks', - 'semeion': 'semeion', - 'sonar': 'undocumented/connectionist-bench/sonar', - 'spambase': 'spambase', - 'spectf': 'spect', - 'tictactoe': 'tic-tac-toe', - 'transfusion': 'blood-transfusion', - 'wdbc': 'breast-cancer-wisconsin', - 'wine-q-red': 'wine-quality', - 'wine-q-white': 'wine-quality', - 'wine.1': 'wine', - 'wine.2': 'wine', - 'wine.3': 'wine', - 'yeast': 'yeast', + #
mapping between dataset names and dataset groups + groups = { + "acute.a": "acute", + "acute.b": "acute", + "balance.1": "balance", + "balance.2": "balance", + "balance.3": "balance", + "breast-cancer": "breast-cancer", + "cmc.1": "cmc", + "cmc.2": "cmc", + "cmc.3": "cmc", + "ctg.1": "ctg", + "ctg.2": "ctg", + "ctg.3": "ctg", + "german": "german", + "haberman": "haberman", + "ionosphere": "ionosphere", + "iris.1": "iris", + "iris.2": "iris", + "iris.3": "iris", + "mammographic": "mammographic", + "pageblocks.5": "pageblocks", + "semeion": "semeion", + "sonar": "sonar", + "spambase": "spambase", + "spectf": "spectf", + "tictactoe": "tictactoe", + "transfusion": "transfusion", + "wdbc": "wdbc", + "wine-q-red": "wine-quality", + "wine-q-white": "wine-quality", + "wine.1": "wine", + "wine.2": "wine", + "wine.3": "wine", + "yeast": "yeast", } - # the filename is the name of the file within the data_folder indexed by the identifier - file_name = { - 'acute': 'diagnosis.data', - '00193': 'CTG.xls', - 'statlog/german': 'german.data-numeric', - 'mammographic-masses': 'mammographic_masses.data', - 'page-blocks': 'page-blocks.data.Z', - 'undocumented/connectionist-bench/sonar': 'sonar.all-data', - 'spect': ['SPECTF.train', 'SPECTF.test'], - 'blood-transfusion': 'transfusion.data', - 'wine-quality': ['winequality-red.csv', 'winequality-white.csv'], - 'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data' + # mapping between dataset short names and full names + full_names = { + "acute.a": "Acute Inflammations (urinary bladder)", + "acute.b": "Acute Inflammations (renal pelvis)", + "balance.1": "Balance Scale Weight & Distance Database (left)", + "balance.2": "Balance Scale Weight & Distance Database (balanced)", + "balance.3": "Balance Scale Weight & Distance Database (right)", + "breast-cancer": "Breast Cancer Wisconsin (Original)", + "cmc.1": "Contraceptive Method Choice (no use)", + "cmc.2": "Contraceptive Method Choice (long term)", + "cmc.3": "Contraceptive Method Choice (short term)", + "ctg.1": "Cardiotocography Data Set (normal)", + "ctg.2": "Cardiotocography Data Set (suspect)", + "ctg.3": "Cardiotocography Data Set (pathologic)", + "german": "Statlog German Credit Data", + "haberman": "Haberman's Survival Data", + "ionosphere": "Johns Hopkins University Ionosphere DB", + "iris.1": "Iris Plants Database(x)", + "iris.2": "Iris Plants Database(versicolour)", + "iris.3": "Iris Plants Database(virginica)", + "mammographic": "Mammographic Mass", + "pageblocks.5": "Page Blocks Classification (5)", + "semeion": "Semeion Handwritten Digit (8)", + "sonar": "Sonar, Mines vs. 
Rocks", + "spambase": "Spambase Data Set", + "spectf": "SPECTF Heart Data", + "tictactoe": "Tic-Tac-Toe Endgame Database", + "transfusion": "Blood Transfusion Service Center Data Set", + "wdbc": "Wisconsin Diagnostic Breast Cancer", + "wine.1": "Wine Recognition Data (1)", + "wine.2": "Wine Recognition Data (2)", + "wine.3": "Wine Recognition Data (3)", + "wine-q-red": "Wine Quality Red (6-10)", + "wine-q-white": "Wine Quality White (6-10)", + "yeast": "Yeast", } - # the filename containing the dataset description (if any) - desc_name = { - 'acute': 'diagnosis.names', - '00193': None, - 'statlog/german': 'german.doc', - 'mammographic-masses': 'mammographic_masses.names', - 'undocumented/connectionist-bench/sonar': 'sonar.names', - 'spect': 'SPECTF.names', - 'blood-transfusion': 'transfusion.names', - 'wine-quality': 'winequality.names', - 'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names' + # mapping between dataset names and values of positive class + pos_class = { + "acute.a": "yes", + "acute.b": "yes", + "balance.1": "L", + "balance.2": "B", + "balance.3": "R", + "breast-cancer": 2, + "cmc.1": 1, + "cmc.2": 2, + "cmc.3": 3, + "ctg.1": 1, # 1==Normal + "ctg.2": 2, # 2==Suspect + "ctg.3": 3, # 3==Pathologic + "german": 1, + "haberman": 2, + "ionosphere": "b", + "iris.1": "Iris-setosa", # 1==Setosa + "iris.2": "Iris-versicolor", # 2==Versicolor + "iris.3": "Iris-virginica", # 3==Virginica + "mammographic": 1, + "pageblocks.5": 5, # 5==block "graphic" + "semeion": 1, + "sonar": "R", + "spambase": 1, + "spectf": 0, + "tictactoe": "negative", + "transfusion": 1, + "wdbc": "M", + "wine.1": 1, + "wine.2": 2, + "wine.3": 3, + "wine-q-red": 1, + "wine-q-white": 1, + "yeast": "NUC", } - identifier = identifier_map[dataset_name] - filename = file_name.get(identifier, f'{identifier}.data') - descfile = desc_name.get(identifier, f'{identifier}.names') - fullname = dataset_fullname[dataset_name] - - URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}' - data_dir = join(data_home, 'uci_datasets', identifier) - if isinstance(filename, str): # filename could be a list of files, in which case it will be processed later - data_path = join(data_dir, filename) - download_file_if_not_exists(f'{URL}/{filename}', data_path) - - if descfile: - try: - download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}') - if verbose: - print(open(f'{data_dir}/{descfile}', 'rt').read()) - except Exception: - print('could not read the description file') - elif verbose: - print('no file description available') + identifier = identifiers.get(dataset_name, None) + dataset_group = groups[dataset_name] + fullname = full_names[dataset_name] if verbose: - print(f'Loading {dataset_name} ({fullname})') - if identifier == 'acute': - df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t') + print(f"Loading UCI Binary {dataset_name} ({fullname})") - df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False) - [_df_replace(df, col) for col in range(1, 6)] - X = df.loc[:, 0:5].values - if dataset_name == 'acute.a': - y = binarize(df[6], pos_class='yes') - elif dataset_name == 'acute.b': - y = binarize(df[7], pos_class='yes') + file = join(data_home, "uci_datasets", dataset_group + ".pkl") - if identifier == 'balance-scale': - df = pd.read_csv(data_path, header=None, sep=',') - if dataset_name == 'balance.1': - y = binarize(df[0], pos_class='L') - elif dataset_name == 'balance.2': - y = 
binarize(df[0], pos_class='B') - elif dataset_name == 'balance.3': - y = binarize(df[0], pos_class='R') - X = df.loc[:, 1:].astype(float).values + @contextmanager + def download_tmp_file(url_group: str, filename: str): + """ + Download a data file for a group of datasets temporarily. + When used as a context, the file is removed once the context exits. - if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer': - df = pd.read_csv(data_path, header=None, sep=',') - Xy = df.loc[:, 1:10] - Xy[Xy=='?']=np.nan - Xy = Xy.dropna(axis=0) - X = Xy.loc[:, 1:9] - X = X.astype(float).values - y = binarize(Xy[10], pos_class=2) - - if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc': - df = pd.read_csv(data_path, header=None, sep=',') - X = df.loc[:, 2:32].astype(float).values - y = df[1].values - y = binarize(y, pos_class='M') - - if identifier == 'cmc': - df = pd.read_csv(data_path, header=None, sep=',') - X = df.loc[:, 0:8].astype(float).values - y = df[9].astype(int).values - if dataset_name == 'cmc.1': - y = binarize(y, pos_class=1) - elif dataset_name == 'cmc.2': - y = binarize(y, pos_class=2) - elif dataset_name == 'cmc.3': - y = binarize(y, pos_class=3) - - if identifier == '00193': - df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3) - df = df[list(range(1,24))] # select columns numbered (number 23 is the target label) - # replaces the header with the first row - new_header = df.iloc[0] # grab the first row for the header - df = df[1:] # take the data less the header row - df.columns = new_header # set the header row as the df header - X = df.iloc[:, 0:22].astype(float).values - y = df['NSP'].astype(int).values - if dataset_name == 'ctg.1': - y = binarize(y, pos_class=1) # 1==Normal - elif dataset_name == 'ctg.2': - y = binarize(y, pos_class=2) # 2==Suspect - elif dataset_name == 'ctg.3': - y = binarize(y, pos_class=3) # 3==Pathologic - - if identifier == 'statlog/german': - df = pd.read_csv(data_path, header=None, delim_whitespace=True) - X = df.iloc[:, 0:24].astype(float).values - y = df[24].astype(int).values - y = binarize(y, pos_class=1) - - if identifier == 'haberman': - df = pd.read_csv(data_path, header=None) - X = df.iloc[:, 0:3].astype(float).values - y = df[3].astype(int).values - y = binarize(y, pos_class=2) - - if identifier == 'ionosphere': - df = pd.read_csv(data_path, header=None) - X = df.iloc[:, 0:34].astype(float).values - y = df[34].values - y = binarize(y, pos_class='b') - - if identifier == 'iris': - df = pd.read_csv(data_path, header=None) - X = df.iloc[:, 0:4].astype(float).values - y = df[4].values - if dataset_name == 'iris.1': - y = binarize(y, pos_class='Iris-setosa') # 1==Setosa - elif dataset_name == 'iris.2': - y = binarize(y, pos_class='Iris-versicolor') # 2==Versicolor - elif dataset_name == 'iris.3': - y = binarize(y, pos_class='Iris-virginica') # 3==Virginica - - if identifier == 'mammographic-masses': - df = pd.read_csv(data_path, header=None, sep=',') - df[df == '?'] = np.nan - Xy = df.dropna(axis=0) - X = Xy.iloc[:, 0:5] - X = X.astype(float).values - y = binarize(Xy.iloc[:,5], pos_class=1) - - if identifier == 'page-blocks': - data_path_ = data_path.replace('.Z', '') - if not os.path.exists(data_path_): - raise FileNotFoundError(f'Warning: file {data_path_} does not exist. 
If this is the first time you ' - f'attempt to load this dataset, then you have to manually unzip the {data_path} ' - f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor ' - f'gzip can handle unix compressed files automatically -- there is a repo in GitHub ' - f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).') - df = pd.read_csv(data_path_, header=None, delim_whitespace=True) - X = df.iloc[:, 0:10].astype(float).values - y = df[10].values - y = binarize(y, pos_class=5) # 5==block "graphic" - - if identifier == 'semeion': - df = pd.read_csv(data_path, header=None, delim_whitespace=True ) - X = df.iloc[:, 0:256].astype(float).values - y = df[263].values # 263 stands for digit 8 (labels are one-hot vectors from col 256-266) - y = binarize(y, pos_class=1) - - if identifier == 'undocumented/connectionist-bench/sonar': - df = pd.read_csv(data_path, header=None, sep=',') - X = df.iloc[:, 0:60].astype(float).values - y = df[60].values - y = binarize(y, pos_class='R') - - if identifier == 'spambase': - df = pd.read_csv(data_path, header=None, sep=',') - X = df.iloc[:, 0:57].astype(float).values - y = df[57].values - y = binarize(y, pos_class=1) - - if identifier == 'spect': - dfs = [] - for file in filename: - data_path = join(data_dir, file) - download_file_if_not_exists(f'{URL}/{file}', data_path) - dfs.append(pd.read_csv(data_path, header=None, sep=',')) - df = pd.concat(dfs) - X = df.iloc[:, 1:45].astype(float).values - y = df[0].values - y = binarize(y, pos_class=0) - - if identifier == 'tic-tac-toe': - df = pd.read_csv(data_path, header=None, sep=',') - X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values - y = df[9].values - y = binarize(y, pos_class='negative') - - if identifier == 'blood-transfusion': - df = pd.read_csv(data_path, sep=',') - X = df.iloc[:, 0:4].astype(float).values - y = df.iloc[:, 4].values - y = binarize(y, pos_class=1) - - if identifier == 'wine': - df = pd.read_csv(data_path, header=None, sep=',') - X = df.iloc[:, 1:14].astype(float).values - y = df[0].values - if dataset_name == 'wine.1': - y = binarize(y, pos_class=1) - elif dataset_name == 'wine.2': - y = binarize(y, pos_class=2) - elif dataset_name == 'wine.3': - y = binarize(y, pos_class=3) - - if identifier == 'wine-quality': - filename = filename[0] if dataset_name=='wine-q-red' else filename[1] + :param url_group: identifier of the dataset group in the URL + :param filename: name of the file to be downloaded + """ + data_dir = join(data_home, "uci_datasets", "tmp") + os.makedirs(data_dir, exist_ok=True) data_path = join(data_dir, filename) - download_file_if_not_exists(f'{URL}/{filename}', data_path) - df = pd.read_csv(data_path, sep=';') - X = df.iloc[:, 0:11].astype(float).values - y = df.iloc[:, 11].values > 5 + url = f"http://archive.ics.uci.edu/ml/machine-learning-databases/{url_group}/{filename}" + download_file_if_not_exists(url, data_path) + try: + yield data_path + finally: + os.remove(data_path) - if identifier == 'yeast': - df = pd.read_csv(data_path, header=None, delim_whitespace=True) - X = df.iloc[:, 1:9].astype(float).values - y = df.iloc[:, 9].values - y = binarize(y, pos_class='NUC') + def download(id: int | None, group: str) -> dict: + """ + Download the data to be pickled for a dataset group. Use the `fetch_ucirepo` api when possible. 
- data = LabelledCollection(X, y) + :param id: numeric identifier for the group; can be None + :param group: group name + :return: a dictionary with X and y as keys and, optionally, extra data. + """ + + # use the fetch_ucirepo api, when possible, to download data + # fall back to direct download when needed + if group == "german": + with download_tmp_file("statlog/german", "german.data-numeric") as tmp: + df = pd.read_csv(tmp, header=None, delim_whitespace=True) + X, y = df.iloc[:, 0:24].astype(float).values, df[24].astype(int).values + elif group == "ctg": + with download_tmp_file("00193", "CTG.xls") as tmp: + df = pd.read_excel(tmp, sheet_name="Data", skipfooter=3) + df = df[list(range(1, 24))] # select columns numbered (number 23 is the target label) + # replaces the header with the first row + new_header = df.iloc[0] # grab the first row for the header + df = df[1:] # take the data less the header row + df.columns = new_header # set the header row as the df header + X = df.iloc[:, 0:21].astype(float).values # column 21 is skipped, it is a class column + y = df["NSP"].astype(int).values + elif group == "semeion": + with download_tmp_file("semeion", "semeion.data") as tmp: + df = pd.read_csv(tmp, header=None, sep='\s+') + X = df.iloc[:, 0:256].astype(float).values + y = df[263].values # 263 stands for digit 8 (labels are one-hot vectors from col 256-266) + else: + df = fetch_ucirepo(id=id) + X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze() + + # transform data when needed before returning (returned data will be pickled) + if group == "acute": + _array_replace(X) + data = {"X": X, "y": y} + elif group == "balance": + # features' order is reversed to match data retrieved via direct download + X = X[:, np.arange(X.shape[1])[::-1]] + data = {"X": X, "y": y} + elif group == "breast-cancer": + # remove rows with nan values + Xy = np.hstack([X, y[:, np.newaxis]]) + nan_rows = np.isnan(Xy).sum(axis=-1) > 0 + Xy = Xy[~nan_rows] + data = {"X": Xy[:, :-1], "y": Xy[:, -1]} + elif group == "mammographic": + # remove rows with nan values + Xy = np.hstack([X, y[:, np.newaxis]]) + nan_rows = np.isnan(Xy).sum(axis=-1) > 0 + Xy = Xy[~nan_rows] + data = {"X": Xy[:, :-1], "y": Xy[:, -1]} + elif group == "tictactoe": + _array_replace(X, repl={"o": 0, "b": 1, "x": 2}) + data = {"X": X, "y": y} + elif group == "wine-quality": + # add color data to split the final datasets + color = df.data.original["color"].to_numpy() + data = {"X": X, "y": y, "color": color} + else: + data = {"X": X, "y": y} + + return data + + def binarize_data(name, data: dict) -> LabelledCollection: + """ + Filter and transform data to extract a binary dataset. 
+ + :param name: name of the dataset + :param data: dictionary containing X and y fields, plus additional data when needed + :return: a :class:`quapy.data.base.LabelledCollection` with the extracted dataset + """ + if name == "acute.a": + X, y = data["X"], data["y"][:, 0] + elif name == "acute.b": + X, y = data["X"], data["y"][:, 1] + elif name == "wine-q-red": + X, y, color = data["X"], data["y"], data["color"] + red_idx = color == "red" + X, y = X[red_idx, :], y[red_idx] + y = (y > 5).astype(int) + elif name == "wine-q-white": + X, y, color = data["X"], data["y"], data["color"] + white_idx = color == "white" + X, y = X[white_idx, :], y[white_idx] + y = (y > 5).astype(int) + else: + X, y = data["X"], data["y"] + + y = binarize(y, pos_class=pos_class[name]) + + return LabelledCollection(X, y) + + data = pickled_resource(file, download, identifier, dataset_group) + data = binarize_data(dataset_name, data) + + if standardize: + stds = StandardScaler() + data.instances = stds.fit_transform(data.instances) if verbose: data.stats() return data -def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: +def fetch_UCIMulticlassDataset( + dataset_name, + data_home=None, + min_test_split=0.3, + max_train_instances=25000, + min_class_support=100, + standardize=True, + verbose=False) -> Dataset: """ Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. @@ -578,15 +610,38 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver :param dataset_name: a dataset name :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quapy_data/ directory) - :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param min_test_split: minimum proportion of instances to be included in the test set. This value is interpreted + as a minimum proportion, meaning that the real proportion could be higher in case the training proportion + (a fraction 1-`min_test_split` of the instances) surpasses `max_train_instances`. In such a case, only `max_train_instances` + are taken for training, and the rest (irrespective of `min_test_split`) is taken for test. + :param max_train_instances: maximum number of instances to keep for training (defaults to 25000); + set to -1 or None to avoid this check + :param min_class_support: minimum number of instances per class. Classes with fewer instances + are discarded (default is 100) + :param standardize: indicates whether the covariates should be standardized or not (default is True). If requested, + standardization applies after the LabelledCollection is split, that is, the mean and std are computed only on the + training portion of the data. :param verbose: set to True (default is False) to get information (stats) about the dataset :return: a :class:`quapy.data.base.Dataset` instance """ - data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) - return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + + data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, min_class_support, standardize=False, verbose=verbose) + n = len(data) + train_prop = (1.-min_test_split) + if (max_train_instances is not None) and (max_train_instances > 0): + n_train = int(n*train_prop) + if n_train > max_train_instances: + train_prop = (max_train_instances / n) + + data = Dataset(*data.split_stratified(train_prop, random_state=0), name=dataset_name) + + if standardize: + data = standardizer(data) + + return data -def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: +def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, standardize=True, verbose=False) -> LabelledCollection: """ Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. @@ -608,7 +663,9 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= :param dataset_name: a dataset name :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default ~/quapy_data/ directory) - :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param min_class_support: minimum number of instances per class. Classes with fewer instances + are discarded (default is 100) + :param standardize: indicates whether the covariates should be standardized or not (default is True). 
:param verbose: set to True (default is False) to get information (stats) about the dataset :return: a :class:`quapy.data.base.LabelledCollection` instance """ @@ -621,19 +678,57 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= data_home = get_quapy_home() identifiers = { - "dry-bean": 602, - "wine-quality": 186, - "academic-success": 697, - "digits": 80, - "letter": 59 + 'dry-bean': 602, + 'wine-quality': 186, + 'academic-success': 697, + 'digits': 80, + 'letter': 59, + 'abalone': 1, + 'obesity': 544, + 'nursery': 76, + 'yeast': 110, + 'hand_digits': 81, + 'satellite': 146, + 'shuttle': 148, + 'cmc': 30, + 'isolet': 54, + 'waveform-v1': 107, + 'molecular': 69, + 'poker_hand': 158, + 'connect-4': 26, + 'mhr': 863, + 'chess': 23, + 'page_block': 78, + 'phishing': 379, + 'image_seg': 147, + 'hcv': 503, } full_names = { - "dry-bean": "Dry Bean Dataset", - "wine-quality": "Wine Quality", - "academic-success": "Predict students' dropout and academic success", - "digits": "Optical Recognition of Handwritten Digits", - "letter": "Letter Recognition" + 'dry-bean': 'Dry Bean Dataset', + 'wine-quality': 'Wine Quality', + 'academic-success': 'Predict students\' dropout and academic success', + 'digits': 'Optical Recognition of Handwritten Digits', + 'letter': 'Letter Recognition', + 'abalone': 'Abalone', + 'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition', + 'nursery': 'Nursery', + 'yeast': 'Yeast', + 'hand_digits': 'Pen-Based Recognition of Handwritten Digits', + 'satellite': 'Statlog Landsat Satellite', + 'shuttle': 'Statlog Shuttle', + 'cmc': 'Contraceptive Method Choice', + 'isolet': 'ISOLET', + 'waveform-v1': 'Waveform Database Generator (Version 1)', + 'molecular': 'Molecular Biology (Splice-junction Gene Sequences)', + 'poker_hand': 'Poker Hand', + 'connect-4': 'Connect-4', + 'mhr': 'Maternal Health Risk', + 'chess': 'Chess (King-Rook vs. 
King)', + 'page_block': 'Page Blocks Classification', + 'phishing': 'Website Phishing', + 'image_seg': 'Statlog (Image Segmentation)', + 'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients', } identifier = identifiers[dataset_name] @@ -644,14 +739,42 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= file = join(data_home, 'uci_multiclass', dataset_name+'.pkl') - def download(id): - data = fetch_ucirepo(id=id) - X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze() + def download(id, name): + df = fetch_ucirepo(id=id) + + df.data.features = pd.get_dummies(df.data.features, drop_first=True) + X, y = df.data.features.to_numpy(dtype=np.float64), df.data.targets.to_numpy().squeeze() + + assert y.ndim == 1, 'more than one y' + classes = np.sort(np.unique(y)) y = np.searchsorted(classes, y) return LabelledCollection(X, y) - data = pickled_resource(file, download, identifier) + def filter_classes(data: LabelledCollection, min_ipc): + if min_ipc is None: + min_ipc = 0 + classes = data.classes_ + # restrict classes to only those with at least min_ipc instances + classes = classes[data.counts() >= min_ipc] + # filter X and y keeping only datapoints belonging to valid classes + filter_idx = np.in1d(data.y, classes) + X, y = data.X[filter_idx], data.y[filter_idx] + # map classes to range(len(classes)) + y = np.searchsorted(classes, y) + return LabelledCollection(X, y) + + data = pickled_resource(file, download, identifier, dataset_name) + data = filter_classes(data, min_class_support) + if data.n_classes <= 2: + raise ValueError( + f'After filtering out classes with fewer than {min_class_support=} instances, the dataset {dataset_name} ' + f'is no longer multiclass. Try reducing this value.' + ) + + if standardize: + stds = StandardScaler() + data.instances = stds.fit_transform(data.instances) if verbose: data.stats() @@ -663,9 +786,14 @@ def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False) +def _array_replace(arr, repl={"yes": 1, "no": 0}): + for k, v in repl.items(): + arr[arr == k] = v + + def fetch_lequa2022(task, data_home=None): """ - Loads the official datasets provided for the `LeQua `_ competition. + Loads the official datasets provided for the `LeQua 2022 `_ competition. In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead. Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B are multiclass quantification @@ -677,20 +805,19 @@ def fetch_lequa2022(task, data_home=None): The datasets are downloaded only once, and stored for fast reuse. - See `lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these + See `4.lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these datasets. 
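+
+        A minimal, illustrative call pattern (for illustration only; the quantifier training step is omitted):
+
+        >>> import quapy as qp
+        >>> train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1A')
+        >>> qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1A']
+        >>> # fit a quantifier on `train`, model-select over `val_gen`, evaluate over `test_gen`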
- :param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quay_data/ directory) :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of - :class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`, + :class:`quapy.data._lequa.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`, that return a series of samples stored in a directory which are labelled by prevalence. """ - from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir + from quapy.data._lequa import load_raw_documents, load_vector_documents_2022, SamplesFromDir assert task in LEQUA2022_TASKS, \ f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}' @@ -708,7 +835,9 @@ def fetch_lequa2022(task, data_home=None): tmp_path = join(lequa_dir, task + '_tmp.zip') download_file_if_not_exists(url, tmp_path) with zipfile.ZipFile(tmp_path) as file: + print(f'Unzipping {tmp_path}...', end='') file.extractall(unzipped_path) + print(f'[done]') os.remove(tmp_path) if not os.path.exists(join(lequa_dir, task)): @@ -717,7 +846,7 @@ def fetch_lequa2022(task, data_home=None): download_unzip_and_remove(lequa_dir, URL_TEST_PREV) if task in ['T1A', 'T1B']: - load_fn = load_vector_documents + load_fn = load_vector_documents_2022 elif task in ['T2A', 'T2B']: load_fn = load_raw_documents @@ -735,32 +864,119 @@ def fetch_lequa2022(task, data_home=None): return train, val_gen, test_gen -def fetch_IFCB(single_sample_train=True, data_home=None): +def fetch_lequa2024(task, data_home=None, merge_T3=False): """ - Loads the IFCB dataset for quantification `. For more - information on this dataset check the zenodo site. - This dataset is based on the data available publicly at . - The scripts for the processing are available at + Loads the official datasets provided for the `LeQua 2024 `_ competition. + LeQua 2024 defines four tasks (T1, T2, T3, T4) related to the problem of quantification; + all tasks are affected by some type of dataset shift. Tasks T1 and T2 are akin to tasks T1A and T1B of LeQua 2022, + while T3 and T4 are new tasks introduced in LeQua 2024. - Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms. + - Task T1 evaluates binary quantifiers under prior probability shift (akin to T1A of LeQua 2022). + - Task T2 evaluates single-label multi-class quantifiers (for n > 2 classes) under prior probability shift (akin to T1B of LeQua 2022). + - Task T3 evaluates ordinal quantifiers, where the classes are totally ordered. + - Task T4 also evaluates binary quantifiers, but under some mix of covariate shift and prior probability shift. + + For a broader discussion, we refer to the `online official documentation `_ + + The datasets are downloaded only once, and stored locally for future reuse. + + See `4b.lequa2024_experiments.py` provided in the example folder, which can serve as a guide on how to use these + datasets. 
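+
+        An illustrative sketch of the intended call pattern (note that T3 returns a generator of training
+        collections unless `merge_T3=True`):
+
+        >>> import quapy as qp
+        >>> train, val_gen, test_gen = qp.datasets.fetch_lequa2024('T1')
+        >>> train_T3, val_gen, test_gen = qp.datasets.fetch_lequa2024('T3', merge_T3=True)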
+ + :param task: a string representing the task name; valid ones are T1, T2, T3, and T4 + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quapy_data/ directory) + :param merge_T3: bool, if False (default), returns a generator of training collections, corresponding to natural + groups of reviews; if True, returns one single :class:`quapy.data.base.LabelledCollection` representing the + entire training set, as a concatenation of all the training collections + :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of + :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of + :class:`quapy.data._lequa.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`, + that return a series of samples stored in a directory which are labelled by prevalence. + """ + + from quapy.data._lequa import load_vector_documents_2024, SamplesFromDir, LabelledCollectionsFromDir + + assert task in LEQUA2024_TASKS, \ + f'Unknown task {task}. Valid ones are {LEQUA2024_TASKS}' + + if data_home is None: + data_home = get_quapy_home() + + lequa_dir = data_home + + LEQUA2024_ZENODO = 'https://zenodo.org/records/11661820' # v3, last one with labels + + URL_TRAINDEV=f'{LEQUA2024_ZENODO}/files/{task}.train_dev.zip' + URL_TEST=f'{LEQUA2024_ZENODO}/files/{task}.test.zip' + URL_TEST_PREV=f'{LEQUA2024_ZENODO}/files/{task}.test_prevalences.zip' + + lequa_dir = join(data_home, 'lequa2024') + os.makedirs(lequa_dir, exist_ok=True) + + def download_unzip_and_remove(unzipped_path, url): + tmp_path = join(lequa_dir, task + '_tmp.zip') + download_file_if_not_exists(url, tmp_path) + with zipfile.ZipFile(tmp_path) as file: + file.extractall(unzipped_path) + os.remove(tmp_path) + + if not os.path.exists(join(lequa_dir, task)): + download_unzip_and_remove(lequa_dir, URL_TRAINDEV) + download_unzip_and_remove(lequa_dir, URL_TEST) + download_unzip_and_remove(lequa_dir, URL_TEST_PREV) + + load_fn = load_vector_documents_2024 + + val_samples_path = join(lequa_dir, task, 'public', 'dev_samples') + val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt') + val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn) + + test_samples_path = join(lequa_dir, task, 'public', 'test_samples') + test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt') + test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn) + + if task == 'T3': + training_samples_path = join(lequa_dir, task, 'public', 'training_samples') + training_true_prev_path = join(lequa_dir, task, 'public', 'training_prevalences.txt') + train_gen = LabelledCollectionsFromDir(training_samples_path, training_true_prev_path, load_fn=load_fn) + if merge_T3: + train = LabelledCollection.join(*list(train_gen())) + return train, val_gen, test_gen + else: + return train_gen, val_gen, test_gen + else: + tr_path = join(lequa_dir, task, 'public', 'training_data.txt') + train = LabelledCollection.load(tr_path, loader_func=load_fn) + return train, val_gen, test_gen + + +def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None): + """ + Loads the IFCB dataset for quantification from `Zenodo `_ (for more + information on this dataset, please follow the zenodo link). + This dataset is based on the data available publicly at + `WHOI-Plankton repo `_. + The dataset already comes with processed features. + The scripts used for the processing are available at `P. 
González's repo `_. The datasets are downloaded only once, and stored for fast reuse. - :param single_sample_train: boolean. If True (default), it returns the train dataset as an instance of + :param single_sample_train: a boolean. If True, it will return the train dataset as a :class:`quapy.data.base.LabelledCollection` (all examples together). - If False, a generator of training samples will be returned. - Each example in the training set has an individual class label. + If False, a generator of training samples will be returned. Each example in the training set has an individual label. + :param for_model_selection: if True, then returns a 30% split of the training set (86 out of 286 samples) to be used for model selection; + if False, then returns the full training set as training set and the test set as the test set :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quapy_data/ directory) :return: a tuple `(train, test_gen)` where `train` is an instance of - :class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is True or - :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise, i.e. a sampling protocol that - returns a series of samples labelled example by example. - test_gen is an instance of :class:`quapy.data._ifcb.IFCBTestSamples`, + :class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is True or + :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir`, i.e. a sampling protocol that returns a series of samples + labelled example by example. test_gen will be a :class:`quapy.data._ifcb.IFCBTestSamples`, i.e., a sampling protocol that returns a series of samples labelled by prevalence. """ - from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples + from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples, get_sample_list, generate_modelselection_split if data_home is None: data_home = get_quapy_home() @@ -791,25 +1007,26 @@ def fetch_IFCB(single_sample_train=True, data_home=None): test_true_prev = pd.read_csv(test_true_prev_path) classes = test_true_prev.columns[1:] - #Load train samples + #Load train and test samples train_samples_path = join(ifcb_dir,'train') - train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes) - - #Load test samples test_samples_path = join(ifcb_dir,'test') - test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path) + + if for_model_selection: + # In this case, return 70% of training data as the training set and 30% as the test set + samples = get_sample_list(train_samples_path) + train, test = generate_modelselection_split(samples, test_prop=0.3) + train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train) + + # Test prevalence is computed from class labels + test_gen = IFCBTestSamples(path_dir=train_samples_path, test_prevalences=None, samples=test, classes=classes) + else: + # In this case, we use all training samples as the training set and the test samples as the test set + train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes) + test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences=test_true_prev) # In the case the user wants it, join all the train samples in one LabelledCollection if single_sample_train: - X = [] - y = [] - for X_, y_ in train_gen(): - X.append(X_) - y.append(y_) - - X = np.vstack(X) - y = np.concatenate(y) - train = LabelledCollection(X,y, classes=classes) + train 
= LabelledCollection.join(*[lc for lc in train_gen()]) return train, test_gen else: return train_gen, test_gen diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py index 9aa8f8b..5f7e0a9 100644 --- a/quapy/data/preprocessing.py +++ b/quapy/data/preprocessing.py @@ -10,6 +10,37 @@ from quapy.util import map_parallel from .base import LabelledCollection +def instance_transformation(dataset:Dataset, transformer, inplace=False): + """ + Transforms a :class:`quapy.data.base.Dataset` applying the `fit_transform` and `transform` functions + of a (sklearn's) transformer. + + :param dataset: a :class:`quapy.data.base.Dataset` whose training and test instances are in a format + compatible with the given transformer + :param transformer: TransformerMixin implementing `fit_transform` and `transform` functions + :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default) + :return: a new :class:`quapy.data.base.Dataset` with transformed instances (if inplace=False) or a reference to the + current Dataset (if inplace=True) where the instances have been transformed + """ + training_transformed = transformer.fit_transform(*dataset.training.Xy) + test_transformed = transformer.transform(dataset.test.X) + orig_name = dataset.name + + if inplace: + dataset.training = LabelledCollection(training_transformed, dataset.training.labels, dataset.classes_) + dataset.test = LabelledCollection(test_transformed, dataset.test.labels, dataset.classes_) + if hasattr(transformer, 'vocabulary_'): + dataset.vocabulary = transformer.vocabulary_ + return dataset + else: + training = LabelledCollection(training_transformed, dataset.training.labels.copy(), dataset.classes_) + test = LabelledCollection(test_transformed, dataset.test.labels.copy(), dataset.classes_) + vocab = None + if hasattr(transformer, 'vocabulary_'): + vocab = transformer.vocabulary_ + return Dataset(training, test, vocabulary=vocab, name=orig_name) + + def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs): """ Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of @@ -29,18 +60,7 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw __check_type(dataset.test.instances, np.ndarray, str) vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs) - training_documents = vectorizer.fit_transform(dataset.training.instances) - test_documents = vectorizer.transform(dataset.test.instances) - - if inplace: - dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_) - dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_) - dataset.vocabulary = vectorizer.vocabulary_ - return dataset - else: - training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_) - test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_) - return Dataset(training, test, vectorizer.vocabulary_) + return instance_transformation(dataset, vectorizer, inplace) def reduce_columns(dataset: Dataset, min_df=5, inplace=False): @@ -90,12 +110,17 @@ def standardize(dataset: Dataset, inplace=False): :class:`quapy.data.base.Dataset` is to be returned :return: an instance of :class:`quapy.data.base.Dataset` """ - s = StandardScaler(copy=not inplace) - training = s.fit_transform(dataset.training.instances) - test = s.transform(dataset.test.instances) + s = StandardScaler() + train, 
test = dataset.train_test + std_train_X = s.fit_transform(train.X) + std_test_X = s.transform(test.X) if inplace: + dataset.training.instances = std_train_X + dataset.test.instances = std_test_X return dataset else: + training = LabelledCollection(std_train_X, train.labels, classes=train.classes_) + test = LabelledCollection(std_test_X, test.labels, classes=test.classes_) return Dataset(training, test, dataset.vocabulary, dataset.name) diff --git a/quapy/error.py b/quapy/error.py index f2f5bd0..eb42cd6 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -45,89 +45,95 @@ def acce(y_true, y_pred): return 1. - (y_true == y_pred).mean() -def mae(prevs, prevs_hat): +def mae(prevs_true, prevs_hat): """Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs. - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean absolute error """ - return ae(prevs, prevs_hat).mean() + return ae(prevs_true, prevs_hat).mean() -def ae(prevs, prevs_hat): +def ae(prevs_true, prevs_hat): """Computes the absolute error between the two prevalence vectors. Absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`AE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}|\\hat{p}(y)-p(y)|`, where :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: absolute error """ - assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' - return abs(prevs_hat - prevs).mean(axis=-1) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs_true).mean(axis=-1) -def nae(prevs, prevs_hat): +def nae(prevs_true, prevs_hat): """Computes the normalized absolute error between the two prevalence vectors. Normalized absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`NAE(p,\\hat{p})=\\frac{AE(p,\\hat{p})}{z_{AE}}`, where :math:`z_{AE}=\\frac{2(1-\\min_{y\\in \\mathcal{Y}} p(y))}{|\\mathcal{Y}|}`, and :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: normalized absolute error """ - assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' - return abs(prevs_hat - prevs).sum(axis=-1)/(2*(1-prevs.min(axis=-1))) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs_true).sum(axis=-1)/(2 * (1 - prevs_true.min(axis=-1))) -def mnae(prevs, prevs_hat): +def mnae(prevs_true, prevs_hat): """Computes the mean normalized absolute error (see :meth:`quapy.error.nae`) across the sample pairs. 
- :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean normalized absolute error """ - return nae(prevs, prevs_hat).mean() + return nae(prevs_true, prevs_hat).mean() -def mse(prevs, prevs_hat): +def mse(prevs_true, prevs_hat): """Computes the mean squared error (see :meth:`quapy.error.se`) across the sample pairs. - :param prevs: array-like of shape `(n_samples, n_classes,)` with the + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean squared error """ - return se(prevs, prevs_hat).mean() + return se(prevs_true, prevs_hat).mean() -def se(prevs, prevs_hat): +def se(prevs_true, prevs_hat): """Computes the squared error between the two prevalence vectors. Squared error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`SE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}(\\hat{p}(y)-p(y))^2`, where :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: squared error """ - return ((prevs_hat - prevs) ** 2).mean(axis=-1) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + return ((prevs_hat - prevs_true) ** 2).mean(axis=-1) -def mkld(prevs, prevs_hat, eps=None): +def mkld(prevs_true, prevs_hat, eps=None): """Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -137,10 +143,10 @@ def mkld(prevs, prevs_hat, eps=None): (which has thus to be set beforehand). :return: mean Kullback-Leibler divergence """ - return kld(prevs, prevs_hat, eps).mean() + return kld(prevs_true, prevs_hat, eps).mean() -def kld(prevs, prevs_hat, eps=None): +def kld(prevs_true, prevs_hat, eps=None): """Computes the Kullback-Leibler divergence between the two prevalence distributions. Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` is computed as @@ -149,7 +155,7 @@ def kld(prevs, prevs_hat, eps=None): where :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. KLD is not defined in cases in which the distributions contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. 
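As a quick, hand-checked illustration of the renamed signatures (the prevalence vectors below are arbitrary, not part of this patch):

>>> import numpy as np
>>> from quapy.error import ae, se
>>> p_true, p_hat = np.asarray([0.2, 0.8]), np.asarray([0.3, 0.7])
>>> ae(p_true, p_hat)  # mean of |0.1| and |-0.1| -> 0.1
>>> se(p_true, p_hat)  # mean of 0.1**2 and (-0.1)**2 -> 0.01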
@@ -158,17 +164,17 @@ :return: Kullback-Leibler divergence between the two distributions """ eps = __check_eps(eps) - smooth_prevs = prevs + eps - smooth_prevs_hat = prevs_hat + eps + smooth_prevs = smooth(prevs_true, eps) + smooth_prevs_hat = smooth(prevs_hat, eps) return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1) -def mnkld(prevs, prevs_hat, eps=None): +def mnkld(prevs_true, prevs_hat, eps=None): """Computes the mean Normalized Kullback-Leibler divergence (see :meth:`quapy.error.nkld`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain @@ -177,10 +183,10 @@ def mnkld(prevs, prevs_hat, eps=None): (which has thus to be set beforehand). :return: mean Normalized Kullback-Leibler divergence """ - return nkld(prevs, prevs_hat, eps).mean() + return nkld(prevs_true, prevs_hat, eps).mean() -def nkld(prevs, prevs_hat, eps=None): +def nkld(prevs_true, prevs_hat, eps=None): """Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions. Normalized Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` is computed as @@ -189,7 +195,7 @@ def nkld(prevs, prevs_hat, eps=None): :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample @@ -197,16 +203,16 @@ def nkld(prevs, prevs_hat, eps=None): `SAMPLE_SIZE` (which has thus to be set beforehand). :return: Normalized Kullback-Leibler divergence between the two distributions """ - ekld = np.exp(kld(prevs, prevs_hat, eps)) + ekld = np.exp(kld(prevs_true, prevs_hat, eps)) return 2. * ekld / (1 + ekld) - 1. -def mrae(prevs, prevs_hat, eps=None): +def mrae(prevs_true, prevs_hat, eps=None): """Computes the mean relative absolute error (see :meth:`quapy.error.rae`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -216,10 +222,10 @@ def mrae(prevs, prevs_hat, eps=None): the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). :return: mean relative absolute error """ - return rae(prevs, prevs_hat, eps).mean() + return rae(prevs_true, prevs_hat, eps).mean() -def rae(prevs, prevs_hat, eps=None): +def rae(prevs_true, prevs_hat, eps=None): """Computes the absolute relative error between the two prevalence vectors. 
Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as @@ -228,7 +234,7 @@ def rae(prevs, prevs_hat, eps=None): where :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. `rae` is not defined in cases in which the true distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the @@ -237,12 +243,12 @@ def rae(prevs, prevs_hat, eps=None): :return: relative absolute error """ eps = __check_eps(eps) - prevs = smooth(prevs, eps) + prevs_true = smooth(prevs_true, eps) prevs_hat = smooth(prevs_hat, eps) - return (abs(prevs - prevs_hat) / prevs).mean(axis=-1) + return (abs(prevs_true - prevs_hat) / prevs_true).mean(axis=-1) -def nrae(prevs, prevs_hat, eps=None): +def nrae(prevs_true, prevs_hat, eps=None): """Computes the normalized relative absolute error between the two prevalence vectors. Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as @@ -252,7 +258,7 @@ def nrae(prevs, prevs_hat, eps=None): and :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. `nrae` is not defined in cases in which the true distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the @@ -261,18 +267,18 @@ def nrae(prevs, prevs_hat, eps=None): :return: normalized relative absolute error """ eps = __check_eps(eps) - prevs = smooth(prevs, eps) + prevs_true = smooth(prevs_true, eps) prevs_hat = smooth(prevs_hat, eps) - min_p = prevs.min(axis=-1) - return (abs(prevs - prevs_hat) / prevs).sum(axis=-1)/(prevs.shape[-1]-1+(1-min_p)/min_p) + min_p = prevs_true.min(axis=-1) + return (abs(prevs_true - prevs_hat) / prevs_true).sum(axis=-1)/(prevs_true.shape[-1] - 1 + (1 - min_p) / min_p) -def mnrae(prevs, prevs_hat, eps=None): +def mnrae(prevs_true, prevs_hat, eps=None): """Computes the mean normalized relative absolute error (see :meth:`quapy.error.nrae`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -282,7 +288,66 @@ def mnrae(prevs, prevs_hat, eps=None): the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand).
:return: mean normalized relative absolute error """ - return nrae(prevs, prevs_hat, eps).mean() + return nrae(prevs_true, prevs_hat, eps).mean() + + +def nmd(prevs_true, prevs_hat): + """ + Computes the Normalized Match Distance, i.e., the Match Distance multiplied by the factor + `1/(n-1)` to guarantee the measure ranges between 0 (best prediction) and 1 (worst prediction). + + :param prevs_true: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values + :return: float in [0,1] + """ + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + n = prevs_true.shape[-1] + return (1./(n-1))*np.mean(match_distance(prevs_true, prevs_hat)) + + +def bias_binary(prevs_true, prevs_hat): + """ + Computes the (positive) bias in a binary problem. The bias is simply the difference between the + predicted and the true proportion of positives, so that a positive value indicates the prediction + tends to overestimate the true value, and a negative value indicates the opposite. + :math:`bias(p,\\hat{p})=\\hat{p}_1-p_1`, + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :return: binary bias + """ + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape[-1] == 2 and prevs_hat.shape[-1] == 2, 'bias_binary can only be applied to binary problems' + return prevs_hat[...,1]-prevs_true[...,1] + + +def mean_bias_binary(prevs_true, prevs_hat): + """ + Computes the mean of the (positive) bias in a binary problem. + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values + :return: mean binary bias + """ + return np.mean(bias_binary(prevs_true, prevs_hat)) + + +def md(prevs_true, prevs_hat, ERROR_TOL=1E-3): + """ + Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in + all cases.
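For intuition, the Match Distance compares the two cumulative distributions class by class; a small sketch on a hypothetical three-class ordinal problem (the values in the comments are exact):

import numpy as np
import quapy.error as err

p_true = np.asarray([0.2, 0.5, 0.3])
p_hat = np.asarray([0.1, 0.5, 0.4])  # mass shifted one class up
err.md(p_true, p_hat)   # 0.2, the sum of |CDF gaps| over the first n-1 classes
err.nmd(p_true, p_hat)  # 0.1, the same rescaled by 1/(n-1)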
+ + :param prevs_true: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values + :param ERROR_TOL: tolerance used to check that the predicted prevalence values represent a valid distribution (default 1E-3) + :return: float + """ + P = np.cumsum(prevs_true, axis=-1) + P_hat = np.cumsum(prevs_hat, axis=-1) + assert np.all(np.isclose(P_hat[..., -1], 1.0, rtol=ERROR_TOL)), \ + 'arg error in match_distance: the array does not represent a valid distribution' + distances = np.abs(P-P_hat) + return distances[..., :-1].sum(axis=-1) def smooth(prevs, eps): @@ -294,6 +359,7 @@ def smooth(prevs, eps): :param eps: smoothing factor :return: array-like of shape `(n_classes,)` with the smoothed distribution """ + prevs = np.asarray(prevs) n_classes = prevs.shape[-1] return (prevs + eps) / (eps * n_classes + 1) @@ -328,3 +394,5 @@ normalized_absolute_error = nae normalized_relative_absolute_error = nrae mean_normalized_absolute_error = mnae mean_normalized_relative_absolute_error = mnrae +normalized_match_distance = nmd +match_distance = md diff --git a/quapy/evaluation.py b/quapy/evaluation.py index c198115..2805555 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -63,7 +63,7 @@ def prediction( protocol_with_predictions = protocol.on_preclassified_instances(pre_classified) return __prediction_helper(model.aggregate, protocol_with_predictions, verbose) else: - return __prediction_helper(model.quantify, protocol, verbose) + return __prediction_helper(model.predict, protocol, verbose) def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False): @@ -118,14 +118,15 @@ def _prevalence_report(true_prevs, estim_prevs, error_metrics: Iterable[Union[st assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions' error_names = [e.__name__ for e in error_funcs] - df = pd.DataFrame(columns=['true-prev', 'estim-prev'] + error_names) + row_entries = [] for true_prev, estim_prev in zip(true_prevs, estim_prevs): series = {'true-prev': true_prev, 'estim-prev': estim_prev} for error_name, error_metric in zip(error_names, error_funcs): score = error_metric(true_prev, estim_prev) series[error_name] = score - df = df.append(series, ignore_index=True) + row_entries.append(series) + df = pd.DataFrame.from_records(row_entries) return df diff --git a/quapy/functional.py b/quapy/functional.py index e29466f..408c62a 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -1,52 +1,73 @@ -import itertools +import warnings from collections import defaultdict -from typing import Union, Callable +from typing import Literal, Union, Callable +from numpy.typing import ArrayLike import scipy import numpy as np -def prevalence_linspace(n_prevalences=21, repeats=1, smooth_limits_epsilon=0.01): +# ------------------------------------------------------------------------------------------ +# General utils +# ------------------------------------------------------------------------------------------ + +def classes_from_labels(labels): """ - Produces an array of uniformly separated values of prevalence.
- By default, produces an array of 21 prevalence values, with - step 0.05 and with the limits smoothed, i.e.: - [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99] - - :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21) - :param repeats: number of times each prevalence is to be repeated (defaults to 1) - :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1 - :return: an array of uniformly separated prevalence values + Obtains a np.ndarray with the (sorted) classes. + :param labels: array-like with the instances' labels + :return: a sorted np.ndarray with the class labels """ - p = np.linspace(0., 1., num=n_prevalences, endpoint=True) - p[0] += smooth_limits_epsilon - p[-1] -= smooth_limits_epsilon - if p[0] > p[1]: - raise ValueError(f'the smoothing in the limits is greater than the prevalence step') - if repeats > 1: - p = np.repeat(p, repeats) - return p + classes = np.unique(labels) + classes.sort() + return classes -def prevalence_from_labels(labels, classes): +def num_classes_from_labels(labels): """ - Computed the prevalence values from a vector of labels. + Obtains the number of classes from an array-like of instances' labels + :param labels: array-like with the instances' labels + :return: int, the number of classes + """ + return len(classes_from_labels(labels)) - :param labels: array-like of shape `(n_instances)` with the label for each instance +# ------------------------------------------------------------------------------------------ +# Counter utils +# ------------------------------------------------------------------------------------------ + +def counts_from_labels(labels: ArrayLike, classes: ArrayLike) -> np.ndarray: + """ + Computes the raw count values from a vector of labels. + + :param labels: array-like of shape `(n_instances,)` with the label for each instance :param classes: the class labels. This is needed in order to correctly compute the prevalence vector even when some classes have no examples. - :return: an ndarray of shape `(len(classes))` with the class prevalence values + :return: ndarray of shape `(len(classes),)` with the raw counts for each class, in the same order + as they appear in `classes` """ - if labels.ndim != 1: + if np.asarray(labels).ndim != 1: raise ValueError(f'param labels does not seem to be a ndarray of label predictions') unique, counts = np.unique(labels, return_counts=True) by_class = defaultdict(lambda:0, dict(zip(unique, counts))) - prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=float) - prevalences /= prevalences.sum() + counts = np.asarray([by_class[class_] for class_ in classes], dtype=int) + return counts + + +def prevalence_from_labels(labels: ArrayLike, classes: ArrayLike): + """ + Computes the prevalence values from a vector of labels. + + :param labels: array-like of shape `(n_instances,)` with the label for each instance + :param classes: the class labels. This is needed in order to correctly compute the prevalence vector even when + some classes have no examples.
+ :return: ndarray of shape `(len(classes),)` with the class proportions for each class, in the same order + as they appear in `classes` + """ + counts = counts_from_labels(labels, classes) + prevalences = counts.astype(float) / np.sum(counts) return prevalences -def prevalence_from_probabilities(posteriors, binarize: bool = False): +def prevalence_from_probabilities(posteriors: ArrayLike, binarize: bool = False): """ Returns a vector of prevalence values from a matrix of posterior probabilities. @@ -55,8 +76,9 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False): converting the vectors of posterior probabilities into class indices, by taking the argmax). :return: array of shape `(n_classes,)` containing the prevalence values """ + posteriors = np.asarray(posteriors) if posteriors.ndim != 2: - raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities') + raise ValueError(f'param posteriors does not seem to be a ndarray of posterior probabilities') if binarize: predictions = np.argmax(posteriors, axis=-1) return prevalence_from_labels(predictions, np.arange(posteriors.shape[1])) @@ -66,7 +88,277 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False): return prevalences -def HellingerDistance(P, Q) -> float: +def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1) -> int: + """ + Computes the number of valid prevalence combinations in the n_classes-dimensional simplex if `n_prevpoints` equally + distant prevalence values are generated and `n_repeats` repetitions are requested. + The computation comes down to calculating: + + .. math:: + \\binom{N+C-1}{C-1} \\times r + + where `N` is `n_prevpoints-1`, i.e., the number of probability mass blocks to allocate, `C` is the number of + classes, and `r` is `n_repeats`. This solution comes from the + `Stars and Bars `_ problem. + + :param int n_classes: number of classes + :param int n_prevpoints: number of prevalence points. + :param int n_repeats: number of repetitions for each prevalence combination + :return: The number of possible combinations. For example, if `n_classes`=2, `n_prevpoints`=5, `n_repeats`=1, + then the number of possible combinations is 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], + and [1.0,0.0] + """ + N = n_prevpoints-1 + C = n_classes + r = n_repeats + return int(scipy.special.binom(N + C - 1, C - 1) * r) + + +def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1) -> int: + """ + Searches for the largest number of (equidistant) prevalence points to define for each of the `n_classes` classes so + that the number of valid prevalence values generated as combinations of prevalence points (points in a + `n_classes`-dimensional simplex) does not exceed combinations_budget.
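A quick check of the stars-and-bars count, together with the budget-driven search defined next (the first call reproduces the example from the docstring above):

import quapy.functional as F

F.num_prevalence_combinations(n_prevpoints=5, n_classes=2, n_repeats=1)  # 5
F.get_nprevpoints_approximation(combinations_budget=5000, n_classes=3)   # largest grid that stays within the budget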
+ + :param int combinations_budget: maximum number of combinations allowed + :param int n_classes: number of classes + :param int n_repeats: number of repetitions for each prevalence combination + :return: the largest number of prevalence points that generate fewer than combinations_budget valid prevalences + """ + assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers' + n_prevpoints = 1 + while True: + combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats) + if combinations > combinations_budget: + return n_prevpoints-1 + else: + n_prevpoints += 1 + + +# ------------------------------------------------------------------------------------------ +# Prevalence vectors +# ------------------------------------------------------------------------------------------ + +def as_binary_prevalence(positive_prevalence: Union[float, ArrayLike], clip_if_necessary: bool=False) -> np.ndarray: + """ + Helper that, given a float representing the prevalence for the positive class, returns a np.ndarray of two + values representing a binary distribution. + + :param positive_prevalence: float or array-like of floats with the prevalence for the positive class + :param bool clip_if_necessary: if True, clips the value in [0,1] in order to guarantee the resulting distribution + is valid. If False, it then checks that the value is in the valid range, and raises an error if not. + :return: np.ndarray of shape `(2,)` + """ + positive_prevalence = np.asarray(positive_prevalence, float) + if clip_if_necessary: + positive_prevalence = np.clip(positive_prevalence, 0, 1) + else: + assert np.logical_and(0 <= positive_prevalence, positive_prevalence <= 1).all(), \ + 'the value provided is not a valid prevalence for the positive class' + return np.asarray([1-positive_prevalence, positive_prevalence]).T + + +def strprev(prevalences: ArrayLike, prec: int=3) -> str: + """ + Returns a string representation for a prevalence vector. E.g., + + >>> strprev([1/3, 2/3], prec=2) + '[0.33, 0.67]' + + :param prevalences: array-like of prevalence values + :param prec: int, indicates the float precision (number of decimal values to print) + :return: string + """ + return '['+ ', '.join([f'{p:.{prec}f}' for p in prevalences]) + ']' + + +def check_prevalence_vector(prevalences: ArrayLike, raise_exception: bool=False, tolerance: float=1e-08, aggr=True): + """ + Checks that `prevalences` is a valid prevalence vector, i.e., it contains values in [0,1] and + the values sum up to 1. In other words, verifies that the `prevalences` vectors lie in the + probability simplex.
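These helpers compose naturally; a minimal sketch (outputs shown in the comments):

import quapy.functional as F

F.as_binary_prevalence(0.25)                     # array([0.75, 0.25])
F.strprev(F.as_binary_prevalence(0.25), prec=2)  # '[0.75, 0.25]'
F.check_prevalence_vector([0.2, 0.5, 0.3])       # True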
+ + :param ArrayLike prevalences: the prevalence vector, or vectors, to check + :param bool raise_exception: whether to raise an exception if the vector (or any of the vectors) does + not lie in the simplex (default False) + :param float tolerance: error tolerance for the check `sum(prevalences) - 1 = 0` + :param bool aggr: if True (default) returns one single bool (True if all prevalence vectors are valid, + False otherwise), if False returns an array of bool, one for each prevalence vector + :return: a single bool True if `prevalences` is a vector of prevalence values that lies on the simplex, + or False otherwise; alternatively, if `prevalences` is a matrix of shape `(num_vectors, n_classes,)` + then it returns one such bool for each prevalence vector + """ + prevalences = np.asarray(prevalences) + + all_positive = prevalences>=0 + if not all_positive.all(): + if raise_exception: + raise ValueError('some prevalence vectors contain negative numbers; ' + 'consider using qp.functional.normalize_prevalence with ' + 'any method from ["clip", "mapsimplex", "softmax"]') + + all_close_1 = np.isclose(prevalences.sum(axis=-1), 1, atol=tolerance) + if not all_close_1.all(): + if raise_exception: + raise ValueError('some prevalence vectors do not sum up to 1; ' + 'consider using qp.functional.normalize_prevalence with ' + 'any method from ["l1", "clip", "mapsimplex", "softmax"]') + + valid = np.logical_and(all_positive.all(axis=-1), all_close_1) + if aggr: + return valid.all() + else: + return valid + + +def uniform_prevalence(n_classes): + """ + Returns a vector representing the uniform distribution for `n_classes` + + :param n_classes: number of classes + :return: np.ndarray with all values 1/n_classes + """ + assert isinstance(n_classes, int) and n_classes>0, \ + (f'param {n_classes} not understood; must be a positive integer representing the ' + f'number of classes ') + return np.full(shape=n_classes, fill_value=1./n_classes) + + +def normalize_prevalence(prevalences: ArrayLike, method='l1'): + """ + Normalizes a vector or matrix of prevalence values. The normalization consists of applying an L1 normalization in + cases in which the prevalence values are not all-zeros, and converting the prevalence values into `1/n_classes` in + cases in which all values are zero. + + :param prevalences: array-like of shape `(n_classes,)` or of shape `(n_samples, n_classes,)` with prevalence values + :param str method: indicates the normalization method to employ, options are: + + * `l1`: applies L1 normalization (default); a 0 vector is mapped onto the uniform prevalence + * `clip`: clip values in [0,1] and then rescales so that the L1 norm is 1 + * `mapsimplex`: projects vectors onto the probability simplex.
This implementation relies on + `Mathieu Blondel's projection_simplex_sort `_ + * `softmax`: applies softmax to all vectors + * `condsoftmax`: applies softmax only to invalid prevalence vectors + + :return: a normalized vector or matrix of prevalence values + """ + if method in ['none', None]: + return prevalences + + prevalences = np.asarray(prevalences, dtype=float) + + if method=='l1': + normalized = l1_norm(prevalences) + check_prevalence_vector(normalized, raise_exception=True) + elif method=='clip': + normalized = clip(prevalences) # no need to check afterwards + elif method=='mapsimplex': + normalized = projection_simplex_sort(prevalences) + elif method=='softmax': + normalized = softmax(prevalences) + elif method=='condsoftmax': + normalized = condsoftmax(prevalences) + else: + raise ValueError(f'unknown {method=}, valid ones are ["l1", "clip", "mapsimplex", "softmax", "condsoftmax"]') + + return normalized + + +def l1_norm(prevalences: ArrayLike) -> np.ndarray: + """ + Applies L1 normalization to `prevalences` so that it becomes a valid prevalence + vector. Zero vectors are mapped onto the uniform distribution. Raises an exception if + the resulting vectors are not valid distributions. This may happen when the original + prevalence vectors contain negative values. Use the `clip` normalization function + instead to avoid this possibility. + + :param prevalences: array-like of shape `(n_classes,)` or of shape `(n_samples, n_classes,)` with prevalence values + :return: np.ndarray representing a valid distribution + """ + n_classes = prevalences.shape[-1] + accum = prevalences.sum(axis=-1, keepdims=True) + prevalences = np.true_divide(prevalences, accum, where=accum > 0) + allzeros = accum.flatten() == 0 + if any(allzeros): + if prevalences.ndim == 1: + prevalences = np.full(shape=n_classes, fill_value=1. / n_classes) + else: + prevalences[allzeros] = np.full(shape=n_classes, fill_value=1. / n_classes) + return prevalences + + +def clip(prevalences: ArrayLike) -> np.ndarray: + """ + Clips the values in [0,1] and then applies the L1 normalization. + + :param prevalences: array-like of shape `(n_classes,)` or of shape `(n_samples, n_classes,)` with prevalence values + :return: np.ndarray representing a valid distribution + """ + clipped = np.clip(prevalences, 0, 1) + normalized = l1_norm(clipped) + return normalized + + +def projection_simplex_sort(unnormalized_arr: ArrayLike) -> np.ndarray: + """Projects a point onto the probability simplex. + + The code is adapted from Mathieu Blondel's BSD-licensed + `implementation `_ + (see function `projection_simplex_sort` in their repo) which accompanies the paper + + Mathieu Blondel, Akinori Fujino, and Naonori Ueda. + Large-scale Multiclass Support Vector Machine Training via Euclidean Projection onto the Simplex, + ICPR 2014, `URL `_ + + :param `unnormalized_arr`: point in n-dimensional space, shape `(n,)` + :return: projection of `unnormalized_arr` onto the (n-1)-dimensional probability simplex, shape `(n,)` + """ + unnormalized_arr = np.asarray(unnormalized_arr) + n = len(unnormalized_arr) + u = np.sort(unnormalized_arr)[::-1] + cssv = np.cumsum(u) - 1.0 + ind = np.arange(1, n + 1) + cond = u - cssv / ind > 0 + rho = ind[cond][-1] + theta = cssv[cond][-1] / float(rho) + return np.maximum(unnormalized_arr - theta, 0) + + +def softmax(prevalences: ArrayLike) -> np.ndarray: + """ + Applies the softmax function to all vectors even if the original vectors were valid distributions.
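The normalization routes can be compared side by side; a brief sketch on a vector that does not sum to 1:

import numpy as np
import quapy.functional as F

broken = np.asarray([0.2, 0.8, 0.4])                 # sums to 1.4
F.normalize_prevalence(broken, method='l1')          # rescaled by the sum
F.normalize_prevalence(broken, method='clip')        # clipped to [0,1], then rescaled
F.normalize_prevalence(broken, method='mapsimplex')  # Euclidean projection onto the simplex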
+ If you want to leave valid vectors untouched, use condsoftmax instead. + + :param prevalences: array-like of shape `(n_classes,)` or of shape `(n_samples, n_classes,)` with prevalence values + :return: np.ndarray representing a valid distribution + """ + normalized = scipy.special.softmax(prevalences, axis=-1) + return normalized + + +def condsoftmax(prevalences: ArrayLike) -> np.ndarray: + """ + Applies the softmax function only to vectors that do not represent valid distributions. + + :param prevalences: array-like of shape `(n_classes,)` or of shape `(n_samples, n_classes,)` with prevalence values + :return: np.ndarray representing a valid distribution + """ + invalid_idx = ~ check_prevalence_vector(prevalences, aggr=False, raise_exception=False) + if isinstance(invalid_idx, np.bool_) and invalid_idx: + # only one vector + normalized = scipy.special.softmax(prevalences) + else: + prevalences = np.copy(prevalences) + prevalences[invalid_idx] = scipy.special.softmax(prevalences[invalid_idx], axis=-1) + normalized = prevalences + return normalized + + +# ------------------------------------------------------------------------------------------ +# Divergences +# ------------------------------------------------------------------------------------------ + +def HellingerDistance(P: np.ndarray, Q: np.ndarray) -> float: """ Computes the Hellinger Distance (HD) between (discretized) distributions `P` and `Q`. The HD for two discrete distributions of `k` bins is defined as: @@ -81,7 +373,7 @@ def HellingerDistance(P, Q) -> float: return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2)) -def TopsoeDistance(P, Q, epsilon=1e-20): +def TopsoeDistance(P: np.ndarray, Q: np.ndarray, epsilon: float=1e-20): """ Topsoe distance between two (discretized) distributions `P` and `Q`. The Topsoe distance for two discrete distributions of `k` bins is defined as: @@ -95,9 +387,138 @@ def TopsoeDistance(P, Q, epsilon=1e-20): :param P: real-valued array-like of shape `(k,)` representing a discrete distribution :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution :return: float """ return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon))) - -def uniform_prevalence_sampling(n_classes, size=1): + +def get_divergence(divergence: Union[str, Callable]): + """ + Guarantees that the divergence received as argument is a function. That is, if this argument is already + a callable, then it is returned, if it is instead a string, then tries to instantiate the corresponding + divergence from the string name. + + :param divergence: callable or string indicating the name of the divergence function + :return: callable + """ + if isinstance(divergence, str): + if divergence=='HD': + return HellingerDistance + elif divergence=='topsoe': + return TopsoeDistance + else: + raise ValueError(f'unknown divergence {divergence}') + elif callable(divergence): + return divergence + else: + raise ValueError(f'argument "divergence" not understood; use a str or a callable function') + + +# ------------------------------------------------------------------------------------------ +# Solvers +# ------------------------------------------------------------------------------------------ + +def argmin_prevalence(loss: Callable, + n_classes: int, + method: Literal["optim_minimize", "linear_search", "ternary_search"]='optim_minimize'): + """ + Searches for the prevalence vector that minimizes a loss function. + + :param loss: callable, the function to minimize + :param n_classes: int, number of classes + :param method: string indicating the search strategy.
Possible values are:: + 'optim_minimize': uses scipy.optimize + 'linear_search': carries out a linear search for binary problems in the space [0, 0.01, 0.02, ..., 1] + 'ternary_search': implements the ternary search (not yet implemented) + :return: np.ndarray, a prevalence vector + """ + if method == 'optim_minimize': + return optim_minimize(loss, n_classes) + elif method == 'linear_search': + return linear_search(loss, n_classes) + elif method == 'ternary_search': + return ternary_search(loss, n_classes) + else: + raise NotImplementedError() + + +def optim_minimize(loss: Callable, n_classes: int, return_loss=False): + """ + Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex + that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's + SLSQP routine. + + :param loss: (callable) the function to minimize + :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector + :param return_loss: bool, if True, returns also the value of the loss (default is False). + :return: (ndarray) the best prevalence vector found or a tuple which also contains the value of the loss + if return_loss=True + """ + from scipy import optimize + + # the initial point is set as the uniform distribution + uniform_distribution = uniform_prevalence(n_classes=n_classes) + + # solutions are bounded to those contained in the unit-simplex + bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 + r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + + if return_loss: + return r.x, r.fun + else: + return r.x + + +def linear_search(loss: Callable, n_classes: int): + """ + Performs a linear search for the best prevalence value in binary problems. The search is carried out by exploring + the range [0,1] stepping by 0.01. This search is inefficient, and is added only for completeness (some of the + early methods in quantification literature used it, e.g., HDy). A more powerful alternative is `optim_minimize`. + + :param loss: (callable) the function to minimize + :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector + :return: (ndarray) the best prevalence vector found + """ + assert n_classes==2, 'linear search is only available for binary problems' + + prev_selected, min_score = None, None + for prev in prevalence_linspace(grid_points=100, repeats=1, smooth_limits_epsilon=0.0): + score = loss(np.asarray([1 - prev, prev])) + if min_score is None or score < min_score: + prev_selected, min_score = prev, score + + return np.asarray([1 - prev_selected, prev_selected]) + + +def ternary_search(loss: Callable, n_classes: int): + raise NotImplementedError() + + +# ------------------------------------------------------------------------------------------ +# Sampling utils +# ------------------------------------------------------------------------------------------ + +def prevalence_linspace(grid_points:int=21, repeats:int=1, smooth_limits_epsilon:float=0.01) -> np.ndarray: + """ + Produces an array of uniformly separated values of prevalence.
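As a sanity check, the SLSQP search recovers a known target when the loss is simply the distance to it (a hypothetical loss, for illustration only):

import numpy as np
import quapy.functional as F

target = np.asarray([0.1, 0.6, 0.3])

def loss(prev):
    return np.linalg.norm(prev - target)

F.optim_minimize(loss, n_classes=3)  # approximately [0.1, 0.6, 0.3]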
+ By default, produces an array of 21 prevalence values, with + step 0.05 and with the limits smoothed, i.e.: + [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99] + + :param grid_points: the number of prevalence values to sample from the [0,1] interval (default 21) + :param repeats: number of times each prevalence is to be repeated (defaults to 1) + :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1 + :return: an array of uniformly separated prevalence values + """ + p = np.linspace(0., 1., num=grid_points, endpoint=True) + p[0] += smooth_limits_epsilon + p[-1] -= smooth_limits_epsilon + if p[0] > p[1]: + raise ValueError(f'the smoothing in the limits is greater than the prevalence step') + if repeats > 1: + p = np.repeat(p, repeats) + return p + + +def uniform_prevalence_sampling(n_classes: int, size: int=1) -> np.ndarray: """ Implements the `Kraemer algorithm `_ for sampling uniformly at random from the unit simplex. This implementation is adapted from this @@ -126,21 +547,11 @@ def uniform_prevalence_sampling(n_classes, size=1): uniform_simplex_sampling = uniform_prevalence_sampling -def strprev(prevalences, prec=3): - """ - Returns a string representation for a prevalence vector. E.g., +# ------------------------------------------------------------------------------------------ +# Adjustment +# ------------------------------------------------------------------------------------------ - >>> strprev([1/3, 2/3], prec=2) - >>> '[0.33, 0.67]' - - :param prevalences: a vector of prevalence values - :param prec: float precision - :return: string - """ - return '['+ ', '.join([f'{p:.{prec}f}' for p in prevalences]) + ']' - - -def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True): +def solve_adjustment_binary(prevalence_estim: ArrayLike, tpr: float, fpr: float, clip: bool=True): """ Implements the adjustment of ACC and PACC for the binary case. The adjustment for a prevalence estimate of the positive class `p` comes down to computing: @@ -148,10 +559,10 @@ .. math:: ACC(p) = \\frac{ p - fpr }{ tpr - fpr } - :param prevalence_estim: float, the estimated value for the positive class - :param tpr: float, the true positive rate of the classifier - :param fpr: float, the false positive rate of the classifier - :param clip: set to True (default) to clip values that might exceed the range [0,1] + :param float prevalence_estim: the estimated value for the positive class (`p` in the formula) + :param float tpr: the true positive rate of the classifier + :param float fpr: the false positive rate of the classifier + :param bool clip: set to True (default) to clip values that might exceed the range [0,1] :return: float, the adjusted prevalence estimate """ @@ -164,184 +575,77 @@ return adjusted -def normalize_prevalence(prevalences): +def solve_adjustment( + class_conditional_rates: np.ndarray, + unadjusted_counts: np.ndarray, + method: Literal["inversion", "invariant-ratio"], + solver: Literal["exact", "minimize", "exact-raise", "exact-cc"]) -> np.ndarray: """ - Normalize a vector or matrix of prevalence values. The normalization consists of applying a L1 normalization in - cases in which the prevalence values are not all-zeros, and to convert the prevalence values into `1/n_classes` in - cases in which all values are zero.
+ Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of + `unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of + :math:`P(\\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an + estimate of :math:`P(\\hat{Y}=y_i|Y=y_j)`. - :param prevalences: array-like of shape `(n_classes,)` or of shape `(n_samples, n_classes,)` with prevalence values - :return: a normalized vector or matrix of prevalence values + :param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate + of :math:`P(\\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j` + ends up being classified as belonging to class :math:`y_i` + + :param unadjusted_counts: array of shape `(n_classes,)` containing the unadjusted prevalence values (e.g., as + estimated by CC or PCC) + + :param str method: indicates the adjustment method to be used. Valid options are: + + * `inversion`: tries to solve the equation :math:`q = M p` as :math:`p = M^{-1} q` where + :math:`M^{-1}` is the matrix inversion of :math:`M`. This inversion may not exist in + degenerated cases. + * `invariant-ratio`: invariant ratio estimator of `Vaz et al. 2018 `_, + which replaces the last equation in :math:`M` with the normalization condition (i.e., that the sum of + all prevalence values must equal 1). + + :param str solver: the method to use for solving the system of linear equations. Valid options are: + + * `exact-raise`: tries to solve the system using matrix inversion. Raises an error if the matrix has rank + strictly lower than `n_classes`. + * `exact-cc`: if the matrix is not full rank, returns :math:`q` (i.e., the unadjusted counts) as the estimates + * `exact`: deprecated, defaults to 'exact-cc' (will be removed in future versions) + * `minimize`: minimizes a loss, so the solution always exists """ - prevalences = np.asarray(prevalences) - n_classes = prevalences.shape[-1] - accum = prevalences.sum(axis=-1, keepdims=True) - prevalences = np.true_divide(prevalences, accum, where=accum>0) - allzeros = accum.flatten()==0 - if any(allzeros): - if prevalences.ndim == 1: - prevalences = np.full(shape=n_classes, fill_value=1./n_classes) - else: - prevalences[accum.flatten()==0] = np.full(shape=n_classes, fill_value=1./n_classes) - return prevalences + if solver == "exact": + warnings.warn( + "The 'exact' solver is deprecated. Use 'exact-raise' or 'exact-cc'", DeprecationWarning, stacklevel=2) + solver = "exact-cc" + A = np.asarray(class_conditional_rates, dtype=float) + B = np.asarray(unadjusted_counts, dtype=float) -def __num_prevalence_combinations_depr(n_prevpoints:int, n_classes:int, n_repeats:int=1): - """ - Computes the number of prevalence combinations in the n_classes-dimensional simplex if `nprevpoints` equally distant - prevalence values are generated and `n_repeats` repetitions are requested. - - :param n_classes: integer, number of classes - :param n_prevpoints: integer, number of prevalence points. - :param n_repeats: integer, number of repetitions for each prevalence combination - :return: The number of possible combinations.
For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the - number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0] - """ - __cache={} - def __f(nc,np): - if (nc,np) in __cache: # cached result - return __cache[(nc,np)] - if nc==1: # stop condition - return 1 - else: # recursive call - x = sum([__f(nc-1, np-i) for i in range(np)]) - __cache[(nc,np)] = x - return x - return __f(n_classes, n_prevpoints) * n_repeats - - -def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1): - """ - Computes the number of valid prevalence combinations in the n_classes-dimensional simplex if `n_prevpoints` equally - distant prevalence values are generated and `n_repeats` repetitions are requested. - The computation comes down to calculating: - - .. math:: - \\binom{N+C-1}{C-1} \\times r - - where `N` is `n_prevpoints-1`, i.e., the number of probability mass blocks to allocate, `C` is the number of - classes, and `r` is `n_repeats`. This solution comes from the - `Stars and Bars `_ problem. - - :param n_classes: integer, number of classes - :param n_prevpoints: integer, number of prevalence points. - :param n_repeats: integer, number of repetitions for each prevalence combination - :return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the - number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0] - """ - N = n_prevpoints-1 - C = n_classes - r = n_repeats - return int(scipy.special.binom(N + C - 1, C - 1) * r) - - -def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1): - """ - Searches for the largest number of (equidistant) prevalence points to define for each of the `n_classes` classes so - that the number of valid prevalence values generated as combinations of prevalence points (points in a - `n_classes`-dimensional simplex) do not exceed combinations_budget. - - :param combinations_budget: integer, maximum number of combinations allowed - :param n_classes: integer, number of classes - :param n_repeats: integer, number of repetitions for each prevalence combination - :return: the largest number of prevalence points that generate less than combinations_budget valid prevalences - """ - assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers' - n_prevpoints = 1 - while True: - combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats) - if combinations > combinations_budget: - return n_prevpoints-1 - else: - n_prevpoints += 1 - - -def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08): - """ - Checks that p is a valid prevalence vector, i.e., that it contains values in [0,1] and that the values sum up to 1. 
- - :param p: the prevalence vector to check - :return: True if `p` is valid, False otherwise - """ - p = np.asarray(p) - if not all(p>=0): - if raise_exception: - raise ValueError('the prevalence vector contains negative numbers') - return False - if not all(p<=1): - if raise_exception: - raise ValueError('the prevalence vector contains values >1') - return False - if not np.isclose(p.sum(), 1, atol=toleranze): - if raise_exception: - raise ValueError('the prevalence vector does not sum up to 1') - return False - return True - - -def get_divergence(divergence: Union[str, Callable]): - if isinstance(divergence, str): - if divergence=='HD': - return HellingerDistance - elif divergence=='topsoe': - return TopsoeDistance - else: - raise ValueError(f'unknown divergence {divergence}') - elif callable(divergence): - return divergence + if method == "inversion": + pass # We leave A and B unchanged + elif method == "invariant-ratio": + # Change the last equation to replace it with the normalization condition + A[-1, :] = 1.0 + B[-1] = 1.0 else: - raise ValueError(f'argument "divergence" not understood; use a str or a callable function') + raise ValueError(f"unknown {method=}") - -def argmin_prevalence(loss, n_classes, method='optim_minimize'): - if method == 'optim_minimize': - return optim_minimize(loss, n_classes) - elif method == 'linear_search': - return linear_search(loss, n_classes) - elif method == 'ternary_search': - raise NotImplementedError() + if solver == "minimize": + def loss(prev): + return np.linalg.norm(A @ prev - B) + return optim_minimize(loss, n_classes=A.shape[0]) + elif solver in ["exact-raise", "exact-cc"]: + # Solvers based on matrix inversion, so we use try/except block + try: + return np.linalg.solve(A, B) + except np.linalg.LinAlgError: + # The matrix is not invertible. + # Depending on the solver, we either raise an error + # or return the classifier predictions without adjustment + if solver == "exact-raise": + raise + elif solver == "exact-cc": + return unadjusted_counts + else: + raise ValueError(f"Solver {solver} not known.") else: - raise NotImplementedError() + raise ValueError(f'unknown {solver=}') -def optim_minimize(loss, n_classes): - """ - Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex - that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's - SLSQP routine. - - :param loss: (callable) the function to minimize - :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector - :return: (ndarray) the best prevalence vector found - """ - from scipy import optimize - - # the initial point is set as the uniform distribution - uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) - - # solutions are bounded to those contained in the unit-simplex - bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] - constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 - r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) - return r.x - - -def linear_search(loss, n_classes): - """ - Performs a linear search for the best prevalence value in binary problems. The search is carried out by exploring - the range [0,1] stepping by 0.01. This search is inefficient, and is added only for completeness (some of the - early methods in quantification literature used it, e.g., HDy). 
A most powerful alternative is `optim_minimize`. - - :param loss: (callable) the function to minimize - :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector - :return: (ndarray) the best prevalence vector found - """ - assert n_classes==2, 'linear search is only available for binary problems' - - prev_selected, min_score = None, None - for prev in prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0): - score = loss(np.asarray([1 - prev, prev])) - if min_score is None or score < min_score: - prev_selected, min_score = prev, score - - return np.asarray([1 - prev_selected, prev_selected]) \ No newline at end of file diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index 89ffcfc..ab7a59b 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -1,8 +1,14 @@ +import warnings +from sklearn.exceptions import ConvergenceWarning +warnings.simplefilter("ignore", ConvergenceWarning) + +from . import confidence from . import base from . import aggregative from . import non_aggregative from . import meta + AGGREGATIVE_METHODS = { aggregative.CC, aggregative.ACC, @@ -17,11 +23,42 @@ AGGREGATIVE_METHODS = { aggregative.MAX, aggregative.MS, aggregative.MS2, + aggregative.DMy, + aggregative.KDEyML, + aggregative.KDEyCS, + aggregative.KDEyHD, + # aggregative.OneVsAllAggregative, + confidence.BayesianCC, + confidence.PQ, } +BINARY_METHODS = { + aggregative.HDy, + aggregative.DyS, + aggregative.SMM, + aggregative.X, + aggregative.T50, + aggregative.MAX, + aggregative.MS, + aggregative.MS2, + confidence.PQ, +} + +MULTICLASS_METHODS = { + aggregative.CC, + aggregative.ACC, + aggregative.PCC, + aggregative.PACC, + aggregative.EMQ, + aggregative.KDEyML, + aggregative.KDEyCS, + aggregative.KDEyHD, + confidence.BayesianCC +} NON_AGGREGATIVE_METHODS = { - non_aggregative.MaximumLikelihoodPrevalenceEstimation + non_aggregative.MaximumLikelihoodPrevalenceEstimation, + non_aggregative.DMx } META_METHODS = { @@ -33,3 +70,5 @@ QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS | META_ME + + diff --git a/quapy/method/_bayesian.py b/quapy/method/_bayesian.py new file mode 100644 index 0000000..da65eed --- /dev/null +++ b/quapy/method/_bayesian.py @@ -0,0 +1,135 @@ +""" +Utility functions for `Bayesian quantification `_ methods. +""" +import numpy as np +import importlib.resources + +try: + import jax + import jax.numpy as jnp + import numpyro + import numpyro.distributions as dist + import stan + + DEPENDENCIES_INSTALLED = True +except ImportError: + jax = None + jnp = None + numpyro = None + dist = None + stan = None + + DEPENDENCIES_INSTALLED = False + + +P_TEST_Y: str = "P_test(Y)" +P_TEST_C: str = "P_test(C)" +P_C_COND_Y: str = "P(C|Y)" + + +def model(n_c_unlabeled: np.ndarray, n_y_and_c_labeled: np.ndarray) -> None: + """ + Defines a probabilistic model in `NumPyro `_. + + :param n_c_unlabeled: a `np.ndarray` of shape `(n_predicted_classes,)` + with entry `c` being the number of instances predicted as class `c`. + :param n_y_and_c_labeled: a `np.ndarray` of shape `(n_classes, n_predicted_classes)` + with entry `(y, c)` being the number of instances labeled as class `y` and predicted as class `c`. 
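A sketch of how the two count structures might be assembled from a labelled validation set and a batch of unlabelled predictions (variable names hypothetical):

import numpy as np

n_classes = 3
y_val = np.asarray([0, 0, 1, 2, 2, 2])  # true labels (validation)
c_val = np.asarray([0, 1, 1, 2, 2, 1])  # classifier predictions (validation)
c_unl = np.asarray([0, 1, 1, 1, 2])     # classifier predictions (unlabelled)

n_y_and_c_labeled = np.zeros((n_classes, n_classes), dtype=int)
np.add.at(n_y_and_c_labeled, (y_val, c_val), 1)          # joint counts (y, c)
n_c_unlabeled = np.bincount(c_unl, minlength=n_classes)  # predicted-class counts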
+ """ + n_y_labeled = n_y_and_c_labeled.sum(axis=1) + + K = len(n_c_unlabeled) + L = len(n_y_labeled) + + pi_ = numpyro.sample(P_TEST_Y, dist.Dirichlet(jnp.ones(L))) + p_c_cond_y = numpyro.sample(P_C_COND_Y, dist.Dirichlet(jnp.ones(K).repeat(L).reshape(L, K))) + + with numpyro.plate('plate', L): + numpyro.sample('F_yc', dist.Multinomial(n_y_labeled, p_c_cond_y), obs=n_y_and_c_labeled) + + p_c = numpyro.deterministic(P_TEST_C, jnp.einsum("yc,y->c", p_c_cond_y, pi_)) + numpyro.sample('N_c', dist.Multinomial(jnp.sum(n_c_unlabeled), p_c), obs=n_c_unlabeled) + + +def sample_posterior( + n_c_unlabeled: np.ndarray, + n_y_and_c_labeled: np.ndarray, + num_warmup: int, + num_samples: int, + seed: int = 0, +) -> dict: + """ + Samples from the Bayesian quantification model in NumPyro using the + `NUTS `_ sampler. + + :param n_c_unlabeled: a `np.ndarray` of shape `(n_predicted_classes,)` + with entry `c` being the number of instances predicted as class `c`. + :param n_y_and_c_labeled: a `np.ndarray` of shape `(n_classes, n_predicted_classes)` + with entry `(y, c)` being the number of instances labeled as class `y` and predicted as class `c`. + :param num_warmup: the number of warmup steps. + :param num_samples: the number of samples to draw. + :seed: the random seed. + :return: a `dict` with the samples. The keys are the names of the latent variables. + """ + mcmc = numpyro.infer.MCMC( + numpyro.infer.NUTS(model), + num_warmup=num_warmup, + num_samples=num_samples, + progress_bar=False + ) + rng_key = jax.random.PRNGKey(seed) + mcmc.run(rng_key, n_c_unlabeled=n_c_unlabeled, n_y_and_c_labeled=n_y_and_c_labeled) + return mcmc.get_samples() + + + +def load_stan_file(): + return importlib.resources.files('quapy.method').joinpath('stan/pq.stan').read_text(encoding='utf-8') + +def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, num_warmup, stan_seed): + """ + Perform Bayesian prevalence estimation using a Stan model for probabilistic quantification. + + This function builds and samples from a Stan model that implements a bin-based Bayesian + quantifier. It uses the class-conditional histograms of the classifier + outputs for positive and negative examples, along with the test histogram, to estimate + the posterior distribution of prevalence in the test set. + + Parameters + ---------- + stan_code : str + The Stan model code as a string. + n_bins : int + Number of bins used to build the histograms for positive and negative examples. + pos_hist : array-like of shape (n_bins,) + Histogram counts of the classifier outputs for the positive class. + neg_hist : array-like of shape (n_bins,) + Histogram counts of the classifier outputs for the negative class. + test_hist : array-like of shape (n_bins,) + Histogram counts of the classifier outputs for the test set, binned using the same bins. + number_of_samples : int + Number of post-warmup samples to draw from the Stan posterior. + num_warmup : int + Number of warmup iterations for the sampler. + stan_seed : int + Random seed for Stan model compilation and sampling, ensuring reproducibility. + + Returns + ------- + prev_samples : numpy.ndarray + An array of posterior samples of the prevalence (`prev`) in the test set. + Each element corresponds to one draw from the posterior distribution. 
+ """ + + stan_data = { + 'n_bucket': n_bins, + 'train_neg': neg_hist.tolist(), + 'train_pos': pos_hist.tolist(), + 'test': test_hist.tolist(), + 'posterior': 1 + } + + stan_model = stan.build(stan_code, data=stan_data, random_seed=stan_seed) + fit = stan_model.sample(num_chains=1, num_samples=number_of_samples,num_warmup=num_warmup) + + return fit['prev'] diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py new file mode 100644 index 0000000..f004c1a --- /dev/null +++ b/quapy/method/_kdey.py @@ -0,0 +1,357 @@ +import numpy as np +from sklearn.base import BaseEstimator +from sklearn.neighbors import KernelDensity + +import quapy as qp +from quapy.method.aggregative import AggregativeSoftQuantifier +import quapy.functional as F + +from sklearn.metrics.pairwise import rbf_kernel + + +class KDEBase: + """ + Common ancestor for KDE-based methods. Implements some common routines. + """ + + BANDWIDTH_METHOD = ['scott', 'silverman'] + + @classmethod + def _check_bandwidth(cls, bandwidth): + """ + Checks that the bandwidth parameter is correct + + :param bandwidth: either a string (see BANDWIDTH_METHOD) or a float + :return: the bandwidth if the check is passed, or raises an exception for invalid values + """ + assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \ + f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values' + if isinstance(bandwidth, float): + assert 0 < bandwidth < 1, \ + "the bandwith for KDEy should be in (0,1), since this method models the unit simplex" + return bandwidth + + def get_kde_function(self, X, bandwidth): + """ + Wraps the KDE function from scikit-learn. + + :param X: data for which the density function is to be estimated + :param bandwidth: the bandwidth of the kernel + :return: a scikit-learn's KernelDensity object + """ + return KernelDensity(bandwidth=bandwidth).fit(X) + + def pdf(self, kde, X): + """ + Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this + function returns :math:`e^{s}` + + :param kde: a previously fit KDE function + :param X: the data for which the density is to be estimated + :return: np.ndarray with the densities + """ + return np.exp(kde.score_samples(X)) + + def get_mixture_components(self, X, y, classes, bandwidth): + """ + Returns an array containing the mixture components, i.e., the KDE functions for each class. + + :param X: the data containing the covariates + :param y: the class labels + :param n_classes: integer, the number of classes + :param bandwidth: float, the bandwidth of the kernel + :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates + """ + class_cond_X = [] + for cat in classes: + selX = X[y==cat] + if selX.size==0: + selX = [F.uniform_prevalence(len(classes))] + class_cond_X.append(selX) + return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X] + + +class KDEyML(AggregativeSoftQuantifier, KDEBase): + """ + Kernel Density Estimation model for quantification (KDEy) relying on the Kullback-Leibler divergence (KLD) as + the divergence measure to be minimized. This method was first proposed in the paper + `Kernel Density Estimation for Multiclass Quantification `_, in which + the authors show that minimizing the distribution mathing criterion for KLD is akin to performing + maximum likelihood (ML). 
+ + The distribution matching optimization problem comes down to solving: + + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + + where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) + :math:`\\alpha` defined by + + :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` + + where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the + KDE function that uses the datapoints in X as the kernel centers. + + In KDEy-ML, the divergence is taken to be the Kullback-Leibler Divergence. This is equivalent to solving: + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} - + \\mathbb{E}_{q_{\\widetilde{U}}} \\left[ \\log \\boldsymbol{p}_{\\alpha}(\\widetilde{x}) \\right]` + + which corresponds to the maximum likelihood estimate. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + :param bandwidth: float, the bandwidth of the Kernel + :param random_state: a seed to be set before fitting any base quantifier (default None) + """ + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, + random_state=None): + super().__init__(classifier, fit_classifier, val_split) + self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.random_state=random_state + + def aggregation_fit(self, classif_predictions, labels): + self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth) + return self + + def aggregate(self, posteriors: np.ndarray): + """ + Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood + of the data (i.e., that minimizes the negative log-likelihood) + + :param posteriors: instances in the sample converted into posterior probabilities + :return: a vector of class prevalence estimates + """ + with qp.util.temp_seed(self.random_state): + epsilon = 1e-10 + n_classes = len(self.mix_densities) + test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities] + + def neg_loglikelihood(prev): + # test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities)) + test_mixture_likelihood = prev @ test_densities + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) + + return F.optim_minimize(neg_loglikelihood, n_classes) + + +class KDEyHD(AggregativeSoftQuantifier, KDEBase): + """ + Kernel Density Estimation model for quantification (KDEy) relying on the squared Hellinger Distance (HD) as + the divergence measure to be minimized.
This method was first proposed in the paper + `Kernel Density Estimation for Multiclass Quantification `_, in which + the authors proposed a Monte Carlo approach for minimizing the divergence. + + The distribution matching optimization problem comes down to solving: + + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + + where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) + :math:`\\alpha` defined by + + :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` + + where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the + KDE function that uses the datapoints in X as the kernel centers. + + In KDEy-HD, the divergence is taken to be the squared Hellinger Distance, an f-divergence with corresponding + f-generator function given by: + + :math:`f(u)=(\\sqrt{u}-1)^2` + + The authors proposed a Monte Carlo solution that relies on importance sampling: + + :math:`\\hat{D}_f(p||q)= \\frac{1}{t} \\sum_{i=1}^t f\\left(\\frac{p(x_i)}{q(x_i)}\\right) \\frac{q(x_i)}{r(x_i)}` + + where the datapoints (trials) :math:`x_1,\\ldots,x_t\\sim_{\\mathrm{iid}} r` with :math:`r` the + uniform distribution. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + :param bandwidth: float, the bandwidth of the Kernel + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param montecarlo_trials: number of Monte Carlo trials (default 10000) + """ + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, divergence: str='HD', + bandwidth=0.1, random_state=None, montecarlo_trials=10000): + + super().__init__(classifier, fit_classifier, val_split) + self.divergence = divergence + self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.random_state=random_state + self.montecarlo_trials = montecarlo_trials + + def aggregation_fit(self, classif_predictions, labels): + self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth) + + N = self.montecarlo_trials + rs = self.random_state + n = len(self.classes_) + self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.mix_densities]) + self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities]) + self.reference_density = np.mean(self.reference_classwise_densities, axis=0) # equiv. 
to (uniform @ self.reference_classwise_densities) + + return self + + def aggregate(self, posteriors: np.ndarray): + # we retain all n*N examples (sampled from a mixture with uniform parameter), and then + # apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS + n_classes = len(self.mix_densities) + + test_kde = self.get_kde_function(posteriors, self.bandwidth) + test_densities = self.pdf(test_kde, self.reference_samples) + + def f_squared_hellinger(u): + return (np.sqrt(u)-1)**2 + + # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway + if self.divergence.lower() == 'hd': + f = f_squared_hellinger + else: + raise ValueError('only squared HD is currently implemented') + + epsilon = 1e-10 + qs = test_densities + epsilon + rs = self.reference_density + epsilon + iw = qs/rs #importance weights + p_class = self.reference_classwise_densities + epsilon + fracs = p_class/qs + + def divergence(prev): + # ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs + ps_div_qs = prev @ fracs + return np.mean( f(ps_div_qs) * iw ) + + return F.optim_minimize(divergence, n_classes) + + +class KDEyCS(AggregativeSoftQuantifier): + """ + Kernel Density Estimation model for quantification (KDEy) relying on the Cauchy-Schwarz divergence (CS) as + the divergence measure to be minimized. This method was first proposed in the paper + `Kernel Density Estimation for Multiclass Quantification `_, in which + the authors proposed a Monte Carlo approach for minimizing the divergence. + + The distribution matching optimization problem comes down to solving: + + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + + where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) + :math:`\\alpha` defined by + + :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` + + where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the + KDE function that uses the datapoints in X as the kernel centers. + + In KDEy-CS, the divergence is taken to be the Cauchy-Schwarz divergence given by: + + :math:`\\mathcal{D}_{\\mathrm{CS}}(p||q)=-\\log\\left(\\frac{\\int p(x)q(x)dx}{\\sqrt{\\int p(x)^2dx \\int q(x)^2dx}}\\right)` + + The authors showed that this distribution matching admits a closed-form solution + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. 
+ :param bandwidth: float, the bandwidth of the kernel + """ + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1): + super().__init__(classifier, fit_classifier, val_split) + self.bandwidth = KDEBase._check_bandwidth(bandwidth) + + def gram_matrix_mix_sum(self, X, Y=None): + # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) + # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y, where Sigma1 and Sigma2 are + # two "scalar matrices" (h^2)*I each, so that Sigma1+Sigma2 has scale 2(h^2) (h is the bandwidth) + h = self.bandwidth + variance = 2 * (h**2) + nD = X.shape[1] + gamma = 1/(2*variance) + norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD))) + gram = norm_factor * rbf_kernel(X, Y, gamma=gamma) + return gram.sum() + + def aggregation_fit(self, classif_predictions, labels): + + P, y = classif_predictions, labels + n = len(self.classes_) + + assert all(sorted(np.unique(y)) == np.arange(n)), \ + 'label name gaps not allowed in current implementation' + + # counts_inv keeps track of the relative weight of each datapoint within its class + # (i.e., the weight in its KDE model) + counts_inv = 1 / (F.counts_from_labels(y, classes=self.classes_)) + + # tr_tr_sums corresponds to symbol \overline{B} in the paper + tr_tr_sums = np.zeros(shape=(n,n), dtype=float) + for i in range(n): + for j in range(n): + if i > j: + tr_tr_sums[i,j] = tr_tr_sums[j,i] + else: + block = self.gram_matrix_mix_sum(P[y == i], P[y == j] if i!=j else None) + tr_tr_sums[i, j] = block + + # keep track of these data structures for the test phase + self.Ptr = P + self.ytr = y + self.tr_tr_sums = tr_tr_sums + self.counts_inv = counts_inv + + return self + + + def aggregate(self, posteriors: np.ndarray): + Ptr = self.Ptr + Pte = posteriors + y = self.ytr + tr_tr_sums = self.tr_tr_sums + + M, nD = Pte.shape + Minv = (1/M) # t in the paper + n = Ptr.shape[1] + + # becomes a constant that does not affect the optimization, no need to compute it + # partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Minv * Minv) + + # tr_te_sums corresponds to \overline{a}*(1/Li)*(1/M) in the paper (note the constants + # are already aggregated to tr_te_sums, so these multiplications are not carried out + # at each iteration of the optimization phase) + tr_te_sums = np.zeros(shape=n, dtype=float) + for i in range(n): + tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte) + + def divergence(alpha): + # called \overline{r} in the paper + alpha_ratio = alpha * self.counts_inv + + # recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M) + partA = -np.log((alpha_ratio @ tr_te_sums) * Minv) + partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio) + return partA + partB #+ partC + + return F.optim_minimize(divergence, n) + diff --git a/quapy/method/neural.py b/quapy/method/_neural.py similarity index 92% rename from quapy/method/neural.py rename to quapy/method/_neural.py index 2478055..404090f 100644 --- a/quapy/method/neural.py +++ b/quapy/method/_neural.py @@ -27,9 +27,9 @@ class QuaNetTrainer(BaseQuantifier): >>> # use samples of 100 elements >>> qp.environ['SAMPLE_SIZE'] = 100 >>> - >>> # load the kindle dataset as text, and convert words to numerical indexes + >>> # load the Kindle dataset as text, and convert words to numerical indexes >>> dataset = qp.datasets.fetch_reviews('kindle', pickle=True) - >>> qp.domains.preprocessing.index(dataset, min_df=5, inplace=True) + >>>
qp.data.preprocessing.index(dataset, min_df=5, inplace=True) >>> >>> # the text classifier is a CNN trained by NeuralClassifierTrainer >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) @@ -37,12 +37,14 @@ class QuaNetTrainer(BaseQuantifier): >>> >>> # train QuaNet (QuaNet is an alias to QuaNetTrainer) >>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda') - >>> model.fit(dataset.training) - >>> estim_prevalence = model.quantify(dataset.test.instances) + >>> model.fit(*dataset.training.Xy) + >>> estim_prevalence = model.predict(dataset.test.instances) :param classifier: an object implementing `fit` (i.e., that can be trained on labelled data), `predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and `transform` (i.e., that can generate embedded representations of the unlabelled instances). + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. :param sample_size: integer, the sample size; default is None, meaning that the sample size should be taken from qp.environ["SAMPLE_SIZE"] :param n_epochs: integer, maximum number of training epochs @@ -64,6 +66,7 @@ class QuaNetTrainer(BaseQuantifier): def __init__(self, classifier, + fit_classifier=True, sample_size=None, n_epochs=100, tr_iter_per_poch=500, @@ -86,6 +89,7 @@ class QuaNetTrainer(BaseQuantifier): f'the classifier {classifier.__class__.__name__} does not seem to be able to produce posterior probabilities ' \ f'since it does not implement the method "predict_proba"' self.classifier = classifier + self.fit_classifier = fit_classifier self.sample_size = qp._get_sample_size(sample_size) self.n_epochs = n_epochs self.tr_iter = tr_iter_per_poch @@ -111,20 +115,21 @@ class QuaNetTrainer(BaseQuantifier): self.__check_params_colision(self.quanet_params, self.classifier.get_params()) self._classes_ = None - def fit(self, data: LabelledCollection, fit_classifier=True): + def fit(self, X, y): """ Trains QuaNet. - :param data: the training data on which to train QuaNet. If `fit_classifier=True`, the data will be split in + :param X: the training instances on which to train QuaNet. If `fit_classifier=True`, the data will be split in 40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If `fit_classifier=False`, the data will be split in 66/34 for training QuaNet and validating it, respectively.
- :param fit_classifier: if True, trains the classifier on a split containing 40% of the data + :param y: the labels of X :return: self """ + data = LabelledCollection(X, y) self._classes_ = data.classes_ os.makedirs(self.checkpointdir, exist_ok=True) - if fit_classifier: + if self.fit_classifier: classifier_data, unused_data = data.split_stratified(0.4) train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20% self.classifier.fit(*classifier_data.Xy) @@ -144,13 +149,13 @@ class QuaNetTrainer(BaseQuantifier): train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_) self.quantifiers = { - 'cc': CC(self.classifier).fit(None, fit_classifier=False), - 'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), - 'pcc': PCC(self.classifier).fit(None, fit_classifier=False), - 'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), + 'cc': CC(self.classifier, fit_classifier=False).fit(*valid_data.Xy), + 'acc': ACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy), + 'pcc': PCC(self.classifier, fit_classifier=False).fit(*valid_data.Xy), + 'pacc': PACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy), } if classifier_data is not None: - self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False) + self.quantifiers['emq'] = EMQ(self.classifier, fit_classifier=False).fit(*valid_data.Xy) self.status = { 'tr-loss': -1, @@ -194,16 +199,16 @@ class QuaNetTrainer(BaseQuantifier): label_predictions = np.argmax(posteriors, axis=-1) prevs_estim = [] for quantifier in self.quantifiers.values(): - predictions = posteriors if isinstance(quantifier, AggregativeProbabilisticQuantifier) else label_predictions + predictions = posteriors if isinstance(quantifier, AggregativeSoftQuantifier) else label_predictions prevs_estim.extend(quantifier.aggregate(predictions)) # there is no real need for adding static estims like the TPR or FPR from training since those are constant return prevs_estim - def quantify(self, instances): - posteriors = self.classifier.predict_proba(instances) - embeddings = self.classifier.transform(instances) + def predict(self, X): + posteriors = self.classifier.predict_proba(X) + embeddings = self.classifier.transform(X) quant_estims = self._get_aggregative_estims(posteriors) self.quanet.eval() with torch.no_grad(): diff --git a/quapy/method/_threshold_optim.py b/quapy/method/_threshold_optim.py new file mode 100644 index 0000000..628f01a --- /dev/null +++ b/quapy/method/_threshold_optim.py @@ -0,0 +1,283 @@ +from abc import abstractmethod + +import numpy as np +from sklearn.base import BaseEstimator +import quapy as qp +import quapy.functional as F +from quapy.data import LabelledCollection +from quapy.method.aggregative import BinaryAggregativeQuantifier + + +class ThresholdOptimization(BinaryAggregativeQuantifier): + """ + Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_. + The goal is to bring improved stability to the denominator of the adjustment. + The different variants are based on different heuristics for choosing a decision threshold + that would allow for more true positives and many more false positives, on the grounds this + would deliver larger denominators. 
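+ + A subclass only needs to provide the selection criterion via :meth:`condition` (the score to be minimized + across candidate thresholds). As an illustrative sketch (hypothetical class, not part of the API), a variant + preferring thresholds with high `tpr` could be written as: + + >>> class HighTPR(ThresholdOptimization): + >>> def condition(self, tpr, fpr) -> float: + >>> return 1 - tpr # minimized when tpr is close to 1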
+ + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + + :param n_jobs: number of parallel workers + """ + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=None, n_jobs=None): + super().__init__(classifier, fit_classifier, val_split) + self.n_jobs = qp._get_njobs(n_jobs) + + @abstractmethod + def condition(self, tpr, fpr) -> float: + """ + Implements the criterion according to which the threshold should be selected. + This function should return the (float) score to be minimized. + + :param tpr: float, true positive rate + :param fpr: float, false positive rate + :return: float, a score for the given `tpr` and `fpr` + """ + ... + + def discard(self, tpr, fpr) -> bool: + """ + Indicates whether a combination of tpr and fpr should be discarded + + :param tpr: float, true positive rate + :param fpr: float, false positive rate + :return: true if the combination is to be discarded, false otherwise + """ + return (tpr - fpr) == 0 + + + def _eval_candidate_thresholds(self, decision_scores, y): + """ + Seeks the best `tpr` and `fpr` according to the score obtained at different + decision thresholds. The scoring function is implemented in method `condition`. + + :param decision_scores: array-like with the classification scores + :param y: true labels for the validation set (or for the training set via `k`-fold cross validation) + :return: best `tpr` and `fpr` and `threshold` according to `condition` + """ + candidate_thresholds = np.unique(decision_scores) + + candidates = [] + scores = [] + for candidate_threshold in candidate_thresholds: + y_ = self.classes_[1 * (decision_scores >= candidate_threshold)] + TP, FP, FN, TN = self._compute_table(y, y_) + tpr = self._compute_tpr(TP, FN) + fpr = self._compute_fpr(FP, TN) + if not self.discard(tpr, fpr): + candidate_score = self.condition(tpr, fpr) + candidates.append([tpr, fpr, candidate_threshold]) + scores.append(candidate_score) + + if len(candidates) == 0: + # if no candidate gives rise to a valid combination of tpr and fpr, this method defaults to the standard + # classify & count; this is akin to assigning tpr=1, fpr=0, threshold=0 + tpr, fpr, threshold = 1, 0, 0 + candidates.append([tpr, fpr, threshold]) + scores.append(0) + + candidates = np.asarray(candidates) + candidates = candidates[np.argsort(scores)] # sort candidates by candidate_score + + return candidates + + def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds): + # This function performs the adjusted count for given tpr, fpr, and threshold.
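+ # For reference, the adjustment below inverts the identity obs_pos_rate = tpr*prev + fpr*(1-prev), + # which yields prev = (obs_pos_rate - fpr) / (tpr - fpr) for each threshold.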
+ # Note that, due to broadcasting, tprs, fprs, and thresholds could be arrays of length > 1 + prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0) + prevs_estims = (prevs_estims - fprs) / (tprs - fprs) + prevs_estims = F.as_binary_prevalence(prevs_estims, clip_if_necessary=True) + return prevs_estims.squeeze() + + def _compute_table(self, y, y_): + TP = np.logical_and(y == y_, y == self.pos_label).sum() + FP = np.logical_and(y != y_, y == self.neg_label).sum() + FN = np.logical_and(y != y_, y == self.pos_label).sum() + TN = np.logical_and(y == y_, y == self.neg_label).sum() + return TP, FP, FN, TN + + def _compute_tpr(self, TP, FN): + if TP + FN == 0: + return 1 + return TP / (TP + FN) + + def _compute_fpr(self, FP, TN): + if FP + TN == 0: + return 0 + return FP / (FP + TN) + + def aggregation_fit(self, classif_predictions, labels): + decision_scores, y = classif_predictions, labels + # the standard behavior is to keep the best threshold only + self.tpr, self.fpr, self.threshold = self._eval_candidate_thresholds(decision_scores, y)[0] + return self + + def aggregate(self, classif_predictions: np.ndarray): + # the standard behavior is to compute the adjusted count using the best threshold found + return self.aggregate_with_threshold(classif_predictions, self.tpr, self.fpr, self.threshold) + + +class T50(ThresholdOptimization): + """ + Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_ that looks + for the threshold that makes `tpr` closest to 0.5. + The goal is to bring improved stability to the denominator of the adjustment. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions.
This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + + """ + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) + + def condition(self, tpr, fpr) -> float: + # MAX strives to maximize (tpr - fpr), which is equivalent to minimize (fpr - tpr) + return (fpr - tpr) + + +class X(ThresholdOptimization): + """ + Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_ that looks + for the threshold that yields `tpr=1-fpr`. + The goal is to bring improved stability to the denominator of the adjustment. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + + """ + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) + + def condition(self, tpr, fpr) -> float: + return abs(1 - (tpr + fpr)) + + +class MS(ThresholdOptimization): + """ + Median Sweep. Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_ that generates + class prevalence estimates for all decision thresholds and returns the median of them all. + The goal is to bring improved stability to the denominator of the adjustment. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. 
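+ + A minimal usage sketch (assuming hypothetical arrays `X_train`, `y_train`, `X_test` for a binary problem): + + >>> from sklearn.linear_model import LogisticRegression + >>> ms = MS(LogisticRegression()) + >>> ms.fit(X_train, y_train) + >>> estim_prev = ms.predict(X_test) # median of the per-threshold prevalence estimates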
+ """ + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) + + def condition(self, tpr, fpr) -> float: + return 1 + + def aggregation_fit(self, classif_predictions, labels): + decision_scores, y = classif_predictions, labels + # keeps all candidates + tprs_fprs_thresholds = self._eval_candidate_thresholds(decision_scores, y) + self.tprs = tprs_fprs_thresholds[:, 0] + self.fprs = tprs_fprs_thresholds[:, 1] + self.thresholds = tprs_fprs_thresholds[:, 2] + return self + + def aggregate(self, classif_predictions: np.ndarray): + prevalences = self.aggregate_with_threshold(classif_predictions, self.tprs, self.fprs, self.thresholds) + if prevalences.ndim==2: + prevalences = np.median(prevalences, axis=0) + return prevalences + + +class MS2(MS): + """ + Median Sweep 2. Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_ that generates + class prevalence estimates for all decision thresholds and returns the median of for cases in + which `tpr-fpr>0.25` + The goal is to bring improved stability to the denominator of the adjustment. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. 
+ """ + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) + + def discard(self, tpr, fpr) -> bool: + return (tpr-fpr) <= 0.25 diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 232a92b..25fc1ef 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1,43 +1,216 @@ -from abc import abstractmethod +from abc import ABC, abstractmethod +from argparse import ArgumentError from copy import deepcopy -from typing import Callable, Union +from typing import Callable, Literal, Union import numpy as np -from scipy import optimize +from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling +from numpy.f2py.crackfortran import true_intent_list from sklearn.base import BaseEstimator from sklearn.calibration import CalibratedClassifierCV +from sklearn.exceptions import NotFittedError from sklearn.metrics import confusion_matrix -from sklearn.model_selection import cross_val_predict +from sklearn.model_selection import cross_val_predict, train_test_split +from sklearn.utils.validation import check_is_fitted + import quapy as qp import quapy.functional as F -from functional import get_divergence -from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration +from quapy.functional import get_divergence from quapy.classification.svmperf import SVMperf from quapy.data import LabelledCollection from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric +from quapy.method import _bayesian + +# import warnings +# from sklearn.exceptions import ConvergenceWarning +# warnings.filterwarnings("ignore", category=ConvergenceWarning) # Abstract classes # ------------------------------------ -class AggregativeQuantifier(BaseQuantifier): +class AggregativeQuantifier(BaseQuantifier, ABC): """ Abstract class for quantification methods that base their estimations on the aggregation of classification - results. Aggregative Quantifiers thus implement a :meth:`classify` method and maintain a :attr:`classifier` - attribute. Subclasses of this abstract class must implement the method :meth:`aggregate` which computes the - aggregation of label predictions. The method :meth:`quantify` comes with a default implementation based on - :meth:`classify` and :meth:`aggregate`. + results. Aggregative quantifiers implement a pipeline that consists of generating classification predictions + and aggregating them. For this reason, the training phase is implemented by :meth:`classification_fit` followed + by :meth:`aggregation_fit`, while the testing phase is implemented by :meth:`classify` followed by + :meth:`aggregate`. Subclasses of this abstract class must provide implementations for these methods. + Aggregative quantifiers also maintain a :attr:`classifier` attribute. + + The method :meth:`fit` comes with a default implementation based on :meth:`classification_fit` + and :meth:`aggregation_fit`. + + The method :meth:`quantify` comes with a default implementation based on :meth:`classify` + and :meth:`aggregate`. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. 
This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to + None when the method does not require any validation data, in order to avoid that some portion of + the training data be wasted. """ - @abstractmethod - def fit(self, data: LabelledCollection, fit_classifier=True): - """ - Trains the aggregative quantifier + def __init__(self, + classifier: Union[None,BaseEstimator], + fit_classifier:bool=True, + val_split:Union[int,float,tuple,None]=5): - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data - :param fit_classifier: whether or not to train the learner (default is True). Set to False if the - learner has been trained outside the quantifier. + self.classifier = qp._get_classifier(classifier) + self.fit_classifier = fit_classifier + self.val_split = val_split + + # basic type checks + assert hasattr(self.classifier, 'fit'), \ + f'the classifier does not implement "fit"' + + assert isinstance(fit_classifier, bool), \ + f'unexpected type for {fit_classifier=}; must be True or False' + + # val_split is indicated as a number of folds for cross-validation + if isinstance(val_split, int): + assert val_split > 1, \ + (f'when {val_split=} is indicated as an integer, it represents the number of folds in a kFCV ' + f'and must thus be >1') + if val_split==5 and not fit_classifier: + print(f'Warning: {val_split=} will be ignored when the classifier is already trained ' + f'({fit_classifier=}). Parameter {self.val_split=} will be set to None. Set {val_split=} ' + f'to None to avoid this warning.') + self.val_split=None + if val_split!=5: + assert fit_classifier, (f'Parameter {val_split=} has been modified, but {fit_classifier=} ' + f'indicates the classifier should not be retrained.') + # val_split is indicated as a fraction of validation instances + elif isinstance(val_split, float): + assert 0 < val_split < 1, \ + (f'when {val_split=} is indicated as a float, it represents the fraction of training instances ' + f'to be used for validation, and must thus be in the range (0,1)') + assert fit_classifier, (f'when {val_split=} is indicated as a float (the fraction of training instances ' + f'to be used for validation), the parameter {fit_classifier=} must be True') + # val_split is indicated as a validation collection (X,y) + elif isinstance(val_split, tuple): + assert len(val_split) == 2, \ + (f'when {val_split=} is indicated as a tuple, it represents the collection (X,y) on which the ' + f'validation must be performed, but this seems to have different cardinality') + elif val_split is None: + pass + else: + raise ValueError(f'unexpected type for {val_split=}') + + # classifier is fitted? + try: + check_is_fitted(self.classifier) + fitted = True + except NotFittedError: + fitted = False + + # consistency checks: fit_classifier? 
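+ # fit_classifier=True demands a classifier that has not yet been trained (it will be fitted here), whereas + # fit_classifier=False demands an already fitted classifier (training is then bypassed)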
+ if self.fit_classifier: + if fitted: + raise RuntimeWarning(f'the classifier is already fitted, but {fit_classifier=} was requested') + else: + assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, ' + f'but this does not seem to be the case') + + def _check_init_parameters(self): + """ + Implements any check to be performed in the parameters of the init method before undertaking + the training of the quantifier. This is made as to allow for a quick execution stop when the + parameters are not valid. + + :return: Nothing. May raise an exception. + """ + pass + + def _check_non_empty_classes(self, y): + """ + Asserts all classes have positive instances. + + :param y: array-like of shape `(n_instances,)` with the label for each instance + :return: Nothing. May raise an exception. + """ + sample_prevs = F.prevalence_from_labels(y, self.classes_) + empty_classes = np.argwhere(sample_prevs == 0).flatten() + if len(empty_classes) > 0: + empty_class_names = self.classes_[empty_classes] + raise ValueError(f'classes {empty_class_names} have no training examples') + + def fit(self, X, y): + """ + Trains the aggregative quantifier. This comes down to training a classifier (if requested) and an + aggregation function. + + :param X: array-like of shape `(n_samples, n_features)`, the training instances + :param y: array-like of shape `(n_samples,)`, the labels + :return: self + """ + self._check_init_parameters() + classif_predictions, labels = self.classifier_fit_predict(X, y) + self.aggregation_fit(classif_predictions, labels) + return self + + def classifier_fit_predict(self, X, y): + """ + Trains the classifier if requested (`fit_classifier=True`) and generates the necessary predictions to + train the aggregation function. + + :param X: array-like of shape `(n_samples, n_features)`, the training instances + :param y: array-like of shape `(n_samples,)`, the labels + :return: a tuple `(predictions, labels)` with which to fit the aggregation function; both are None + when no validation predictions are required + """ + self._check_classifier(adapt_if_necessary=self.fit_classifier) + + # self._check_non_empty_classes(y) + + predictions, labels = None, None + if isinstance(self.val_split, int): + assert self.fit_classifier, f'{self.__class__}: unexpected value for {self.fit_classifier=}' + num_folds = self.val_split + n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None) + predictions = cross_val_predict( + self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method() + ) + labels = y + self.classifier.fit(X, y) + elif isinstance(self.val_split, float): + assert self.fit_classifier, f'unexpected value for {self.fit_classifier=}' + train_prop = 1. - self.val_split + Xtr, Xval, ytr, yval = train_test_split(X, y, train_size=train_prop, stratify=y) + self.classifier.fit(Xtr, ytr) + predictions = self.classify(Xval) + labels = yval + elif isinstance(self.val_split, tuple): + Xval, yval = self.val_split + if self.fit_classifier: + self.classifier.fit(X, y) + predictions = self.classify(Xval) + labels = yval + elif self.val_split is None: + if self.fit_classifier: + self.classifier.fit(X, y) + predictions, labels = None, None + else: + predictions, labels = self.classify(X), y + else: + raise ValueError(f'unexpected type for {self.val_split=}') + + return predictions, labels + + @abstractmethod + def aggregation_fit(self, classif_predictions, labels): + """ + Trains the aggregation function.
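+ Subclasses implement here the estimation of whatever statistics the aggregation step requires, e.g., + the misclassification rates in :class:`ACC`.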
+ + :param classif_predictions: array-like with the classification predictions + (whatever the method :meth:`classify` returns) + :param labels: array-like with the true labels associated to each classifier prediction + """ ... @property @@ -58,34 +231,52 @@ class AggregativeQuantifier(BaseQuantifier): """ self.classifier_ = classifier - def classify(self, instances): + def classify(self, X): """ Provides the label predictions for the given instances. The predictions should respect the format expected by - :meth:`aggregate`, i.e., posterior probabilities for probabilistic quantifiers, or crisp predictions for - non-probabilistic quantifiers + :meth:`aggregate`, e.g., posterior probabilities for probabilistic quantifiers, or crisp predictions for + non-probabilistic quantifiers. The default method used for generating the predictions is "decision_function". - :param instances: array-like - :return: np.ndarray of shape `(n_instances,)` with label predictions + :param X: array-like of shape `(n_samples, n_features)`, the data instances + :return: np.ndarray of shape `(n_instances,)` with classifier predictions """ - return self.classifier.predict(instances) + return getattr(self.classifier, self._classifier_method())(X) - def quantify(self, instances): + def _classifier_method(self): + """ + Name of the method that must be used for issuing label predictions. The default one is "decision_function". + + :return: string + """ + return 'decision_function' + + def _check_classifier(self, adapt_if_necessary=False): + """ + Guarantees that the underlying classifier implements the method required for issuing predictions, i.e., + the method indicated by the :meth:`_classifier_method` + + :param adapt_if_necessary: unused unless overridden + """ + assert hasattr(self.classifier, self._classifier_method()), \ + f"the classifier does not implement the required {self._classifier_method()} method" + + def predict(self, X): """ Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated by the classifier. - :param instances: array-like + :param X: array-like of shape `(n_samples, n_features)`, the data instances :return: `np.ndarray` of shape `(n_classes)` with class prevalence estimates. """ - classif_predictions = self.classify(instances) + classif_predictions = self.classify(X) return self.aggregate(classif_predictions) @abstractmethod def aggregate(self, classif_predictions: np.ndarray): """ - Implements the aggregation of label predictions. + Implements the aggregation of the classifier predictions. - :param classif_predictions: `np.ndarray` of label predictions + :param classif_predictions: `np.ndarray` of classifier predictions :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. """ ... @@ -96,208 +287,260 @@ class AggregativeQuantifier(BaseQuantifier): Class labels, in the same order in which class prevalence values are to be computed. This default implementation actually returns the class labels of the learner. - :return: array-like + :return: array-like, the class labels """ return self.classifier.classes_ -class AggregativeProbabilisticQuantifier(AggregativeQuantifier): +class AggregativeCrispQuantifier(AggregativeQuantifier, ABC): """ - Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities - as returned by a probabilistic classifier.
Aggregative Probabilistic Quantifiers thus extend Aggregative - Quantifiers by implementing a _posterior_probabilities_ method returning values in [0,1] -- the posterior - probabilities. + Abstract class for quantification methods that base their estimations on the aggregation of crisp decisions + as returned by a hard classifier. Aggregative crisp quantifiers thus extend Aggregative + Quantifiers by implementing specifications about crisp predictions. """ - def classify(self, instances): - return self.classifier.predict_proba(instances) + def _classifier_method(self): + """ + Name of the method that must be used for issuing label predictions. For crisp quantifiers, the method + is 'predict', that returns an array of shape `(n_instances,)` of label predictions. + + :return: the string "predict", i.e., the standard method name for scikit-learn hard predictions + """ + return 'predict' -# Helper -# ------------------------------------ -def _ensure_probabilistic(classifier): - if not hasattr(classifier, 'predict_proba'): - print(f'The learner {classifier.__class__.__name__} does not seem to be probabilistic. ' - f'The learner will be calibrated.') - classifier = CalibratedClassifierCV(classifier, cv=5) - return classifier - - -def _training_helper(classifier, - data: LabelledCollection, - fit_classifier: bool = True, - ensure_probabilistic=False, - val_split: Union[LabelledCollection, float] = None): +class AggregativeSoftQuantifier(AggregativeQuantifier, ABC): """ - Training procedure common to all Aggregative Quantifiers. - - :param classifier: the learner to be fit - :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner. - :param fit_classifier: whether or not to fit the learner (if False, then bypasses any action) - :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the - learner is not probabilistic, then a CalibratedCV instance of it is trained) - :param val_split: if specified as a float, indicates the proportion of training instances that will define the - validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a - LabelledCollection, represents the validation split itself - :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0 - or None otherwise) to be used as a validation set for any subsequent parameter fitting + Abstract class for quantification methods that base their estimations on the aggregation of posterior + probabilities as returned by a probabilistic classifier. + Aggregative soft quantifiers thus extend Aggregative Quantifiers by implementing specifications + about soft predictions. """ - if fit_classifier: - if ensure_probabilistic: - classifier = _ensure_probabilistic(classifier) - if val_split is not None: - if isinstance(val_split, float): - if not (0 < val_split < 1): - raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)') - train, unused = data.split_stratified(train_prop=1 - val_split) - elif isinstance(val_split, LabelledCollection): - train = data - unused = val_split + + def _classifier_method(self): + """ + Name of the method that must be used for issuing label predictions. For probabilistic quantifiers, the method + is 'predict_proba', that returns an array of shape `(n_instances, n_dimensions,)` with posterior + probabilities. 
+ + :return: the string "predict_proba", i.e., the standard method name for scikit-learn soft predictions + """ + return 'predict_proba' + + def _check_classifier(self, adapt_if_necessary=False): + """ + Guarantees that the underlying classifier implements the method indicated by the :meth:`_classifier_method`. + In case it does not, the classifier is calibrated (by means of the Platt's calibration method implemented by + scikit-learn in CalibratedClassifierCV, with cv=5). This calibration is only allowed if `adapt_if_necessary` + is set to True. If otherwise (i.e., the classifier is not probabilistic, and `adapt_if_necessary` is set + to False), an exception will be raised. + + :param adapt_if_necessary: a hard classifier is turned into a soft classifier if `adapt_if_necessary==True` + """ + if not hasattr(self.classifier, self._classifier_method()): + if adapt_if_necessary: + print(f'warning: The learner {self.classifier.__class__.__name__} does not seem to be ' + f'probabilistic. The learner will be calibrated (using CalibratedClassifierCV).') + self.classifier = CalibratedClassifierCV(self.classifier, cv=5) else: - raise ValueError( - f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split ' - 'proportion, or a LabelledCollection indicating the validation split') - else: - train, unused = data, None - - if isinstance(classifier, BaseQuantifier): - classifier.fit(train) - else: - classifier.fit(*train.Xy) - else: - if ensure_probabilistic: - if not hasattr(classifier, 'predict_proba'): - raise AssertionError('error: the learner cannot be calibrated since fit_classifier is set to False') - unused = None - if isinstance(val_split, LabelledCollection): - unused = val_split - - return classifier, unused + raise AssertionError(f'error: The learner {self.classifier.__class__.__name__} does not ' + f'seem to be probabilistic. The learner cannot be calibrated since ' + f'fit_classifier is set to False') -def cross_generate_predictions( - data, - classifier, - val_split, - probabilistic, - fit_classifier, - n_jobs -): +class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier): - n_jobs = qp._get_njobs(n_jobs) + @property + def pos_label(self): + return self.classifier.classes_[1] - if isinstance(val_split, int): - assert fit_classifier == True, \ - 'the parameters for the adjustment cannot be estimated with kFCV with fit_classifier=False' + @property + def neg_label(self): + return self.classifier.classes_[0] - if probabilistic: - classifier = _ensure_probabilistic(classifier) - predict = 'predict_proba' - else: - predict = 'predict' - y_pred = cross_val_predict(classifier, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict) - class_count = data.counts() - - # fit the learner on all data - classifier.fit(*data.Xy) - y = data.y - classes = data.classes_ - else: - classifier, val_data = _training_helper( - classifier, data, fit_classifier, ensure_probabilistic=probabilistic, val_split=val_split - ) - y_pred = classifier.predict_proba(val_data.instances) if probabilistic else classifier.predict(val_data.instances) - y = val_data.labels - classes = val_data.classes_ - class_count = val_data.counts() - - return classifier, y, y_pred, classes, class_count + def fit(self, X, y): + self._check_binary(y, self.__class__.__name__) + return super().fit(X, y) # Methods # ------------------------------------ -class CC(AggregativeQuantifier): +class CC(AggregativeCrispQuantifier): """ The most basic Quantification method. 
One that simply classifies all instances and counts how many have been attributed to each of the classes in order to compute class prevalence estimates. :param classifier: a sklearn's Estimator that generates a classifier """ + def __init__(self, classifier: BaseEstimator = None, fit_classifier: bool = True): + super().__init__(classifier, fit_classifier, val_split=None) - def __init__(self, classifier: BaseEstimator): - self.classifier = classifier - - def fit(self, data: LabelledCollection, fit_classifier=True): + def aggregation_fit(self, classif_predictions, labels): """ - Trains the Classify & Count method unless `fit_classifier` is False, in which case, the classifier is assumed to - be already fit and there is nothing else to do. + Nothing to do here! - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data - :param fit_classifier: if False, the classifier is assumed to be fit - :return: self + :param classif_predictions: unused + :param labels: unused """ - self.classifier, _ = _training_helper(self.classifier, data, fit_classifier) - return self + pass def aggregate(self, classif_predictions: np.ndarray): """ Computes class prevalence estimates by counting the prevalence of each of the predicted labels. - :param classif_predictions: array-like with label predictions + :param classif_predictions: array-like with classifier predictions :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. """ return F.prevalence_from_labels(classif_predictions, self.classes_) -class ACC(AggregativeQuantifier): +class PCC(AggregativeSoftQuantifier): + """ + `Probabilistic Classify & Count `_, + the probabilistic variant of CC that relies on the posterior probabilities returned by a probabilistic classifier. + + :param classifier: a sklearn's Estimator that generates a classifier + """ + + def __init__(self, classifier: BaseEstimator = None, fit_classifier: bool = True): + super().__init__(classifier, fit_classifier, val_split=None) + + def aggregation_fit(self, classif_predictions, labels): + """ + Nothing to do here! + + :param classif_predictions: unused + :param labels: unused + """ + pass + + def aggregate(self, classif_posteriors): + return F.prevalence_from_probabilities(classif_posteriors, binarize=False) + + +class ACC(AggregativeCrispQuantifier): """ `Adjusted Classify & Count `_, the "adjusted" variant of :class:`CC`, that corrects the predictions of CC according to the `misclassification rates`. - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. 
This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + + :param str method: adjustment method to be used: + + * 'inversion': matrix inversion method based on the matrix equality :math:`P(C)=P(C|Y)P(Y)`, + which tries to invert the :math:`P(C|Y)` matrix. + * 'invariant-ratio': invariant ratio estimator of `Vaz et al. 2018 `_, + which replaces the last equation with the normalization condition. + + :param str solver: indicates the method to use for solving the system of linear equations. Valid options are: + + * 'exact-raise': tries to solve the system using matrix inversion. Raises an error if the matrix has rank + strictly less than `n_classes`. + * 'exact-cc': if the matrix is not of full rank, returns `p_c` as the estimates, which corresponds to + no adjustment (i.e., the classify and count method. See :class:`quapy.method.aggregative.CC`) + * 'exact': deprecated, defaults to 'exact-cc' + * 'minimize': minimizes the L2 norm of :math:`|Ax-B|`. This one generally works better, and is the + default parameter. More details about this can be consulted in `Bunse, M. "On Multi-Class Extensions of + Adjusted Classify and Count", on proceedings of the 2nd International Workshop on Learning to Quantify: + Methods and Applications (LQ 2022), ECML/PKDD 2022, Grenoble (France) + `_. + + :param str norm: the method to use for normalization. + + * `clip`, the values are clipped to the range [0,1] and then L1-normalized. + * `mapsimplex` projects vectors onto the probability simplex. This implementation relies on + `Mathieu Blondel's projection_simplex_sort `_ + * `condsoftmax`, applies a softmax normalization only to prevalence vectors that lie outside the simplex + + :param n_jobs: number of parallel workers + """ + + def __init__( + self, + classifier: BaseEstimator = None, + fit_classifier = True, + val_split = 5, + solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize', + method: Literal['inversion', 'invariant-ratio'] = 'inversion', + norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip', + n_jobs=None, + ): + super().__init__(classifier, fit_classifier, val_split) + self.n_jobs = qp._get_njobs(n_jobs) + self.solver = solver + self.method = method + self.norm = norm + + SOLVERS = ['exact', 'minimize', 'exact-raise', 'exact-cc'] + METHODS = ['inversion', 'invariant-ratio'] + NORMALIZATIONS = ['clip', 'mapsimplex', 'condsoftmax', None] + + @classmethod + def newInvariantRatioEstimation(cls, classifier: BaseEstimator, fit_classifier=True, val_split=5, n_jobs=None): + """ + Constructs a quantifier that implements the Invariant Ratio Estimator of + `Vaz et al. 2018 `_. This amounts + to setting method to 'invariant-ratio' and norm to 'mapsimplex'.
- :param data: the training set - :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit) - :param val_split: either a float in (0,1) indicating the proportion of training instances to use for - validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection - indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold - cross validation to estimate the parameters - :return: self + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + + :param n_jobs: number of parallel workers + + :return: an instance of ACC configured so that it implements the Invariant Ratio Estimator """ + return ACC(classifier, fit_classifier=fit_classifier, val_split=val_split, method='invariant-ratio', norm='mapsimplex', n_jobs=n_jobs) - if val_split is None: - val_split = self.val_split + def _check_init_parameters(self): + if self.solver not in ACC.SOLVERS: + raise ValueError(f"unknown solver; valid ones are {ACC.SOLVERS}") + if self.method not in ACC.METHODS: + raise ValueError(f"unknown method; valid ones are {ACC.METHODS}") + if self.norm not in ACC.NORMALIZATIONS: + raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") - self.classifier, y, y_, classes, class_count = cross_generate_predictions( - data, self.classifier, val_split, probabilistic=False, fit_classifier=fit_classifier, n_jobs=self.n_jobs - ) - - self.cc = CC(self.classifier) - self.Pte_cond_estim_ = self.getPteCondEstim(self.classifier.classes_, y, y_) - - return self + def aggregation_fit(self, classif_predictions, labels): + """ + Estimates the misclassification rates. 
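+ The estimated rates are stored in `Pte_cond_estim_`; at inference time, :meth:`aggregate` corrects the + classify-and-count estimates by solving the corresponding linear system via `F.solve_adjustment` + (using the configured `solver` and `method`) and normalizing the result according to `norm`.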
+ :param classif_predictions: array-like with the predicted labels + :param labels: array-like with the true labels associated to each predicted label + """ + true_labels = labels + pred_labels = classif_predictions + self.cc = CC(self.classifier, fit_classifier=False) + self.Pte_cond_estim_ = ACC.getPteCondEstim(self.classifier.classes_, true_labels, pred_labels) @classmethod def getPteCondEstim(cls, classes, y, y_): - # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a - # document that belongs to yj ends up being classified as belonging to yi + """ + Estimate the matrix with entry (i,j) being the estimate of P(hat_yi|yj), that is, the probability that a + document that belongs to yj ends up being classified as belonging to yi + + :param classes: array-like with the class names + :param y: array-like with the true labels + :param y_: array-like with the estimated labels + :return: np.ndarray + """ conf = confusion_matrix(y, y_, labels=classes).T conf = conf.astype(float) class_counts = conf.sum(axis=0) @@ -308,102 +551,114 @@ class ACC(AggregativeQuantifier): conf[:, i] /= class_counts[i] return conf - def classify(self, data): - return self.cc.classify(data) - def aggregate(self, classif_predictions): prevs_estim = self.cc.aggregate(classif_predictions) - return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim) - - @classmethod - def solve_adjustment(cls, PteCondEstim, prevs_estim): - """ - Solves the system linear system :math:`Ax = B` with :math:`A` = `PteCondEstim` and :math:`B` = `prevs_estim` - - :param PteCondEstim: a `np.ndarray` of shape `(n_classes,n_classes,)` with entry `(i,j)` being the estimate - of :math:`P(y_i|y_j)`, that is, the probability that an instance that belongs to :math:`y_j` ends up being - classified as belonging to :math:`y_i` - :param prevs_estim: a `np.ndarray` of shape `(n_classes,)` with the class prevalence estimates - :return: an adjusted `np.ndarray` of shape `(n_classes,)` with the corrected class prevalence estimates - """ - A = PteCondEstim - B = prevs_estim - try: - adjusted_prevs = np.linalg.solve(A, B) - adjusted_prevs = np.clip(adjusted_prevs, 0, 1) - adjusted_prevs /= adjusted_prevs.sum() - except np.linalg.LinAlgError: - adjusted_prevs = prevs_estim # no way to adjust them! - return adjusted_prevs + estimate = F.solve_adjustment( + class_conditional_rates=self.Pte_cond_estim_, + unadjusted_counts=prevs_estim, + solver=self.solver, + method=self.method, + ) + return F.normalize_prevalence(estimate, method=self.norm) -class PCC(AggregativeProbabilisticQuantifier): - """ - `Probabilistic Classify & Count `_, - the probabilistic variant of CC that relies on the posterior probabilities returned by a probabilistic classifier. - - :param classifier: a sklearn's Estimator that generates a classifier - """ - - def __init__(self, classifier: BaseEstimator): - self.classifier = classifier - - def fit(self, data: LabelledCollection, fit_classifier=True): - self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True) - return self - - def aggregate(self, classif_posteriors): - return F.prevalence_from_probabilities(classif_posteriors, binarize=False) - - -class PACC(AggregativeProbabilisticQuantifier): +class PACC(AggregativeSoftQuantifier): """ `Probabilistic Adjusted Classify & Count `_, the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier. 
- :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + + :param str method: adjustment method to be used: + + * 'inversion': matrix inversion method based on the matrix equality :math:`P(C)=P(C|Y)P(Y)`, + which tries to invert the `P(C|Y)` matrix. + * 'invariant-ratio': invariant ratio estimator of `Vaz et al. `_, + which replaces the last equation with the normalization condition. + + :param str solver: the method to use for solving the system of linear equations. Valid options are: + + * 'exact-raise': tries to solve the system using matrix inversion. + Raises an error if the matrix has rank strictly less than `n_classes`. + * 'exact-cc': if the matrix is not of full rank, returns `p_c` as the estimates, which + corresponds to no adjustment (i.e., the classify and count method; see :class:`quapy.method.aggregative.CC`) + * 'exact': deprecated, defaults to 'exact-cc' + * 'minimize': minimizes the L2 norm of :math:`|Ax-B|`. This one generally works better, and is the + default option. More details about this can be consulted in `Bunse, M. "On Multi-Class Extensions + of Adjusted Classify and Count", in the proceedings of the 2nd International Workshop on Learning to + Quantify: Methods and Applications (LQ 2022), ECML/PKDD 2022, Grenoble (France) + `_. + + :param str norm: the method to use for normalization. + + * `clip`, the values are clipped to the range [0,1] and then L1-normalized. + * `mapsimplex` projects vectors onto the probability simplex.
This implementation relies on + `Mathieu Blondel's projection_simplex_sort `_ + * `condsoftmax`, applies a softmax normalization only to prevalence vectors that lie outside the simplex + :param n_jobs: number of parallel workers """ - def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None): - self.classifier = classifier - self.val_split = val_split + def __init__( + self, + classifier: BaseEstimator = None, + fit_classifier=True, + val_split=5, + solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize', + method: Literal['inversion', 'invariant-ratio'] = 'inversion', + norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip', + n_jobs=None + ): + super().__init__(classifier, fit_classifier, val_split) self.n_jobs = qp._get_njobs(n_jobs) + self.solver = solver + self.method = method + self.norm = norm - def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None): + def _check_init_parameters(self): + if self.solver not in ACC.SOLVERS: + raise ValueError(f"unknown solver; valid ones are {ACC.SOLVERS}") + if self.method not in ACC.METHODS: + raise ValueError(f"unknown method; valid ones are {ACC.METHODS}") + if self.norm not in ACC.NORMALIZATIONS: + raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") + + def aggregation_fit(self, classif_predictions, labels): """ - Trains a PACC quantifier. + Estimates the misclassification rates - :param data: the training set - :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit) - :param val_split: either a float in (0,1) indicating the proportion of training instances to use for - validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection - indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV - to estimate the parameters - :return: self + :param classif_predictions: array-like with posterior probabilities + :param labels: array-like with the true labels associated to each vector of posterior probabilities """ + posteriors = classif_predictions + true_labels = labels + self.pcc = PCC(self.classifier, fit_classifier=False) + self.Pte_cond_estim_ = PACC.getPteCondEstim(self.classifier.classes_, true_labels, posteriors) - if val_split is None: - val_split = self.val_split + def aggregate(self, classif_posteriors): + prevs_estim = self.pcc.aggregate(classif_posteriors) - self.classifier, y, y_, classes, class_count = cross_generate_predictions( - data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs + estimate = F.solve_adjustment( + class_conditional_rates=self.Pte_cond_estim_, + unadjusted_counts=prevs_estim, + solver=self.solver, + method=self.method, ) - - self.pcc = PCC(self.classifier) - self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_) - - return self + return F.normalize_prevalence(estimate, method=self.norm) @classmethod def getPteCondEstim(cls, classes, y, y_): - # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a + # estimate the matrix with entry (i,j) being the estimate of P(hat_yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi n_classes = len(classes) confusion = np.eye(n_classes) @@ -414,15 +669,8 @@ class PACC(AggregativeProbabilisticQuantifier): return confusion.T - def aggregate(self, classif_posteriors): - 
prevs_estim = self.pcc.aggregate(classif_posteriors) - return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim) - def classify(self, data): - return self.pcc.classify(data) - - -class EMQ(AggregativeProbabilisticQuantifier): +class EMQ(AggregativeSoftQuantifier): """ `Expectation Maximization for Quantification `_ (EMQ), aka `Saerens-Latinne-Decaestecker` (SLD) algorithm. @@ -430,61 +678,208 @@ class EMQ(AggregativeProbabilisticQuantifier): probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via maximum-likelihood estimation, in a mutually recursive way, until convergence. - :param classifier: a sklearn's Estimator that generates a classifier - :param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence; - or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected - value of the posterior probabilities of the training instances as suggested in - `Alexandari et al. paper `_: - :param recalib: a string indicating the method of recalibration. Available choices include "nbvs" (No-Bias Vector - Scaling), "bcts" (Bias-Corrected Temperature Scaling), "ts" (Temperature Scaling), and "vs" (Vector Scaling). - The default value is None, indicating no recalibration. + This implementation also gives access to the heuristics proposed in the `Alexandari et al. paper + `_. These heuristics consist of using, as the training + prevalence, an estimate of it obtained via k-fold cross validation (instead of the true training prevalence), + and of recalibrating the posterior probabilities of the classifier. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the classifier (default is True). Set to False if the + given classifier has already been trained. + + :param val_split: specifies the data used for generating the classifier predictions on which the + aggregation function is to be trained. This specification can be made as float in (0, 1) indicating + the proportion of stratified held-out validation set to be extracted from the training set; or as + an integer (default 5), indicating that the predictions are to be generated in a `k`-fold + cross-validation manner (with this integer indicating the value for `k`); or as a tuple (X,y) defining + the specific set of data to use for validation. This hyperparameter is only meant to be used when + the heuristics are to be applied, i.e., if a calibration is required. The default value is None + (meaning the calibration is not required). In case this hyperparameter is set to a value other than + None, but the calibration is not required (calib=None), a warning message will be issued. + + :param exact_train_prev: set to True (default) for using the true training prevalence as the initial + observation; set to False for computing the training prevalence as an estimate of it, i.e., as the + expected value of the posterior probabilities of the training instances. + + :param calib: a string indicating the method of calibration. + Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling), + "ts" (Temperature Scaling), and "vs" (Vector Scaling). Default is None (no calibration). + + :param on_calib_error: a string indicating the policy to follow in case the calibrator fails at runtime.
+ Options include "raise" (default), in which case a RuntimeError is raised; and "backup", in which + case the calibrator is silently skipped. + + :param n_jobs: number of parallel workers. Only used for recalibrating the classifier if `val_split` is set to + an integer `k` --the number of folds. + """ MAX_ITER = 1000 EPSILON = 1e-4 + ON_CALIB_ERROR_VALUES = ['raise', 'backup'] + CALIB_OPTIONS = [None, 'nbvs', 'bcts', 'ts', 'vs'] - def __init__(self, classifier: BaseEstimator, exact_train_prev=True, recalib=None): - self.classifier = classifier - self.non_calibrated = classifier + def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=None, exact_train_prev=True, + calib=None, on_calib_error='raise', n_jobs=None): + + assert calib in EMQ.CALIB_OPTIONS, \ + f'invalid value for {calib=}; valid ones are {EMQ.CALIB_OPTIONS}' + assert on_calib_error in EMQ.ON_CALIB_ERROR_VALUES, \ + f'invalid value for {on_calib_error=}; valid ones are {EMQ.ON_CALIB_ERROR_VALUES}' + + super().__init__(classifier, fit_classifier, val_split) self.exact_train_prev = exact_train_prev - self.recalib = recalib + self.calib = calib + self.on_calib_error = on_calib_error + self.n_jobs = n_jobs - def fit(self, data: LabelledCollection, fit_classifier=True): - if self.recalib is not None: - if self.recalib == 'nbvs': - self.classifier = NBVSCalibration(self.non_calibrated) - elif self.recalib == 'bcts': - self.classifier = BCTSCalibration(self.non_calibrated) - elif self.recalib == 'ts': - self.classifier = TSCalibration(self.non_calibrated) - elif self.recalib == 'vs': - self.classifier = VSCalibration(self.non_calibrated) - elif self.recalib == 'platt': - self.classifier = CalibratedClassifierCV(self.classifier, ensemble=False) - else: - raise ValueError('invalid param argument for recalibration method; available ones are ' - '"nbvs", "bcts", "ts", and "vs".') - self.recalib = None + @classmethod + def EMQ_BCTS(cls, classifier: BaseEstimator, fit_classifier=True, val_split=5, on_calib_error="raise", n_jobs=None): + """ + Constructs an instance of EMQ using the best configuration found in the `Alexandari et al. paper + `_, i.e., one that relies on Bias-Corrected Temperature + Scaling (BCTS) as a calibration function, and that uses an estimate of the training prevalence instead of + the true training prevalence. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + + :param on_calib_error: a string indicating the policy to follow in case the calibrator fails at runtime. + Options include "raise" (default), in which case a RuntimeError is raised; and "backup", in which + case the calibrator is silently skipped. + + :param n_jobs: number of parallel workers.
Only used for recalibrating the classifier if `val_split` is set to + an integer `k` --the number of folds. + + :return: An instance of EMQ with BCTS + """ + return EMQ(classifier, fit_classifier=fit_classifier, val_split=val_split, exact_train_prev=False, + calib='bcts', on_calib_error=on_calib_error, n_jobs=n_jobs) + + def _check_init_parameters(self): + if self.val_split is not None: + if self.exact_train_prev and self.calib is None: + raise RuntimeWarning(f'The parameter {self.val_split=} was specified for EMQ, while the parameters ' + f'{self.exact_train_prev=} and {self.calib=}. This has no effect and causes an ' + f'unnecessary overhead.') else: + if self.calib is not None: + print(f'[warning] The parameter {self.calib=} requires val_split to be different from None. ' + f'This parameter will be set to 5. To avoid this warning, set this value to a float value ' + f'indicating the proportion of training data to be used as validation, or to an integer ' + f'indicating the number of folds for kFCV.') + self.val_split = 5 + + def classify(self, X): + """ + Provides the posterior probabilities for the given instances. The calibration function, if required, + has no effect in this step, and is only involved in the aggregate method. + + :param X: array-like of shape `(n_instances, n_dimensions,)` + :return: np.ndarray of shape `(n_instances, n_classes,)` with posterior probabilities + """ + return self.classifier.predict_proba(X) + + def classifier_fit_predict(self, X, y): + classif_predictions = super().classifier_fit_predict(X, y) + self.train_prevalence = F.prevalence_from_labels(y, classes=self.classes_) + return classif_predictions + + def _fit_calibration(self, calibrator, P, y): + n_classes = len(self.classes_) + + if not np.issubdtype(y.dtype, np.number): + y = np.searchsorted(self.classes_, y) + + try: + self.calibration_function = calibrator(P, np.eye(n_classes)[y], posterior_supplied=True) + except Exception as e: + if self.on_calib_error == 'raise': + raise RuntimeError(f'calibration {self.calib} failed at fit time: {e}') + elif self.on_calib_error == 'backup': + self.calibration_function = lambda P: P + + def _calibrate_if_requested(self, uncalib_posteriors): + if hasattr(self, 'calibration_function') and self.calibration_function is not None: + try: + calib_posteriors = self.calibration_function(uncalib_posteriors) + except Exception as e: + if self.on_calib_error == 'raise': + raise RuntimeError(f'calibration {self.calib} failed at predict time: {e}') + elif self.on_calib_error == 'backup': + calib_posteriors = uncalib_posteriors + else: + raise ValueError(f'unexpected {self.on_calib_error=}; ' + f'valid options are {EMQ.ON_CALIB_ERROR_VALUES}') + return calib_posteriors + return uncalib_posteriors + + def aggregation_fit(self, classif_predictions, labels): + """ + Trains the aggregation function of EMQ. This comes down to recalibrating the posterior probabilities + if requested.
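For intuition, the EM loop that `aggregate` ultimately invokes (the `EM` classmethod further below) is the classic SLD fixed point: each iteration re-weights the test posteriors by the ratio between the current and the training priors, and then re-estimates the priors as the mean of the re-weighted posteriors. A stripped-down sketch of a single iteration, with illustrative names (this is not the class method itself):

import numpy as np

def sld_step_sketch(train_prev, current_prev, test_posteriors):
    # E-step: re-weight the classifier posteriors by the ratio of priors
    ps = test_posteriors * (current_prev / train_prev)
    ps /= ps.sum(axis=1, keepdims=True)
    # M-step: the updated prevalence estimate is the mean of the posteriors
    return ps.mean(axis=0), ps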
+ + :param classif_predictions: array-like with the raw (i.e., uncalibrated) posterior probabilities + returned by the classifier + :param labels: array-like with the true labels associated to each classifier prediction + """ + P = classif_predictions + y = labels + + requires_predictions = (self.calib is not None) or (not self.exact_train_prev) + if P is None and requires_predictions: + # classifier predictions were not generated because val_split=None + raise ArgumentError(self.val_split, self.__class__.__name__ + + ": Classifier predictions for the aggregative fit were not generated because " + "val_split=None. This usually happens when you enable calibrations or heuristics " + "during model selection but left val_split set to its default value (None). " + "Please provide one of the following values for val_split: (i) an integer >1 " + "(e.g. val_split=5) for k-fold cross-validation; (ii) a float in (0,1) (e.g. " + "val_split=0.3) for a proportion split; or (iii) a tuple (X, y) with explicit " + "validation data") + + if self.calib is not None: + calibrator = { + 'nbvs': NoBiasVectorScaling(), + 'bcts': TempScaling(bias_positions='all'), + 'ts': TempScaling(), + 'vs': VectorScaling() + }.get(self.calib, None) + + if calibrator is None: + raise ValueError(f'invalid value for {self.calib=}; valid ones are {EMQ.CALIB_OPTIONS}') + + self._fit_calibration(calibrator, P, y) + + if not self.exact_train_prev: + P = self._calibrate_if_requested(P) + self.train_prevalence = F.prevalence_from_probabilities(P) def aggregate(self, classif_posteriors, epsilon=EPSILON): + classif_posteriors = self._calibrate_if_requested(classif_posteriors) priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) return priors def predict_proba(self, instances, epsilon=EPSILON): - classif_posteriors = self.classifier.predict_proba(instances) + """ + Returns the posterior probabilities updated by the EM algorithm. + + :param instances: np.ndarray of shape `(n_instances, n_dimensions)` + :param epsilon: error tolerance + :return: np.ndarray of shape `(n_instances, n_classes)` + """ + classif_posteriors = self.classify(instances) + classif_posteriors = self._calibrate_if_requested(classif_posteriors) priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) return posteriors @@ -503,6 +898,11 @@ class EMQ(AggregativeProbabilisticQuantifier): """ Px = posterior_probabilities Ptr = np.copy(tr_prev) + + if np.prod(Ptr) == 0: # some entry is 0; we should smooth the values to avoid 0 division + Ptr += epsilon + Ptr /= Ptr.sum() + qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence s, converged = 0, False @@ -527,7 +927,7 @@ class EMQ(AggregativeProbabilisticQuantifier): return qs, ps -class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): +class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): """ `Hellinger Distance y `_ (HDy). HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of @@ -537,50 +937,50 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): class-conditional distributions of the posterior probabilities returned for the positive and negative validation examples, respectively. The parameters of the mixture thus represent the estimates of the class prevalence values. 
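The mixture-fitting idea is compact enough to sketch in full; the following assumes one common convention for the Hellinger distance (the library's own version is `quapy.functional.HellingerDistance`) and uses illustrative names throughout:

import numpy as np

def hellinger_sketch(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

def hdy_search_sketch(pos_density, neg_density, test_density, grid):
    # choose the mixture weight whose combination of class-conditional
    # densities lies closest to the observed test density
    dists = [hellinger_sketch(a * pos_density + (1 - a) * neg_density, test_density) for a in grid]
    return grid[int(np.argmin(dists))]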
- :param classifier: a sklearn's Estimator that generates a binary classifier - :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out - validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. """ - def __init__(self, classifier: BaseEstimator, val_split=0.4): - self.classifier = classifier - self.val_split = val_split + def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) - def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None): + def aggregation_fit(self, classif_predictions, labels): """ - Trains a HDy quantifier. + Trains the aggregation function of HDy. - :param data: the training set - :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit) - :param val_split: either a float in (0,1) indicating the proportion of training instances to use for - validation (e.g., 0.3 for using 30% of the training set as validation data), or a - :class:`quapy.data.base.LabelledCollection` indicating the validation set itself - :return: self + :param classif_predictions: array-like with the posterior probabilities returned by the classifier + :param labels: array-like with the true labels associated to each posterior """ - if val_split is None: - val_split = self.val_split + P, y = classif_predictions, labels + Px = P[:, self.pos_label] # takes only the P(y=+1|x) + self.Pxy1 = Px[y == self.pos_label] + self.Pxy0 = Px[y == self.neg_label] - self._check_binary(data, self.__class__.__name__) - self.classifier, validation = _training_helper( - self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split) - Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x) - self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]] - self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]] # pre-compute the histogram for positive and negative examples self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110] + def hist(P, bins): h = np.histogram(P, bins=bins, range=(0, 1), density=True)[0] return h / h.sum() + self.Pxy1_density = {bins: hist(self.Pxy1, bins) for bins in self.bins} self.Pxy0_density = {bins: hist(self.Pxy0, bins) for bins in self.bins} - return self def aggregate(self, classif_posteriors): # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10, # and the final estimated a priori probability was taken as the median of these 11 estimates." # (González-Castro, et al., 2013). 
- Px = classif_posteriors[:, 1] # takes only the P(y=+1|x) + Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) prev_estimations = [] # for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] @@ -596,7 +996,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): # at small steps (modern implementations resort to an optimization procedure, # see class DistributionMatching) prev_selected, min_dist = None, None - for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0): + for prev in F.prevalence_linspace(grid_points=101, repeats=1, smooth_limits_epsilon=0.0): Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density hdy = F.HellingerDistance(Px_train, Px_test) if prev_selected is None or hdy < min_dist: @@ -604,31 +1004,45 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): prev_estimations.append(prev_selected) class1_prev = np.median(prev_estimations) - return np.asarray([1 - class1_prev, class1_prev]) + return F.as_binary_prevalence(class1_prev) -class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier): +class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): """ `DyS framework `_ (DyS). DyS is a generalization of HDy method, using a Ternary Search in order to find the prevalence that minimizes the distance between distributions. Details for the ternary search have been got from - :param classifier: a sklearn's Estimator that generates a binary classifier - :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out - validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + :param n_bins: an int with the number of bins to use to compute the histograms. + :param divergence: a str indicating the name of divergence (currently supported ones are "HD" or "topsoe"), or a callable function computes the divergence between two distributions (two equally sized arrays). + :param tol: a float with the tolerance for the ternary search algorithm. + + :param n_jobs: number of parallel workers. 
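For reference, the "topsoe" divergence named above is the symmetrized Kullback-Leibler divergence to the midpoint distribution (twice the Jensen-Shannon divergence); a small sketch, with an epsilon guard for empty bins -- this mirrors, but is not, the library's `get_divergence` implementation:

import numpy as np

def topsoe_sketch(p, q, eps=1e-20):
    m = p + q
    return np.sum(p * np.log((2 * p + eps) / (m + eps)) + q * np.log((2 * q + eps) / (m + eps)))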
""" - def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05): - self.classifier = classifier - self.val_split = val_split + def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=5, n_bins=8, + divergence: Union[str, Callable] = 'HD', tol=1e-05, n_jobs=None): + super().__init__(classifier, fit_classifier, val_split) self.tol = tol self.divergence = divergence self.n_bins = n_bins + self.n_jobs = n_jobs def _ternary_search(self, f, left, right, tol): """ @@ -646,22 +1060,23 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier): # Left and right are the current bounds; the maximum is between them return (left + right) / 2 - def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None): - if val_split is None: - val_split = self.val_split + def aggregation_fit(self, classif_predictions, labels): + """ + Trains the aggregation function of DyS. - self._check_binary(data, self.__class__.__name__) - self.classifier, validation = _training_helper( - self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split) - Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x) - self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]] - self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]] + :param classif_predictions: array-like with the posterior probabilities returned by the classifier + :param labels: array-like with the true labels associated to each posterior + """ + Px, y = classif_predictions, labels + Px = Px[:, self.pos_label] # takes only the P(y=+1|x) + self.Pxy1 = Px[y == self.pos_label] + self.Pxy0 = Px[y == self.neg_label] self.Pxy1_density = np.histogram(self.Pxy1, bins=self.n_bins, range=(0, 1), density=True)[0] self.Pxy0_density = np.histogram(self.Pxy0, bins=self.n_bins, range=(0, 1), density=True)[0] return self def aggregate(self, classif_posteriors): - Px = classif_posteriors[:, 1] # takes only the P(y=+1|x) + Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0] divergence = get_divergence(self.divergence) @@ -669,90 +1084,103 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier): def distribution_distance(prev): Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density return divergence(Px_train, Px_test) - + class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol) - return np.asarray([1 - class1_prev, class1_prev]) + return F.as_binary_prevalence(class1_prev) -class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier): +class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): """ `SMM method `_ (SMM). SMM is a simplification of matching distribution methods where the representation of the examples - is created using the mean instead of a histogram. + is created using the mean instead of a histogram (conceptually equivalent to PACC). - :param classifier: a sklearn's Estimator that generates a binary classifier. - :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out - validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself). 
+ :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. """ - def __init__(self, classifier: BaseEstimator, val_split=0.4): - self.classifier = classifier - self.val_split = val_split - - def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None): - if val_split is None: - val_split = self.val_split + def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) - self._check_binary(data, self.__class__.__name__) - self.classifier, validation = _training_helper( - self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split) - Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x) - self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]] - self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]] - self.Pxy1_mean = np.mean(self.Pxy1) - self.Pxy0_mean = np.mean(self.Pxy0) + def aggregation_fit(self, classif_predictions, labels): + """ + Trains the aggregation function of SMM. + + :param classif_predictions: array-like with the posterior probabilities returned by the classifier + :param labels: array-like with the true labels associated to each posterior + """ + Px, y = classif_predictions, labels + Px = Px[:, self.pos_label] # takes only the P(y=+1|x) + self.Pxy1 = Px[y == self.pos_label] + self.Pxy0 = Px[y == self.neg_label] + self.Pxy1_mean = np.mean(self.Pxy1) # equiv. TPR + self.Pxy0_mean = np.mean(self.Pxy0) # equiv. FPR return self def aggregate(self, classif_posteriors): - Px = classif_posteriors[:, 1] # takes only the P(y=+1|x) + Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) Px_mean = np.mean(Px) - - class1_prev = (Px_mean - self.Pxy0_mean)/(self.Pxy1_mean - self.Pxy0_mean) - class1_prev = np.clip(class1_prev, 0, 1) - return np.asarray([1 - class1_prev, class1_prev]) + class1_prev = (Px_mean - self.Pxy0_mean) / (self.Pxy1_mean - self.Pxy0_mean) + return F.as_binary_prevalence(class1_prev, clip_if_necessary=True) -class DMy(AggregativeProbabilisticQuantifier): +class DMy(AggregativeSoftQuantifier): """ Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters. - :param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the - validation distribution. 
- This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the validation distribution should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + :param nbins: number of bins used to discretize the distributions (default 8) + :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented) or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger Distance) + :param cdf: whether to use CDF instead of PDF (default False) + :param n_jobs: number of parallel workers (default None) """ - def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD', - cdf=False, search='optim_minimize', n_jobs=None): - self.classifier = classifier - self.val_split = val_split + def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=5, nbins=8, + divergence: Union[str, Callable] = 'HD', cdf=False, search='optim_minimize', n_jobs=None): + super().__init__(classifier, fit_classifier, val_split) self.nbins = nbins self.divergence = divergence self.cdf = cdf self.search = search self.n_jobs = n_jobs - @classmethod - def HDy(cls, classifier, val_split=0.4, n_jobs=None): - from quapy.method.meta import MedianEstimator + # @classmethod + # def HDy(cls, classifier, val_split=5, n_jobs=None): + # from quapy.method.meta import MedianEstimator + # + # hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD') + # hdy = AggregativeMedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs) + # return hdy - hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD') - hdy = MedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs) - return hdy - - def __get_distributions(self, posteriors): + def _get_distributions(self, posteriors): histograms = [] post_dims = posteriors.shape[1] if post_dims == 2: @@ -763,40 +1191,34 @@ class DMy(AggregativeProbabilisticQuantifier): histograms.append(hist) counts = np.vstack(histograms) - distributions = counts/counts.sum(axis=1)[:,np.newaxis] + distributions = counts / counts.sum(axis=1)[:, np.newaxis] if self.cdf: distributions = np.cumsum(distributions, axis=1) return distributions - def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None): + def aggregation_fit(self, classif_predictions, labels): """ - Trains the classifier (if requested) 
and generates the validation distributions out of the training data. + Trains the aggregation function of a distribution matching method. This comes down to generating the + validation distributions out of the training data. The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; then `di=V[i]` are the distributions obtained from training data labelled with class `i`; while `dij = di[j]` is the discrete distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin. - :param data: the training set - :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit) - :param val_split: either a float in (0,1) indicating the proportion of training instances to use for - validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection - indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV - to estimate the parameters + :param classif_predictions: array-like with the posterior probabilities returned by the classifier + :param labels: array-like with the true labels associated to each posterior """ - if val_split is None: - val_split = self.val_split + posteriors, true_labels = classif_predictions, labels + n_classes = len(self.classifier.classes_) - self.classifier, y, posteriors, classes, class_count = cross_generate_predictions( - data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs + self.validation_distribution = qp.util.parallel( + func=self._get_distributions, + args=[posteriors[true_labels == cat] for cat in range(n_classes)], + n_jobs=self.n_jobs, + backend='threading' ) - self.validation_distribution = np.asarray( - [self.__get_distributions(posteriors[y==cat]) for cat in range(data.n_classes)] - ) - - return self - def aggregate(self, posteriors: np.ndarray): """ Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution @@ -808,19 +1230,19 @@ class DMy(AggregativeProbabilisticQuantifier): :param posteriors: posterior probabilities of the instances in the sample :return: a vector of class prevalence estimates """ - test_distribution = self.__get_distributions(posteriors) + test_distribution = self._get_distributions(posteriors) divergence = get_divergence(self.divergence) n_classes, n_channels, nbins = self.validation_distribution.shape + def loss(prev): prev = np.expand_dims(prev, axis=0) - mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1) + mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes, -1)).reshape(n_channels, -1) divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)] return np.mean(divs) return F.argmin_prevalence(loss, n_classes, method=self.search) - def newELM(svmperf_base=None, loss='01', C=1): """ Explicit Loss Minimization (ELM) quantifiers. 
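ELM quantifiers delegate the optimization to Joachims' SVMperf, patched for quantification-oriented losses, and therefore require a local compilation of that package. A hypothetical usage sketch (the path is an assumption; since ELM quantifiers are binary, multiclass use goes through the one-vs-all wrapper discussed further below):

from quapy.method.aggregative import newSVMQ, OneVsAllAggregative

# assumes the patched svm_perf_quantification package has been compiled at this (hypothetical) path
svmq = newSVMQ(svmperf_base='./svm_perf_quantification', C=1)
# ELM-based quantifiers reportedly do not play well with multiprocessing, hence threading
ova_svmq = OneVsAllAggregative(svmq, parallel_backend='threading')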
@@ -873,6 +1295,7 @@ def newSVMQ(svmperf_base=None, C=1): """ return newELM(svmperf_base, loss='q', C=C) + def newSVMKLD(svmperf_base=None, C=1): """ SVM(KLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Kullback-Leibler Divergence @@ -923,6 +1346,7 @@ def newSVMKLD(svmperf_base=None, C=1): """ return newELM(svmperf_base, loss='nkld', C=C) + def newSVMAE(svmperf_base=None, C=1): """ SVM(KLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Absolute Error as first used by @@ -947,6 +1371,7 @@ def newSVMAE(svmperf_base=None, C=1): """ return newELM(svmperf_base, loss='mae', C=C) + def newSVMRAE(svmperf_base=None, C=1): """ SVM(KLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Relative Absolute Error as first @@ -972,254 +1397,6 @@ def newSVMRAE(svmperf_base=None, C=1): return newELM(svmperf_base, loss='mrae', C=C) -class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): - """ - Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_. - The goal is to bring improved stability to the denominator of the adjustment. - The different variants are based on different heuristics for choosing a decision threshold - that would allow for more true positives and many more false positives, on the grounds this - would deliver larger denominators. - - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ - - def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None): - self.classifier = classifier - self.val_split = val_split - self.n_jobs = qp._get_njobs(n_jobs) - - def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None): - self._check_binary(data, "Threshold Optimization") - - if val_split is None: - val_split = self.val_split - - self.classifier, y, y_, classes, class_count = cross_generate_predictions( - data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs - ) - - self.cc = CC(self.classifier) - - self.tpr, self.fpr = self._optimize_threshold(y, y_) - - return self - - @abstractmethod - def _condition(self, tpr, fpr) -> float: - """ - Implements the criterion according to which the threshold should be selected. - This function should return the (float) score to be minimized. - - :param tpr: float, true positive rate - :param fpr: float, false positive rate - :return: float, a score for the given `tpr` and `fpr` - """ - ... - - def _optimize_threshold(self, y, probabilities): - """ - Seeks for the best `tpr` and `fpr` according to the score obtained at different - decision thresholds. The scoring function is implemented in function `_condition`. 
- - :param y: predicted labels for the validation set (or for the training set via `k`-fold cross validation) - :param probabilities: array-like with the posterior probabilities - :return: best `tpr` and `fpr` according to `_condition` - """ - best_candidate_threshold_score = None - best_tpr = 0 - best_fpr = 0 - candidate_thresholds = np.unique(probabilities[:, 1]) - for candidate_threshold in candidate_thresholds: - y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self._compute_table(y, y_) - tpr = self._compute_tpr(TP, FP) - fpr = self._compute_fpr(FP, TN) - condition_score = self._condition(tpr, fpr) - if best_candidate_threshold_score is None or condition_score < best_candidate_threshold_score: - best_candidate_threshold_score = condition_score - best_tpr = tpr - best_fpr = fpr - - return best_tpr, best_fpr - - def aggregate(self, classif_predictions): - prevs_estim = self.cc.aggregate(classif_predictions) - if self.tpr - self.fpr == 0: - return prevs_estim - adjusted_prevs_estim = np.clip((prevs_estim[1] - self.fpr) / (self.tpr - self.fpr), 0, 1) - adjusted_prevs_estim = np.array((1 - adjusted_prevs_estim, adjusted_prevs_estim)) - return adjusted_prevs_estim - - def _compute_table(self, y, y_): - TP = np.logical_and(y == y_, y == self.classes_[1]).sum() - FP = np.logical_and(y != y_, y == self.classes_[0]).sum() - FN = np.logical_and(y != y_, y == self.classes_[1]).sum() - TN = np.logical_and(y == y_, y == self.classes_[0]).sum() - return TP, FP, FN, TN - - def _compute_tpr(self, TP, FP): - if TP + FP == 0: - return 1 - return TP / (TP + FP) - - def _compute_fpr(self, FP, TN): - if FP + TN == 0: - return 0 - return FP / (FP + TN) - - -class T50(ThresholdOptimization): - """ - Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that looks - for the threshold that makes `tpr` cosest to 0.5. - The goal is to bring improved stability to the denominator of the adjustment. - - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ - - def __init__(self, classifier: BaseEstimator, val_split=0.4): - super().__init__(classifier, val_split) - - def _condition(self, tpr, fpr) -> float: - return abs(tpr - 0.5) - - -class MAX(ThresholdOptimization): - """ - Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that looks - for the threshold that maximizes `tpr-fpr`. - The goal is to bring improved stability to the denominator of the adjustment. - - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. 
- This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ - - def __init__(self, classifier: BaseEstimator, val_split=0.4): - super().__init__(classifier, val_split) - - def _condition(self, tpr, fpr) -> float: - # MAX strives to maximize (tpr - fpr), which is equivalent to minimize (fpr - tpr) - return (fpr - tpr) - - -class X(ThresholdOptimization): - """ - Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that looks - for the threshold that yields `tpr=1-fpr`. - The goal is to bring improved stability to the denominator of the adjustment. - - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ - - def __init__(self, classifier: BaseEstimator, val_split=0.4): - super().__init__(classifier, val_split) - - def _condition(self, tpr, fpr) -> float: - return abs(1 - (tpr + fpr)) - - -class MS(ThresholdOptimization): - """ - Median Sweep. Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that generates - class prevalence estimates for all decision thresholds and returns the median of them all. - The goal is to bring improved stability to the denominator of the adjustment. - - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ - def __init__(self, classifier: BaseEstimator, val_split=0.4): - super().__init__(classifier, val_split) - - def _condition(self, tpr, fpr) -> float: - pass - - def _optimize_threshold(self, y, probabilities): - tprs = [] - fprs = [] - candidate_thresholds = np.unique(probabilities[:, 1]) - for candidate_threshold in candidate_thresholds: - y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self._compute_table(y, y_) - tpr = self._compute_tpr(TP, FP) - fpr = self._compute_fpr(FP, TN) - tprs.append(tpr) - fprs.append(fpr) - return np.median(tprs), np.median(fprs) - - -class MS2(MS): - """ - Median Sweep 2. 
Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that generates - class prevalence estimates for all decision thresholds and returns the median of for cases in - which `tpr-fpr>0.25` - The goal is to bring improved stability to the denominator of the adjustment. - - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ - def __init__(self, classifier: BaseEstimator, val_split=0.4): - super().__init__(classifier, val_split) - - def _optimize_threshold(self, y, probabilities): - tprs = [0, 1] - fprs = [0, 1] - candidate_thresholds = np.unique(probabilities[:, 1]) - for candidate_threshold in candidate_thresholds: - y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self._compute_table(y, y_) - tpr = self._compute_tpr(TP, FP) - fpr = self._compute_fpr(FP, TN) - if (tpr - fpr) > 0.25: - tprs.append(tpr) - fprs.append(fpr) - return np.median(tprs), np.median(fprs) - - class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier): """ Allows any binary quantifier to perform quantification on single-label datasets. @@ -1229,39 +1406,41 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier): `Gao and Sebastiani, 2016 `_. :param binary_quantifier: a quantifier (binary) that will be employed to work on multiclass models in a - one-vs-all manner + one-vs-all manner (default PACC(LogisticRegression())) :param n_jobs: number of parallel workers :param parallel_backend: the parallel backend for joblib (default "loky"); this is helpful for some quantifiers (e.g., ELM-based ones) that cannot be run with multiprocessing, since the temp dir they create during fit is removed and no longer available at predict time. """ - def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='multiprocessing'): + def __init__(self, binary_quantifier=None, n_jobs=None, parallel_backend='multiprocessing'): + if binary_quantifier is None: + binary_quantifier = PACC() assert isinstance(binary_quantifier, BaseQuantifier), \ - f'{self.binary_quantifier} does not seem to be a Quantifier' + f'{binary_quantifier} does not seem to be a Quantifier' assert isinstance(binary_quantifier, AggregativeQuantifier), \ - f'{self.binary_quantifier} does not seem to be of type Aggregative' + f'{binary_quantifier} does not seem to be of type Aggregative' self.binary_quantifier = binary_quantifier self.n_jobs = qp._get_njobs(n_jobs) self.parallel_backend = parallel_backend - def classify(self, instances): + def classify(self, X): """ If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance - `i `belongs to class `j`. The binary classifications are independent of each other, meaning that an instance + `i` belongs to class `j`.
The binary classifications are independent of each other, meaning that an instance can end up being attributed to 0, 1, or more classes. If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs (resp. does not belong) to class `j`. The posterior probabilities are independent of each other, meaning that, in general, they do not sum up to one. - :param instances: array-like + :param X: array-like :return: `np.ndarray` """ - classif_predictions = self._parallel(self._delayed_binary_classification, instances) - if isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier): + classif_predictions = self._parallel(self._delayed_binary_classification, X) + if isinstance(self.binary_quantifier, AggregativeSoftQuantifier): return np.swapaxes(classif_predictions, 0, 1) else: return classif_predictions.T @@ -1270,6 +1449,10 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier): prevalences = self._parallel(self._delayed_binary_aggregate, classif_predictions) return F.normalize_prevalence(prevalences) + def aggregation_fit(self, classif_predictions, labels): + # _parallel supplies the class index c to the delayed function; the function itself + # (not its result) must be passed so that each binary aggregation is trained per class + self._parallel(self._delayed_binary_aggregate_fit, classif_predictions, labels) + return self + def _delayed_binary_classification(self, c, X): return self.dict_binary_quantifiers[c].classify(X) @@ -1277,18 +1460,145 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier): # the estimation for the positive class prevalence return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1] + def _delayed_binary_aggregate_fit(self, c, classif_predictions, labels): + # trains the aggregation function of the cth quantifier, binarizing the labels for the cth problem + return self.dict_binary_quantifiers[c].aggregation_fit(classif_predictions[:, c], labels == c) -#--------------------------------------------------------------- + +class AggregativeMedianEstimator(BinaryQuantifier): + """ + This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the + estimations returned by differently (hyper)parameterized base quantifiers. + The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions, + i.e., in cases of binary quantification.
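The claim about unit vectors is easy to verify numerically: with two classes, the coordinate-wise median of estimates of the form [1-p, p] is again of that form, whereas with three or more classes the medians need not sum to 1. A quick check:

import numpy as np

binary_estims = np.asarray([[0.8, 0.2], [0.6, 0.4], [0.3, 0.7]])
print(np.median(binary_estims, axis=0))  # [0.6, 0.4], still a distribution

ternary_estims = np.asarray([[0.6, 0.2, 0.2], [0.2, 0.6, 0.2], [0.2, 0.2, 0.6]])
print(np.median(ternary_estims, axis=0).sum())  # 0.6, no longer a distribution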
+ + :param base_quantifier: the base, binary quantifier + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param param_grid: the grid of parameters over which the median will be computed + :param n_jobs: number of parallel workers + """ + + def __init__(self, base_quantifier: AggregativeQuantifier, param_grid: dict, random_state=None, n_jobs=None): + self.base_quantifier = base_quantifier + self.param_grid = param_grid + self.random_state = random_state + self.n_jobs = qp._get_njobs(n_jobs) + + def get_params(self, deep=True): + return self.base_quantifier.get_params(deep) + + def set_params(self, **params): + self.base_quantifier.set_params(**params) + + def _delayed_fit(self, args): + with qp.util.temp_seed(self.random_state): + params, X, y = args + model = deepcopy(self.base_quantifier) + model.set_params(**params) + model.fit(X, y) + return model + + def _delayed_fit_classifier(self, args): + with qp.util.temp_seed(self.random_state): + cls_params, X, y = args + model = deepcopy(self.base_quantifier) + model.set_params(**cls_params) + predictions, labels = model.classifier_fit_predict(X, y) + return (model, predictions, labels) + + def _delayed_fit_aggregation(self, args): + with qp.util.temp_seed(self.random_state): + ((model, predictions, y), q_params) = args + model = deepcopy(model) + model.set_params(**q_params) + model.aggregation_fit(predictions, y) + return model + + def fit(self, X, y): + import itertools + + self._check_binary(y, self.__class__.__name__) + + if isinstance(self.base_quantifier, AggregativeQuantifier): + cls_configs, q_configs = qp.model_selection.group_params(self.param_grid) + + if len(cls_configs) > 1: + models_preds = qp.util.parallel( + self._delayed_fit_classifier, + ((params, X, y) for params in cls_configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False, + backend='threading' + ) + else: + model = self.base_quantifier + model.set_params(**cls_configs[0]) + predictions, labels = model.classifier_fit_predict(X, y) + models_preds = [(model, predictions, labels)] + + self.models = qp.util.parallel( + self._delayed_fit_aggregation, + itertools.product(models_preds, q_configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + backend='threading' + ) + else: + configs = qp.model_selection.expand_grid(self.param_grid) + self.models = qp.util.parallel( + self._delayed_fit, + ((params, X, y) for params in configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + backend='threading' + ) + return self + + def _delayed_predict(self, args): + model, instances = args + return model.predict(instances) + + def predict(self, instances): + prev_preds = qp.util.parallel( + self._delayed_predict, + ((model, instances) for model in self.models), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + backend='threading' + ) + return np.median(prev_preds, axis=0) + + +# --------------------------------------------------------------- +# imports +# --------------------------------------------------------------- + +from . import _threshold_optim + +T50 = _threshold_optim.T50 +MAX = _threshold_optim.MAX +X = _threshold_optim.X +MS = _threshold_optim.MS +MS2 = _threshold_optim.MS2 + +from .
+
+KDEyML = _kdey.KDEyML
+KDEyHD = _kdey.KDEyHD
+KDEyCS = _kdey.KDEyCS
+
+# ---------------------------------------------------------------
 # aliases
-#---------------------------------------------------------------
+# ---------------------------------------------------------------
 
 ClassifyAndCount = CC
 AdjustedClassifyAndCount = ACC
 ProbabilisticClassifyAndCount = PCC
 ProbabilisticAdjustedClassifyAndCount = PACC
 ExpectationMaximizationQuantifier = EMQ
-DistributionMatchingY = DMy
 SLD = EMQ
+DistributionMatchingY = DMy
 HellingerDistanceY = HDy
 MedianSweep = MS
 MedianSweep2 = MS2
diff --git a/quapy/method/base.py b/quapy/method/base.py
index e0363f1..85a0525 100644
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@@ -14,30 +14,40 @@ import numpy as np
 class BaseQuantifier(BaseEstimator):
     """
     Abstract Quantifier. A quantifier is defined as an object of a class that implements the method :meth:`fit` on
-    :class:`quapy.data.base.LabelledCollection`, the method :meth:`quantify`, and the :meth:`set_params` and
+    a pair X, y, the method :meth:`predict`, and the :meth:`set_params` and
     :meth:`get_params` for model selection (see :meth:`quapy.model_selection.GridSearchQ`)
     """
 
     @abstractmethod
-    def fit(self, data: LabelledCollection):
+    def fit(self, X, y):
         """
-        Trains a quantifier.
+        Generates a quantifier.
 
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param X: array-like, the training instances
+        :param y: array-like, the labels
         :return: self
         """
         ...
 
     @abstractmethod
-    def quantify(self, instances):
+    def predict(self, X):
         """
         Generate class prevalence estimates for the sample's instances
 
-        :param instances: array-like
+        :param X: array-like, the test instances
         :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
         """
         ...
 
+    def quantify(self, X):
+        """
+        Alias to :meth:`predict`, kept for backward compatibility
+
+        :param X: array-like
+        :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
+        """
+        return self.predict(X)
+
 
 class BinaryQuantifier(BaseQuantifier):
     """
@@ -45,8 +55,9 @@ class BinaryQuantifier(BaseQuantifier):
     (typically, to be interpreted as one class and its complement).
     """
 
-    def _check_binary(self, data: LabelledCollection, quantifier_name):
-        assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
+    def _check_binary(self, y, quantifier_name):
+        n_classes = len(set(y))
+        assert n_classes == 2, f'{quantifier_name} works only on problems of binary classification. ' \
                             f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'
 
 
@@ -54,7 +65,7 @@ class OneVsAll:
     pass
 
 
-def newOneVsAll(binary_quantifier, n_jobs=None):
+def newOneVsAll(binary_quantifier: BaseQuantifier, n_jobs=None):
     assert isinstance(binary_quantifier, BaseQuantifier), \
         f'{binary_quantifier} does not seem to be a Quantifier'
     if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier):
@@ -63,13 +74,13 @@ def newOneVsAll(binary_quantifier, n_jobs=None):
     return OneVsAllGeneric(binary_quantifier, n_jobs)
 
 
-class OneVsAllGeneric(OneVsAll,BaseQuantifier):
+class OneVsAllGeneric(OneVsAll, BaseQuantifier):
     """
     Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
-    quantifier for each class, and then l1-normalizes the outputs so that the class prevelence values sum up to 1.
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevalence values sum up to 1.
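+
+    A usage sketch (X, y stand for any multiclass dataset; the wrapped quantifier is chosen for illustration):
+
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> from quapy.method.aggregative import HDy
+    >>> ovr = newOneVsAll(HDy(LogisticRegression()))
+    >>> ovr.fit(X, y)
+    >>> estim_prevalence = ovr.predict(X_test)  # an l1-normalized prevalence vector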
""" - def __init__(self, binary_quantifier, n_jobs=None): + def __init__(self, binary_quantifier: BaseQuantifier, n_jobs=None): assert isinstance(binary_quantifier, BaseQuantifier), \ f'{binary_quantifier} does not seem to be a Quantifier' if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier): @@ -78,32 +89,32 @@ class OneVsAllGeneric(OneVsAll,BaseQuantifier): self.binary_quantifier = binary_quantifier self.n_jobs = qp._get_njobs(n_jobs) - def fit(self, data: LabelledCollection, fit_classifier=True): - assert not data.binary, f'{self.__class__.__name__} expect non-binary data' - assert fit_classifier == True, 'fit_classifier must be True' + def fit(self, X, y): + self.classes = sorted(np.unique(y)) + assert len(self.classes)!=2, f'{self.__class__.__name__} expect non-binary data' - self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_} - self._parallel(self._delayed_binary_fit, data) + self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in self.classes} + self._parallel(self._delayed_binary_fit, X, y) return self def _parallel(self, func, *args, **kwargs): return np.asarray( Parallel(n_jobs=self.n_jobs, backend='threading')( - delayed(func)(c, *args, **kwargs) for c in self.classes_ + delayed(func)(c, *args, **kwargs) for c in self.classes ) ) - def quantify(self, instances): - prevalences = self._parallel(self._delayed_binary_predict, instances) + def predict(self, X): + prevalences = self._parallel(self._delayed_binary_predict, X) return qp.functional.normalize_prevalence(prevalences) - @property - def classes_(self): - return sorted(self.dict_binary_quantifiers.keys()) + # @property + # def classes_(self): + # return sorted(self.dict_binary_quantifiers.keys()) def _delayed_binary_predict(self, c, X): - return self.dict_binary_quantifiers[c].quantify(X)[1] + return self.dict_binary_quantifiers[c].predict(X)[1] - def _delayed_binary_fit(self, c, data): - bindata = LabelledCollection(data.instances, data.labels == c, classes=[False, True]) - self.dict_binary_quantifiers[c].fit(bindata) + def _delayed_binary_fit(self, c, X, y): + bindata = LabelledCollection(X, y == c, classes=[False, True]) + self.dict_binary_quantifiers[c].fit(*bindata.Xy) diff --git a/quapy/method/composable.py b/quapy/method/composable.py new file mode 100644 index 0000000..c40e3bb --- /dev/null +++ b/quapy/method/composable.py @@ -0,0 +1,160 @@ +"""This module allows the composition of quantification methods from loss functions and feature transformations. This functionality is realized through an integration of the qunfold package: https://github.com/mirkobunse/qunfold.""" + +from dataclasses import dataclass +from packaging.version import Version + +from .base import BaseQuantifier + +# what to display when an ImportError is thrown +_IMPORT_ERROR_MESSAGE = """qunfold, the back-end of quapy.method.composable, is not properly installed. 
+
+To fix this error, call:
+
+    pip install --upgrade pip setuptools wheel
+    pip install "jax[cpu]"
+    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5"
+"""
+
+# try to import members of qunfold as members of this module
+try:
+    import qunfold
+    from qunfold.base import BaseMixin
+    from qunfold.methods import AbstractMethod
+    from qunfold.sklearn import CVClassifier
+    from qunfold import (
+        LinearMethod,  # methods
+        LeastSquaresLoss,  # losses
+        BlobelLoss,
+        EnergyLoss,
+        HellingerSurrogateLoss,
+        CombinedLoss,
+        TikhonovRegularization,
+        TikhonovRegularized,
+        ClassRepresentation,  # representations
+        HistogramRepresentation,
+        DistanceRepresentation,
+        KernelRepresentation,
+        EnergyKernelRepresentation,
+        LaplacianKernelRepresentation,
+        GaussianKernelRepresentation,
+        GaussianRFFKernelRepresentation,
+    )
+except ImportError as e:
+    raise ImportError(_IMPORT_ERROR_MESSAGE) from e
+
+__all__ = [  # control public members, e.g., for auto-documentation in sphinx
+    "QUnfoldWrapper",
+    "ComposableQuantifier",
+    "CVClassifier",
+    "LeastSquaresLoss",
+    "BlobelLoss",
+    "EnergyLoss",
+    "HellingerSurrogateLoss",
+    "CombinedLoss",
+    "TikhonovRegularization",
+    "TikhonovRegularized",
+    "ClassRepresentation",
+    "HistogramRepresentation",
+    "DistanceRepresentation",
+    "KernelRepresentation",
+    "EnergyKernelRepresentation",
+    "LaplacianKernelRepresentation",
+    "GaussianKernelRepresentation",
+    "GaussianRFFKernelRepresentation",
+]
+
+
+def check_compatible_qunfold_version():
+    try:
+        version_str = qunfold.__version__
+    except AttributeError:
+        # versions of qunfold <= 0.1.4 did not declare __version__ in the __init__.py but only in the setup.py
+        version_str = "0.1.4"
+
+    installed_ver = Version(version_str)
+    required_ver = Version("0.1.5")
+    compatible = installed_ver.base_version == required_ver.base_version or installed_ver >= required_ver
+    return compatible
+
+
+@dataclass
+class QUnfoldWrapper(BaseQuantifier, BaseMixin):
+    """A thin wrapper for using qunfold methods in QuaPy.
+
+    Args:
+        _method: An instance of `qunfold.methods.AbstractMethod` to wrap.
+
+    Examples:
+        Here, we wrap an instance of ACC to perform a grid search with QuaPy.
+
+        >>> from qunfold import ACC
+        >>> qunfold_method = QUnfoldWrapper(ACC(RandomForestClassifier(oob_score=True)))
+        >>> quapy.model_selection.GridSearchQ(
+        >>>     model = qunfold_method,
+        >>>     param_grid = { # try both splitting criteria
+        >>>         "representation__classifier__estimator__criterion": ["gini", "entropy"],
+        >>>     },
+        >>>     # ...
+        >>> )
+    """
+    _method: AbstractMethod
+
+    def fit(self, X, y):
+        self._method.fit(X, y)
+        return self
+
+    def predict(self, X):
+        return self._method.predict(X)
+
+    def set_params(self, **params):
+        self._method.set_params(**params)
+        return self
+
+    def get_params(self, deep=True):
+        return self._method.get_params(deep)
+
+    def __str__(self):
+        return self._method.__str__()
+
+
+def ComposableQuantifier(loss, representation, **kwargs):
+    """A generic quantification / unfolding method that solves a linear system of equations.
+
+    This class represents any quantifier that can be described in terms of a loss function, a feature transformation, and a regularization term. In this implementation, the loss is minimized through unconstrained second-order minimization. Valid probability estimates are ensured through a soft-max trick by Bunse (2022).
+
+    Args:
+        loss: An instance of a loss class from `quapy.method.composable`.
+        representation: An instance of a representation class from `quapy.method.composable`.
+        solver (optional): The `method` argument in `scipy.optimize.minimize`. Defaults to `"trust-ncg"`.
+        solver_options (optional): The `options` argument in `scipy.optimize.minimize`. Defaults to `{"gtol": 1e-8, "maxiter": 1000}`.
+        seed (optional): A random number generator seed from which a numpy RandomState is created. Defaults to `None`.
+
+    Examples:
+        Here, we create the ordinal variant of ACC (Bunse et al., 2023). This variant consists of the original feature transformation of ACC and of the original loss of ACC, the latter of which is regularized towards smooth solutions.
+
+        >>> from quapy.method.composable import (
+        >>>     ComposableQuantifier,
+        >>>     TikhonovRegularized,
+        >>>     LeastSquaresLoss,
+        >>>     ClassRepresentation,
+        >>> )
+        >>> from sklearn.ensemble import RandomForestClassifier
+        >>> o_acc = ComposableQuantifier(
+        >>>     TikhonovRegularized(LeastSquaresLoss(), 0.01),
+        >>>     ClassRepresentation(RandomForestClassifier(oob_score=True))
+        >>> )
+
+        Here, we perform hyper-parameter optimization with the ordinal ACC.
+
+        >>> quapy.model_selection.GridSearchQ(
+        >>>     model = o_acc,
+        >>>     param_grid = { # try both splitting criteria
+        >>>         "representation__classifier__estimator__criterion": ["gini", "entropy"],
+        >>>     },
+        >>>     # ...
+        >>> )
+
+        To use a classifier that does not provide the `oob_score` argument, such as logistic regression, you have to configure a cross-validation of this classifier. Here, we employ 10 cross-validation folds; 5 folds are the default.
+
+        >>> from quapy.method.composable import CVClassifier
+        >>> from sklearn.linear_model import LogisticRegression
+        >>> acc_lr = ComposableQuantifier(
+        >>>     LeastSquaresLoss(),
+        >>>     ClassRepresentation(CVClassifier(LogisticRegression(), 10))
+        >>> )
+    """
+    return QUnfoldWrapper(LinearMethod(loss, representation, **kwargs))
\ No newline at end of file
diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py
new file mode 100644
index 0000000..fd8c84d
--- /dev/null
+++ b/quapy/method/confidence.py
@@ -0,0 +1,677 @@
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.metrics import confusion_matrix
+
+import quapy as qp
+import quapy.functional as F
+from quapy.method import _bayesian
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import AggregativeQuantifier, AggregativeCrispQuantifier, AggregativeSoftQuantifier, BinaryAggregativeQuantifier
+from scipy.stats import chi2
+from sklearn.utils import resample
+from abc import ABC, abstractmethod
+from scipy.special import softmax, factorial
+import copy
+from functools import lru_cache
+
+"""
+This module provides the implementation of different types of confidence regions, as well as the implementation of
+Bootstrap for AggregativeQuantifiers.
+"""
+
+class ConfidenceRegionABC(ABC):
+    """
+    Abstract class of confidence regions
+    """
+
+    @abstractmethod
+    def point_estimate(self) -> np.ndarray:
+        """
+        Returns the point estimate corresponding to a set of bootstrap estimates.
+
+        :return: np.ndarray
+        """
+        ...
+
+    def ndim(self) -> int:
+        """
+        Number of dimensions of the region. This number corresponds to the total number of classes. The dimensionality
+        of the simplex is therefore ndim-1
+
+        :return: int
+        """
+        return len(self.point_estimate())
+
+    @abstractmethod
+    def coverage(self, true_value) -> float:
+        """
+        Checks whether a value, or a set of values, is contained in the confidence region.
+        The method computes the fraction of these that are contained in the region, if more than one value is passed.
+        If only one value is passed, then it either returns 1.0 or 0.0, for indicating the value is in the region
+        or not, respectively.
+
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
+        :return: float in [0,1]
+        """
+        ...
+
+    @lru_cache
+    def simplex_portion(self):
+        """
+        Computes the fraction of the simplex which is covered by the region. This is not the volume of the region
+        itself (which could lie outside the boundaries of the simplex), but the actual fraction of the simplex
+        contained in the region. A default implementation, based on Monte Carlo approximation, is provided.
+
+        :return: float, the fraction of the simplex covered by the region
+        """
+        return self.montecarlo_proportion()
+
+    @lru_cache
+    def montecarlo_proportion(self, n_trials=10_000):
+        """
+        Estimates, via a Monte Carlo approach, the fraction of the simplex covered by the region. This is carried
+        out by returning the fraction of the `n_trials` points, uniformly drawn at random from the simplex, that
+        are included in the region. The value is only computed once, even when multiple calls are made.
+
+        :param n_trials: int, the number of points to draw uniformly at random from the simplex (default 10,000)
+        :return: float in [0,1]
+        """
+        with qp.util.temp_seed(0):
+            uniform_simplex = F.uniform_simplex_sampling(n_classes=self.ndim(), size=n_trials)
+            proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
+            return proportion
+
+
+class WithConfidenceABC(ABC):
+    """
+    Abstract class for quantification methods that return a confidence region along with each point estimate.
+    """
+    METHODS = ['intervals', 'ellipse', 'ellipse-clr']
+
+    @abstractmethod
+    def predict_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
+        """
+        Adds the method `predict_conf` to the interface. This method returns not only the point-estimate, but
+        also the confidence region around it.
+
+        :param instances: a np.ndarray of shape (n_instances, n_features,)
+        :param confidence_level: float in (0, 1), default is 0.95
+        :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
+            (n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
+        """
+        ...
+
+    def quantify_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
+        """
+        Alias to `predict_conf`. This method returns not only the point-estimate, but
+        also the confidence region around it.
+
+        :param instances: a np.ndarray of shape (n_instances, n_features,)
+        :param confidence_level: float in (0, 1), default is 0.95
+        :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
+            (n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
+        """
+        return self.predict_conf(instances=instances, confidence_level=confidence_level)
+
+    @classmethod
+    def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals'):
+        """
+        Constructs a confidence region given many prevalence estimations.
+
+        :param prev_estims: np.ndarray of shape (n_estims, n_classes)
+        :param confidence_level: float, the confidence level for the region (default 0.95)
+        :param method: str, indicates the method for constructing regions. Set to `intervals` for
+            constructing confidence intervals (default), or to `ellipse` for constructing an
+            ellipse in the probability simplex, or to `ellipse-clr` for constructing an ellipse
+            in the Centered-Log Ratio (CLR) unconstrained space.
+        :return: an instance of :class:`ConfidenceRegionABC`
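+
+        A usage sketch (the Dirichlet draws stand in for bootstrap estimates):
+
+        >>> import numpy as np
+        >>> prev_estims = np.random.dirichlet(alpha=[2, 3, 5], size=1000)
+        >>> region = WithConfidenceABC.construct_region(prev_estims, confidence_level=0.95, method='intervals')
+        >>> center = region.point_estimate()  # np.ndarray of shape (3,)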
+ """ + region = None + if method == 'intervals': + region = ConfidenceIntervals(prev_estims, confidence_level=confidence_level) + elif method == 'ellipse': + region = ConfidenceEllipseSimplex(prev_estims, confidence_level=confidence_level) + elif method == 'ellipse-clr': + region = ConfidenceEllipseCLR(prev_estims, confidence_level=confidence_level) + + if region is None: + raise NotImplementedError(f'unknown method {method}') + + return region + +def simplex_volume(n): + """ + Computes the volume of the n-dimensional simplex. For n classes, the corresponding volume + is :meth:`simplex_volume(n-1)` since the simplex has one degree of freedom less. + + :param n: int, the dimensionality of the simplex + :return: float, the volume of the n-dimensional simplex + """ + return 1 / factorial(n) + + +def within_ellipse_prop(values, mean, prec_matrix, chi2_critical): + """ + Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix` + at a distance `chi2_critical`. + + :param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,) + :param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse + :param prec_matrix: a np.ndarray with the precision matrix (inverse of the + covariance matrix) of the ellipse. If this inverse cannot be computed + then None must be passed + :param chi2_critical: float, the chi2 critical value + + :return: float in [0,1], the fraction of values that are contained in the ellipse + defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance). + If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned. + """ + if prec_matrix is None: + return 0. + + diff = values - mean # Mahalanobis distance + + d_M_squared = diff @ prec_matrix @ diff.T # d_M^2 + if d_M_squared.ndim == 2: + d_M_squared = np.diag(d_M_squared) + + within_elipse = (d_M_squared <= chi2_critical) + + if isinstance(within_elipse, np.ndarray): + within_elipse = np.mean(within_elipse) + + return within_elipse * 1.0 + + +class ConfidenceEllipseSimplex(ConfidenceRegionABC): + """ + Instantiates a Confidence Ellipse in the probability simplex. + + :param X: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + + def __init__(self, X, confidence_level=0.95): + + assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)' + + X = np.asarray(X) + + self.mean_ = X.mean(axis=0) + self.cov_ = np.cov(X, rowvar=False, ddof=1) + + try: + self.precision_matrix_ = np.linalg.inv(self.cov_) + except: + self.precision_matrix_ = None + + self.dim = X.shape[-1] + self.ddof = self.dim - 1 + + # critical chi-square value + self.confidence_level = confidence_level + self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof) + + def point_estimate(self): + """ + Returns the point estimate, the center of the ellipse. + + :return: np.ndarray of shape (n_classes,) + """ + return self.mean_ + + def coverage(self, true_value): + """ + Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the + fraction of these that are contained in the region, if more than one value is passed. If only one value is + passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively. 
+
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
+        :return: float in [0,1]
+        """
+        return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
+
+
+class ConfidenceEllipseCLR(ConfidenceRegionABC):
+    """
+    Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
+
+    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
+    :param confidence_level: float, the confidence level (default 0.95)
+    """
+
+    def __init__(self, X, confidence_level=0.95):
+        X = np.asarray(X)
+        self.clr = CLRtransformation()
+        Z = self.clr(X)
+        self.mean_ = np.mean(X, axis=0)
+        self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
+
+    def point_estimate(self):
+        """
+        Returns the point estimate, the center of the ellipse.
+
+        :return: np.ndarray of shape (n_classes,)
+        """
+        # The inverse of the CLR does not coincide with the true mean, because the geometric mean
+        # requires smoothing the prevalence vectors and this affects the softmax (inverse);
+        # return self.clr.inverse(self.mean_)  # <- does not coincide
+        return self.mean_
+
+    def coverage(self, true_value):
+        """
+        Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
+        fraction of these that are contained in the region, if more than one value is passed. If only one value is
+        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
+
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
+        :return: float in [0,1]
+        """
+        transformed_values = self.clr(true_value)
+        return self.conf_region_clr.coverage(transformed_values)
+
+
+class ConfidenceIntervals(ConfidenceRegionABC):
+    """
+    Instantiates a region based on (independent) Confidence Intervals.
+
+    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
+    :param confidence_level: float, the confidence level (default 0.95)
+    """
+    def __init__(self, X, confidence_level=0.95):
+        assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'
+
+        X = np.asarray(X)
+
+        self.means_ = X.mean(axis=0)
+        alpha = 1-confidence_level
+        low_perc = (alpha/2.)*100
+        high_perc = (1-alpha/2.)*100
+        self.I_low, self.I_high = np.percentile(X, q=[low_perc, high_perc], axis=0)
+
+    def point_estimate(self):
+        """
+        Returns the point estimate, the class-wise average of the bootstrapped estimates
+
+        :return: np.ndarray of shape (n_classes,)
+        """
+        return self.means_
+
+    def coverage(self, true_value):
+        """
+        Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
+        fraction of these that are contained in the region, if more than one value is passed. If only one value is
+        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
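+
+        For instance (a sketch):
+
+        >>> region = ConfidenceIntervals(np.random.dirichlet([2, 8], size=500))
+        >>> region.coverage(region.point_estimate())  # typically 1.0: the means lie within the intervals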
+
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
+        :return: float in [0,1]
+        """
+        within_intervals = np.logical_and(self.I_low <= true_value, true_value <= self.I_high)
+        within_all_intervals = np.all(within_intervals, axis=-1, keepdims=True)
+        proportion = within_all_intervals.mean()
+
+        return proportion
+
+    def __repr__(self):
+        return '['+', '.join(f'({low:.4f}, {high:.4f})' for (low,high) in zip(self.I_low, self.I_high))+']'
+
+
+class CLRtransformation:
+    """
+    Centered log-ratio (CLR) transformation, from compositional data analysis
+    """
+    def __call__(self, X, epsilon=1e-6):
+        """
+        Applies the CLR function to X, thus mapping the instances, which are contained in :math:`\\mathcal{R}^{n}` but
+        actually lie on a :math:`\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}`
+
+        :param X: np.ndarray of (n_instances, n_dimensions) to be transformed
+        :param epsilon: small float for prevalence smoothing
+        :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
+        """
+        X = np.asarray(X)
+        X = qp.error.smooth(X, epsilon)
+        G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True))  # geometric mean
+        return np.log(X / G)
+
+    def inverse(self, X):
+        """
+        Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.
+
+        :param X: np.ndarray of (n_instances, n_dimensions) to be transformed
+        :return: np.ndarray of (n_instances, n_dimensions), the points mapped back onto the probability simplex
+        """
+        return softmax(X, axis=-1)
+
+
+class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
+    """
+    Aggregative Bootstrap allows any AggregativeQuantifier to get confidence regions around
+    point-estimates of class prevalence values. This method implements some optimizations for
+    speeding up the computations, which are only possible due to the two phases of the aggregative
+    quantifiers.
+
+    During training, the bootstrap repetitions are only carried out over pre-classified training instances,
+    after the classifier has been trained (only once), in order to train a series of aggregation
+    functions (model-based approach).
+
+    During inference, the bootstrap repetitions are applied to the pre-classified test instances.
+
+    See
+    `Moreo, A., Salvati, N.
+    An Efficient Method for Deriving Confidence Intervals in Aggregative Quantification.
+    Learning to Quantify: Methods and Applications (LQ 2025), co-located at ECML-PKDD 2025.
+    pp 12-33 `_
+
+    :param quantifier: an aggregative quantifier
+    :param n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a
+        model-based bootstrap approach)
+    :param n_test_samples: int, the number of test resamplings (defaults to 500, set to > 1 to activate a
+        population-based bootstrap approach)
+    :param confidence_level: float, the confidence level for the confidence region (default 0.95)
+    :param region: string, set to `intervals` for constructing confidence intervals (default), or to
+        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
+        constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
+    :param random_state: int for replicating samples, None (default) for non-replicable samples
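+
+    Example (a sketch; X, y and X_test stand for any training/test data):
+
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> from quapy.method.aggregative import PACC
+    >>> boot = AggregativeBootstrap(PACC(LogisticRegression()), n_test_samples=500)
+    >>> boot.fit(X, y)
+    >>> prev_estim, conf_region = boot.predict_conf(X_test)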
+    """
+
+    def __init__(self,
+                 quantifier: AggregativeQuantifier,
+                 n_train_samples=1,
+                 n_test_samples=500,
+                 confidence_level=0.95,
+                 region='intervals',
+                 random_state=None):
+
+        assert isinstance(quantifier, AggregativeQuantifier), \
+            f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}'
+        assert n_train_samples >= 1, \
+            f'{n_train_samples=} must be >= 1'
+        assert n_test_samples >= 1, \
+            f'{n_test_samples=} must be >= 1'
+        assert n_test_samples > 1 or n_train_samples > 1, \
+            f'either {n_test_samples=} or {n_train_samples=} must be >1'
+
+        self.quantifier = quantifier
+        self.n_train_samples = n_train_samples
+        self.n_test_samples = n_test_samples
+        self.confidence_level = confidence_level
+        self.region = region
+        self.random_state = random_state
+
+    def aggregation_fit(self, classif_predictions, labels):
+        self.quantifiers = []
+        if self.n_train_samples == 1:
+            self.quantifier.aggregation_fit(classif_predictions, labels)
+            self.quantifiers.append(self.quantifier)
+        else:
+            # model-based bootstrap (only on the aggregative part)
+            n_examples = len(labels)
+            full_index = np.arange(n_examples)
+            with qp.util.temp_seed(self.random_state):
+                for i in range(self.n_train_samples):
+                    quantifier = copy.deepcopy(self.quantifier)
+                    index = resample(full_index, n_samples=n_examples)
+                    quantifier.aggregation_fit(classif_predictions[index], labels[index])
+                    self.quantifiers.append(quantifier)
+        return self
+
+    def aggregate(self, classif_predictions: np.ndarray):
+        prev_mean, self.confidence = self.aggregate_conf(classif_predictions)
+        return prev_mean
+
+    def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None):
+        if confidence_level is None:
+            confidence_level = self.confidence_level
+
+        n_samples = classif_predictions.shape[0]
+        prevs = []
+        with qp.util.temp_seed(self.random_state):
+            for quantifier in self.quantifiers:
+                for i in range(self.n_test_samples):
+                    sample_i = resample(classif_predictions, n_samples=n_samples)
+                    prev_i = quantifier.aggregate(sample_i)
+                    prevs.append(prev_i)
+
+        conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
+        prev_estim = conf.point_estimate()
+
+        return prev_estim, conf
+
+    def fit(self, X, y):
+        self.quantifier._check_init_parameters()
+        classif_predictions, labels = self.quantifier.classifier_fit_predict(X, y)
+        self.aggregation_fit(classif_predictions, labels)
+        return self
+
+    def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
+        predictions = self.quantifier.classify(instances)
+        return self.aggregate_conf(predictions, confidence_level=confidence_level)
+
+    @property
+    def classifier(self):
+        return self.quantifier.classifier
+
+    def _classifier_method(self):
+        return self.quantifier._classifier_method()
+
+
+class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
+    """
+    `Bayesian quantification `_ method (by Albert Ziegler and Paweł Czyż),
+    which is a variant of :class:`ACC` that calculates the posterior probability distribution
+    over the prevalence vectors, rather than providing a point estimate obtained
+    by matrix inversion.
+
+    Can be used to diagnose degeneracy in the predictions visible when the confusion
+    matrix has a high condition number, or to quantify uncertainty around the point estimate.
+
+    This method relies on extra dependencies, which have to be installed via:
+    `$ pip install quapy[bayes]`
+
+    :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
+        the one indicated in `qp.environ['DEFAULT_CLS']`
+    :param fit_classifier: bool, whether the classifier is to be trained (default True)
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
+        be extracted from the training set; or as an integer (default 5), indicating that the predictions
+        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
+        for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to
+        None when the method does not require any validation data, in order to avoid that some portion of
+        the training data be wasted.
+    :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
+    :param num_samples: number of samples to draw from the posterior (default 1000)
+    :param mcmc_seed: random seed for the MCMC sampler (default 0)
+    :param confidence_level: float in (0, 1) to construct a confidence region around the point estimate (default 0.95)
+    :param region: string, set to `intervals` for constructing confidence intervals (default), or to
+        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
+        constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
+    """
+    def __init__(self,
+                 classifier: BaseEstimator=None,
+                 fit_classifier=True,
+                 val_split: int = 5,
+                 num_warmup: int = 500,
+                 num_samples: int = 1_000,
+                 mcmc_seed: int = 0,
+                 confidence_level: float = 0.95,
+                 region: str = 'intervals'):
+
+        if num_warmup <= 0:
+            raise ValueError(f'parameter {num_warmup=} must be a positive integer')
+        if num_samples <= 0:
+            raise ValueError(f'parameter {num_samples=} must be a positive integer')
+
+        if not _bayesian.DEPENDENCIES_INSTALLED:
+            raise ImportError("Auxiliary dependencies are required. "
+                              "Run `$ pip install quapy[bayes]` to install them.")
+
+        super().__init__(classifier, fit_classifier, val_split)
+        self.num_warmup = num_warmup
+        self.num_samples = num_samples
+        self.mcmc_seed = mcmc_seed
+        self.confidence_level = confidence_level
+        self.region = region
+
+        # Array of shape (n_classes, n_predicted_classes,) where entry (y, c) is the number of instances
+        # labeled as class y and predicted as class c.
+        # By default, this array is set to None and later defined as part of the `aggregation_fit` phase
+        self._n_and_c_labeled = None
+
+        # Dictionary with posterior samples, set when `aggregate` is invoked.
+        self._samples = None
+
+    def aggregation_fit(self, classif_predictions, labels):
+        """
+        Estimates the misclassification rates.
+
+        :param classif_predictions: array-like with the label predictions returned by the classifier
+        :param labels: array-like with the true labels associated to each classifier prediction
+        """
+        pred_labels = classif_predictions
+        true_labels = labels
+        self._n_and_c_labeled = confusion_matrix(
+            y_true=true_labels,
+            y_pred=pred_labels,
+            labels=self.classifier.classes_
+        ).astype(float)
+
+    def sample_from_posterior(self, classif_predictions):
+        if self._n_and_c_labeled is None:
+            raise ValueError("aggregation_fit must be called before sample_from_posterior")
+
+        n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_).astype(float)
+
+        self._samples = _bayesian.sample_posterior(
+            n_c_unlabeled=n_c_unlabeled,
+            n_y_and_c_labeled=self._n_and_c_labeled,
+            num_warmup=self.num_warmup,
+            num_samples=self.num_samples,
+            seed=self.mcmc_seed,
+        )
+        return self._samples
+
+    def get_prevalence_samples(self):
+        if self._samples is None:
+            raise ValueError("sample_from_posterior must be called before get_prevalence_samples")
+        return self._samples[_bayesian.P_TEST_Y]
+
+    def get_conditional_probability_samples(self):
+        if self._samples is None:
+            raise ValueError("sample_from_posterior must be called before get_conditional_probability_samples")
+        return self._samples[_bayesian.P_C_COND_Y]
+
+    def aggregate(self, classif_predictions):
+        samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
+        return np.asarray(samples.mean(axis=0), dtype=float)
+
+    def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
+        if confidence_level is None:
+            confidence_level = self.confidence_level
+        classif_predictions = self.classify(instances)
+        point_estimate = self.aggregate(classif_predictions)
+        samples = self.get_prevalence_samples()  # available after calling the "aggregate" function
+        region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
+        return point_estimate, region
+
+
+class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
+    """
+    `Precise Quantifier` (PQ): a Bayesian distribution-matching quantifier,
+    which is a variant of :class:`HDy` that calculates the posterior probability distribution
+    over the prevalence vectors, rather than providing a point estimate.
+
+    This method relies on extra dependencies, which have to be installed via:
+    `$ pip install quapy[bayes]`
+
+    :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
+        the one indicated in `qp.environ['DEFAULT_CLS']`
+    :param fit_classifier: bool, whether the classifier is to be trained (default True)
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
+        be extracted from the training set; or as an integer (default 5), indicating that the predictions
+        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
+        for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to
+        None when the method does not require any validation data, in order to avoid that some portion of
+        the training data be wasted.
+    :param n_bins: int, the number of bins for the histograms of posterior probabilities (default 4)
+    :param fixed_bins: bool, if True, the bins are uniformly spaced in [0,1]; if False (default), the bin
+        limits are placed at the quantiles of the posterior probabilities of the positive class
+    :param num_warmup: number of warmup iterations for the STAN sampler (default 500)
+    :param num_samples: number of samples to draw from the posterior (default 1000)
+    :param stan_seed: random seed for the STAN sampler (default 0)
+    :param confidence_level: float in (0, 1) to construct a confidence region around the point estimate (default 0.95)
+    :param region: string, set to `intervals` for constructing confidence intervals (default), or to
+        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
+        constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
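+
+    Example (a sketch; X, y and X_test stand for any binary dataset, and the bayes extras must be installed):
+
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> pq = PQ(LogisticRegression())
+    >>> pq.fit(X, y)
+    >>> point_estimate, region = pq.predict_conf(X_test)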
+    """
+    def __init__(self,
+                 classifier: BaseEstimator=None,
+                 fit_classifier=True,
+                 val_split: int = 5,
+                 n_bins: int = 4,
+                 fixed_bins: bool = False,
+                 num_warmup: int = 500,
+                 num_samples: int = 1_000,
+                 stan_seed: int = 0,
+                 confidence_level: float = 0.95,
+                 region: str = 'intervals'):
+
+        if num_warmup <= 0:
+            raise ValueError(f'parameter {num_warmup=} must be a positive integer')
+        if num_samples <= 0:
+            raise ValueError(f'parameter {num_samples=} must be a positive integer')
+
+        if not _bayesian.DEPENDENCIES_INSTALLED:
+            raise ImportError("Auxiliary dependencies are required. "
+                              "Run `$ pip install quapy[bayes]` to install them.")
+
+        super().__init__(classifier, fit_classifier, val_split)
+        self.n_bins = n_bins
+        self.fixed_bins = fixed_bins
+        self.num_warmup = num_warmup
+        self.num_samples = num_samples
+        self.stan_seed = stan_seed
+        self.stan_code = _bayesian.load_stan_file()
+        self.confidence_level = confidence_level
+        self.region = region
+
+    def aggregation_fit(self, classif_predictions, labels):
+        y_pred = classif_predictions[:, self.pos_label]
+
+        # Compute bin limits
+        if self.fixed_bins:
+            # Uniform bins in [0,1]
+            self.bin_limits = np.linspace(0, 1, self.n_bins + 1)
+        else:
+            # Quantile bins
+            self.bin_limits = np.quantile(y_pred, np.linspace(0, 1, self.n_bins + 1))
+
+        # Assign each prediction to a bin
+        bin_indices = np.digitize(y_pred, self.bin_limits[1:-1], right=True)
+
+        # Positive and negative masks
+        pos_mask = (labels == self.pos_label)
+        neg_mask = ~pos_mask
+
+        # Count positives and negatives per bin
+        self.pos_hist = np.bincount(bin_indices[pos_mask], minlength=self.n_bins)
+        self.neg_hist = np.bincount(bin_indices[neg_mask], minlength=self.n_bins)
+
+    def aggregate(self, classif_predictions):
+        Px_test = classif_predictions[:, self.pos_label]
+        test_hist, _ = np.histogram(Px_test, bins=self.bin_limits)
+        prevs = _bayesian.pq_stan(
+            self.stan_code, self.n_bins, self.pos_hist, self.neg_hist, test_hist,
+            self.num_samples, self.num_warmup, self.stan_seed
+        ).flatten()
+        self.prev_distribution = np.vstack([1-prevs, prevs]).T
+        return self.prev_distribution.mean(axis=0)
+
+    def aggregate_conf(self, predictions, confidence_level=None):
+        if confidence_level is None:
+            confidence_level = self.confidence_level
+        point_estimate = self.aggregate(predictions)
+        samples = self.prev_distribution
+        region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
+        return point_estimate, region
+
+    def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
+        predictions = self.classify(instances)
+        return self.aggregate_conf(predictions, confidence_level=confidence_level)
+
+
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
index 7f111c0..37749e1 100644
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -1,6 +1,6 @@
 import itertools
 from copy import deepcopy
-from typing import Union
+from typing import Union, List
 
 import numpy as np
 from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer, accuracy_score @@ -12,21 +12,21 @@ from quapy import functional as F from quapy.data import LabelledCollection from quapy.model_selection import GridSearchQ from quapy.method.base import BaseQuantifier, BinaryQuantifier -from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ +from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ, AggregativeQuantifier, AggregativeSoftQuantifier try: - from . import neural + from . import _neural except ModuleNotFoundError: - neural = None + _neural = None -if neural: - QuaNet = neural.QuaNetTrainer +if _neural: + QuaNet = _neural.QuaNetTrainer else: QuaNet = "QuaNet is not available due to missing torch package" -class MedianEstimator(BinaryQuantifier): +class MedianEstimator2(BinaryQuantifier): """ This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the estimation returned by differently (hyper)parameterized base quantifiers. @@ -52,20 +52,19 @@ class MedianEstimator(BinaryQuantifier): def _delayed_fit(self, args): with qp.util.temp_seed(self.random_state): - params, training = args + params, X, y = args model = deepcopy(self.base_quantifier) model.set_params(**params) - model.fit(training) + model.fit(X, y) return model - def fit(self, training: LabelledCollection): - self._check_binary(training, self.__class__.__name__) - params_keys = list(self.param_grid.keys()) - params_values = list(self.param_grid.values()) - hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)] + def fit(self, X, y): + self._check_binary(y, self.__class__.__name__) + + configs = qp.model_selection.expand_grid(self.param_grid) self.models = qp.util.parallel( self._delayed_fit, - ((params, training) for params in hyper), + ((params, X, y) for params in configs), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs ) @@ -73,12 +72,12 @@ class MedianEstimator(BinaryQuantifier): def _delayed_predict(self, args): model, instances = args - return model.quantify(instances) + return model.predict(instances) - def quantify(self, instances): + def predict(self, X): prev_preds = qp.util.parallel( self._delayed_predict, - ((model, instances) for model in self.models), + ((model, X) for model in self.models), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs ) @@ -86,6 +85,66 @@ class MedianEstimator(BinaryQuantifier): return np.median(prev_preds, axis=0) +class MedianEstimator(BinaryQuantifier): + """ + This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the + estimation returned by differently (hyper)parameterized base quantifiers. + The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions, + i.e., in cases of binary quantification. 
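+
+    A minimal usage sketch (X, y and X_test stand for any binary training/test data):
+
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> from quapy.method.aggregative import PACC
+    >>> med = MedianEstimator(PACC(LogisticRegression()), param_grid={'classifier__C': [0.1, 1.0, 10.0]})
+    >>> med.fit(X, y)
+    >>> estim_prevalence = med.predict(X_test)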
+
+    :param base_quantifier: the base, binary quantifier
+    :param random_state: a seed to be set before fitting any base quantifier (default None)
+    :param param_grid: the grid of parameters over which the median will be computed
+    :param n_jobs: number of parallel workers
+    """
+    def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None):
+        self.base_quantifier = base_quantifier
+        self.param_grid = param_grid
+        self.random_state = random_state
+        self.n_jobs = qp._get_njobs(n_jobs)
+
+    def get_params(self, deep=True):
+        return self.base_quantifier.get_params(deep)
+
+    def set_params(self, **params):
+        self.base_quantifier.set_params(**params)
+
+    def _delayed_fit(self, args):
+        with qp.util.temp_seed(self.random_state):
+            params, X, y = args
+            model = deepcopy(self.base_quantifier)
+            model.set_params(**params)
+            model.fit(X, y)
+            return model
+
+    def fit(self, X, y):
+        self._check_binary(y, self.__class__.__name__)
+
+        configs = qp.model_selection.expand_grid(self.param_grid)
+        self.models = qp.util.parallel(
+            self._delayed_fit,
+            ((params, X, y) for params in configs),
+            seed=qp.environ.get('_R_SEED', None),
+            n_jobs=self.n_jobs,
+            asarray=False
+        )
+        return self
+
+    def _delayed_predict(self, args):
+        model, instances = args
+        return model.predict(instances)
+
+    def predict(self, X):
+        prev_preds = qp.util.parallel(
+            self._delayed_predict,
+            ((model, X) for model in self.models),
+            seed=qp.environ.get('_R_SEED', None),
+            n_jobs=self.n_jobs,
+            asarray=False
+        )
+        prev_preds = np.asarray(prev_preds)
+        return np.median(prev_preds, axis=0)
+
 
 class Ensemble(BaseQuantifier):
     VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES
@@ -156,12 +215,14 @@ class Ensemble(BaseQuantifier):
         if self.verbose:
             print('[Ensemble]' + msg)
 
-    def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None):
-        self._sout('Fit')
+    def fit(self, X, y):
+
+        data = LabelledCollection(X, y)
+
         if self.policy == 'ds' and not data.binary:
             raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
-        if val_split is None:
-            val_split = self.val_split
+
+        val_split = self.val_split
 
         # randomly chooses the prevalences for each member of the ensemble (preventing classes with less than
         # min_pos positive examples)
@@ -182,6 +243,7 @@ class Ensemble(BaseQuantifier):
         self.ensemble = qp.util.parallel(
             _delayed_new_instance,
             tqdm(args, desc='fitting ensemble', total=self.size) if self.verbose else args,
+            asarray=False,
             n_jobs=self.n_jobs)
 
         # static selection policy (the name of a quantification-oriented error function to minimize)
@@ -191,15 +253,15 @@ class Ensemble(BaseQuantifier):
         self._sout('Fit [Done]')
         return self
 
-    def quantify(self, instances):
+    def predict(self, X):
         predictions = np.asarray(
-            qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs)
+            qp.util.parallel(_delayed_quantify, ((Qi, X) for Qi in self.ensemble), n_jobs=self.n_jobs)
         )
 
         if self.policy == 'ptr':
             predictions = self._ptr_policy(predictions)
         elif self.policy == 'ds':
-            predictions = self._ds_policy(predictions, instances)
+            predictions = self._ds_policy(predictions, X)
 
         predictions = np.mean(predictions, axis=0)
         return F.normalize_prevalence(predictions)
@@ -263,30 +325,31 @@ class Ensemble(BaseQuantifier):
 
     def _ds_policy_get_posteriors(self, data: LabelledCollection):
         """
-        In the original article, this procedure is not described in a sufficient level of detail. The paper only says
+        In the original article, some aspects of this procedure are left unspecified. The paper says
         that the distribution of posterior probabilities from training and test examples is compared by means of the
         Hellinger Distance. However, how these posterior probabilities are generated is not specified. In the article,
         a Logistic Regressor (LR) is used as the classifier device, and that could be used for this purpose. However, in
         general, a Quantifier is not necessarily an instance of Aggregative Probabilistic Quantifiers, and so, that the
         quantifier builds on top of a probabilistic classifier cannot be taken for granted. Additionally, it would not
-        be correct to generate the posterior probabilities for training documents that have concurred in training the
+        be correct to generate the posterior probabilities for training instances that have concurred in training the
         classifier that generates them.
+
         This function thus generates the posterior probabilities for all training documents in a cross-validation way,
-        using a LR with hyperparameters that have previously been optimized via grid search in 5FCV.
-        :return P,f, where P is a ndarray containing the posterior probabilities of the training data, generated via
-        cross-validation and using an optimized LR, and the function to be used in order to generate posterior
-        probabilities for test instances.
+        using LR with hyperparameters that have previously been optimized via grid search in 5FCV.
+
+        :param data: a LabelledCollection
+        :return: (P,f,) where P is an ndarray containing the posterior probabilities of the training data, generated via
+            cross-validation and using an optimized LR, and the function to be used in order to generate posterior
+            probabilities for test instances.
""" + X, y = data.Xy lr_base = LogisticRegression(class_weight='balanced', max_iter=1000) - optim = GridSearchCV( - lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True - ).fit(X, y) + param_grid = {'C': np.logspace(-4, 4, 9)} + optim = GridSearchCV(lr_base, param_grid=param_grid, cv=5, n_jobs=self.n_jobs, refit=True).fit(X, y) - posteriors = cross_val_predict( - optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba' - ) + posteriors = cross_val_predict(optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba') posteriors_generator = optim.best_estimator_.predict_proba return posteriors, posteriors_generator @@ -351,26 +414,29 @@ def _delayed_new_instance(args): sample = data.sampling_from_index(sample_index) if val_split is not None: - model.fit(sample, val_split=val_split) + model.fit(*sample.Xy, val_split=val_split) else: - model.fit(sample) + model.fit(*sample.Xy) tr_prevalence = sample.prevalence() tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None + if verbose: - print(f'\t\--fit-ended for prev {F.strprev(prev)}') + print(f'\t--fit-ended for prev {F.strprev(prev)}') + return (model, tr_prevalence, tr_distribution, sample if keep_samples else None) def _delayed_quantify(args): quantifier, instances = args - return quantifier[0].quantify(instances) + return quantifier[0].predict(instances) def _draw_simplex(ndim, min_val, max_trials=100): """ - returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions + Returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions are >= min_class_prev (for min_val>0, this makes the sampling not truly uniform) + :param ndim: number of dimensions of the simplex :param min_val: minimum class prevalence allowed. If less than 1/ndim a ValueError will be throw since there is no possible solution. 
@@ -584,3 +650,107 @@ def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
     """
     return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
+
+
+def merge(prev_predictions, merge_fun):
+    prev_predictions = np.asarray(prev_predictions)
+    if merge_fun == 'median':
+        prevalences = np.median(prev_predictions, axis=0)
+        prevalences = F.normalize_prevalence(prevalences, method='l1')
+    elif merge_fun == 'mean':
+        prevalences = np.mean(prev_predictions, axis=0)
+    else:
+        raise NotImplementedError(f'merge function {merge_fun} not implemented!')
+    return prevalences
+
+
+class SCMQ(AggregativeSoftQuantifier):
+
+    MERGE_FUNCTIONS = ['median', 'mean']
+
+    def __init__(self, classifier, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
+        self.classifier = classifier
+        self.quantifiers = [deepcopy(q) for q in quantifiers]
+        assert merge_fun in self.MERGE_FUNCTIONS, f'unknown {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
+        self.merge_fun = merge_fun
+        self.val_split = val_split
+
+    def aggregation_fit(self, classif_predictions, labels):
+        for quantifier in self.quantifiers:
+            quantifier.classifier = self.classifier
+            quantifier.aggregation_fit(classif_predictions, labels)
+        return self
+
+    def aggregate(self, classif_predictions: np.ndarray):
+        prev_predictions = []
+        for quantifier_i in self.quantifiers:
+            prevalence_i = quantifier_i.aggregate(classif_predictions)
+            prev_predictions.append(prevalence_i)
+        return merge(prev_predictions, merge_fun=self.merge_fun)
+
+
+class MCSQ(BaseQuantifier):
+    def __init__(self, classifiers, quantifier: AggregativeSoftQuantifier, merge_fun='median', val_split=5):
+        self.merge_fun = merge_fun
+        self.val_split = val_split
+        self.mcsqs = []
+        for classifier in classifiers:
+            quantifier_i = deepcopy(quantifier)
+            quantifier_i.classifier = classifier
+            self.mcsqs.append(quantifier_i)
+
+    def fit(self, X, y):
+        for q in self.mcsqs:
+            q.fit(X, y)
+        return self
+
+    def predict(self, X):
+        prev_predictions = []
+        for q in self.mcsqs:
+            prevalence_i = q.predict(X)
+            prev_predictions.append(prevalence_i)
+        return merge(prev_predictions, merge_fun=self.merge_fun)
+
+
+class MCMQ(BaseQuantifier):
+    def __init__(self, classifiers, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
+        self.merge_fun = merge_fun
+        self.scmqs = []
+        for classifier in classifiers:
+            self.scmqs.append(SCMQ(classifier, quantifiers, val_split=val_split))
+
+    def fit(self, X, y):
+        for q in self.scmqs:
+            q.fit(X, y)
+        return self
+
+    def predict(self, X):
+        prev_predictions = []
+        for q in self.scmqs:
+            prevalence_i = q.predict(X)
+            prev_predictions.append(prevalence_i)
+        return merge(prev_predictions, merge_fun=self.merge_fun)
diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py
index 87e59fb..6f204e4 100644
--- a/quapy/method/non_aggregative.py
+++ b/quapy/method/non_aggregative.py
@@ -1,10 +1,17 @@
-from typing import Union, Callable
+from itertools import product
+from tqdm import tqdm
+from typing import Union, Callable, Counter
 import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.utils import resample
+from sklearn.preprocessing import normalize
 
-from functional import get_divergence
-from quapy.data import LabelledCollection
+from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC
+from quapy.functional import get_divergence
 from quapy.method.base import BaseQuantifier, BinaryQuantifier
 import quapy.functional as F
+from scipy.optimize import lsq_linear
+from scipy import sparse
 
 
 class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
@@ -19,21 +26,23 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
     def __init__(self):
         self._classes_ = None
 
-    def fit(self, data: LabelledCollection):
+    def fit(self, X, y):
         """
         Computes the training prevalence and stores it.
 
-        :param data: the training sample
+        :param X: array-like of shape `(n_samples, n_features)`, the training instances
+        :param y: array-like of shape `(n_samples,)`, the labels
         :return: self
         """
-        self.estimated_prevalence = data.prevalence()
+        self._classes_ = F.classes_from_labels(labels=y)
+        self.estimated_prevalence = F.prevalence_from_labels(y, classes=self._classes_)
         return self
 
-    def quantify(self, instances):
+    def predict(self, X):
         """
         Ignores the input instances and returns, as the class prevalence estimates, the training prevalence.
 
-        :param instances: array-like (ignored)
+        :param X: array-like (ignored)
         :return: the class prevalence seen during training
         """
         return self.estimated_prevalence
@@ -99,7 +108,7 @@ class DMx(BaseQuantifier):
 
         return distributions
 
-    def fit(self, data: LabelledCollection):
+    def fit(self, X, y):
         """
         Generates the validation distributions out of the training data (covariates).
         The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats`
@@ -108,33 +117,33 @@ class DMx(BaseQuantifier):
         training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in
         training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin.
 
-        :param data: the training set
+        :param X: array-like of shape `(n_samples, n_features)`, the training instances
+        :param y: array-like of shape `(n_samples,)`, the labels
         """
-        X, y = data.Xy
-
         self.nfeats = X.shape[1]
         self.feat_ranges = _get_features_range(X)
+        n_classes = len(np.unique(y))
 
         self.validation_distribution = np.asarray(
-            [self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
+            [self.__get_distributions(X[y==cat]) for cat in range(n_classes)]
        )
 
         return self
 
-    def quantify(self, instances):
+    def predict(self, X):
         """
         Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
         (the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
         The matching is computed as the average dissimilarity (in terms of the dissimilarity measure of choice)
         between all feature-specific discrete distributions.
 
-        :param instances: instances in the sample
+        :param X: instances in the sample
         :return: a vector of class prevalence estimates
         """
-        assert instances.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {instances.shape[1]}'
+        assert X.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {X.shape[1]}'
 
-        test_distribution = self.__get_distributions(instances)
+        test_distribution = self.__get_distributions(X)
         divergence = get_divergence(self.divergence)
         n_classes, n_feats, nbins = self.validation_distribution.shape
         def loss(prev):
@@ -147,6 +156,165 @@
 
 
+class ReadMe(BaseQuantifier, WithConfidenceABC):
+    """
+    ReadMe is a non-aggregative quantification method proposed by
+    `Daniel Hopkins and Gary King, 2007.
+    A method of automated nonparametric content analysis for
+    social science. American Journal of Political Science, 54(1):229–247.
+    `_.
+    The idea is to estimate `Q(Y=i)` directly from:
+
+    :math:`Q(X)=\\sum_{i} Q(X|Y=i) Q(Y=i)`
+
+    via least-squares regression, i.e., without incurring the cost of computing posterior probabilities.
+    However, this poses a very difficult estimation problem, in which the vector `Q(X)` and the matrix `Q(X|Y=i)`
+    can be of very high dimensionality. In order to render the problem tractable, ReadMe performs bagging in
+    the feature space. ReadMe also combines bagging with bootstrap in order to derive confidence intervals
+    around point estimates.
+
+    We use the same default parameters as in the official
+    `R implementation `_.
+
+    :param prob_model: str ('naive', or 'full'), selects the modality in which the probabilities `Q(X)` and
+        `Q(X|Y)` are to be modelled. Options include "full", which corresponds to the original formulation of
+        ReadMe, in which X is constrained to be a binary matrix (e.g., of term presence/absence) and in which
+        `Q(X)` and `Q(X|Y)` are modelled, respectively, as matrices of `(2^K, 1)` and `(2^K, n)` values, where
+        `K` is the number of columns in the data matrix (i.e., `bagging_range`), and `n` is the number of classes.
+        Of course, this approach is computationally prohibitive for large `K`, so the computation is restricted to
+        data matrices with `K<=25` (although we recommend even smaller values of `K`). A much faster model is
+        "naive", which considers `Q(X)` and `Q(X|Y)` to be multinomial distributions under the `bag-of-words`
+        perspective. In this case, `bagging_range` can be set to much larger values. Default is "full" (i.e., the
+        original ReadMe behavior).
+    :param bootstrap_trials: int, number of bootstrap trials (default 300)
+    :param bagging_trials: int, number of bagging trials (default 300)
+    :param bagging_range: int, number of features to keep for each bagging trial (default 15)
+    :param confidence_level: float, a value in (0,1) reflecting the desired confidence level (default 0.95)
+    :param region: str in 'intervals', 'ellipse', 'ellipse-clr'; indicates the preferred method for
+        defining the confidence region (see :class:`WithConfidenceABC`)
+    :param random_state: int or None, allows replicability (default None)
+    :param verbose: bool, whether to display information during the process (default False)
+    """
+
+    MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION = 25
+    PROBABILISTIC_MODELS = ["naive", "full"]
+
+    def __init__(self,
+                 prob_model="full",
+                 bootstrap_trials=300,
+                 bagging_trials=300,
+                 bagging_range=15,
+                 confidence_level=0.95,
+                 region='intervals',
+                 random_state=None,
+                 verbose=False):
+        assert prob_model in ReadMe.PROBABILISTIC_MODELS, \
+            f'unknown {prob_model=}, valid ones are {ReadMe.PROBABILISTIC_MODELS=}'
+        self.prob_model = prob_model
+        self.bootstrap_trials = bootstrap_trials
+        self.bagging_trials = bagging_trials
+        self.bagging_range = bagging_range
+        self.confidence_level = confidence_level
+        self.region = region
+        self.random_state = random_state
+        self.verbose = verbose
+
+    def fit(self, X, y):
+        self._check_matrix(X)
+
+        self.rng = np.random.default_rng(self.random_state)
+        self.classes_ = np.unique(y)
+
+        Xsize = X.shape[0]
+
+        # Bootstrap loop
+        self.Xboots, self.yboots = [], []
+        for _ in range(self.bootstrap_trials):
+            idx = self.rng.choice(Xsize, size=Xsize, replace=True)
+            self.Xboots.append(X[idx])
+            self.yboots.append(y[idx])
+
+        return self
+
+    def predict_conf(self, X, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
+    def predict_conf(self, X, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
+        self._check_matrix(X)
+
+        n_features = X.shape[1]
+        boots_prevalences = []
+        for Xboots, yboots in tqdm(
+                zip(self.Xboots, self.yboots),
+                desc='bootstrap predictions', total=self.bootstrap_trials, disable=not self.verbose
+        ):
+            bagging_estimates = []
+            for _ in range(self.bagging_trials):
+                feat_idx = self.rng.choice(n_features, size=self.bagging_range, replace=False)
+                Xboots_bagging = Xboots[:, feat_idx]
+                Xtest_bagging = X[:, feat_idx]
+                bagging_prev = self._quantify_iteration(Xboots_bagging, yboots, Xtest_bagging)
+                bagging_estimates.append(bagging_prev)
+
+            boots_prevalences.append(np.mean(bagging_estimates, axis=0))
+
+        conf = WithConfidenceABC.construct_region(boots_prevalences, confidence_level, method=self.region)
+        prev_estim = conf.point_estimate()
+
+        return prev_estim, conf
+
+    def predict(self, X):
+        prev_estim, _ = self.predict_conf(X)
+        return prev_estim
+
+    def _quantify_iteration(self, Xtr, ytr, Xte):
+        """Single ReadMe estimate."""
+        PX_given_Y = np.asarray([self._compute_P(Xtr[ytr == c]) for c in self.classes_])
+        PX = self._compute_P(Xte)
+
+        res = lsq_linear(A=PX_given_Y.T, b=PX, bounds=(0, 1))
+        pY = np.maximum(res.x, 0)
+        return pY / pY.sum()
+
+    def _check_matrix(self, X):
+        """the "full" model requires estimating empirical distributions; due to the high computational cost,
+        this function is only made available for binary matrices"""
+        if self.prob_model == 'full' and not self._is_binary_matrix(X):
+            raise ValueError('the empirical distribution can only be computed efficiently on binary matrices')
+
+    def _is_binary_matrix(self, X):
+        data = X.data if sparse.issparse(X) else X
+        return np.all((data == 0) | (data == 1))
+
+    def _compute_P(self, X):
+        if self.prob_model == 'naive':
+            return self._multinomial_distribution(X)
+        elif self.prob_model == 'full':
+            return self._empirical_distribution(X)
+        else:
+            raise ValueError(f'unknown {self.prob_model}; valid ones are {ReadMe.PROBABILISTIC_MODELS=}')
+
+    def _empirical_distribution(self, X):
+        if X.shape[1] > self.MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION:
+            raise ValueError(f'the empirical distribution can only be computed efficiently for dimensions '
+                             f'less than or equal to {self.MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION}')
+
+        # we convert every binary row (e.g., 0 0 1 0 1) into the equivalent number (e.g., 5)
+        K = X.shape[1]
+        binary_powers = 1 << np.arange(K-1, -1, -1)  # (2^(K-1), ..., 8, 4, 2, 1)
+        X_as_binary_numbers = X @ binary_powers
+
+        # count occurrences and compute probs
+        counts = np.bincount(X_as_binary_numbers, minlength=2 ** K).astype(float)
+        probs = counts / counts.sum()
+        return probs
+
+    def _multinomial_distribution(self, X):
+        PX = np.asarray(X.sum(axis=0))
+        PX = normalize(PX, norm='l1', axis=1)
+        return PX.ravel()
+
+
+
 def _get_features_range(X):
     feat_ranges = []
     ncols = X.shape[1]
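The constrained least-squares step at the core of `_quantify_iteration` is compact enough to be worth seeing in isolation. The following self-contained sketch (synthetic data, illustrative names) recovers Q(Y=i) from Q(X) and Q(X|Y=i) exactly as the method does:

import numpy as np
from scipy.optimize import lsq_linear

rng = np.random.default_rng(0)
n_classes, n_buckets = 3, 8
PX_given_Y = rng.dirichlet(np.ones(n_buckets), size=n_classes)  # one distribution Q(X|Y=i) per class (rows)
true_prev = np.asarray([0.2, 0.5, 0.3])
PX = true_prev @ PX_given_Y                                     # Q(X) = sum_i Q(X|Y=i) Q(Y=i)

res = lsq_linear(A=PX_given_Y.T, b=PX, bounds=(0, 1))           # least-squares fit of Q(Y=i), constrained to [0,1]
prev = np.maximum(res.x, 0)
prev /= prev.sum()                                              # re-normalize onto the probability simplex
print(prev)                                                     # ~[0.2, 0.5, 0.3]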
diff --git a/quapy/method/stan/pq.stan b/quapy/method/stan/pq.stan
new file mode 100644
index 0000000..a5bc52a
--- /dev/null
+++ b/quapy/method/stan/pq.stan
@@ -0,0 +1,39 @@
+data {
+    int n_bucket;
+    array[n_bucket] int train_pos;
+    array[n_bucket] int train_neg;
+    array[n_bucket] int test;
+    int posterior;
+}
+
+transformed data{
+    row_vector[n_bucket] train_pos_rv;
+    row_vector[n_bucket] train_neg_rv;
+    row_vector[n_bucket] test_rv;
+    real n_test;
+
+    train_pos_rv = to_row_vector( train_pos );
+    train_neg_rv = to_row_vector( train_neg );
+    test_rv = to_row_vector( test );
+    n_test = sum( test );
+}
+
+parameters {
+    simplex[n_bucket] p_neg;
+    simplex[n_bucket] p_pos;
+    real<lower=0, upper=1> prev_prior;
+}
+
+model {
+    if( posterior ) {
+        target += train_neg_rv * log( p_neg );
+        target += train_pos_rv * log( p_pos );
+        target += test_rv * log( p_neg * ( 1 - prev_prior ) + p_pos * prev_prior );
+    }
+}
+
+generated quantities {
+    real prev;
+    prev = sum( binomial_rng(test, 1 / ( 1 + (p_neg./p_pos) * (1-prev_prior)/prev_prior ) ) ) / n_test;
+}
+
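For intuition, the generated-quantities line above can be read as the following numpy computation, where `posterior_pos` is the per-bucket posterior P(Y=1|bucket) implied by Bayes' rule (the values below are illustrative only):

import numpy as np

rng = np.random.default_rng(0)
p_neg = np.array([0.4, 0.3, 0.2, 0.1])     # per-bucket distribution under the negative class
p_pos = np.array([0.1, 0.2, 0.3, 0.4])     # per-bucket distribution under the positive class
prev_prior = 0.5                           # mixing weight Q(Y=1)
test = np.array([10, 20, 30, 40])          # per-bucket counts observed in the test sample

posterior_pos = 1 / (1 + (p_neg / p_pos) * (1 - prev_prior) / prev_prior)
prev = rng.binomial(test, posterior_pos).sum() / test.sum()   # posterior draw of the prevalence
print(prev)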
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index f02d9dc..0937fa8 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -1,7 +1,9 @@
 import itertools
 import signal
 from copy import deepcopy
+from enum import Enum
 from typing import Union, Callable
+from functools import wraps

 import numpy as np
 from sklearn import clone
@@ -10,10 +12,38 @@
 import quapy as qp
 from quapy import evaluation
 from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
 from quapy.data.base import LabelledCollection
-from quapy.method.aggregative import BaseQuantifier
+from quapy.method.aggregative import BaseQuantifier, AggregativeQuantifier
+from quapy.util import timeout
 from time import time


+class Status(Enum):
+    SUCCESS = 1
+    TIMEOUT = 2
+    INVALID = 3
+    ERROR = 4
+
+
+class ConfigStatus:
+
+    def __init__(self, params, status, msg=''):
+        self.params = params
+        self.status = status
+        self.msg = msg
+
+    def __str__(self):
+        return f':params:{self.params} :status:{self.status} ' + self.msg
+
+    def __repr__(self):
+        return str(self)
+
+    def success(self):
+        return self.status == Status.SUCCESS
+
+    def failed(self):
+        return self.status != Status.SUCCESS
+
+
 class GridSearchQ(BaseQuantifier):
     """Grid Search optimization targeting a quantification-oriented metric.

@@ -26,11 +56,14 @@ class GridSearchQ(BaseQuantifier):
     :param protocol: a sample generation protocol, an instance of :class:`quapy.protocol.AbstractProtocol`
     :param error: an error function (callable) or a string indicating the name of an error function (valid ones
         are those in :class:`quapy.error.QUANTIFICATION_ERROR`)
-    :param refit: whether or not to refit the model on the whole labelled collection (training+validation) with
+    :param refit: whether to refit the model on the whole labelled collection (training+validation) with
         the best chosen hyperparameter combination. Ignored if protocol='gen'
     :param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested.
         Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up
        being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.
+    :param raise_errors: boolean; if True, an exception is raised whenever a hyperparameter combination yields an
+        error; if False (default), the combination is simply marked with an error status and the process goes on.
+        However, if no configuration yields a valid model, then a ValueError exception will be raised.
     :param verbose: set to True to get information through the stdout
     """

@@ -42,6 +75,7 @@ class GridSearchQ(BaseQuantifier):
                  refit=True,
                  timeout=-1,
                  n_jobs=None,
+                 raise_errors=False,
                  verbose=False):

         self.model = model
@@ -50,15 +84,16 @@ class GridSearchQ(BaseQuantifier):
         self.refit = refit
         self.timeout = timeout
         self.n_jobs = qp._get_njobs(n_jobs)
+        self.raise_errors = raise_errors
         self.verbose = verbose
-        self.__check_error(error)
+        self.__check_error_measure(error)
         assert isinstance(protocol, AbstractProtocol), 'unknown protocol'

     def _sout(self, msg):
         if self.verbose:
             print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')

-    def __check_error(self, error):
+    def __check_error_measure(self, error):
         if error in qp.error.QUANTIFICATION_ERROR:
             self.error = error
         elif isinstance(error, str):
@@ -69,113 +104,194 @@ class GridSearchQ(BaseQuantifier):
             raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
                              f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')

-    def fit(self, training: LabelledCollection):
-        """ Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
-        the error metric.
+    def _prepare_classifier(self, cls_params):
+        model = deepcopy(self.model)

-        :param training: the training set on which to optimize the hyperparameters
-        :return: self
+        def job(cls_params):
+            model.set_params(**cls_params)
+            predictions = model.classifier_fit_predict(self._training_X, self._training_y)
+            return predictions
+
+        predictions, status, took = self._error_handler(job, cls_params)
+        self._sout(f'[classifier fit] hyperparams={cls_params} [took {took:.3f}s]')
+        return model, predictions, status, took
+
+    def _prepare_aggregation(self, args):
+        model, predictions, cls_took, cls_params, q_params = args
+        model = deepcopy(model)
+        params = {**cls_params, **q_params}
+
+        def job(q_params):
+            model.set_params(**q_params)
+            P, y = predictions
+            model.aggregation_fit(P, y)
+            score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
+            return score
+
+        score, status, aggr_took = self._error_handler(job, q_params)
+        self._print_status(params, score, status, aggr_took)
+        return model, params, score, status, (cls_took+aggr_took)
+
+    def _prepare_nonaggr_model(self, params):
+        model = deepcopy(self.model)
+
+        def job(params):
+            model.set_params(**params)
+            model.fit(self._training_X, self._training_y)
+            score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
+            return score
+
+        score, status, took = self._error_handler(job, params)
+        self._print_status(params, score, status, took)
+        return model, params, score, status, took
+
+    def _break_down_fit(self):
         """
-        params_keys = list(self.param_grid.keys())
-        params_values = list(self.param_grid.values())
+        Decides whether to break down the fit phase in two (classifier-fit followed by aggregation-fit).
+        In order to do so, some conditions should be met: a) the quantifier is of type aggregative,
+        b) the set of hyperparameters can be split into two disjoint non-empty groups.
-        protocol = self.protocol
+
+        :return: True if the conditions are met, False otherwise
+        """
+        if not isinstance(self.model, AggregativeQuantifier):
+            return False
+        cls_configs, q_configs = group_params(self.param_grid)
+        if (len(cls_configs) == 1) or (len(q_configs)==1):
+            return False
+        return True

-        self.param_scores_ = {}
-        self.best_score_ = None
+    def _compute_scores_aggregative(self, X, y):
+        # break down the set of hyperparameters into two: classifier-specific, quantifier-specific
+        cls_configs, q_configs = group_params(self.param_grid)

-        tinit = time()
+        # train all classifiers and get the predictions
+        self._training_X = X
+        self._training_y = y
+        cls_outs = qp.util.parallel(
+            self._prepare_classifier,
+            cls_configs,
+            seed=qp.environ.get('_R_SEED', None),
+            n_jobs=self.n_jobs,
+            asarray=False
+        )

-        hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
-        self._sout(f'starting model selection with {self.n_jobs =}')
-        #pass a seed to parallel so it is set in clild processes
-        scores = qp.util.parallel(
-            self._delayed_eval,
-            ((params, training) for params in hyper),
+        # filter out classifier configurations that yielded any error
+        success_outs = []
+        for (model, predictions, status, took), cls_config in zip(cls_outs, cls_configs):
+            if status.success():
+                success_outs.append((model, predictions, took, cls_config))
+            else:
+                self.error_collector.append(status)
+
+        if len(success_outs) == 0:
+            raise ValueError('No valid configuration found for the classifier!')
+
+        # explore the quantifier-specific hyperparameters for each valid training configuration
+        aggr_configs = [(*out, q_config) for out, q_config in itertools.product(success_outs, q_configs)]
+        aggr_outs = qp.util.parallel(
+            self._prepare_aggregation,
+            aggr_configs,
             seed=qp.environ.get('_R_SEED', None),
             n_jobs=self.n_jobs
         )

-        for params, score, model in scores:
-            if score is not None:
+        return aggr_outs
+
+    def _compute_scores_nonaggregative(self, X, y):
+        configs = expand_grid(self.param_grid)
+        self._training_X = X
+        self._training_y = y
+        scores = qp.util.parallel(
+            self._prepare_nonaggr_model,
+            configs,
+            seed=qp.environ.get('_R_SEED', None),
+            n_jobs=self.n_jobs
+        )
+        return scores
+
+    def _print_status(self, params, score, status, took):
+        if status.success():
+            self._sout(f'hyperparams=[{params}]\t got {self.error.__name__} = {score:.5f} [took {took:.3f}s]')
+        else:
+            self._sout(f'error={status}')
+
+    def fit(self, X, y):
+        """ Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
+        the error metric.
+
+        :param X: array-like, training covariates
+        :param y: array-like, labels of training data
+        :return: self
+        """
+
+        if self.refit and not isinstance(self.protocol, OnLabelledCollectionProtocol):
+            raise RuntimeWarning(
+                f'"refit" was requested, but the protocol does not implement '
+                f'the {OnLabelledCollectionProtocol.__name__} interface'
+            )
+
+        tinit = time()
+
+        self.error_collector = []
+
+        self._sout(f'starting model selection with n_jobs={self.n_jobs}')
+        if self._break_down_fit():
+            results = self._compute_scores_aggregative(X, y)
+        else:
+            results = self._compute_scores_nonaggregative(X, y)
+
+        self.param_scores_ = {}
+        self.best_score_ = None
+        for model, params, score, status, took in results:
+            if status.success():
                 if self.best_score_ is None or score < self.best_score_:
                     self.best_score_ = score
                     self.best_params_ = params
                     self.best_model_ = model
                 self.param_scores_[str(params)] = score
             else:
-                self.param_scores_[str(params)] = 'timeout'
+                self.param_scores_[str(params)] = status.status
+                self.error_collector.append(status)

-        tend = time()-tinit
+        self.fit_time_ = time()-tinit

         if self.best_score_ is None:
-            raise TimeoutError('no combination of hyperparameters seem to work')
+            raise ValueError('no combination of hyperparameters seemed to work')

         self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
-                   f'[took {tend:.4f}s]')
+                   f'[took {self.fit_time_:.4f}s]')
+
+        n_errors = len(self.error_collector)
+        if n_errors > 0:
+            self._sout(f'warning: {n_errors} errors found')
+            for err in self.error_collector:
+                self._sout(f'\t{str(err)}')

         if self.refit:
-            if isinstance(protocol, OnLabelledCollectionProtocol):
+            if isinstance(self.protocol, OnLabelledCollectionProtocol):
+                tinit = time()
                 self._sout(f'refitting on the whole development set')
-                self.best_model_.fit(training + protocol.get_labelled_collection())
+                validation_collection = self.protocol.get_labelled_collection()
+                training_collection = LabelledCollection(X, y, classes=validation_collection.classes)
+                devel_collection = training_collection + validation_collection
+                self.best_model_.fit(*devel_collection.Xy)
+                tend = time() - tinit
+                self.refit_time_ = tend
             else:
-                raise RuntimeWarning(f'"refit" was requested, but the protocol does not '
-                                     f'implement the {OnLabelledCollectionProtocol.__name__} interface')
+                # already checked
+                raise RuntimeWarning(f'the model cannot be refit on the whole dataset')

         return self

-    def _delayed_eval(self, args):
-        params, training = args
-
-        protocol = self.protocol
-        error = self.error
-
-        if self.timeout > 0:
-            def handler(signum, frame):
-                raise TimeoutError()
-
-            signal.signal(signal.SIGALRM, handler)
-
-        tinit = time()
-
-        if self.timeout > 0:
-            signal.alarm(self.timeout)
-
-        try:
-            model = deepcopy(self.model)
-            # overrides default parameters with the parameters being explored at this iteration
-            model.set_params(**params)
-            model.fit(training)
-            score = evaluation.evaluate(model, protocol=protocol, error_metric=error)
-
-            ttime = time()-tinit
-            self._sout(f'hyperparams={params}\t got {error.__name__} score {score:.5f} [took {ttime:.4f}s]')
-
-            if self.timeout > 0:
-                signal.alarm(0)
-        except TimeoutError:
-            self._sout(f'timeout ({self.timeout}s) reached for config {params}')
-            score = None
-        except ValueError as e:
-            self._sout(f'the combination of hyperparameters {params} is invalid')
-            raise e
-        except Exception as e:
-            self._sout(f'something went wrong for config {params}; skipping:')
-            self._sout(f'\tException: {e}')
-            score = None
-
-        return params, score, model
-
-
-    def quantify(self, instances):
+    def predict(self, X):
         """Estimate class prevalence values using the best model found after calling the :meth:`fit` method.

-        :param instances: sample contanining the instances
+        :param X: sample containing the instances
         :return: a ndarray of shape `(n_classes)` with class prevalence estimates as according to the best model found
             by the model selection process.
         """
         assert hasattr(self, 'best_model_'), 'predict called before fit'
-        return self.best_model().quantify(instances)
+        return self.best_model().predict(X)

     def set_params(self, **parameters):
         """Sets the hyper-parameters to explore.
@@ -203,7 +319,42 @@ class GridSearchQ(BaseQuantifier):
             return self.best_model_
         raise ValueError('best_model called before fit')

+    def _error_handler(self, func, params):
+        """
+        Wraps the execution of one job, returning, in addition to the job's output, its status and the time it
+        took to complete.
+
+        :param func: the function to be called
+        :param params: parameters of the function
+        :return: `tuple(out, status, time)` where `out` is the function output,
+            `status` is an enum value from `Status`, and `time` is the time it
+            took to complete the call
+        """
+
+        output = None
+        tinit = time()  # initialized before the try block, so it is defined even if the timeout context fails early
+
+        def _handle(status, exception):
+            if self.raise_errors:
+                raise exception
+            else:
+                return ConfigStatus(params, status, msg=str(exception))
+
+        try:
+            with timeout(self.timeout):
+                output = func(params)
+                status = ConfigStatus(params, Status.SUCCESS)
+
+        except TimeoutError as e:
+            status = _handle(Status.TIMEOUT, e)
+
+        except ValueError as e:
+            status = _handle(Status.INVALID, e)
+
+        except Exception as e:
+            status = _handle(Status.ERROR, e)
+
+        took = time() - tinit
+        return output, status, took


 def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfolds=3, random_state=0):
@@ -221,11 +372,51 @@
     total_prev = np.zeros(shape=data.n_classes)

     for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
-        quantifier.fit(train)
-        fold_prev = quantifier.quantify(test.X)
+        quantifier.fit(*train.Xy)
+        fold_prev = quantifier.predict(test.X)
         rel_size = 1. * len(test) / len(data)
         total_prev += fold_prev*rel_size

     return total_prev


+def expand_grid(param_grid: dict):
+    """
+    Expands a param_grid dictionary as a list of configurations.
+    Example:
+
+    >>> combinations = expand_grid({'A': [1, 10, 100], 'B': [True, False]})
+    >>> print(combinations)
+    [{'A': 1, 'B': True}, {'A': 1, 'B': False}, {'A': 10, 'B': True}, {'A': 10, 'B': False}, {'A': 100, 'B': True}, {'A': 100, 'B': False}]
+
+    :param param_grid: dictionary with keys representing hyper-parameter names, and values representing the range
+        to explore for that hyper-parameter
+    :return: a list of configurations, i.e., combinations of hyper-parameter assignments in the grid.
+ """ + params_keys = list(param_grid.keys()) + params_values = list(param_grid.values()) + configs = [{k: combs[i] for i, k in enumerate(params_keys)} for combs in itertools.product(*params_values)] + return configs + + +def group_params(param_grid: dict): + """ + Partitions a param_grid dictionary as two lists of configurations, one for the classifier-specific + hyper-parameters, and another for que quantifier-specific hyper-parameters + + :param param_grid: dictionary with keys representing hyper-parameter names, and values representing the range + to explore for that hyper-parameter + :return: two expanded grids of configurations, one for the classifier, another for the quantifier + """ + classifier_params, quantifier_params = {}, {} + for key, values in param_grid.items(): + if key.startswith('classifier__') or key == 'val_split': + classifier_params[key] = values + else: + quantifier_params[key] = values + + classifier_configs = expand_grid(classifier_params) + quantifier_configs = expand_grid(quantifier_params) + + return classifier_configs, quantifier_configs + diff --git a/quapy/plot.py b/quapy/plot.py index 7807f26..1f062e9 100644 --- a/quapy/plot.py +++ b/quapy/plot.py @@ -1,6 +1,6 @@ from collections import defaultdict import matplotlib.pyplot as plt -from matplotlib.cm import get_cmap +from matplotlib.pyplot import get_cmap import numpy as np from matplotlib import cm from scipy.stats import ttest_ind_from_stats @@ -23,21 +23,29 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No indicating which class is to be taken as the positive class. (For multiclass quantification problems, other plots like the :meth:`error_by_drift` might be preferable though). + The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same + length, with the ith element describing the output of an independent experiment. The elements of `true_prevs`, and + `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on + different datasets can be used, in which case the method name can appear more than once in `method_names`. + :param method_names: array-like with the method names for each experiment - :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for - each experiment - :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) - for each experiment - :param pos_class: index of the positive class - :param title: the title to be displayed in the plot - :param show_std: whether or not to show standard deviations (represented by color bands). This might be inconvenient + :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is a ndarray of + shape `(n_samples, n_classes)` components. + :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is a ndarray of + shape `(n_samples, n_classes)` components and `n_samples` must coincide with the corresponding entry in + `true_prevs`. + :param pos_class: index of the positive class (default 1) + :param title: the title to be displayed in the plot (default None) + :param show_std: whether to show standard deviations (represented by color bands). 
diff --git a/quapy/plot.py b/quapy/plot.py
index 7807f26..1f062e9 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 import matplotlib.pyplot as plt
-from matplotlib.cm import get_cmap
+from matplotlib.pyplot import get_cmap
 import numpy as np
 from matplotlib import cm
 from scipy.stats import ttest_ind_from_stats
@@ -23,21 +23,29 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
     indicating which class is to be taken as the positive class. (For multiclass quantification problems, other plots
     like the :meth:`error_by_drift` might be preferable though).

+    The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same
+    length, with the ith element describing the output of an independent experiment. The elements of `true_prevs` and
+    `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on
+    different datasets can be used, in which case the method name can appear more than once in `method_names`.
+
     :param method_names: array-like with the method names for each experiment
-    :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
-        each experiment
-    :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
-        for each experiment
-    :param pos_class: index of the positive class
-    :param title: the title to be displayed in the plot
-    :param show_std: whether or not to show standard deviations (represented by color bands). This might be inconvenient
+    :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`.
+    :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in
+        `true_prevs`.
+    :param pos_class: index of the positive class (default 1)
+    :param title: the title to be displayed in the plot (default None)
+    :param show_std: whether to show standard deviations (represented by color bands). This might be inconvenient
         for cases in which many methods are compared, or when the standard deviations are high -- default True)
-    :param legend: whether or not to display the leyend (default True)
-    :param train_prev: if indicated (default is None), the training prevalence (for the positive class) is hightlighted
-        in the plot. This is convenient when all the experiments have been conducted in the same dataset.
+    :param legend: whether to display the legend (default True)
+    :param train_prev: if indicated (default is None), the training prevalence (for the positive class) is highlighted
+        in the plot. This is convenient when all the experiments have been conducted in the same dataset, or in
+        datasets with the same training prevalence.
     :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
     :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
         listed in the legend and associated with matplotlib colors).
+    :return: the `(fig, ax)` matplotlib objects, for further customisation
     """
     fig, ax = plt.subplots()
     ax.set_aspect('equal')
@@ -78,13 +86,9 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No

     if legend:
         ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
-    # box = ax.get_position()
-    # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
-    # ax.legend(loc='lower center',
-    #           bbox_to_anchor=(1, -0.5),
-    #           ncol=(len(method_names)+1)//2)

     _save_or_show(savepath)
+    return fig, ax


 def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
@@ -92,14 +96,21 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title
     Box-plots displaying the global bias (i.e., signed error computed as the estimated value minus the true value)
     for each quantification method with respect to a given positive class.

+    The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same
+    length, with the ith element describing the output of an independent experiment. The elements of `true_prevs` and
+    `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on
+    different datasets can be used, in which case the method name can appear more than once in `method_names`.
+
     :param method_names: array-like with the method names for each experiment
-    :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
-        each experiment
-    :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
-        for each experiment
+    :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`.
+    :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in
+        `true_prevs`.
     :param pos_class: index of the positive class
-    :param title: the title to be displayed in the plot
+    :param title: the title to be displayed in the plot (default None)
     :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
+    :return: the `(fig, ax)` matplotlib objects, for further customisation
     """

     method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs)
@@ -120,25 +131,34 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title

     _save_or_show(savepath)

+    return fig, ax
+

 def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
                      vertical_xticks=False, legend=True, savepath=None):
     """
     Box-plots displaying the local bias (i.e., signed error computed as the estimated value minus the true value)
-    for different bins of (true) prevalence of the positive classs, for each quantification method.
+    for different bins of (true) prevalence of the positive class, for each quantification method.
+
+    The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same
+    length, with the ith element describing the output of an independent experiment. The elements of `true_prevs` and
+    `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on
+    different datasets can be used, in which case the method name can appear more than once in `method_names`.

     :param method_names: array-like with the method names for each experiment
-    :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
-        each experiment
-    :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
-        for each experiment
+    :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`.
+    :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in
+        `true_prevs`.
     :param pos_class: index of the positive class
-    :param title: the title to be displayed in the plot
-    :param nbins: number of bins
+    :param title: the title to be displayed in the plot (default None)
+    :param nbins: number of bins (default 5)
     :param colormap: the matplotlib colormap to use (default cm.tab10)
     :param vertical_xticks: whether or not to add a secondary grid (default is False)
     :param legend: whether or not to display the legend (default is True)
     :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
+    :return: the `(fig, ax)` matplotlib objects, for further customisation
     """
     from pylab import boxplot, plot, setp

@@ -210,13 +230,15 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N

     _save_or_show(savepath)

+    return fig, ax
+

 def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
                    n_bins=20, error_name='ae', show_std=False,
                    show_density=True,
                    show_legend=True,
                    logscale=False,
-                   title=f'Quantification error as a function of label shift',
+                   title=None,
                    vlines=None,
                    method_order=None,
                    fontsize=18,
@@ -228,11 +250,17 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
     fare in different regions of the prior probability shift spectrum (e.g., in the low-shift regime vs. in the
     high-shift regime).

+    The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same
+    length, with the ith element describing the output of an independent experiment.
+    The elements of `true_prevs` and
+    `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on
+    different datasets can be used, in which case the method name can appear more than once in `method_names`.
+
     :param method_names: array-like with the method names for each experiment
-    :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
-        each experiment
-    :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
-        for each experiment
+    :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`.
+    :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in
+        `true_prevs`.
     :param tr_prevs: training prevalence of each experiment
     :param n_bins: number of bins in which the y-axis is to be divided (default is 20)
     :param error_name: a string representing the name of an error function (as defined in `quapy.error`, default is "ae")
@@ -240,12 +268,13 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
     :param show_density: whether or not to display the distribution of experiments for each bin (default is True)
     :param show_legend: whether or not to display the legend of the chart (default is True)
     :param logscale: whether or not to log-scale the y-error measure (default is False)
-    :param title: title of the plot (default is "Quantification error as a function of distribution shift")
+    :param title: title of the plot (default is None)
     :param vlines: array-like list of values (default is None). If indicated, highlights some regions of the space
         using vertical dotted lines.
     :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
         listed in the legend and associated with matplotlib colors).
     :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
+    :return: the `(fig, ax)` matplotlib objects, for further customisation
     """

     plt.rcParams['font.size'] = fontsize
@@ -256,15 +285,15 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
     x_error = qp.error.ae
     y_error = getattr(qp.error, error_name)

+    if method_order is None:
+        method_order = []
+
     # get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
     # order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
     # x_error function) and 'y' is the estim-test shift (computed as according to y_error)
     data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)

-    if method_order is None:
-        method_order = method_names
-
-    # _set_colors(ax, n_methods=len(method_order))
+    _set_colors(ax, n_methods=len(method_order))

     bins = np.linspace(0, 1, n_bins+1)
     binwidth = 1 / n_bins
@@ -322,8 +351,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
     ax.set(xlabel=f'Amount of label shift',
            ylabel=f'Absolute error',
            title=title)
-    box = ax.get_position()
-    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+    # box = ax.get_position()
+    # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
     if vlines:
         for vline in vlines:
             ax.axvline(vline, 0, 1, linestyle='--', color='k')
@@ -333,7 +362,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
         # nice scale for the logarithmic axis
         ax.set_ylim(0,10 ** math.ceil(math.log10(max_y)))

-
     if show_legend:
         ax.legend(loc='center right', bbox_to_anchor=(1.31, 0.5))
         # fig.legend(loc='lower center',
@@ -342,6 +370,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,

     _save_or_show(savepath)

+    return fig, ax
+

 def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, binning='isomerous',
@@ -357,11 +387,17 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
     plot is displayed on top, that displays the distribution of experiments for each bin (when binning="isometric")
     or the percentile points of the distribution (when binning="isomerous").

+    The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same
+    length, with the ith element describing the output of an independent experiment. The elements of `true_prevs` and
+    `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on
+    different datasets can be used, in which case the method name can appear more than once in `method_names`.
+
     :param method_names: array-like with the method names for each experiment
-    :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
-        each experiment
-    :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
-        for each experiment
+    :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`.
+    :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is a ndarray of
+        shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in
+        `true_prevs`.
     :param tr_prevs: training prevalence of each experiment
     :param n_bins: number of bins in which the y-axis is to be divided (default is 20)
     :param binning: type of binning, either "isomerous" (default) or "isometric"
@@ -378,13 +414,16 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
     :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
         listed in the legend and associated with matplotlib colors).
     :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
-    :return:
+    :return: the `(fig, ax)` matplotlib objects, for further customisation
     """
     assert binning in ['isomerous', 'isometric'], 'unknown binning type; valid types are "isomerous" and "isometric"'

     x_error = getattr(qp.error, x_error)
     y_error = getattr(qp.error, y_error)

+    if method_order is None:
+        method_order = []
+
     # get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
     # order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
     # x_error function) and 'y' is the estim-test shift (computed as according to y_error)
@@ -525,6 +564,8 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs

     _save_or_show(savepath)

+    return fig, ax
+

 def _merge(method_names, true_prevs, estim_prevs):
     ndims = true_prevs[0].shape[1]
@@ -542,8 +583,9 @@ def _merge(method_names, true_prevs, estim_prevs):

 def _set_colors(ax, n_methods):
     NUM_COLORS = n_methods
-    cm = plt.get_cmap('tab20')
-    ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
+    if NUM_COLORS>10:
+        cm = plt.get_cmap('tab20')
+        ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])


 def _save_or_show(savepath):
@@ -574,4 +616,40 @@ def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error
             if method not in method_order:
                 method_order.append(method)

-    return data
\ No newline at end of file
+    return data
+
+
+def calibration_plot(prob_classifier, X, y, nbins=10, savepath=None):
+    posteriors = prob_classifier.predict_proba(X)
+    assert posteriors.ndim == 2 and posteriors.shape[1] == 2, 'calibration plot only works for binary problems'
+    posteriors = posteriors[:, 1]
+    pred_y = posteriors >= 0.5
+    bins = np.linspace(0, 1, nbins + 1)
+    # np.digitize returns 1-based bin indexes; shift and clip them into the range [0, nbins-1]
+    binned_values = np.clip(np.digitize(posteriors, bins, right=False) - 1, 0, nbins - 1)
+    correct = pred_y == y
+    bin_centers = (bins[:-1] + bins[1:]) / 2
+    bins_names = np.arange(nbins)
+    y_axis = [correct[binned_values == bin].mean() for bin in bins_names]
+    y_axis = [v if not np.isnan(v) else 0 for v in y_axis]
+    # bar plot of the per-bin accuracy
+    plt.bar(bin_centers, y_axis, width=bins[1] - bins[0], edgecolor='black', alpha=0.7)
+
+    # labels and title
+    plt.xlabel("Posterior probability bin")
+    plt.ylabel("Accuracy")
+    plt.title("Bar plot of classifier accuracy per posterior bin")
+    plt.xticks(bin_centers, [f"{b:.2f}" for b in bin_centers], rotation=45)
+
+    # save to disk, or display
+    plt.tight_layout()
+    _save_or_show(savepath)
+
+
+if __name__ == '__main__':
+    import quapy as qp
+    from sklearn.linear_model import LogisticRegression
+    data = qp.datasets.fetch_UCIBinaryDataset(qp.datasets.UCI_BINARY_DATASETS[6])
+    train, test = data.train_test
+    classifier = LogisticRegression()
+    classifier.fit(*train.Xy)
+    calibration_plot(classifier, *test.Xy)
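A quick sanity check for the binning convention used in calibration_plot above: np.digitize returns 1-based indexes, hence the -1 shift and the clip. A small standalone sketch:

import numpy as np

bins = np.linspace(0, 1, 11)                          # 10 bins over [0, 1]
vals = np.array([0.0, 0.05, 0.95, 1.0])
idx = np.clip(np.digitize(vals, bins, right=False) - 1, 0, 9)
print(idx)                                            # [0 0 9 9]: the extremes fall in the first/last bin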
diff --git a/quapy/protocol.py b/quapy/protocol.py
index 7d7d1df..9a7e5c4 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -1,4 +1,6 @@
 from copy import deepcopy
+from typing import Iterable
+
 import quapy as qp
 import numpy as np
 import itertools
@@ -62,6 +64,36 @@ class IterateProtocol(AbstractProtocol):
         return len(self.samples)


+class ProtocolFromIndex(AbstractProtocol):
+    """
+    A protocol that generates samples from a list of precomputed indexes
+
+    :param data: a :class:`quapy.data.base.LabelledCollection`
+    :param indexes: a list of indexes
+    """
+    def __init__(self, data: LabelledCollection, indexes: Iterable):
+        self.data = data
+        self.indexes = indexes
+
+    def __call__(self):
+        """
+        Yields one sample at a time extracted using the indexes
+
+        :return: yields one tuple `(sample, prev)` at a time, where `sample` is a set of instances
+            and `prev` is an `np.ndarray` with the class prevalence values
+        """
+        for index in self.indexes:
+            yield self.data.sampling_from_index(index).Xp
+
+    def total(self):
+        """
+        Returns the number of samples in this protocol
+
+        :return: int
+        """
+        return len(self.indexes)
+
+
 class AbstractStochasticSeededProtocol(AbstractProtocol):
     """
     An `AbstractStochasticSeededProtocol` is a protocol that generates, via any random procedure (e.g.,
@@ -124,9 +156,9 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
             if self.random_state is not None:
                 stack.enter_context(qp.util.temp_seed(self.random_state))
             for params in self.samples_parameters():
-                yield self.collator(self.sample(params))
+                yield self.collator(self.sample(params), params)

-    def collator(self, sample, *args):
+    def collator(self, sample, params):
         """
         The collator prepares the sample to accommodate the desired output format before returning the output.
         This collator simply returns the sample as it is. Classes inheriting from this abstract class can
@@ -191,9 +223,11 @@ class OnLabelledCollectionProtocol:
         assert return_type in cls.RETURN_TYPES, \
             f'unknown return type passed as argument; valid ones are {cls.RETURN_TYPES}'
         if return_type=='sample_prev':
-            return lambda lc:lc.Xp
+            return lambda lc,params:lc.Xp
         elif return_type=='labelled_collection':
-            return lambda lc:lc
+            return lambda lc,params:lc
+        elif return_type=='index':
+            return lambda lc,params:params


 class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
@@ -257,8 +291,9 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
         """
         dimensions = self.data.n_classes
         s = F.prevalence_linspace(self.n_prevalences, repeats=1, smooth_limits_epsilon=self.smooth_limits_epsilon)
+        eps = (s[1]-s[0])/2  # half a grid step, to handle floating-point rounding
         s = [s] * (dimensions - 1)
-        prevs = [p for p in itertools.product(*s, repeat=1) if (sum(p) <= 1.0)]
+        prevs = [p for p in itertools.product(*s, repeat=1) if (sum(p) < (1.+eps))]
         prevs = np.asarray(prevs).reshape(len(prevs), -1)
         if self.repeats > 1:
             prevs = np.repeat(prevs, self.repeats, axis=0)
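The eps guard added to APP.samples_parameters above exists because grid values need not sum to exactly 1.0 in floating-point arithmetic. A minimal sketch (using a plain linspace grid for illustration; the actual grid comes from F.prevalence_linspace):

import numpy as np

s = np.linspace(0, 1, 11)       # candidate prevalence values 0.0, 0.1, ..., 1.0
p = (s[3], s[3], s[4])          # 0.3 + 0.3 + 0.4, a valid combination
print(sum(p))                   # 1.0000000000000002 on most platforms
print(sum(p) <= 1.0)            # False: the old test would wrongly discard it
eps = (s[1] - s[0]) / 2
print(sum(p) < 1. + eps)        # True: the corrected test keeps it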
diff --git a/quapy/tests/test_base.py b/quapy/tests/test_base.py
index 4fd9faa..7e2b4f8 100644
--- a/quapy/tests/test_base.py
+++ b/quapy/tests/test_base.py
@@ -1,5 +1,11 @@
-import pytest
+import unittest

-def test_import():
-    import quapy as qp
-    assert qp.__version__ is not None
+
+class ImportTest(unittest.TestCase):
+    def test_import(self):
+        import quapy as qp
+        self.assertIsNotNone(qp.__version__)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py
index b0c2f7a..de5f61a 100644
--- a/quapy/tests/test_datasets.py
+++ b/quapy/tests/test_datasets.py
@@ -1,52 +1,142 @@
-import pytest
+import os
+import unittest

-from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
-    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, \
-    fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+import quapy.functional as F
+from quapy.method.aggregative import PCC
+from quapy.data.datasets import *


-@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
-def test_fetch_reviews(dataset_name):
-    dataset = fetch_reviews(dataset_name)
-    print(f'Dataset {dataset_name}')
-    print('Training set stats')
-    dataset.training.stats()
-    print('Test set stats')
-    dataset.test.stats()
+class TestDatasets(unittest.TestCase):

+    def new_quantifier(self):
+        return PCC(LogisticRegression(C=0.001, max_iter=100))

-@pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN)
-def test_fetch_twitter(dataset_name):
-    try:
-        dataset = fetch_twitter(dataset_name)
-    except ValueError as ve:
-        if dataset_name == 'semeval' and ve.args[0].startswith(
-                'dataset "semeval" can only be used for model selection.'):
-            dataset = fetch_twitter(dataset_name, for_model_selection=True)
-    print(f'Dataset {dataset_name}')
-    print('Training set stats')
-    dataset.training.stats()
-    print('Test set stats')
+    def _check_dataset(self, dataset):
+        train, test = dataset.reduce().train_test
+        q = self.new_quantifier()
+        print(f'testing method {q} in {dataset.name}...', end='')
+        if len(train) > 500:
+            train = train.sampling(500)
+        q.fit(*train.Xy)
+        estim_prevalences = q.predict(test.instances)
+        self.assertTrue(F.check_prevalence_vector(estim_prevalences))
+        print(f'[done]')

+    def _check_samples(self, gen, q, max_samples_test=5, vectorizer=None):
+        for X, p in gen():
+            if vectorizer is not None:
+                X = vectorizer.transform(X)
+            estim_prevalences = q.predict(X)
+            self.assertTrue(F.check_prevalence_vector(estim_prevalences))
+            max_samples_test -= 1
+            if max_samples_test == 0:
+                break

-@pytest.mark.parametrize('dataset_name', UCI_DATASETS)
-def test_fetch_UCIDataset(dataset_name):
-    try:
-        dataset = fetch_UCIDataset(dataset_name)
-    except FileNotFoundError as fnfe:
-        if dataset_name == 'pageblocks.5' and fnfe.args[0].find(
-                'If this is the first time you attempt to load this dataset') > 0:
-            print('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.')
+    def test_reviews(self):
+        for dataset_name in REVIEWS_SENTIMENT_DATASETS:
+            print(f'loading dataset {dataset_name}...', end='')
+            dataset = fetch_reviews(dataset_name, tfidf=True, min_df=10)
+            dataset.stats()
+            dataset.reduce()
+            print(f'[done]')
+            self._check_dataset(dataset)
+
+    def test_twitter(self):
+        # all the datasets are contained in the same resource; if the first one
+        # works, there is no need to test for the rest
+        for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST[:1]:
+            print(f'loading dataset {dataset_name}...', end='')
+            dataset = fetch_twitter(dataset_name, min_df=10)
+            dataset.stats()
+            dataset.reduce()
+            print(f'[done]')
+            self._check_dataset(dataset)
+
+    def test_UCIBinaryDataset(self):
+        for dataset_name in UCI_BINARY_DATASETS:
+            print(f'loading dataset {dataset_name}...', end='')
+            dataset = fetch_UCIBinaryDataset(dataset_name)
+            dataset.stats()
+            dataset.reduce()
+            print(f'[done]')
+            self._check_dataset(dataset)
+
+    def test_UCIMultiDataset(self):
+        for dataset_name in UCI_MULTICLASS_DATASETS:
+            print(f'loading dataset {dataset_name}...', end='')
+            dataset = fetch_UCIMulticlassDataset(dataset_name)
+            dataset.stats()
+            n_classes = dataset.n_classes
+            uniform_prev = F.uniform_prevalence(n_classes)
+            dataset.training = dataset.training.sampling(100, *uniform_prev)
+            dataset.test = dataset.test.sampling(100, *uniform_prev)
+            print(f'[done]')
+            self._check_dataset(dataset)
+
+    def test_lequa2022(self):
+        if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'):
+            print("omitting test_lequa2022 because QUAPY_TESTS_OMIT_LARGE_DATASETS is set")
             return
-    print(f'Dataset {dataset_name}')
-    print('Training set stats')
-    dataset.training.stats()
-    print('Test set stats')
+
+        for dataset_name in LEQUA2022_VECTOR_TASKS:
+            print(f'LeQua2022: loading dataset {dataset_name}...', end='')
+            train, gen_val, gen_test = fetch_lequa2022(dataset_name)
+            train.stats()
+            n_classes = train.n_classes
+            train = train.sampling(100, *F.uniform_prevalence(n_classes))
+            q = self.new_quantifier()
+            q.fit(*train.Xy)
+            self._check_samples(gen_val, q, max_samples_test=5)
+            self._check_samples(gen_test, q, max_samples_test=5)
+
+        for dataset_name in LEQUA2022_TEXT_TASKS:
+            print(f'LeQua2022: loading dataset {dataset_name}...', end='')
+            train, gen_val, gen_test = fetch_lequa2022(dataset_name)
+            train.stats()
+            n_classes = train.n_classes
+            train = train.sampling(100, *F.uniform_prevalence(n_classes))
+            tfidf = TfidfVectorizer()
+            train.instances = tfidf.fit_transform(train.instances)
+            q = self.new_quantifier()
+            q.fit(*train.Xy)
+            self._check_samples(gen_val, q, max_samples_test=5, vectorizer=tfidf)
+            self._check_samples(gen_test, q, max_samples_test=5, vectorizer=tfidf)
+
+    def test_lequa2024(self):
+        if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'):
+            print("omitting test_lequa2024 because QUAPY_TESTS_OMIT_LARGE_DATASETS is set")
+            return
+
+        for task in LEQUA2024_TASKS:
+            print(f'LeQua2024: loading task {task}...', end='')
+            train, gen_val, gen_test = fetch_lequa2024(task, merge_T3=True)
+            train.stats()
+            n_classes = train.n_classes
+            train = train.sampling(100, *F.uniform_prevalence(n_classes))
+            q = self.new_quantifier()
+            q.fit(*train.Xy)
+            self._check_samples(gen_val, q, max_samples_test=5)
+            self._check_samples(gen_test, q, max_samples_test=5)

-@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
-def test_fetch_lequa2022(dataset_name):
-    train, gen_val, gen_test = fetch_lequa2022(dataset_name)
-    print(train.stats())
-    print('Val:', gen_val.total())
-    print('Test:', gen_test.total())
+    def test_IFCB(self):
+        if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'):
+            print("omitting test_IFCB because QUAPY_TESTS_OMIT_LARGE_DATASETS is set")
+            return
+
+        print(f'loading dataset IFCB.')
+        for mod_sel in [False, True]:
+            train, gen = fetch_IFCB(single_sample_train=True, for_model_selection=mod_sel)
+            train.stats()
+            n_classes = train.n_classes
+            train = train.sampling(100, *F.uniform_prevalence(n_classes))
+            q = self.new_quantifier()
+            q.fit(*train.Xy)
+            self._check_samples(gen, q, max_samples_test=5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py
index 5c50218..05d661a 100644
--- a/quapy/tests/test_evaluation.py
+++ b/quapy/tests/test_evaluation.py
@@ -6,14 +6,17 @@
 import quapy as qp
 from sklearn.linear_model import LogisticRegression
 from time import time
-from quapy.error import QUANTIFICATION_ERROR_SINGLE, QUANTIFICATION_ERROR, QUANTIFICATION_ERROR_NAMES, \
-    QUANTIFICATION_ERROR_SINGLE_NAMES
+from quapy.error import QUANTIFICATION_ERROR_SINGLE_NAMES
 from quapy.method.aggregative import EMQ, PCC
 from quapy.method.base import BaseQuantifier


 class EvalTestCase(unittest.TestCase):
+
     def test_eval_speedup(self):
+        """
+        Checks whether the speed-up heuristics used by qp.evaluation work, i.e., actually save time
+        """
         data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
         train, test = data.training, data.test
@@ -26,7 +29,7 @@ class EvalTestCase(unittest.TestCase):
                 time.sleep(1)
                 return super().predict_proba(X)

-        emq = EMQ(SlowLR()).fit(train)
+        emq = EMQ(SlowLR()).fit(*train.Xy)

         tinit = time()
         score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force')
@@ -38,14 +41,14 @@ class EvalTestCase(unittest.TestCase):
             def __init__(self, cls):
                 self.emq = EMQ(cls)

-            def quantify(self, instances):
-                return self.emq.quantify(instances)
+            def predict(self, X):
+                return self.emq.predict(X)

-            def fit(self, data):
-                self.emq.fit(data)
+            def fit(self, X, y):
+                self.emq.fit(X, y)
                 return self

-        emq = NonAggregativeEMQ(SlowLR()).fit(train)
+        emq = NonAggregativeEMQ(SlowLR()).fit(*train.Xy)

         tinit = time()
         score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
@@ -55,15 +58,18 @@ class EvalTestCase(unittest.TestCase):
         self.assertEqual(tend_no_optim>(tend_optim/2), True)

     def test_evaluation_output(self):
+        """
+        Checks the evaluation functions return correct types for different error_metrics
+        """
-        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
+        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True).reduce(n_train=100, n_test=100)
         train, test = data.training, data.test

         qp.environ['SAMPLE_SIZE']=100

         protocol = qp.protocol.APP(test, random_state=0)

-        q = PCC(LogisticRegression()).fit(train)
+        q = PCC(LogisticRegression()).fit(*train.Xy)

         single_errors = list(QUANTIFICATION_ERROR_SINGLE_NAMES)
         averaged_errors = ['m'+e for e in single_errors]
@@ -79,6 +85,5 @@ class EvalTestCase(unittest.TestCase):
             self.assertEqual(scores.mean(), score)

-
 if __name__ == '__main__':
     unittest.main()
diff --git a/quapy/tests/test_hierarchy.py b/quapy/tests/test_hierarchy.py
index 2ea3af5..7a2d07e 100644
--- a/quapy/tests/test_hierarchy.py
+++ b/quapy/tests/test_hierarchy.py
@@ -1,31 +1,46 @@
 import unittest
-
 from sklearn.linear_model import LogisticRegression
-import quapy as qp
+from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS
 from quapy.method.aggregative import *
-
+import inspect

 class HierarchyTestCase(unittest.TestCase):

     def test_aggregative(self):
-        lr = LogisticRegression()
-        for m in [CC(lr), PCC(lr), ACC(lr), PACC(lr)]:
-            self.assertEqual(isinstance(m, AggregativeQuantifier), True)
+        for m in AGGREGATIVE_METHODS:
+            self.assertEqual(isinstance(m(), AggregativeQuantifier), True)
+
+    def test_inspect_aggregative(self):
+
+        import quapy.method.aggregative as methods
+
+        members = inspect.getmembers(methods)
+        classes = set([cls for name, cls in members if inspect.isclass(cls)])
+        quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)]
+        quantifiers = [cls for cls in quantifiers if issubclass(cls, AggregativeQuantifier)]
+        quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls)]
+        quantifiers = [cls for cls in quantifiers if cls is not OneVsAllAggregative]
+
+        for cls in quantifiers:
+            self.assertIn(cls, AGGREGATIVE_METHODS)

     def test_binary(self):
         lr = LogisticRegression()
-        for m in [HDy(lr)]:
-            self.assertEqual(isinstance(m, BinaryQuantifier), True)
+        for m in BINARY_METHODS:
+            self.assertEqual(isinstance(m(lr), BinaryQuantifier), True)

     def test_probabilistic(self):
         lr = LogisticRegression()
         for m in [CC(lr), ACC(lr)]:
-            self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), False)
+            self.assertEqual(isinstance(m, AggregativeCrispQuantifier), True)
+            self.assertEqual(isinstance(m, AggregativeSoftQuantifier), False)

         for m in [PCC(lr), PACC(lr)]:
-            self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), True)
+            self.assertEqual(isinstance(m, AggregativeCrispQuantifier), False)
+            self.assertEqual(isinstance(m, AggregativeSoftQuantifier), True)


 if __name__ == '__main__':
     unittest.main()
+
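The crisp/soft split exercised by test_probabilistic above can be summarized in a few lines (a sketch using only classes already imported in that test file):

from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import CC, PCC, AggregativeCrispQuantifier, AggregativeSoftQuantifier

lr = LogisticRegression()
# CC aggregates crisp label predictions; PCC aggregates posterior probabilities
assert isinstance(CC(lr), AggregativeCrispQuantifier)
assert isinstance(PCC(lr), AggregativeSoftQuantifier)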
diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py
index bca34e3..3e4149e 100644
--- a/quapy/tests/test_methods.py
+++ b/quapy/tests/test_methods.py
@@ -1,235 +1,130 @@
-import numpy as np
-import pytest
+import itertools
+import unittest
+
 from sklearn.linear_model import LogisticRegression
-from sklearn.svm import LinearSVC

 import quapy as qp
-from quapy.model_selection import GridSearchQ
-from quapy.method.base import BinaryQuantifier
-from quapy.data import Dataset, LabelledCollection
-from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS
+from quapy.method.aggregative import ACC
 from quapy.method.meta import Ensemble
-from quapy.protocol import APP
-from quapy.method.aggregative import DMy
-from quapy.method.meta import MedianEstimator
+from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS, NON_AGGREGATIVE_METHODS
+from quapy.functional import check_prevalence_vector

-datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
-            pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
+# a random selection of composed methods to test the qunfold integration
+from quapy.method.composable import check_compatible_qunfold_version

-tinydatasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True).reduce(), id='tiny_hcr'),
-                pytest.param(qp.datasets.fetch_UCIDataset('ionosphere').reduce(), id='tiny_ionosphere')]
+from quapy.method.composable import (
+    ComposableQuantifier,
+    LeastSquaresLoss,
+    HellingerSurrogateLoss,
+    ClassRepresentation,
+    HistogramRepresentation,
+    CVClassifier
+)

-learners = [LogisticRegression, LinearSVC]
+COMPOSABLE_METHODS = [
+    ComposableQuantifier(  # ACC
+        LeastSquaresLoss(),
+        ClassRepresentation(CVClassifier(LogisticRegression()))
+    ),
+    ComposableQuantifier(  # HDy
+        HellingerSurrogateLoss(),
+        HistogramRepresentation(
+            3,  # 3 bins per class
+            preprocessor = ClassRepresentation(CVClassifier(LogisticRegression()))
+        )
+    ),
+]
+
+
+class TestMethods(unittest.TestCase):
+
+    tiny_dataset_multiclass = qp.datasets.fetch_UCIMulticlassDataset('academic-success').reduce(n_test=10)
+    tiny_dataset_binary = qp.datasets.fetch_UCIBinaryDataset('ionosphere').reduce(n_test=10)
+    datasets = [tiny_dataset_binary, tiny_dataset_multiclass]
+
+    def test_aggregative(self):
+        for dataset in TestMethods.datasets:
+            learner = LogisticRegression()
+            learner.fit(*dataset.training.Xy)
+
+            for model in AGGREGATIVE_METHODS:
+                if not dataset.binary and model in BINARY_METHODS:
+                    print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}')
+                    continue
+
+                q = model(learner, fit_classifier=False)
+                print('testing', q)
+                q.fit(*dataset.training.Xy)
+                estim_prevalences = q.predict(dataset.test.X)
+                self.assertTrue(check_prevalence_vector(estim_prevalences))
+
+    def test_non_aggregative(self):
+        for dataset in TestMethods.datasets:
+
+            for model in NON_AGGREGATIVE_METHODS:
+                if not dataset.binary and model in BINARY_METHODS:
+                    print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}')
+                    continue
+
+                q = model()
+                print(f'testing {q} on dataset {dataset.name}')
+                q.fit(*dataset.training.Xy)
+                estim_prevalences = q.predict(dataset.test.X)
+                self.assertTrue(check_prevalence_vector(estim_prevalences))
+
+    def test_ensembles(self):
+        qp.environ['SAMPLE_SIZE'] = 10
+
+        base_quantifier = ACC(LogisticRegression())
+        for dataset, policy in itertools.product(TestMethods.datasets, Ensemble.VALID_POLICIES):
+            if not dataset.binary and policy == 'ds':
+                print(f'skipping the test of policy ds on non-binary dataset {dataset}')
+                continue
+
+            print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}')
+            ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
+            ensemble.fit(*dataset.training.Xy)
+            estim_prevalences = ensemble.predict(dataset.test.instances)
+            self.assertTrue(check_prevalence_vector(estim_prevalences))
+
+    def test_quanet(self):
+        try:
+            import quapy.classification.neural
+        except ModuleNotFoundError:
+            print('the torch package is not installed; skipping unit test for QuaNet')
+            return
+
+        qp.environ['SAMPLE_SIZE'] = 10
+
+        # load the kindle dataset as text, and convert words to numerical indexes
+        dataset = qp.datasets.fetch_reviews('kindle', pickle=True).reduce()
+        qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
+
+        from quapy.classification.neural import CNNnet
+        cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
+
+        from quapy.classification.neural import NeuralClassifierTrainer
+        learner = NeuralClassifierTrainer(cnn, device='cpu')
+
+        from quapy.method.meta import QuaNet
+        model = QuaNet(learner, device='cpu', n_epochs=2, tr_iter_per_poch=10, va_iter_per_poch=10, patience=2)
+
+        model.fit(*dataset.training.Xy)
+        estim_prevalences = model.predict(dataset.test.instances)
+        self.assertTrue(check_prevalence_vector(estim_prevalences))
+
+    def test_composable(self):
+        if check_compatible_qunfold_version():
+            for dataset in TestMethods.datasets:
+                for q in COMPOSABLE_METHODS:
+                    print('testing', q)
+                    q.fit(*dataset.training.Xy)
+                    estim_prevalences = q.predict(dataset.test.X)
+                    print(estim_prevalences)
+                    self.assertTrue(check_prevalence_vector(estim_prevalences))
+        else:
+            from quapy.method.composable import __old_version_message
+            print(__old_version_message)

-@pytest.mark.parametrize('dataset', datasets)
-@pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS)
-@pytest.mark.parametrize('learner', learners)
-def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
-    model = aggregative_method(learner())
-
-    if isinstance(model, BinaryQuantifier) and not dataset.binary:
-        print(f'skipping the test of binary model {type(model)} on non-binary dataset {dataset}')
-        return
-
-    model.fit(dataset.training)
-
-    estim_prevalences = model.quantify(dataset.test.instances)
-
-    true_prevalences = dataset.test.prevalence()
-    error = qp.error.mae(true_prevalences, estim_prevalences)
-
-    assert type(error) == np.float64
-
-
-@pytest.mark.parametrize('dataset', datasets)
-@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS)
-def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
-    model = non_aggregative_method()
-
-    if isinstance(model, BinaryQuantifier) and not dataset.binary:
-        print(f'skipping the test of binary model {model} on non-binary dataset {dataset}')
-        return
-
-    model.fit(dataset.training)
-
model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -@pytest.mark.parametrize('base_method', AGGREGATIVE_METHODS) -@pytest.mark.parametrize('learner', [LogisticRegression]) -@pytest.mark.parametrize('dataset', tinydatasets) -@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES) -def test_ensemble_method(base_method, learner, dataset: Dataset, policy): - qp.environ['SAMPLE_SIZE'] = 20 - base_quantifier=base_method(learner()) - if isinstance(base_quantifier, BinaryQuantifier) and not dataset.binary: - print(f'skipping the test of binary model {base_quantifier} on non-binary dataset {dataset}') - return - if not dataset.binary and policy=='ds': - print(f'skipping the test of binary policy ds on non-binary dataset {dataset}') - return - model = Ensemble(quantifier=base_quantifier, size=5, policy=policy, n_jobs=-1) - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -def test_quanet_method(): - try: - import quapy.classification.neural - except ModuleNotFoundError: - print('skipping QuaNet test due to missing torch package') - return - - - qp.environ['SAMPLE_SIZE'] = 100 - - # load the kindle dataset as text, and convert words to numerical indexes - dataset = qp.datasets.fetch_reviews('kindle', pickle=True) - dataset = Dataset(dataset.training.sampling(200, *dataset.training.prevalence()), - dataset.test.sampling(200, *dataset.test.prevalence())) - qp.data.preprocessing.index(dataset, min_df=5, inplace=True) - - from quapy.classification.neural import CNNnet - cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) - - from quapy.classification.neural import NeuralClassifierTrainer - learner = NeuralClassifierTrainer(cnn, device='cuda') - - from quapy.method.meta import QuaNet - model = QuaNet(learner, device='cuda') - - if isinstance(model, BinaryQuantifier) and not dataset.binary: - print(f'skipping the test of binary model {model} on non-binary dataset {dataset}') - return - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -def test_str_label_names(): - model = qp.method.aggregative.CC(LogisticRegression()) - - dataset = qp.datasets.fetch_reviews('imdb', pickle=True) - dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()), - dataset.test.sampling(1000, 0.25, 0.75)) - qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True) - - np.random.seed(0) - model.fit(dataset.training) - - int_estim_prevalences = model.quantify(dataset.test.instances) - true_prevalences = dataset.test.prevalence() - - error = qp.error.mae(true_prevalences, int_estim_prevalences) - assert type(error) == np.float64 - - dataset_str = Dataset(LabelledCollection(dataset.training.instances, - ['one' if label == 1 else 'zero' for label in dataset.training.labels]), - LabelledCollection(dataset.test.instances, - ['one' if label == 1 else 'zero' for label in dataset.test.labels])) - assert all(dataset_str.training.classes_ == dataset_str.test.classes_), 'wrong indexation' - np.random.seed(0) - model.fit(dataset_str.training) - - str_estim_prevalences = 
model.quantify(dataset_str.test.instances) - true_prevalences = dataset_str.test.prevalence() - - error = qp.error.mae(true_prevalences, str_estim_prevalences) - assert type(error) == np.float64 - - print(true_prevalences) - print(int_estim_prevalences) - print(str_estim_prevalences) - - np.testing.assert_almost_equal(int_estim_prevalences[1], - str_estim_prevalences[list(model.classes_).index('one')]) - -# helper -def __fit_test(quantifier, train, test): - quantifier.fit(train) - test_samples = APP(test) - true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, test_samples) - return qp.error.mae(true_prevs, estim_prevs), estim_prevs - - -def test_median_meta(): - """ - This test compares the performance of the MedianQuantifier with respect to computing the median of the predictions - of a differently parameterized quantifier. We use the DistributionMatching base quantifier and the median is - computed across different values of nbins - """ - - qp.environ['SAMPLE_SIZE'] = 100 - - # grid of values - nbins_grid = list(range(2, 11)) - - dataset = 'kindle' - train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test - prevs = [] - errors = [] - for nbins in nbins_grid: - with qp.util.temp_seed(0): - q = DMy(LogisticRegression(), nbins=nbins) - mae, estim_prevs = __fit_test(q, train, test) - prevs.append(estim_prevs) - errors.append(mae) - print(f'{dataset} DistributionMatching(nbins={nbins}) got MAE {mae:.4f}') - prevs = np.asarray(prevs) - mae = np.mean(errors) - print(f'\tMAE={mae:.4f}') - - q = DMy(LogisticRegression()) - q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) - median_mae, prev = __fit_test(q, train, test) - print(f'\tMAE={median_mae:.4f}') - - np.testing.assert_almost_equal(np.median(prevs, axis=0), prev) - assert median_mae < mae, 'the median-based quantifier provided a higher error...' - - -def test_median_meta_modsel(): - """ - This test checks the median-meta quantifier with model selection - """ - - qp.environ['SAMPLE_SIZE'] = 100 - - dataset = 'kindle' - train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test - train, val = train.split_stratified(random_state=0) - - nbins_grid = [2, 4, 5, 10, 15] - - q = DMy(LogisticRegression()) - q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) - median_mae, _ = __fit_test(q, train, test) - print(f'\tMAE={median_mae:.4f}') - - q = DMy(LogisticRegression()) - lr_params = {'classifier__C': np.logspace(-1, 1, 3)} - q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) - q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1) - optimized_median_ave, _ = __fit_test(q, train, test) - print(f'\tMAE={optimized_median_ave:.4f}') - - assert optimized_median_ave < median_mae, "the optimized method yielded worse performance..." 
\ No newline at end of file +if __name__ == '__main__': + unittest.main() diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index 180f680..6423b4e 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -2,7 +2,6 @@ import unittest import numpy as np from sklearn.linear_model import LogisticRegression -from sklearn.svm import SVC import quapy as qp from quapy.method.aggregative import PACC @@ -14,17 +13,20 @@ import time class ModselTestCase(unittest.TestCase): def test_modsel(self): + """ + Checks whether a model selection exploration finds a good hyperparameter + """ q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) - param_grid = {'classifier__C': np.logspace(-3,3,7)} + param_grid = {'classifier__C': [0.000001, 10.]} app = APP(validation, sample_size=100, random_state=1) q = GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True - ).fit(training) + q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, verbose=True, n_jobs=-1 + ).fit(*training.Xy) print('best params', q.best_params_) print('best score', q.best_score_) @@ -32,54 +34,39 @@ class ModselTestCase(unittest.TestCase): self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0) def test_modsel_parallel(self): + """ + Checks whether a parallelized model selection is faster than a sequential exploration while + obtaining the same optimal parameters + """ - q = PACC(LogisticRegression(random_state=1, max_iter=5000)) + q = PACC(LogisticRegression(random_state=1, max_iter=3000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) - training, validation = data.training.split_stratified(0.7, random_state=1) - # test = data.test - - param_grid = {'classifier__C': np.logspace(-3,3,7)} - app = APP(validation, sample_size=100, random_state=1) - q = GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True - ).fit(training) - print('best params', q.best_params_) - print('best score', q.best_score_) - - self.assertEqual(q.best_params_['classifier__C'], 10.0) - self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0) - - def test_modsel_parallel_speedup(self): - class SlowLR(LogisticRegression): - def fit(self, X, y, sample_weight=None): - time.sleep(1) - return super(SlowLR, self).fit(X, y, sample_weight) - - q = PACC(SlowLR(random_state=1, max_iter=5000)) - - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50) training, validation = data.training.split_stratified(0.7, random_state=1) - param_grid = {'classifier__C': np.logspace(-3, 3, 7)} + param_grid = {'classifier__C': np.logspace(-3,3,7), 'classifier__class_weight': ['balanced', None]} app = APP(validation, sample_size=100, random_state=1) - tinit = time.time() - GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True - ).fit(training) - tend_nooptim = time.time()-tinit + def do_gridsearch(n_jobs): + print(f'starting model selection with n_jobs={n_jobs}') + t_init = time.time() + modsel = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=n_jobs, verbose=True + ).fit(*training.Xy) + t_end = time.time()-t_init +
best_c = modsel.best_params_['classifier__C'] + print(f'[done] took {t_end:.2f}s best C = {best_c}') + return t_end, best_c - tinit = time.time() - GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True - ).fit(training) - tend_optim = time.time() - tinit + tend_seq, best_c_seq = do_gridsearch(n_jobs=1) + tend_par, best_c_par = do_gridsearch(n_jobs=-1) - print(f'parallel training took {tend_optim:.4f}s') - print(f'sequential training took {tend_nooptim:.4f}s') + print(tend_seq, best_c_seq) + print(tend_par, best_c_par) + + self.assertEqual(best_c_seq, best_c_par) + self.assertLess(tend_par, tend_seq) - self.assertEqual(tend_optim < (0.5*tend_nooptim), True) def test_modsel_timeout(self): @@ -91,17 +78,27 @@ class ModselTestCase(unittest.TestCase): q = PACC(SlowLR()) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) - # test = data.test - param_grid = {'classifier__C': np.logspace(-3,3,7)} + param_grid = {'classifier__C': np.logspace(-1,1,3)} app = APP(validation, sample_size=100, random_state=1) - q = GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True + + print('Expecting TimeoutError to be raised') + modsel = GridSearchQ( + q, param_grid, protocol=app, timeout=3, n_jobs=-1, verbose=True, raise_errors=True ) with self.assertRaises(TimeoutError): - q.fit(training) + modsel.fit(*training.Xy) + + print('Expecting ValueError to be raised') + modsel = GridSearchQ( + q, param_grid, protocol=app, timeout=3, n_jobs=-1, verbose=True, raise_errors=False + ) + with self.assertRaises(ValueError): + # this exception is not raised because of the timeout, but because no combination of hyperparams + # succeeded (in this case, a ValueError is raised regardless of "raise_errors") + modsel.fit(*training.Xy) if __name__ == '__main__': diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py index 87bd358..4850bd4 100644 --- a/quapy/tests/test_protocols.py +++ b/quapy/tests/test_protocols.py @@ -71,7 +71,7 @@ class TestProtocols(unittest.TestCase): # surprisingly enough, for some n_prevalences the test fails, notwithstanding # everything is correct. The problem is that in function APP.prevalence_grid() # there is sometimes one rounding error that gets cumulated and - # surpasses 1.0 (by a very small float value, 0.0000000000002 or sthe like) + # surpasses 1.0 (by a very small float value, 0.0000000000002 or the like) # so these tuples are mistakenly removed... I have tried with np.close, and # other workarounds, but eventually happens that there is some negative probability # in the sampling function...
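# --- editor's note: the sketch below is not part of the diff; it illustrates the
# raise_errors semantics exercised by test_modsel_timeout above. The calls mirror
# the test code; grid values and sample size are illustrative, taken from the tests.
import numpy as np
from sklearn.linear_model import LogisticRegression

import quapy as qp
from quapy.method.aggregative import PACC
from quapy.model_selection import GridSearchQ
from quapy.protocol import APP

data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1)
training, validation = data.training.split_stratified(0.7, random_state=1)
q = PACC(LogisticRegression())
param_grid = {'classifier__C': np.logspace(-1, 1, 3)}
app = APP(validation, sample_size=100, random_state=1)

# raise_errors=True: the first hyperparameter combination that exceeds the timeout
# aborts the whole search with a TimeoutError
strict_search = GridSearchQ(q, param_grid, protocol=app, timeout=3, raise_errors=True)

# raise_errors=False: timed-out combinations are discarded; only if *every*
# combination fails does fit end up raising a ValueError
lenient_search = GridSearchQ(q, param_grid, protocol=app, timeout=3, raise_errors=False)

# either search is then launched with the new (X, y) fit interface, e.g.:
# strict_search.fit(*training.Xy)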
diff --git a/quapy/tests/test_replicability.py b/quapy/tests/test_replicability.py index e89531a..a174992 100644 --- a/quapy/tests/test_replicability.py +++ b/quapy/tests/test_replicability.py @@ -3,31 +3,34 @@ import quapy as qp from quapy.data import LabelledCollection from quapy.functional import strprev from sklearn.linear_model import LogisticRegression - +import numpy as np from quapy.method.aggregative import PACC +import quapy.functional as F -class MyTestCase(unittest.TestCase): +class TestReplicability(unittest.TestCase): + def test_prediction_replicability(self): - dataset = qp.datasets.fetch_UCIDataset('yeast') + dataset = qp.datasets.fetch_UCIBinaryDataset('yeast') + train, test = dataset.train_test with qp.util.temp_seed(0): lr = LogisticRegression(random_state=0, max_iter=10000) pacc = PACC(lr) - prev = pacc.fit(dataset.training).quantify(dataset.test.X) + prev = pacc.fit(*train.Xy).predict(test.X) str_prev1 = strprev(prev, prec=5) with qp.util.temp_seed(0): lr = LogisticRegression(random_state=0, max_iter=10000) pacc = PACC(lr) - prev2 = pacc.fit(dataset.training).quantify(dataset.test.X) + prev2 = pacc.fit(*train.Xy).predict(test.X) str_prev2 = strprev(prev2, prec=5) - self.assertEqual(str_prev1, str_prev2) # add assertion here + self.assertEqual(str_prev1, str_prev2) + def test_samping_replicability(self): - import numpy as np def equal_collections(c1, c2, value=True): self.assertEqual(np.all(c1.X == c2.X), value) @@ -74,5 +77,36 @@ class MyTestCase(unittest.TestCase): equal_collections(sample1_te, sample2_te, True) + def test_parallel_replicability(self): + + train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').reduce().train_test + + test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0]) + + with qp.util.temp_seed(10): + pacc = PACC(LogisticRegression(), val_split=.5, n_jobs=2) + pacc.fit(*train.Xy) + prev1 = F.strprev(pacc.predict(test.instances)) + + with qp.util.temp_seed(0): + pacc = PACC(LogisticRegression(), val_split=.5, n_jobs=2) + pacc.fit(*train.Xy) + prev2 = F.strprev(pacc.predict(test.instances)) + + with qp.util.temp_seed(0): + pacc = PACC(LogisticRegression(), val_split=.5, n_jobs=2) + pacc.fit(*train.Xy) + prev3 = F.strprev(pacc.predict(test.instances)) + + print(prev1) + print(prev2) + print(prev3) + + self.assertNotEqual(prev1, prev2) + self.assertEqual(prev2, prev3) + + + + if __name__ == '__main__': unittest.main() diff --git a/quapy/util.py b/quapy/util.py index 733fbb8..94b75a5 100644 --- a/quapy/util.py +++ b/quapy/util.py @@ -6,10 +6,15 @@ import pickle import urllib from pathlib import Path from contextlib import ExitStack + +import pandas as pd + import quapy as qp import numpy as np from joblib import Parallel, delayed +from time import time +import signal def _get_parallel_slices(n_tasks, n_jobs): @@ -22,7 +27,7 @@ def _get_parallel_slices(n_tasks, n_jobs): def map_parallel(func, args, n_jobs): """ - Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and `n_jobs`=2, then + Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then func is applied in two parallel processes to args[0:50] and to args[50:99]. func is a function that already works with a list of arguments. 
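# --- editor's note: a small sketch, not part of the diff, of the map_parallel
# helper whose docstring is fixed above; square_all is a hypothetical worker ---
from quapy.util import map_parallel

def square_all(items):
    # func already works with a whole slice of arguments and returns an iterable
    return [x * x for x in items]

# with n_jobs=2, the 99 items are dispatched as two slices (roughly items[0:50]
# and items[50:99]) and the per-slice results are chained back into one list
results = map_parallel(square_all, list(range(99)), n_jobs=2)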
@@ -38,7 +43,7 @@ def map_parallel(func, args, n_jobs): return list(itertools.chain.from_iterable(results)) -def parallel(func, args, n_jobs, seed=None): +def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'): """ A wrapper of multiprocessing: @@ -47,7 +52,14 @@ >>> ) that takes the `quapy.environ` variable as input silently. - Seeds the child processes to ensure reproducibility when n_jobs>1 + Seeds the child processes to ensure reproducibility when n_jobs>1. + + :param func: callable + :param args: the list of arguments, one element per call to func + :param n_jobs: the number of parallel workers + :param seed: the numeric seed + :param asarray: set to True to return a np.ndarray instead of a list + :param backend: indicates the backend used for handling parallel work """ def func_dec(environ, seed, *args): qp.environ = environ.copy() @@ -58,11 +70,48 @@ stack.enter_context(qp.util.temp_seed(seed)) return func(*args) - return Parallel(n_jobs=n_jobs)( + out = Parallel(n_jobs=n_jobs, backend=backend)( delayed(func_dec)(qp.environ, None if seed is None else seed+i, args_i) for i, args_i in enumerate(args) ) + if asarray: + out = np.asarray(out) + return out + +def parallel_unpack(func, args, n_jobs, seed=None, asarray=True, backend='loky'): + """ + A wrapper of multiprocessing: + + >>> Parallel(n_jobs=n_jobs)( + >>> delayed(func)(*args_i) for args_i in args + >>> ) + + that takes the `quapy.environ` variable as input silently. + Seeds the child processes to ensure reproducibility when n_jobs>1. + + :param func: callable + :param args: a list of argument tuples, each unpacked into a call to func + :param seed: the numeric seed + :param asarray: set to True to return a np.ndarray instead of a list + :param backend: indicates the backend used for handling parallel work + """ + + def func_dec(environ, seed, *args): + qp.environ = environ.copy() + qp.environ['N_JOBS'] = 1 + # set a context with a temporary seed to ensure results are reproducible in parallel + with ExitStack() as stack: + if seed is not None: + stack.enter_context(qp.util.temp_seed(seed)) + return func(*args) + + out = Parallel(n_jobs=n_jobs, backend=backend)( + delayed(func_dec)(qp.environ, None if seed is None else seed + i, *args_i) for i, args_i in enumerate(args) + ) + if asarray: + out = np.asarray(out) + return out + @contextlib.contextmanager def temp_seed(random_state): """ @@ -159,7 +208,7 @@ def save_text_file(path, text): :param text: text to save. """ create_parent_dir(path) - with open(text, 'wt') as fout: + with open(path, 'wt') as fout: fout.write(text) @@ -182,12 +231,14 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args): return generation_func(*args) else: if os.path.exists(pickle_path): - return pickle.load(open(pickle_path, 'rb')) + with open(pickle_path, 'rb') as fin: + instance = pickle.load(fin) else: instance = generation_func(*args) os.makedirs(str(Path(pickle_path).parent), exist_ok=True) - pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL) - return instance + with open(pickle_path, 'wb') as foo: + pickle.dump(instance, foo, pickle.HIGHEST_PROTOCOL) + return instance def _check_sample_size(sample_size): @@ -200,6 +251,28 @@ def _check_sample_size(sample_size): return sample_size +def load_report(path, as_dict=False): + def str2prev_arr(strprev): + within = strprev.strip('[]').split() + float_list = [float(p) for p in within] + float_list[-1] = 1. 
- sum(float_list[:-1]) + return np.asarray(float_list) + + df = pd.read_csv(path, index_col=0) + df['true-prev'] = df['true-prev'].apply(str2prev_arr) + df['estim-prev'] = df['estim-prev'].apply(str2prev_arr) + if as_dict: + d = {} + for col in df.columns.values: + vals = df[col].values + if col in ['true-prev', 'estim-prev']: + vals = np.vstack(vals) + d[col] = vals + return d + else: + return df + + class EarlyStop: """ A class implementing the early-stopping condition typically used for training neural networks. @@ -254,3 +327,35 @@ class EarlyStop: if self.patience <= 0: self.STOP = True + +@contextlib.contextmanager +def timeout(seconds): + """ + Opens a context that raises a TimeoutError if its body does not complete within the given number of seconds + + >>> def func(start_msg, end_msg): + >>> print(start_msg) + >>> sleep(2) + >>> print(end_msg) + >>> + >>> with timeout(1): + >>> func('begin function', 'end function') + >>> Out[] + >>> begin function + >>> TimeoutError + + + :param seconds: number of seconds, set to <=0 to ignore the timer + """ + if seconds > 0: + def handler(signum, frame): + raise TimeoutError() + + signal.signal(signal.SIGALRM, handler) + signal.alarm(seconds) + + yield + + if seconds > 0: + signal.alarm(0) + diff --git a/setup.py b/setup.py index 4ab3662..2464122 100644 --- a/setup.py +++ b/setup.py @@ -111,9 +111,15 @@ setup( # packages=find_packages(include=['quapy', 'quapy.*']), # Required + package_data={ + # For the 'quapy.method' package, include all files + # in the 'stan' subdirectory that end with .stan + 'quapy.method': ['stan/*.stan'] + }, + python_requires='>=3.8, <4', - install_requires=['scikit-learn', 'pandas', 'tqdm', 'matplotlib', 'joblib', 'xlrd', 'abstention'], + install_requires=['scikit-learn', 'pandas', 'tqdm', 'matplotlib', 'joblib', 'xlrd', 'abstention', 'ucimlrepo', 'certifi'], # List additional groups of dependencies here (e.g. development # dependencies). Users will be able to install these using the "extras" @@ -123,10 +129,12 @@ setup( # # Similar to `install_requires` above, these must be valid existing # projects. - # extras_require={ # Optional - # 'dev': ['check-manifest'], - # 'test': ['coverage'], - # }, + extras_require={ # Optional + 'bayes': ['jax', 'jaxlib', 'numpyro', 'pystan'], + 'neural': ['torch'], + 'tests': ['certifi'], + 'docs' : ['sphinx-rtd-theme', 'myst-parser'], + }, # If there are data files included in your packages that need to be # installed, specify them here. @@ -158,7 +166,7 @@ setup( 'Contributors': 'https://github.com/HLT-ISTI/QuaPy/graphs/contributors', 'Bug Reports': 'https://github.com/HLT-ISTI/QuaPy/issues', 'Wiki': 'https://github.com/HLT-ISTI/QuaPy/wiki', - 'Documentation': 'https://hlt-isti.github.io/QuaPy/build/html/index.html', + 'Documentation': 'https://hlt-isti.github.io/QuaPy/', 'Source': 'https://github.com/HLT-ISTI/QuaPy/', }, )
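# --- editor's note: a minimal sketch, not part of the diff, of the new
# qp.util.timeout context manager added above; it relies on signal.SIGALRM
# and is therefore POSIX-only ---
from time import sleep

import quapy as qp

try:
    with qp.util.timeout(1):  # arms a 1-second alarm around the block
        sleep(2)              # the body takes longer than allowed...
except TimeoutError:
    print('the block did not finish in time')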