Compare commits

...

14 Commits

Author SHA1 Message Date
Alejandro Moreo Fernandez ca25f1d601 lab + some experimental methods based on distribution matching 2023-05-04 17:26:17 +02:00
Alejandro Moreo Fernandez 2df89c83e8 bugfix, method order set to method names if None is passed 2023-04-05 12:16:29 +02:00
Alejandro Moreo Fernandez 1efe13c538 import fix in uci_experiments.py 2023-03-24 10:41:53 +01:00
Alejandro Moreo Fernandez 763c008b6d all datasets 2023-03-23 15:47:40 +01:00
Alejandro Moreo Fernandez fa9d5ea243 Merge branch 'master' of github.com:HLT-ISTI/QuaPy 2023-03-23 15:46:30 +01:00
Alejandro Moreo Fernandez 67906f6f2d adding uci_experiments to examples folder 2023-03-23 15:46:03 +01:00
Alejandro Moreo Fernandez 4904475d26 improving code quality in terms of pylint 2023-02-28 10:47:59 +01:00
Alejandro Moreo Fernandez 1826d8a8dc Create pylint.yml 2023-02-28 10:30:37 +01:00
Alejandro Moreo Fernandez de93cce391 Merge branch 'master' of github.com:HLT-ISTI/QuaPy 2023-02-28 10:27:47 +01:00
Alejandro Moreo Fernandez d1e11f8a6b Merge pull request #18 from aesuli/aesuli-patch-1 (Missing 'deep' argument) 2023-02-28 10:27:41 +01:00
Alejandro Moreo Fernandez d0706005d7 Merge branch 'master' of github.com:HLT-ISTI/QuaPy 2023-02-28 10:25:52 +01:00
Alejandro Moreo Fernandez 368ee03fbc some minor improvements 2023-02-28 10:25:46 +01:00
Andrea Esuli e9d56e5801 missing argument (Added missing deep argument to get_params of LowRankLogisticRegression) 2023-02-28 08:41:34 +01:00
Alejandro Moreo Fernandez 3779bb2123 specifying python >= 3.8 in setup 2023-02-20 09:39:04 +01:00
24 changed files with 1043 additions and 224 deletions

.github/workflows/pylint.yml vendored Normal file (+23 lines)
View File

@ -0,0 +1,23 @@
name: Pylint

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install pylint
    - name: Analysing the code with pylint
      run: |
        pylint $(git ls-files '*.py')

View File

@ -1,3 +1,6 @@
ensembles seem to be broken; they have an internal model selection which takes the parameters, but since quapy now
works with protocols it would need to know the validation set in order to pass something like
"protocol: APP(val, etc.)"
sample_size should not be mandatory when qp.environ['SAMPLE_SIZE'] has been specified
clean all the cumbersome methods that have to be implemented for new quantifiers (e.g., the n_classes_ property, etc.)
make GridSearchQ truly parallel

View File

@ -224,8 +224,6 @@
<li><a href="quapy.html#quapy.util.create_parent_dir">create_parent_dir() (in module quapy.util)</a>
</li>
<li><a href="quapy.method.html#quapy.method.aggregative.cross_generate_predictions">cross_generate_predictions() (in module quapy.method.aggregative)</a>
</li>
<li><a href="quapy.method.html#quapy.method.aggregative.cross_generate_predictions_depr">cross_generate_predictions_depr() (in module quapy.method.aggregative)</a>
</li>
<li><a href="quapy.html#quapy.model_selection.cross_val_predict">cross_val_predict() (in module quapy.model_selection)</a>
</li>

Binary file not shown.

View File

@ -316,11 +316,14 @@ fitting <cite>TruncatedSVD</cite> and then <cite>LogisticRegression</cite> on th
<dl class="py method">
<dt class="sig sig-object py" id="quapy.classification.methods.LowRankLogisticRegression.get_params">
<span class="sig-name descname"><span class="pre">get_params</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.classification.methods.LowRankLogisticRegression.get_params" title="Permalink to this definition"></a></dt>
<span class="sig-name descname"><span class="pre">get_params</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">deep</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.classification.methods.LowRankLogisticRegression.get_params" title="Permalink to this definition"></a></dt>
<dd><p>Get hyper-parameters for this estimator.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>a dictionary with parameter names mapped to their values</p>
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>deep</strong> compatibility with sklearn</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a dictionary with parameter names mapped to their values</p>
</dd>
</dl>
</dd></dl>
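For context, the deep argument exists for scikit-learn compatibility: utilities such as clone() and GridSearchCV call get_params(deep=True) on every estimator, so a get_params that does not accept it raises a TypeError. A minimal sketch of the pattern (illustrative names, not QuaPy's actual code):

    class EstimatorSketch:
        def __init__(self, n_components=100, C=1.0):
            self.n_components = n_components
            self.C = C

        def get_params(self, deep=True):
            # `deep` is accepted (and here ignored) purely for compatibility
            # with sklearn, which invokes get_params(deep=True) when cloning
            return {'n_components': self.n_components, 'C': self.C}

        def set_params(self, **params):
            for name, value in params.items():
                setattr(self, name, value)
            return self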
@ -524,7 +527,7 @@ dimensionality of the embedding</p>
<dl class="py class">
<dt class="sig sig-object py" id="quapy.classification.neural.NeuralClassifierTrainer">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">quapy.classification.neural.</span></span><span class="sig-name descname"><span class="pre">NeuralClassifierTrainer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">net</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.classification.neural.TextClassifierNet" title="quapy.classification.neural.TextClassifierNet"><span class="pre">TextClassifierNet</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">lr</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.001</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_decay</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">patience</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">10</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">epochs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">200</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">64</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_size_test</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding_length</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">300</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'cpu'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">checkpointpath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'../checkpoint/classifier_net.dat'</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.classification.neural.NeuralClassifierTrainer" title="Permalink to this definition"></a></dt>
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">quapy.classification.neural.</span></span><span class="sig-name descname"><span class="pre">NeuralClassifierTrainer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">net</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.classification.neural.TextClassifierNet" title="quapy.classification.neural.TextClassifierNet"><span class="pre">TextClassifierNet</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">lr</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.001</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_decay</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">patience</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">10</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">epochs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">200</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">64</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_size_test</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding_length</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">300</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'cuda'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">checkpointpath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'../checkpoint/classifier_net.dat'</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.classification.neural.NeuralClassifierTrainer" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Trains a neural network for text classification.</p>
<dl class="field-list simple">
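A hypothetical instantiation reflecting the new default (device='cuda'); CNNnet is assumed here to be one of the text-classifier nets in quapy.classification.neural, with all other hyperparameters left at their defaults:

    from quapy.classification.neural import NeuralClassifierTrainer, CNNnet

    net = CNNnet(vocabulary_size=5000, n_classes=2)
    # device now defaults to 'cuda'; pass device='cpu' to train without a GPU
    trainer = NeuralClassifierTrainer(net, lr=0.001, device='cuda')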

View File

@ -447,8 +447,8 @@ index.</p>
<span class="sig-name descname"><span class="pre">sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_index" title="Permalink to this definition"></a></dt>
<dd><p>Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
prevalence values are not specified, then returns the index of a uniform sampling.
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.</p>
For each class, the sampling is drawn with replacement if the requested prevalence is larger than
the actual prevalence of the class, or without replacement otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
@ -534,7 +534,7 @@ values for each class)</p>
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling">
<span class="sig-name descname"><span class="pre">uniform_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling" title="Permalink to this definition"></a></dt>
<dd><p>Returns a uniform sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
with replacement if the requested size is greater than the number of instances, or without replacement
otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
@ -553,7 +553,7 @@ otherwise.</p>
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling_index">
<span class="sig-name descname"><span class="pre">uniform_sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling_index" title="Permalink to this definition"></a></dt>
<dd><p>Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
with replacement if the requested size is greater than the number of instances, or without replacement
otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>

View File

@ -61,6 +61,7 @@
</section>
<section id="module-quapy.error">
<span id="quapy-error"></span><h2>quapy.error<a class="headerlink" href="#module-quapy.error" title="Permalink to this heading"></a></h2>
<p>Implementation of error measures used for quantification</p>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.absolute_error">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">absolute_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.absolute_error" title="Permalink to this definition"></a></dt>
@ -86,8 +87,9 @@ where <span class="math notranslate nohighlight">\(\mathcal{Y}\)</span> are the
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.acc_error">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">acc_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y_true</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y_pred</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.acc_error" title="Permalink to this definition"></a></dt>
<dd><p>Computes the error in terms of 1-accuracy. The accuracy is computed as <span class="math notranslate nohighlight">\(\frac{tp+tn}{tp+fp+fn+tn}\)</span>, with
<cite>tp</cite>, <cite>fp</cite>, <cite>fn</cite>, and <cite>tn</cite> standing for true positives, false positives, false negatives, and true negatives,
<dd><p>Computes the error in terms of 1-accuracy. The accuracy is computed as
<span class="math notranslate nohighlight">\(\frac{tp+tn}{tp+fp+fn+tn}\)</span>, with <cite>tp</cite>, <cite>fp</cite>, <cite>fn</cite>, and <cite>tn</cite> standing
for true positives, false positives, false negatives, and true negatives,
respectively</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
@ -105,8 +107,9 @@ respectively</p>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.acce">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">acce</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y_true</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y_pred</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.acce" title="Permalink to this definition"></a></dt>
<dd><p>Computes the error in terms of 1-accuracy. The accuracy is computed as <span class="math notranslate nohighlight">\(\frac{tp+tn}{tp+fp+fn+tn}\)</span>, with
<cite>tp</cite>, <cite>fp</cite>, <cite>fn</cite>, and <cite>tn</cite> standing for true positives, false positives, false negatives, and true negatives,
<dd><p>Computes the error in terms of 1-accuracy. The accuracy is computed as
<span class="math notranslate nohighlight">\(\frac{tp+tn}{tp+fp+fn+tn}\)</span>, with <cite>tp</cite>, <cite>fp</cite>, <cite>fn</cite>, and <cite>tn</cite> standing
for true positives, false positives, false negatives, and true negatives,
respectively</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
@ -146,10 +149,12 @@ where <span class="math notranslate nohighlight">\(\mathcal{Y}\)</span> are the
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.f1_error">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">f1_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y_true</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y_pred</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.f1_error" title="Permalink to this definition"></a></dt>
<dd><p>F1 error: simply computes the error in terms of macro <span class="math notranslate nohighlight">\(F_1\)</span>, i.e., <span class="math notranslate nohighlight">\(1-F_1^M\)</span>,
where <span class="math notranslate nohighlight">\(F_1\)</span> is the harmonic mean of precision and recall, defined as <span class="math notranslate nohighlight">\(\frac{2tp}{2tp+fp+fn}\)</span>,
with <cite>tp</cite>, <cite>fp</cite>, and <cite>fn</cite> standing for true positives, false positives, and false negatives, respectively.
<cite>Macro</cite> averaging means the <span class="math notranslate nohighlight">\(F_1\)</span> is computed for each category independently, and then averaged.</p>
<dd><p>F1 error: simply computes the error in terms of macro <span class="math notranslate nohighlight">\(F_1\)</span>, i.e.,
<span class="math notranslate nohighlight">\(1-F_1^M\)</span>, where <span class="math notranslate nohighlight">\(F_1\)</span> is the harmonic mean of precision and recall,
defined as <span class="math notranslate nohighlight">\(\frac{2tp}{2tp+fp+fn}\)</span>, with <cite>tp</cite>, <cite>fp</cite>, and <cite>fn</cite> standing
for true positives, false positives, and false negatives, respectively.
<cite>Macro</cite> averaging means the <span class="math notranslate nohighlight">\(F_1\)</span> is computed for each category independently,
and then averaged.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
@ -166,10 +171,12 @@ with <cite>tp</cite>, <cite>fp</cite>, and <cite>fn</cite> standing for true pos
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.f1e">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">f1e</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y_true</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y_pred</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.f1e" title="Permalink to this definition"></a></dt>
<dd><p>F1 error: simply computes the error in terms of macro <span class="math notranslate nohighlight">\(F_1\)</span>, i.e., <span class="math notranslate nohighlight">\(1-F_1^M\)</span>,
where <span class="math notranslate nohighlight">\(F_1\)</span> is the harmonic mean of precision and recall, defined as <span class="math notranslate nohighlight">\(\frac{2tp}{2tp+fp+fn}\)</span>,
with <cite>tp</cite>, <cite>fp</cite>, and <cite>fn</cite> standing for true positives, false positives, and false negatives, respectively.
<cite>Macro</cite> averaging means the <span class="math notranslate nohighlight">\(F_1\)</span> is computed for each category independently, and then averaged.</p>
<dd><p>F1 error: simply computes the error in terms of macro <span class="math notranslate nohighlight">\(F_1\)</span>, i.e.,
<span class="math notranslate nohighlight">\(1-F_1^M\)</span>, where <span class="math notranslate nohighlight">\(F_1\)</span> is the harmonic mean of precision and recall,
defined as <span class="math notranslate nohighlight">\(\frac{2tp}{2tp+fp+fn}\)</span>, with <cite>tp</cite>, <cite>fp</cite>, and <cite>fn</cite> standing
for true positives, false positives, and false negatives, respectively.
<cite>Macro</cite> averaging means the <span class="math notranslate nohighlight">\(F_1\)</span> is computed for each category independently,
and then averaged.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
@ -186,7 +193,8 @@ with <cite>tp</cite>, <cite>fp</cite>, and <cite>fn</cite> standing for true pos
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.from_name">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">from_name</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">err_name</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.from_name" title="Permalink to this definition"></a></dt>
<dd><p>Gets an error function from its name. E.g., <cite>from_name(“mae”)</cite> will return function <a class="reference internal" href="#quapy.error.mae" title="quapy.error.mae"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.mae()</span></code></a></p>
<dd><p>Gets an error function from its name. E.g., <cite>from_name(“mae”)</cite>
will return function <a class="reference internal" href="#quapy.error.mae" title="quapy.error.mae"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.mae()</span></code></a></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>err_name</strong> string, the error name</p>
@ -199,11 +207,13 @@ with <cite>tp</cite>, <cite>fp</cite>, and <cite>fn</cite> standing for true pos
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.kld">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">kld</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">p</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">p_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.kld" title="Permalink to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">kld</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.kld" title="Permalink to this definition"></a></dt>
<dd><dl class="simple">
<dt>Computes the Kullback-Leibler divergence between the two prevalence distributions.</dt><dd><p>Kullback-Leibler divergence between two prevalence distributions <span class="math notranslate nohighlight">\(p\)</span> and <span class="math notranslate nohighlight">\(\hat{p}\)</span> is computed as
<span class="math notranslate nohighlight">\(KLD(p,\hat{p})=D_{KL}(p||\hat{p})=\sum_{y\in \mathcal{Y}} p(y)\log\frac{p(y)}{\hat{p}(y)}\)</span>, where
<span class="math notranslate nohighlight">\(\mathcal{Y}\)</span> are the classes of interest.
<dt>Computes the Kullback-Leibler divergence between the two prevalence distributions.</dt><dd><p>Kullback-Leibler divergence between two prevalence distributions <span class="math notranslate nohighlight">\(p\)</span> and <span class="math notranslate nohighlight">\(\hat{p}\)</span>
is computed as
<span class="math notranslate nohighlight">\(KLD(p,\hat{p})=D_{KL}(p||\hat{p})=
\sum_{y\in \mathcal{Y}} p(y)\log\frac{p(y)}{\hat{p}(y)}\)</span>,
where <span class="math notranslate nohighlight">\(\mathcal{Y}\)</span> are the classes of interest.
The distributions are smoothed using the <cite>eps</cite> factor (see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
</dd>
</dl>
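A compact sketch of the two formulas involved, assuming the additive smoothing described above (illustrative, not the library code):

    import numpy as np

    def smooth_sketch(prevs, eps):
        # additive smoothing: (p + eps) / (eps * n_classes + 1)
        prevs = np.asarray(prevs, dtype=float)
        return (prevs + eps) / (eps * len(prevs) + 1)

    def kld_sketch(prevs, prevs_hat, eps):
        # KLD(p, p_hat) = sum_y p(y) * log(p(y) / p_hat(y)), after smoothing
        p, q = smooth_sketch(prevs, eps), smooth_sketch(prevs_hat, eps)
        return float(np.sum(p * np.log(p / q)))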
@ -212,9 +222,10 @@ The distributions are smoothed using the <cite>eps</cite> factor (see <a class="
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. KLD is not defined in cases in which the distributions contain zeros; <cite>eps</cite>
is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size
will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
<li><p><strong>eps</strong> smoothing factor. KLD is not defined in cases in which the distributions contain
zeros; <cite>eps</cite> is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size.
If <cite>eps=None</cite>, the sample size will be taken from the environment variable <cite>SAMPLE_SIZE</cite>
(which has thus to be set beforehand).</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -231,7 +242,8 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted
prevalence values</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -248,7 +260,8 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted
prevalence values</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -259,17 +272,21 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.mean_relative_absolute_error">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">mean_relative_absolute_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">p</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">p_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.mean_relative_absolute_error" title="Permalink to this definition"></a></dt>
<dd><p>Computes the mean relative absolute error (see <a class="reference internal" href="#quapy.error.rae" title="quapy.error.rae"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.rae()</span></code></a>) across the sample pairs.
The distributions are smoothed using the <cite>eps</cite> factor (see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">mean_relative_absolute_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.mean_relative_absolute_error" title="Permalink to this definition"></a></dt>
<dd><p>Computes the mean relative absolute error (see <a class="reference internal" href="#quapy.error.rae" title="quapy.error.rae"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.rae()</span></code></a>) across
the sample pairs. The distributions are smoothed using the <cite>eps</cite> factor (see
<a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. <cite>mrae</cite> is not defined in cases in which the true distribution contains zeros; <cite>eps</cite>
is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size
will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true
prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted
prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. <cite>mrae</cite> is not defined in cases in which the true
distribution contains zeros; <cite>eps</cite> is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>,
with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size will be taken from
the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -281,16 +298,20 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.mkld">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">mkld</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.mkld" title="Permalink to this definition"></a></dt>
<dd><p>Computes the mean Kullback-Leibler divergence (see <a class="reference internal" href="#quapy.error.kld" title="quapy.error.kld"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.kld()</span></code></a>) across the sample pairs.
The distributions are smoothed using the <cite>eps</cite> factor (see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
<dd><p>Computes the mean Kullback-Leibler divergence (see <a class="reference internal" href="#quapy.error.kld" title="quapy.error.kld"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.kld()</span></code></a>) across the
sample pairs. The distributions are smoothed using the <cite>eps</cite> factor
(see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. KLD is not defined in cases in which the distributions contain zeros; <cite>eps</cite>
is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size
will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true
prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted
prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. KLD is not defined in cases in which the distributions contain
zeros; <cite>eps</cite> is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size.
If <cite>eps=None</cite>, the sample size will be taken from the environment variable <cite>SAMPLE_SIZE</cite>
(which has thus to be set beforehand).</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -302,16 +323,19 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.mnkld">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">mnkld</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.mnkld" title="Permalink to this definition"></a></dt>
<dd><p>Computes the mean Normalized Kullback-Leibler divergence (see <a class="reference internal" href="#quapy.error.nkld" title="quapy.error.nkld"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.nkld()</span></code></a>) across the sample pairs.
The distributions are smoothed using the <cite>eps</cite> factor (see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
<dd><p>Computes the mean Normalized Kullback-Leibler divergence (see <a class="reference internal" href="#quapy.error.nkld" title="quapy.error.nkld"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.nkld()</span></code></a>)
across the sample pairs. The distributions are smoothed using the <cite>eps</cite> factor
(see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. NKLD is not defined in cases in which the distributions contain zeros; <cite>eps</cite>
is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size
will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted
prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. NKLD is not defined in cases in which the distributions contain
zeros; <cite>eps</cite> is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size.
If <cite>eps=None</cite>, the sample size will be taken from the environment variable <cite>SAMPLE_SIZE</cite>
(which has thus to be set beforehand).</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -322,17 +346,21 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.mrae">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">mrae</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">p</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">p_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.mrae" title="Permalink to this definition"></a></dt>
<dd><p>Computes the mean relative absolute error (see <a class="reference internal" href="#quapy.error.rae" title="quapy.error.rae"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.rae()</span></code></a>) across the sample pairs.
The distributions are smoothed using the <cite>eps</cite> factor (see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">mrae</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.mrae" title="Permalink to this definition"></a></dt>
<dd><p>Computes the mean relative absolute error (see <a class="reference internal" href="#quapy.error.rae" title="quapy.error.rae"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.rae()</span></code></a>) across
the sample pairs. The distributions are smoothed using the <cite>eps</cite> factor (see
<a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. <cite>mrae</cite> is not defined in cases in which the true distribution contains zeros; <cite>eps</cite>
is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size
will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true
prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted
prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. <cite>mrae</cite> is not defined in cases in which the true
distribution contains zeros; <cite>eps</cite> is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>,
with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size will be taken from
the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -348,8 +376,10 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>prevs</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the
true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_samples, n_classes,)</cite> with the
predicted prevalence values</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -360,10 +390,12 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.nkld">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">nkld</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">p</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">p_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.nkld" title="Permalink to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">nkld</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.nkld" title="Permalink to this definition"></a></dt>
<dd><dl class="simple">
<dt>Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions.</dt><dd><p>Normalized Kullback-Leibler divergence between two prevalence distributions <span class="math notranslate nohighlight">\(p\)</span> and <span class="math notranslate nohighlight">\(\hat{p}\)</span>
is computed as <span class="math notranslate nohighlight">\(NKLD(p,\hat{p}) = 2\frac{e^{KLD(p,\hat{p})}}{e^{KLD(p,\hat{p})}+1}-1\)</span>, where
<dt>Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions.</dt><dd><p>Normalized Kullback-Leibler divergence between two prevalence distributions <span class="math notranslate nohighlight">\(p\)</span> and
<span class="math notranslate nohighlight">\(\hat{p}\)</span> is computed as
<span class="math notranslate nohighlight">\(NKLD(p,\hat{p}) = 2\frac{e^{KLD(p,\hat{p})}}{e^{KLD(p,\hat{p})}+1}-1\)</span>, where
<span class="math notranslate nohighlight">\(\mathcal{Y}\)</span> are the classes of interest.
The distributions are smoothed using the <cite>eps</cite> factor (see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
</dd>
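Building on the kld_sketch above, the normalization maps KLD from [0, ∞) into [0, 1); a sketch under the same assumptions:

    import numpy as np

    def nkld_sketch(prevs, prevs_hat, eps):
        # 2*e^kld / (e^kld + 1) - 1 equals 0 for identical distributions
        # and approaches 1 as the divergence grows
        ekld = np.exp(kld_sketch(prevs, prevs_hat, eps))
        return float(2 * ekld / (ekld + 1) - 1)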
@ -373,9 +405,10 @@ The distributions are smoothed using the <cite>eps</cite> factor (see <a class="
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. NKLD is not defined in cases in which the distributions contain zeros; <cite>eps</cite>
is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size
will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
<li><p><strong>eps</strong> smoothing factor. NKLD is not defined in cases in which the distributions
contain zeros; <cite>eps</cite> is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample
size. If <cite>eps=None</cite>, the sample size will be taken from the environment variable
<cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -386,10 +419,12 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.rae">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">rae</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">p</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">p_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.rae" title="Permalink to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">rae</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.rae" title="Permalink to this definition"></a></dt>
<dd><dl class="simple">
<dt>Computes the absolute relative error between the two prevalence vectors.</dt><dd><p>Relative absolute error between two prevalence vectors <span class="math notranslate nohighlight">\(p\)</span> and <span class="math notranslate nohighlight">\(\hat{p}\)</span> is computed as
<span class="math notranslate nohighlight">\(RAE(p,\hat{p})=\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}\frac{|\hat{p}(y)-p(y)|}{p(y)}\)</span>,
<dt>Computes the absolute relative error between the two prevalence vectors.</dt><dd><p>Relative absolute error between two prevalence vectors <span class="math notranslate nohighlight">\(p\)</span> and <span class="math notranslate nohighlight">\(\hat{p}\)</span>
is computed as
<span class="math notranslate nohighlight">\(RAE(p,\hat{p})=
\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}\frac{|\hat{p}(y)-p(y)|}{p(y)}\)</span>,
where <span class="math notranslate nohighlight">\(\mathcal{Y}\)</span> are the classes of interest.
The distributions are smoothed using the <cite>eps</cite> factor (see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
</dd>
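A self-contained sketch of RAE with the same smoothing (illustrative); the mean variants (mrae, mean_relative_absolute_error) simply average this quantity over the sample pairs:

    import numpy as np

    def rae_sketch(prevs, prevs_hat, eps):
        # RAE(p, p_hat) = (1/|Y|) * sum_y |p_hat(y) - p(y)| / p(y);
        # both vectors are smoothed so the denominator is never zero
        smooth = lambda v: (np.asarray(v, dtype=float) + eps) / (eps * len(v) + 1)
        p, q = smooth(prevs), smooth(prevs_hat)
        return float(np.mean(np.abs(q - p) / p))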
@ -399,9 +434,10 @@ The distributions are smoothed using the <cite>eps</cite> factor (see <a class="
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. <cite>rae</cite> is not defined in cases in which the true distribution contains zeros; <cite>eps</cite>
is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size
will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
<li><p><strong>eps</strong> smoothing factor. <cite>rae</cite> is not defined in cases in which the true distribution
contains zeros; <cite>eps</cite> is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the
sample size. If <cite>eps=None</cite>, the sample size will be taken from the environment variable
<cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -412,10 +448,12 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.relative_absolute_error">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">relative_absolute_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">p</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">p_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.relative_absolute_error" title="Permalink to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">relative_absolute_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.relative_absolute_error" title="Permalink to this definition"></a></dt>
<dd><dl class="simple">
<dt>Computes the absolute relative error between the two prevalence vectors.</dt><dd><p>Relative absolute error between two prevalence vectors <span class="math notranslate nohighlight">\(p\)</span> and <span class="math notranslate nohighlight">\(\hat{p}\)</span> is computed as
<span class="math notranslate nohighlight">\(RAE(p,\hat{p})=\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}\frac{|\hat{p}(y)-p(y)|}{p(y)}\)</span>,
<dt>Computes the absolute relative error between the two prevalence vectors.</dt><dd><p>Relative absolute error between two prevalence vectors <span class="math notranslate nohighlight">\(p\)</span> and <span class="math notranslate nohighlight">\(\hat{p}\)</span>
is computed as
<span class="math notranslate nohighlight">\(RAE(p,\hat{p})=
\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}\frac{|\hat{p}(y)-p(y)|}{p(y)}\)</span>,
where <span class="math notranslate nohighlight">\(\mathcal{Y}\)</span> are the classes of interest.
The distributions are smoothed using the <cite>eps</cite> factor (see <a class="reference internal" href="#quapy.error.smooth" title="quapy.error.smooth"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.smooth()</span></code></a>).</p>
</dd>
@ -425,9 +463,10 @@ The distributions are smoothed using the <cite>eps</cite> factor (see <a class="
<dd class="field-odd"><ul class="simple">
<li><p><strong>prevs</strong> array-like of shape <cite>(n_classes,)</cite> with the true prevalence values</p></li>
<li><p><strong>prevs_hat</strong> array-like of shape <cite>(n_classes,)</cite> with the predicted prevalence values</p></li>
<li><p><strong>eps</strong> smoothing factor. <cite>rae</cite> is not defined in cases in which the true distribution contains zeros; <cite>eps</cite>
is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the sample size. If <cite>eps=None</cite>, the sample size
will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
<li><p><strong>eps</strong> smoothing factor. <cite>rae</cite> is not defined in cases in which the true distribution
contains zeros; <cite>eps</cite> is typically set to be <span class="math notranslate nohighlight">\(\frac{1}{2T}\)</span>, with <span class="math notranslate nohighlight">\(T\)</span> the
sample size. If <cite>eps=None</cite>, the sample size will be taken from the environment variable
<cite>SAMPLE_SIZE</cite> (which has thus to be set beforehand).</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
@ -438,10 +477,11 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function">
<dt class="sig sig-object py" id="quapy.error.se">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">se</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">p</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">p_hat</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.se" title="Permalink to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">se</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prevs_hat</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.se" title="Permalink to this definition"></a></dt>
<dd><dl class="simple">
<dt>Computes the squared error between the two prevalence vectors.</dt><dd><p>Squared error between two prevalence vectors <span class="math notranslate nohighlight">\(p\)</span> and <span class="math notranslate nohighlight">\(\hat{p}\)</span> is computed as
<span class="math notranslate nohighlight">\(SE(p,\hat{p})=\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}(\hat{p}(y)-p(y))^2\)</span>, where
<span class="math notranslate nohighlight">\(SE(p,\hat{p})=\frac{1}{|\mathcal{Y}|}\sum_{y\in \mathcal{Y}}(\hat{p}(y)-p(y))^2\)</span>,
where
<span class="math notranslate nohighlight">\(\mathcal{Y}\)</span> are the classes of interest.</p>
</dd>
</dl>
@ -462,7 +502,8 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dt class="sig sig-object py" id="quapy.error.smooth">
<span class="sig-prename descclassname"><span class="pre">quapy.error.</span></span><span class="sig-name descname"><span class="pre">smooth</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eps</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.error.smooth" title="Permalink to this definition"></a></dt>
<dd><p>Smooths a prevalence distribution with <span class="math notranslate nohighlight">\(\epsilon\)</span> (<cite>eps</cite>) as:
<span class="math notranslate nohighlight">\(\underline{p}(y)=\frac{\epsilon+p(y)}{\epsilon|\mathcal{Y}|+\displaystyle\sum_{y\in \mathcal{Y}}p(y)}\)</span></p>
<span class="math notranslate nohighlight">\(\underline{p}(y)=\frac{\epsilon+p(y)}{\epsilon|\mathcal{Y}|+
\displaystyle\sum_{y\in \mathcal{Y}}p(y)}\)</span></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
@ -601,7 +642,7 @@ convenient or not. Set to False to deactivate.</p></li>
</div>
<span class="target" id="module-quapy.protocol"></span><dl class="py class">
<dt class="sig sig-object py" id="quapy.protocol.APP">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">quapy.protocol.</span></span><span class="sig-name descname"><span class="pre">APP</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="quapy.data.html#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">sample_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">10</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smooth_limits_epsilon</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_type</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'sample_prev'</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.protocol.APP" title="Permalink to this definition"></a></dt>
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">quapy.protocol.</span></span><span class="sig-name descname"><span class="pre">APP</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="quapy.data.html#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">sample_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">10</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smooth_limits_epsilon</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sanity_check</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">10000</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_type</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'sample_prev'</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.protocol.APP" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#quapy.protocol.AbstractStochasticSeededProtocol" title="quapy.protocol.AbstractStochasticSeededProtocol"><code class="xref py py-class docutils literal notranslate"><span class="pre">AbstractStochasticSeededProtocol</span></code></a>, <a class="reference internal" href="#quapy.protocol.OnLabelledCollectionProtocol" title="quapy.protocol.OnLabelledCollectionProtocol"><code class="xref py py-class docutils literal notranslate"><span class="pre">OnLabelledCollectionProtocol</span></code></a></p>
<p>Implementation of the artificial prevalence protocol (APP).
The APP consists of exploring a grid of prevalence values containing <cite>n_prevalences</cite> points (e.g.,
@ -621,6 +662,8 @@ grid (default is 21)</p></li>
<li><p><strong>smooth_limits_epsilon</strong> the quantity to add and subtract to the limits 0 and 1</p></li>
<li><p><strong>random_state</strong> allows replicating samples across runs (default 0, meaning that the sequence of samples
will be the same every time the protocol is called)</p></li>
<li><p><strong>sanity_check</strong> – int; raises an exception, warning the user, if the number of examples to be generated exceeds
this number; set to None to skip this check</p></li>
<li><p><strong>return_type</strong> set to “sample_prev” (default) to get the pairs of (sample, prevalence) at each iteration, or
to “labelled_collection” to get instead instances of LabelledCollection</p></li>
</ul>
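For instance, a minimal usage sketch of this protocol (hypothetical toy data; parameter names follow the signature above):

import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import APP

# toy labelled collection (hypothetical data)
X = np.random.rand(1000, 2)
y = np.random.randint(0, 2, size=1000)
val = LabelledCollection(X, y)

# a grid of 11 prevalence points, 1 repeat each; an exception is raised if more
# than sanity_check samples would be generated
prot = APP(val, sample_size=100, n_prevalences=11, repeats=1, sanity_check=10000)
for sample, prev in prot():
    ...  # each sample is drawn at the prevalence vector `prev`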
@ -1819,6 +1862,7 @@ this function is invoked, it loads the pickled resource. Example:</p>
</section>
<section id="module-quapy">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-quapy" title="Permalink to this heading"></a></h2>
<p>QuaPy module for quantification</p>
</section>
</section>


@ -1064,11 +1064,6 @@ validation data, or as an integer, indicating that the misclassification rates s
<span class="sig-prename descclassname"><span class="pre">quapy.method.aggregative.</span></span><span class="sig-name descname"><span class="pre">cross_generate_predictions</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classifier</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">val_split</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">probabilistic</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fit_classifier</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.method.aggregative.cross_generate_predictions" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.method.aggregative.cross_generate_predictions_depr">
<span class="sig-prename descclassname"><span class="pre">quapy.method.aggregative.</span></span><span class="sig-name descname"><span class="pre">cross_generate_predictions_depr</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classifier</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">val_split</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">probabilistic</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fit_classifier</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">method_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.method.aggregative.cross_generate_predictions_depr" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.method.aggregative.newELM">
<span class="sig-prename descclassname"><span class="pre">quapy.method.aggregative.</span></span><span class="sig-name descname"><span class="pre">newELM</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">svmperf_base</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loss</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'01'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">C</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.method.aggregative.newELM" title="Permalink to this definition"></a></dt>

File diff suppressed because one or more lines are too long

152
examples/uci_experiments.py Normal file

@ -0,0 +1,152 @@
from copy import deepcopy
import quapy as qp
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from quapy.classification.methods import LowRankLogisticRegression
from quapy.method.meta import QuaNet
from quapy.protocol import APP
from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE
from quapy.method.meta import EHDy
import numpy as np
import os
import pickle
import itertools
import argparse
import torch
import shutil
N_JOBS = -1
CUDA_N_JOBS = 2
ENSEMBLE_N_JOBS = -1
qp.environ['SAMPLE_SIZE'] = 100
def newLR():
return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
def calibratedLR():
return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))
__C_range = np.logspace(-3, 3, 7)
lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
svmperf_params = {'classifier__C': __C_range}
def quantification_models():
yield 'cc', CC(newLR()), lr_params
yield 'acc', ACC(newLR()), lr_params
yield 'pcc', PCC(newLR()), lr_params
yield 'pacc', PACC(newLR()), lr_params
yield 'MAX', MAX(newLR()), lr_params
yield 'MS', MS(newLR()), lr_params
yield 'MS2', MS2(newLR()), lr_params
yield 'sldc', EMQ(newLR(), recalib='platt'), lr_params
yield 'svmmae', newSVMAE(), svmperf_params
yield 'hdy', HDy(newLR()), lr_params
def quantification_cuda_models():
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Running QuaNet on {device}')
learner = LowRankLogisticRegression()
yield 'quanet', QuaNet(learner, checkpointdir=args.checkpointdir, device=device), lr_params
def evaluate_experiment(true_prevalences, estim_prevalences):
print('\nEvaluation Metrics:\n' + '=' * 22)
for eval_measure in [qp.error.mae, qp.error.mrae]:
err = eval_measure(true_prevalences, estim_prevalences)
print(f'\t{eval_measure.__name__}={err:.4f}')
print()
def result_path(path, dataset_name, model_name, run, optim_loss):
return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl')
def is_already_computed(dataset_name, model_name, run, optim_loss):
return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss))
def save_results(dataset_name, model_name, run, optim_loss, *results):
rpath = result_path(args.results, dataset_name, model_name, run, optim_loss)
qp.util.create_parent_dir(rpath)
with open(rpath, 'wb') as foo:
pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)
def run(experiment):
optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return
collection = qp.datasets.fetch_UCILabelledCollection(dataset_name)
for run, data in enumerate(qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1)):
if is_already_computed(dataset_name, model_name, run=run, optim_loss=optim_loss):
print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5 already computed.')
continue
print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')
# model selection (hyperparameter optimization for a quantification-oriented loss)
train, test = data.train_test
train, val = train.split_stratified()
if hyperparams is not None:
model_selection = qp.model_selection.GridSearchQ(
deepcopy(model),
param_grid=hyperparams,
protocol=APP(val, n_prevalences=21, repeats=25),
error=optim_loss,
refit=True,
timeout=60*60,
verbose=True
)
model_selection.fit(data.training)
model = model_selection.best_model()
best_params = model_selection.best_params_
else:
model.fit(data.training)
best_params = {}
# model evaluation
true_prevalences, estim_prevalences = qp.evaluation.prediction(
model,
protocol=APP(test, n_prevalences=21, repeats=100)
)
test_true_prevalence = data.test.prevalence()
evaluate_experiment(true_prevalences, estim_prevalences)
save_results(dataset_name, model_name, run, optim_loss,
true_prevalences, estim_prevalences,
data.training.prevalence(), test_true_prevalence,
best_params)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Run experiments for UCI ML datasets')
parser.add_argument('results', metavar='RESULT_PATH', type=str,
help='path to the directory where to store the results')
parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
help='path to the directory with svmperf')
parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint',
help='path to the directory where to dump QuaNet checkpoints')
args = parser.parse_args()
print(f'Result folder: {args.results}')
np.random.seed(0)
qp.environ['SVMPERF_HOME'] = args.svmperfpath
optim_losses = ['mae']
datasets = qp.datasets.UCI_DATASETS
models = quantification_models()
qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)
models = quantification_cuda_models()
qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=CUDA_N_JOBS)
shutil.rmtree(args.checkpointdir, ignore_errors=True)


@ -0,0 +1,244 @@
from scipy.sparse import csc_matrix, csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
import numpy as np
from joblib import Parallel, delayed
import sklearn
import math
from scipy.stats import t
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def __ig_factor(p_tc, p_t, p_c):
den = p_t * p_c
if den != 0.0 and p_tc != 0:
return p_tc * math.log(p_tc / den, 2)
else:
return 0.0
def information_gain(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
def squared_information_gain(cell):
return information_gain(cell)**2
def posneg_information_gain(cell):
ig = information_gain(cell)
if cell.tpr() < cell.fpr():
return -ig
else:
return ig
def pos_information_gain(cell):
if cell.tpr() < cell.fpr():
return 0
else:
return information_gain(cell)
def pointwise_mutual_information(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
def gss(cell):
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
def chi_square(cell):
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
if den==0.0: return 0.0
num = gss(cell)**2
return num / den
def conf_interval(xt, n):
if n>30:
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
else:
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
p = (xt + 0.5 * z2) / (n + z2)
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
return p, amplitude
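The interval center p = (xt + z^2/2)/(n + z^2) is the Wilson score midpoint for the proportion xt/n at 95% confidence, and the half-width (amplitude) is a closely related approximation. A quick numeric check (hypothetical usage, not part of the module):

p, amp = conf_interval(30, 100)   # 30 positive outcomes out of n=100 trials
print(f'{p:.3f} +/- {amp:.3f}')   # -> 0.307 +/- 0.087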
def strength(minPosRelFreq, minPos, maxNeg):
if minPos > maxNeg:
return math.log(2.0 * minPosRelFreq, 2.0)
else:
return 0.0
# set cancel_features=True to allow some features to be weighted as 0 (as in the original article);
# however, for some extremely imbalanced datasets this caused all documents to be weighted 0
def conf_weight(cell, cancel_features=False):
c = cell.get_c()
not_c = cell.get_not_c()
tp = cell.tp
fp = cell.fp
pos_p, pos_amp = conf_interval(tp, c)
neg_p, neg_amp = conf_interval(fp, not_c)
min_pos = pos_p-pos_amp
max_neg = neg_p+neg_amp
den = (min_pos + max_neg)
minpos_relfreq = min_pos / (den if den != 0 else 1)
str_tplus = strength(minpos_relfreq, min_pos, max_neg)
if str_tplus == 0 and not cancel_features:
return 1e-20
return str_tplus
def get_tsr_matrix(cell_matrix, tsr_score_function):
nC = len(cell_matrix)
nF = len(cell_matrix[0])
tsr_matrix = [[tsr_score_function(cell_matrix[c, f]) for f in range(nF)] for c in range(nC)]
return np.array(tsr_matrix)
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
tp_ = len(positive_document_indexes & feature_document_indexes)
fp_ = len(feature_document_indexes - positive_document_indexes)
fn_ = len(positive_document_indexes - feature_document_indexes)
tn_ = nD - (tp_ + fp_ + fn_)
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
def category_tables(feature_sets, category_sets, c, nD, nF):
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
"""
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
Efficiency O(nF x nC x log(S)) where S is the sparse factor
"""
nD, nF = coocurrence_matrix.shape
nD2, nC = label_matrix.shape
if nD != nD2:
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
(coocurrence_matrix.shape,label_matrix.shape))
def nonzero_set(matrix, col):
return set(matrix[:, col].nonzero()[0])
if isinstance(coocurrence_matrix, csr_matrix):
coocurrence_matrix = csc_matrix(coocurrence_matrix)
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
return np.array(cell_matrix)
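A minimal sketch of the expected input/output (toy data; assumes the functions above are in scope):

import numpy as np
from scipy.sparse import csr_matrix

# 3 documents x 2 features, and 3 documents x 2 classes (one-hot labels)
docs = csr_matrix(np.array([[1, 0], [0, 1], [1, 1]]))
labels = np.array([[1, 0], [0, 1], [1, 0]])
cells = get_supervised_matrix(docs, labels)     # object array of shape (2 classes, 2 features)
print(cells[0, 0].tp, cells[0, 0].fn)           # -> 2 0 (feature 0 occurs in both class-0 docs)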
class TSRweighting(BaseEstimator,TransformerMixin):
"""
Supervised Term Weighting function based on any Term Selection Reduction (TSR) function (e.g., information gain,
chi-square, etc.) or, more generally, on any function that could be computed on the 4-cell contingency table for
each category-feature pair.
The supervised_4cell_matrix (a CxF matrix containing the 4-cell contingency tables
for each category-feature pair) can be pre-computed (e.g., during the feature selection phase) and passed as an
argument.
When C>1, i.e., in multiclass scenarios, a global_policy is used in order to determine a single feature-score which
informs about its relevance. Accepted policies include "max" (takes the max score across categories), "ave" and "wave"
(take the average, or weighted average, across all categories -- weights correspond to the class prevalence), and "sum"
(which sums all category scores).
"""
def __init__(self, tsr_function, global_policy='max', supervised_4cell_matrix=None, sublinear_tf=True, norm='l2', min_df=3, n_jobs=-1):
if global_policy not in ['max', 'ave', 'wave', 'sum']: raise ValueError('Global policy should be in {"max", "ave", "wave", "sum"}')
self.tsr_function = tsr_function
self.global_policy = global_policy
self.supervised_4cell_matrix = supervised_4cell_matrix
self.sublinear_tf=sublinear_tf
self.norm=norm
self.min_df = min_df
self.n_jobs=n_jobs
def fit(self, X, y):
self.count_vectorizer = CountVectorizer(min_df=self.min_df)
X = self.count_vectorizer.fit_transform(X)
self.tf_vectorizer = TfidfTransformer(
norm=None, use_idf=False, smooth_idf=False, sublinear_tf=self.sublinear_tf).fit(X)
if len(y.shape) == 1:
y = np.expand_dims(y, axis=1)
nD, nC = y.shape
nF = len(self.tf_vectorizer.get_feature_names_out())
if self.supervised_4cell_matrix is None:
self.supervised_4cell_matrix = get_supervised_matrix(X, y, n_jobs=self.n_jobs)
else:
if self.supervised_4cell_matrix.shape != (nC, nF): raise ValueError("Shape of supervised information matrix is inconsistent with X and y")
tsr_matrix = get_tsr_matrix(self.supervised_4cell_matrix, self.tsr_function)
if self.global_policy == 'ave':
self.global_tsr_vector = np.average(tsr_matrix, axis=0)
elif self.global_policy == 'wave':
category_prevalences = [sum(y[:,c])*1.0/nD for c in range(nC)]
self.global_tsr_vector = np.average(tsr_matrix, axis=0, weights=category_prevalences)
elif self.global_policy == 'sum':
self.global_tsr_vector = np.sum(tsr_matrix, axis=0)
elif self.global_policy == 'max':
self.global_tsr_vector = np.amax(tsr_matrix, axis=0)
return self
def fit_transform(self, X, y):
return self.fit(X,y).transform(X)
def transform(self, X):
if not hasattr(self, 'global_tsr_vector'): raise NameError('TSRweighting: transform method called before fit.')
X = self.count_vectorizer.transform(X)
tf_X = self.tf_vectorizer.transform(X).toarray()
weighted_X = np.multiply(tf_X, self.global_tsr_vector)
if self.norm is not None and self.norm!='none':
weighted_X = sklearn.preprocessing.normalize(weighted_X, norm=self.norm, axis=1, copy=False)
return csr_matrix(weighted_X)
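A usage sketch on a toy corpus (hypothetical data; information_gain is the TSR function defined above, and the sketch assumes a scikit-learn version in which TfidfTransformer implements get_feature_names_out):

import numpy as np

docs = ['good movie', 'bad movie', 'good plot', 'bad plot'] * 10
labels = np.array([1, 0, 1, 0] * 10)

tsr = TSRweighting(tsr_function=information_gain, global_policy='max', min_df=1)
Xw = tsr.fit_transform(docs, labels)   # csr_matrix of shape (40, 4): bad, good, movie, plot
print(Xw.shape)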


0
laboratory/main.py Normal file

148
laboratory/method_dxs.py Normal file

@ -0,0 +1,148 @@
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
from data import LabelledCollection
import numpy as np
from laboratory.custom_vectorizers import *
from protocol import APP
from quapy.method.aggregative import _get_divergence, HDy, DistributionMatching
from quapy.method.base import BaseQuantifier
from scipy import optimize
import pandas as pd
# TODO: explore the bernoulli (term presence/absence) variant
# TODO: explore the multinomial (term frequency) variant
# TODO: explore the multinomial + length normalization variant
# TODO: consolidate the TSR-variant (e.g., using information gain) variant;
# - works better with the idf?
# - works better with length normalization?
# - etc
class DxS(BaseQuantifier):
def __init__(self, vectorizer=None, divergence='topsoe'):
self.vectorizer = vectorizer
self.divergence = divergence
# def __as_distribution(self, instances):
# return np.asarray(instances.sum(axis=0) / instances.sum()).flatten()
def __as_distribution(self, instances):
dist = instances.sum(axis=0) / instances.sum()
return np.asarray(dist).flatten()
def fit(self, data: LabelledCollection):
text_instances, labels = data.Xy
if self.vectorizer is not None:
text_instances = self.vectorizer.fit_transform(text_instances, y=labels)
distributions = []
for class_i in data.classes_:
distributions.append(self.__as_distribution(text_instances[labels == class_i]))
self.validation_distribution = np.asarray(distributions)
return self
def quantify(self, text_instances):
if self.vectorizer is not None:
text_instances = self.vectorizer.transform(text_instances)
test_distribution = self.__as_distribution(text_instances)
divergence = _get_divergence(self.divergence)
n_classes, n_feats = self.validation_distribution.shape
def match(prev):
prev = np.expand_dims(prev, axis=0)
mixture_distribution = (prev @ self.validation_distribution).flatten()
return divergence(test_distribution, mixture_distribution)
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
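A minimal usage sketch of DxS on toy data (hypothetical corpus; the experimental driver below exercises it on the reviews datasets):

from sklearn.feature_extraction.text import CountVectorizer
from quapy.data import LabelledCollection

texts = ['good movie', 'bad movie', 'nice plot', 'awful plot'] * 25
labels = [1, 0, 1, 0] * 25
train = LabelledCollection(texts, labels)

dxs = DxS(vectorizer=CountVectorizer(min_df=1, binary=True), divergence='topsoe')
dxs.fit(train)
print(dxs.quantify(['good nice movie', 'bad awful plot']))  # ~[0.5, 0.5] by symmetry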
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 250
qp.environ['N_JOBS'] = -1
min_df = 10
# dataset = 'imdb'
repeats = 10
error = 'mae'
div = 'HD'
# generates tuples (dataset, method, method_name)
# (the dataset is needed for methods that process the dataset differently)
def gen_methods():
for dataset in qp.datasets.REVIEWS_SENTIMENT_DATASETS:
data = qp.datasets.fetch_reviews(dataset, tfidf=False)
bernoulli_vectorizer = CountVectorizer(min_df=min_df, binary=True)
dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer)
yield data, dxs, 'DxS-Bernoulli'
multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False)
dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer)
yield data, dxs, 'DxS-multinomial'
tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None)
dxs = DxS(divergence=div, vectorizer=tf_vectorizer)
yield data, dxs, 'DxS-TF'
logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None)
dxs = DxS(divergence=div, vectorizer=logtf_vectorizer)
yield data, dxs, 'DxS-logTF'
tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None)
dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
yield data, dxs, 'DxS-TFIDF'
tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2')
dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
yield data, dxs, 'DxS-TFIDF-l2'
tsr_vectorizer = TSRweighting(tsr_function=information_gain, min_df=min_df, norm='l2')
dxs = DxS(divergence=div, vectorizer=tsr_vectorizer)
yield data, dxs, 'DxS-TFTSR-l2'
data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=min_df)
hdy = HDy(LogisticRegression())
yield data, hdy, 'HDy'
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5)
yield data, dm, 'DM-5b'
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10)
yield data, dm, 'DM-10b'
result_path = 'results.csv'
with open(result_path, 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\n')
for data, quantifier, quant_name in gen_methods():
quantifier.fit(data.training)
report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae','mrae'], verbose=True)
means = report.mean()
csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
df = pd.read_csv(result_path, sep='\t')
# print(df)
pv = df.pivot_table(index='Method', columns="Dataset", values=["MAE", "MRAE"])
print(pv)

168
laboratory/method_kdey.py Normal file

@ -0,0 +1,168 @@
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.neighbors import KernelDensity
import quapy as qp
from data import LabelledCollection
from protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
DistributionMatching, _get_divergence
import scipy
from scipy import optimize
class KDEy(AggregativeProbabilisticQuantifier):
BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
ENGINE = ['scipy', 'sklearn']
def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='HD',
bandwidth_method='scott', engine='sklearn', n_jobs=None):
self.classifier = classifier
self.val_split = val_split
self.divergence = divergence
self.bandwidth_method = bandwidth_method
self.engine = engine
self.n_jobs = n_jobs
assert bandwidth_method in KDEy.BANDWIDTH_METHOD, f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
def get_kde(self, posteriors):
if self.engine == 'scipy':
# scipy treats columns as datapoints, and requires that the datapoints do not lie in a
# lower-dimensional subspace; hence the last dimension (which is constrained, since the
# posteriors sum to 1) is removed
posteriors = posteriors[:, :-1].T
kde = scipy.stats.gaussian_kde(posteriors)
kde.set_bandwidth(self.bandwidth_method)
elif self.engine == 'sklearn':
kde = KernelDensity(bandwidth=self.bandwidth_method).fit(posteriors)
return kde
def pdf(self, kde, posteriors):
if self.engine == 'scipy':
return kde(posteriors[:,:-1].T)
elif self.engine == 'sklearn':
return np.exp(kde.score_samples(posteriors))
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
"""
Trains the classifier (if requested) and estimates, via kernel density estimation (KDE), one
density of posterior probabilities per class out of the validation data. In particular, let `V`
be the list of validation densities; `V[i]` is the KDE fit on the posterior probabilities of
the validation instances labelled with class `i`.
:param data: the training set
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
to estimate the parameters
"""
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
self.val_densities = [self.get_kde(posteriors[y == cat]) for cat in range(data.n_classes)]
self.val_posteriors = posteriors
return self
def val_pdf(self, prev):
"""
Returns a function that computes the mixture model with the given prev as mixture factor
:param prev: a prevalence vector, ndarray
:return: a function implementing the validation distribution with fixed mixture factor
"""
return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
def aggregate(self, posteriors: np.ndarray):
"""
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
(the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
The divergence is computed between the likelihood that the test density assigns to the test
posteriors and the likelihood that the mixture of class-wise validation densities assigns to them.
:param posteriors: posterior probabilities of the instances in the sample
:return: a vector of class prevalence estimates
"""
test_density = self.get_kde(posteriors)
# val_test_posteriors = np.concatenate([self.val_posteriors, posteriors])
test_likelihood = self.pdf(test_density, posteriors)
divergence = _get_divergence(self.divergence)
n_classes = len(self.val_densities)
def match(prev):
val_pdf = self.val_pdf(prev)
val_likelihood = val_pdf(posteriors)
return divergence(val_likelihood, test_likelihood)
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
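A minimal usage sketch (hypothetical toy data; assumes a scikit-learn version whose KernelDensity accepts bandwidth='scott'):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection

X = np.vstack([np.random.normal(0, 1, (500, 2)), np.random.normal(2, 1, (500, 2))])
y = np.array([0] * 500 + [1] * 500)
train = LabelledCollection(X, y)

kdey = KDEy(LogisticRegression(), val_split=0.4, divergence='HD', engine='sklearn')
kdey.fit(train)
print(kdey.quantify(X[:300]))   # all class-0 instances: most mass expected on the first component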
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1
div = 'HD'
# generates tuples (dataset, method, method_name)
# (the dataset is needed for methods that process the dataset differently)
def gen_methods():
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True)
# kdey = KDEy(LogisticRegression(), divergence=div, bandwidth_method='scott')
# yield data, kdey, f'KDEy-{div}-scott'
kdey = KDEy(LogisticRegression(), divergence=div, bandwidth_method='silverman', engine='sklearn')
yield data, kdey, f'KDEy-{div}-silverman'
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5)
yield data, dm, f'DM-5b-{div}'
# dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10)
# yield data, dm, f'DM-10b-{div}'
result_path = 'results_kdey.csv'
with open(result_path, 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\n')
for data, quantifier, quant_name in gen_methods():
quantifier.fit(data.training)
protocol = UPP(data.test, repeats=100)
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae','mrae'], verbose=True)
means = report.mean()
csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
csv.flush()
df = pd.read_csv(result_path, sep='\t')
# print(df)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
print(pv)


@ -1,6 +1,7 @@
"""QuaPy module for quantification"""
from quapy.data import datasets
from . import error
from . import data
from quapy.data import datasets
from . import functional
# from . import method
from . import evaluation
@ -25,7 +26,8 @@ environ = {
def _get_njobs(n_jobs):
"""
If `n_jobs` is None, then it returns `environ['N_JOBS']`; if otherwise, returns `n_jobs`.
If `n_jobs` is None, then it returns `environ['N_JOBS']`;
otherwise, it returns `n_jobs`.
:param n_jobs: the number of `n_jobs` or None if not specified
:return: int
@ -35,7 +37,8 @@ def _get_njobs(n_jobs):
def _get_sample_size(sample_size):
"""
If `sample_size` is None, then it returns `environ['SAMPLE_SIZE']`; if otherwise, returns `sample_size`.
If `sample_size` is None, then it returns `environ['SAMPLE_SIZE']`;
otherwise, it returns `sample_size`.
If none of these are set, then a ValueError exception is raised.
:param sample_size: integer or None
@ -45,6 +48,3 @@ def _get_sample_size(sample_size):
if sample_size is None:
raise ValueError('neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
return sample_size


@ -19,7 +19,7 @@ class LowRankLogisticRegression(BaseEstimator):
def __init__(self, n_components=100, **kwargs):
self.n_components = n_components
self.learner = LogisticRegression(**kwargs)
self.classifier = LogisticRegression(**kwargs)
def get_params(self):
"""
@ -28,7 +28,7 @@ class LowRankLogisticRegression(BaseEstimator):
:return: a dictionary with parameter names mapped to their values
"""
params = {'n_components': self.n_components}
params.update(self.learner.get_params())
params.update(self.classifier.get_params())
return params
def set_params(self, **params):
@ -43,7 +43,7 @@ class LowRankLogisticRegression(BaseEstimator):
if 'n_components' in params_:
self.n_components = params_['n_components']
del params_['n_components']
self.learner.set_params(**params_)
self.classifier.set_params(**params_)
def fit(self, X, y):
"""
@ -59,8 +59,8 @@ class LowRankLogisticRegression(BaseEstimator):
if nF > self.n_components:
self.pca = TruncatedSVD(self.n_components).fit(X)
X = self.transform(X)
self.learner.fit(X, y)
self.classes_ = self.learner.classes_
self.classifier.fit(X, y)
self.classes_ = self.classifier.classes_
return self
def predict(self, X):
@ -72,7 +72,7 @@ class LowRankLogisticRegression(BaseEstimator):
instances in `X`
"""
X = self.transform(X)
return self.learner.predict(X)
return self.classifier.predict(X)
def predict_proba(self, X):
"""
@ -82,7 +82,7 @@ class LowRankLogisticRegression(BaseEstimator):
:return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities
"""
X = self.transform(X)
return self.learner.predict_proba(X)
return self.classifier.predict_proba(X)
def transform(self, X):
"""


@ -207,7 +207,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
"""
Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
@ -223,7 +223,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
>>> import quapy as qp
>>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
>>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
>>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
@ -233,7 +233,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
~/quapy_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.Dataset` instance
:return: a :class:`quapy.data.base.LabelledCollection` instance
"""
assert dataset_name in UCI_DATASETS, \


@ -1,10 +1,13 @@
import quapy as qp
"""Implementation of error measures used for quantification"""
import numpy as np
from sklearn.metrics import f1_score
import quapy as qp
def from_name(err_name):
"""Gets an error function from its name. E.g., `from_name("mae")` will return function :meth:`quapy.error.mae`
"""Gets an error function from its name. E.g., `from_name("mae")`
will return function :meth:`quapy.error.mae`
:param err_name: string, the error name
:return: a callable implementing the requested error
@ -15,10 +18,12 @@ def from_name(err_name):
def f1e(y_true, y_pred):
"""F1 error: simply computes the error in terms of macro :math:`F_1`, i.e., :math:`1-F_1^M`,
where :math:`F_1` is the harmonic mean of precision and recall, defined as :math:`\\frac{2tp}{2tp+fp+fn}`,
with `tp`, `fp`, and `fn` standing for true positives, false positives, and false negatives, respectively.
`Macro` averaging means the :math:`F_1` is computed for each category independently, and then averaged.
"""F1 error: simply computes the error in terms of macro :math:`F_1`, i.e.,
:math:`1-F_1^M`, where :math:`F_1` is the harmonic mean of precision and recall,
defined as :math:`\\frac{2tp}{2tp+fp+fn}`, with `tp`, `fp`, and `fn` standing
for true positives, false positives, and false negatives, respectively.
`Macro` averaging means the :math:`F_1` is computed for each category independently,
and then averaged.
:param y_true: array-like of true labels
:param y_pred: array-like of predicted labels
@ -28,8 +33,9 @@ def f1e(y_true, y_pred):
def acce(y_true, y_pred):
"""Computes the error in terms of 1-accuracy. The accuracy is computed as :math:`\\frac{tp+tn}{tp+fp+fn+tn}`, with
`tp`, `fp`, `fn`, and `tn` standing for true positives, false positives, false negatives, and true negatives,
"""Computes the error in terms of 1-accuracy. The accuracy is computed as
:math:`\\frac{tp+tn}{tp+fp+fn+tn}`, with `tp`, `fp`, `fn`, and `tn` standing
for true positives, false positives, false negatives, and true negatives,
respectively
:param y_true: array-like of true labels
@ -43,7 +49,8 @@ def mae(prevs, prevs_hat):
"""Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs.
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
prevalence values
:return: mean absolute error
"""
return ae(prevs, prevs_hat).mean()
@ -52,7 +59,7 @@ def mae(prevs, prevs_hat):
def ae(prevs, prevs_hat):
"""Computes the absolute error between the two prevalence vectors.
Absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as
:math:`AE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\in \mathcal{Y}}|\\hat{p}(y)-p(y)|`,
:math:`AE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}|\\hat{p}(y)-p(y)|`,
where :math:`\\mathcal{Y}` are the classes of interest.
:param prevs: array-like of shape `(n_classes,)` with the true prevalence values
@ -66,129 +73,153 @@ def ae(prevs, prevs_hat):
def mse(prevs, prevs_hat):
"""Computes the mean squared error (see :meth:`quapy.error.se`) across the sample pairs.
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values
:param prevs: array-like of shape `(n_samples, n_classes,)` with the
true prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the
predicted prevalence values
:return: mean squared error
"""
return se(prevs, prevs_hat).mean()
def se(p, p_hat):
def se(prevs, prevs_hat):
"""Computes the squared error between the two prevalence vectors.
Squared error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as
:math:`SE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\in \mathcal{Y}}(\\hat{p}(y)-p(y))^2`, where
:math:`SE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}(\\hat{p}(y)-p(y))^2`,
where
:math:`\\mathcal{Y}` are the classes of interest.
:param prevs: array-like of shape `(n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
:return: absolute error
"""
return ((p_hat-p)**2).mean(axis=-1)
return ((prevs_hat - prevs) ** 2).mean(axis=-1)
def mkld(prevs, prevs_hat, eps=None):
"""Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the sample pairs.
The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`).
"""Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the
sample pairs. The distributions are smoothed using the `eps` factor
(see :meth:`quapy.error.smooth`).
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values
:param eps: smoothing factor. KLD is not defined in cases in which the distributions contain zeros; `eps`
is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. If `eps=None`, the sample size
will be taken from the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand).
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true
prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
prevalence values
:param eps: smoothing factor. KLD is not defined in cases in which the distributions contain
zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size.
If `eps=None`, the sample size will be taken from the environment variable `SAMPLE_SIZE`
(which has thus to be set beforehand).
:return: mean Kullback-Leibler distribution
"""
return kld(prevs, prevs_hat, eps).mean()
def kld(p, p_hat, eps=None):
def kld(prevs, prevs_hat, eps=None):
"""Computes the Kullback-Leibler divergence between the two prevalence distributions.
Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` is computed as
:math:`KLD(p,\\hat{p})=D_{KL}(p||\\hat{p})=\\sum_{y\\in \\mathcal{Y}} p(y)\\log\\frac{p(y)}{\\hat{p}(y)}`, where
:math:`\\mathcal{Y}` are the classes of interest.
The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`).
:param prevs: array-like of shape `(n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
:param eps: smoothing factor. KLD is not defined in cases in which the distributions contain zeros; `eps`
is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. If `eps=None`, the sample size
will be taken from the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand).
:return: Kullback-Leibler divergence between the two distributions
"""
eps = __check_eps(eps)
sp = p+eps
sp_hat = p_hat + eps
return (sp*np.log(sp/sp_hat)).sum(axis=-1)
def mnkld(prevs, prevs_hat, eps=None):
"""Computes the mean Normalized Kullback-Leibler divergence (see :meth:`quapy.error.nkld`) across the sample pairs.
The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`).
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values
:param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain zeros; `eps`
is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. If `eps=None`, the sample size
will be taken from the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand).
:return: mean Normalized Kullback-Leibler distribution
"""
return nkld(prevs, prevs_hat, eps).mean()
def nkld(p, p_hat, eps=None):
"""Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions.
Normalized Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}`
is computed as :math:`NKLD(p,\\hat{p}) = 2\\frac{e^{KLD(p,\\hat{p})}}{e^{KLD(p,\\hat{p})}+1}-1`, where
:math:`\\mathcal{Y}` are the classes of interest.
The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`).
:param prevs: array-like of shape `(n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
:param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain zeros; `eps`
is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. If `eps=None`, the sample size
will be taken from the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand).
:return: Normalized Kullback-Leibler divergence between the two distributions
"""
ekld = np.exp(kld(p, p_hat, eps))
return 2. * ekld / (1 + ekld) - 1.
def mrae(p, p_hat, eps=None):
"""Computes the mean relative absolute error (see :meth:`quapy.error.rae`) across the sample pairs.
The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`).
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values
:param eps: smoothing factor. `mrae` is not defined in cases in which the true distribution contains zeros; `eps`
is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. If `eps=None`, the sample size
will be taken from the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand).
:return: mean relative absolute error
"""
return rae(p, p_hat, eps).mean()
def rae(p, p_hat, eps=None):
"""Computes the absolute relative error between the two prevalence vectors.
Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as
:math:`RAE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\in \mathcal{Y}}\\frac{|\\hat{p}(y)-p(y)|}{p(y)}`,
Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}`
is computed as
:math:`KLD(p,\\hat{p})=D_{KL}(p||\\hat{p})=
\\sum_{y\\in \\mathcal{Y}} p(y)\\log\\frac{p(y)}{\\hat{p}(y)}`,
where :math:`\\mathcal{Y}` are the classes of interest.
The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`).
:param prevs: array-like of shape `(n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
:param eps: smoothing factor. `rae` is not defined in cases in which the true distribution contains zeros; `eps`
is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. If `eps=None`, the sample size
will be taken from the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand).
:param eps: smoothing factor. KLD is not defined in cases in which the distributions contain
zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size.
If `eps=None`, the sample size will be taken from the environment variable `SAMPLE_SIZE`
(which has thus to be set beforehand).
:return: Kullback-Leibler divergence between the two distributions
"""
eps = __check_eps(eps)
smooth_prevs = prevs + eps
smooth_prevs_hat = prevs_hat + eps
return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1)
def mnkld(prevs, prevs_hat, eps=None):
"""Computes the mean Normalized Kullback-Leibler divergence (see :meth:`quapy.error.nkld`)
across the sample pairs. The distributions are smoothed using the `eps` factor
(see :meth:`quapy.error.smooth`).
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
prevalence values
:param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain
zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size.
If `eps=None`, the sample size will be taken from the environment variable `SAMPLE_SIZE`
(which has thus to be set beforehand).
:return: mean Normalized Kullback-Leibler distribution
"""
return nkld(prevs, prevs_hat, eps).mean()
def nkld(prevs, prevs_hat, eps=None):
"""Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions.
Normalized Kullback-Leibler divergence between two prevalence distributions :math:`p` and
:math:`\\hat{p}` is computed as
:math:`NKLD(p,\\hat{p}) = 2\\frac{e^{KLD(p,\\hat{p})}}{e^{KLD(p,\\hat{p})}+1}-1`,
where :math:`\\mathcal{Y}` are the classes of interest.
The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`).
:param prevs: array-like of shape `(n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
:param eps: smoothing factor. NKLD is not defined in cases in which the distributions
contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample
size. If `eps=None`, the sample size will be taken from the environment variable
`SAMPLE_SIZE` (which has thus to be set beforehand).
:return: Normalized Kullback-Leibler divergence between the two distributions
"""
ekld = np.exp(kld(prevs, prevs_hat, eps))
return 2. * ekld / (1 + ekld) - 1.
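
# --- Sketch of the normalization effect (not from the patch), assuming quapy is installed:
# since KLD >= 0, the transform 2*e^K/(e^K+1) - 1 maps it into [0, 1), with 0 for a perfect estimate.
# import numpy as np
# import quapy as qp
#
# p_true = np.asarray([0.7, 0.3])
# for p_estim in ([0.7, 0.3], [0.5, 0.5], [0.01, 0.99]):
#     print(qp.error.nkld(p_true, np.asarray(p_estim), eps=1e-3))  # grows towards 1 as the estimate degrades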
def mrae(prevs, prevs_hat, eps=None):
    """Computes the mean relative absolute error (see :meth:`quapy.error.rae`) across
    the sample pairs. The distributions are smoothed using the `eps` factor (see
    :meth:`quapy.error.smooth`).
    :param prevs: array-like of shape `(n_samples, n_classes,)` with the true
        prevalence values
    :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
        prevalence values
    :param eps: smoothing factor. `mrae` is not defined in cases in which the true
        distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`,
        with :math:`T` the sample size. If `eps=None`, the sample size will be taken from
        the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand).
    :return: mean relative absolute error
    """
    return rae(prevs, prevs_hat, eps).mean()
def rae(prevs, prevs_hat, eps=None):
    """Computes the absolute relative error between the two prevalence vectors.
    Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}`
    is computed as
    :math:`RAE(p,\\hat{p})=
    \\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}\\frac{|\\hat{p}(y)-p(y)|}{p(y)}`,
    where :math:`\\mathcal{Y}` are the classes of interest.
    The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`).
    :param prevs: array-like of shape `(n_classes,)` with the true prevalence values
    :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
    :param eps: smoothing factor. `rae` is not defined in cases in which the true distribution
        contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the
        sample size. If `eps=None`, the sample size will be taken from the environment variable
        `SAMPLE_SIZE` (which has thus to be set beforehand).
    :return: relative absolute error
    """
    eps = __check_eps(eps)
    prevs = smooth(prevs, eps)
    prevs_hat = smooth(prevs_hat, eps)
    return (abs(prevs - prevs_hat) / prevs).mean(axis=-1)
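
# --- Usage sketch for the eps=None path (not from the patch), assuming quapy is installed:
# the smoothing factor is derived as 1/(2T) from qp.environ['SAMPLE_SIZE'], which must be set first.
# import numpy as np
# import quapy as qp
#
# qp.environ['SAMPLE_SIZE'] = 100  # T=100, hence eps=1/200
# true_prevs  = np.asarray([[0.6, 0.4], [0.2, 0.8]])  # one row per sample
# estim_prevs = np.asarray([[0.5, 0.5], [0.3, 0.7]])
# print(qp.error.rae(true_prevs[0], estim_prevs[0]))  # error for a single sample
# print(qp.error.mrae(true_prevs, estim_prevs))       # mean across both samples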
def smooth(prevs, eps):
    """Smooths a prevalence distribution with :math:`\\epsilon` (`eps`) as:
    :math:`\\underline{p}(y)=\\frac{\\epsilon+p(y)}{\\epsilon|\\mathcal{Y}|+
    \\displaystyle\\sum_{y\\in \\mathcal{Y}}p(y)}`
    :param prevs: array-like of shape `(n_classes,)` with the true prevalence values
    :param eps: smoothing factor
@@ -200,12 +231,10 @@ def smooth(prevs, eps):
def __check_eps(eps=None):
    if eps is None:
        import quapy as qp
        sample_size = qp.environ['SAMPLE_SIZE']
        if sample_size is None:
            raise ValueError('eps was not defined, and qp.environ["SAMPLE_SIZE"] was not set')
        eps = 1. / (2. * sample_size)
    return eps
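
# --- Numeric sketch of the smoothing that __check_eps feeds into (not from the patch,
# assuming quapy is installed): each class receives eps extra mass and the vector is
# renormalized, so rae/kld stay finite on prevalence vectors containing zeros.
# import numpy as np
# from quapy.error import smooth
#
# print(smooth(np.asarray([0., 1.]), eps=0.005))  # ~[0.00495, 0.99505], still sums to 1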
@@ -217,7 +246,8 @@ CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
QUANTIFICATION_ERROR_SINGLE_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR_SINGLE}
QUANTIFICATION_ERROR_SMOOTH_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR_SMOOTH}
ERROR_NAMES = \
    CLASSIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_SINGLE_NAMES
f1_error = f1e
acc_error = acce
@@ -225,4 +255,3 @@ mean_absolute_error = mae
absolute_error = ae
mean_relative_absolute_error = mrae
relative_absolute_error = rae
View File
@@ -444,24 +444,28 @@ class EMQ(AggregativeProbabilisticQuantifier):

    def __init__(self, classifier: BaseEstimator, exact_train_prev=True, recalib=None):
        self.classifier = classifier
        self.non_calibrated = classifier
        self.exact_train_prev = exact_train_prev
        self.recalib = recalib

    def fit(self, data: LabelledCollection, fit_classifier=True):
        if self.recalib is not None:
            if self.recalib == 'nbvs':
                self.classifier = NBVSCalibration(self.non_calibrated)
            elif self.recalib == 'bcts':
                self.classifier = BCTSCalibration(self.non_calibrated)
            elif self.recalib == 'ts':
                self.classifier = TSCalibration(self.non_calibrated)
            elif self.recalib == 'vs':
                self.classifier = VSCalibration(self.non_calibrated)
            elif self.recalib == 'platt':
                self.classifier = CalibratedClassifierCV(self.non_calibrated, ensemble=False)
            else:
                raise ValueError('invalid param argument for recalibration method; available ones are '
                                 '"nbvs", "bcts", "ts", "vs", and "platt".')
            self.recalib = None
        else:
            self.classifier = self.non_calibrated
        self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
        if self.exact_train_prev:
            self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
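
# --- Hedged usage sketch of the recalibration hook above (the dataset name is just an
# example): keeping the raw learner in self.non_calibrated means repeated fits do not wrap
# an already-wrapped classifier; 'bcts' relies on the abstention package listed in setup.py.
# import quapy as qp
# from quapy.method.aggregative import EMQ
# from sklearn.linear_model import LogisticRegression
#
# train, test = qp.datasets.fetch_UCIDataset('yeast').train_test
# emq = EMQ(LogisticRegression(), recalib='bcts')
# emq.fit(train)
# print(emq.quantify(test.instances))  # estimated class prevalences of the test sample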
@@ -766,7 +770,9 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
        """
        Trains the classifier (if requested) and generates the validation distributions out of the training data.
        The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
        channels (a channel is a description, in the form of a histogram, of a specific class -- there are as many
        channels as classes, although in the binary case one can use only one channel, since the other one is
        constrained), and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
        are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
        distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
        is the fraction of instances with a value in the `k`-th bin.
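
# --- Illustrative sketch of the channel construction described above (not the library code):
# one histogram of P(Y=j|x) per (true class i, channel j) pair, using plain numpy.
# import numpy as np
#
# def validation_distributions(posteriors, labels, n_classes, nbins=8):
#     V = np.zeros((n_classes, n_classes, nbins))
#     for i in range(n_classes):                       # examples labelled with class i
#         Pi = posteriors[labels == i]
#         for j in range(n_classes):                   # channel j: histogram of P(Y=j|X=x)
#             hist, _ = np.histogram(Pi[:, j], bins=nbins, range=(0, 1))
#             V[i, j] = hist / max(1, len(Pi))         # fraction of instances per bin
#     return V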
@@ -815,7 +821,7 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
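
# --- Sketch of the same SLSQP search with a concrete objective (an assumed L2 match
# between the prevalence-weighted mixture of class-wise histograms and the test histogram):
# import numpy as np
# from scipy import optimize
#
# def solve_on_simplex(class_dists, test_dist):
#     # class_dists: (n_classes, nbins); test_dist: (nbins,)
#     n_classes = class_dists.shape[0]
#     match = lambda prev: np.linalg.norm(prev @ class_dists - test_dist)
#     bounds = tuple((0, 1) for _ in range(n_classes))
#     constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})
#     r = optimize.minimize(match, x0=np.full(n_classes, 1/n_classes), method='SLSQP',
#                           bounds=bounds, constraints=constraints)
#     return r.x  # estimated prevalence vector on the unit simplex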
View File
@@ -9,6 +9,7 @@ from torch.nn.functional import relu
from quapy.protocol import UPP
from quapy.method.aggregative import *
from quapy.util import EarlyStop
from tqdm import tqdm
class QuaNetTrainer(BaseQuantifier):
@@ -28,7 +29,7 @@ class QuaNetTrainer(BaseQuantifier):
    >>>
    >>> # load the kindle dataset as text, and convert words to numerical indexes
    >>> dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
    >>> qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
    >>>
    >>> # the text classifier is a CNN trained by NeuralClassifierTrainer
    >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
@@ -263,15 +264,19 @@ class QuaNetTrainer(BaseQuantifier):
                       f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')

    def get_params(self, deep=True):
        classifier_params = self.classifier.get_params()
        classifier_params = {'classifier__' + k: v for k, v in classifier_params.items()}
        return {**classifier_params, **self.quanet_params}

    def set_params(self, **parameters):
        learner_params = {}
        for key, val in parameters.items():
            if key in self.quanet_params:
                self.quanet_params[key] = val
            elif key.startswith('classifier__'):
                learner_params[key.replace('classifier__', '')] = val
            else:
                raise ValueError('unknown parameter ', key)
        self.classifier.set_params(**learner_params)
def __check_params_colision(self, quanet_params, learner_params):
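
# --- Hypothetical call illustrating the 'classifier__' convention enforced above:
# prefixed keys are stripped and forwarded to the wrapped learner, QuaNet keys update
# quanet_params, and anything else now raises instead of silently leaking to the learner.
# quanet below is an assumed, previously built QuaNetTrainer instance, and qdrop_p an
# assumed QuaNet hyperparameter key.
# quanet.set_params(qdrop_p=0.5,            # updates quanet_params
#                   classifier__lr=1e-3)    # forwarded to the classifier as lr=1e-3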
View File
@@ -56,7 +56,7 @@ class GridSearchQ(BaseQuantifier):

    def _sout(self, msg):
        if self.verbose:
            print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')
def __check_error(self, error):
if error in qp.error.QUANTIFICATION_ERROR:
View File
@@ -383,6 +383,9 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
    # x_error function) and 'y' is the estim-test shift (computed according to y_error)
    data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)

    if method_order is None:
        method_order = method_names

    if binning == 'isomerous':
        # take bins containing the same amount of examples
        tr_test_drifts = np.concatenate([data[m]['x'] for m in method_order])
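
# --- Sketch of isomerous (equal-count) binning, assuming only numpy: bin edges are the
# drift quantiles, so each bin holds roughly the same number of points, unlike isometric
# binning, which splits the drift range into equally wide intervals.
# import numpy as np
#
# drifts = np.random.rand(1000)
# edges = np.quantile(drifts, np.linspace(0, 1, 6))   # 5 equal-mass bins
# counts, _ = np.histogram(drifts, bins=edges)
# print(counts)                                       # ~200 points per bin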
View File
@@ -89,8 +89,6 @@ setup(
        'License :: OSI Approved :: BSD License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3 :: Only',
@@ -113,7 +111,7 @@ setup(
    #
    packages=find_packages(include=['quapy', 'quapy.*']),  # Required
    python_requires='>=3.8, <4',
    install_requires=['scikit-learn', 'pandas', 'tqdm', 'matplotlib', 'joblib', 'xlrd', 'abstention'],