Compare commits

...

10 Commits

9 changed files with 140 additions and 21 deletions

1
.gitignore vendored
View File

@@ -167,3 +167,4 @@ TweetSentQuant
*.png
.idea

View File

@@ -1,7 +1,10 @@
Change Log 0.1.9
----------------
- [TODO] add LeQua2024 and normalized match distance to qp.error
- [TODO] add Friedman's method and DeBias
- Added LeQua 2024 datasets and normalized match distance to qp.error
- Improved data loaders for UCI binary and UCI multiclass datasets (thanks to Lorenzo Volpi!); these datasets
can be loaded with standardised covariates (default)
- Added a default classifier for aggregative quantifiers, which can now be instantiated without specifying
the classifier (a short usage sketch follows below). The default classifier can be accessed in qp.environ['DEFAULT_CLS'] and is assigned to
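
A minimal usage sketch of the last two entries above (not taken from the diff; the LogisticRegression settings are just an example, and ACC is assumed to pick up the default classifier when none is given):

```python
import quapy as qp
from sklearn.linear_model import LogisticRegression

# the default classifier for aggregative quantifiers lives in qp.environ['DEFAULT_CLS']
# and can be overridden (example value; any sklearn-style classifier should work)
qp.environ['DEFAULT_CLS'] = LogisticRegression(max_iter=1000)

# UCI binary datasets now load with standardised covariates by default
dataset = qp.datasets.fetch_UCIBinaryDataset('yeast')
training, test = dataset.train_test

# instantiate ACC without specifying a classifier; the default one is used
model = qp.method.aggregative.ACC()
model.fit(training)
estim_prevalence = model.quantify(test.X)
```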

114
KDEy/experiments.py Normal file
View File

@@ -0,0 +1,114 @@
import os
from os.path import join

import numpy as np
from sklearn.linear_model import LogisticRegression

import quapy as qp
from quapy.protocol import UPP
from quapy.method.aggregative import KDEyML

# KDEy bandwidth study: for each UCI multiclass dataset, compare the bandwidth chosen
# by model selection on a validation split against the test MAE obtained with every
# bandwidth in the grid, and plot the result.

DEBUG = False

qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500
val_repeats = 100 if DEBUG else 500
test_repeats = 100 if DEBUG else 500
if DEBUG:
    qp.environ["DEFAULT_CLS"] = LogisticRegression()

test_results = {}
val_choice = {}

bandwidth_range = np.linspace(0.01, 0.20, 20)
if DEBUG:
    bandwidth_range = np.linspace(0.01, 0.20, 10)


def datasets():
    for dataset_name in qp.datasets.UCI_MULTICLASS_DATASETS:
        dataset = qp.datasets.fetch_UCIMulticlassDataset(dataset_name)
        if DEBUG:
            dataset = dataset.reduce(random_state=0)
        yield dataset


def experiment_dataset(dataset):
    train, test = dataset.train_test
    test_gen = UPP(test, repeats=test_repeats)

    # bandwidth chosen during model selection in validation
    train_tr, train_va = train.split_stratified(random_state=0)
    kdey = KDEyML(random_state=0)
    modsel = qp.model_selection.GridSearchQ(
        model=kdey,
        param_grid={'bandwidth': bandwidth_range},
        protocol=UPP(train_va, repeats=val_repeats),
        refit=False,
        n_jobs=-1,
        verbose=True
    ).fit(train_tr)
    chosen_bandwidth = modsel.best_params_['bandwidth']
    modsel_choice = float(chosen_bandwidth)

    # results in test
    print(f"testing KDEy in {dataset.name}")
    dataset_results = []
    for b in bandwidth_range:
        kdey = KDEyML(bandwidth=b, random_state=0)
        kdey.fit(train)
        mae = qp.evaluation.evaluate(kdey, protocol=test_gen, error_metric='mae', verbose=True)
        print(f'bandwidth={b}: {mae:.5f}')
        dataset_results.append((float(b), float(mae)))

    return modsel_choice, dataset_results


def plot_bandwidth(val_choice, test_results):
    import matplotlib.pyplot as plt  # lazy import: only needed for plotting

    for dataset_name in val_choice.keys():
        bandwidths, results = zip(*test_results[dataset_name])

        # create the figure
        plt.figure(figsize=(8, 6))
        # plot the (bandwidth, MAE) data points
        plt.plot(bandwidths, results, marker='o')
        # add a vertical line at the bandwidth chosen during model selection
        plt.axvline(x=val_choice[dataset_name], color='r', linestyle='--',
                    label=f'chosen bandwidth: {val_choice[dataset_name]}')
        # labels and title
        plt.xlabel('Bandwidth')
        plt.ylabel('MAE')
        plt.title('Bandwidth vs MAE')
        # legend and grid
        plt.legend()
        plt.grid(True)
        # plt.show()
        os.makedirs('./plots', exist_ok=True)
        plt.savefig(f'./plots/{dataset_name}.png')
        plt.close()


for dataset in datasets():
    if DEBUG:
        result_path = f'./results/debug/{dataset.name}.pkl'
    else:
        result_path = f'./results/{dataset.name}.pkl'

    modsel_choice, dataset_results = qp.util.pickled_resource(result_path, experiment_dataset, dataset)
    val_choice[dataset.name] = modsel_choice
    test_results[dataset.name] = dataset_results

    print(f'Dataset = {dataset.name}')
    print(modsel_choice)
    print(dataset_results)

plot_bandwidth(val_choice, test_results)

View File

@@ -45,19 +45,18 @@ of the test set.
```python
import quapy as qp
from sklearn.linear_model import LogisticRegression
dataset = qp.datasets.fetch_twitter('semeval16')
dataset = qp.datasets.fetch_UCIBinaryDataset("yeast")
training, test = dataset.train_test
# create an "Adjusted Classify & Count" quantifier
model = qp.method.aggregative.ACC(LogisticRegression())
model.fit(dataset.training)
model = qp.method.aggregative.ACC()
model.fit(training)
estim_prevalence = model.quantify(dataset.test.instances)
true_prevalence = dataset.test.prevalence()
estim_prevalence = model.quantify(test.X)
true_prevalence = test.prevalence()
error = qp.error.mae(true_prevalence, estim_prevalence)
print(f'Mean Absolute Error (MAE)={error:.3f}')
```

View File

@@ -0,0 +1,6 @@
- [TODO] add ensemble methods SC-MQ, MC-SQ, MC-MQ
- [TODO] add HistNetQ
- [TODO] add CDE-iteration and Bayes-CDE methods
- [TODO] add Friedman's method and DeBias
- [TODO] check the ignore-warnings handling (a minimal sketch follows after this list);
see https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings
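
A minimal sketch of the temporary-suppression approach described in the linked docs (noisy_call is a hypothetical placeholder for any code that emits warnings):

```python
import warnings

def noisy_call():
    # stand-in for any call that emits warnings we want to silence locally
    warnings.warn("something noisy")

# suppress warnings only inside this block, rather than monkey-patching warnings.warn globally
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    noisy_call()
```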

View File

@@ -33,9 +33,9 @@ quantifier = KDEyML(classifier=LogisticRegression())
# model selection
param_grid = {
'classifier__C': np.logspace(-3, 3, 7), # classifier-dependent: inverse of regularization strength
'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class
'bandwidth': np.linspace(0.01, 0.2, 20) # quantifier-dependent: bandwidth of the kernel
'classifier__C': np.logspace(-3, 3, 7), # classifier-dependent: inverse of regularization strength
'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class
'bandwidth': np.linspace(0.01, 0.2, 20) # quantifier-dependent: bandwidth of the kernel
}
model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
quantifier = model_selection.fit(training)

View File

@@ -502,7 +502,7 @@ class Dataset:
return len(self.vocabulary)
@property
def train_test(self):
def train_test(self) -> (LabelledCollection, LabelledCollection):
"""
Alias to `self.training` and `self.test`

View File

@@ -1,7 +1,3 @@
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
import os
from contextlib import contextmanager
import zipfile
@@ -10,6 +6,7 @@ import pandas as pd
from ucimlrepo import fetch_ucirepo
from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.preprocessing import standardize as standardizer
from quapy.data.reader import *
from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
from sklearn.preprocessing import StandardScaler
@@ -260,7 +257,7 @@ def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, standar
data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose)
dataset = Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)
if standardize:
dataset = qp.data.preprocessing.standardize(dataset)
dataset = standardizer(dataset)
return dataset
@@ -640,10 +637,10 @@ def fetch_UCIMulticlassDataset(
if n_train > max_train_instances:
train_prop = (max_train_instances / n)
data = Dataset(*data.split_stratified(train_prop, random_state=0))
data = Dataset(*data.split_stratified(train_prop, random_state=0), name=dataset_name)
if standardize:
data = qp.data.preprocessing.standardize(data)
data = standardizer(data)
return data
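
For reference, a sketch of how the standardize flag could be exercised, assuming the multiclass loader exposes the same keyword as the binary one above (the dataset identifier 'dry-bean' is only an example; any name in qp.datasets.UCI_MULTICLASS_DATASETS should work):

```python
import quapy as qp

# covariates standardised by default...
data_std = qp.datasets.fetch_UCIMulticlassDataset('dry-bean')

# ...or kept on their raw scale by passing standardize=False
data_raw = qp.datasets.fetch_UCIMulticlassDataset('dry-bean', standardize=False)

training, test = data_std.train_test
```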

View File

@@ -1,4 +1,3 @@
import itertools
import warnings
from collections import defaultdict
from typing import Literal, Union, Callable