Compare commits: 3f20aa06b1...faba2494b2
10 Commits
| Author | SHA1 | Date |
|---|---|---|
| | faba2494b2 | |
| | ede214aa54 | |
| | af2c4eaf01 | |
| | a6ff00f96b | |
| | 365a9e626c | |
| | 88541976e9 | |
| | e580e33b83 | |
| | 4474653a25 | |
| | 13beb45274 | |
| | 73d53820c2 | |
.gitignore:

```diff
@@ -167,3 +167,4 @@ TweetSentQuant
+*.png
 .idea
```
CHANGE_LOG.txt:

```diff
@@ -1,7 +1,10 @@
 Change Log 0.1.9
 ----------------
-- [TODO] add LeQua2024 and normalized match distance to qp.error
-- [TODO] add Friedman's method and DeBias
+- Added LeQua 2024 datasets and normalized match distance to qp.error
+
+- Improved data loaders for UCI binary and UCI multiclass datasets (thanks to Lorenzo Volpi!); these datasets
+  can be loaded with standardised covariates (default)
+
+- Added a default classifier for aggregative quantifiers, which now can be instantiated without specifying
+  the classifier. The default classifier can be accessed in qp.environ['DEFAULT_CLS'] and is assigned to
```
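For context, a short sketch of how these two changelog additions combine, using only the API surface visible elsewhere in this compare view (the 'yeast' dataset name is just an example):

```python
import quapy as qp

# aggregative quantifiers can now be built without passing a classifier;
# the instance stored in qp.environ['DEFAULT_CLS'] is used instead
print(qp.environ['DEFAULT_CLS'])
model = qp.method.aggregative.ACC()

# UCI loaders now standardize the covariates by default (standardize=True)
dataset = qp.datasets.fetch_UCIBinaryDataset('yeast', standardize=True)
training, test = dataset.train_test
model.fit(training)
```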
New file (@@ -0,0 +1,114 @@):

```python
import os

import numpy as np
from sklearn.linear_model import LogisticRegression
from os.path import join
import quapy as qp
from quapy.protocol import UPP
from quapy.method.aggregative import KDEyML

DEBUG = False

qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500
val_repeats = 100 if DEBUG else 500
test_repeats = 100 if DEBUG else 500
if DEBUG:
    qp.environ["DEFAULT_CLS"] = LogisticRegression()

test_results = {}
val_choice = {}

bandwidth_range = np.linspace(0.01, 0.20, 20)
if DEBUG:
    bandwidth_range = np.linspace(0.01, 0.20, 10)


def datasets():
    for dataset_name in qp.datasets.UCI_MULTICLASS_DATASETS:
        dataset = qp.datasets.fetch_UCIMulticlassDataset(dataset_name)
        if DEBUG:
            dataset = dataset.reduce(random_state=0)
        yield dataset


def experiment_dataset(dataset):
    train, test = dataset.train_test
    test_gen = UPP(test, repeats=test_repeats)

    # bandwidth chosen during model selection in validation
    train_tr, train_va = train.split_stratified(random_state=0)
    kdey = KDEyML(random_state=0)
    modsel = qp.model_selection.GridSearchQ(
        model=kdey,
        param_grid={'bandwidth': bandwidth_range},
        protocol=UPP(train_va, repeats=val_repeats),
        refit=False,
        n_jobs=-1,
        verbose=True
    ).fit(train_tr)
    chosen_bandwidth = modsel.best_params_['bandwidth']
    modsel_choice = float(chosen_bandwidth)

    # results in test
    print(f"testing KDEy in {dataset.name}")
    dataset_results = []
    for b in bandwidth_range:
        kdey = KDEyML(bandwidth=b, random_state=0)
        kdey.fit(train)

        mae = qp.evaluation.evaluate(kdey, protocol=test_gen, error_metric='mae', verbose=True)
        print(f'bandwidth={b}: {mae:.5f}')
        dataset_results.append((float(b), float(mae)))

    return modsel_choice, dataset_results


def plot_bandwidth(val_choice, test_results):
    for dataset_name in val_choice.keys():
        import matplotlib.pyplot as plt

        bandwidths, results = zip(*test_results[dataset_name])

        # create the figure
        plt.figure(figsize=(8, 6))

        # plot the data points
        plt.plot(bandwidths, results, marker='o')

        # add a vertical line at the bandwidth chosen during model selection
        plt.axvline(x=val_choice[dataset_name], color='r', linestyle='--', label=f'chosen bandwidth: {val_choice[dataset_name]}')

        # add axis labels and title
        plt.xlabel('Bandwidth')
        plt.ylabel('MAE')
        plt.title('Bandwidth vs MAE')

        # show the legend
        plt.legend()

        # save the plot instead of showing it
        plt.grid(True)
        # plt.show()
        os.makedirs('./plots', exist_ok=True)
        plt.savefig(f'./plots/{dataset_name}.png')


for dataset in datasets():
    if DEBUG:
        result_path = f'./results/debug/{dataset.name}.pkl'
    else:
        result_path = f'./results/{dataset.name}.pkl'

    modsel_choice, dataset_results = qp.util.pickled_resource(result_path, experiment_dataset, dataset)
    val_choice[dataset.name] = modsel_choice
    test_results[dataset.name] = dataset_results

    print(f'Dataset = {dataset.name}')
    print(modsel_choice)
    print(dataset_results)

plot_bandwidth(val_choice, test_results)
```
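The driver loop caches each dataset's results through `qp.util.pickled_resource`; a minimal sketch of what that helper presumably does (an assumption inferred from its usage here, not QuaPy's actual code):

```python
import os
import pickle

def pickled_resource(path, generation_fn, *args):
    """Load the pickle at `path` if it exists; otherwise call
    `generation_fn(*args)`, pickle the result to `path`, and return it."""
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    result = generation_fn(*args)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(result, f)
    return result
```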
README.md (13 changed lines):
````diff
@@ -45,19 +45,18 @@ of the test set.

 ```python
 import quapy as qp
 from sklearn.linear_model import LogisticRegression

-dataset = qp.datasets.fetch_twitter('semeval16')
+dataset = qp.datasets.fetch_UCIBinaryDataset("yeast")
+training, test = dataset.train_test

 # create an "Adjusted Classify & Count" quantifier
-model = qp.method.aggregative.ACC(LogisticRegression())
-model.fit(dataset.training)
+model = qp.method.aggregative.ACC()
+model.fit(training)

-estim_prevalence = model.quantify(dataset.test.instances)
-true_prevalence = dataset.test.prevalence()
+estim_prevalence = model.quantify(test.X)
+true_prevalence = test.prevalence()

 error = qp.error.mae(true_prevalence, estim_prevalence)

 print(f'Mean Absolute Error (MAE)={error:.3f}')
 ```
````
TODO.txt (6 changed lines):
```diff
@@ -0,0 +1,6 @@
+- [TODO] add ensemble methods SC-MQ, MC-SQ, MC-MQ
+- [TODO] add HistNetQ
+- [TODO] add CDE-iteration and Bayes-CDE methods
+- [TODO] add Friedman's method and DeBias
+- [TODO] check ignore warning stuff
+  check https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings
```
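The warnings item refers to Python's standard way of silencing warnings locally rather than globally; a minimal sketch of that pattern (`noisy_fetch` is a hypothetical stand-in for a loader that emits warnings):

```python
import warnings

def noisy_fetch():
    # hypothetical stand-in for a data loader that emits warnings
    warnings.warn("deprecated option", DeprecationWarning)
    return 42

# suppress warnings only inside this block, instead of globally
# monkey-patching warnings.warn as the old datasets.py did
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    data = noisy_fetch()
```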
```diff
@@ -33,9 +33,9 @@ quantifier = KDEyML(classifier=LogisticRegression())

 # model selection
 param_grid = {
-    'classifier__C': np.logspace(-3, 3, 7),  # classifier-dependent: inverse of regularization strength
-    'classifier__class_weight': ['balanced', None],  # classifier-dependent: weights of each class
-    'bandwidth': np.linspace(0.01, 0.2, 20)  # quantifier-dependent: bandwidth of the kernel
+    'classifier__C': np.logspace(-3, 3, 7),          # classifier-dependent: inverse of regularization strength
+    'classifier__class_weight': ['balanced', None],  # classifier-dependent: weights of each class
+    'bandwidth': np.linspace(0.01, 0.2, 20)          # quantifier-dependent: bandwidth of the kernel
 }
 model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
 quantifier = model_selection.fit(training)
```
quapy/data/base.py:

```diff
@@ -502,7 +502,7 @@ class Dataset:
         return len(self.vocabulary)

     @property
-    def train_test(self):
+    def train_test(self) -> (LabelledCollection, LabelledCollection):
         """
         Alias to `self.training` and `self.test`
```
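One remark on the new signature: a tuple literal such as `(LabelledCollection, LabelledCollection)` is accepted at runtime as an annotation but is not a valid type expression for static checkers; a typing-friendly alternative (a suggestion, not what this commit does) would be:

```python
from typing import Tuple

class Dataset:
    def __init__(self, training, test):
        self.training = training
        self.test = test

    @property
    def train_test(self) -> Tuple['LabelledCollection', 'LabelledCollection']:
        """Alias for (self.training, self.test)."""
        return self.training, self.test
```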
quapy/data/datasets.py:

```diff
@@ -1,7 +1,3 @@
-def warn(*args, **kwargs):
-    pass
-import warnings
-warnings.warn = warn
 import os
 from contextlib import contextmanager
 import zipfile
```

```diff
@@ -10,6 +6,7 @@ import pandas as pd
 from ucimlrepo import fetch_ucirepo
 from quapy.data.base import Dataset, LabelledCollection
 from quapy.data.preprocessing import text2tfidf, reduce_columns
+from quapy.data.preprocessing import standardize as standardizer
 from quapy.data.reader import *
 from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
 from sklearn.preprocessing import StandardScaler
```
```diff
@@ -260,7 +257,7 @@ def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, standar
     data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose)
     dataset = Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)
     if standardize:
-        dataset = qp.data.preprocessing.standardize(dataset)
+        dataset = standardizer(dataset)
     return dataset

```
```diff
@@ -640,10 +637,10 @@ def fetch_UCIMulticlassDataset(
     if n_train > max_train_instances:
         train_prop = (max_train_instances / n)

-    data = Dataset(*data.split_stratified(train_prop, random_state=0))
+    data = Dataset(*data.split_stratified(train_prop, random_state=0), name=dataset_name)

     if standardize:
-        data = qp.data.preprocessing.standardize(data)
+        data = standardizer(data)

     return data
```
```diff
@@ -1,4 +1,3 @@
 import itertools
-import warnings
 from collections import defaultdict
 from typing import Literal, Union, Callable
```