Compare commits

...

25 Commits

Author SHA1 Message Date
Alejandro Moreo Fernandez bb0950fad5 code used to generate plots 2023-11-13 12:07:59 +01:00
Alejandro Moreo Fernandez 2e992a0b9a choosing plots for paper 2023-11-10 14:22:43 +01:00
Alejandro Moreo Fernandez 29db15ae25 added DMx and DMy, with a classmethod that returns HDx and HDy respectively 2023-11-09 18:13:54 +01:00
Alejandro Moreo Fernandez daca2bd1cb added MedianEstimator quantifier 2023-11-09 14:20:41 +01:00
Alejandro Moreo Fernandez 66ad7295df fix in DistributionMatchingX 2023-11-08 18:11:45 +01:00
Alejandro Moreo Fernandez c3cf0e2d49 adding DistributionMatchingX, the covariate-specific equivalent counterpart of DistributionMatching 2023-11-08 16:13:48 +01:00
Alejandro Moreo Fernandez 76cf784844 added HDx and an example comparing HDy vs HDx 2023-11-08 15:34:17 +01:00
Alejandro Moreo Fernandez 8a6579428b implementing the 'total' function of IFCB protocols 2023-11-08 11:31:33 +01:00
Alejandro Moreo Fernandez f18bce5f80 added dataset IFCB plankton 2023-11-08 11:07:47 +01:00
Alejandro Moreo Fernandez cc5ab8ad70 Merge branch 'lorenzovolpi-cv_len_fix' into devel 2023-11-08 10:00:44 +01:00
Alejandro Moreo Fernandez 3d4ffcea62 merging cross-val fix 2023-11-08 10:00:25 +01:00
Lorenzo Volpi 5c7fbb2554 cross_val_predict fix added 2023-11-06 02:00:06 +01:00
Lorenzo Volpi 13fe531e12 fix added for cross_val_predict 2023-11-06 01:58:36 +01:00
Lorenzo Volpi 51c3d54aa5 fix added for len of a LabelledCollection 2023-11-06 01:53:52 +01:00
Alejandro Moreo Fernandez 34c60e0870 Merge branch 'AICGijon-uci_multiclass' 2023-10-18 17:51:37 +02:00
Alejandro Moreo Fernandez ea71559722 revised 2023-10-18 17:50:46 +02:00
pglez82 ffab2131a8 fixing requests 2023-10-18 14:12:40 +02:00
pglez82 a9f10f77f4 fixing mistakes 2023-10-17 18:44:28 +02:00
pglez82 239549eb4d fixing mistakes 2023-10-17 18:44:02 +02:00
pglez82 72fd21471d fixing mistakes 2023-10-17 18:43:33 +02:00
pglez82 d7192430e4 uci multiclass datasets 2023-10-17 18:24:33 +02:00
Alejandro Moreo Fernandez 5b90656bd1 Update README.md 2023-06-25 13:31:50 +02:00
Alejandro Moreo Fernandez fd51cd14be Update README.md 2023-06-25 13:31:33 +02:00
Alejandro Moreo Fernandez 94ca8dec81 Add files via upload 2023-06-25 13:29:38 +02:00
Alejandro Moreo Fernandez ab070b5cc3 Update README.md 2023-06-25 13:28:48 +02:00
23 changed files with 1080 additions and 91 deletions

View File

@ -0,0 +1,73 @@
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
palette = itertools.cycle(sns.color_palette())
def setframe():
fig.spines['top'].set_visible(False)
fig.spines['left'].set_visible(False)
fig.get_yaxis().set_ticks([])
fig.spines['right'].set_visible(False)
# fig.axis('off')
nbins = 50
figsize = (5, 2)
ymax = 0.2
negatives = np.random.normal(loc = 0.3, scale=0.2, size=20000)
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=negatives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
plt.title('Negative distribution')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
# fig.get_figure().savefig('plots_cacm/negatives.pdf')
# plt.clf()
# -------------------------------------------------------------
positives1 = np.random.normal(loc = 0.75, scale=0.06, size=20000)
positives2 = np.random.normal(loc = 0.65, scale=0.1, size=1)
positives = np.concatenate([positives1, positives2])
np.random.shuffle(positives)
positives = np.asarray([x for x in positives if 0 <= x <= 1])
# plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=positives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
plt.title('')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
fig.get_figure().savefig('plots_cacm/training.pdf')
# -------------------------------------------------------------
prev = 0.2
test = np.concatenate([
negatives[:int(len(negatives)*(1-prev))],
positives[:int(len(positives)*(prev))],
])
plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=test, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
plt.title('')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
fig.get_figure().savefig('plots_cacm/test.pdf')

View File

@ -0,0 +1,86 @@
from copy import deepcopy
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from method.non_aggregative import DMx
from protocol import APP
from quapy.method.aggregative import CC, ACC, DMy
from sklearn.svm import LinearSVC
qp.environ['SAMPLE_SIZE'] = 100
DATASETS = qp.datasets.UCI_DATASETS[10:]
def fit_eval_task(args):
model_name, model, train, test = args
with qp.util.temp_seed(0):
model = deepcopy(model)
model.fit(train)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
return model_name, true_prev, estim_prev
def gen_data():
def base_classifier():
return LogisticRegression()
#return LinearSVC(class_weight='balanced')
def models():
yield 'CC', CC(base_classifier())
yield 'ACC', ACC(base_classifier())
yield 'HDy', DMy(base_classifier(), val_split=10, nbins=10, n_jobs=-1)
yield 'HDx', DMx(nbins=10, n_jobs=-1)
# train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10).train_test
method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
for dataset_name in DATASETS:
train, test = qp.datasets.fetch_UCIDataset(dataset_name).train_test
print(dataset_name, train.X.shape)
outs = qp.util.parallel(
fit_eval_task,
((method_name, model, train, test) for method_name, model in models()),
seed=0,
n_jobs=-1
)
for method_name, true_prev, estim_prev in outs:
method_names.append(method_name)
true_prevs.append(true_prev)
estim_prevs.append(estim_prev)
tr_prevs.append(train.prevalence())
return method_names, true_prevs, estim_prevs, tr_prevs
method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('../quick_experiment/pickled_plot_data.pkl', gen_data)
def remove_dataset(dataset_order, num_methods=4):
sel_names, sel_true, sel_estim, sel_tr = [],[],[],[]
for i, (name, true, estim, tr) in enumerate(zip(method_names, true_prevs, estim_prevs, tr_prevs)):
dataset_pos = i//num_methods
if dataset_pos not in dataset_order:
sel_names.append(name)
sel_true.append(true)
sel_estim.append(estim)
sel_tr.append(tr)
return np.asarray(sel_names), np.asarray(sel_true), np.asarray(sel_estim), np.asarray(sel_tr)
print(DATASETS)
selected = 10
for i in [selected]:
print(i, DATASETS[i])
all_ = set(range(len(DATASETS)))
remove_index = sorted(all_ - {i})
sel_names, sel_true, sel_estim, sel_tr = remove_dataset(dataset_order=remove_index, num_methods=4)
p=sel_tr[0][1]
sel_names = ['CC$_{'+str(p)+'}$' if x=='CC' else x for x in sel_names]
# qp.plot.binary_diagonal(sel_names, sel_true, sel_estim, train_prev=sel_tr[0], show_std=False, savepath=f'./plots/bin_diag_{i}.png')
qp.plot.error_by_drift(sel_names, sel_true, sel_estim, sel_tr, n_bins=10, savepath=f'./plots/err_drift_{i}.png', show_std=True, show_density=False, title="")
# qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
# qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=3, savepath='./plots/bin_bias_bin.png')

View File

@ -0,0 +1,62 @@
import math
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
import numpy as np
from data import LabelledCollection
scale = 100
import quapy as qp
negatives = np.random.normal(loc = 0.2, scale=0.2, size=20000)
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
positives = np.random.normal(loc = 0.75, scale=0.05, size=20000)
positives = np.asarray([x for x in positives if 0 <= x <= 1])
prev = 0.1
test = np.concatenate([
negatives[:int(len(negatives)*(1-prev))],
positives[:int(len(positives)*(prev))],
])
nbins = 30
plt.rcParams.update({'font.size': 7})
fig = plt.figure()
positions = np.asarray([2,1,0])
colors = ['r', 'g', 'b']
ax = fig.add_subplot(111, projection='3d')
ax.set_box_aspect((3, 1, 0.8))
for post, c, z in zip([test, positives, negatives], colors, positions):
hist, bins = np.histogram(post, bins=np.linspace(0,1, nbins+1), density=True)
xs = (bins[:-1] + bins[1:])/2
ax.bar(xs, hist, width=1 / nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
ax.yaxis.set_ticks(positions)
ax.yaxis.set_ticklabels([' '*20+'Test distribution', ' '*20+'Positive distribution', ' '*20+'Negative distribution'])
# ax.xaxis.set_ticks([])
# ax.xaxis.set_ticklabels([], minor=True)
ax.zaxis.set_ticks([])
ax.zaxis.set_ticklabels([], minor=True)
#plt.figure(figsize=(10,6))
#plt.show()
plt.savefig('./histograms3d_CACM2023.pdf')

View File

@ -0,0 +1,59 @@
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
import quapy as qp
from data import LabelledCollection
from method.non_aggregative import DMx
from protocol import APP
from quapy.method.aggregative import CC, DMy, ACC
from sklearn.svm import LinearSVC
import numpy as np
from tqdm import tqdm
qp.environ['SAMPLE_SIZE'] = 500
def cls():
return LogisticRegressionCV(n_jobs=-1,Cs=10)
def gen_methods():
yield CC(cls()), 'CC$_{10' + '\%}$'
yield ACC(cls()), 'ACC'
yield DMy(cls(), val_split=10, nbins=10, n_jobs=-1), 'HDy'
yield DMx(nbins=10, n_jobs=-1), 'HDx'
def gen_data():
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
method_data = []
training_prevalence = 0.1
training_size = 5000
# because the problem is binary, it suffices to specify the negative prevalence; the positive one is constrained
train_sample = train.sampling(training_size, 1-training_prevalence, random_state=0)
for model, method_name in tqdm(gen_methods(), total=4):
with qp.util.temp_seed(1):
if method_name == 'HDx':
X, y = train_sample.Xy
svd = TruncatedSVD(n_components=5, random_state=0)
Xred = svd.fit_transform(X)
train_sample_dense = LabelledCollection(Xred, y)
X, y = test.Xy
test_dense = LabelledCollection(svd.transform(X), y)
model.fit(train_sample_dense)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test_dense, repeats=100, random_state=0))
else:
model.fit(train_sample)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
return zip(*method_data)
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_4methods.pdf')
qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_4methods.pdf', title='', show_density=False, show_std=True)

View File

@ -0,0 +1,40 @@
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
import quapy as qp
from protocol import APP
from quapy.method.aggregative import CC
from sklearn.svm import LinearSVC
import numpy as np
from tqdm import tqdm
qp.environ['SAMPLE_SIZE'] = 500
def gen_data():
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
method_data = []
for training_prevalence in tqdm(np.linspace(0.1, 0.9, 9), total=9):
training_size = 5000
# because the problem is binary, it suffices to specify the negative prevalence; the positive one is constrained
train_sample = train.sampling(training_size, 1-training_prevalence)
# cls = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-2,2,5), 'class_weight':[None, 'balanced']}, n_jobs=-1)
# cls = GridSearchCV(LogisticRegression(), param_grid={'C': np.logspace(-2, 2, 5), 'class_weight': [None, 'balanced']}, n_jobs=-1)
# cls.fit(*train_sample.Xy)
model = CC(LogisticRegressionCV(n_jobs=-1,Cs=10))
model.fit(train_sample)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
method_name = 'CC$_{'+f'{int(100*training_prevalence)}' + '\%}$'
method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
return zip(*method_data)
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_cc.pdf')
# qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_cc.pdf', title='', show_density=False)

View File

@ -111,3 +111,7 @@ are provided:
* [SVMperf](https://github.com/HLT-ISTI/QuaPy/wiki/ExplicitLossMinimization)
* [Model Selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection)
* [Plotting](https://github.com/HLT-ISTI/QuaPy/wiki/Plotting)
## Acknowledgments:
<img src="SoBigData.png" alt="SoBigData++" width="250"/>

BIN
SoBigData.png Normal file (new binary file, 128 KiB; not shown)

View File

@ -0,0 +1,74 @@
from sklearn.linear_model import LogisticRegression
from time import time
import pandas as pd
from tqdm import tqdm
import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import HDy
from quapy.method.non_aggregative import DMx
"""
This example is meant to experimentally compare HDy and HDx.
The implementations of these methods adhere to their original design; in particular, this means that
the number of bins is not a hyperparameter, but something that the method explores internally (returning the
median of the estimates as the final prevalence prediction), and that the prevalence is not sought through any
numerical optimization procedure, but through a simple linear search between 0 and 1, stepping by 0.01.
See `González-Castro, Alaiz-Rodríguez, Alegre (2013) <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ for further details.
"""
qp.environ['SAMPLE_SIZE']=100
df = pd.DataFrame(columns=['method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time'])
for dataset_name in tqdm(qp.datasets.UCI_DATASETS, total=len(qp.datasets.UCI_DATASETS)):
if dataset_name in ['acute.a', 'acute.b', 'balance.2', 'iris.1']: continue
collection = qp.datasets.fetch_UCILabelledCollection(dataset_name, verbose=False)
train, test = collection.split_stratified()
# HDy............................................
tinit = time()
hdy = HDy(LogisticRegression()).fit(train)
t_hdy_train = time()-tinit
tinit = time()
hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean()
t_hdy_test = time() - tinit
df.loc[len(df)] = ['HDy', dataset_name, hdy_report['mae'], hdy_report['mrae'], t_hdy_train, t_hdy_test]
# HDx............................................
tinit = time()
hdx = DMx.HDx(n_jobs=-1).fit(train)
t_hdx_train = time() - tinit
tinit = time()
hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean()
t_hdx_test = time() - tinit
df.loc[len(df)] = ['HDx', dataset_name, hdx_report['mae'], hdx_report['mrae'], t_hdx_train, t_hdx_test]
# evaluation reports
print('\n'*3)
print('='*80)
print('Comparison in terms of performance')
print('='*80)
pv = df.pivot_table(index='dataset', columns='method', values=['MAE', 'MRAE'])
print(pv)
print('\nAveraged values:')
print(pv.mean())
print('\n'*3)
print('='*80)
print('Comparison in terms of efficiency')
print('='*80)
pv = df.pivot_table(index='dataset', columns='method', values=['tr-time', 'te-time'])
print(pv)
print('\nAveraged values:')
print(pv.mean())

View File

@ -0,0 +1,28 @@
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.evaluation import evaluation_report
def newLR():
return LogisticRegression(n_jobs=-1)
quantifiers = [
('CC', qp.method.aggregative.CC(newLR())),
('ACC', qp.method.aggregative.ACC(newLR())),
('PCC', qp.method.aggregative.PCC(newLR())),
('PACC', qp.method.aggregative.PACC(newLR())),
('HDy', qp.method.aggregative.DMy(newLR())),
('EMQ', qp.method.aggregative.EMQ(newLR()))
]
for quant_name, quantifier in quantifiers:
print("Experiment with "+quant_name)
train, test_gen = qp.datasets.fetch_IFCB()
quantifier.fit(train)
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
print(report.mean())

View File

@ -1,6 +1,6 @@
import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import DistributionMatching
from quapy.method.aggregative import DMy
from sklearn.linear_model import LogisticRegression
import numpy as np
@ -8,7 +8,7 @@ import numpy as np
In this example, we show how to perform model selection on a DistributionMatching quantifier.
"""
model = DistributionMatching(LogisticRegression())
model = DMy(LogisticRegression())
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1

View File

@ -1,13 +1,18 @@
Change Log 0.1.8
----------------
- Added HDx and DistributionMatchingX to non-aggregative quantifiers (see also the new example "comparing_HDy_HDx.py")
- New UCI multiclass datasets added (thanks to Pablo González). The 5 UCI multiclass datasets are those matching
the following criteria:
- >1000 instances
- >2 classes
- classification datasets
- Python API available
- New IFCB (plankton) dataset added. See fetch_IFCB.
- Added new evaluation measures NAE, NRAE
- Added new meta method "MedianEstimator"; an ensemble of binary base quantifiers that receives as input a dictionary
of hyperparameters that it explores exhaustively, fitting and generating predictions for each combination of
hyperparameters, and that returns, as the prevalence estimate, the median across all predictions.
Change Log 0.1.7
----------------
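As a quick illustration of the new MedianEstimator described in the 0.1.8 entries above, the following minimal sketch (the dataset and the hyperparameter grid are merely illustrative) fits one DMy quantifier per value of nbins and returns the median of their prevalence estimates:

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy
from quapy.method.meta import MedianEstimator

train, test = qp.datasets.fetch_UCIDataset('ionosphere').train_test
base = DMy(LogisticRegression())
# one DMy is fitted per value of nbins; the estimates are aggregated via the median
median_q = MedianEstimator(base, param_grid={'nbins': list(range(2, 11))}, random_state=0, n_jobs=-1)
median_q.fit(train)
print(median_q.quantify(test.X))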

View File

@ -11,7 +11,7 @@ from . import util
from . import model_selection
from . import classification
__version__ = '0.1.7'
__version__ = '0.1.8'
environ = {
'SAMPLE_SIZE': None,

quapy/data/_ifcb.py Normal file (51 lines)
View File

@ -0,0 +1,51 @@
import os
import pandas as pd
from quapy.protocol import AbstractProtocol
class IFCBTrainSamplesFromDir(AbstractProtocol):
def __init__(self, path_dir:str, classes: list):
self.path_dir = path_dir
self.classes = classes
self.samples = []
for filename in os.listdir(path_dir):
if filename.endswith('.csv'):
self.samples.append(filename)
def __call__(self):
for sample in self.samples:
s = pd.read_csv(os.path.join(self.path_dir,sample))
# the first column contains the class label; the remaining columns are the features
X = s.iloc[:, 1:].to_numpy()
y = s.iloc[:, 0].to_numpy()
yield X, y
def total(self):
"""
Returns the total number of samples that the protocol generates.
:return: The number of training samples to generate.
"""
return len(self.samples)
class IFCBTestSamples(AbstractProtocol):
def __init__(self, path_dir:str, test_prevalences_path: str):
self.path_dir = path_dir
self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path))
def __call__(self):
for _, test_sample in self.test_prevalences.iterrows():
#Load the sample from disk
X = pd.read_csv(os.path.join(self.path_dir,test_sample['sample']+'.csv')).to_numpy()
prevalences = test_sample.iloc[1:].to_numpy().astype(float)
yield X, prevalences
def total(self):
"""
Returns the total number of samples that the protocol generates.
:return: The number of test samples to generate.
"""
return len(self.test_prevalences.index)
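Both protocols are consumed by iterating over the generator they return; a minimal sketch follows (the paths and class names are placeholders; fetch_IFCB, shown later in this diff, sets them up automatically):

from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples

train_gen = IFCBTrainSamplesFromDir(path_dir='IFCB/train', classes=['classA', 'classB'])
print(train_gen.total(), 'training samples')
for X, y in train_gen():  # each training sample is labelled example by example
    print(X.shape, y.shape)

test_gen = IFCBTestSamples(path_dir='IFCB/test', test_prevalences_path='../test_prevalences.csv')
for X, prevalences in test_gen():  # each test sample is labelled by a prevalence vector
    print(X.shape, prevalences)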

View File

@ -6,8 +6,7 @@ import os
import zipfile
from os.path import join
import pandas as pd
import scipy
import quapy
from ucimlrepo import fetch_ucirepo
from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
@ -45,6 +44,12 @@ UCI_DATASETS = ['acute.a', 'acute.b',
'wine-q-red', 'wine-q-white',
'yeast']
UCI_MULTICLASS_DATASETS = ['dry-bean',
'wine-quality',
'academic-success',
'digits',
'letter']
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
_TXA_SAMPLE_SIZE = 250
@ -364,7 +369,8 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
elif verbose:
print('no file description available')
print(f'Loading {dataset_name} ({fullname})')
if verbose:
print(f'Loading {dataset_name} ({fullname})')
if identifier == 'acute':
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
@ -545,7 +551,111 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
y = binarize(y, pos_class='NUC')
data = LabelledCollection(X, y)
data.stats()
if verbose:
data.stats()
return data
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
"""
Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)
>>> import quapy as qp
>>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
>>> train, test = dataset.train_test
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quapy_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest makes up the training set
:param verbose: set to True (default is False) to get information (stats) about the dataset
:return: a :class:`quapy.data.base.Dataset` instance
"""
data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
"""
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)
>>> import quapy as qp
>>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
>>> X, y = collection.Xy
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
~/quapy_data/ directory)
:param verbose: set to True (default is False) to get information (stats) about the dataset
:return: a :class:`quapy.data.base.LabelledCollection` instance
"""
assert dataset_name in UCI_MULTICLASS_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the ' \
f'UCI Machine Learning datasets repository (multiclass). ' \
f'Valid ones are {UCI_MULTICLASS_DATASETS}'
if data_home is None:
data_home = get_quapy_home()
identifiers = {
"dry-bean": 602,
"wine-quality": 186,
"academic-success": 697,
"digits": 80,
"letter": 59
}
full_names = {
"dry-bean": "Dry Bean Dataset",
"wine-quality": "Wine Quality",
"academic-success": "Predict students' dropout and academic success",
"digits": "Optical Recognition of Handwritten Digits",
"letter": "Letter Recognition"
}
identifier = identifiers[dataset_name]
fullname = full_names[dataset_name]
if verbose:
print(f'Loading UCI Multiclass {dataset_name} ({fullname})')
file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
def download(id):
data = fetch_ucirepo(id=id)
X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
classes = np.sort(np.unique(y))
y = np.searchsorted(classes, y)
return LabelledCollection(X, y)
data = pickled_resource(file, download, identifier)
if verbose:
data.stats()
return data
@ -624,12 +734,38 @@ def fetch_lequa2022(task, data_home=None):
return train, val_gen, test_gen
def fetch_IFCB(data_home=None):
def fetch_IFCB(single_sample_train=True, data_home=None):
"""
Loads the `IFCB dataset <https://zenodo.org/records/10036244>`_ for quantification. For more
information on this dataset, check the Zenodo site.
This dataset is based on the data publicly available at <https://github.com/hsosik/WHOI-Plankton>.
The scripts used for the processing are available at <https://github.com/pglez82/IFCB_Zenodo>.
In essence, this is the IFCB dataset with precomputed features, for testing quantification algorithms.
The datasets are downloaded only once, and stored for fast reuse.
:param single_sample_train: boolean. If True (default), it returns the train dataset as an instance of
:class:`quapy.data.base.LabelledCollection` (all examples together).
If False, a generator of training samples will be returned.
Each example in the training set has an individual class label.
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quapy_data/ directory)
:return: a tuple `(train, test_gen)` where `train` is an instance of
:class:`quapy.data.base.LabelledCollection` if `single_sample_train` is True, or of
:class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise (i.e., a sampling protocol that
returns a series of samples labelled example by example);
`test_gen` is an instance of :class:`quapy.data._ifcb.IFCBTestSamples`,
i.e., a sampling protocol that returns a series of samples labelled by prevalence.
"""
from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples
if data_home is None:
data_home = get_quapy_home()
URL_TRAINDEV=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
URL_TRAIN=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip'
URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
@ -637,13 +773,43 @@ def fetch_IFCB(data_home=None):
os.makedirs(ifcb_dir, exist_ok=True)
def download_unzip_and_remove(unzipped_path, url):
tmp_path = join(ifcb_dir, 'tmp.zip')
tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
download_file_if_not_exists(url, tmp_path)
with zipfile.ZipFile(tmp_path) as file:
file.extractall(unzipped_path)
os.remove(tmp_path)
if not os.path.exists(join(ifcb_dir, task)):
download_unzip_and_remove(ifcb_dir, URL_TRAINDEV)
if not os.path.exists(os.path.join(ifcb_dir,'train')):
download_unzip_and_remove(ifcb_dir, URL_TRAIN)
if not os.path.exists(os.path.join(ifcb_dir,'test')):
download_unzip_and_remove(ifcb_dir, URL_TEST)
download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
if not os.path.exists(os.path.join(ifcb_dir,'test_prevalences.csv')):
download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
# Load test prevalences and classes
test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
test_true_prev = pd.read_csv(test_true_prev_path)
classes = test_true_prev.columns[1:]
#Load train samples
train_samples_path = join(ifcb_dir,'train')
train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
#Load test samples
test_samples_path = join(ifcb_dir,'test')
test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path)
# if the user requests it, join all the training samples into a single LabelledCollection
if single_sample_train:
X = []
y = []
for X_, y_ in train_gen():
X.append(X_)
y.append(y_)
X = np.vstack(X)
y = np.concatenate(y)
train = LabelledCollection(X,y, classes=classes)
return train, test_gen
else:
return train_gen, test_gen
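A short sketch of the two loading modes of fetch_IFCB (the dataset is downloaded from Zenodo on the first call):

import quapy as qp

# default: all training examples are joined into a single LabelledCollection
train, test_gen = qp.datasets.fetch_IFCB()
train.stats()

# alternative: a protocol that yields the training samples one by one
train_gen, test_gen = qp.datasets.fetch_IFCB(single_sample_train=False)
print(train_gen.total(), 'training samples')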

View File

@ -1,5 +1,7 @@
import itertools
from collections import defaultdict
from typing import Union, Callable
import scipy
import numpy as np
@ -64,7 +66,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
return prevalences
def HellingerDistance(P, Q):
def HellingerDistance(P, Q) -> float:
"""
Computes the Hellinger Distance (HD) between (discretized) distributions `P` and `Q`.
The HD for two discrete distributions of `k` bins is defined as:
@ -276,3 +278,70 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
return False
return True
def get_divergence(divergence: Union[str, Callable]):
if isinstance(divergence, str):
if divergence=='HD':
return HellingerDistance
elif divergence=='topsoe':
return TopsoeDistance
else:
raise ValueError(f'unknown divergence {divergence}')
elif callable(divergence):
return divergence
else:
raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
def argmin_prevalence(loss, n_classes, method='optim_minimize'):
if method == 'optim_minimize':
return optim_minimize(loss, n_classes)
elif method == 'linear_search':
return linear_search(loss, n_classes)
elif method == 'ternary_search':
raise NotImplementedError()
else:
raise NotImplementedError()
def optim_minimize(loss, n_classes):
"""
Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
SLSQP routine.
:param loss: (callable) the function to minimize
:param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
:return: (ndarray) the best prevalence vector found
"""
from scipy import optimize
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
def linear_search(loss, n_classes):
"""
Performs a linear search for the best prevalence value in binary problems. The search is carried out by exploring
the range [0,1] stepping by 0.01. This search is inefficient, and is added only for completeness (some of the
early methods in the quantification literature used it, e.g., HDy). A more powerful alternative is `optim_minimize`.
:param loss: (callable) the function to minimize
:param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
:return: (ndarray) the best prevalence vector found
"""
assert n_classes==2, 'linear search is only available for binary problems'
prev_selected, min_score = None, None
for prev in prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
score = loss(np.asarray([1 - prev, prev]))
if min_score is None or score < min_score:
prev_selected, min_score = prev, score
return np.asarray([1 - prev_selected, prev_selected])
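As a toy illustration of the two search strategies added above (the target prevalence vector below is made up; any loss minimized at the true prevalence would do):

import numpy as np
from quapy.functional import optim_minimize, linear_search

target = np.asarray([0.2, 0.8])
loss = lambda prev: np.linalg.norm(prev - target)  # minimized exactly at prev == target

print(optim_minimize(loss, n_classes=2))  # ~[0.2, 0.8], found by SLSQP on the simplex
print(linear_search(loss, n_classes=2))   # ~[0.2, 0.8], found by a 0.01-step scan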

View File

@ -1,7 +1,7 @@
from . import aggregative
from . import base
from . import meta
from . import aggregative
from . import non_aggregative
from . import meta
AGGREGATIVE_METHODS = {
aggregative.CC,

View File

@ -9,6 +9,7 @@ from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
import quapy as qp
import quapy.functional as F
from functional import get_divergence
from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
@ -530,7 +531,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
"""
`Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of
minimizing the divergence (in terms of the Hellinger Distance) between two cumulative distributions of posterior
minimizing the divergence (in terms of the Hellinger Distance) between two distributions of posterior
probabilities returned by the classifier. One of the distributions is generated from the unlabelled examples and
the other is generated from a validation set. This latter distribution is defined as a mixture of the
class-conditional distributions of the posterior probabilities returned for the positive and negative validation
@ -567,10 +568,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
# pre-compute the histogram for positive and negative examples
self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110]
self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
self.bins}
self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in
self.bins}
def hist(P, bins):
h = np.histogram(P, bins=bins, range=(0, 1), density=True)[0]
return h / h.sum()
self.Pxy1_density = {bins: hist(self.Pxy1, bins) for bins in self.bins}
self.Pxy0_density = {bins: hist(self.Pxy0, bins) for bins in self.bins}
return self
def aggregate(self, classif_posteriors):
@ -590,6 +592,9 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
# the authors proposed to search for the prevalence yielding the best matching as a linear search
# at small steps (modern implementations resort to an optimization procedure,
# see class DMy)
prev_selected, min_dist = None, None
for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density
@ -602,20 +607,6 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return np.asarray([1 - class1_prev, class1_prev])
def _get_divergence(divergence: Union[str, Callable]):
if isinstance(divergence, str):
if divergence=='HD':
return F.HellingerDistance
elif divergence=='topsoe':
return F.TopsoeDistance
else:
raise ValueError(f'unknown divergence {divergence}')
elif callable(divergence):
return divergence
else:
raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
"""
`DyS framework <https://ojs.aaai.org/index.php/AAAI/article/view/4376>`_ (DyS).
@ -673,7 +664,7 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
divergence = _get_divergence(self.divergence)
divergence = get_divergence(self.divergence)
def distribution_distance(prev):
Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
@ -722,10 +713,11 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return np.asarray([1 - class1_prev, class1_prev])
class DistributionMatching(AggregativeProbabilisticQuantifier):
class DMy(AggregativeProbabilisticQuantifier):
"""
Generic Distribution Matching quantifier for binary or multiclass quantification.
This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior
probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF
as hyperparameters.
:param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the
@ -738,18 +730,28 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
:param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
Distance)
:param cdf: whether or not to use CDF instead of PDF (default False)
:param cdf: whether to use CDF instead of PDF (default False)
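:param search: the strategy used for searching the prevalence that minimizes the divergence; set to
'optim_minimize' (default) or 'linear_search'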
:param n_jobs: number of parallel workers (default None)
"""
def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD',
cdf=False, search='optim_minimize', n_jobs=None):
self.classifier = classifier
self.val_split = val_split
self.nbins = nbins
self.divergence = divergence
self.cdf = cdf
self.search = search
self.n_jobs = n_jobs
@classmethod
def HDy(cls, classifier, val_split=0.4, n_jobs=None):
from quapy.method.meta import MedianEstimator
hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD')
hdy = MedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs)
return hdy
def __get_distributions(self, posteriors):
histograms = []
post_dims = posteriors.shape[1]
@ -770,8 +772,8 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
"""
Trains the classifier (if requested) and generates the validation distributions out of the training data.
The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; then `di=V[i]`
are the distributions obtained from training data labelled with class `i`; while `dij = di[j]` is the discrete
distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
is the fraction of instances with a value in the `k`-th bin.
@ -803,26 +805,20 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
`n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
independently. The matching is computed as an average of the divergence across all channels.
:param instances: instances in the sample
:param posteriors: posterior probabilities of the instances in the sample
:return: a vector of class prevalence estimates
"""
test_distribution = self.__get_distributions(posteriors)
divergence = _get_divergence(self.divergence)
divergence = get_divergence(self.divergence)
n_classes, n_channels, nbins = self.validation_distribution.shape
def match(prev):
def loss(prev):
prev = np.expand_dims(prev, axis=0)
mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1)
divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
return np.mean(divs)
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
return F.argmin_prevalence(loss, n_classes, method=self.search)
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
def newELM(svmperf_base=None, loss='01', C=1):
@ -1224,17 +1220,6 @@ class MS2(MS):
return np.median(tprs), np.median(fprs)
ClassifyAndCount = CC
AdjustedClassifyAndCount = ACC
ProbabilisticClassifyAndCount = PCC
ProbabilisticAdjustedClassifyAndCount = PACC
ExpectationMaximizationQuantifier = EMQ
SLD = EMQ
HellingerDistanceY = HDy
MedianSweep = MS
MedianSweep2 = MS2
class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
"""
Allows any binary quantifier to perform quantification on single-label datasets.
@ -1292,3 +1277,18 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
# the estimation for the positive class prevalence
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
#---------------------------------------------------------------
# aliases
#---------------------------------------------------------------
ClassifyAndCount = CC
AdjustedClassifyAndCount = ACC
ProbabilisticClassifyAndCount = PCC
ProbabilisticAdjustedClassifyAndCount = PACC
ExpectationMaximizationQuantifier = EMQ
DistributionMatchingY = DMy
SLD = EMQ
HellingerDistanceY = HDy
MedianSweep = MS
MedianSweep2 = MS2
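To make the renaming concrete, a minimal sketch of the new DMy class and its HDy classmethod (which reproduces the original HDy design via a MedianEstimator over nbins); the dataset choice is illustrative:

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy

train, test = qp.datasets.fetch_UCIDataset('ionosphere').train_test

# generic distribution matching on posteriors; prevalence sought via SLSQP by default
dm = DMy(LogisticRegression(), nbins=8, divergence='HD', search='optim_minimize').fit(train)
print(dm.quantify(test.X))

# the original HDy design: linear search + median across nbins in [10, 20, ..., 110]
hdy = DMy.HDy(LogisticRegression()).fit(train)
print(hdy.quantify(test.X))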

View File

@ -1,3 +1,4 @@
import itertools
from copy import deepcopy
from typing import Union
import numpy as np
@ -10,13 +11,14 @@ import quapy as qp
from quapy import functional as F
from quapy.data import LabelledCollection
from quapy.model_selection import GridSearchQ
from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
try:
from . import neural
except ModuleNotFoundError:
neural = None
from .base import BaseQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
if neural:
QuaNet = neural.QuaNetTrainer
@ -24,6 +26,67 @@ else:
QuaNet = "QuaNet is not available due to missing torch package"
class MedianEstimator(BinaryQuantifier):
"""
This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the
estimates returned by differently (hyper)parameterized base quantifiers.
The median of prevalence vectors is only guaranteed to be a valid prevalence vector for n=2 classes,
i.e., in cases of binary quantification.
:param base_quantifier: the base, binary quantifier
:param param_grid: the grid of parameters over which the median will be computed
:param random_state: a seed to be set before fitting any base quantifier (default None)
:param n_jobs: number of parallel workers
"""
def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None):
self.base_quantifier = base_quantifier
self.param_grid = param_grid
self.random_state = random_state
self.n_jobs = qp._get_njobs(n_jobs)
def get_params(self, deep=True):
return self.base_quantifier.get_params(deep)
def set_params(self, **params):
self.base_quantifier.set_params(**params)
def _delayed_fit(self, args):
with qp.util.temp_seed(self.random_state):
params, training = args
model = deepcopy(self.base_quantifier)
model.set_params(**params)
model.fit(training)
return model
def fit(self, training: LabelledCollection):
self._check_binary(training, self.__class__.__name__)
params_keys = list(self.param_grid.keys())
params_values = list(self.param_grid.values())
hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
self.models = qp.util.parallel(
self._delayed_fit,
((params, training) for params in hyper),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs
)
return self
def _delayed_predict(self, args):
model, instances = args
return model.quantify(instances)
def quantify(self, instances):
prev_preds = qp.util.parallel(
self._delayed_predict,
((model, instances) for model in self.models),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs
)
prev_preds = np.asarray(prev_preds)
return np.median(prev_preds, axis=0)
class Ensemble(BaseQuantifier):
VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES

View File

@ -1,5 +1,10 @@
from typing import Union, Callable
import numpy as np
from functional import get_divergence
from quapy.data import LabelledCollection
from .base import BaseQuantifier
from quapy.method.base import BaseQuantifier, BinaryQuantifier
import quapy.functional as F
class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
@ -33,3 +38,126 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
"""
return self.estimated_prevalence
class DMx(BaseQuantifier):
"""
Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of covariates.
This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
:param nbins: number of bins used to discretize the distributions (default 8)
:param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
Distance)
:param cdf: whether to use CDF instead of PDF (default False)
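:param search: the strategy used for searching the prevalence that minimizes the divergence; set to
'optim_minimize' (default) or 'linear_search'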
:param n_jobs: number of parallel workers (default None)
"""
def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, search='optim_minimize', n_jobs=None):
self.nbins = nbins
self.divergence = divergence
self.cdf = cdf
self.search = search
self.n_jobs = n_jobs
@classmethod
def HDx(cls, n_jobs=None):
"""
`Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
HDx is a method for training binary quantifiers that models quantification as the problem of
minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
histograms of two representations, one for the unlabelled examples, and another generated from the training
examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
the estimates of the class prevalence values.
The method computes the matching for every nbins in [10, 20, ..., 110] and reports the median of the estimates.
The best prevalence is sought via linear search, from 0 to 1, stepping by 0.01.
:param n_jobs: number of parallel workers
:return: an instance of this class set up to mimic the behaviour of HDx as originally proposed by
González-Castro, Alaiz-Rodríguez, Alegre (2013)
"""
from quapy.method.meta import MedianEstimator
dmx = DMx(divergence='HD', cdf=False, search='linear_search')
nbins = {'nbins': np.linspace(10, 110, 11, dtype=int)}
hdx = MedianEstimator(base_quantifier=dmx, param_grid=nbins, n_jobs=n_jobs)
return hdx
def __get_distributions(self, X):
histograms = []
for feat_idx in range(self.nfeats):
feature = X[:, feat_idx]
feat_range = self.feat_ranges[feat_idx]
hist = np.histogram(feature, bins=self.nbins, range=feat_range)[0]
norm_hist = hist / hist.sum()
histograms.append(norm_hist)
distributions = np.vstack(histograms)
if self.cdf:
distributions = np.cumsum(distributions, axis=1)
return distributions
def fit(self, data: LabelledCollection):
"""
Generates the validation distributions out of the training data (covariates).
The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats`
the number of features, and `nbins` the number of bins.
In particular, let `V` be the validation distributions; then `di=V[i]` are the distributions obtained from
training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in
training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin.
:param data: the training set
"""
X, y = data.Xy
self.nfeats = X.shape[1]
self.feat_ranges = _get_features_range(X)
self.validation_distribution = np.asarray(
[self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
)
return self
def quantify(self, instances):
"""
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
(the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
The matching is computed as the average dissimilarity (in terms of the dissimilarity measure of choice)
between all feature-specific discrete distributions.
:param instances: instances in the sample
:return: a vector of class prevalence estimates
"""
assert instances.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {instances.shape[1]}'
test_distribution = self.__get_distributions(instances)
divergence = get_divergence(self.divergence)
n_classes, n_feats, nbins = self.validation_distribution.shape
def loss(prev):
prev = np.expand_dims(prev, axis=0)
mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_feats, -1)
divs = [divergence(test_distribution[feat], mixture_distribution[feat]) for feat in range(n_feats)]
return np.mean(divs)
return F.argmin_prevalence(loss, n_classes, method=self.search)
def _get_features_range(X):
feat_ranges = []
ncols = X.shape[1]
for col_idx in range(ncols):
feature = X[:,col_idx]
feat_ranges.append((np.min(feature), np.max(feature)))
return feat_ranges
#---------------------------------------------------------------
# aliases
#---------------------------------------------------------------
DistributionMatchingX = DMx
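And the analogous sketch for the covariate-based counterpart (DMx computes one histogram per feature, so dense features are assumed; the dataset choice is again illustrative):

import quapy as qp
from quapy.method.non_aggregative import DMx

train, test = qp.datasets.fetch_UCIDataset('ionosphere').train_test

# generic distribution matching on the covariates (no classifier involved)
dmx = DMx(nbins=8, divergence='HD').fit(train)
print(dmx.quantify(test.X))

# HDx as originally proposed: linear search + median across nbins in [10, 20, ..., 110]
hdx = DMx.HDx(n_jobs=-1).fit(train)
print(hdx.quantify(test.X))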

View File

@ -223,7 +223,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol
for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
quantifier.fit(train)
fold_prev = quantifier.quantify(test.X)
rel_size = len(test.X)/len(data)
rel_size = 1. * len(test) / len(data)
total_prev += fold_prev*rel_size
return total_prev

View File

@ -72,7 +72,7 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
train_prev = train_prev[pos_class]
ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3)
ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
ax.set(xlabel='true frequency', ylabel='estimated frequency', title=title)
ax.set_ylim(0, 1)
ax.set_xlim(0, 1)
@ -216,9 +216,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
show_density=True,
show_legend=True,
logscale=False,
title=f'Quantification error as a function of distribution shift',
title=f'Quantification error as a function of label shift',
vlines=None,
method_order=None,
fontsize=18,
savepath=None):
"""
Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
@ -247,6 +248,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
"""
plt.rcParams['font.size'] = fontsize
fig, ax = plt.subplots()
ax.grid()
@ -261,7 +264,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if method_order is None:
method_order = method_names
_set_colors(ax, n_methods=len(method_order))
# _set_colors(ax, n_methods=len(method_order))
bins = np.linspace(0, 1, n_bins+1)
binwidth = 1 / n_bins
@ -291,6 +294,9 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ys = np.asarray(ys)
ystds = np.asarray(ystds)
# if ys[-1]<ys[-2]:
# ys[-1] = ys[-2]+(abs(ys[-2]-ys[-3]))/2
min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max()
min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
max_x = max_x_method if max_x is None or max_x_method > max_x else max_x
@ -313,8 +319,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ax2.spines['right'].set_color('g')
ax2.tick_params(axis='y', colors='g')
ax.set(xlabel=f'Distribution shift between training set and test sample',
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
ax.set(xlabel=f'Amount of label shift',
ylabel=f'Absolute error',
title=title)
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
@ -329,10 +335,11 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if show_legend:
fig.legend(loc='lower center',
bbox_to_anchor=(1, 0.5),
ncol=(len(method_names)+1)//2)
ax.legend(loc='center right', bbox_to_anchor=(1.31, 0.5))
# fig.legend(loc='lower center',
# bbox_to_anchor=(1, 0.5),
# ncol=(len(method_names)+1)//2)
_save_or_show(savepath)

View File

@ -236,7 +236,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
raise RuntimeError(
f"Abort: the number of samples that will be generated by {self.__class__.__name__} ({n}) "
f"exceeds the maximum number of allowed samples ({sanity_check = }). Set 'sanity_check' to "
f"None for bypassing this check, or to a higher number.")
f"None, or to a higher number, for bypassing this check.")
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)

View File

@ -1,14 +1,17 @@
import numpy
import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.method.base import BinaryQuantifier
from quapy.data import Dataset, LabelledCollection
from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS
from quapy.method.aggregative import ACC, PACC, HDy
from quapy.method.meta import Ensemble
from quapy.protocol import APP
from quapy.method.aggregative import DMy
from quapy.method.meta import MedianEstimator
datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
@ -36,7 +39,7 @@ def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
@pytest.mark.parametrize('dataset', datasets)
@ -55,7 +58,7 @@ def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
@pytest.mark.parametrize('base_method', AGGREGATIVE_METHODS)
@ -80,7 +83,7 @@ def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
def test_quanet_method():
@ -119,7 +122,7 @@ def test_quanet_method():
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
def test_str_label_names():
@ -130,32 +133,103 @@ def test_str_label_names():
dataset.test.sampling(1000, 0.25, 0.75))
qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
numpy.random.seed(0)
np.random.seed(0)
model.fit(dataset.training)
int_estim_prevalences = model.quantify(dataset.test.instances)
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, int_estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
dataset_str = Dataset(LabelledCollection(dataset.training.instances,
['one' if label == 1 else 'zero' for label in dataset.training.labels]),
LabelledCollection(dataset.test.instances,
['one' if label == 1 else 'zero' for label in dataset.test.labels]))
assert all(dataset_str.training.classes_ == dataset_str.test.classes_), 'wrong indexation'
numpy.random.seed(0)
np.random.seed(0)
model.fit(dataset_str.training)
str_estim_prevalences = model.quantify(dataset_str.test.instances)
true_prevalences = dataset_str.test.prevalence()
error = qp.error.mae(true_prevalences, str_estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
print(true_prevalences)
print(int_estim_prevalences)
print(str_estim_prevalences)
numpy.testing.assert_almost_equal(int_estim_prevalences[1],
np.testing.assert_almost_equal(int_estim_prevalences[1],
str_estim_prevalences[list(model.classes_).index('one')])
# helper
def __fit_test(quantifier, train, test):
quantifier.fit(train)
test_samples = APP(test)
true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, test_samples)
return qp.error.mae(true_prevs, estim_prevs), estim_prevs
def test_median_meta():
"""
This test compares the performance of the MedianEstimator against computing the median of the predictions
of a differently parameterized quantifier. We use the DMy base quantifier, and the median is
computed across different values of nbins
"""
qp.environ['SAMPLE_SIZE'] = 100
# grid of values
nbins_grid = list(range(2, 11))
dataset = 'kindle'
train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test
prevs = []
errors = []
for nbins in nbins_grid:
with qp.util.temp_seed(0):
q = DMy(LogisticRegression(), nbins=nbins)
mae, estim_prevs = __fit_test(q, train, test)
prevs.append(estim_prevs)
errors.append(mae)
print(f'{dataset} DistributionMatching(nbins={nbins}) got MAE {mae:.4f}')
prevs = np.asarray(prevs)
mae = np.mean(errors)
print(f'\tMAE={mae:.4f}')
q = DMy(LogisticRegression())
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
median_mae, prev = __fit_test(q, train, test)
print(f'\tMAE={median_mae:.4f}')
np.testing.assert_almost_equal(np.median(prevs, axis=0), prev)
assert median_mae < mae, 'the median-based quantifier provided a higher error...'
def test_median_meta_modsel():
"""
This test checks the median-meta quantifier with model selection
"""
qp.environ['SAMPLE_SIZE'] = 100
dataset = 'kindle'
train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test
train, val = train.split_stratified(random_state=0)
nbins_grid = [2, 4, 5, 10, 15]
q = DMy(LogisticRegression())
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
median_mae, _ = __fit_test(q, train, test)
print(f'\tMAE={median_mae:.4f}')
q = DMy(LogisticRegression())
lr_params = {'classifier__C': np.logspace(-1, 1, 3)}
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1)
optimized_median_ave, _ = __fit_test(q, train, test)
print(f'\tMAE={optimized_median_ave:.4f}')
assert optimized_median_ave < median_mae, "the optimized method yielded worse performance..."