Compare commits
25 Commits
15777b0fab
...
bb0950fad5
| Author | SHA1 | Date |
|---|---|---|
|
|
bb0950fad5 | |
|
|
2e992a0b9a | |
|
|
29db15ae25 | |
|
|
daca2bd1cb | |
|
|
66ad7295df | |
|
|
c3cf0e2d49 | |
|
|
76cf784844 | |
|
|
8a6579428b | |
|
|
f18bce5f80 | |
|
|
cc5ab8ad70 | |
|
|
3d4ffcea62 | |
|
|
5c7fbb2554 | |
|
|
13fe531e12 | |
|
|
51c3d54aa5 | |
|
|
34c60e0870 | |
|
|
ea71559722 | |
|
|
ffab2131a8 | |
|
|
a9f10f77f4 | |
|
|
239549eb4d | |
|
|
72fd21471d | |
|
|
d7192430e4 | |
|
|
5b90656bd1 | |
|
|
fd51cd14be | |
|
|
94ca8dec81 | |
|
|
ab070b5cc3 |
|
|
@ -0,0 +1,73 @@
|
|||
import itertools
|
||||
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
|
||||
|
||||
palette = itertools.cycle(sns.color_palette())
|
||||
|
||||
def setframe():
    """Strip the top/left/right spines and the y-axis ticks from the global `fig` axes.

    NOTE(review): `fig` is the Axes object returned by seaborn's histplot at module
    level (despite the name), not a matplotlib Figure.
    """
    for side in ('top', 'left', 'right'):
        fig.spines[side].set_visible(False)
    fig.get_yaxis().set_ticks([])
    # fig.axis('off')
|
||||
|
||||
nbins = 50
|
||||
figsize = (5, 2)
|
||||
ymax = 0.2
|
||||
|
||||
negatives = np.random.normal(loc = 0.3, scale=0.2, size=20000)
|
||||
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
|
||||
|
||||
plt.figure(figsize=figsize)
|
||||
plt.xlim(0, 1)
|
||||
plt.ylim(0, ymax)
|
||||
fig = sns.histplot(data=negatives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
|
||||
plt.title('Negative distribution')
|
||||
fig.set(yticklabels=[])
|
||||
fig.set(ylabel=None)
|
||||
setframe()
|
||||
# fig.get_figure().savefig('plots_cacm/negatives.pdf')
|
||||
# plt.clf()
|
||||
|
||||
# -------------------------------------------------------------
|
||||
|
||||
positives1 = np.random.normal(loc = 0.75, scale=0.06, size=20000)
|
||||
positives2 = np.random.normal(loc = 0.65, scale=0.1, size=1)
|
||||
positives = np.concatenate([positives1, positives2])
|
||||
np.random.shuffle(positives)
|
||||
positives = np.asarray([x for x in positives if 0 <= x <= 1])
|
||||
|
||||
# plt.figure(figsize=figsize)
|
||||
plt.xlim(0, 1)
|
||||
plt.ylim(0, ymax)
|
||||
fig = sns.histplot(data=positives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
|
||||
plt.title('')
|
||||
fig.set(yticklabels=[])
|
||||
fig.set(ylabel=None)
|
||||
setframe()
|
||||
fig.get_figure().savefig('plots_cacm/training.pdf')
|
||||
|
||||
# -------------------------------------------------------------
|
||||
|
||||
prev = 0.2
|
||||
test = np.concatenate([
|
||||
negatives[:int(len(negatives)*(1-prev))],
|
||||
positives[:int(len(positives)*(prev))],
|
||||
])
|
||||
|
||||
|
||||
plt.figure(figsize=figsize)
|
||||
plt.xlim(0, 1)
|
||||
plt.ylim(0, ymax)
|
||||
fig = sns.histplot(data=test, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
|
||||
plt.title('')
|
||||
fig.set(yticklabels=[])
|
||||
fig.set(ylabel=None)
|
||||
setframe()
|
||||
fig.get_figure().savefig('plots_cacm/test.pdf')
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,86 @@
|
|||
from copy import deepcopy
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
import quapy as qp
|
||||
from method.non_aggregative import DMx
|
||||
from protocol import APP
|
||||
from quapy.method.aggregative import CC, ACC, DMy
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 100
|
||||
DATASETS = qp.datasets.UCI_DATASETS[10:]
|
||||
|
||||
def fit_eval_task(args):
    """Fit one quantifier on `train` and evaluate it on `test` under the APP protocol.

    Designed as a worker for `qp.util.parallel`, hence the single packed argument.

    :param args: tuple (model_name, model, train, test)
    :return: tuple (model_name, true_prevalences, estimated_prevalences)
    """
    name, quantifier, train, test = args
    with qp.util.temp_seed(0):
        # work on a private copy so the prototype passed by the caller is never mutated
        quantifier = deepcopy(quantifier)
        quantifier.fit(train)
        protocol = APP(test, repeats=100, random_state=0)
        true_prev, estim_prev = qp.evaluation.prediction(quantifier, protocol)
    return name, true_prev, estim_prev
|
||||
|
||||
|
||||
def gen_data():
    """Run CC, ACC, HDy and HDx over every dataset in DATASETS and collect results.

    :return: four parallel lists (method names, true prevalences, estimated
        prevalences, training prevalences), one entry per (dataset, method) pair
    """

    def base_classifier():
        return LogisticRegression()
        # return LinearSVC(class_weight='balanced')

    def models():
        # each call builds fresh quantifier instances so runs are independent
        yield 'CC', CC(base_classifier())
        yield 'ACC', ACC(base_classifier())
        yield 'HDy', DMy(base_classifier(), val_split=10, nbins=10, n_jobs=-1)
        yield 'HDx', DMx(nbins=10, n_jobs=-1)

    # train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10).train_test
    names, trues, estims, trains = [], [], [], []

    for dataset in DATASETS:
        train, test = qp.datasets.fetch_UCIDataset(dataset).train_test
        print(dataset, train.X.shape)

        # fit/evaluate the four methods in parallel; each worker deep-copies its model
        results = qp.util.parallel(
            fit_eval_task,
            ((name, model, train, test) for name, model in models()),
            seed=0,
            n_jobs=-1
        )

        for name, true_prev, estim_prev in results:
            names.append(name)
            trues.append(true_prev)
            estims.append(estim_prev)
            trains.append(train.prevalence())

    return names, trues, estims, trains
|
||||
|
||||
method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('../quick_experiment/pickled_plot_data.pkl', gen_data)
|
||||
|
||||
def remove_dataset(dataset_order, num_methods=4):
    """Filter the module-level result lists, dropping every entry whose dataset
    position appears in `dataset_order`.

    Results are stored flat, `num_methods` consecutive entries per dataset, so the
    dataset position of entry i is i // num_methods.

    :param dataset_order: collection of dataset positions to exclude
    :param num_methods: number of methods evaluated per dataset (default 4)
    :return: four numpy arrays (names, true prevs, estimated prevs, training prevs)
    """
    excluded = set(dataset_order)
    kept = ([], [], [], [])
    records = zip(method_names, true_prevs, estim_prevs, tr_prevs)
    for i, record in enumerate(records):
        if i // num_methods not in excluded:
            for bucket, value in zip(kept, record):
                bucket.append(value)
    return tuple(np.asarray(bucket) for bucket in kept)
|
||||
|
||||
print(DATASETS)
|
||||
selected = 10
|
||||
for i in [selected]:
|
||||
print(i, DATASETS[i])
|
||||
all_ = set(range(len(DATASETS)))
|
||||
remove_index = sorted(all_ - {i})
|
||||
sel_names, sel_true, sel_estim, sel_tr = remove_dataset(dataset_order=remove_index, num_methods=4)
|
||||
|
||||
p=sel_tr[0][1]
|
||||
sel_names = ['CC$_{'+str(p)+'}$' if x=='CC' else x for x in sel_names]
|
||||
|
||||
# qp.plot.binary_diagonal(sel_names, sel_true, sel_estim, train_prev=sel_tr[0], show_std=False, savepath=f'./plots/bin_diag_{i}.png')
|
||||
qp.plot.error_by_drift(sel_names, sel_true, sel_estim, sel_tr, n_bins=10, savepath=f'./plots/err_drift_{i}.png', show_std=True, show_density=False, title="")
|
||||
# qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
|
||||
# qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=3, savepath='./plots/bin_bias_bin.png')
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
|
||||
import math
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import train_test_split, cross_val_predict
|
||||
from sklearn.neighbors import KernelDensity
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
from data import LabelledCollection
|
||||
|
||||
scale = 100
|
||||
|
||||
|
||||
import quapy as qp
|
||||
|
||||
negatives = np.random.normal(loc = 0.2, scale=0.2, size=20000)
|
||||
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
|
||||
|
||||
positives = np.random.normal(loc = 0.75, scale=0.05, size=20000)
|
||||
positives = np.asarray([x for x in positives if 0 <= x <= 1])
|
||||
|
||||
prev = 0.1
|
||||
test = np.concatenate([
|
||||
negatives[:int(len(negatives)*(1-prev))],
|
||||
positives[:int(len(positives)*(prev))],
|
||||
])
|
||||
|
||||
|
||||
nbins = 30
|
||||
|
||||
plt.rcParams.update({'font.size': 7})
|
||||
|
||||
fig = plt.figure()
|
||||
positions = np.asarray([2,1,0])
|
||||
colors = ['r', 'g', 'b']
|
||||
|
||||
|
||||
ax = fig.add_subplot(111, projection='3d')
|
||||
ax.set_box_aspect((3, 1, 0.8))
|
||||
|
||||
for post, c, z in zip([test, positives, negatives], colors, positions):
|
||||
|
||||
hist, bins = np.histogram(post, bins=np.linspace(0,1, nbins+1), density=True)
|
||||
xs = (bins[:-1] + bins[1:])/2
|
||||
|
||||
ax.bar(xs, hist, width=1 / nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
|
||||
|
||||
|
||||
ax.yaxis.set_ticks(positions)
|
||||
ax.yaxis.set_ticklabels([' '*20+'Test distribution', ' '*20+'Positive distribution', ' '*20+'Negative distribution'])
|
||||
# ax.xaxis.set_ticks([])
|
||||
# ax.xaxis.set_ticklabels([], minor=True)
|
||||
ax.zaxis.set_ticks([])
|
||||
ax.zaxis.set_ticklabels([], minor=True)
|
||||
|
||||
|
||||
#plt.figure(figsize=(10,6))
|
||||
#plt.show()
|
||||
plt.savefig('./histograms3d_CACM2023.pdf')
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
|
||||
import quapy as qp
|
||||
from data import LabelledCollection
|
||||
from method.non_aggregative import DMx
|
||||
from protocol import APP
|
||||
from quapy.method.aggregative import CC, DMy, ACC
|
||||
from sklearn.svm import LinearSVC
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 500
|
||||
|
||||
def cls():
    """Build the base classifier: logistic regression with built-in CV over 10 C values."""
    return LogisticRegressionCV(Cs=10, n_jobs=-1)
|
||||
|
||||
def gen_methods():
    """Yield (quantifier, display name) pairs for the four compared methods.

    The CC label embeds the 10% training prevalence in LaTeX math notation.
    """
    # r'\%' instead of '\%': the latter is an invalid escape sequence (SyntaxWarning
    # on Python >= 3.12); the raw string keeps the backslash the LaTeX label needs
    # while producing exactly the same runtime value
    yield CC(cls()), 'CC$_{10' + r'\%}$'
    yield ACC(cls()), 'ACC'
    yield DMy(cls(), val_split=10, nbins=10, n_jobs=-1), 'HDy'
    yield DMx(nbins=10, n_jobs=-1), 'HDx'
|
||||
|
||||
def gen_data():
    """Evaluate the four quantifiers on IMDb reviews with a 10%-positive training sample.

    HDx works on the covariates rather than on classifier posteriors, so for that
    method both the training sample and the test set are first projected onto 5
    dense dimensions with truncated SVD.

    :return: zipped columns (method names, true prevs, estimated prevs, training prevs)
    """
    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

    # since the problem is binary, it suffices to specify the negative prevalence,
    # since the positive is constrained
    training_prevalence = 0.1
    sample = train.sampling(5000, 1 - training_prevalence, random_state=0)

    records = []
    for model, name in tqdm(gen_methods(), total=4):
        with qp.util.temp_seed(1):
            if name == 'HDx':
                # densify: HDx operates on the raw input features
                svd = TruncatedSVD(n_components=5, random_state=0)
                X, y = sample.Xy
                fit_on = LabelledCollection(svd.fit_transform(X), y)
                X, y = test.Xy
                eval_on = LabelledCollection(svd.transform(X), y)
            else:
                fit_on, eval_on = sample, test

            model.fit(fit_on)
            true_prev, estim_prev = qp.evaluation.prediction(
                model, APP(eval_on, repeats=100, random_state=0))

        records.append((name, true_prev, estim_prev, sample.prevalence()))

    return zip(*records)
|
||||
|
||||
|
||||
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
|
||||
|
||||
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_4methods.pdf')
|
||||
qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_4methods.pdf', title='', show_density=False, show_std=True)
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
|
||||
import quapy as qp
|
||||
from protocol import APP
|
||||
from quapy.method.aggregative import CC
|
||||
from sklearn.svm import LinearSVC
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 500
|
||||
|
||||
def gen_data():
    """Train CC at 9 training prevalences (10%..90%) on IMDb and evaluate each with APP.

    :return: zipped columns (method names, true prevs, estimated prevs, training prevs)
    """
    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

    training_size = 5000  # loop-invariant, hoisted out of the loop
    method_data = []
    for training_prevalence in tqdm(np.linspace(0.1, 0.9, 9), total=9):
        # since the problem is binary, it suffices to specify the negative prevalence,
        # since the positive is constrained
        train_sample = train.sampling(training_size, 1 - training_prevalence)

        # cls = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-2,2,5), 'class_weight':[None, 'balanced']}, n_jobs=-1)
        # cls = GridSearchCV(LogisticRegression(), param_grid={'C': np.logspace(-2, 2, 5), 'class_weight': [None, 'balanced']}, n_jobs=-1)
        # cls.fit(*train_sample.Xy)

        model = CC(LogisticRegressionCV(n_jobs=-1, Cs=10))
        model.fit(train_sample)
        true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))

        # r'\%' instead of '\%': the latter is an invalid escape sequence
        # (SyntaxWarning on Python >= 3.12); runtime value is unchanged
        method_name = 'CC$_{' + f'{int(100 * training_prevalence)}' + r'\%}$'
        method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))

    return zip(*method_data)
|
||||
|
||||
|
||||
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
|
||||
|
||||
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_cc.pdf')
|
||||
# qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_cc.pdf', title='', show_density=False)
|
||||
|
|
@ -111,3 +111,7 @@ are provided:
|
|||
* [SVMperf](https://github.com/HLT-ISTI/QuaPy/wiki/ExplicitLossMinimization)
|
||||
* [Model Selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection)
|
||||
* [Plotting](https://github.com/HLT-ISTI/QuaPy/wiki/Plotting)
|
||||
|
||||
## Acknowledgments:
|
||||
|
||||
<img src="SoBigData.png" alt="SoBigData++" width="250"/>
|
||||
|
|
|
|||
Binary file not shown.
|
After Width: | Height: | Size: 128 KiB |
|
|
@ -0,0 +1,74 @@
|
|||
from sklearn.linear_model import LogisticRegression
|
||||
from time import time
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
import quapy as qp
|
||||
from quapy.protocol import APP
|
||||
from quapy.method.aggregative import HDy
|
||||
from quapy.method.non_aggregative import DMx
|
||||
|
||||
|
||||
"""
|
||||
This example is meant to experimentally compare HDy and HDx.
|
||||
The implementations of these methods adhere to the original design of the methods; in particular, this means that
|
||||
the number of bins is not a hyperparameter, but is something that the method explores internally (returning the
|
||||
median of the estimates as the final prevalence prediction), and the prevalence is not searched through any
|
||||
numerical optimization procedure, but simply as a linear search between 0 and 1, stepping by 0.01.
|
||||
See <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ for further details
|
||||
"""
|
||||
|
||||
qp.environ['SAMPLE_SIZE']=100
|
||||
|
||||
|
||||
df = pd.DataFrame(columns=['method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time'])
|
||||
|
||||
|
||||
for dataset_name in tqdm(qp.datasets.UCI_DATASETS, total=len(qp.datasets.UCI_DATASETS)):
|
||||
if dataset_name in ['acute.a', 'acute.b', 'balance.2', 'iris.1']: continue
|
||||
|
||||
collection = qp.datasets.fetch_UCILabelledCollection(dataset_name, verbose=False)
|
||||
train, test = collection.split_stratified()
|
||||
|
||||
# HDy............................................
|
||||
tinit = time()
|
||||
hdy = HDy(LogisticRegression()).fit(train)
|
||||
t_hdy_train = time()-tinit
|
||||
|
||||
tinit = time()
|
||||
hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean()
|
||||
t_hdy_test = time() - tinit
|
||||
df.loc[len(df)] = ['HDy', dataset_name, hdy_report['mae'], hdy_report['mrae'], t_hdy_train, t_hdy_test]
|
||||
|
||||
# HDx............................................
|
||||
tinit = time()
|
||||
hdx = DMx.HDx(n_jobs=-1).fit(train)
|
||||
t_hdx_train = time() - tinit
|
||||
|
||||
tinit = time()
|
||||
hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean()
|
||||
t_hdx_test = time() - tinit
|
||||
df.loc[len(df)] = ['HDx', dataset_name, hdx_report['mae'], hdx_report['mrae'], t_hdx_train, t_hdx_test]
|
||||
|
||||
# evaluation reports
|
||||
|
||||
print('\n'*3)
|
||||
print('='*80)
|
||||
print('Comparison in terms of performance')
|
||||
print('='*80)
|
||||
pv = df.pivot_table(index='dataset', columns='method', values=['MAE', 'MRAE'])
|
||||
print(pv)
|
||||
print('\nAveraged values:')
|
||||
print(pv.mean())
|
||||
|
||||
print('\n'*3)
|
||||
print('='*80)
|
||||
print('Comparison in terms of efficiency')
|
||||
print('='*80)
|
||||
pv = df.pivot_table(index='dataset', columns='method', values=['tr-time', 'te-time'])
|
||||
print(pv)
|
||||
print('\nAveraged values:')
|
||||
print(pv.mean())
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
import quapy as qp
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from quapy.evaluation import evaluation_report
|
||||
|
||||
|
||||
def newLR():
    """Return a fresh logistic-regression classifier that uses all available cores."""
    return LogisticRegression(n_jobs=-1)
|
||||
|
||||
|
||||
quantifiers = [
|
||||
('CC', qp.method.aggregative.CC(newLR())),
|
||||
('ACC', qp.method.aggregative.ACC(newLR())),
|
||||
('PCC', qp.method.aggregative.PCC(newLR())),
|
||||
('PACC', qp.method.aggregative.PACC(newLR())),
|
||||
('HDy', qp.method.aggregative.DMy(newLR())),
|
||||
('EMQ', qp.method.aggregative.EMQ(newLR()))
|
||||
]
|
||||
|
||||
|
||||
for quant_name, quantifier in quantifiers:
|
||||
print("Experiment with "+quant_name)
|
||||
|
||||
train, test_gen = qp.datasets.fetch_IFCB()
|
||||
|
||||
quantifier.fit(train)
|
||||
|
||||
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
|
||||
print(report.mean())
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import quapy as qp
|
||||
from quapy.protocol import APP
|
||||
from quapy.method.aggregative import DistributionMatching
|
||||
from quapy.method.aggregative import DMy
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
import numpy as np
|
||||
|
||||
|
|
@ -8,7 +8,7 @@ import numpy as np
|
|||
In this example, we show how to perform model selection on a DistributionMatching quantifier.
|
||||
"""
|
||||
|
||||
model = DistributionMatching(LogisticRegression())
|
||||
model = DMy(LogisticRegression())
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 100
|
||||
qp.environ['N_JOBS'] = -1
|
||||
|
|
|
|||
|
|
@ -1,13 +1,18 @@
|
|||
Change Log 0.1.8
|
||||
----------------
|
||||
|
||||
- Added HDx and DistributionMatchingX to non-aggregative quantifiers (see also the new example "comparing_HDy_HDx.py")
|
||||
- New UCI multiclass datasets added (thanks to Pablo González). The 5 UCI multiclass datasets are those corresponding
|
||||
to the following criteria:
|
||||
- >1000 instances
|
||||
- >2 classes
|
||||
- classification datasets
|
||||
- Python API available
|
||||
- Added NAE, NRAE
|
||||
- New IFCB (plankton) dataset added. See fetch_IFCB.
|
||||
- Added new evaluation measures NAE, NRAE
|
||||
- Added new meta method "MedianEstimator"; an ensemble of binary base quantifiers that receives as input a dictionary
|
||||
of hyperparameters that will explore exhaustively, fitting and generating predictions for each combination of
|
||||
hyperparameters, and that returns, as the prevalence estimates, the median across all predictions.
|
||||
|
||||
Change Log 0.1.7
|
||||
----------------
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ from . import util
|
|||
from . import model_selection
|
||||
from . import classification
|
||||
|
||||
__version__ = '0.1.7'
|
||||
__version__ = '0.1.8'
|
||||
|
||||
environ = {
|
||||
'SAMPLE_SIZE': None,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,51 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
from quapy.protocol import AbstractProtocol
|
||||
|
||||
class IFCBTrainSamplesFromDir(AbstractProtocol):
    """Protocol yielding the labelled training samples stored as CSV files in a directory.

    Each CSV holds one sample: the first column carries the class label and the
    remaining columns the precomputed features.
    """

    def __init__(self, path_dir: str, classes: list):
        self.path_dir = path_dir
        self.classes = classes
        # keep only the CSV files; anything else in the directory is ignored
        self.samples = [f for f in os.listdir(path_dir) if f.endswith('.csv')]

    def __call__(self):
        """Yield (X, y) pairs, one per CSV sample file."""
        for filename in self.samples:
            frame = pd.read_csv(os.path.join(self.path_dir, filename))
            # all columns but the first, where we get the class
            y = frame.iloc[:, 0].to_numpy()
            X = frame.iloc[:, 1:].to_numpy()
            yield X, y

    def total(self):
        """
        Returns the total number of samples that the protocol generates.

        :return: The number of training samples to generate.
        """
        return len(self.samples)
|
||||
|
||||
|
||||
class IFCBTestSamples(AbstractProtocol):
    """Protocol yielding test samples labelled by prevalence.

    The ground-truth prevalences come from a CSV whose 'sample' column names each
    sample file (without extension) and whose remaining columns hold one prevalence
    value per class.
    """

    def __init__(self, path_dir: str, test_prevalences_path: str):
        self.path_dir = path_dir
        self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path))

    def __call__(self):
        """Yield (X, prevalences) pairs, one per row of the prevalence table."""
        for _, row in self.test_prevalences.iterrows():
            # load the sample's feature matrix from disk
            sample_path = os.path.join(self.path_dir, row['sample'] + '.csv')
            X = pd.read_csv(sample_path).to_numpy()
            prevalences = row.iloc[1:].to_numpy().astype(float)
            yield X, prevalences

    def total(self):
        """
        Returns the total number of samples that the protocol generates.

        :return: The number of test samples to generate.
        """
        return len(self.test_prevalences.index)
|
||||
|
|
@ -6,8 +6,7 @@ import os
|
|||
import zipfile
|
||||
from os.path import join
|
||||
import pandas as pd
|
||||
import scipy
|
||||
import quapy
|
||||
from ucimlrepo import fetch_ucirepo
|
||||
from quapy.data.base import Dataset, LabelledCollection
|
||||
from quapy.data.preprocessing import text2tfidf, reduce_columns
|
||||
from quapy.data.reader import *
|
||||
|
|
@ -45,6 +44,12 @@ UCI_DATASETS = ['acute.a', 'acute.b',
|
|||
'wine-q-red', 'wine-q-white',
|
||||
'yeast']
|
||||
|
||||
UCI_MULTICLASS_DATASETS = ['dry-bean',
|
||||
'wine-quality',
|
||||
'academic-success',
|
||||
'digits',
|
||||
'letter']
|
||||
|
||||
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
|
||||
|
||||
_TXA_SAMPLE_SIZE = 250
|
||||
|
|
@ -364,7 +369,8 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
|
|||
elif verbose:
|
||||
print('no file description available')
|
||||
|
||||
print(f'Loading {dataset_name} ({fullname})')
|
||||
if verbose:
|
||||
print(f'Loading {dataset_name} ({fullname})')
|
||||
if identifier == 'acute':
|
||||
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
|
||||
|
||||
|
|
@ -545,7 +551,111 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
|
|||
y = binarize(y, pos_class='NUC')
|
||||
|
||||
data = LabelledCollection(X, y)
|
||||
data.stats()
|
||||
if verbose:
|
||||
data.stats()
|
||||
return data
|
||||
|
||||
|
||||
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.

    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
    - It has more than 1000 instances
    - It is suited for classification
    - It has more than two classes
    - It is available for Python import (requires ucimlrepo package)

    >>> import quapy as qp
    >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
    >>> train, test = dataset.train_test
    >>> ...

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`.

    Datasets are downloaded only once and pickled to disk, so consecutive calls are fast.

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use
        the default directory)
    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    collection = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
    # stratified split with a fixed seed, so the same partition is returned on every call
    parts = collection.split_stratified(1 - test_split, random_state=0)
    return Dataset(*parts)
|
||||
|
||||
|
||||
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
    """
    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.

    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
    - It has more than 1000 instances
    - It is suited for classification
    - It has more than two classes
    - It is available for Python import (requires ucimlrepo package)

    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
    >>> X, y = collection.Xy
    >>> ...

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`

    The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use
        the default directory)
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    """
    assert dataset_name in UCI_MULTICLASS_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the ' \
        f'UCI Machine Learning datasets repository (multiclass). ' \
        f'Valid ones are {UCI_MULTICLASS_DATASETS}'

    if data_home is None:
        data_home = get_quapy_home()

    # ucimlrepo numeric identifiers of the supported datasets
    identifiers = {
        "dry-bean": 602,
        "wine-quality": 186,
        "academic-success": 697,
        "digits": 80,
        "letter": 59
    }

    # human-readable names, used only for logging
    full_names = {
        "dry-bean": "Dry Bean Dataset",
        "wine-quality": "Wine Quality",
        "academic-success": "Predict students' dropout and academic success",
        "digits": "Optical Recognition of Handwritten Digits",
        "letter": "Letter Recognition"
    }

    identifier = identifiers[dataset_name]
    fullname = full_names[dataset_name]

    if verbose:
        # fixed typo in the log message: 'Muticlass' -> 'Multiclass'
        print(f'Loading UCI Multiclass {dataset_name} ({fullname})')

    file = join(data_home, 'uci_multiclass', dataset_name + '.pkl')

    def download(id):
        # fetch from the UCI repository and remap labels to contiguous integer codes
        data = fetch_ucirepo(id=id)
        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
        classes = np.sort(np.unique(y))
        y = np.searchsorted(classes, y)
        return LabelledCollection(X, y)

    data = pickled_resource(file, download, identifier)

    if verbose:
        data.stats()

    return data
|
||||
|
||||
|
||||
|
|
@ -624,12 +734,38 @@ def fetch_lequa2022(task, data_home=None):
|
|||
|
||||
return train, val_gen, test_gen
|
||||
|
||||
def fetch_IFCB(data_home=None):
|
||||
|
||||
def fetch_IFCB(single_sample_train=True, data_home=None):
|
||||
"""
|
||||
Loads the IFCB dataset for quantification <https://zenodo.org/records/10036244>`. For more
|
||||
information on this dataset check the zenodo site.
|
||||
This dataset is based on the data available publicly at <https://github.com/hsosik/WHOI-Plankton>.
|
||||
The scripts for the processing are available at <https://github.com/pglez82/IFCB_Zenodo>
|
||||
|
||||
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
|
||||
|
||||
The datasets are downloaded only once, and stored for fast reuse.
|
||||
|
||||
:param single_sample_train: boolean. If True (default), it returns the train dataset as an instance of
|
||||
:class:`quapy.data.base.LabelledCollection` (all examples together).
|
||||
If False, a generator of training samples will be returned.
|
||||
Each example in the training set has an individual class label.
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:return: a tuple `(train, test_gen)` where `train` is an instance of
|
||||
:class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is True or
|
||||
:class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise, i.e. a sampling protocol that
|
||||
returns a series of samples labelled example by example.
|
||||
test_gen is an instance of :class:`quapy.data._ifcb.IFCBTestSamples`,
|
||||
i.e., a sampling protocol that returns a series of samples labelled by prevalence.
|
||||
"""
|
||||
|
||||
from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples
|
||||
|
||||
if data_home is None:
|
||||
data_home = get_quapy_home()
|
||||
|
||||
URL_TRAINDEV=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
|
||||
|
||||
URL_TRAIN=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
|
||||
URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip'
|
||||
URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
|
||||
|
||||
|
|
@ -637,13 +773,43 @@ def fetch_IFCB(data_home=None):
|
|||
os.makedirs(ifcb_dir, exist_ok=True)
|
||||
|
||||
def download_unzip_and_remove(unzipped_path, url):
|
||||
tmp_path = join(ifcb_dir, 'tmp.zip')
|
||||
tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
|
||||
download_file_if_not_exists(url, tmp_path)
|
||||
with zipfile.ZipFile(tmp_path) as file:
|
||||
file.extractall(unzipped_path)
|
||||
os.remove(tmp_path)
|
||||
|
||||
if not os.path.exists(join(ifcb_dir, task)):
|
||||
download_unzip_and_remove(ifcb_dir, URL_TRAINDEV)
|
||||
if not os.path.exists(os.path.join(ifcb_dir,'train')):
|
||||
download_unzip_and_remove(ifcb_dir, URL_TRAIN)
|
||||
if not os.path.exists(os.path.join(ifcb_dir,'test')):
|
||||
download_unzip_and_remove(ifcb_dir, URL_TEST)
|
||||
download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
|
||||
if not os.path.exists(os.path.join(ifcb_dir,'test_prevalences.csv')):
|
||||
download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
|
||||
|
||||
# Load test prevalences and classes
|
||||
test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
|
||||
test_true_prev = pd.read_csv(test_true_prev_path)
|
||||
classes = test_true_prev.columns[1:]
|
||||
|
||||
#Load train samples
|
||||
train_samples_path = join(ifcb_dir,'train')
|
||||
train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
|
||||
|
||||
#Load test samples
|
||||
test_samples_path = join(ifcb_dir,'test')
|
||||
test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path)
|
||||
|
||||
# In the case the user wants it, join all the train samples in one LabelledCollection
|
||||
if single_sample_train:
|
||||
X = []
|
||||
y = []
|
||||
for X_, y_ in train_gen():
|
||||
X.append(X_)
|
||||
y.append(y_)
|
||||
|
||||
X = np.vstack(X)
|
||||
y = np.concatenate(y)
|
||||
train = LabelledCollection(X,y, classes=classes)
|
||||
return train, test_gen
|
||||
else:
|
||||
return train_gen, test_gen
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
import itertools
|
||||
from collections import defaultdict
|
||||
from typing import Union, Callable
|
||||
|
||||
import scipy
|
||||
import numpy as np
|
||||
|
||||
|
|
@ -64,7 +66,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
|
|||
return prevalences
|
||||
|
||||
|
||||
def HellingerDistance(P, Q):
|
||||
def HellingerDistance(P, Q) -> float:
|
||||
"""
|
||||
Computes the Hellinger Distance (HD) between (discretized) distributions `P` and `Q`.
|
||||
The HD for two discrete distributions of `k` bins is defined as:
|
||||
|
|
@ -276,3 +278,70 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
|
|||
return False
|
||||
return True
|
||||
|
||||
|
||||
def get_divergence(divergence: Union[str, Callable]):
|
||||
if isinstance(divergence, str):
|
||||
if divergence=='HD':
|
||||
return HellingerDistance
|
||||
elif divergence=='topsoe':
|
||||
return TopsoeDistance
|
||||
else:
|
||||
raise ValueError(f'unknown divergence {divergence}')
|
||||
elif callable(divergence):
|
||||
return divergence
|
||||
else:
|
||||
raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
|
||||
|
||||
|
||||
def argmin_prevalence(loss, n_classes, method='optim_minimize'):
|
||||
if method == 'optim_minimize':
|
||||
return optim_minimize(loss, n_classes)
|
||||
elif method == 'linear_search':
|
||||
return linear_search(loss, n_classes)
|
||||
elif method == 'ternary_search':
|
||||
raise NotImplementedError()
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
def optim_minimize(loss, n_classes):
|
||||
"""
|
||||
Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
|
||||
that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's
|
||||
SLSQP routine.
|
||||
|
||||
:param loss: (callable) the function to minimize
|
||||
:param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
|
||||
:return: (ndarray) the best prevalence vector found
|
||||
"""
|
||||
from scipy import optimize
|
||||
|
||||
# the initial point is set as the uniform distribution
|
||||
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||
|
||||
# solutions are bounded to those contained in the unit-simplex
|
||||
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
|
||||
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||
r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||
return r.x
|
||||
|
||||
|
||||
def linear_search(loss, n_classes):
|
||||
"""
|
||||
Performs a linear search for the best prevalence value in binary problems. The search is carried out by exploring
|
||||
the range [0,1] stepping by 0.01. This search is inefficient, and is added only for completeness (some of the
|
||||
early methods in quantification literature used it, e.g., HDy). A most powerful alternative is `optim_minimize`.
|
||||
|
||||
:param loss: (callable) the function to minimize
|
||||
:param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
|
||||
:return: (ndarray) the best prevalence vector found
|
||||
"""
|
||||
assert n_classes==2, 'linear search is only available for binary problems'
|
||||
|
||||
prev_selected, min_score = None, None
|
||||
for prev in prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
|
||||
score = loss(np.asarray([1 - prev, prev]))
|
||||
if min_score is None or score < min_score:
|
||||
prev_selected, min_score = prev, score
|
||||
|
||||
return np.asarray([1 - prev_selected, prev_selected])
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
from . import aggregative
|
||||
from . import base
|
||||
from . import meta
|
||||
from . import aggregative
|
||||
from . import non_aggregative
|
||||
from . import meta
|
||||
|
||||
AGGREGATIVE_METHODS = {
|
||||
aggregative.CC,
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from sklearn.metrics import confusion_matrix
|
|||
from sklearn.model_selection import cross_val_predict
|
||||
import quapy as qp
|
||||
import quapy.functional as F
|
||||
from functional import get_divergence
|
||||
from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
|
||||
from quapy.classification.svmperf import SVMperf
|
||||
from quapy.data import LabelledCollection
|
||||
|
|
@ -530,7 +531,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
"""
|
||||
`Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
|
||||
HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of
|
||||
minimizing the divergence (in terms of the Hellinger Distance) between two cumulative distributions of posterior
|
||||
minimizing the divergence (in terms of the Hellinger Distance) between two distributions of posterior
|
||||
probabilities returned by the classifier. One of the distributions is generated from the unlabelled examples and
|
||||
the other is generated from a validation set. This latter distribution is defined as a mixture of the
|
||||
class-conditional distributions of the posterior probabilities returned for the positive and negative validation
|
||||
|
|
@ -567,10 +568,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
|
||||
# pre-compute the histogram for positive and negative examples
|
||||
self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110]
|
||||
self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
|
||||
self.bins}
|
||||
self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in
|
||||
self.bins}
|
||||
def hist(P, bins):
|
||||
h = np.histogram(P, bins=bins, range=(0, 1), density=True)[0]
|
||||
return h / h.sum()
|
||||
self.Pxy1_density = {bins: hist(self.Pxy1, bins) for bins in self.bins}
|
||||
self.Pxy0_density = {bins: hist(self.Pxy0, bins) for bins in self.bins}
|
||||
return self
|
||||
|
||||
def aggregate(self, classif_posteriors):
|
||||
|
|
@ -590,6 +592,9 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
|
||||
Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
|
||||
|
||||
# the authors proposed to search for the prevalence yielding the best matching as a linear search
|
||||
# at small steps (modern implementations resort to an optimization procedure,
|
||||
# see class DistributionMatching)
|
||||
prev_selected, min_dist = None, None
|
||||
for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
|
||||
Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density
|
||||
|
|
@ -602,20 +607,6 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
return np.asarray([1 - class1_prev, class1_prev])
|
||||
|
||||
|
||||
def _get_divergence(divergence: Union[str, Callable]):
|
||||
if isinstance(divergence, str):
|
||||
if divergence=='HD':
|
||||
return F.HellingerDistance
|
||||
elif divergence=='topsoe':
|
||||
return F.TopsoeDistance
|
||||
else:
|
||||
raise ValueError(f'unknown divergence {divergence}')
|
||||
elif callable(divergence):
|
||||
return divergence
|
||||
else:
|
||||
raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
|
||||
|
||||
|
||||
class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
||||
"""
|
||||
`DyS framework <https://ojs.aaai.org/index.php/AAAI/article/view/4376>`_ (DyS).
|
||||
|
|
@ -673,7 +664,7 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
|
||||
|
||||
Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
|
||||
divergence = _get_divergence(self.divergence)
|
||||
divergence = get_divergence(self.divergence)
|
||||
|
||||
def distribution_distance(prev):
|
||||
Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
|
||||
|
|
@ -722,10 +713,11 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
return np.asarray([1 - class1_prev, class1_prev])
|
||||
|
||||
|
||||
class DistributionMatching(AggregativeProbabilisticQuantifier):
|
||||
class DMy(AggregativeProbabilisticQuantifier):
|
||||
"""
|
||||
Generic Distribution Matching quantifier for binary or multiclass quantification.
|
||||
This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
|
||||
Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior
|
||||
probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF
|
||||
as hyperparameters.
|
||||
|
||||
:param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier
|
||||
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the
|
||||
|
|
@ -738,18 +730,28 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
|
|||
:param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
|
||||
or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
|
||||
Distance)
|
||||
:param cdf: whether or not to use CDF instead of PDF (default False)
|
||||
:param cdf: whether to use CDF instead of PDF (default False)
|
||||
:param n_jobs: number of parallel workers (default None)
|
||||
"""
|
||||
|
||||
def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
|
||||
def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD',
|
||||
cdf=False, search='optim_minimize', n_jobs=None):
|
||||
self.classifier = classifier
|
||||
self.val_split = val_split
|
||||
self.nbins = nbins
|
||||
self.divergence = divergence
|
||||
self.cdf = cdf
|
||||
self.search = search
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
@classmethod
|
||||
def HDy(cls, classifier, val_split=0.4, n_jobs=None):
|
||||
from quapy.method.meta import MedianEstimator
|
||||
|
||||
hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD')
|
||||
hdy = MedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs)
|
||||
return hdy
|
||||
|
||||
def __get_distributions(self, posteriors):
|
||||
histograms = []
|
||||
post_dims = posteriors.shape[1]
|
||||
|
|
@ -770,8 +772,8 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
|
|||
"""
|
||||
Trains the classifier (if requested) and generates the validation distributions out of the training data.
|
||||
The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
|
||||
channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
|
||||
are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
|
||||
channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; then `di=V[i]`
|
||||
are the distributions obtained from training data labelled with class `i`; while `dij = di[j]` is the discrete
|
||||
distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
|
||||
is the fraction of instances with a value in the `k`-th bin.
|
||||
|
||||
|
|
@ -803,26 +805,20 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
|
|||
`n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
|
||||
independently. The matching is computed as an average of the divergence across all channels.
|
||||
|
||||
:param instances: instances in the sample
|
||||
:param posteriors: posterior probabilities of the instances in the sample
|
||||
:return: a vector of class prevalence estimates
|
||||
"""
|
||||
test_distribution = self.__get_distributions(posteriors)
|
||||
divergence = _get_divergence(self.divergence)
|
||||
divergence = get_divergence(self.divergence)
|
||||
n_classes, n_channels, nbins = self.validation_distribution.shape
|
||||
def match(prev):
|
||||
def loss(prev):
|
||||
prev = np.expand_dims(prev, axis=0)
|
||||
mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1)
|
||||
divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
|
||||
return np.mean(divs)
|
||||
|
||||
# the initial point is set as the uniform distribution
|
||||
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||
return F.argmin_prevalence(loss, n_classes, method=self.search)
|
||||
|
||||
# solutions are bounded to those contained in the unit-simplex
|
||||
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
|
||||
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||
return r.x
|
||||
|
||||
|
||||
def newELM(svmperf_base=None, loss='01', C=1):
|
||||
|
|
@ -1224,17 +1220,6 @@ class MS2(MS):
|
|||
return np.median(tprs), np.median(fprs)
|
||||
|
||||
|
||||
ClassifyAndCount = CC
|
||||
AdjustedClassifyAndCount = ACC
|
||||
ProbabilisticClassifyAndCount = PCC
|
||||
ProbabilisticAdjustedClassifyAndCount = PACC
|
||||
ExpectationMaximizationQuantifier = EMQ
|
||||
SLD = EMQ
|
||||
HellingerDistanceY = HDy
|
||||
MedianSweep = MS
|
||||
MedianSweep2 = MS2
|
||||
|
||||
|
||||
class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
|
||||
"""
|
||||
Allows any binary quantifier to perform quantification on single-label datasets.
|
||||
|
|
@ -1292,3 +1277,18 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
|
|||
# the estimation for the positive class prevalence
|
||||
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
|
||||
|
||||
|
||||
#---------------------------------------------------------------
|
||||
# aliases
|
||||
#---------------------------------------------------------------
|
||||
|
||||
ClassifyAndCount = CC
|
||||
AdjustedClassifyAndCount = ACC
|
||||
ProbabilisticClassifyAndCount = PCC
|
||||
ProbabilisticAdjustedClassifyAndCount = PACC
|
||||
ExpectationMaximizationQuantifier = EMQ
|
||||
DistributionMatchingY = DMy
|
||||
SLD = EMQ
|
||||
HellingerDistanceY = HDy
|
||||
MedianSweep = MS
|
||||
MedianSweep2 = MS2
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import itertools
|
||||
from copy import deepcopy
|
||||
from typing import Union
|
||||
import numpy as np
|
||||
|
|
@ -10,13 +11,14 @@ import quapy as qp
|
|||
from quapy import functional as F
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.method.base import BaseQuantifier, BinaryQuantifier
|
||||
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
|
||||
|
||||
try:
|
||||
from . import neural
|
||||
except ModuleNotFoundError:
|
||||
neural = None
|
||||
from .base import BaseQuantifier
|
||||
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
|
||||
|
||||
|
||||
if neural:
|
||||
QuaNet = neural.QuaNetTrainer
|
||||
|
|
@ -24,6 +26,67 @@ else:
|
|||
QuaNet = "QuaNet is not available due to missing torch package"
|
||||
|
||||
|
||||
class MedianEstimator(BinaryQuantifier):
|
||||
"""
|
||||
This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the
|
||||
estimation returned by differently (hyper)parameterized base quantifiers.
|
||||
The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions,
|
||||
i.e., in cases of binary quantification.
|
||||
|
||||
:param base_quantifier: the base, binary quantifier
|
||||
:param random_state: a seed to be set before fitting any base quantifier (default None)
|
||||
:param param_grid: the grid or parameters towards which the median will be computed
|
||||
:param n_jobs: number of parllel workes
|
||||
"""
|
||||
def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None):
|
||||
self.base_quantifier = base_quantifier
|
||||
self.param_grid = param_grid
|
||||
self.random_state = random_state
|
||||
self.n_jobs = qp._get_njobs(n_jobs)
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return self.base_quantifier.get_params(deep)
|
||||
|
||||
def set_params(self, **params):
|
||||
self.base_quantifier.set_params(**params)
|
||||
|
||||
def _delayed_fit(self, args):
|
||||
with qp.util.temp_seed(self.random_state):
|
||||
params, training = args
|
||||
model = deepcopy(self.base_quantifier)
|
||||
model.set_params(**params)
|
||||
model.fit(training)
|
||||
return model
|
||||
|
||||
def fit(self, training: LabelledCollection):
|
||||
self._check_binary(training, self.__class__.__name__)
|
||||
params_keys = list(self.param_grid.keys())
|
||||
params_values = list(self.param_grid.values())
|
||||
hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
|
||||
self.models = qp.util.parallel(
|
||||
self._delayed_fit,
|
||||
((params, training) for params in hyper),
|
||||
seed=qp.environ.get('_R_SEED', None),
|
||||
n_jobs=self.n_jobs
|
||||
)
|
||||
return self
|
||||
|
||||
def _delayed_predict(self, args):
|
||||
model, instances = args
|
||||
return model.quantify(instances)
|
||||
|
||||
def quantify(self, instances):
|
||||
prev_preds = qp.util.parallel(
|
||||
self._delayed_predict,
|
||||
((model, instances) for model in self.models),
|
||||
seed=qp.environ.get('_R_SEED', None),
|
||||
n_jobs=self.n_jobs
|
||||
)
|
||||
prev_preds = np.asarray(prev_preds)
|
||||
return np.median(prev_preds, axis=0)
|
||||
|
||||
|
||||
|
||||
class Ensemble(BaseQuantifier):
|
||||
VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
from typing import Union, Callable
|
||||
import numpy as np
|
||||
|
||||
from functional import get_divergence
|
||||
from quapy.data import LabelledCollection
|
||||
from .base import BaseQuantifier
|
||||
from quapy.method.base import BaseQuantifier, BinaryQuantifier
|
||||
import quapy.functional as F
|
||||
|
||||
|
||||
class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
|
||||
|
|
@ -33,3 +38,126 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
|
|||
"""
|
||||
return self.estimated_prevalence
|
||||
|
||||
|
||||
class DMx(BaseQuantifier):
|
||||
"""
|
||||
Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of covariates.
|
||||
This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
|
||||
|
||||
:param nbins: number of bins used to discretize the distributions (default 8)
|
||||
:param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
|
||||
or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
|
||||
Distance)
|
||||
:param cdf: whether to use CDF instead of PDF (default False)
|
||||
:param n_jobs: number of parallel workers (default None)
|
||||
"""
|
||||
|
||||
def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, search='optim_minimize', n_jobs=None):
|
||||
self.nbins = nbins
|
||||
self.divergence = divergence
|
||||
self.cdf = cdf
|
||||
self.search = search
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
@classmethod
|
||||
def HDx(cls, n_jobs=None):
|
||||
"""
|
||||
`Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
|
||||
HDx is a method for training binary quantifiers, that models quantification as the problem of
|
||||
minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
|
||||
histograms of two representations, one for the unlabelled examples, and another generated from the training
|
||||
examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
|
||||
the estimates of the class prevalence values.
|
||||
|
||||
The method computes all matchings for nbins in [10, 20, ..., 110] and reports the mean of the median.
|
||||
The best prevalence is searched via linear search, from 0 to 1 stepping by 0.01.
|
||||
|
||||
:param n_jobs: number of parallel workers
|
||||
:return: an instance of this class setup to mimick the performance of the HDx as originally proposed by
|
||||
González-Castro, Alaiz-Rodríguez, Alegre (2013)
|
||||
"""
|
||||
from quapy.method.meta import MedianEstimator
|
||||
|
||||
dmx = DMx(divergence='HD', cdf=False, search='linear_search')
|
||||
nbins = {'nbins': np.linspace(10, 110, 11, dtype=int)}
|
||||
hdx = MedianEstimator(base_quantifier=dmx, param_grid=nbins, n_jobs=n_jobs)
|
||||
return hdx
|
||||
|
||||
def __get_distributions(self, X):
|
||||
|
||||
histograms = []
|
||||
for feat_idx in range(self.nfeats):
|
||||
feature = X[:, feat_idx]
|
||||
feat_range = self.feat_ranges[feat_idx]
|
||||
hist = np.histogram(feature, bins=self.nbins, range=feat_range)[0]
|
||||
norm_hist = hist / hist.sum()
|
||||
histograms.append(norm_hist)
|
||||
distributions = np.vstack(histograms)
|
||||
|
||||
if self.cdf:
|
||||
distributions = np.cumsum(distributions, axis=1)
|
||||
|
||||
return distributions
|
||||
|
||||
def fit(self, data: LabelledCollection):
|
||||
"""
|
||||
Generates the validation distributions out of the training data (covariates).
|
||||
The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats`
|
||||
the number of features, and `nbins` the number of bins.
|
||||
In particular, let `V` be the validation distributions; then `di=V[i]` are the distributions obtained from
|
||||
training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in
|
||||
training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin.
|
||||
|
||||
:param data: the training set
|
||||
"""
|
||||
X, y = data.Xy
|
||||
|
||||
self.nfeats = X.shape[1]
|
||||
self.feat_ranges = _get_features_range(X)
|
||||
|
||||
self.validation_distribution = np.asarray(
|
||||
[self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
def quantify(self, instances):
|
||||
"""
|
||||
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
|
||||
(the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
|
||||
The matching is computed as the average dissimilarity (in terms of the dissimilarity measure of choice)
|
||||
between all feature-specific discrete distributions.
|
||||
|
||||
:param instances: instances in the sample
|
||||
:return: a vector of class prevalence estimates
|
||||
"""
|
||||
|
||||
assert instances.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {instances.shape[1]}'
|
||||
|
||||
test_distribution = self.__get_distributions(instances)
|
||||
divergence = get_divergence(self.divergence)
|
||||
n_classes, n_feats, nbins = self.validation_distribution.shape
|
||||
def loss(prev):
|
||||
prev = np.expand_dims(prev, axis=0)
|
||||
mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_feats, -1)
|
||||
divs = [divergence(test_distribution[feat], mixture_distribution[feat]) for feat in range(n_feats)]
|
||||
return np.mean(divs)
|
||||
|
||||
return F.argmin_prevalence(loss, n_classes, method=self.search)
|
||||
|
||||
|
||||
|
||||
def _get_features_range(X):
|
||||
feat_ranges = []
|
||||
ncols = X.shape[1]
|
||||
for col_idx in range(ncols):
|
||||
feature = X[:,col_idx]
|
||||
feat_ranges.append((np.min(feature), np.max(feature)))
|
||||
return feat_ranges
|
||||
|
||||
|
||||
#---------------------------------------------------------------
|
||||
# aliases
|
||||
#---------------------------------------------------------------
|
||||
|
||||
DistributionMatchingX = DMx
|
||||
|
|
@ -223,7 +223,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol
|
|||
for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
|
||||
quantifier.fit(train)
|
||||
fold_prev = quantifier.quantify(test.X)
|
||||
rel_size = len(test.X)/len(data)
|
||||
rel_size = 1. * len(test) / len(data)
|
||||
total_prev += fold_prev*rel_size
|
||||
|
||||
return total_prev
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
|
|||
train_prev = train_prev[pos_class]
|
||||
ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3)
|
||||
|
||||
ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
|
||||
ax.set(xlabel='true frequency', ylabel='estimated frequency', title=title)
|
||||
ax.set_ylim(0, 1)
|
||||
ax.set_xlim(0, 1)
|
||||
|
||||
|
|
@ -216,9 +216,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
show_density=True,
|
||||
show_legend=True,
|
||||
logscale=False,
|
||||
title=f'Quantification error as a function of distribution shift',
|
||||
title=f'Quantification error as a function of label shift',
|
||||
vlines=None,
|
||||
method_order=None,
|
||||
fontsize=18,
|
||||
savepath=None):
|
||||
"""
|
||||
Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
|
||||
|
|
@ -247,6 +248,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
|
||||
"""
|
||||
|
||||
plt.rcParams['font.size'] = fontsize
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
ax.grid()
|
||||
|
||||
|
|
@ -261,7 +264,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
if method_order is None:
|
||||
method_order = method_names
|
||||
|
||||
_set_colors(ax, n_methods=len(method_order))
|
||||
# _set_colors(ax, n_methods=len(method_order))
|
||||
|
||||
bins = np.linspace(0, 1, n_bins+1)
|
||||
binwidth = 1 / n_bins
|
||||
|
|
@ -291,6 +294,9 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
ys = np.asarray(ys)
|
||||
ystds = np.asarray(ystds)
|
||||
|
||||
# if ys[-1]<ys[-2]:
|
||||
# ys[-1] = ys[-2]+(abs(ys[-2]-ys[-3]))/2
|
||||
|
||||
min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max()
|
||||
min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
|
||||
max_x = max_x_method if max_x is None or max_x_method > max_x else max_x
|
||||
|
|
@ -313,8 +319,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
ax2.spines['right'].set_color('g')
|
||||
ax2.tick_params(axis='y', colors='g')
|
||||
|
||||
ax.set(xlabel=f'Distribution shift between training set and test sample',
|
||||
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
|
||||
ax.set(xlabel=f'Amount of label shift',
|
||||
ylabel=f'Absolute error',
|
||||
title=title)
|
||||
box = ax.get_position()
|
||||
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
|
||||
|
|
@ -329,10 +335,11 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
|
||||
|
||||
if show_legend:
|
||||
fig.legend(loc='lower center',
|
||||
bbox_to_anchor=(1, 0.5),
|
||||
ncol=(len(method_names)+1)//2)
|
||||
|
||||
ax.legend(loc='center right', bbox_to_anchor=(1.31, 0.5))
|
||||
# fig.legend(loc='lower center',
|
||||
# bbox_to_anchor=(1, 0.5),
|
||||
# ncol=(len(method_names)+1)//2)
|
||||
|
||||
_save_or_show(savepath)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -236,7 +236,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
|||
raise RuntimeError(
|
||||
f"Abort: the number of samples that will be generated by {self.__class__.__name__} ({n}) "
|
||||
f"exceeds the maximum number of allowed samples ({sanity_check = }). Set 'sanity_check' to "
|
||||
f"None for bypassing this check, or to a higher number.")
|
||||
f"None, or to a higher number, for bypassing this check.")
|
||||
|
||||
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,14 +1,17 @@
|
|||
import numpy
|
||||
import numpy as np
|
||||
import pytest
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
import quapy as qp
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.method.base import BinaryQuantifier
|
||||
from quapy.data import Dataset, LabelledCollection
|
||||
from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS
|
||||
from quapy.method.aggregative import ACC, PACC, HDy
|
||||
from quapy.method.meta import Ensemble
|
||||
from quapy.protocol import APP
|
||||
from quapy.method.aggregative import DMy
|
||||
from quapy.method.meta import MedianEstimator
|
||||
|
||||
datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
|
||||
pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
|
||||
|
|
@ -36,7 +39,7 @@ def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
|
|||
true_prevalences = dataset.test.prevalence()
|
||||
error = qp.error.mae(true_prevalences, estim_prevalences)
|
||||
|
||||
assert type(error) == numpy.float64
|
||||
assert type(error) == np.float64
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dataset', datasets)
|
||||
|
|
@ -55,7 +58,7 @@ def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
|
|||
true_prevalences = dataset.test.prevalence()
|
||||
error = qp.error.mae(true_prevalences, estim_prevalences)
|
||||
|
||||
assert type(error) == numpy.float64
|
||||
assert type(error) == np.float64
|
||||
|
||||
|
||||
@pytest.mark.parametrize('base_method', AGGREGATIVE_METHODS)
|
||||
|
|
@ -80,7 +83,7 @@ def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
|
|||
true_prevalences = dataset.test.prevalence()
|
||||
error = qp.error.mae(true_prevalences, estim_prevalences)
|
||||
|
||||
assert type(error) == numpy.float64
|
||||
assert type(error) == np.float64
|
||||
|
||||
|
||||
def test_quanet_method():
|
||||
|
|
@ -119,7 +122,7 @@ def test_quanet_method():
|
|||
true_prevalences = dataset.test.prevalence()
|
||||
error = qp.error.mae(true_prevalences, estim_prevalences)
|
||||
|
||||
assert type(error) == numpy.float64
|
||||
assert type(error) == np.float64
|
||||
|
||||
|
||||
def test_str_label_names():
|
||||
|
|
@ -130,32 +133,103 @@ def test_str_label_names():
|
|||
dataset.test.sampling(1000, 0.25, 0.75))
|
||||
qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
|
||||
|
||||
numpy.random.seed(0)
|
||||
np.random.seed(0)
|
||||
model.fit(dataset.training)
|
||||
|
||||
int_estim_prevalences = model.quantify(dataset.test.instances)
|
||||
true_prevalences = dataset.test.prevalence()
|
||||
|
||||
error = qp.error.mae(true_prevalences, int_estim_prevalences)
|
||||
assert type(error) == numpy.float64
|
||||
assert type(error) == np.float64
|
||||
|
||||
dataset_str = Dataset(LabelledCollection(dataset.training.instances,
|
||||
['one' if label == 1 else 'zero' for label in dataset.training.labels]),
|
||||
LabelledCollection(dataset.test.instances,
|
||||
['one' if label == 1 else 'zero' for label in dataset.test.labels]))
|
||||
assert all(dataset_str.training.classes_ == dataset_str.test.classes_), 'wrong indexation'
|
||||
numpy.random.seed(0)
|
||||
np.random.seed(0)
|
||||
model.fit(dataset_str.training)
|
||||
|
||||
str_estim_prevalences = model.quantify(dataset_str.test.instances)
|
||||
true_prevalences = dataset_str.test.prevalence()
|
||||
|
||||
error = qp.error.mae(true_prevalences, str_estim_prevalences)
|
||||
assert type(error) == numpy.float64
|
||||
assert type(error) == np.float64
|
||||
|
||||
print(true_prevalences)
|
||||
print(int_estim_prevalences)
|
||||
print(str_estim_prevalences)
|
||||
|
||||
numpy.testing.assert_almost_equal(int_estim_prevalences[1],
|
||||
np.testing.assert_almost_equal(int_estim_prevalences[1],
|
||||
str_estim_prevalences[list(model.classes_).index('one')])
|
||||
|
||||
# helper
|
||||
def __fit_test(quantifier, train, test):
|
||||
quantifier.fit(train)
|
||||
test_samples = APP(test)
|
||||
true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, test_samples)
|
||||
return qp.error.mae(true_prevs, estim_prevs), estim_prevs
|
||||
|
||||
|
||||
def test_median_meta():
|
||||
"""
|
||||
This test compares the performance of the MedianQuantifier with respect to computing the median of the predictions
|
||||
of a differently parameterized quantifier. We use the DistributionMatching base quantifier and the median is
|
||||
computed across different values of nbins
|
||||
"""
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 100
|
||||
|
||||
# grid of values
|
||||
nbins_grid = list(range(2, 11))
|
||||
|
||||
dataset = 'kindle'
|
||||
train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test
|
||||
prevs = []
|
||||
errors = []
|
||||
for nbins in nbins_grid:
|
||||
with qp.util.temp_seed(0):
|
||||
q = DMy(LogisticRegression(), nbins=nbins)
|
||||
mae, estim_prevs = __fit_test(q, train, test)
|
||||
prevs.append(estim_prevs)
|
||||
errors.append(mae)
|
||||
print(f'{dataset} DistributionMatching(nbins={nbins}) got MAE {mae:.4f}')
|
||||
prevs = np.asarray(prevs)
|
||||
mae = np.mean(errors)
|
||||
print(f'\tMAE={mae:.4f}')
|
||||
|
||||
q = DMy(LogisticRegression())
|
||||
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
|
||||
median_mae, prev = __fit_test(q, train, test)
|
||||
print(f'\tMAE={median_mae:.4f}')
|
||||
|
||||
np.testing.assert_almost_equal(np.median(prevs, axis=0), prev)
|
||||
assert median_mae < mae, 'the median-based quantifier provided a higher error...'
|
||||
|
||||
|
||||
def test_median_meta_modsel():
|
||||
"""
|
||||
This test checks the median-meta quantifier with model selection
|
||||
"""
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 100
|
||||
|
||||
dataset = 'kindle'
|
||||
train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test
|
||||
train, val = train.split_stratified(random_state=0)
|
||||
|
||||
nbins_grid = [2, 4, 5, 10, 15]
|
||||
|
||||
q = DMy(LogisticRegression())
|
||||
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
|
||||
median_mae, _ = __fit_test(q, train, test)
|
||||
print(f'\tMAE={median_mae:.4f}')
|
||||
|
||||
q = DMy(LogisticRegression())
|
||||
lr_params = {'classifier__C': np.logspace(-1, 1, 3)}
|
||||
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
|
||||
q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1)
|
||||
optimized_median_ave, _ = __fit_test(q, train, test)
|
||||
print(f'\tMAE={optimized_median_ave:.4f}')
|
||||
|
||||
assert optimized_median_ave < median_mae, "the optimized method yielded worse performance..."
|
||||
Loading…
Reference in New Issue