Merge branch 'devel' of github.com:HLT-ISTI/QuaPy into devel

This commit is contained in:
Alejandro Moreo Fernandez 2024-11-29 10:57:14 +01:00
commit 1c733f3d77
13 changed files with 595 additions and 21 deletions

View File

@@ -1,3 +1,12 @@
Change Log 0.1.10
-----------------
- Added (aggregative) bootstrap for deriving confidence regions (confidence intervals, ellipses in the simplex, or
ellipses in the CLR space). This method is efficient as it leverages the two phases of aggregative quantifiers,
applying resampling only to the aggregation phase, thus avoiding the need to train many quantifiers or to
classify the instances of a sample multiple times. See the new example no. 15.
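A minimal usage sketch (see example no. 15 for the full version; `train` and `test_instances` are placeholders
for a labelled training collection and a set of test instances):
    from quapy.method.aggregative import PACC
    from quapy.method.confidence import AggregativeBootstrap
    pacc = AggregativeBootstrap(PACC(), confidence_level=0.95)
    pacc.fit(train)  # the underlying classifier is trained only once
    prev_estimate, conf_region = pacc.quantify_conf(test_instances)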
Change Log 0.1.9
----------------

View File

@@ -1,3 +1,6 @@
- [TODO] adapt BayesianCC to WithConfidence interface
- [TODO] Test the return_type="index" in protocols and finish the "distributing_samples.py" example
- [TODO] Add EDy (an implementation is available at quantificationlib)
- [TODO] add ensemble methods SC-MQ, MC-SQ, MC-MQ
- [TODO] add HistNetQ
- [TODO] add CDE-iteration and Bayes-CDE methods

View File

@@ -33,10 +33,8 @@ import quapy.functional as F # <- this module has some functional utilities, li
print(f'training prevalence = {F.strprev(train.prevalence())}')
# let us train one quantifier, for example, PACC using a sklearn's Logistic Regressor as the underlying classifier
# classifier = LogisticRegression()
# pacc = qp.method.aggregative.PACC(classifier)
pacc = qp.method.aggregative.PACC()
classifier = LogisticRegression()
pacc = qp.method.aggregative.PACC(classifier)
print(f'training {pacc}')
pacc.fit(train)

View File

@@ -0,0 +1,78 @@
from quapy.method.confidence import AggregativeBootstrap
from quapy.method.aggregative import PACC
import quapy.functional as F
import quapy as qp
"""
Just like any other type of estimator, quantifier predictions are affected by error. It is therefore useful to provide,
along with the point estimate (the class prevalence values), a measure of uncertainty. This typically comes in the
form of credible regions around the point estimate.
QuaPy implements a method for deriving confidence regions around point estimates of class prevalence based on bootstrap.
The bootstrap method amounts to resampling the population several times, thus generating a series of point estimates.
QuaPy provides a variant of bootstrap for aggregative quantifiers that applies resampling only to the pre-classified
instances.
Let us see an example:
"""
# load some data
data = qp.datasets.fetch_UCIMulticlassDataset('molecular')
train, test = data.train_test
# by simply wrapping an aggregative quantifier within the AggregativeBootstrap class, we can obtain confidence
# intervals around the point estimate, in this case at 95% confidence
pacc = AggregativeBootstrap(PACC(), confidence_level=0.95)
with qp.util.temp_seed(0):
# we train the quantifier the usual way
pacc.fit(train)
# let us simulate some shift in the test data
random_prevalence = F.uniform_prevalence_sampling(n_classes=test.n_classes)
shifted_test = test.sampling(200, *random_prevalence)
true_prev = shifted_test.prevalence()
# by calling "quantify_conf", we obtain the point estimate and the confidence intervals around it
pred_prev, conf_intervals = pacc.quantify_conf(shifted_test.X)
# conf_intervals is an instance of ConfidenceRegionABC, which provides some useful utilities like:
# - coverage: a function that computes the fraction of true values that belong to the confidence region
# - simplex_portion: estimates the proportion of the simplex covered by the confidence region (its amplitude)
# ideally, we are interested in obtaining confidence regions with high coverage and small amplitude
# the point estimate is computed as the mean of all bootstrap predictions; let us see the prediction error
error = qp.error.ae(true_prev, pred_prev)
# some useful outputs
print(f'train prevalence: {F.strprev(train.prevalence())}')
print(f'test prevalence: {F.strprev(true_prev)}')
print(f'point-estimate: {F.strprev(pred_prev)}')
print(f'absolute error: {error:.3f}')
print(f'Is the true value in the confidence region?: {conf_intervals.coverage(true_prev)==1}')
print(f'Proportion of simplex covered at {pacc.confidence_level*100:.1f}%: {conf_intervals.simplex_portion()*100:.2f}%')
"""
Final remarks:
There are various ways of performing bootstrap:
- the population-based approach (default): performs resampling of the test instances
e.g., use AggregativeBootstrap(PACC(), n_train_samples=1, n_test_samples=100, confidence_level=0.95)
- the model-based approach: performs resampling of the training instances, thus training several quantifiers;
e.g., use AggregativeBootstrap(PACC(), n_train_samples=100, n_test_samples=1, confidence_level=0.95)
this implementation avoids retraining the classifier, applying resampling only to train different aggregation functions
- the combined approach: a combination of the above
e.g., use AggregativeBootstrap(PACC(), n_train_samples=100, n_test_samples=100, confidence_level=0.95)
this example will generate 100 x 100 predictions
QuaPy implements different ways of constructing confidence regions:
- confidence intervals: the simplest method, and one that typically works well in practice
use: AggregativeBootstrap(PACC(), confidence_level=0.95, method='intervals')
- confidence ellipse in the simplex: creates an ellipse around the point estimate that lies on the probability simplex
use: AggregativeBootstrap(PACC(), confidence_level=0.95, method='ellipse')
- confidence ellipse in the Centered Log-Ratio (CLR) space: creates an ellipse in the CLR space (convenient for
taking the inner structure of the probability simplex into account)
use: AggregativeBootstrap(PACC(), confidence_level=0.95, method='ellipse-clr')
"""

View File

@@ -0,0 +1,38 @@
"""
Imagine we want to generate many samples out of a collection and distribute them, so that others can run their
own experiments on the very same test samples. One naive solution would be to apply a given protocol to
our collection (say, the artificial prevalence protocol on the 'academic-success' UCI dataset), store all those
samples on disk, and make them available online. However, distributing that many samples is undesirable.
In this example, we instead generate the indexes that allow anyone to regenerate the samples out of the original collection.
"""
import quapy as qp
from quapy.method.aggregative import PACC
from quapy.protocol import UPP
data = qp.datasets.fetch_UCIMulticlassDataset('academic-success')
train, test = data.train_test
# let us train a quantifier to check whether we can actually replicate the results
quantifier = PACC()
quantifier.fit(train)
# let us simulate our experimental results
protocol = UPP(test, sample_size=100, repeats=100, random_state=0)
our_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae')
print(f'We have obtained a MAE={our_mae:.3f}')
# let us distribute the indexes; we specify that we want the indexes, not the samples
protocol = UPP(test, sample_size=100, repeats=100, random_state=0, return_type='index')
indexes = protocol.samples_parameters()
# Imagine we distribute the indexes; now we show how to replicate our experiments.
from quapy.protocol import ProtocolFromIndex
data = qp.datasets.fetch_UCIMulticlassDataset('academic-success')
train, test = data.train_test
protocol = ProtocolFromIndex(data=test, indexes=indexes)
their_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae')
print(f'Another lab obtains a MAE={their_mae:.3f}')
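# The indexes still need to be shipped somehow; a minimal sketch (hypothetical file name) using pickle,
# assuming the indexes returned by the protocol can be serialized as-is
import pickle
with open('academic-success-indexes.pkl', 'wb') as foo:
    pickle.dump(indexes, foo)
with open('academic-success-indexes.pkl', 'rb') as foo:
    indexes = pickle.load(foo)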

examples/ensembles.py Normal file
View File

@@ -0,0 +1,36 @@
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.protocol import UPP
from quapy.method.aggregative import PACC, DMy, EMQ, KDEyML
from quapy.method.meta import SCMQ
qp.environ["SAMPLE_SIZE"]=100
def train_and_test_model(quantifier, train, test):
quantifier.fit(train)
report = qp.evaluation.evaluation_report(quantifier, UPP(test), error_metrics=['mae', 'mrae'])
print(quantifier.__class__.__name__)
print(report.mean(numeric_only=True))
quantifiers = [
PACC(),
DMy(),
EMQ(),
KDEyML()
]
classifier = LogisticRegression()
dataset_name = qp.datasets.UCI_MULTICLASS_DATASETS[0]
data = qp.datasets.fetch_UCIMulticlassDataset(dataset_name)
train, test = data.train_test
scmq = SCMQ(classifier, quantifiers)
train_and_test_model(scmq, train, test)
for quantifier in quantifiers:
train_and_test_model(quantifier, train, test)

View File

@@ -14,7 +14,7 @@ from . import model_selection
from . import classification
import os
__version__ = '0.1.9'
__version__ = '0.1.10'
environ = {
'SAMPLE_SIZE': None,

View File

@@ -298,6 +298,31 @@ def nmd(prevs, prevs_hat):
return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat))
def bias_binary(prevs, prevs_hat):
"""
Computes the (positive) bias in a binary problem. The bias is simply the difference between the
predicted and the true prevalence of the positive class, so that a positive value indicates that the
prediction tends to overestimate the true value, and a negative value indicates underestimation:
:math:`bias(p,\\hat{p})=\\hat{p}_1-p_1`
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
prevalence values
:return: binary bias
"""
assert prevs.shape[-1] == 2 and prevs_hat.shape[-1] == 2, 'bias_binary can only be applied to binary problems'
return prevs_hat[...,1]-prevs[...,1]
def mean_bias_binary(prevs, prevs_hat):
"""
Computes the mean of the (positive) bias in a binary problem.
:param prevs: array-like of shape `(n_classes,)` with the true prevalence values
:param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
:return: mean binary bias
"""
return np.mean(bias_binary(prevs, prevs_hat))
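# a quick illustration of the sign convention (hypothetical values, not part of the module):
# >>> import numpy as np
# >>> bias_binary(np.asarray([[0.8, 0.2], [0.5, 0.5]]), np.asarray([[0.7, 0.3], [0.4, 0.6]]))
# array([0.1, 0.1])   # (approximately) the positive class is overestimated by 0.1 in both samples
# >>> mean_bias_binary(np.asarray([[0.8, 0.2], [0.5, 0.5]]), np.asarray([[0.7, 0.3], [0.4, 0.6]]))
# 0.1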
def md(prevs, prevs_hat, ERROR_TOL=1E-3):
"""
Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in
@@ -338,8 +363,8 @@ def __check_eps(eps=None):
CLASSIFICATION_ERROR = {f1e, acce}
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld}
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld}
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, mean_bias_binary}
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, bias_binary}
QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae}
CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}

View File

@@ -591,7 +591,6 @@ class PACC(AggregativeSoftQuantifier):
if self.norm not in ACC.NORMALIZATIONS:
raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}")
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
"""
Estimates the misclassification rates
@@ -870,13 +869,13 @@ class BayesianCC(AggregativeCrispQuantifier):
:param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
"""
pred_labels, true_labels = classif_predictions.Xy
self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels, labels=self.classifier.classes_)
self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels, labels=self.classifier.classes_).astype(float)
def sample_from_posterior(self, classif_predictions):
if self._n_and_c_labeled is None:
raise ValueError("aggregation_fit must be called before sample_from_posterior")
n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_)
n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_).astype(float)
self._samples = _bayesian.sample_posterior(
n_c_unlabeled=n_c_unlabeled,

quapy/method/confidence.py Normal file
View File

@@ -0,0 +1,291 @@
from functools import cached_property
import numpy as np
import quapy as qp
import quapy.functional as F
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeQuantifier
from scipy.stats import chi2
from scipy.special import gamma
from sklearn.utils import resample
from abc import ABC, abstractmethod
from scipy.special import softmax, factorial
import copy
from functools import lru_cache
class ConfidenceRegionABC(ABC):
@abstractmethod
def point_estimate(self) -> np.ndarray:
...
def ndim(self):
return len(self.point_estimate())
@abstractmethod
def coverage(self, true_value):
...
@lru_cache
def simplex_portion(self):
return self.montecarlo_proportion()
@lru_cache
def montecarlo_proportion(self, n_trials=10_000):
with qp.util.temp_seed(0):
uniform_simplex = F.uniform_simplex_sampling(n_classes=self.ndim(), size=n_trials)
proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
return proportion
class WithConfidenceABC(ABC):
@abstractmethod
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
...
def simplex_volume(n):
return 1 / factorial(n)
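# sanity note: the standard n-simplex {x in R^n : x_i >= 0, sum(x) <= 1} has volume 1/n!;
# e.g., a hypothetical check:
# >>> simplex_volume(2)
# 0.5  # the area of the right triangle with legs of length 1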
def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
"""
Computes the proportion of `values` that fall within the ellipse with center `mean` and precision matrix
`prec_matrix`, at chi-square critical value `chi2_critical`.
:param values: a np.ndarray with shape (ndim,) or (n_values,ndim,)
:param mean: a np.ndarray with the mean of the sample
:param prec_matrix: a np.ndarray with the precision matrix (inverse of the
covariance matrix) of the sample. If this inverse cannot be computed
then None must be passed
:param chi2_critical: the chi2 critical value
:return: the fraction of values that are contained in the ellipse
defined by the mean, the precision matrix, and the chi2_critical.
If values is only one value, then either 0 (not contained) or
1 (contained) is returned.
"""
if prec_matrix is None:
return 0.
diff = values - mean # Mahalanobis distance
d_M_squared = diff @ prec_matrix @ diff.T # d_M^2
if d_M_squared.ndim == 2:
d_M_squared = np.diag(d_M_squared)
within_ellipse = (d_M_squared <= chi2_critical)
if isinstance(within_ellipse, np.ndarray):
within_ellipse = np.mean(within_ellipse)
return within_ellipse * 1.0
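# a hypothetical usage sketch: for a standard Gaussian sample, about confidence_level of the points
# should fall within the ellipse defined by the sample mean, precision matrix, and chi2 critical value
# >>> rng = np.random.default_rng(0)
# >>> X = rng.multivariate_normal(mean=[0., 0.], cov=np.eye(2), size=1000)
# >>> prec = np.linalg.inv(np.cov(X, rowvar=False))
# >>> within_ellipse_prop(X, X.mean(axis=0), prec, chi2.ppf(0.95, df=2))  # ~0.95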
class ConfidenceEllipseSimplex(ConfidenceRegionABC):
def __init__(self, X, confidence_level=0.95):
assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)'
X = np.asarray(X)
self.mean_ = X.mean(axis=0)
self.cov_ = np.cov(X, rowvar=False, ddof=1)
try:
self.precision_matrix_ = np.linalg.inv(self.cov_)
except:
self.precision_matrix_ = None
self.dim = X.shape[-1]
self.ddof = self.dim - 1
# critical chi-square value
self.confidence_level = confidence_level
self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
def point_estimate(self):
return self.mean_
def coverage(self, true_value):
"""
true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
confidence_level None means that the confidence_level is taken from the __init__
returns true or false depending on whether true_value is in the ellipse or not,
or returns the proportion of true_values that are within the ellipse if more
than one are passed
"""
return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
class ConfidenceEllipseCLR(ConfidenceRegionABC):
def __init__(self, X, confidence_level=0.95):
self.clr = CLRtransformation()
Z = self.clr(X)
self.mean_ = np.mean(X, axis=0)
self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
def point_estimate(self):
# Z_mean = self.conf_region_clr.point_estimate()
# return self.clr.inverse(Z_mean)
# the inverse of the CLR of the mean does not recover the mean in the simplex, because the geometric
# mean requires smoothing the prevalence vectors, and this affects the softmax (the inverse transformation)
return self.mean_
def coverage(self, true_value):
"""
true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
confidence_level None means that the confidence_level is taken from the __init__
returns true or false depending on whether true_value is in the ellipse or not,
or returns the proportion of true_values that are within the ellipse if more
than one are passed
"""
transformed_values = self.clr(true_value)
return self.conf_region_clr.coverage(transformed_values)
class ConfidenceIntervals(ConfidenceRegionABC):
def __init__(self, X, confidence_level=0.95):
assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'
X = np.asarray(X)
self.means_ = X.mean(axis=0)
# derive the interval bounds from the requested confidence level (e.g., the 2.5 and 97.5 percentiles for 95%)
alpha = 1. - confidence_level
self.I_low, self.I_high = np.percentile(X, q=[100 * alpha / 2., 100 * (1. - alpha / 2.)], axis=0)
def point_estimate(self):
return self.means_
def coverage(self, true_value):
"""
true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
returns true or false depending on whether true_value is in the ellipse or not,
or returns the proportion of true_values that are within the ellipse if more
than one are passed
"""
within_intervals = np.logical_and(self.I_low <= true_value, true_value <= self.I_high)
within_all_intervals = np.all(within_intervals, axis=-1, keepdims=True)
proportion = within_all_intervals.mean()
return proportion
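# a hypothetical sketch: intervals built from 500 bootstrap-like draws concentrated around [0.3, 0.7]
# >>> draws = np.random.default_rng(0).dirichlet(alpha=[30, 70], size=500)
# >>> ci = ConfidenceIntervals(draws, confidence_level=0.95)
# >>> ci.coverage(np.asarray([0.3, 0.7]))  # expected 1.0: the true value lies within both intervals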
class CLRtransformation:
"""
Centered log-ratio
"""
def __call__(self, X, epsilon=1e-6):
X = np.asarray(X)
X = qp.error.smooth(X, epsilon)
G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean
return np.log(X / G)
def inverse(self, X):
return softmax(X, axis=-1)
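# a hypothetical round-trip sketch: the CLR maps prevalence vectors to an unconstrained space whose
# coordinates sum to zero; softmax approximately inverts it (up to the smoothing applied)
# >>> clr = CLRtransformation()
# >>> z = clr(np.asarray([0.2, 0.3, 0.5]))  # z.sum() is ~0 by construction
# >>> clr.inverse(z)                        # ~array([0.2, 0.3, 0.5])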
class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
METHODS = ['intervals', 'ellipse', 'ellipse-clr']
def __init__(self,
quantifier: AggregativeQuantifier,
n_train_samples=1,
n_test_samples=500,
confidence_level=0.95,
method='intervals',
random_state=None):
assert isinstance(quantifier, AggregativeQuantifier), \
f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}'
assert n_train_samples >= 1, \
f'{n_train_samples=} must be >= 1'
assert n_test_samples >= 1, \
f'{n_test_samples=} must be >= 1'
assert n_test_samples>1 or n_train_samples>1, \
f'either {n_test_samples=} or {n_train_samples=} must be >1'
assert method in self.METHODS, \
f'unknown method; valid ones are {self.METHODS}'
self.quantifier = quantifier
self.n_train_samples = n_train_samples
self.n_test_samples = n_test_samples
self.confidence_level = confidence_level
self.method = method
self.random_state = random_state
def _return_conf(self, prevs, confidence_level):
region = None
if self.method == 'intervals':
region = ConfidenceIntervals(prevs, confidence_level=confidence_level)
elif self.method == 'ellipse':
region = ConfidenceEllipseSimplex(prevs, confidence_level=confidence_level)
elif self.method == 'ellipse-clr':
region = ConfidenceEllipseCLR(prevs, confidence_level=confidence_level)
if region is None:
raise NotImplementedError(f'unknown method {self.method}')
return region
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.quantifiers = []
if self.n_train_samples==1:
self.quantifier.aggregation_fit(classif_predictions, data)
self.quantifiers.append(self.quantifier)
else:
# model-based bootstrap (only on the aggregative part)
full_index = np.arange(len(data))
with qp.util.temp_seed(self.random_state):
for i in range(self.n_train_samples):
quantifier = copy.deepcopy(self.quantifier)
index = resample(full_index, n_samples=len(data))
classif_predictions_i = classif_predictions.sampling_from_index(index)
data_i = data.sampling_from_index(index)
quantifier.aggregation_fit(classif_predictions_i, data_i)
self.quantifiers.append(quantifier)
return self
def aggregate(self, classif_predictions: np.ndarray):
prev_mean, self.confidence = self.aggregate_conf(classif_predictions)
return prev_mean
def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None):
if confidence_level is None:
confidence_level = self.confidence_level
n_samples = classif_predictions.shape[0]
prevs = []
with qp.util.temp_seed(self.random_state):
for quantifier in self.quantifiers:
for i in range(self.n_test_samples):
sample_i = resample(classif_predictions, n_samples=n_samples)
prev_i = quantifier.aggregate(sample_i)
prevs.append(prev_i)
conf = self._return_conf(prevs, confidence_level)
prev_estim = conf.point_estimate()
return prev_estim, conf
def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
self.quantifier._check_init_parameters()
classif_predictions = self.quantifier.classifier_fit_predict(data, fit_classifier, predict_on=val_split)
self.aggregation_fit(classif_predictions, data)
return self
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
predictions = self.quantifier.classify(instances)
return self.aggregate_conf(predictions, confidence_level=confidence_level)
@property
def classifier(self):
return self.quantifier.classifier
def _classifier_method(self):
return self.quantifier._classifier_method()

View File

@@ -1,6 +1,6 @@
import itertools
from copy import deepcopy
from typing import Union
from typing import Union, List
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer, accuracy_score
@@ -12,7 +12,7 @@ from quapy import functional as F
from quapy.data import LabelledCollection
from quapy.model_selection import GridSearchQ
from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ, AggregativeQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ, AggregativeQuantifier, AggregativeSoftQuantifier
try:
from . import _neural
@@ -691,3 +691,66 @@ def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
"""
return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
class SCMQ(AggregativeSoftQuantifier):
MERGE_FUNCTIONS = ['median', 'mean']
def __init__(self, classifier, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
self.classifier = classifier
self.quantifiers = quantifiers
assert merge_fun in self.MERGE_FUNCTIONS, f'unknown {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
self.merge_fun = merge_fun
self.val_split = val_split
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
for quantifier in self.quantifiers:
quantifier.classifier = self.classifier
quantifier.aggregation_fit(classif_predictions, data)
return self
def aggregate(self, classif_predictions: np.ndarray):
prev_predictions = []
for quantifier_i in self.quantifiers:
prevalence_i = quantifier_i.aggregate(classif_predictions)
prev_predictions.append(prevalence_i)
return self.merge(prev_predictions)
def merge(self, prev_predictions):
prev_predictions = np.asarray(prev_predictions)
if self.merge_fun == 'median':
prevalences = np.median(prev_predictions, axis=0)
# the component-wise median of valid distributions need not sum to 1, hence renormalize
prevalences = F.normalize_prevalence(prevalences, method='l1')
elif self.merge_fun == 'mean':
prevalences = np.mean(prev_predictions, axis=0)
else:
raise NotImplementedError(f'merge function {self.merge_fun} not implemented!')
return prevalences

View File

@@ -1,4 +1,6 @@
from copy import deepcopy
from typing import Iterable
import quapy as qp
import numpy as np
import itertools
@@ -62,6 +64,36 @@ class IterateProtocol(AbstractProtocol):
return len(self.samples)
class ProtocolFromIndex(AbstractProtocol):
"""
A protocol that generates samples out of a labelled collection by means of a list of precomputed indexes
:param data: a :class:`quapy.data.base.LabelledCollection`
:param indexes: a list of indexes
"""
def __init__(self, data: LabelledCollection, indexes: Iterable):
self.data = data
self.indexes = indexes
def __call__(self):
"""
Yields one sample at a time extracted using the indexes
:return: yields one tuple `(sample, prev)` at a time, where `sample` is a set of instances
and `prev` is an `np.ndarray` with the class prevalence values
"""
for index in self.indexes:
yield self.data.sampling_from_index(index).Xp
def total(self):
"""
Returns the number of samples in this protocol
:return: int
"""
return len(self.indexes)
class AbstractStochasticSeededProtocol(AbstractProtocol):
"""
An `AbstractStochasticSeededProtocol` is a protocol that generates, via any random procedure (e.g.,
@@ -124,9 +156,9 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
if self.random_state is not None:
stack.enter_context(qp.util.temp_seed(self.random_state))
for params in self.samples_parameters():
yield self.collator(self.sample(params))
yield self.collator(self.sample(params), params)
def collator(self, sample, *args):
def collator(self, sample, params):
"""
The collator prepares the sample to accommodate the desired output format before returning the output.
This collator simply returns the sample as it is. Classes inheriting from this abstract class can
@@ -191,9 +223,11 @@ class OnLabelledCollectionProtocol:
assert return_type in cls.RETURN_TYPES, \
f'unknown return type passed as argument; valid ones are {cls.RETURN_TYPES}'
if return_type=='sample_prev':
return lambda lc:lc.Xp
return lambda lc,params:lc.Xp
elif return_type=='labelled_collection':
return lambda lc:lc
return lambda lc,params:lc
elif return_type=='index':
return lambda lc,params:params
class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):

View File

@@ -25,7 +25,7 @@ class ModselTestCase(unittest.TestCase):
param_grid = {'classifier__C': [0.000001, 10.]}
app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, verbose=True, n_jobs=-1
).fit(training)
print('best params', q.best_params_)
print('best score', q.best_score_)
@@ -39,9 +39,9 @@
obtains the same optimal parameters
"""
q = PACC(LogisticRegression(random_state=1, max_iter=5000))
q = PACC(LogisticRegression(random_state=1, max_iter=500))
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500, random_state=1)
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50).reduce(n_train=500, random_state=1)
training, validation = data.training.split_stratified(0.7, random_state=1)
param_grid = {'classifier__C': np.logspace(-3,3,7)}
@@ -50,7 +50,7 @@ class ModselTestCase(unittest.TestCase):
print('starting model selection in sequential exploration')
tinit = time.time()
modsel = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=1, verbose=True
q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True
).fit(training)
tend_seq = time.time()-tinit
best_c_seq = modsel.best_params_['classifier__C']
@@ -59,7 +59,7 @@
print('starting model selection in parallel exploration')
tinit = time.time()
modsel = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True
).fit(training)
tend_par = time.time() - tinit
best_c_par = modsel.best_params_['classifier__C']