1
0
Fork 0

fixing random_state in base and in protocols

This commit is contained in:
Alejandro Moreo Fernandez 2022-06-21 10:27:06 +02:00
parent c0c37f0a17
commit f4a2a94ba5
6 changed files with 47 additions and 42 deletions

View File

@ -2,7 +2,7 @@ import numpy as np
from scipy.sparse import issparse from scipy.sparse import issparse
from scipy.sparse import vstack from scipy.sparse import vstack
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from numpy.random import RandomState
from quapy.functional import strprev from quapy.functional import strprev
@ -146,16 +146,21 @@ class LabelledCollection:
return indexes_sample return indexes_sample
def uniform_sampling_index(self, size): def uniform_sampling_index(self, size, random_state=None):
""" """
Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
with replacement if the requested size is greater than the number of instances, or without replacement with replacement if the requested size is greater than the number of instances, or without replacement
otherwise. otherwise.
:param size: integer, the size of the uniform sample :param size: integer, the size of the uniform sample
:param random_state: if specified, guarantees reproducibility of the sampling.
:return: a np.ndarray of shape `(size)` with the indexes :return: a np.ndarray of shape `(size)` with the indexes
""" """
return np.random.choice(len(self), size, replace=size > len(self)) if random_state is not None:
ng = RandomState(seed=random_state)
else:
ng = np.random
return ng.choice(len(self), size, replace=size > len(self))
def sampling(self, size, *prevs, shuffle=True): def sampling(self, size, *prevs, shuffle=True):
""" """
@ -174,16 +179,17 @@ class LabelledCollection:
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle) prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
return self.sampling_from_index(prev_index) return self.sampling_from_index(prev_index)
def uniform_sampling(self, size): def uniform_sampling(self, size, random_state=None):
""" """
Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
with replacement if the requested size is greater than the number of instances, or without replacement with replacement if the requested size is greater than the number of instances, or without replacement
otherwise. otherwise.
:param size: integer, the requested size :param size: integer, the requested size
:param random_state: if specified, guarantees reproducibility of the sampling.
:return: an instance of :class:`LabelledCollection` with length == `size` :return: an instance of :class:`LabelledCollection` with length == `size`
""" """
unif_index = self.uniform_sampling_index(size) unif_index = self.uniform_sampling_index(size, random_state=random_state)
return self.sampling_from_index(unif_index) return self.sampling_from_index(unif_index)
def sampling_from_index(self, index): def sampling_from_index(self, index):

View File

@ -40,22 +40,22 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
needed for extracting the samples, and :meth:`sample` that, given some parameters as input, needed for extracting the samples, and :meth:`sample` that, given some parameters as input,
deterministically generates a sample. deterministically generates a sample.
:param seed: the seed for allowing to replicate any sequence of samples. Default is None, meaning that :param random_state: the seed for allowing to replicate any sequence of samples. Default is None, meaning that
the sequence will be different every time the protocol is called. the sequence will be different every time the protocol is called.
""" """
_random_seed = -1 # means "not set" _random_state = -1 # means "not set"
def __init__(self, seed=None): def __init__(self, random_state=None):
self.random_seed = seed self.random_state = random_state
@property @property
def random_seed(self): def random_state(self):
return self._random_seed return self._random_state
@random_seed.setter @random_state.setter
def random_seed(self, seed): def random_state(self, random_state):
self._random_seed = seed self._random_state = random_state
@abstractmethod @abstractmethod
def samples_parameters(self): def samples_parameters(self):
@ -78,11 +78,11 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
def __call__(self): def __call__(self):
with ExitStack() as stack: with ExitStack() as stack:
if self.random_seed == -1: if self.random_state == -1:
raise ValueError('The random seed has never been initialized. ' raise ValueError('The random seed has never been initialized. '
'Set it to None not to impose replicability.') 'Set it to None not to impose replicability.')
if self.random_seed is not None: if self.random_state is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed)) stack.enter_context(qp.util.temp_seed(self.random_state))
for params in self.samples_parameters(): for params in self.samples_parameters():
yield self.collator(self.sample(params)) yield self.collator(self.sample(params))
@ -132,11 +132,11 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
grid (default is 21) grid (default is 21)
:param repeats: number of copies for each valid prevalence vector (default is 10) :param repeats: number of copies for each valid prevalence vector (default is 10)
:param random_seed: allows replicating samples across runs (default None) :param random_state: allows replicating samples across runs (default None)
""" """
def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None, return_type='sample_prev'): def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_state=None, return_type='sample_prev'):
super(APP, self).__init__(random_seed) super(APP, self).__init__(random_state)
self.data = data self.data = data
self.sample_size = sample_size self.sample_size = sample_size
self.n_prevalences = n_prevalences self.n_prevalences = n_prevalences
@ -189,15 +189,15 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param data: a `LabelledCollection` from which the samples will be drawn :param data: a `LabelledCollection` from which the samples will be drawn
:param sample_size: integer, the number of instances in each sample :param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate. Default is 100. :param repeats: the number of samples to generate. Default is 100.
:param random_seed: allows replicating samples across runs (default None) :param random_state: allows replicating samples across runs (default None)
""" """
def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'): def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
super(NPP, self).__init__(random_seed) super(NPP, self).__init__(random_state)
self.data = data self.data = data
self.sample_size = sample_size self.sample_size = sample_size
self.repeats = repeats self.repeats = repeats
self.random_seed = random_seed self.random_state = random_state
self.collator = OnLabelledCollectionProtocol.get_collator(return_type) self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self): def samples_parameters(self):
@ -226,15 +226,15 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
:param data: a `LabelledCollection` from which the samples will be drawn :param data: a `LabelledCollection` from which the samples will be drawn
:param sample_size: integer, the number of instances in each sample :param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate. Default is 100. :param repeats: the number of samples to generate. Default is 100.
:param random_seed: allows replicating samples across runs (default None) :param random_state: allows replicating samples across runs (default None)
""" """
def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'): def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
super(USimplexPP, self).__init__(random_seed) super(USimplexPP, self).__init__(random_state)
self.data = data self.data = data
self.sample_size = sample_size self.sample_size = sample_size
self.repeats = repeats self.repeats = repeats
self.random_seed = random_seed self.random_state = random_state
self.collator = OnLabelledCollectionProtocol.get_collator(return_type) self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self): def samples_parameters(self):
@ -290,7 +290,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
:param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will :param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself. generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself.
the specific points the specific points
:param random_seed: :param random_state:
""" """
def __init__( def __init__(
@ -301,9 +301,9 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
repeats=1, repeats=1,
prevalence=None, prevalence=None,
mixture_points=11, mixture_points=11,
random_seed=None, random_state=None,
return_type='sample_prev'): return_type='sample_prev'):
super(CovariateShiftPP, self).__init__(random_seed) super(CovariateShiftPP, self).__init__(random_state)
self.A = domainA self.A = domainA
self.B = domainB self.B = domainB
self.sample_size = sample_size self.sample_size = sample_size
@ -322,7 +322,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
self.mixture_points = np.asarray(mixture_points) self.mixture_points = np.asarray(mixture_points)
assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \ assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])' 'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])'
self.random_seed = random_seed self.random_state = random_state
self.collator = OnLabelledCollectionProtocol.get_collator(return_type) self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self): def samples_parameters(self):

View File

@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True) data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
train, test = data.training, data.test train, test = data.training, data.test
protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1) protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_state=1)
class SlowLR(LogisticRegression): class SlowLR(LogisticRegression):
def predict_proba(self, X): def predict_proba(self, X):

View File

@ -21,7 +21,7 @@ class ModselTestCase(unittest.TestCase):
training, validation = data.training.split_stratified(0.7, random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1)
param_grid = {'C': np.logspace(-3,3,7)} param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1) app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ( q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
).fit(training) ).fit(training)
@ -40,7 +40,7 @@ class ModselTestCase(unittest.TestCase):
# test = data.test # test = data.test
param_grid = {'C': np.logspace(-3,3,7)} param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1) app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ( q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
).fit(training) ).fit(training)
@ -62,7 +62,7 @@ class ModselTestCase(unittest.TestCase):
training, validation = data.training.split_stratified(0.7, random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1)
param_grid = {'C': np.logspace(-3, 3, 7)} param_grid = {'C': np.logspace(-3, 3, 7)}
app = APP(validation, sample_size=100, random_seed=1) app = APP(validation, sample_size=100, random_state=1)
tinit = time.time() tinit = time.time()
GridSearchQ( GridSearchQ(
@ -96,7 +96,7 @@ class ModselTestCase(unittest.TestCase):
# test = data.test # test = data.test
param_grid = {'C': np.logspace(-3,3,7)} param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1) app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ( q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
) )

View File

@ -21,7 +21,7 @@ class TestProtocols(unittest.TestCase):
def test_app_replicate(self): def test_app_replicate(self):
data = mock_labelled_collection() data = mock_labelled_collection()
p = APP(data, sample_size=5, n_prevalences=11, random_seed=42) p = APP(data, sample_size=5, n_prevalences=11, random_state=42)
samples1 = samples_to_str(p) samples1 = samples_to_str(p)
samples2 = samples_to_str(p) samples2 = samples_to_str(p)
@ -57,7 +57,7 @@ class TestProtocols(unittest.TestCase):
def test_npp_replicate(self): def test_npp_replicate(self):
data = mock_labelled_collection() data = mock_labelled_collection()
p = NPP(data, sample_size=5, repeats=5, random_seed=42) p = NPP(data, sample_size=5, repeats=5, random_state=42)
samples1 = samples_to_str(p) samples1 = samples_to_str(p)
samples2 = samples_to_str(p) samples2 = samples_to_str(p)
@ -75,7 +75,7 @@ class TestProtocols(unittest.TestCase):
def test_kraemer_replicate(self): def test_kraemer_replicate(self):
data = mock_labelled_collection() data = mock_labelled_collection()
p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42) p = USimplexPP(data, sample_size=5, repeats=10, random_state=42)
samples1 = samples_to_str(p) samples1 = samples_to_str(p)
samples2 = samples_to_str(p) samples2 = samples_to_str(p)
@ -94,7 +94,7 @@ class TestProtocols(unittest.TestCase):
def test_covariate_shift_replicate(self): def test_covariate_shift_replicate(self):
dataA = mock_labelled_collection('domA') dataA = mock_labelled_collection('domA')
dataB = mock_labelled_collection('domB') dataB = mock_labelled_collection('domB')
p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1) p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_state=1)
samples1 = samples_to_str(p) samples1 = samples_to_str(p)
samples2 = samples_to_str(p) samples2 = samples_to_str(p)

View File

@ -50,7 +50,6 @@ def parallel(func, args, n_jobs):
def func_dec(environ, *args): def func_dec(environ, *args):
qp.environ = environ.copy() qp.environ = environ.copy()
qp.environ['N_JOBS'] = 1 qp.environ['N_JOBS'] = 1
print(f'setting n_jobs from {environ["N_JOBS"]} to 1')
return func(*args) return func(*args)
return Parallel(n_jobs=n_jobs)( return Parallel(n_jobs=n_jobs)(
delayed(func_dec)(qp.environ, args_i) for args_i in args delayed(func_dec)(qp.environ, args_i) for args_i in args