forked from moreo/QuaPy

fixing random_state in base and in protocols

parent c0c37f0a17
commit f4a2a94ba5
@@ -2,7 +2,7 @@ import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
-
+from numpy.random import RandomState
 from quapy.functional import strprev
 
 
@@ -146,16 +146,21 @@ class LabelledCollection:
         return indexes_sample
 
-    def uniform_sampling_index(self, size):
+    def uniform_sampling_index(self, size, random_state=None):
         """
         Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
         with replacement if the requested size is greater than the number of instances, or without replacement
         otherwise.
 
         :param size: integer, the size of the uniform sample
+        :param random_state: if specified, guarantees reproducibility of the sample.
         :return: a np.ndarray of shape `(size)` with the indexes
         """
-        return np.random.choice(len(self), size, replace=size > len(self))
+        if random_state is not None:
+            ng = RandomState(seed=random_state)
+        else:
+            ng = np.random
+        return ng.choice(len(self), size, replace=size > len(self))
 
     def sampling(self, size, *prevs, shuffle=True):
         """
@@ -174,16 +179,17 @@ class LabelledCollection:
         prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
         return self.sampling_from_index(prev_index)
 
-    def uniform_sampling(self, size):
+    def uniform_sampling(self, size, random_state=None):
         """
         Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
         with replacement if the requested size is greater than the number of instances, or without replacement
         otherwise.
 
         :param size: integer, the requested size
+        :param random_state: if specified, guarantees reproducibility of the sample.
         :return: an instance of :class:`LabelledCollection` with length == `size`
         """
-        unif_index = self.uniform_sampling_index(size)
+        unif_index = self.uniform_sampling_index(size, random_state=random_state)
         return self.sampling_from_index(unif_index)
 
     def sampling_from_index(self, index):
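
With the RandomState plumbing above, uniform samples can now be replicated by passing the same random_state. A minimal sketch of the new behaviour (the toy data below is hypothetical; only the uniform_sampling signature comes from this diff):

import numpy as np
from quapy.data import LabelledCollection

# toy collection: 20 random 2-d points with binary labels (made up for illustration)
X = np.random.rand(20, 2)
y = np.random.randint(0, 2, size=20)
data = LabelledCollection(X, y)

# same random_state -> same indexes -> identical samples on every call
sample_a = data.uniform_sampling(10, random_state=7)
sample_b = data.uniform_sampling(10, random_state=7)
assert np.array_equal(sample_a.instances, sample_b.instances)

# random_state=None preserves the previous behaviour (np.random global state)
sample_c = data.uniform_sampling(10)
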
@@ -40,22 +40,22 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
     needed for extracting the samples, and :meth:`sample` that, given some parameters as input,
     deterministically generates a sample.
 
-    :param seed: the seed for allowing to replicate any sequence of samples. Default is None, meaning that
+    :param random_state: the seed for allowing to replicate any sequence of samples. Default is None, meaning that
         the sequence will be different every time the protocol is called.
     """
 
-    _random_seed = -1  # means "not set"
+    _random_state = -1  # means "not set"
 
-    def __init__(self, seed=None):
-        self.random_seed = seed
+    def __init__(self, random_state=None):
+        self.random_state = random_state
 
     @property
-    def random_seed(self):
-        return self._random_seed
+    def random_state(self):
+        return self._random_state
 
-    @random_seed.setter
-    def random_seed(self, seed):
-        self._random_seed = seed
+    @random_state.setter
+    def random_state(self, random_state):
+        self._random_state = random_state
 
     @abstractmethod
     def samples_parameters(self):
@@ -78,11 +78,11 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
 
     def __call__(self):
         with ExitStack() as stack:
-            if self.random_seed == -1:
+            if self.random_state == -1:
                 raise ValueError('The random seed has never been initialized. '
                                  'Set it to None not to impose replicability.')
-            if self.random_seed is not None:
-                stack.enter_context(qp.util.temp_seed(self.random_seed))
+            if self.random_state is not None:
+                stack.enter_context(qp.util.temp_seed(self.random_state))
             for params in self.samples_parameters():
                 yield self.collator(self.sample(params))
 
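
__call__ wraps sample generation in qp.util.temp_seed(...) whenever random_state is set, so numpy's global RNG is seeded only for the duration of the generator and restored afterwards. The context manager itself is not part of this diff; a typical implementation of such a helper looks roughly like the sketch below (an assumption about its behaviour, not necessarily QuaPy's exact code):

import contextlib
import numpy as np

@contextlib.contextmanager
def temp_seed(seed):
    # save the current global RNG state, seed it, and restore it on exit,
    # so code outside the with-block is unaffected
    state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        np.random.set_state(state)
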
@@ -132,11 +132,11 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
     :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
         grid (default is 21)
     :param repeats: number of copies for each valid prevalence vector (default is 10)
-    :param random_seed: allows replicating samples across runs (default None)
+    :param random_state: allows replicating samples across runs (default None)
     """
 
-    def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None, return_type='sample_prev'):
-        super(APP, self).__init__(random_seed)
+    def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_state=None, return_type='sample_prev'):
+        super(APP, self).__init__(random_state)
         self.data = data
         self.sample_size = sample_size
         self.n_prevalences = n_prevalences
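
Together with the updated tests further down, the APP docstring above suggests the following usage after the rename (a minimal sketch; the toy collection is hypothetical):

import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import APP

# hypothetical toy collection standing in for a real test set
X = np.random.rand(100, 2)
y = np.random.randint(0, 2, size=100)
test = LabelledCollection(X, y)

# random_state (formerly random_seed) fixes the whole sequence of generated samples
p = APP(test, sample_size=5, n_prevalences=11, random_state=42)

# each call to the protocol is a fresh generator; with a fixed random_state,
# two passes produce the same sequence of samples (cf. test_app_replicate below)
first_pass = list(p())
second_pass = list(p())
assert len(first_pass) == len(second_pass)
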
@@ -189,15 +189,15 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
     :param data: a `LabelledCollection` from which the samples will be drawn
     :param sample_size: integer, the number of instances in each sample
     :param repeats: the number of samples to generate. Default is 100.
-    :param random_seed: allows replicating samples across runs (default None)
+    :param random_state: allows replicating samples across runs (default None)
     """
 
-    def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
-        super(NPP, self).__init__(random_seed)
+    def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
+        super(NPP, self).__init__(random_state)
         self.data = data
         self.sample_size = sample_size
         self.repeats = repeats
-        self.random_seed = random_seed
+        self.random_state = random_state
         self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
 
     def samples_parameters(self):
@@ -226,15 +226,15 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
     :param data: a `LabelledCollection` from which the samples will be drawn
     :param sample_size: integer, the number of instances in each sample
     :param repeats: the number of samples to generate. Default is 100.
-    :param random_seed: allows replicating samples across runs (default None)
+    :param random_state: allows replicating samples across runs (default None)
     """
 
-    def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
-        super(USimplexPP, self).__init__(random_seed)
+    def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
+        super(USimplexPP, self).__init__(random_state)
         self.data = data
         self.sample_size = sample_size
         self.repeats = repeats
-        self.random_seed = random_seed
+        self.random_state = random_state
         self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
 
     def samples_parameters(self):
@@ -290,7 +290,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
     :param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
         generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself.
         the specific points
-    :param random_seed:
+    :param random_state: allows replicating samples across runs (default None)
     """
 
     def __init__(
@@ -301,9 +301,9 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
             repeats=1,
             prevalence=None,
             mixture_points=11,
-            random_seed=None,
+            random_state=None,
             return_type='sample_prev'):
-        super(CovariateShiftPP, self).__init__(random_seed)
+        super(CovariateShiftPP, self).__init__(random_state)
         self.A = domainA
         self.B = domainB
         self.sample_size = sample_size
@@ -322,7 +322,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
             self.mixture_points = np.asarray(mixture_points)
             assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
                 'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])'
-        self.random_seed = random_seed
+        self.random_state = random_state
         self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
 
     def samples_parameters(self):
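
Like the other protocols, CovariateShiftPP now takes random_state. A short sketch mirroring the replication test below (the two mock domains are hypothetical):

import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import CovariateShiftPP

# two hypothetical domains sharing the same binary label space
domA = LabelledCollection(np.random.rand(50, 2), np.random.randint(0, 2, size=50))
domB = LabelledCollection(np.random.rand(50, 2), np.random.randint(0, 2, size=50))

# mixture_points=11 explores blends from 100% domain A to 100% domain B;
# random_state=1 makes the whole sequence of mixed samples reproducible
p = CovariateShiftPP(domA, domB, sample_size=10, mixture_points=11, random_state=1)
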
@@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
         data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
         train, test = data.training, data.test
 
-        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
+        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_state=1)
 
         class SlowLR(LogisticRegression):
             def predict_proba(self, X):
@@ -21,7 +21,7 @@ class ModselTestCase(unittest.TestCase):
         training, validation = data.training.split_stratified(0.7, random_state=1)
 
         param_grid = {'C': np.logspace(-3,3,7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
         q = GridSearchQ(
             q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
         ).fit(training)
@@ -40,7 +40,7 @@ class ModselTestCase(unittest.TestCase):
         # test = data.test
 
         param_grid = {'C': np.logspace(-3,3,7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
         q = GridSearchQ(
             q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
         ).fit(training)
@@ -62,7 +62,7 @@ class ModselTestCase(unittest.TestCase):
         training, validation = data.training.split_stratified(0.7, random_state=1)
 
         param_grid = {'C': np.logspace(-3, 3, 7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
 
         tinit = time.time()
         GridSearchQ(
@@ -96,7 +96,7 @@ class ModselTestCase(unittest.TestCase):
         # test = data.test
 
         param_grid = {'C': np.logspace(-3,3,7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
         q = GridSearchQ(
             q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
         )
@@ -21,7 +21,7 @@ class TestProtocols(unittest.TestCase):
 
     def test_app_replicate(self):
         data = mock_labelled_collection()
-        p = APP(data, sample_size=5, n_prevalences=11, random_seed=42)
+        p = APP(data, sample_size=5, n_prevalences=11, random_state=42)
 
         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)
@@ -57,7 +57,7 @@ class TestProtocols(unittest.TestCase):
 
     def test_npp_replicate(self):
         data = mock_labelled_collection()
-        p = NPP(data, sample_size=5, repeats=5, random_seed=42)
+        p = NPP(data, sample_size=5, repeats=5, random_state=42)
 
         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)
@@ -75,7 +75,7 @@ class TestProtocols(unittest.TestCase):
 
     def test_kraemer_replicate(self):
         data = mock_labelled_collection()
-        p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42)
+        p = USimplexPP(data, sample_size=5, repeats=10, random_state=42)
 
         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)
@@ -94,7 +94,7 @@ class TestProtocols(unittest.TestCase):
     def test_covariate_shift_replicate(self):
         dataA = mock_labelled_collection('domA')
         dataB = mock_labelled_collection('domB')
-        p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1)
+        p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_state=1)
 
         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)
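
The replicate tests above rely on two helpers, mock_labelled_collection and samples_to_str, defined elsewhere in the test module and not shown in this diff. A plausible reconstruction of what they do, purely for illustration (hypothetical, not the actual test code):

import numpy as np
from quapy.data import LabelledCollection

def mock_labelled_collection(prefix=''):
    # a small synthetic collection with four evenly represented classes
    labels = np.repeat([0, 1, 2, 3], 25)
    instances = [f'{prefix}-doc-{i}' for i in range(len(labels))]
    return LabelledCollection(instances, labels)

def samples_to_str(protocol):
    # serialize every generated sample so that two passes over the protocol
    # can be compared; with a fixed random_state the strings must coincide
    return '\n'.join(f'{instances} {prev}' for instances, prev in protocol())
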
@@ -50,7 +50,6 @@ def parallel(func, args, n_jobs):
     def func_dec(environ, *args):
         qp.environ = environ.copy()
         qp.environ['N_JOBS'] = 1
-        print(f'setting n_jobs from {environ["N_JOBS"]} to 1')
         return func(*args)
     return Parallel(n_jobs=n_jobs)(
         delayed(func_dec)(qp.environ, args_i) for args_i in args