forked from moreo/QuaPy
fixing random_state in base and in protocols
This commit is contained in:
parent
c0c37f0a17
commit
f4a2a94ba5
|
@ -2,7 +2,7 @@ import numpy as np
|
||||||
from scipy.sparse import issparse
|
from scipy.sparse import issparse
|
||||||
from scipy.sparse import vstack
|
from scipy.sparse import vstack
|
||||||
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
|
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
|
||||||
|
from numpy.random import RandomState
|
||||||
from quapy.functional import strprev
|
from quapy.functional import strprev
|
||||||
|
|
||||||
|
|
||||||
|
@ -146,16 +146,21 @@ class LabelledCollection:
|
||||||
|
|
||||||
return indexes_sample
|
return indexes_sample
|
||||||
|
|
||||||
def uniform_sampling_index(self, size):
|
def uniform_sampling_index(self, size, random_state=None):
|
||||||
"""
|
"""
|
||||||
Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
|
Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
|
||||||
with replacement if the requested size is greater than the number of instances, or without replacement
|
with replacement if the requested size is greater than the number of instances, or without replacement
|
||||||
otherwise.
|
otherwise.
|
||||||
|
|
||||||
:param size: integer, the size of the uniform sample
|
:param size: integer, the size of the uniform sample
|
||||||
|
:param random_state: if specified, guarantees reproducibility of the sample.
|
||||||
:return: a np.ndarray of shape `(size)` with the indexes
|
:return: a np.ndarray of shape `(size)` with the indexes
|
||||||
"""
|
"""
|
||||||
return np.random.choice(len(self), size, replace=size > len(self))
|
if random_state is not None:
|
||||||
|
ng = RandomState(seed=random_state)
|
||||||
|
else:
|
||||||
|
ng = np.random
|
||||||
|
return ng.choice(len(self), size, replace=size > len(self))
|
||||||
|
|
||||||
def sampling(self, size, *prevs, shuffle=True):
|
def sampling(self, size, *prevs, shuffle=True):
|
||||||
"""
|
"""
|
||||||
|
@ -174,16 +179,17 @@ class LabelledCollection:
|
||||||
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
|
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
|
||||||
return self.sampling_from_index(prev_index)
|
return self.sampling_from_index(prev_index)
|
||||||
|
|
||||||
def uniform_sampling(self, size):
|
def uniform_sampling(self, size, random_state=None):
|
||||||
"""
|
"""
|
||||||
Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
|
Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
|
||||||
with replacement if the requested size is greater than the number of instances, or without replacement
|
||||||
|
with replacement if the requested size is greater than the number of instances, or without replacement
otherwise.
|
otherwise.
|
||||||
|
|
||||||
:param size: integer, the requested size
|
:param size: integer, the requested size
|
||||||
|
:param random_state: if specified, guarantees reproducibility of the sample.
|
||||||
:return: an instance of :class:`LabelledCollection` with length == `size`
|
:return: an instance of :class:`LabelledCollection` with length == `size`
|
||||||
"""
|
"""
|
||||||
unif_index = self.uniform_sampling_index(size)
|
unif_index = self.uniform_sampling_index(size, random_state=random_state)
|
||||||
return self.sampling_from_index(unif_index)
|
return self.sampling_from_index(unif_index)
|
||||||
|
|
||||||
def sampling_from_index(self, index):
|
def sampling_from_index(self, index):
|
||||||
|
|
|
@ -40,22 +40,22 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
|
||||||
needed for extracting the samples, and :meth:`sample` that, given some parameters as input,
|
needed for extracting the samples, and :meth:`sample` that, given some parameters as input,
|
||||||
deterministically generates a sample.
|
deterministically generates a sample.
|
||||||
|
|
||||||
:param seed: the seed that allows replicating any sequence of samples. Default is None, meaning that
|
:param random_state: the seed that allows replicating any sequence of samples. Default is None, meaning that
|
||||||
the sequence will be different every time the protocol is called.
|
the sequence will be different every time the protocol is called.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_random_seed = -1 # means "not set"
|
_random_state = -1 # means "not set"
|
||||||
|
|
||||||
def __init__(self, seed=None):
|
def __init__(self, random_state=None):
|
||||||
self.random_seed = seed
|
self.random_state = random_state
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def random_seed(self):
|
def random_state(self):
|
||||||
return self._random_seed
|
return self._random_state
|
||||||
|
|
||||||
@random_seed.setter
|
@random_state.setter
|
||||||
def random_seed(self, seed):
|
def random_state(self, random_state):
|
||||||
self._random_seed = seed
|
self._random_state = random_state
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def samples_parameters(self):
|
def samples_parameters(self):
|
||||||
|
@ -78,11 +78,11 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
|
||||||
|
|
||||||
def __call__(self):
|
def __call__(self):
|
||||||
with ExitStack() as stack:
|
with ExitStack() as stack:
|
||||||
if self.random_seed == -1:
|
if self.random_state == -1:
|
||||||
raise ValueError('The random seed has never been initialized. '
|
raise ValueError('The random seed has never been initialized. '
|
||||||
'Set it to None not to impose replicability.')
|
'Set it to None not to impose replicability.')
|
||||||
if self.random_seed is not None:
|
if self.random_state is not None:
|
||||||
stack.enter_context(qp.util.temp_seed(self.random_seed))
|
stack.enter_context(qp.util.temp_seed(self.random_state))
|
||||||
for params in self.samples_parameters():
|
for params in self.samples_parameters():
|
||||||
yield self.collator(self.sample(params))
|
yield self.collator(self.sample(params))
|
||||||
|
|
||||||
|
@ -132,11 +132,11 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
||||||
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
|
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
|
||||||
grid (default is 21)
|
grid (default is 21)
|
||||||
:param repeats: number of copies for each valid prevalence vector (default is 10)
|
:param repeats: number of copies for each valid prevalence vector (default is 10)
|
||||||
:param random_seed: allows replicating samples across runs (default None)
|
:param random_state: allows replicating samples across runs (default None)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None, return_type='sample_prev'):
|
def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_state=None, return_type='sample_prev'):
|
||||||
super(APP, self).__init__(random_seed)
|
super(APP, self).__init__(random_state)
|
||||||
self.data = data
|
self.data = data
|
||||||
self.sample_size = sample_size
|
self.sample_size = sample_size
|
||||||
self.n_prevalences = n_prevalences
|
self.n_prevalences = n_prevalences
|
||||||
|
@ -189,15 +189,15 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
||||||
:param data: a `LabelledCollection` from which the samples will be drawn
|
:param data: a `LabelledCollection` from which the samples will be drawn
|
||||||
:param sample_size: integer, the number of instances in each sample
|
:param sample_size: integer, the number of instances in each sample
|
||||||
:param repeats: the number of samples to generate. Default is 100.
|
:param repeats: the number of samples to generate. Default is 100.
|
||||||
:param random_seed: allows replicating samples across runs (default None)
|
:param random_state: allows replicating samples across runs (default None)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
|
def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
|
||||||
super(NPP, self).__init__(random_seed)
|
super(NPP, self).__init__(random_state)
|
||||||
self.data = data
|
self.data = data
|
||||||
self.sample_size = sample_size
|
self.sample_size = sample_size
|
||||||
self.repeats = repeats
|
self.repeats = repeats
|
||||||
self.random_seed = random_seed
|
self.random_state = random_state
|
||||||
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
|
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
|
||||||
|
|
||||||
def samples_parameters(self):
|
def samples_parameters(self):
|
||||||
|
@ -226,15 +226,15 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
|
||||||
:param data: a `LabelledCollection` from which the samples will be drawn
|
:param data: a `LabelledCollection` from which the samples will be drawn
|
||||||
:param sample_size: integer, the number of instances in each sample
|
:param sample_size: integer, the number of instances in each sample
|
||||||
:param repeats: the number of samples to generate. Default is 100.
|
:param repeats: the number of samples to generate. Default is 100.
|
||||||
:param random_seed: allows replicating samples across runs (default None)
|
:param random_state: allows replicating samples across runs (default None)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
|
def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
|
||||||
super(USimplexPP, self).__init__(random_seed)
|
super(USimplexPP, self).__init__(random_state)
|
||||||
self.data = data
|
self.data = data
|
||||||
self.sample_size = sample_size
|
self.sample_size = sample_size
|
||||||
self.repeats = repeats
|
self.repeats = repeats
|
||||||
self.random_seed = random_seed
|
self.random_state = random_state
|
||||||
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
|
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
|
||||||
|
|
||||||
def samples_parameters(self):
|
def samples_parameters(self):
|
||||||
|
@ -290,7 +290,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
|
||||||
:param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
|
:param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
|
||||||
generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself.
|
generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself.
|
||||||
the specific points
|
the specific points
|
||||||
:param random_seed:
|
:param random_state: allows replicating samples across runs (default None)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -301,9 +301,9 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
|
||||||
repeats=1,
|
repeats=1,
|
||||||
prevalence=None,
|
prevalence=None,
|
||||||
mixture_points=11,
|
mixture_points=11,
|
||||||
random_seed=None,
|
random_state=None,
|
||||||
return_type='sample_prev'):
|
return_type='sample_prev'):
|
||||||
super(CovariateShiftPP, self).__init__(random_seed)
|
super(CovariateShiftPP, self).__init__(random_state)
|
||||||
self.A = domainA
|
self.A = domainA
|
||||||
self.B = domainB
|
self.B = domainB
|
||||||
self.sample_size = sample_size
|
self.sample_size = sample_size
|
||||||
|
@ -322,7 +322,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
|
||||||
self.mixture_points = np.asarray(mixture_points)
|
self.mixture_points = np.asarray(mixture_points)
|
||||||
assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
|
assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
|
||||||
'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])'
|
'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])'
|
||||||
self.random_seed = random_seed
|
self.random_state = random_state
|
||||||
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
|
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
|
||||||
|
|
||||||
def samples_parameters(self):
|
def samples_parameters(self):
|
||||||
|
|
|
@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
|
||||||
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
|
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
|
||||||
train, test = data.training, data.test
|
train, test = data.training, data.test
|
||||||
|
|
||||||
protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
|
protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_state=1)
|
||||||
|
|
||||||
class SlowLR(LogisticRegression):
|
class SlowLR(LogisticRegression):
|
||||||
def predict_proba(self, X):
|
def predict_proba(self, X):
|
||||||
|
|
|
@ -21,7 +21,7 @@ class ModselTestCase(unittest.TestCase):
|
||||||
training, validation = data.training.split_stratified(0.7, random_state=1)
|
training, validation = data.training.split_stratified(0.7, random_state=1)
|
||||||
|
|
||||||
param_grid = {'C': np.logspace(-3,3,7)}
|
param_grid = {'C': np.logspace(-3,3,7)}
|
||||||
app = APP(validation, sample_size=100, random_seed=1)
|
app = APP(validation, sample_size=100, random_state=1)
|
||||||
q = GridSearchQ(
|
q = GridSearchQ(
|
||||||
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
|
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
|
||||||
).fit(training)
|
).fit(training)
|
||||||
|
@ -40,7 +40,7 @@ class ModselTestCase(unittest.TestCase):
|
||||||
# test = data.test
|
# test = data.test
|
||||||
|
|
||||||
param_grid = {'C': np.logspace(-3,3,7)}
|
param_grid = {'C': np.logspace(-3,3,7)}
|
||||||
app = APP(validation, sample_size=100, random_seed=1)
|
app = APP(validation, sample_size=100, random_state=1)
|
||||||
q = GridSearchQ(
|
q = GridSearchQ(
|
||||||
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
|
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
|
||||||
).fit(training)
|
).fit(training)
|
||||||
|
@ -62,7 +62,7 @@ class ModselTestCase(unittest.TestCase):
|
||||||
training, validation = data.training.split_stratified(0.7, random_state=1)
|
training, validation = data.training.split_stratified(0.7, random_state=1)
|
||||||
|
|
||||||
param_grid = {'C': np.logspace(-3, 3, 7)}
|
param_grid = {'C': np.logspace(-3, 3, 7)}
|
||||||
app = APP(validation, sample_size=100, random_seed=1)
|
app = APP(validation, sample_size=100, random_state=1)
|
||||||
|
|
||||||
tinit = time.time()
|
tinit = time.time()
|
||||||
GridSearchQ(
|
GridSearchQ(
|
||||||
|
@ -96,7 +96,7 @@ class ModselTestCase(unittest.TestCase):
|
||||||
# test = data.test
|
# test = data.test
|
||||||
|
|
||||||
param_grid = {'C': np.logspace(-3,3,7)}
|
param_grid = {'C': np.logspace(-3,3,7)}
|
||||||
app = APP(validation, sample_size=100, random_seed=1)
|
app = APP(validation, sample_size=100, random_state=1)
|
||||||
q = GridSearchQ(
|
q = GridSearchQ(
|
||||||
q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
|
q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
|
||||||
)
|
)
|
||||||
|
|
|
@ -21,7 +21,7 @@ class TestProtocols(unittest.TestCase):
|
||||||
|
|
||||||
def test_app_replicate(self):
|
def test_app_replicate(self):
|
||||||
data = mock_labelled_collection()
|
data = mock_labelled_collection()
|
||||||
p = APP(data, sample_size=5, n_prevalences=11, random_seed=42)
|
p = APP(data, sample_size=5, n_prevalences=11, random_state=42)
|
||||||
|
|
||||||
samples1 = samples_to_str(p)
|
samples1 = samples_to_str(p)
|
||||||
samples2 = samples_to_str(p)
|
samples2 = samples_to_str(p)
|
||||||
|
@ -57,7 +57,7 @@ class TestProtocols(unittest.TestCase):
|
||||||
|
|
||||||
def test_npp_replicate(self):
|
def test_npp_replicate(self):
|
||||||
data = mock_labelled_collection()
|
data = mock_labelled_collection()
|
||||||
p = NPP(data, sample_size=5, repeats=5, random_seed=42)
|
p = NPP(data, sample_size=5, repeats=5, random_state=42)
|
||||||
|
|
||||||
samples1 = samples_to_str(p)
|
samples1 = samples_to_str(p)
|
||||||
samples2 = samples_to_str(p)
|
samples2 = samples_to_str(p)
|
||||||
|
@ -75,7 +75,7 @@ class TestProtocols(unittest.TestCase):
|
||||||
|
|
||||||
def test_kraemer_replicate(self):
|
def test_kraemer_replicate(self):
|
||||||
data = mock_labelled_collection()
|
data = mock_labelled_collection()
|
||||||
p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42)
|
p = USimplexPP(data, sample_size=5, repeats=10, random_state=42)
|
||||||
|
|
||||||
samples1 = samples_to_str(p)
|
samples1 = samples_to_str(p)
|
||||||
samples2 = samples_to_str(p)
|
samples2 = samples_to_str(p)
|
||||||
|
@ -94,7 +94,7 @@ class TestProtocols(unittest.TestCase):
|
||||||
def test_covariate_shift_replicate(self):
|
def test_covariate_shift_replicate(self):
|
||||||
dataA = mock_labelled_collection('domA')
|
dataA = mock_labelled_collection('domA')
|
||||||
dataB = mock_labelled_collection('domB')
|
dataB = mock_labelled_collection('domB')
|
||||||
p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1)
|
p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_state=1)
|
||||||
|
|
||||||
samples1 = samples_to_str(p)
|
samples1 = samples_to_str(p)
|
||||||
samples2 = samples_to_str(p)
|
samples2 = samples_to_str(p)
|
||||||
|
|
|
@ -50,7 +50,6 @@ def parallel(func, args, n_jobs):
|
||||||
def func_dec(environ, *args):
|
def func_dec(environ, *args):
|
||||||
qp.environ = environ.copy()
|
qp.environ = environ.copy()
|
||||||
qp.environ['N_JOBS'] = 1
|
qp.environ['N_JOBS'] = 1
|
||||||
print(f'setting n_jobs from {environ["N_JOBS"]} to 1')
|
|
||||||
return func(*args)
|
return func(*args)
|
||||||
return Parallel(n_jobs=n_jobs)(
|
return Parallel(n_jobs=n_jobs)(
|
||||||
delayed(func_dec)(qp.environ, args_i) for args_i in args
|
delayed(func_dec)(qp.environ, args_i) for args_i in args
|
||||||
|
|
Loading…
Reference in New Issue