diff --git a/quapy/data/base.py b/quapy/data/base.py
index 1125449..3c9bb67 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -2,7 +2,7 @@ import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
-
+from numpy.random import RandomState
 from quapy.functional import strprev
 
 
@@ -146,16 +146,21 @@ class LabelledCollection:
 
         return indexes_sample
 
-    def uniform_sampling_index(self, size):
+    def uniform_sampling_index(self, size, random_state=None):
         """
         Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
         with replacement if the requested size is greater than the number of instances, or without replacement
         otherwise.
 
         :param size: integer, the size of the uniform sample
+        :param random_state: if specified, guarantees reproducibility of the sample.
         :return: a np.ndarray of shape `(size)` with the indexes
         """
-        return np.random.choice(len(self), size, replace=size > len(self))
+        if random_state is not None:
+            ng = RandomState(seed=random_state)
+        else:
+            ng = np.random
+        return ng.choice(len(self), size, replace=size > len(self))
 
     def sampling(self, size, *prevs, shuffle=True):
         """
@@ -174,16 +179,17 @@ class LabelledCollection:
         prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
         return self.sampling_from_index(prev_index)
 
-    def uniform_sampling(self, size):
+    def uniform_sampling(self, size, random_state=None):
         """
         Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
         with replacement if the requested size is greater than the number of instances, or without replacement
         otherwise.
 
         :param size: integer, the requested size
+        :param random_state: if specified, guarantees reproducibility of the sample.
         :return: an instance of :class:`LabelledCollection` with length == `size`
         """
-        unif_index = self.uniform_sampling_index(size)
+        unif_index = self.uniform_sampling_index(size, random_state=random_state)
         return self.sampling_from_index(unif_index)
 
     def sampling_from_index(self, index):
diff --git a/quapy/protocol.py b/quapy/protocol.py
index f8b828f..c232ebc 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -40,22 +40,22 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
     needed for extracting the samples, and :meth:`sample` that, given some parameters as input,
     deterministically generates a sample.
 
-    :param seed: the seed for allowing to replicate any sequence of samples. Default is None, meaning that
+    :param random_state: the seed for allowing to replicate any sequence of samples. Default is None, meaning that
         the sequence will be different every time the protocol is called.
     """
 
-    _random_seed = -1  # means "not set"
+    _random_state = -1  # means "not set"
 
-    def __init__(self, seed=None):
-        self.random_seed = seed
+    def __init__(self, random_state=None):
+        self.random_state = random_state
 
     @property
-    def random_seed(self):
-        return self._random_seed
+    def random_state(self):
+        return self._random_state
 
-    @random_seed.setter
-    def random_seed(self, seed):
-        self._random_seed = seed
+    @random_state.setter
+    def random_state(self, random_state):
+        self._random_state = random_state
 
     @abstractmethod
     def samples_parameters(self):
@@ -78,11 +78,11 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
 
     def __call__(self):
         with ExitStack() as stack:
-            if self.random_seed == -1:
+            if self.random_state == -1:
                 raise ValueError('The random seed has never been initialized. '
                                  'Set it to None not to impose replicability.')
-            if self.random_seed is not None:
-                stack.enter_context(qp.util.temp_seed(self.random_seed))
+            if self.random_state is not None:
+                stack.enter_context(qp.util.temp_seed(self.random_state))
             for params in self.samples_parameters():
                 yield self.collator(self.sample(params))
 
@@ -132,11 +132,11 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
     :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
         grid (default is 21)
     :param repeats: number of copies for each valid prevalence vector (default is 10)
-    :param random_seed: allows replicating samples across runs (default None)
+    :param random_state: allows replicating samples across runs (default None)
     """
 
-    def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None, return_type='sample_prev'):
-        super(APP, self).__init__(random_seed)
+    def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_state=None, return_type='sample_prev'):
+        super(APP, self).__init__(random_state)
         self.data = data
         self.sample_size = sample_size
         self.n_prevalences = n_prevalences
@@ -189,15 +189,15 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
     :param data: a `LabelledCollection` from which the samples will be drawn
     :param sample_size: integer, the number of instances in each sample
     :param repeats: the number of samples to generate. Default is 100.
-    :param random_seed: allows replicating samples across runs (default None)
+    :param random_state: allows replicating samples across runs (default None)
     """
 
-    def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
-        super(NPP, self).__init__(random_seed)
+    def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
+        super(NPP, self).__init__(random_state)
         self.data = data
         self.sample_size = sample_size
         self.repeats = repeats
-        self.random_seed = random_seed
+        self.random_state = random_state
         self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
 
     def samples_parameters(self):
@@ -226,15 +226,15 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
     :param data: a `LabelledCollection` from which the samples will be drawn
     :param sample_size: integer, the number of instances in each sample
     :param repeats: the number of samples to generate. Default is 100.
-    :param random_seed: allows replicating samples across runs (default None)
+    :param random_state: allows replicating samples across runs (default None)
     """
 
-    def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
-        super(USimplexPP, self).__init__(random_seed)
+    def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
+        super(USimplexPP, self).__init__(random_state)
         self.data = data
         self.sample_size = sample_size
         self.repeats = repeats
-        self.random_seed = random_seed
+        self.random_state = random_state
         self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
 
     def samples_parameters(self):
@@ -290,7 +290,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
     :param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
         generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself.
         the specific points
-    :param random_seed:
+    :param random_state: allows replicating samples across runs (default None)
     """
 
     def __init__(
@@ -301,9 +301,9 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
             repeats=1,
             prevalence=None,
             mixture_points=11,
-            random_seed=None,
+            random_state=None,
             return_type='sample_prev'):
-        super(CovariateShiftPP, self).__init__(random_seed)
+        super(CovariateShiftPP, self).__init__(random_state)
         self.A = domainA
         self.B = domainB
         self.sample_size = sample_size
@@ -322,7 +322,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
             self.mixture_points = np.asarray(mixture_points)
         assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
             'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])'
-        self.random_seed = random_seed
+        self.random_state = random_state
         self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
 
     def samples_parameters(self):
diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py
index 73dc485..9a77867 100644
--- a/quapy/tests/test_evaluation.py
+++ b/quapy/tests/test_evaluation.py
@@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
         data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
         train, test = data.training, data.test
 
-        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
+        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_state=1)
 
         class SlowLR(LogisticRegression):
             def predict_proba(self, X):
diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py
index 9c6604a..d54dcbe 100644
--- a/quapy/tests/test_modsel.py
+++ b/quapy/tests/test_modsel.py
@@ -21,7 +21,7 @@ class ModselTestCase(unittest.TestCase):
         training, validation = data.training.split_stratified(0.7, random_state=1)
 
         param_grid = {'C': np.logspace(-3,3,7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
         q = GridSearchQ(
             q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
         ).fit(training)
@@ -40,7 +40,7 @@ class ModselTestCase(unittest.TestCase):
         # test = data.test
 
         param_grid = {'C': np.logspace(-3,3,7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
         q = GridSearchQ(
             q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
         ).fit(training)
@@ -62,7 +62,7 @@ class ModselTestCase(unittest.TestCase):
         training, validation = data.training.split_stratified(0.7, random_state=1)
 
         param_grid = {'C': np.logspace(-3, 3, 7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
 
         tinit = time.time()
         GridSearchQ(
@@ -96,7 +96,7 @@ class ModselTestCase(unittest.TestCase):
         # test = data.test
 
         param_grid = {'C': np.logspace(-3,3,7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
         q = GridSearchQ(
             q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
         )
diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py
index aeb1f4e..dea3290 100644
--- a/quapy/tests/test_protocols.py
+++ b/quapy/tests/test_protocols.py
@@ -21,7 +21,7 @@ class TestProtocols(unittest.TestCase):
 
     def test_app_replicate(self):
        data = mock_labelled_collection()
-        p = APP(data, sample_size=5, n_prevalences=11, random_seed=42)
+        p = APP(data, sample_size=5, n_prevalences=11, random_state=42)
 
         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)
@@ -57,7 +57,7 @@ class TestProtocols(unittest.TestCase):
 
     def test_npp_replicate(self):
         data = mock_labelled_collection()
-        p = NPP(data, sample_size=5, repeats=5, random_seed=42)
+        p = NPP(data, sample_size=5, repeats=5, random_state=42)
 
         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)
@@ -75,7 +75,7 @@ class TestProtocols(unittest.TestCase):
 
     def test_kraemer_replicate(self):
         data = mock_labelled_collection()
-        p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42)
+        p = USimplexPP(data, sample_size=5, repeats=10, random_state=42)
 
         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)
@@ -94,7 +94,7 @@ class TestProtocols(unittest.TestCase):
     def test_covariate_shift_replicate(self):
         dataA = mock_labelled_collection('domA')
         dataB = mock_labelled_collection('domB')
-        p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1)
+        p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_state=1)
 
         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)
diff --git a/quapy/util.py b/quapy/util.py
index 259178e..049ebed 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -50,7 +50,6 @@ def parallel(func, args, n_jobs):
     def func_dec(environ, *args):
         qp.environ = environ.copy()
         qp.environ['N_JOBS'] = 1
-        print(f'setting n_jobs from {environ["N_JOBS"]} to 1')
         return func(*args)
     return Parallel(n_jobs=n_jobs)(
         delayed(func_dec)(qp.environ, args_i) for args_i in args
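
A minimal usage sketch of the renamed `random_state` parameter (illustrative only; the toy collection, variable names, and sizes below are invented, and the loop assumes the default `return_type='sample_prev'` collator, which yields `(instances, prevalence)` pairs):

import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import APP

# hypothetical toy collection: 100 random instances with binary labels
X = np.random.rand(100, 2)
y = np.random.randint(0, 2, size=100)
data = LabelledCollection(X, y)

# collection-level seeding: equal random_state values produce the same uniform sample
s1 = data.uniform_sampling(20, random_state=0)
s2 = data.uniform_sampling(20, random_state=0)
assert (s1.labels == s2.labels).all()

# protocol-level seeding: two passes over the same seeded protocol
# generate the same sequence of samples
prot = APP(data, sample_size=20, n_prevalences=11, repeats=1, random_state=42)
first = [prev for _, prev in prot()]
second = [prev for _, prev in prot()]
assert all((a == b).all() for a, b in zip(first, second))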