diff --git a/quapy/data/base.py b/quapy/data/base.py index ce05b95..2629084 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -108,8 +108,7 @@ class LabelledCollection: """ Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the prevalence values are not specified, then returns the index of a uniform sampling. - For each class, the sampling is drawn with replacement if the requested prevalence is larger than - the actual prevalence of the class, or without replacement otherwise. + For each class, the sampling is drawn with replacement. :param size: integer, the requested size :param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since @@ -153,7 +152,7 @@ class LabelledCollection: for class_, n_requested in n_requests.items(): n_candidates = len(self.index[class_]) index_sample = self.index[class_][ - np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates)) + np.random.choice(n_candidates, size=n_requested, replace=True) ] if n_requested > 0 else [] indexes_sample.append(index_sample) @@ -168,8 +167,7 @@ class LabelledCollection: def uniform_sampling_index(self, size, random_state=None): """ Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn - with replacement if the requested size is greater than the number of instances, or without replacement - otherwise. + with replacement. :param size: integer, the size of the uniform sample :param random_state: if specified, guarantees reproducibility of the split. @@ -179,13 +177,12 @@ class LabelledCollection: ng = RandomState(seed=random_state) else: ng = np.random - return ng.choice(len(self), size, replace=size > len(self)) + return ng.choice(len(self), size, replace=True) def sampling(self, size, *prevs, shuffle=True, random_state=None): """ Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence - values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than - the actual prevalence of the class, or with replacement otherwise. + values. For each class, the sampling is drawn with replacement. :param size: integer, the requested size :param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since @@ -202,8 +199,7 @@ class LabelledCollection: def uniform_sampling(self, size, random_state=None): """ Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn - with replacement if the requested size is greater than the number of instances, or without replacement - otherwise. + with replacement. :param size: integer, the requested size :param random_state: if specified, guarantees reproducibility of the split. @@ -236,24 +232,11 @@ class LabelledCollection: :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the second one with `1-train_prop` elements """ - instances = self.instances - labels = self.labels - remainder = None - for idx in np.argwhere(self.counts()==1): - class_with_1 = self.classes_[idx.item()] - if remainder is None: - remainder = LabelledCollection(instances[labels==class_with_1], [class_with_1], classes=self.classes_) - else: - remainder += LabelledCollection(instances[labels==class_with_1], [class_with_1], classes=self.classes_) - instances = instances[labels!=class_with_1] - labels = labels[labels!=class_with_1] tr_docs, te_docs, tr_labels, te_labels = train_test_split( - instances, labels, train_size=train_prop, stratify=labels, random_state=random_state + self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state ) training = LabelledCollection(tr_docs, tr_labels, classes=self.classes_) test = LabelledCollection(te_docs, te_labels, classes=self.classes_) - if remainder is not None: - training += remainder return training, test def split_random(self, train_prop=0.6, random_state=None):