forked from moreo/QuaPy
passing pytests
This commit is contained in:
parent e6dcfbced1
commit 2f2e48d86a
@@ -1,6 +1,10 @@
 Change Log 0.1.8
 ----------------
 
+- Added Kernel Density Estimation methods (KDEyML, KDEyCS, KDEyHD) as proposed in the paper:
+  Moreo, A., González, P., & del Coz, J. J. Kernel Density Estimation for Multiclass Quantification.
+  arXiv preprint arXiv:2401.00490, 2024
+
 - Added different solvers for ACC and PACC quantifiers. In quapy < 0.1.8 these quantifiers try to solve the system
   of equations Ax=B exactly (by means of np.linalg.solve). As noted by Mirko Bunse (thanks!), such an exact solution
   sometimes does not exist. In cases like this, quapy < 0.1.8 resorted to CC for providing a plausible solution.
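The solver change is easiest to see in isolation. Below is a minimal sketch, assuming only NumPy, of trying the exact solution first and degrading gracefully when none exists; the function name and the least-squares fallback are illustrative, not quapy's actual internals:

    import numpy as np

    def solve_adjustment(A, b):
        # Try to solve the adjustment system Ax = b exactly, as quapy < 0.1.8 did.
        try:
            x = np.linalg.solve(A, b)
        except np.linalg.LinAlgError:
            # Singular system: no exact solution exists. Fall back to the
            # minimum-norm least-squares solution instead of resorting to CC.
            x, *_ = np.linalg.lstsq(A, b, rcond=None)
        # Clip and re-normalize so the estimate is a valid prevalence vector.
        x = np.clip(x, 0, None)
        return x / x.sum() if x.sum() > 0 else np.full(len(x), 1 / len(x))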
@@ -21,7 +25,7 @@ Change Log 0.1.8
 - classification datasets
 - Python API available
 
-- New IFCB (plankton) dataset added. See fetch_IFCB.
+- New IFCB (plankton) dataset added (thanks to Pablo González). See qp.datasets.fetch_IFCB.
 
 - Added new evaluation measures NAE, NRAE
 
@@ -119,22 +119,18 @@ class MedianEstimator(BinaryQuantifier):
 
     def _delayed_fit_classifier(self, args):
         with qp.util.temp_seed(self.random_state):
-            print('enter job')
             cls_params, training = args
             model = deepcopy(self.base_quantifier)
             model.set_params(**cls_params)
             predictions = model.classifier_fit_predict(training, predict_on=model.val_split)
-            print('exit job')
             return (model, predictions)
 
     def _delayed_fit_aggregation(self, args):
         with qp.util.temp_seed(self.random_state):
-            print('\tenter job')
             ((model, predictions), q_params), training = args
             model = deepcopy(model)
             model.set_params(**q_params)
             model.aggregation_fit(predictions, training)
-            print('\texit job')
             return model
 
 
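Both helpers wrap their work in qp.util.temp_seed so that the parallel jobs are reproducible. A rough sketch of what such a context manager does (an approximation, not quapy's exact code):

    import contextlib
    import numpy as np

    @contextlib.contextmanager
    def temp_seed(seed):
        # Temporarily reseed NumPy's global RNG and restore the previous
        # state on exit, so surrounding randomness is unaffected.
        state = np.random.get_state()
        try:
            if seed is not None:
                np.random.seed(seed)
            yield
        finally:
            np.random.set_state(state)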
@@ -153,7 +149,6 @@ class MedianEstimator(BinaryQuantifier):
                 asarray=False
             )
         else:
-            print('only 1')
             model = self.base_quantifier
             model.set_params(**cls_configs[0])
             predictions = model.classifier_fit_predict(training, predict_on=model.val_split)
@@ -263,9 +258,10 @@ class Ensemble(BaseQuantifier):
         print('[Ensemble]' + msg)
 
     def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None):
         self._sout('Fit')
 
         if self.policy == 'ds' and not data.binary:
             raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
 
         if val_split is None:
             val_split = self.val_split
 
@@ -288,6 +284,7 @@ class Ensemble(BaseQuantifier):
         self.ensemble = qp.util.parallel(
             _delayed_new_instance,
             tqdm(args, desc='fitting ensamble', total=self.size) if self.verbose else args,
+            asarray=False,
             n_jobs=self.n_jobs)
 
         # static selection policy (the name of a quantification-oriented error function to minimize)
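qp.util.parallel, as called here, behaves like a thin wrapper around joblib plus an optional array conversion; a sketch under that assumption, with the signature guessed from the call sites in this diff:

    import numpy as np
    from joblib import Parallel, delayed

    def parallel(func, args_iterable, n_jobs=1, asarray=True):
        # Run func over each element of args_iterable across n_jobs workers.
        results = Parallel(n_jobs=n_jobs)(delayed(func)(args) for args in args_iterable)
        # Call sites returning heterogeneous objects (fitted models) pass asarray=False.
        return np.asarray(results) if asarray else results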
@@ -369,30 +366,31 @@ class Ensemble(BaseQuantifier):
 
     def _ds_policy_get_posteriors(self, data: LabelledCollection):
         """
-        In the original article, this procedure is not described in a sufficient level of detail. The paper only says
+        In the original article, there are some aspects regarding this method that are not mentioned. The paper says
         that the distribution of posterior probabilities from training and test examples is compared by means of the
         Hellinger Distance. However, how these posterior probabilities are generated is not specified. In the article,
         a Logistic Regressor (LR) is used as the classifier device and that could be used for this purpose. However, in
         general, a Quantifier is not necessarily an instance of Aggregative Probabilistic Quantifiers, and so, that the
         quantifier builds on top of a probabilistic classifier cannot be taken for granted. Additionally, it would not
-        be correct to generate the posterior probabilities for training documents that have concurred in training the
+        be correct to generate the posterior probabilities for training instances that have concurred in training the
         classifier that generates them.
 
         This function thus generates the posterior probabilities for all training documents in a cross-validation way,
-        using a LR with hyperparameters that have previously been optimized via grid search in 5FCV.
-        :return P,f, where P is a ndarray containing the posterior probabilities of the training data, generated via
-        cross-validation and using an optimized LR, and the function to be used in order to generate posterior
-        probabilities for test instances.
+        using LR with hyperparameters that have previously been optimized via grid search in 5FCV.
+
+        :param data: a LabelledCollection
+        :return: (P,f,) where P is an ndarray containing the posterior probabilities of the training data, generated
+            via cross-validation and using an optimized LR, and the function to be used in order to generate posterior
+            probabilities for test instances.
         """
 
         X, y = data.Xy
         lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
 
-        optim = GridSearchCV(
-            lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
-        ).fit(X, y)
+        param_grid = {'C': np.logspace(-4, 4, 9)}
+        optim = GridSearchCV(lr_base, param_grid=param_grid, cv=5, n_jobs=self.n_jobs, refit=True).fit(X, y)
 
-        posteriors = cross_val_predict(
-            optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba'
-        )
+        posteriors = cross_val_predict(optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba')
         posteriors_generator = optim.best_estimator_.predict_proba
 
         return posteriors, posteriors_generator
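For context, the comparison the docstring alludes to (training vs. test posteriors under the Hellinger Distance) can be sketched as below. This illustrates the 'ds' policy's distance computation rather than quoting this commit; the histogram binning is an assumption, and indexing the positive-class column relies on the 'ds' policy being restricted to binary data, as enforced in fit above:

    import numpy as np

    def posterior_histogram(posteriors, bins=8):
        # Reduce a matrix of posteriors to a distribution over the positive-class score.
        hist, _ = np.histogram(posteriors[:, 1], bins=bins, range=(0, 1))
        return hist / hist.sum()

    def hellinger(p, q):
        # Hellinger distance between two discrete distributions p and q.
        return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)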
@@ -463,8 +461,10 @@ def _delayed_new_instance(args):
 
     tr_prevalence = sample.prevalence()
     tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
 
     if verbose:
         print(f'\t\--fit-ended for prev {F.strprev(prev)}')
 
     return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)
 
@@ -475,8 +475,9 @@ def _delayed_quantify(args):
 
 def _draw_simplex(ndim, min_val, max_trials=100):
     """
-    returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions
+    Returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions
     are >= min_class_prev (for min_val>0, this makes the sampling not truly uniform)
 
     :param ndim: number of dimensions of the simplex
     :param min_val: minimum class prevalence allowed. If greater than 1/ndim, a ValueError will be thrown since
         there is no possible solution.
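A minimal sketch of such constrained sampling, using a symmetric Dirichlet draw (which is uniform over the simplex) plus rejection; the actual function may be implemented differently:

    import numpy as np

    def draw_simplex(ndim, min_val, max_trials=100):
        if min_val > 1 / ndim:
            # ndim coordinates all >= min_val would sum to more than 1.
            raise ValueError(f'no point on the simplex has all {ndim} dimensions >= {min_val}')
        for _ in range(max_trials):
            u = np.random.dirichlet(np.ones(ndim))  # uniform over the simplex
            if (u >= min_val).all():
                return u
        raise ValueError(f'no valid sample found after {max_trials} trials')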
@@ -1,8 +1,8 @@
 import pytest
 
 from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
-    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, \
-    fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022
+    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, UCI_MULTICLASS_DATASETS,\
+    fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022, fetch_UCIMulticlassLabelledCollection
 
 
 @pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
@@ -44,6 +44,15 @@ def test_fetch_UCIDataset(dataset_name):
     print('Test set stats')
 
 
+@pytest.mark.parametrize('dataset_name', UCI_MULTICLASS_DATASETS)
+def test_fetch_UCIMultiDataset(dataset_name):
+    dataset = fetch_UCIMulticlassLabelledCollection(dataset_name)
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.stats()
+    print('Test set stats')
+
+
 @pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
 def test_fetch_lequa2022(dataset_name):
     train, gen_val, gen_test = fetch_lequa2022(dataset_name)
@@ -1,12 +1,8 @@
 import unittest
 
 from sklearn.linear_model import LogisticRegression
 
 import quapy as qp
 from quapy.method.aggregative import *
 
 
 class HierarchyTestCase(unittest.TestCase):
 
     def test_aggregative(self):
@@ -22,8 +18,10 @@ class HierarchyTestCase(unittest.TestCase):
     def test_probabilistic(self):
         lr = LogisticRegression()
         for m in [CC(lr), ACC(lr)]:
             self.assertEqual(isinstance(m, AggregativeCrispQuantifier), True)
             self.assertEqual(isinstance(m, AggregativeSoftQuantifier), False)
         for m in [PCC(lr), PACC(lr)]:
             self.assertEqual(isinstance(m, AggregativeCrispQuantifier), False)
             self.assertEqual(isinstance(m, AggregativeSoftQuantifier), True)
 
@@ -67,15 +67,16 @@ def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
 @pytest.mark.parametrize('dataset', tinydatasets)
 @pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES)
 def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
 
     qp.environ['SAMPLE_SIZE'] = 20
 
     base_quantifier=base_method(learner())
 
     if isinstance(base_quantifier, BinaryQuantifier) and not dataset.binary:
         print(f'skipping the test of binary model {base_quantifier} on non-binary dataset {dataset}')
         return
 
     if not dataset.binary and policy=='ds':
         print(f'skipping the test of binary policy ds on non-binary dataset {dataset}')
         return
 
-    model = Ensemble(quantifier=base_quantifier, size=5, policy=policy, n_jobs=-1)
+    model = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
 
     model.fit(dataset.training)
@@ -97,9 +98,7 @@ def test_quanet_method():
     qp.environ['SAMPLE_SIZE'] = 100
 
     # load the kindle dataset as text, and convert words to numerical indexes
-    dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
-    dataset = Dataset(dataset.training.sampling(200, *dataset.training.prevalence()),
-                      dataset.test.sampling(200, *dataset.test.prevalence()))
+    dataset = qp.datasets.fetch_reviews('kindle', pickle=True).reduce(200, 200)
     qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
 
     from quapy.classification.neural import CNNnet
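The refactor above folds the two sampling lines into a new Dataset.reduce helper. Judging only from the replaced code, its behavior is presumably equivalent to this sketch (an assumption, not the method's actual source):

    from quapy.data import Dataset

    def reduce(dataset, n_train=100, n_test=100):
        # Subsample both splits at their natural prevalence, as the removed lines did.
        training = dataset.training.sampling(n_train, *dataset.training.prevalence())
        test = dataset.test.sampling(n_test, *dataset.test.prevalence())
        return Dataset(training, test)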
@@ -3,11 +3,13 @@ import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.functional import strprev
 from sklearn.linear_model import LogisticRegression
 
 import numpy as np
+from quapy.method.aggregative import PACC
+import quapy.functional as F
 
 
 class MyTestCase(unittest.TestCase):
 
     def test_prediction_replicability(self):
 
         dataset = qp.datasets.fetch_UCIDataset('yeast')
@@ -26,8 +28,8 @@ class MyTestCase(unittest.TestCase):
 
         self.assertEqual(str_prev1, str_prev2)  # add assertion here
 
     def test_samping_replicability(self):
         import numpy as np
 
         def equal_collections(c1, c2, value=True):
             self.assertEqual(np.all(c1.X == c2.X), value)
@@ -74,5 +76,36 @@ class MyTestCase(unittest.TestCase):
         equal_collections(sample1_te, sample2_te, True)
 
+    def test_parallel_replicability(self):
+
+        train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').train_test
+
+        test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0])
+
+        with qp.util.temp_seed(10):
+            pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2)
+            pacc.fit(train, val_split=0.5)
+            prev1 = F.strprev(pacc.quantify(test.instances))
+
+        with qp.util.temp_seed(0):
+            pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2)
+            pacc.fit(train, val_split=0.5)
+            prev2 = F.strprev(pacc.quantify(test.instances))
+
+        with qp.util.temp_seed(0):
+            pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2)
+            pacc.fit(train, val_split=0.5)
+            prev3 = F.strprev(pacc.quantify(test.instances))
+
+        print(prev1)
+        print(prev2)
+        print(prev3)
+
+        self.assertNotEqual(prev1, prev2)
+        self.assertEqual(prev2, prev3)
+
 
 if __name__ == '__main__':
     unittest.main()