1
0
Fork 0
This commit is contained in:
Alejandro Moreo Fernandez 2021-06-11 11:00:37 +02:00
commit 9fd9d096f6
22 changed files with 694 additions and 156 deletions

View File

@ -11,6 +11,12 @@ used for evaluating quantification methods.
QuaPy also integrates commonly used datasets and offers visualization tools QuaPy also integrates commonly used datasets and offers visualization tools
for facilitating the analysis and interpretation of results. for facilitating the analysis and interpretation of results.
### Installation
```commandline
pip install quapy
```
## A quick example: ## A quick example:
The following script fetches a Twitter dataset, trains and evaluates an The following script fetches a Twitter dataset, trains and evaluates an

View File

@ -2,7 +2,6 @@ Packaging:
========================================== ==========================================
Documentation with sphinx Documentation with sphinx
Document methods with paper references Document methods with paper references
allow for "pip install"
unit-tests unit-tests
New features: New features:
@ -18,14 +17,13 @@ SVMperf-based learners do not remove temp files in __del__?
In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the
negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as
an instance of single-label with 2 labels. Check an instance of single-label with 2 labels. Check
Add classnames to LabelledCollection? This should improve visualization of reports
Add automatic reindex of class labels in LabelledCollection (currently, class indexes should be ordered and with no gaps) Add automatic reindex of class labels in LabelledCollection (currently, class indexes should be ordered and with no gaps)
OVR I believe is currently tied to aggregative methods. We should provide a general interface also for general quantifiers OVR I believe is currently tied to aggregative methods. We should provide a general interface also for general quantifiers
Currently, being "binary" only adds one checker; we should figure out how to impose the check to be automatically performed Currently, being "binary" only adds one checker; we should figure out how to impose the check to be automatically performed
Add random seed management to support replicability (see temp_seed in util.py).
Improvements: Improvements:
========================================== ==========================================
Clarify whether QuaNet is an aggregative method or not.
Explore the hyperparameter "number of bins" in HDy Explore the hyperparameter "number of bins" in HDy
Rename EMQ to SLD ? Rename EMQ to SLD ?
Parallelize the kFCV in ACC and PACC? Parallelize the kFCV in ACC and PACC?

View File

@ -10,7 +10,7 @@ from . import model_selection
from . import classification from . import classification
from quapy.method.base import isprobabilistic, isaggregative from quapy.method.base import isprobabilistic, isaggregative
__version__ = '0.1' __version__ = '0.1.5'
environ = { environ = {
'SAMPLE_SIZE': None, 'SAMPLE_SIZE': None,

View File

@ -11,8 +11,8 @@ from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm from tqdm import tqdm
import quapy as qp import quapy as qp
from data import LabelledCollection from quapy.data import LabelledCollection
from util import EarlyStop from quapy.util import EarlyStop
class NeuralClassifierTrainer: class NeuralClassifierTrainer:

View File

@ -2,12 +2,21 @@ import numpy as np
from scipy.sparse import issparse from scipy.sparse import issparse
from scipy.sparse import vstack from scipy.sparse import vstack
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from quapy.functional import artificial_prevalence_sampling, strprev from quapy.functional import artificial_prevalence_sampling, strprev
class LabelledCollection: class LabelledCollection:
'''
A LabelledCollection is a set of objects each with a label associated to it.
'''
def __init__(self, instances, labels, n_classes=None): def __init__(self, instances, labels, classes_=None):
"""
:param instances: list of objects
:param labels: list of labels, same length of instances
:param classes_: optional, list of classes from which labels are taken. When used, must contain the set of values used in labels.
"""
if issparse(instances): if issparse(instances):
self.instances = instances self.instances = instances
elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str): elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
@ -15,14 +24,17 @@ class LabelledCollection:
self.instances = np.asarray(instances, dtype=object) self.instances = np.asarray(instances, dtype=object)
else: else:
self.instances = np.asarray(instances) self.instances = np.asarray(instances)
self.labels = np.asarray(labels, dtype=int) self.labels = np.asarray(labels)
n_docs = len(self) n_docs = len(self)
if n_classes is None: if classes_ is None:
self.classes_ = np.unique(self.labels) self.classes_ = np.unique(self.labels)
self.classes_.sort() self.classes_.sort()
else: else:
self.classes_ = np.arange(n_classes) self.classes_ = np.unique(np.asarray(classes_))
self.index = {class_i: np.arange(n_docs)[self.labels == class_i] for class_i in self.classes_} self.classes_.sort()
if len(set(self.labels).difference(set(classes_))) > 0:
raise ValueError('labels contains values not included in classes_')
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
@classmethod @classmethod
def load(cls, path: str, loader_func: callable): def load(cls, path: str, loader_func: callable):
@ -35,7 +47,7 @@ class LabelledCollection:
return self.counts() / len(self) return self.counts() / len(self)
def counts(self): def counts(self):
return np.asarray([len(self.index[ci]) for ci in self.classes_]) return np.asarray([len(self.index[class_]) for class_ in self.classes_])
@property @property
def n_classes(self): def n_classes(self):
@ -55,14 +67,14 @@ class LabelledCollection:
taken = 0 taken = 0
indexes_sample = [] indexes_sample = []
for i, class_i in enumerate(self.classes_): for i, class_ in enumerate(self.classes_):
if i == self.n_classes - 1: if i == self.n_classes - 1:
n_requested = size - taken n_requested = size - taken
else: else:
n_requested = int(size * prevs[i]) n_requested = int(size * prevs[i])
n_candidates = len(self.index[class_i]) n_candidates = len(self.index[class_])
index_sample = self.index[class_i][ index_sample = self.index[class_][
np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates)) np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
] if n_requested > 0 else [] ] if n_requested > 0 else []
@ -90,12 +102,13 @@ class LabelledCollection:
def sampling_from_index(self, index): def sampling_from_index(self, index):
documents = self.instances[index] documents = self.instances[index]
labels = self.labels[index] labels = self.labels[index]
return LabelledCollection(documents, labels, n_classes=self.n_classes) return LabelledCollection(documents, labels, classes_=self.classes_)
def split_stratified(self, train_prop=0.6, random_state=None): def split_stratified(self, train_prop=0.6, random_state=None):
# with temp_seed(42): # with temp_seed(42):
tr_docs, te_docs, tr_labels, te_labels = \ tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state) train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
random_state=random_state)
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels) return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1): def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
@ -144,7 +157,7 @@ class LabelledCollection:
stats_ = {'instances': ninstances, stats_ = {'instances': ninstances,
'type': instance_type, 'type': instance_type,
'features': nfeats, 'features': nfeats,
'classes': self.n_classes, 'classes': self.classes_,
'prevs': strprev(self.prevalence())} 'prevs': strprev(self.prevalence())}
if show: if show:
print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, ' print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
@ -158,10 +171,11 @@ class LabelledCollection:
test = self.sampling_from_index(test_index) test = self.sampling_from_index(test_index)
yield train, test yield train, test
class Dataset: class Dataset:
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''): def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections' assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
self.training = training self.training = training
self.test = test self.test = test
self.vocabulary = vocabulary self.vocabulary = vocabulary
@ -171,6 +185,10 @@ class Dataset:
def SplitStratified(cls, collection: LabelledCollection, train_size=0.6): def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
return Dataset(*collection.split_stratified(train_prop=train_size)) return Dataset(*collection.split_stratified(train_prop=train_size))
@property
def classes_(self):
return self.training.classes_
@property @property
def n_classes(self): def n_classes(self):
return self.training.n_classes return self.training.n_classes
@ -207,7 +225,3 @@ def isbinary(data):
if isinstance(data, Dataset) or isinstance(data, LabelledCollection): if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
return data.binary return data.binary
return False return False

View File

@ -47,7 +47,7 @@ UCI_DATASETS = ['acute.a', 'acute.b',
'yeast'] 'yeast']
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False): def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
""" """
Load a Reviews dataset as a Dataset instance, as used in: Load a Reviews dataset as a Dataset instance, as used in:
Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification." Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
@ -91,7 +91,7 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
return data return data
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False): def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
""" """
Load a Twitter dataset as a Dataset instance, as used in: Load a Twitter dataset as a Dataset instance, as used in:
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
@ -162,12 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
return data return data
def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False): def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
data = fetch_UCILabelledCollection(dataset_name, data_home, verbose) data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0)) return Dataset(*data.split_stratified(1 - test_split, random_state=0))
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False): def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
assert dataset_name in UCI_DATASETS, \ assert dataset_name in UCI_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \ f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \

View File

@ -29,13 +29,13 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
test_documents = vectorizer.transform(dataset.test.instances) test_documents = vectorizer.transform(dataset.test.instances)
if inplace: if inplace:
dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes) dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes) dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
dataset.vocabulary = vectorizer.vocabulary_ dataset.vocabulary = vectorizer.vocabulary_
return dataset return dataset
else: else:
training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes) training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes) test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
return Dataset(training, test, vectorizer.vocabulary_) return Dataset(training, test, vectorizer.vocabulary_)
@ -66,8 +66,8 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
dataset.test.instances = Xte dataset.test.instances = Xte
return dataset return dataset
else: else:
training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes) training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.classes_)
test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes) test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.classes_)
return Dataset(training, test) return Dataset(training, test)
@ -100,13 +100,13 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
test_index = indexer.transform(dataset.test.instances) test_index = indexer.transform(dataset.test.instances)
if inplace: if inplace:
dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes) dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.classes_)
dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes) dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.classes_)
dataset.vocabulary = indexer.vocabulary_ dataset.vocabulary = indexer.vocabulary_
return dataset return dataset
else: else:
training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes) training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.classes_)
test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes) test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.classes_)
return Dataset(training, test, indexer.vocabulary_) return Dataset(training, test, indexer.vocabulary_)

View File

@ -3,7 +3,7 @@ from scipy.sparse import dok_matrix
from tqdm import tqdm from tqdm import tqdm
def from_text(path): def from_text(path, encoding='utf-8'):
""" """
Reads a labelled collection of documents. Reads a labelled collection of documents.
File format <0 or 1>\t<document>\n File format <0 or 1>\t<document>\n
@ -11,7 +11,7 @@ def from_text(path):
:return: a list of sentences, and a list of labels :return: a list of sentences, and a list of labels
""" """
all_sentences, all_labels = [], [] all_sentences, all_labels = [], []
for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'): for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'):
line = line.strip() line = line.strip()
if line: if line:
label, sentence = line.split('\t') label, sentence = line.split('\t')
@ -25,8 +25,8 @@ def from_text(path):
def from_sparse(path): def from_sparse(path):
""" """
Reas a labelled colletion of real-valued instances expressed in sparse format Reads a labelled collection of real-valued instances expressed in sparse format
File fomart <-1 or 0 or 1>[\s col(int):val(float)]\n File format <-1 or 0 or 1>[\s col(int):val(float)]\n
:param path: path to the labelled collection :param path: path to the labelled collection
:return: a csr_matrix containing the instances (rows), and a ndarray containing the labels :return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
""" """
@ -56,16 +56,16 @@ def from_sparse(path):
return X, y return X, y
def from_csv(path): def from_csv(path, encoding='utf-8'):
""" """
Reas a csv file in which columns are separated by ','. Reads a csv file in which columns are separated by ','.
File fomart <label>,<feat1>,<feat2>,...,<featn>\n File format <label>,<feat1>,<feat2>,...,<featn>\n
:param path: path to the csv file :param path: path to the csv file
:return: a ndarray for the labels and a ndarray (float) for the covariates :return: a ndarray for the labels and a ndarray (float) for the covariates
""" """
X, y = [], [] X, y = [], []
for instance in tqdm(open(path, 'rt').readlines(), desc=f'reading {path}'): for instance in tqdm(open(path, 'rt', encoding=encoding).readlines(), desc=f'reading {path}'):
yi, *xi = instance.strip().split(',') yi, *xi = instance.strip().split(',')
X.append(list(map(float,xi))) X.append(list(map(float,xi)))
y.append(yi) y.append(yi)

View File

@ -12,6 +12,7 @@ import quapy.functional as F
import pandas as pd import pandas as pd
def artificial_sampling_prediction( def artificial_sampling_prediction(
model: BaseQuantifier, model: BaseQuantifier,
test: LabelledCollection, test: LabelledCollection,
@ -21,8 +22,7 @@ def artificial_sampling_prediction(
eval_budget: int = None, eval_budget: int = None,
n_jobs=1, n_jobs=1,
random_seed=42, random_seed=42,
verbose=True verbose=False):
):
""" """
Performs the predictions for all samples generated according to the artificial sampling protocol. Performs the predictions for all samples generated according to the artificial sampling protocol.
:param model: the model in charge of generating the class prevalence estimations :param model: the model in charge of generating the class prevalence estimations
@ -48,6 +48,45 @@ def artificial_sampling_prediction(
with temp_seed(random_seed): with temp_seed(random_seed):
indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions)) indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))
return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
def natural_sampling_prediction(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        n_repetitions=1,
        n_jobs=1,
        random_seed=42,
        verbose=False):
    """
    Performs the predictions for all samples generated according to the natural sampling protocol.

    :param model: the model in charge of generating the class prevalence estimations
    :param test: the test set on which to perform natural sampling
    :param sample_size: the size of the samples
    :param n_repetitions: the number of samples to generate
    :param n_jobs: number of jobs to be run in parallel
    :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
        any other random process.
    :param verbose: if True, shows a progress bar
    :return: two ndarrays of shape (m,n) with m the number of samples (n_repetitions) and n the
        number of classes. The first one contains the true prevalences for the samples generated while the second one
        contains the prevalence estimations
    """
    # The generator is exhausted inside the seeded context so that the whole
    # list of sample indexes is drawn under `random_seed`.
    with temp_seed(random_seed):
        indexes = list(test.natural_sampling_index_generator(sample_size, n_repetitions))

    return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
def _predict_from_indexes(
indexes,
model: BaseQuantifier,
test: LabelledCollection,
n_jobs=1,
verbose=False):
if model.aggregative: #isinstance(model, qp.method.aggregative.AggregativeQuantifier): if model.aggregative: #isinstance(model, qp.method.aggregative.AggregativeQuantifier):
# print('\tinstance of aggregative-quantifier') # print('\tinstance of aggregative-quantifier')
quantification_func = model.aggregate quantification_func = model.aggregate
@ -88,7 +127,34 @@ def artificial_sampling_report(
n_jobs=1, n_jobs=1,
random_seed=42, random_seed=42,
error_metrics:Iterable[Union[str,Callable]]='mae', error_metrics:Iterable[Union[str,Callable]]='mae',
verbose=True): verbose=False):
true_prevs, estim_prevs = artificial_sampling_prediction(
model, test, sample_size, n_prevpoints, n_repetitions, eval_budget, n_jobs, random_seed, verbose
)
return _sampling_report(true_prevs, estim_prevs, error_metrics)
def natural_sampling_report(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        n_repetitions=1,
        n_jobs=1,
        random_seed=42,
        error_metrics:Iterable[Union[str,Callable]]='mae',
        verbose=False):
    """
    Builds an error report for samples drawn from `test` via `natural_sampling_prediction`.

    :param model: the model in charge of generating the class prevalence estimations
    :param test: the test set from which the samples are drawn
    :param sample_size: the size of the samples
    :param n_repetitions: forwarded to `natural_sampling_prediction`
    :param n_jobs: forwarded to `natural_sampling_prediction`
    :param random_seed: forwarded to `natural_sampling_prediction`
    :param error_metrics: the error metric(s) to report, forwarded to `_sampling_report`
    :param verbose: forwarded to `natural_sampling_prediction`
    :return: whatever `_sampling_report` returns for the obtained true and estimated prevalences
    """
    prevs_true, prevs_estim = natural_sampling_prediction(
        model=model,
        test=test,
        sample_size=sample_size,
        n_repetitions=n_repetitions,
        n_jobs=n_jobs,
        random_seed=random_seed,
        verbose=verbose,
    )
    return _sampling_report(prevs_true, prevs_estim, error_metrics)
def _sampling_report(
true_prevs,
estim_prevs,
error_metrics: Iterable[Union[str, Callable]] = 'mae'):
if isinstance(error_metrics, str): if isinstance(error_metrics, str):
error_metrics = [error_metrics] error_metrics = [error_metrics]
@ -98,9 +164,6 @@ def artificial_sampling_report(
assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions' assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions'
df = pd.DataFrame(columns=['true-prev', 'estim-prev'] + error_names) df = pd.DataFrame(columns=['true-prev', 'estim-prev'] + error_names)
true_prevs, estim_prevs = artificial_sampling_prediction(
model, test, sample_size, n_prevpoints, n_repetitions, eval_budget, n_jobs, random_seed, verbose
)
for true_prev, estim_prev in zip(true_prevs, estim_prevs): for true_prev, estim_prev in zip(true_prevs, estim_prevs):
series = {'true-prev': true_prev, 'estim-prev': estim_prev} series = {'true-prev': true_prev, 'estim-prev': estim_prev}
for error_name, error_metric in zip(error_names, error_funcs): for error_name, error_metric in zip(error_names, error_funcs):
@ -110,7 +173,6 @@ def artificial_sampling_report(
return df return df
def artificial_sampling_eval( def artificial_sampling_eval(
model: BaseQuantifier, model: BaseQuantifier,
test: LabelledCollection, test: LabelledCollection,
@ -121,7 +183,7 @@ def artificial_sampling_eval(
n_jobs=1, n_jobs=1,
random_seed=42, random_seed=42,
error_metric:Union[str,Callable]='mae', error_metric:Union[str,Callable]='mae',
verbose=True): verbose=False):
if isinstance(error_metric, str): if isinstance(error_metric, str):
error_metric = qp.error.from_name(error_metric) error_metric = qp.error.from_name(error_metric)
@ -135,6 +197,28 @@ def artificial_sampling_eval(
return error_metric(true_prevs, estim_prevs) return error_metric(true_prevs, estim_prevs)
def natural_sampling_eval(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        n_repetitions=1,
        n_jobs=1,
        random_seed=42,
        error_metric:Union[str,Callable]='mae',
        verbose=False):
    """
    Evaluates `model` on samples drawn from `test` via `natural_sampling_prediction` and
    returns the value of a single error metric.

    :param model: the model in charge of generating the class prevalence estimations
    :param test: the test set from which the samples are drawn
    :param sample_size: the size of the samples
    :param n_repetitions: forwarded to `natural_sampling_prediction`
    :param n_jobs: forwarded to `natural_sampling_prediction`
    :param random_seed: forwarded to `natural_sampling_prediction`
    :param error_metric: either a string, resolved through `qp.error.from_name`, or a callable
        that is invoked as error_metric(true_prevs, estim_prevs)
    :param verbose: forwarded to `natural_sampling_prediction`
    :return: the result of applying the error metric to the true and estimated prevalences
    """
    # A metric given by name is looked up first; anything else is used as is.
    metric_fn = qp.error.from_name(error_metric) if isinstance(error_metric, str) else error_metric

    assert hasattr(metric_fn, '__call__'), 'invalid error function'

    prevs_true, prevs_estim = natural_sampling_prediction(
        model=model,
        test=test,
        sample_size=sample_size,
        n_repetitions=n_repetitions,
        n_jobs=n_jobs,
        random_seed=random_seed,
        verbose=verbose,
    )
    return metric_fn(prevs_true, prevs_estim)
def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1): def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1):
if isinstance(err, str): if isinstance(err, str):
err = qp.error.from_name(err) err = qp.error.from_name(err)
@ -149,7 +233,7 @@ def _delayed_eval(args):
return error(prev_true, prev_estim) return error(prev_true, prev_estim)
def _check_num_evals(n_classes, n_prevpoints=None, eval_budget=None, n_repetitions=1, verbose=True): def _check_num_evals(n_classes, n_prevpoints=None, eval_budget=None, n_repetitions=1, verbose=False):
if n_prevpoints is None and eval_budget is None: if n_prevpoints is None and eval_budget is None:
raise ValueError('either n_prevpoints or eval_budget has to be specified') raise ValueError('either n_prevpoints or eval_budget has to be specified')
elif n_prevpoints is None: elif n_prevpoints is None:

View File

@ -36,12 +36,12 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
return p return p
def prevalence_from_labels(labels, n_classes): def prevalence_from_labels(labels, classes_):
if labels.ndim != 1: if labels.ndim != 1:
raise ValueError(f'param labels does not seem to be a ndarray of label predictions') raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
unique, counts = np.unique(labels, return_counts=True) unique, counts = np.unique(labels, return_counts=True)
by_class = defaultdict(lambda:0, dict(zip(unique, counts))) by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float) prevalences = np.asarray([by_class[class_] for class_ in classes_], dtype=np.float)
prevalences /= prevalences.sum() prevalences /= prevalences.sum()
return prevalences return prevalences
@ -51,7 +51,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities') raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
if binarize: if binarize:
predictions = np.argmax(posteriors, axis=-1) predictions = np.argmax(posteriors, axis=-1)
return prevalence_from_labels(predictions, n_classes=posteriors.shape[1]) return prevalence_from_labels(predictions, np.arange(posteriors.shape[1]))
else: else:
prevalences = posteriors.mean(axis=0) prevalences = posteriors.mean(axis=0)
prevalences /= prevalences.sum() prevalences /= prevalences.sum()

View File

@ -3,21 +3,31 @@ from . import base
from . import meta from . import meta
from . import non_aggregative from . import non_aggregative
EXPLICIT_LOSS_MINIMIZATION_METHODS = {
aggregative.ELM,
aggregative.SVMQ,
aggregative.SVMAE,
aggregative.SVMKLD,
aggregative.SVMRAE,
aggregative.SVMNKLD
}
AGGREGATIVE_METHODS = { AGGREGATIVE_METHODS = {
aggregative.CC, aggregative.CC,
aggregative.ACC, aggregative.ACC,
aggregative.PCC, aggregative.PCC,
aggregative.PACC, aggregative.PACC,
aggregative.ELM,
aggregative.EMQ, aggregative.EMQ,
aggregative.HDy aggregative.HDy
} } | EXPLICIT_LOSS_MINIMIZATION_METHODS
NON_AGGREGATIVE_METHODS = { NON_AGGREGATIVE_METHODS = {
non_aggregative.MaximumLikelihoodPrevalenceEstimation non_aggregative.MaximumLikelihoodPrevalenceEstimation
} }
META_METHODS = { META_METHODS = {
meta.Ensemble,
meta.QuaNet meta.QuaNet
} }

View File

@ -1,6 +1,7 @@
from abc import abstractmethod from abc import abstractmethod
from copy import deepcopy from copy import deepcopy
from typing import Union from typing import Union
import numpy as np import numpy as np
from joblib import Parallel, delayed from joblib import Parallel, delayed
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
@ -8,6 +9,7 @@ from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm from tqdm import tqdm
import quapy as qp import quapy as qp
import quapy.functional as F import quapy.functional as F
from quapy.classification.svmperf import SVMperf from quapy.classification.svmperf import SVMperf
@ -53,10 +55,10 @@ class AggregativeQuantifier(BaseQuantifier):
@property @property
def n_classes(self): def n_classes(self):
return len(self.classes) return len(self.classes_)
@property @property
def classes(self): def classes_(self):
return self.learner.classes_ return self.learner.classes_
@property @property
@ -127,7 +129,8 @@ def training_helper(learner,
train = data train = data
unused = val_split unused = val_split
else: else:
raise ValueError(f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split ' raise ValueError(
f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split '
'proportion, or a LabelledCollection indicating the validation split') 'proportion, or a LabelledCollection indicating the validation split')
else: else:
train, unused = data, None train, unused = data, None
@ -167,7 +170,7 @@ class CC(AggregativeQuantifier):
return self return self
def aggregate(self, classif_predictions): def aggregate(self, classif_predictions):
return F.prevalence_from_labels(classif_predictions, self.n_classes) return F.prevalence_from_labels(classif_predictions, self.classes_)
class ACC(AggregativeQuantifier): class ACC(AggregativeQuantifier):
@ -294,7 +297,8 @@ class PACC(AggregativeProbabilisticQuantifier):
y_ = np.vstack(y_) y_ = np.vstack(y_)
# fit the learner on all data # fit the learner on all data
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, val_split=None) self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True,
val_split=None)
else: else:
self.learner, val_data = training_helper( self.learner, val_data = training_helper(
@ -307,8 +311,8 @@ class PACC(AggregativeProbabilisticQuantifier):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi # document that belongs to yj ends up being classified as belonging to yi
confusion = np.empty(shape=(data.n_classes, data.n_classes)) confusion = np.empty(shape=(data.n_classes, data.n_classes))
for yi in range(data.n_classes): for i,class_ in enumerate(data.classes_):
confusion[yi] = y_[y==yi].mean(axis=0) confusion[i] = y_[y == class_].mean(axis=0)
self.Pte_cond_estim_ = confusion.T self.Pte_cond_estim_ = confusion.T
@ -338,7 +342,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
def fit(self, data: LabelledCollection, fit_learner=True): def fit(self, data: LabelledCollection, fit_learner=True):
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes) self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
return self return self
def aggregate(self, classif_posteriors, epsilon=EPSILON): def aggregate(self, classif_posteriors, epsilon=EPSILON):
@ -406,12 +410,14 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
self.learner, validation = training_helper( self.learner, validation = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x) Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x)
self.Pxy1 = Px[validation.labels == 1] self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
self.Pxy0 = Px[validation.labels == 0] self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
# pre-compute the histogram for positive and negative examples # pre-compute the histogram for positive and negative examples
self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110] self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110]
self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins} self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins} self.bins}
self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in
self.bins}
return self return self
def aggregate(self, classif_posteriors): def aggregate(self, classif_posteriors):
@ -439,8 +445,8 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
prev_selected, min_dist = prev, hdy prev_selected, min_dist = prev, hdy
prev_estimations.append(prev_selected) prev_estimations.append(prev_selected)
pos_class_prev = np.median(prev_estimations) class1_prev = np.median(prev_estimations)
return np.asarray([1-pos_class_prev, pos_class_prev]) return np.asarray([1 - class1_prev, class1_prev])
class ELM(AggregativeQuantifier, BinaryQuantifier): class ELM(AggregativeQuantifier, BinaryQuantifier):
@ -458,7 +464,7 @@ class ELM(AggregativeQuantifier, BinaryQuantifier):
return self return self
def aggregate(self, classif_predictions: np.ndarray): def aggregate(self, classif_predictions: np.ndarray):
return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_) return F.prevalence_from_labels(classif_predictions, self.classes_)
def classify(self, X, y=None): def classify(self, X, y=None):
return self.learner.predict(X) return self.learner.predict(X)
@ -470,6 +476,7 @@ class SVMQ(ELM):
Quantification-oriented learning based on reliable classifiers. Quantification-oriented learning based on reliable classifiers.
Pattern Recognition, 48(2):591–604. Pattern Recognition, 48(2):591–604.
""" """
def __init__(self, svmperf_base=None, **kwargs): def __init__(self, svmperf_base=None, **kwargs):
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs) super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
@ -480,6 +487,7 @@ class SVMKLD(ELM):
Optimizing text quantifiers for multivariate loss functions. Optimizing text quantifiers for multivariate loss functions.
ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27. ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27.
""" """
def __init__(self, svmperf_base=None, **kwargs): def __init__(self, svmperf_base=None, **kwargs):
super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs) super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
@ -490,6 +498,7 @@ class SVMNKLD(ELM):
Optimizing text quantifiers for multivariate loss functions. Optimizing text quantifiers for multivariate loss functions.
ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27. ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27.
""" """
def __init__(self, svmperf_base=None, **kwargs): def __init__(self, svmperf_base=None, **kwargs):
super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs) super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
@ -581,12 +590,12 @@ class OneVsAll(AggregativeQuantifier):
# some quantifiers (in particular, ELM-based ones) cannot be run with multiprocess, since the temp dir they # some quantifiers (in particular, ELM-based ones) cannot be run with multiprocess, since the temp dir they
# create during the fit will be removed and be no longer available for the predict... # create during the fit will be removed and be no longer available for the predict...
Parallel(n_jobs=self.n_jobs, backend='threading')( Parallel(n_jobs=self.n_jobs, backend='threading')(
delayed(func)(c, *args, **kwargs) for c in self.classes delayed(func)(c, *args, **kwargs) for c in self.classes_
) )
) )
@property @property
def classes(self): def classes_(self):
return sorted(self.dict_binary_quantifiers.keys()) return sorted(self.dict_binary_quantifiers.keys())
def set_params(self, **parameters): def set_params(self, **parameters):
@ -606,7 +615,7 @@ class OneVsAll(AggregativeQuantifier):
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1] return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
def _delayed_binary_fit(self, c, data): def _delayed_binary_fit(self, c, data):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
self.dict_binary_quantifiers[c].fit(bindata) self.dict_binary_quantifiers[c].fit(bindata)
@property @property
@ -616,9 +625,3 @@ class OneVsAll(AggregativeQuantifier):
@property @property
def probabilistic(self): def probabilistic(self):
return self.binary_quantifier.probabilistic return self.binary_quantifier.probabilistic

View File

@ -19,6 +19,10 @@ class BaseQuantifier(metaclass=ABCMeta):
@abstractmethod @abstractmethod
def get_params(self, deep=True): ... def get_params(self, deep=True): ...
@property
@abstractmethod
def classes_(self):
    """Class labels handled by the quantifier.

    Abstract: the base class provides no value here, so every concrete
    quantifier has to define this property itself.
    """
# these methods allow meta-learners to reimplement the decision based on their constituents, and not # these methods allow meta-learners to reimplement the decision based on their constituents, and not
# based on class structure # based on class structure
@property @property

View File

@ -1,28 +1,32 @@
from copy import deepcopy from copy import deepcopy
from typing import Union from typing import Union
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer, accuracy_score from sklearn.metrics import f1_score, make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_predict
from tqdm import tqdm from tqdm import tqdm
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
import quapy as qp import quapy as qp
from quapy.data import LabelledCollection
from quapy import functional as F from quapy import functional as F
from quapy.data import LabelledCollection
from quapy.evaluation import evaluate from quapy.evaluation import evaluate
from quapy.model_selection import GridSearchQ from quapy.model_selection import GridSearchQ
from . import neural
from .base import BaseQuantifier
from quapy.method.aggregative import CC, ACC, PCC, PACC, HDy, EMQ
try:
from . import neural
except ModuleNotFoundError:
neural = None
from .base import BaseQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
if neural:
QuaNet = neural.QuaNetTrainer QuaNet = neural.QuaNetTrainer
else:
QuaNet = "QuaNet is not available due to missing torch package"
class Ensemble(BaseQuantifier): class Ensemble(BaseQuantifier):
VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES
""" """
@ -186,6 +190,10 @@ class Ensemble(BaseQuantifier):
order = np.argsort(dist) order = np.argsort(dist)
return _select_k(predictions, order, k=self.red_size) return _select_k(predictions, order, k=self.red_size)
@property
def classes_(self):
    """Class labels of the ensemble, taken from its base quantifier."""
    base = self.base_quantifier
    return base.classes_
@property @property
def binary(self): def binary(self):
return self.base_quantifier.binary return self.base_quantifier.binary
@ -296,7 +304,8 @@ def _check_error(error):
f'the name of an error function in {qp.error.ERROR_NAMES}') f'the name of an error function in {qp.error.ERROR_NAMES}')
def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel:dict=None, **kwargs): def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None,
**kwargs):
if optim is not None: if optim is not None:
if param_grid is None: if param_grid is None:
raise ValueError(f'param_grid is None but optim was requested.') raise ValueError(f'param_grid is None but optim was requested.')

View File

@ -58,6 +58,7 @@ class QuaNetTrainer(BaseQuantifier):
self.device = torch.device(device) self.device = torch.device(device)
self.__check_params_colision(self.quanet_params, self.learner.get_params()) self.__check_params_colision(self.quanet_params, self.learner.get_params())
self._classes_ = None
def fit(self, data: LabelledCollection, fit_learner=True): def fit(self, data: LabelledCollection, fit_learner=True):
""" """
@ -67,6 +68,7 @@ class QuaNetTrainer(BaseQuantifier):
:param fit_learner: if true, trains the classifier on a split containing 40% of the data :param fit_learner: if true, trains the classifier on a split containing 40% of the data
:return: self :return: self
""" """
self._classes_ = data.classes_
classifier_data, unused_data = data.split_stratified(0.4) classifier_data, unused_data = data.split_stratified(0.4)
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20% train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
@ -256,6 +258,10 @@ class QuaNetTrainer(BaseQuantifier):
import shutil import shutil
shutil.rmtree(self.checkpointdir, ignore_errors=True) shutil.rmtree(self.checkpointdir, ignore_errors=True)
@property
def classes_(self):
    """Class labels recorded from the training data by ``fit``.

    The backing attribute is initialised to ``None`` in the constructor, so
    ``None`` is returned until ``fit`` has been called.
    """
    labels = self._classes_
    return labels
def mae_loss(output, target): def mae_loss(output, target):
return torch.mean(torch.abs(output - target)) return torch.mean(torch.abs(output - target))

View File

@ -2,18 +2,22 @@ from quapy.data import LabelledCollection
from .base import BaseQuantifier from .base import BaseQuantifier
class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier): class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
def __init__(self, **kwargs): def __init__(self, **kwargs):
pass self._classes_ = None
def fit(self, data: LabelledCollection, *args): def fit(self, data: LabelledCollection, *args):
self._classes_ = data.classes_
self.estimated_prevalence = data.prevalence() self.estimated_prevalence = data.prevalence()
def quantify(self, documents, *args): def quantify(self, documents, *args):
return self.estimated_prevalence return self.estimated_prevalence
@property
def classes_(self):
    """Class labels seen in the collection passed to ``fit``.

    Returns ``None`` while the estimator is still unfitted, because the
    constructor initialises the backing attribute to ``None``.
    """
    labels = self._classes_
    return labels
def get_params(self): def get_params(self):
pass pass

View File

@ -4,7 +4,6 @@ from copy import deepcopy
from typing import Union, Callable from typing import Union, Callable
import quapy as qp import quapy as qp
import quapy.functional as F
from quapy.data.base import LabelledCollection from quapy.data.base import LabelledCollection
from quapy.evaluation import artificial_sampling_prediction from quapy.evaluation import artificial_sampling_prediction
from quapy.method.aggregative import BaseQuantifier from quapy.method.aggregative import BaseQuantifier
@ -118,6 +117,7 @@ class GridSearchQ(BaseQuantifier):
def handler(signum, frame): def handler(signum, frame):
self.sout('timeout reached') self.sout('timeout reached')
raise TimeoutError() raise TimeoutError()
signal.signal(signal.SIGALRM, handler) signal.signal(signal.SIGALRM, handler)
self.sout(f'starting optimization with n_jobs={n_jobs}') self.sout(f'starting optimization with n_jobs={n_jobs}')
@ -175,6 +175,10 @@ class GridSearchQ(BaseQuantifier):
def quantify(self, instances): def quantify(self, instances):
return self.best_model_.quantify(instances) return self.best_model_.quantify(instances)
@property
def classes_(self):
    """Class labels of the model selected by the grid search.

    Delegates to ``best_model_``; if that attribute has not been set yet,
    the lookup raises ``AttributeError``.
    """
    selected = self.best_model_
    return selected.classes_
def set_params(self, **parameters): def set_params(self, **parameters):
self.param_grid = parameters self.param_grid = parameters
@ -185,4 +189,3 @@ class GridSearchQ(BaseQuantifier):
if hasattr(self, 'best_model_'): if hasattr(self, 'best_model_'):
return self.best_model_ return self.best_model_
raise ValueError('best_model called before fit') raise ValueError('best_model called before fit')

0
quapy/tests/__init__.py Normal file
View File

5
quapy/tests/test_base.py Normal file
View File

@ -0,0 +1,5 @@
import pytest
def test_import():
    """Smoke test: the package imports and exposes a version string."""
    import quapy as qp
    version = qp.__version__
    assert version is not None

View File

@ -0,0 +1,43 @@
import pytest
from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, fetch_reviews, fetch_twitter, fetch_UCIDataset
@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
def test_fetch_reviews(dataset_name):
    """Fetch one reviews dataset and print the stats of both of its splits."""
    reviews = fetch_reviews(dataset_name)
    print(f'Dataset {dataset_name}')
    # Same order as before: header first, then the stats of that split.
    for header, split_name in (('Training set stats', 'training'), ('Test set stats', 'test')):
        print(header)
        getattr(reviews, split_name).stats()
@pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN)
def test_fetch_twitter(dataset_name):
    """Fetch one Twitter dataset and print the stats of both of its splits.

    The "semeval" dataset is rejected by ``fetch_twitter`` unless it is
    requested for model selection; in that single case the fetch is retried
    with ``for_model_selection=True``. Any other ``ValueError`` is a genuine
    failure and is re-raised.
    """
    try:
        dataset = fetch_twitter(dataset_name)
    except ValueError as ve:
        expected_refusal = dataset_name == 'semeval' and str(ve).startswith(
            'dataset "semeval" can only be used for model selection.')
        if not expected_refusal:
            # Previously swallowed, which left `dataset` unbound and turned the
            # real error into an unrelated NameError further down.
            raise
        dataset = fetch_twitter(dataset_name, for_model_selection=True)
    print(f'Dataset {dataset_name}')
    print('Training set stats')
    dataset.training.stats()
    print('Test set stats')
    # The header above was printed without the stats ever being computed.
    dataset.test.stats()
@pytest.mark.parametrize('dataset_name', UCI_DATASETS)
def test_fetch_UCIDataset(dataset_name):
    """Fetch one UCI dataset and print the stats of both of its splits.

    "pageblocks.5" needs manual preparation before it can be loaded; when the
    loader reports that, the test is skipped. Any other ``FileNotFoundError``
    is a genuine failure and is re-raised.
    """
    try:
        dataset = fetch_UCIDataset(dataset_name)
    except FileNotFoundError as fnfe:
        # str(fnfe) also copes with OS-raised errors whose args[0] is an errno
        # integer, and `in` matches the hint even at the start of the message.
        needs_manual_setup = dataset_name == 'pageblocks.5' and (
            'If this is the first time you attempt to load this dataset' in str(fnfe))
        if not needs_manual_setup:
            # Previously swallowed, which left `dataset` unbound below.
            raise
        pytest.skip('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.')
    print(f'Dataset {dataset_name}')
    print('Training set stats')
    dataset.training.stats()
    print('Test set stats')
    # The header above was printed without the stats ever being computed.
    dataset.test.stats()

185
quapy/tests/test_methods.py Normal file
View File

@ -0,0 +1,185 @@
import numpy
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
from quapy.data import Dataset, LabelledCollection
from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS, EXPLICIT_LOSS_MINIMIZATION_METHODS
from quapy.method.aggregative import ACC, PACC, HDy
from quapy.method.meta import Ensemble
# Both datasets are fetched here, at module import (i.e. test-collection) time,
# and are reused by every test in this module that parametrizes over `datasets`.
datasets = [pytest.param(qp.datasets.fetch_twitter('hcr'), id='hcr'),
            pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]

# Classifier classes, not instances: tests call `learner()` to get a fresh one.
learners = [LogisticRegression, LinearSVC]
@pytest.mark.parametrize('dataset', datasets)
@pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS))
@pytest.mark.parametrize('learner', learners)
def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
    """Fit an aggregative quantifier and check that its MAE is a numpy float.

    Explicit-loss-minimization methods are excluded by the parametrization.
    Binary-only quantifiers are skipped on non-binary datasets.
    """
    model = aggregative_method(learner())

    if model.binary and not dataset.binary:
        # Report the case as skipped instead of letting it count as a pass.
        pytest.skip(f'skipping the test of binary model {type(model)} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()

    error = qp.error.mae(true_prevalences, estim_prevalences)
    assert isinstance(error, numpy.float64)
@pytest.mark.parametrize('dataset', datasets)
@pytest.mark.parametrize('elm_method', EXPLICIT_LOSS_MINIMIZATION_METHODS)
def test_elm_methods(dataset: Dataset, elm_method):
    """Fit an explicit-loss-minimization quantifier and check its MAE type.

    The test is skipped when the constructor reports that the SVMperf path is
    not valid, and when a binary-only quantifier meets a non-binary dataset.
    Any other ``AssertionError`` from the constructor is re-raised.
    """
    try:
        model = elm_method()
    except AssertionError as ae:
        if 'does not seem to point to a valid path' not in str(ae):
            # Previously swallowed, which left `model` unbound and surfaced as
            # an unrelated NameError on the next statement.
            raise
        pytest.skip('Missing SVMperf binary program, skipping test')

    if model.binary and not dataset.binary:
        # Report the case as skipped instead of letting it count as a pass.
        pytest.skip(f'skipping the test of binary model {model} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()

    error = qp.error.mae(true_prevalences, estim_prevalences)
    assert isinstance(error, numpy.float64)
@pytest.mark.parametrize('dataset', datasets)
@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS)
def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
    """Fit a non-aggregative quantifier and check that its MAE is a numpy float.

    Binary-only quantifiers are skipped on non-binary datasets.
    """
    model = non_aggregative_method()

    if model.binary and not dataset.binary:
        # Report the case as skipped instead of letting it count as a pass.
        pytest.skip(f'skipping the test of binary model {model} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()

    error = qp.error.mae(true_prevalences, estim_prevalences)
    assert isinstance(error, numpy.float64)
@pytest.mark.parametrize('base_method', AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS))
@pytest.mark.parametrize('learner', learners)
@pytest.mark.parametrize('dataset', datasets)
@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES)
def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
    """Fit a 5-member Ensemble for each policy and check its MAE type.

    Note that this sets the global ``qp.environ['SAMPLE_SIZE']`` to the size of
    the training set and does not restore it afterwards.
    """
    qp.environ['SAMPLE_SIZE'] = len(dataset.training)
    model = Ensemble(quantifier=base_method(learner()), size=5, policy=policy, n_jobs=-1)

    if model.binary and not dataset.binary:
        # Report the case as skipped instead of letting it count as a pass.
        pytest.skip(f'skipping the test of binary model {model} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()

    error = qp.error.mae(true_prevalences, estim_prevalences)
    assert isinstance(error, numpy.float64)
def test_quanet_method():
    """Train QuaNet on a 100-document sample of "kindle" and check its MAE type.

    Skipped when torch (and hence the neural classification module) cannot be
    imported. Runs on the GPU when one is available and on the CPU otherwise.
    """
    try:
        import torch
        import quapy.classification.neural
    except ModuleNotFoundError:
        pytest.skip('skipping QuaNet test due to missing torch package')

    # A hard-coded 'cuda' made the test fail on machines without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
    dataset = Dataset(dataset.training.sampling(100, *dataset.training.prevalence()),
                      dataset.test.sampling(100, *dataset.test.prevalence()))
    qp.data.preprocessing.index(dataset, min_df=5, inplace=True)

    from quapy.classification.neural import CNNnet
    cnn = CNNnet(dataset.vocabulary_size, dataset.training.n_classes)

    from quapy.classification.neural import NeuralClassifierTrainer
    learner = NeuralClassifierTrainer(cnn, device=device)

    from quapy.method.meta import QuaNet
    model = QuaNet(learner, sample_size=len(dataset.training), device=device)

    if model.binary and not dataset.binary:
        # Report the case as skipped instead of letting it count as a pass.
        pytest.skip(f'skipping the test of binary model {model} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()

    error = qp.error.mae(true_prevalences, estim_prevalences)
    assert isinstance(error, numpy.float64)
def models_to_test_for_str_label_names():
    """Build the quantifier instances used to parametrize the label-name test.

    Aggregative methods (explicit-loss-minimization ones excluded) come first,
    each wrapping a fresh LogisticRegression; non-aggregative methods follow,
    built with no arguments.
    """
    learner = LogisticRegression
    aggregative = [
        method(learner())
        for method in AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS)
    ]
    non_aggregative = [method() for method in NON_AGGREGATIVE_METHODS]
    return aggregative + non_aggregative
@pytest.mark.parametrize('model', models_to_test_for_str_label_names())
def test_str_label_names(model):
    """Check that string class labels give the same estimate as integer ones.

    The model is fitted twice on a 1000-document sample of "imdb": once with
    the original labels and once with 1 renamed to 'one' and everything else
    to 'zero'. The prevalence estimated for class 1 must then match the one
    estimated for class 'one', located through ``model.classes_``.
    """
    if type(model) in {ACC, PACC, HDy}:
        # Report the case as skipped instead of letting it count as a pass.
        pytest.skip(
            f'skipping the test of binary model {type(model)} because it currently does not support random seed control.')

    dataset = qp.datasets.fetch_reviews('imdb', pickle=True)
    dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()),
                      dataset.test.sampling(1000, *dataset.test.prevalence()))
    qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)

    # First pass: original labels.
    model.fit(dataset.training)

    int_estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()

    error = qp.error.mae(true_prevalences, int_estim_prevalences)
    assert isinstance(error, numpy.float64)

    # Second pass: same instances, labels replaced by strings.
    dataset_str = Dataset(LabelledCollection(dataset.training.instances,
                                             ['one' if label == 1 else 'zero' for label in dataset.training.labels]),
                          LabelledCollection(dataset.test.instances,
                                             ['one' if label == 1 else 'zero' for label in dataset.test.labels]))

    model.fit(dataset_str.training)

    str_estim_prevalences = model.quantify(dataset_str.test.instances)
    true_prevalences = dataset_str.test.prevalence()

    error = qp.error.mae(true_prevalences, str_estim_prevalences)
    assert isinstance(error, numpy.float64)

    print(true_prevalences)
    print(int_estim_prevalences)
    print(str_estim_prevalences)

    numpy.testing.assert_almost_equal(int_estim_prevalences[1],
                                      str_estim_prevalences[list(model.classes_).index('one')])

164
setup.py Normal file
View File

@ -0,0 +1,164 @@
from setuptools import setup, find_packages
import pathlib
# Absolute path of the directory containing this setup.py; the files read
# below are resolved against it.
here = pathlib.Path(__file__).parent.resolve()

# The README is read here and passed to setup() as the long description.
long_description = (here / 'README.md').read_text(encoding='utf-8')
def get_version(rel_path):
    """Return the version string assigned to ``__version__`` in a source file.

    :param rel_path: path of the file to scan, relative to the directory of
        this setup.py
    :return: the text between the first pair of quotes on the first line that
        starts with ``__version__``
    :raises RuntimeError: if no line starts with ``__version__``
    """
    source = (here / rel_path).read_text(encoding='utf-8')
    candidates = (text for text in source.split('\n') if text.startswith('__version__'))
    version_line = next(candidates, None)
    if version_line is None:
        raise RuntimeError("Unable to find version string.")
    # Accept either quoting style; double quotes win when both appear.
    quote = '"' if '"' in version_line else "'"
    return version_line.split(quote)[1]
# Packaging metadata for QuaPy.
# Arguments marked as "Required" below must be included for upload to PyPI.
# Fields marked as "Optional" may be omitted.
setup(
    # Distribution name, i.e. what users type after `pip install`. Naming
    # rules: https://packaging.python.org/specifications/core-metadata/#name
    name='QuaPy',  # Required

    # Read from the `__version__` assignment in quapy/__init__.py by
    # get_version(), so the version is declared in one place only.
    # Versions should comply with PEP 440.
    version=get_version("quapy/__init__.py"),  # Required

    # One-line summary ("Summary" metadata field).
    description='QuaPy: a framework for Quantification in Python',  # Optional

    # Full description ("Description" metadata field): the README contents
    # read at the top of this file.
    long_description=long_description,  # Optional

    # Declares the long description as Markdown ("Description-Content-Type"
    # metadata field); valid values are text/plain, text/x-rst and
    # text/markdown.
    long_description_content_type='text/markdown',  # Optional

    # Project homepage ("Home-Page" metadata field).
    url='https://github.com/HLT-ISTI/QuaPy',  # Optional

    maintainer='Alejandro Moreo',
    maintainer_email='alejandro.moreo@isti.cnr.it',

    # Trove classifiers describing maturity, audience, topic, license and
    # supported Python versions.
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Intended Audience :: Science/Research',
        'Programming Language :: Python',
        'Topic :: Software Development',
        'Topic :: Scientific/Engineering',
        'License :: OSI Approved :: BSD License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3 :: Only',
    ],

    keywords='machine learning, quantification, classification, prevalence estimation, priors estimate',

    # Ship the `quapy` package and all of its subpackages; nothing else from
    # the repository root is included.
    packages=find_packages(include=['quapy', 'quapy.*']),  # Required

    python_requires='>=3.6, <4',

    # Runtime dependencies.
    # NOTE(review): the package also imports numpy and joblib directly; they
    # are presumably pulled in transitively through scikit-learn -- consider
    # declaring them explicitly. No optional extras are declared here.
    install_requires=['scikit-learn', 'pandas', 'tqdm', 'matplotlib'],

    # Additional links for the project.
    project_urls={  # Optional
        'Contributors': 'https://github.com/HLT-ISTI/QuaPy/graphs/contributors',
        'Bug Reports': 'https://github.com/HLT-ISTI/QuaPy/issues',
        'Documentation': 'https://github.com/HLT-ISTI/QuaPy/wiki',
        'Source': 'https://github.com/HLT-ISTI/QuaPy/',
    },
)