
Added ensemble methods (policies ALL, ACC, Ptr, DS from Pérez-Gállego et al. 2017 and 2019) and some of the UCI ML datasets used in those articles (only 5 of the 32 datasets they used)

This commit is contained in:
Alejandro Moreo Fernandez 2021-01-06 14:58:29 +01:00
parent d8e2f7556e
commit 326a8ab803
16 changed files with 705 additions and 105 deletions
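
For orientation, a minimal usage sketch of the new ensemble wrappers, mirroring the updated test.py at the bottom of this diff (the dataset and learner choices are simply the ones used there):

import quapy as qp
from sklearn.linear_model import LogisticRegression

dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
learner = LogisticRegression(max_iter=1000)
# ensemble of 20 CC quantifiers, reduced to 10 members with the dynamic 'ds' policy
model = qp.method.meta.ECC(learner, size=20, red_size=10, policy='ds')
model.fit(dataset.training)
estim_prevalence = model.quantify(dataset.test.instances)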

View File

@ -9,4 +9,18 @@ negative class). This is not covered in this new implementation, in which the bi
an instance of single-label with 2 labels. Check
Add classnames to LabelledCollection ?
Check the overhead in OneVsAll for SVMperf-based (?)
Add HDy to QuaNet? if so, wrap HDy into OneVsAll in case the dataset is not binary.
Plots (one for binary -- the "diagonal" -- or for a specific class), another for the error as a function of drift.
Add datasets for topic.
Add other methods
Clarify whether QuaNet is an aggregative method or not.
Add datasets from Pérez-Gallego et al. 2017, 2019
Add ensemble models from Pérez-Gallego et al. 2017, 2019
Add plots models like those in Pérez-Gallego et al. 2017 (error boxes)
Add support for CV prediction in ACC and PACC for tpr, fpr
Add medium swap method
Explore the hyperparameter "number of bins" in HDy
Implement HDy for single-label?
Rename EMQ to SLD ?
How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
to one always?

View File

@ -1,8 +1,8 @@
from . import data
from . import error
from .data import datasets
from . import functional
from . import method
from . import error
from . import data
from . import evaluation
from method.aggregative import isaggregative, isprobabilistic

View File

@ -0,0 +1,38 @@
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
class PCALR:
def __init__(self, n_components=300, C=10, class_weight=None):
self.n_components = n_components
self.learner = LogisticRegression(C=C, class_weight=class_weight, max_iter=1000)
def get_params(self):
params = {'n_components': self.n_components}
params.update(self.learner.get_params())
return params
def set_params(self, **params):
if 'n_components' in params:
self.n_components = params['n_components']
del params['n_components']
self.learner.set_params(**params)
def fit(self, documents, labels):
self.pca = TruncatedSVD(self.n_components)
embedded = self.pca.fit_transform(documents, labels)
self.learner.fit(embedded, labels)
self.classes_ = self.learner.classes_
return self
def predict(self, documents):
embedded = self.transform(documents)
return self.learner.predict(embedded)
def predict_proba(self, documents):
embedded = self.transform(documents)
return self.learner.predict_proba(embedded)
def transform(self, documents):
return self.pca.transform(documents)
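
A hedged usage sketch of the new PCALR wrapper with synthetic data (the import path mirrors the non-relative style used elsewhere in this commit; the hyperparameter values are illustrative):

from sklearn.datasets import make_classification
from classification.methods import PCALR

X, y = make_classification(n_samples=200, n_features=500, random_state=0)
learner = PCALR(n_components=50, C=1)   # project to 50 dims with TruncatedSVD, then fit LR
learner.fit(X, y)
posteriors = learner.predict_proba(X)   # probabilities computed in the reduced space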

View File

@ -1,11 +1,9 @@
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import train_test_split
from quapy.functional import artificial_prevalence_sampling
from quapy.functional import artificial_prevalence_sampling, strprev
from scipy.sparse import vstack
from util import temp_seed
class LabelledCollection:
@ -130,6 +128,21 @@ class LabelledCollection:
labels = np.concatenate([self.labels, other.labels])
return LabelledCollection(join_instances, labels)
@property
def Xy(self):
return self.instances, self.labels
def stats(self):
ninstances = len(self)
instance_type = type(self.instances[0])
if instance_type == list:
nfeats = len(self.instances[0])
elif instance_type == np.ndarray:
nfeats = self.instances.shape[1]
else:
nfeats = '?'
print(f'#instances={ninstances}, type={instance_type}, features={nfeats}, n_classes={self.n_classes}, '
f'prevs={strprev(self.prevalence())}')
class Dataset:

View File

@ -2,13 +2,15 @@ import zipfile
from util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
import os
from os.path import join
from data.base import Dataset
from data.reader import from_text, from_sparse
from data.base import Dataset, LabelledCollection
from data.reader import *
from data.preprocessing import text2tfidf, reduce_columns
import pandas as pd
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',
TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders',
'semeval13', 'semeval14', 'semeval15', 'semeval16',
'sst', 'wa', 'wb']
@ -117,4 +119,88 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
return data
UCI_DATASETS = ['acute.a', 'acute.b',
'balance.1', 'balance.2', 'balance.3']
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
assert dataset_name in UCI_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
f'Valid ones are {UCI_DATASETS}'
if data_home is None:
data_home = get_quapy_home()
identifier_map = {
'acute.a': 'acute',
'acute.b': 'acute',
'balance.1': 'balance-scale',
'balance.2': 'balance-scale',
'balance.3': 'balance-scale',
}
dataset_fullname = {
'acute.a': 'Acute Inflammations (urinary bladder)',
'acute.b': 'Acute Inflammations (renal pelvis)',
'balance.1': 'Balance Scale Weight & Distance Database (left)',
'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
'balance.3': 'Balance Scale Weight & Distance Database (right)',
}
data_folder = {
'acute': 'diagnosis',
'balance-scale': 'balance-scale',
}
identifier = identifier_map[dataset_name]
URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
data_path = join(data_home, 'uci_datasets', identifier)
download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.data', f'{data_path}/{identifier}.data')
download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.names', f'{data_path}/{identifier}.names')
if verbose:
print(open(f'{data_path}/{identifier}.names', 'rt').read())
print(f'Loading {dataset_name} ({dataset_fullname[dataset_name]})')
if identifier == 'acute':
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, encoding='utf-16', sep='\t')
if dataset_name == 'acute.a':
y = binarize(df[6], pos_class='yes')
elif dataset_name == 'acute.b':
y = binarize(df[7], pos_class='yes')
mintemp, maxtemp = 35, 42
df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False)
[df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values
if identifier == 'balance-scale':
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
if dataset_name == 'balance.1':
y = binarize(df[0], pos_class='L')
elif dataset_name == 'balance.2':
y = binarize(df[0], pos_class='B')
elif dataset_name == 'balance.3':
y = binarize(df[0], pos_class='R')
X = df.loc[:, 1:].astype(float).values
data = LabelledCollection(X, y)
data.stats()
#print(df)
#print(df.loc[:, 0:5].values)
#print(y)
# X = __read_csv(f'{data_path}/{identifier}.data', separator='\t')
# print(X)
#X, y = from_csv(f'{data_path}/{dataset_name}.data')
#y, classnames = reindex_labels(y)
#def __read_csv(path, separator=','):
# x = []
# for instance in tqdm(open(path, 'rt', encoding='utf-16').readlines(), desc=f'reading {path}'):
# x.append(instance.strip().split(separator))
# return x
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
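
As in the (commented-out) line at the top of the updated test.py, the new loader can be invoked like this; in this commit it downloads the raw files, builds a LabelledCollection, and prints its stats:

import quapy as qp

# fetches the UCI files into the quapy home folder if missing; verbose=True also prints the .names file
qp.datasets.fetch_UCIDataset('acute.b', verbose=True)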

View File

@ -1,6 +1,7 @@
import numpy as np
from scipy.sparse import dok_matrix
from tqdm import tqdm
import pandas as pd
def from_text(path):
@ -55,3 +56,42 @@ def from_sparse(path):
y = np.asarray(all_labels) + 1
return X, y
def from_csv(path):
"""
Reads a csv file in which columns are separated by ','.
File format: <label>,<feat1>,<feat2>,...,<featn>\n
:param path: path to the csv file
:return: a ndarray (float) for the covariates and a ndarray for the labels
"""
X, y = [], []
for instance in tqdm(open(path, 'rt').readlines(), desc=f'reading {path}'):
yi, *xi = instance.strip().split(',')
X.append(list(map(float,xi)))
y.append(yi)
X = np.asarray(X)
y = np.asarray(y)
return X, y
def reindex_labels(y):
"""
Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
E.g., y=['B', 'B', 'A', 'C'] -> [1,1,0,2], ['A','B','C']
:param y: the list or array of original labels
:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.
"""
classnames = sorted(np.unique(y))
label2index = {label: index for index, label in enumerate(classnames)}
y = np.asarray(y)  # ensure ndarray so that y.shape works when a plain list is passed
indexed = np.empty(y.shape, dtype=np.int)
for label in classnames:
indexed[y==label] = label2index[label]
return indexed, classnames
def binarize(y, pos_class):
y = np.asarray(y)
ybin = np.zeros(y.shape, dtype=np.int)
ybin[y == pos_class] = 1
return ybin
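
A small sketch of the new reader helpers; 'some.csv' is a hypothetical file in the <label>,<feat1>,...,<featn> format documented above:

from data.reader import from_csv, reindex_labels, binarize

X, y = from_csv('some.csv')                    # hypothetical path
y_idx, classnames = reindex_labels(y)          # e.g., ['B','B','A','C'] -> [1,1,0,2], ['A','B','C']
y_bin = binarize(y, pos_class=classnames[-1])  # 1 for the chosen positive class, 0 otherwise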

View File

@ -77,6 +77,9 @@ def __check_eps(eps):
CLASSIFICATION_ERROR = {f1e, acce}
QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld}
CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
ERROR_NAMES = CLASSIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_NAMES
f1_error = f1e
acc_error = acce

View File

@ -1,10 +1,12 @@
from typing import Union, Callable, Iterable
from data import LabelledCollection
from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier
from method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier
from method.base import BaseQuantifier
from util import temp_seed
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
import error
def artificial_sampling_prediction(
@ -64,5 +66,19 @@ def artificial_sampling_prediction(
return true_prevalences, estim_prevalences
def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1):
if isinstance(err, str):
err = getattr(error, err)
assert err.__name__ in error.QUANTIFICATION_ERROR_NAMES, \
f'error={err} does not seem to be a quantification error'
scores = Parallel(n_jobs=n_jobs)(
delayed(_delayed_eval)(model, Ti, err) for Ti in test_samples
)
return np.mean(scores)
def _delayed_eval(model:BaseQuantifier, test:LabelledCollection, error:Callable):
prev_estim = model.quantify(test.instances)
prev_true = test.prevalence()
return error(prev_true, prev_estim)
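
A hedged sketch of the new evaluate helper, assuming a fitted quantifier `model` and a loaded `dataset` (as in test.py), and using LabelledCollection.sampling to draw test samples at chosen prevalences:

import quapy as qp

# mean MAE of the quantifier over a handful of test samples drawn at fixed prevalences
prevalences = [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2]]
samples = [dataset.test.sampling(500, *prev) for prev in prevalences]
mean_error = qp.evaluation.evaluate(model, samples, err='mae', n_jobs=-1)
print(f'mae={mean_error:.3f}')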

View File

@ -57,6 +57,37 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
return prevalences
def HellingerDistance(P, Q):
return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
#def uniform_simplex_sampling(n_classes):
# from https://cs.stackexchange.com/questions/3227/uniform-sampling-from-a-simplex
# r = [0.] + sorted(np.random.rand(n_classes-1)) + [1.]
# return np.asarray([b-a for a,b in zip(r[:-1],r[1:])])
def uniform_prevalence_sampling(n_classes, size=1):
if n_classes == 2:
u = np.random.rand(size)
u = np.vstack([1-u, u]).T
else:
# from https://cs.stackexchange.com/questions/3227/uniform-sampling-from-a-simplex
u = np.random.rand(size, n_classes-1)
u.sort(axis=-1)
_0s = np.zeros(shape=(size, 1))
_1s = np.ones(shape=(size, 1))
a = np.hstack([_0s, u])
b = np.hstack([u, _1s])
u = b-a
if size == 1:
u = u.flatten()
return u
#return np.asarray([uniform_simplex_sampling(n_classes) for _ in range(size)])
uniform_simplex_sampling = uniform_prevalence_sampling
def strprev(prevalences, prec=3):
return '['+ ', '.join([f'{p:.{prec}f}' for p in prevalences]) + ']'
@ -72,14 +103,17 @@ def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True):
def normalize_prevalence(prevalences):
assert prevalences.ndim==1, 'unexpected shape'
accum = prevalences.sum()
if accum > 0:
return prevalences / accum
prevalences = np.asarray(prevalences)
n_classes = prevalences.shape[-1]
accum = prevalences.sum(axis=-1, keepdims=True)
prevalences = np.true_divide(prevalences, accum, where=accum>0)
allzeros = accum.flatten()==0
if any(allzeros):
if prevalences.ndim == 1:
prevalences = np.full(shape=n_classes, fill_value=1./n_classes)
else:
# if all classifiers are trivial rejectors
return np.ones_like(prevalences) / prevalences.size
prevalences[accum.flatten()==0] = np.full(shape=n_classes, fill_value=1./n_classes)
return prevalences
def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1):
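
A quick illustration of the new uniform_prevalence_sampling (also aliased as uniform_simplex_sampling); every draw is a valid prevalence vector:

import numpy as np
import quapy.functional as F

prevs = F.uniform_prevalence_sampling(n_classes=3, size=5)   # shape (5, 3), rows sum to 1
assert (prevs >= 0).all() and np.allclose(prevs.sum(axis=-1), 1)
single = F.uniform_prevalence_sampling(n_classes=2)          # with size=1 a flat vector is returned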

View File

@ -1,23 +1,28 @@
from . import base
from . import aggregative as agg
from . import aggregative
from . import non_aggregative
from . import meta
AGGREGATIVE_METHODS = {
agg.ClassifyAndCount,
agg.AdjustedClassifyAndCount,
agg.ProbabilisticClassifyAndCount,
agg.ProbabilisticAdjustedClassifyAndCount,
agg.ExplicitLossMinimisation,
agg.ExpectationMaximizationQuantifier,
agg.HellingerDistanceY
aggregative.ClassifyAndCount,
aggregative.AdjustedClassifyAndCount,
aggregative.ProbabilisticClassifyAndCount,
aggregative.ProbabilisticAdjustedClassifyAndCount,
aggregative.ExplicitLossMinimisation,
aggregative.ExpectationMaximizationQuantifier,
aggregative.HellingerDistanceY
}
NON_AGGREGATIVE_METHODS = {
non_aggregative.MaximumLikelihoodPrevalenceEstimation
}
QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS
META_METHODS = {
meta.QuaNet
}
QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS | META_METHODS

View File

@ -2,7 +2,7 @@ import numpy as np
from copy import deepcopy
import functional as F
import error
from method.base import BaseQuantifier
from method.base import BaseQuantifier, BinaryQuantifier
from classification.svmperf import SVMperf
from data import LabelledCollection
from sklearn.metrics import confusion_matrix
@ -22,7 +22,7 @@ class AggregativeQuantifier(BaseQuantifier):
"""
@abstractmethod
def fit(self, data: LabelledCollection, fit_learner=True, *args): ...
def fit(self, data: LabelledCollection, fit_learner=True): ...
@property
def learner(self):
@ -35,12 +35,12 @@ class AggregativeQuantifier(BaseQuantifier):
def classify(self, instances):
return self.learner.predict(instances)
def quantify(self, instances, *args):
def quantify(self, instances):
classif_predictions = self.classify(instances)
return self.aggregate(classif_predictions, *args)
return self.aggregate(classif_predictions)
@abstractmethod
def aggregate(self, classif_predictions:np.ndarray, *args): ...
def aggregate(self, classif_predictions:np.ndarray): ...
def get_params(self, deep=True):
return self.learner.get_params()
@ -68,9 +68,9 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
def posterior_probabilities(self, data):
return self.learner.predict_proba(data)
def quantify(self, instances, *args):
def quantify(self, instances):
classif_posteriors = self.posterior_probabilities(instances)
return self.aggregate(classif_posteriors, *args)
return self.aggregate(classif_posteriors)
def set_params(self, **parameters):
if isinstance(self.learner, CalibratedClassifierCV):
@ -78,11 +78,6 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
self.learner.set_params(**parameters)
class BinaryQuantifier(BaseQuantifier):
def _check_binary(self, data : LabelledCollection, quantifier_name):
assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'
# Helper
# ------------------------------------
@ -144,18 +139,17 @@ class ClassifyAndCount(AggregativeQuantifier):
def __init__(self, learner):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, *args):
def fit(self, data: LabelledCollection, fit_learner=True):
"""
Trains the Classify & Count method unless _fit_learner_ is False, in which case the classifier is assumed to have been fit already.
:param data: training data
:param fit_learner: if False, the classifier is assumed to be fit
:param args: unused
:return: self
"""
self.learner, _ = training_helper(self.learner, data, fit_learner)
return self
def aggregate(self, classif_predictions, *args):
def aggregate(self, classif_predictions):
return F.prevalence_from_labels(classif_predictions, self.n_classes)
@ -186,7 +180,7 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
def classify(self, data):
return self.cc.classify(data)
def aggregate(self, classif_predictions, *args):
def aggregate(self, classif_predictions):
prevs_estim = self.cc.aggregate(classif_predictions)
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
@ -208,11 +202,11 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
def __init__(self, learner):
self.learner = learner
def fit(self, data : LabelledCollection, fit_learner=True, *args):
def fit(self, data : LabelledCollection, fit_learner=True):
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
return self
def aggregate(self, classif_posteriors, *args):
def aggregate(self, classif_posteriors):
return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
@ -235,14 +229,22 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
)
self.pcc = ProbabilisticClassifyAndCount(self.learner)
y_ = self.classify(validation.instances)
y_ = self.soft_classify(validation.instances)
y = validation.labels
confusion = np.empty(shape=(data.n_classes, data.n_classes))
for yi in range(data.n_classes):
confusion[yi] = y_[y==yi].mean(axis=0)
self.Pte_cond_estim_ = confusion.T
#y_ = self.classify(validation.instances)
#y = validation.labels
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
#self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
return self
def aggregate(self, classif_posteriors, *args):
def aggregate(self, classif_posteriors):
prevs_estim = self.pcc.aggregate(classif_posteriors)
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
@ -261,7 +263,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
def __init__(self, learner):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, *args):
def fit(self, data: LabelledCollection, fit_learner=True):
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
return self
@ -320,17 +322,17 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
self._check_binary(data, self.__class__.__name__)
self.learner, validation = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
Px = self.posterior_probabilities(validation.instances)
Px = self.posterior_probabilities(validation.instances)[:,1] # takes only the P(y=+1|x)
self.Pxy1 = Px[validation.labels == 1]
self.Pxy0 = Px[validation.labels == 0]
return self
def aggregate(self, classif_posteriors, *args):
def aggregate(self, classif_posteriors):
# "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
# and the final estimated a priori probability was taken as the median of these 11 estimates."
# (González-Castro, et al., 2013).
Px = classif_posteriors
Px = classif_posteriors[:,1] # takes only the P(y=+1|x)
prev_estimations = []
for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
@ -342,7 +344,7 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
prev_selected, min_dist = None, None
for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0):
Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density
hdy = HellingerDistanceY.HellingerDistance(Px_train, Px_test)
hdy = F.HellingerDistance(Px_train, Px_test)
if prev_selected is None or hdy < min_dist:
prev_selected, min_dist = prev, hdy
prev_estimations.append(prev_selected)
@ -350,10 +352,6 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
pos_class_prev = np.median(prev_estimations)
return np.asarray([1-pos_class_prev, pos_class_prev])
@classmethod
def HellingerDistance(cls, P, Q):
return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):
@ -362,13 +360,13 @@ class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):
self.loss = loss
self.kwargs = kwargs
def fit(self, data: LabelledCollection, fit_learner=True, *args):
def fit(self, data: LabelledCollection, fit_learner=True):
self._check_binary(data, self.__class__.__name__)
assert fit_learner, 'the method requires that fit_learner=True'
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
return self
def aggregate(self, classif_predictions:np.ndarray, *args):
def aggregate(self, classif_predictions:np.ndarray):
return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_)
def classify(self, X, y=None):
@ -423,23 +421,24 @@ class OneVsAll(AggregativeQuantifier):
self.binary_quantifier = binary_quantifier
self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, **kwargs):
def fit(self, data: LabelledCollection, fit_learner=True):
assert not data.binary, \
f'{self.__class__.__name__} expects non-binary data'
assert isinstance(self.binary_quantifier, BaseQuantifier), \
f'{self.binary_quantifier} does not seem to be a Quantifier'
assert fit_learner==True, 'fit_learner must be True'
if not isinstance(self.binary_quantifier, BinaryQuantifier):
raise ValueError(f'{self.binary_quantifier.__class__.__name__} does not seem to be an instance of '
f'{BinaryQuantifier.__class__.__name__}')
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
self.__parallel(self._delayed_binary_fit, data, **kwargs)
self.__parallel(self._delayed_binary_fit, data)
return self
def classify(self, instances):
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
return classif_predictions_bin.T
def aggregate(self, classif_predictions_bin, *args):
def aggregate(self, classif_predictions_bin):
assert set(np.unique(classif_predictions_bin)) == {0,1}, \
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
'predictions for each document (row) and class (columns)'
@ -450,7 +449,7 @@ class OneVsAll(AggregativeQuantifier):
#prevalences = np.asarray(prevalences)
return F.normalize_prevalence(prevalences)
def quantify(self, X, *args):
def quantify(self, X):
prevalences = self.__parallel(self._delayed_binary_quantify, X)
return F.normalize_prevalence(prevalences)
@ -480,9 +479,9 @@ class OneVsAll(AggregativeQuantifier):
def _delayed_binary_aggregate(self, c, classif_predictions):
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence
def _delayed_binary_fit(self, c, data, **kwargs):
def _delayed_binary_fit(self, c, data):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
self.dict_binary_quantifiers[c].fit(bindata, **kwargs)
self.dict_binary_quantifiers[c].fit(bindata)
def isaggregative(model):
@ -497,5 +496,3 @@ def isbinary(model):
return isinstance(model, BinaryQuantifier)
from . import neural
QuaNet = neural.QuaNetTrainer
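
With these changes, OneVsAll now insists on wrapping a BinaryQuantifier and on receiving non-binary data; a hedged sketch (the multiclass LabelledCollection `train` is a placeholder, not part of this commit):

from sklearn.linear_model import LogisticRegression
import quapy as qp

hdy = qp.method.aggregative.HellingerDistanceY(LogisticRegression(max_iter=1000))
ova = qp.method.aggregative.OneVsAll(hdy)   # raises ValueError if given a non-BinaryQuantifier
ova.fit(train)                              # train: a multiclass LabelledCollection (placeholder)
prevalences = ova.quantify(train.instances)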

View File

@ -1,15 +1,18 @@
from abc import ABCMeta, abstractmethod
from data import LabelledCollection
# Base Quantifier abstract class
# ------------------------------------
class BaseQuantifier(metaclass=ABCMeta):
@abstractmethod
def fit(self, data, *args): ...
def fit(self, data): ...
@abstractmethod
def quantify(self, instances, *args): ...
def quantify(self, instances): ...
@abstractmethod
def set_params(self, **parameters): ...
@ -18,6 +21,12 @@ class BaseQuantifier(metaclass=ABCMeta):
def get_params(self, deep=True): ...
class BinaryQuantifier(BaseQuantifier):
def _check_binary(self, data: LabelledCollection, quantifier_name):
assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'
# class OneVsAll(AggregativeQuantifier):
# """
# Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary

304
quapy/method/meta.py Normal file
View File

@ -0,0 +1,304 @@
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
import quapy as qp
from sklearn.model_selection import GridSearchCV, cross_val_predict
from model_selection import GridSearchQ
from .base import BaseQuantifier, BinaryQuantifier
from joblib import Parallel, delayed
from copy import deepcopy
from data import LabelledCollection
from quapy import functional as F
from . import neural
from evaluation import evaluate
QuaNet = neural.QuaNetTrainer
class Ensemble(BaseQuantifier):
VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES
"""
Methods from the articles:
Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100.
and
Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15.
"""
def __init__(self, quantifier: BaseQuantifier, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1):
assert policy in Ensemble.VALID_POLICIES, f'unknown policy={policy}; valid are {Ensemble.VALID_POLICIES}'
self.base_quantifier = quantifier
self.size = size
self.min_pos = min_pos
self.red_size = red_size
self.policy = policy
self.n_jobs = n_jobs
self.post_proba_fn = None
def fit(self, data: LabelledCollection):
if self.policy=='ds' and not data.binary:
raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
# randomly chooses the prevalences for each member of the ensemble (preventing classes with less than
# min_pos positive examples)
prevs = [_draw_simplex(ndim=data.n_classes, min_val=self.min_pos / len(data)) for _ in range(self.size)]
posteriors = None
if self.policy == 'ds':
# precompute the training posterior probabilities
posteriors, self.post_proba_fn = self.ds_policy_get_posteriors(data)
is_static_policy = (self.policy in qp.error.QUANTIFICATION_ERROR_NAMES)
self.ensemble = Parallel(n_jobs=self.n_jobs)(
delayed(_delayed_new_instance)(
self.base_quantifier, data, prev, posteriors, keep_samples=is_static_policy
) for prev in prevs
)
# self.ensemble = [deepcopy(self.base_quantifier) for _ in range(self.size)]
# self.prevs = [self._valid_simplex_sampling(data.n_classes, min_val=min_freq) for _ in range(self.size)]
# self.samples = [data.sampling(sample_size, *Pi) for Pi in self.prevs]
# Parallel(n_jobs=self.n_jobs)(
# delayed(_delayed_fit)(Qi, Si) for Si, Qi, in zip(self.samples, self.ensemble)
# )
# static selection policy (the name of a quantification-oriented error function to minimize)
if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES:
self.accuracy_policy(error_name=self.policy)
return self
def quantify(self, instances):
predictions = np.asarray(Parallel(n_jobs=self.n_jobs)(
delayed(_delayed_quantify)(Qi, instances) for Qi in self.ensemble
))
if self.policy == 'ptr':
predictions = self.ptr_policy(predictions)
elif self.policy == 'ds':
predictions = self.ds_policy(predictions, instances)
predictions = np.mean(predictions, axis=0)
return F.normalize_prevalence(predictions)
def set_params(self, **parameters):
raise NotImplementedError(f'{self.__class__.__name__} should not be used within GridSearchQ; '
f'instead, use GridSearchQ within Ensemble, or GridSearchCV within the '
f'base quantifier if it is an aggregative one.')
def get_params(self, deep=True):
raise NotImplementedError()
def accuracy_policy(self, error_name):
"""
Selects the red_size best-performing quantifiers in a static way (i.e., definitively discarding the non-selected members).
For each model in the ensemble, the performance is measured in terms of _error_name_ on the quantification of
the samples used for training the rest of the models in the ensemble.
"""
error = getattr(qp.error, error_name)
tests = [m[3] for m in self.ensemble]
scores = []
for i, model in enumerate(self.ensemble):
scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs))
order = np.argsort(scores)
self.ensemble = select_k(self.ensemble, order, k=self.red_size)
def ptr_policy(self, predictions):
"""
Selects the predictions made by the models that were trained on samples whose prevalence is most similar to a first
approximation of the test prevalence, as estimated by all models in the ensemble.
"""
test_prev_estim = predictions.mean(axis=0)
tr_prevs = [m[1] for m in self.ensemble]
ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs]
order = np.argsort(ptr_differences)
return select_k(predictions, order, k=self.red_size)
def ds_policy_get_posteriors(self, data: LabelledCollection):
"""
In the original article, this procedure is not described at a sufficient level of detail. The paper only says
that the distributions of posterior probabilities from training and test examples are compared by means of the
Hellinger Distance. However, how these posterior probabilities are generated is not specified. In the article,
a Logistic Regressor (LR) is used as the classification device, and it could be used for this purpose. However,
in general, a Quantifier is not necessarily an instance of an Aggregative Probabilistic Quantifier, so it cannot
be taken for granted that the quantifier is built on top of a probabilistic classifier. Additionally, it would
not be correct to generate the posterior probabilities of training documents with the very classifier they were
used to train.
This function thus generates the posterior probabilities for all training documents via cross-validation,
using an LR whose hyperparameters have previously been optimized via grid search in 5FCV.
:return: P,f, where P is an ndarray containing the posterior probabilities of the training data, generated via
cross-validation with an optimized LR, and f is the function to be used to generate the posterior probabilities
of test instances.
"""
X, y = data.Xy
lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
optim = GridSearchCV(
lr_base, param_grid={'C': np.logspace(-4,4,9)}, cv=5, n_jobs=self.n_jobs, refit=True
).fit(X, y)
posteriors = cross_val_predict(
optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba'
)
posteriors_generator = optim.best_estimator_.predict_proba
return posteriors, posteriors_generator
def ds_policy(self, predictions, test):
test_posteriors = self.post_proba_fn(test)
test_distribution = get_probability_distribution(test_posteriors)
tr_distributions = [m[2] for m in self.ensemble]
dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions]
order = np.argsort(dist)
return select_k(predictions, order, k=self.red_size)
def get_probability_distribution(posterior_probabilities, bins=8):
assert posterior_probabilities.shape[1]==2, 'the posterior probabilities do not seem to be for a binary problem'
posterior_probabilities = posterior_probabilities[:,1] # take the positive posteriors only
distribution, _ = np.histogram(posterior_probabilities, bins=bins, range=(0, 1), density=True)
return distribution
def select_k(elements, order, k):
return [elements[idx] for idx in order[:k]]
def _delayed_new_instance(base_quantifier, data:LabelledCollection, prev, posteriors, keep_samples):
model = deepcopy(base_quantifier)
sample_index = data.sampling_index(len(data), *prev)
sample = data.sampling_from_index(sample_index)
model.fit(sample)
tr_prevalence = sample.prevalence()
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)
def _delayed_fit(quantifier, data):
quantifier.fit(data)
def _delayed_quantify(quantifier, instances):
return quantifier[0].quantify(instances)
def _draw_simplex(ndim, min_val, max_trials=100):
"""
Returns a uniform sample from the ndim-dimensional simplex, but guarantees that all dimensions
are >= min_val (for min_val>0, this makes the sampling not truly uniform).
:param ndim: number of dimensions of the simplex
:param min_val: minimum class prevalence allowed. If larger than or equal to 1/ndim, a ValueError is raised
since there is no possible solution.
:param max_trials: maximum number of rejection-sampling trials before giving up
:return: a sample from the ndim-dimensional simplex that is uniform in S(ndim)-R, where S(ndim) is the simplex
and R is the subset of the simplex containing at least one dimension lower than min_val
"""
if min_val >= 1/ndim:
raise ValueError(f'no sample can be drawn from the {ndim}-dimensional simplex so that '
f'all its values are >={min_val} (try with a larger value for min_pos)')
trials = 0
while True:
u = F.uniform_simplex_sampling(ndim)
if all(u >= min_val):
return u
trials += 1
if trials >= max_trials:
raise ValueError(f'it looks like finding a random simplex with all its dimensions being '
f'>= {min_val} is unlikely (it failed after {max_trials} trials)')
def _instantiate_ensemble(learner, base_quantifier_class, param_grid, optim, sample_size, eval_budget, **kwargs):
if optim is None:
base_quantifier = base_quantifier_class(learner)
elif optim in qp.error.CLASSIFICATION_ERROR:
learner = GridSearchCV(learner, param_grid)
base_quantifier = base_quantifier_class(learner)
elif optim in qp.error.QUANTIFICATION_ERROR:
base_quantifier = GridSearchQ(base_quantifier_class(learner),
param_grid=param_grid,
sample_size=sample_size,
eval_budget=eval_budget,
error=optim)
else:
raise ValueError(f'value optim={optim} not understood')
return Ensemble(base_quantifier, **kwargs)
class EnsembleFactory(BaseQuantifier):
def __init__(self, learner, base_quantifier_class, param_grid=None, optim=None, sample_size=None, eval_budget=None,
size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1):
if param_grid is None and optim is not None:
raise ValueError(f'param_grid is None but optim was requested.')
error = self._check_error(optim)
self.model = _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, sample_size,
eval_budget, size=size, min_pos=min_pos, red_size=red_size,
policy=policy, n_jobs=n_jobs)
def fit(self, data):
return self.model.fit(data)
def quantify(self, instances):
return self.model.quantify(instances)
def set_params(self, **parameters):
return self.model.set_params(**parameters)
def get_params(self, deep=True):
return self.model.get_params(deep)
def _check_error(self, error):
if error is None:
return None
if error in qp.error.QUANTIFICATION_ERROR or error in qp.error.CLASSIFICATION_ERROR:
return error
elif isinstance(error, str):
assert error in qp.error.ERROR_NAMES, \
f'unknown error name; valid ones are {qp.error.ERROR_NAMES}'
return getattr(qp.error, error)
else:
raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
f'the name of an error function in {qp.error.ERROR_NAMES}')
class ECC(EnsembleFactory):
def __init__(self, learner, param_grid=None, optim=None, sample_size=None, eval_budget=None,
size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1):
super().__init__(
learner, qp.method.aggregative.CC, param_grid, optim, sample_size, eval_budget, size, min_pos,
red_size, policy, n_jobs
)
class EACC(EnsembleFactory):
def __init__(self, learner, param_grid=None, optim=None, sample_size=None, eval_budget=None,
size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1):
super().__init__(
learner, qp.method.aggregative.ACC, param_grid, optim, sample_size, eval_budget, size, min_pos,
red_size, policy, n_jobs
)
class EHDy(EnsembleFactory):
def __init__(self, learner, param_grid=None, optim=None, sample_size=None, eval_budget=None,
size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1):
super().__init__(
learner, qp.method.aggregative.HDy, param_grid, optim, sample_size, eval_budget, size, min_pos,
red_size, policy, n_jobs
)
class EEMQ(EnsembleFactory):
def __init__(self, learner, param_grid=None, optim=None, sample_size=None, eval_budget=None,
size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1):
super().__init__(
learner, qp.method.aggregative.EMQ, param_grid, optim, sample_size, eval_budget, size, min_pos,
red_size, policy, n_jobs
)
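
A hedged sketch of the factory classes with per-member model selection, along the lines of the commented-out EHDy example in test.py (the param_grid, sample_size, and eval_budget values are illustrative; `dataset` is assumed to be a binary Dataset, since HDy is a binary quantifier):

import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp

param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
# each HDy member gets tuned for 'mae' via GridSearchQ before joining the ensemble
model = qp.method.meta.EHDy(LogisticRegression(max_iter=1000),
                            param_grid=param_grid, optim='mae',
                            sample_size=500, eval_budget=500, n_jobs=-1)
model.fit(dataset.training)
estim_prevalence = model.quantify(dataset.test.instances)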

View File

@ -75,23 +75,26 @@ class QuaNetTrainer(BaseQuantifier):
# estimate the hard and soft stats tpr and fpr of the classifier
self.tr_prev = data.prevalence()
self.quantifiers = [
ClassifyAndCount(self.learner).fit(data, fit_learner=False),
AdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
ProbabilisticClassifyAndCount(self.learner).fit(data, fit_learner=False),
ProbabilisticAdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
ExpectationMaximizationQuantifier(self.learner).fit(data, fit_learner=False),
]
self.quantifiers = {
'cc': ClassifyAndCount(self.learner).fit(data, fit_learner=False),
'acc': AdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
'pcc': ProbabilisticClassifyAndCount(self.learner).fit(data, fit_learner=False),
'pacc': ProbabilisticAdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
'emq': ExpectationMaximizationQuantifier(self.learner).fit(data, fit_learner=False),
}
self.status = {
'tr-loss': -1,
'va-loss': -1,
}
nQ = len(self.quantifiers)
nC = data.n_classes
self.quanet = QuaNetModule(
doc_embedding_size=train_data.instances.shape[1],
n_classes=data.n_classes,
stats_size=len(self.quantifiers) * data.n_classes,
stats_size=nQ*nC + 2*nC*nC,
order_by=0 if data.binary else None,
**self.quanet_params
).to(self.device)
@ -119,10 +122,15 @@ class QuaNetTrainer(BaseQuantifier):
def get_aggregative_estims(self, posteriors):
label_predictions = np.argmax(posteriors, axis=-1)
prevs_estim = []
for quantifier in self.quantifiers:
for quantifier in self.quantifiers.values():
predictions = posteriors if isprobabilistic(quantifier) else label_predictions
prevs_estim.append(quantifier.aggregate(predictions))
return np.asarray(prevs_estim).flatten()
prevs_estim.extend(quantifier.aggregate(predictions))
# add the class-conditional predictions P(y'i|yj) from ACC and PACC
prevs_estim.extend(self.quantifiers['acc'].Pte_cond_estim_.flatten())
prevs_estim.extend(self.quantifiers['pacc'].Pte_cond_estim_.flatten())
return prevs_estim
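# illustrative check: with the five quantifiers above and a binary dataset (nQ=5, nC=2),
# the stats vector has nQ*nC + 2*nC*nC = 5*2 + 2*2*2 = 18 entries, matching the new stats_size
# passed to QuaNetModule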
def quantify(self, instances, *args):
posteriors = self.learner.predict_proba(instances)

View File

@ -4,11 +4,11 @@ from evaluation import artificial_sampling_prediction
from data.base import LabelledCollection
from method.aggregative import BaseQuantifier
from typing import Union, Callable
import quapy.functional as F
import functional as F
from copy import deepcopy
class GridSearchQ:
class GridSearchQ(BaseQuantifier):
def __init__(self,
model: BaseQuantifier,
@ -105,14 +105,14 @@ class GridSearchQ:
if error in qp.error.QUANTIFICATION_ERROR:
self.error = error
elif isinstance(error, str):
assert error in {func.__name__ for func in qp.error.QUANTIFICATION_ERROR}, \
f'unknown error name; valid ones are {qp.error.QUANTIFICATION_ERROR}'
assert error in qp.error.QUANTIFICATION_ERROR_NAMES, \
f'unknown error name; valid ones are {qp.error.QUANTIFICATION_ERROR_NAMES}'
self.error = getattr(qp.error, error)
else:
raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
f'the name of an error function in {qp.error.QUANTIFICATION_ERROR}')
f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')
def fit(self, training: LabelledCollection, validation: Union[LabelledCollection, float]):
def fit(self, training: LabelledCollection, validation: Union[LabelledCollection, float]=0.3):
"""
:param training: the training set on which to optimize the hyperparameters
:param validation: either a LabelledCollection on which to test the performance of the different settings, or
@ -158,5 +158,14 @@ class GridSearchQ:
self.sout(f'refitting on the whole development set')
self.best_model_.fit(training + validation)
return self.best_model_
return self
def quantify(self, instances):
return self.best_model_.quantify(instances)
def set_params(self, **parameters):
self.param_grid = parameters
def get_params(self, deep=True):
return self.param_grid

50
test.py
View File

@ -1,25 +1,35 @@
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
import sys
import numpy as np
from classification.methods import PCALR
from classification.neural import NeuralClassifierTrainer, CNNnet
from quapy.model_selection import GridSearchQ
#qp.datasets.fetch_UCIDataset('acute.b', verbose=True)
#sys.exit(0)
qp.environ['SAMPLE_SIZE'] = 500
#param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
max_evaluations = 5000
sample_size = qp.environ['SAMPLE_SIZE']
binary = True
svmperf_home = './svm_perf_quantification'
if binary:
dataset = qp.datasets.fetch_reviews('kindle', tfidf=False, min_df=5)
qp.data.preprocessing.index(dataset, inplace=True)
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
#qp.data.preprocessing.index(dataset, inplace=True)
else:
dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
# dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3)
print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')
@ -30,16 +40,26 @@ print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.tes
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.HellingerDistanceY(learner)
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
# model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes))
print(learner.get_params())
model = qp.method.aggregative.QuaNet(learner, sample_size, device='cpu')
#learner = PCALR()
#learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes))
#print(learner.get_params())
#model = qp.method.meta.QuaNet(learner, sample_size, device='cpu')
if qp.isbinary(model) and not qp.isbinary(dataset):
model = qp.method.aggregative.OneVsAll(model)
#learner = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=param_grid, n_jobs=-1, verbose=1)
learner = LogisticRegression(max_iter=1000)
model = qp.method.meta.ECC(learner, size=20, red_size=10, param_grid=None, optim=None, policy='ds')
#model = qp.method.meta.EHDy(learner, param_grid=param_grid, optim='mae',
# sample_size=sample_size, eval_budget=max_evaluations//10, n_jobs=-1)
#model = qp.method.aggregative.ClassifyAndCount(learner)
#if qp.isbinary(model) and not qp.isbinary(dataset):
# model = qp.method.aggregative.OneVsAll(model)
# Model fit and Evaluation on the test data
@ -49,6 +69,10 @@ print(f'fitting model {model.__class__.__name__}')
#train, val = dataset.training.split_stratified(0.6)
#model.fit(train, val_split=val)
model.fit(dataset.training)
#for i,e in enumerate(model.ensemble):
#print(i, e.learner.best_estimator_)
# print(i, e.best_model_.learner)
# estimating class prevalences
print('quantifying')
@ -67,7 +91,7 @@ print(f'mae={error:.3f}')
# Model fit and Evaluation according to the artificial sampling protocol
# ----------------------------------------------------------------------------
max_evaluations = 5000
n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that\n'
@ -76,7 +100,7 @@ print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints)
qp.error.SAMPLE_SIZE = sample_size
#qp.error.SAMPLE_SIZE = sample_size
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR:
score = error(true_prev, estim_prev)
@ -86,7 +110,7 @@ for error in qp.error.QUANTIFICATION_ERROR:
# Model selection and Evaluation according to the artificial sampling protocol
# ----------------------------------------------------------------------------
sys.exit(0)
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
model_selection = GridSearchQ(model,
param_grid=param_grid,
@ -96,8 +120,8 @@ model_selection = GridSearchQ(model,
refit=True,
verbose=True)
# model = model_selection.fit(dataset.training, validation=0.3)
model = model_selection.fit(train, validation=val)
model = model_selection.fit(dataset.training, validation=0.3)
#model = model_selection.fit(train, validation=val)
print(f'Model selection: best_params = {model_selection.best_params_}')
print(f'param scores:')
for params, score in model_selection.param_scores_.items():