Compare commits

..

4 Commits

6 changed files with 101 additions and 5 deletions

View File

@ -1,3 +1,9 @@
Change Log 0.1.9
----------------
<...>
Change Log 0.1.8
----------------

View File

@ -11,7 +11,7 @@ from . import util
from . import model_selection
from . import classification
__version__ = '0.1.8'
__version__ = '0.1.9'
environ = {
'SAMPLE_SIZE': None,

View File

@ -52,7 +52,7 @@ class KDEBase:
"""
return np.exp(kde.score_samples(X))
def get_mixture_components(self, X, y, n_classes, bandwidth):
def get_mixture_components(self, X, y, classes, bandwidth):
"""
Returns an array containing the mixture components, i.e., the KDE functions for each class.
@ -62,7 +62,7 @@ class KDEBase:
:param bandwidth: float, the bandwidth of the kernel
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
"""
return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
return [self.get_kde_function(X[y == cat], bandwidth) for cat in classes]
@ -114,7 +114,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
self.random_state=random_state
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
return self
def aggregate(self, posteriors: np.ndarray):
@ -196,7 +196,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
self.montecarlo_trials = montecarlo_trials
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
N = self.montecarlo_trials
rs = self.random_state

View File

@ -640,6 +640,8 @@ class EMQ(AggregativeSoftQuantifier):
raise ValueError('invalid param argument for recalibration method; available ones are '
'"nbvs", "bcts", "ts", and "vs".')
if not np.issubdtype(y.dtype, np.number):
y = np.searchsorted(data.classes_, y)
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
if self.exact_train_prev:
@ -681,6 +683,11 @@ class EMQ(AggregativeSoftQuantifier):
"""
Px = posterior_probabilities
Ptr = np.copy(tr_prev)
if np.product(Ptr) == 0: # some entry is 0; we should smooth the values to avoid 0 division
Ptr += epsilon
Ptr /= Ptr.sum()
qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence
s, converged = 0, False

View File

@ -1,5 +1,6 @@
from typing import Union, Callable
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from quapy.functional import get_divergence
from quapy.data import LabelledCollection
@ -146,6 +147,53 @@ class DMx(BaseQuantifier):
return F.argmin_prevalence(loss, n_classes, method=self.search)
class ReadMe(BaseQuantifier):
    """
    Work-in-progress quantifier operating on binary bag-of-words features. Prevalence
    estimates are obtained by averaging over bootstrap rounds (random subsets of documents),
    each of which averages over bagging rounds (random subsets of features).

    NOTE(review): :meth:`std_constrained_linear_ls` is still a stub that returns None, so
    :meth:`quantify` cannot produce meaningful estimates until it is implemented.

    :param bootstrap_trials: number of bootstrap rounds
    :param bootstrap_range: number of documents drawn (without replacement) in each bootstrap round
    :param bagging_trials: number of bagging rounds carried out within each bootstrap round
    :param bagging_range: number of features drawn (without replacement) in each bagging round
    :param vectorizer_kwargs: additional keyword arguments passed on to `CountVectorizer`
    """

    def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
        self.bootstrap_trials = bootstrap_trials
        self.bootstrap_range = bootstrap_range
        self.bagging_trials = bagging_trials
        self.bagging_range = bagging_range
        self.vectorizer_kwargs = vectorizer_kwargs

    def fit(self, data: LabelledCollection):
        """
        Fits a binary `CountVectorizer` on the training documents and stores, for each class,
        the rows of the vectorized matrix that belong to it.

        :param data: the training set; `data.Xy` is unpacked as (documents, labels)
        :return: self
        """
        X, y = data.Xy
        self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
        X = self.vectorizer.fit_transform(X)
        # iterate over the class labels themselves; `range(data.classes_)` fails since classes_ is an array
        self.class_conditional_X = {label: X[y == label] for label in data.classes_}
        return self

    def quantify(self, instances):
        """
        Vectorizes `instances` and averages the outputs of :meth:`std_constrained_linear_ls`
        first across bagging rounds and then across bootstrap rounds.

        :param instances: the documents to quantify, in a format accepted by the fitted vectorizer
        :return: the mean, across bootstrap rounds, of the per-round bagging averages
        """
        X = self.vectorizer.transform(instances)
        num_docs, num_feats = X.shape

        # bootstrap
        p_boots = []
        for _ in range(self.bootstrap_trials):
            docs_idx = np.random.choice(num_docs, size=self.bootstrap_range, replace=False)
            # NOTE(review): docs_idx is drawn over the rows of the *test* matrix but is also used
            # to index every training class-conditional matrix; confirm this is intended, since it
            # goes out of bounds whenever a class has fewer rows than the largest sampled index
            class_conditional_X = {label: Xc[docs_idx] for label, Xc in self.class_conditional_X.items()}
            Xboot = X[docs_idx]

            # bagging
            p_bags = []
            for _ in range(self.bagging_trials):
                feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
                class_conditional_Xbag = {label: Xc[:, feat_idx] for label, Xc in class_conditional_X.items()}
                Xbag = Xboot[:, feat_idx]
                p_bags.append(self.std_constrained_linear_ls(Xbag, class_conditional_Xbag))

            p_boots.append(np.mean(p_bags, axis=0))

        return np.mean(p_boots, axis=0)

    def std_constrained_linear_ls(self, X, class_cond_X: dict):
        """
        Placeholder for the per-bag estimator; not implemented yet (returns None).

        :param X: the (bootstrapped and bagged) matrix of the instances to quantify
        :param class_cond_X: dict mapping each class label to its (bootstrapped and bagged) training matrix
        """
        pass
def _get_features_range(X):
feat_ranges = []

View File

@ -56,6 +56,7 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
:param seed: the numeric seed
:param asarray: set to True to return a np.ndarray instead of a list
:param backend: indicates the backend used for handling parallel works
:param open_args: if True, then the delayed function is called on *args_i, instead of on args_i
"""
def func_dec(environ, seed, *args):
qp.environ = environ.copy()
@ -74,6 +75,40 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
return out
def parallel_unpack(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
    """
    Runs `func` over every element of `args` in parallel, unpacking each element as the
    positional arguments of the call. It is a wrapper around:

    >>> Parallel(n_jobs=n_jobs)(
    >>>      delayed(func)(*args_i) for args_i in args
    >>> )

    in which every job silently receives a copy of `quapy.environ` (with `N_JOBS` set to 1).
    When `seed` is given, the i-th job runs inside `qp.util.temp_seed(seed + i)`, so that
    results are reproducible when n_jobs>1.

    :param func: callable
    :param args: iterable in which every element holds the positional arguments of one call to `func`
    :param n_jobs: number of parallel workers
    :param seed: the numeric seed, or None for not seeding the jobs
    :param asarray: set to True to return a np.ndarray instead of a list
    :param backend: indicates the backend used for handling parallel works
    :return: the outputs of all calls, as a np.ndarray if `asarray` is True, or as a list otherwise
    """
    def _run_job(environ, job_seed, *job_args):
        # replicate the caller's environment in this job, forcing it to be sequential
        qp.environ = environ.copy()
        qp.environ['N_JOBS'] = 1
        # the temporary-seed context is entered only if a seed was requested
        with ExitStack() as stack:
            if job_seed is not None:
                stack.enter_context(qp.util.temp_seed(job_seed))
            return func(*job_args)

    def _seed_for(job_index):
        # every job gets its own seed, displaced by its position in `args`
        return None if seed is None else seed + job_index

    executor = Parallel(n_jobs=n_jobs, backend=backend)
    results = executor(
        delayed(_run_job)(qp.environ, _seed_for(i), *args_i) for i, args_i in enumerate(args)
    )
    return np.asarray(results) if asarray else results
@contextlib.contextmanager
def temp_seed(random_state):
"""