
aggregation methods updated

Alejandro Moreo Fernandez 2020-12-09 12:46:50 +01:00
parent 9c8d29156c
commit 2361186a01
9 changed files with 71 additions and 53 deletions

quapy/__init__.py

@@ -1,4 +1,4 @@
-from .dataset import *
+from .data import *
 from . import functional
 from . import method
 from . import error

quapy/data/preprocessing.py

@@ -1,9 +1,10 @@
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from dataset.base import Dataset
+from data.base import Dataset
 from scipy.sparse import spmatrix
 from utils.util import parallelize
 from .base import LabelledCollection
+from tqdm import tqdm

 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
     :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
         consisting of lists of integer values representing indices.
     """
-    __check_type(dataset.training.instances, list, str)
-    __check_type(dataset.test.instances, list, str)
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)

     indexer = IndexTransformer(min_df=min_df, **kwargs)
     training_index = indexer.fit_transform(dataset.training.instances)
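For orientation (illustrative, not part of the commit): the indexing flow above can be exercised roughly as follows, using only names visible in this diff and assuming dataset holds raw text documents.

# minimal usage sketch of the IndexTransformer shown in this diff
indexer = IndexTransformer(min_df=5)
training_index = indexer.fit_transform(dataset.training.instances)
test_index = indexer.transform(dataset.test.instances)  # assumes n_jobs has a default
print(f'vocabulary size: {indexer.vocabulary_size()}')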
@@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
         f'unexpected type of element (expected {container_type}, found {type(container)})'

 class IndexTransformer:

     def __init__(self, **kwargs):
@@ -140,7 +140,7 @@ class IndexTransformer:
         return self.fit(X).transform(X, n_jobs=n_jobs)

     def vocabulary_size(self):
-        return len(self.vocabulary_) + 1  # the reserved unk token
+        return len(self.vocabulary_)

     def add_word(self, word):
         if word in self.vocabulary_:

quapy/method/__init__.py

@@ -1,5 +1,6 @@
+from . import base
 from . import aggregative as agg
-from . import non_aggregative as nagg
+from . import non_aggregative

 AGGREGATIVE_METHODS = {
@@ -13,13 +14,10 @@ AGGREGATIVE_METHODS = {
 }

 NON_AGGREGATIVE_METHODS = {
-    nagg.MaximumLikelihoodPrevalenceEstimation
+    non_aggregative.MaximumLikelihoodPrevalenceEstimation
 }

 QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS

-# common alisases
-MLPE = nagg.MaximumLikelihoodPrevalenceEstimation

quapy/method/aggregative.py

@@ -1,14 +1,14 @@
 import numpy as np
-from .base import *
-from ..error import mae
+from copy import deepcopy
 import functional as F
-from ..classification.svmperf import SVMperf
-from ..dataset import LabelledCollection
+import error
+from method.base import BaseQuantifier
+from quapy.classification.svmperf import SVMperf
+from quapy.data import LabelledCollection
 from sklearn.metrics import confusion_matrix
 from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
+from abc import abstractmethod

 # Abstract classes
@@ -23,6 +23,14 @@ class AggregativeQuantifier(BaseQuantifier):
     @abstractmethod
     def fit(self, data: LabelledCollection, fit_learner=True, *args): ...

+    @property
+    def learner(self):
+        return self.learner_
+
+    @learner.setter
+    def learner(self, value):
+        self.learner_ = value
+
     def classify(self, instances):
         return self.learner.predict(instances)
@@ -69,12 +77,12 @@ def training_helper(learner,
     Training procedure common to all Aggregative Quantifiers.
     :param learner: the learner to be fit
     :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
-    :param fit_learner: whether or not to fit the learner
+    :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
         learner is not probabilistic, then a CalibratedCV instance of it is trained)
     :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
     :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
-        or None otherwise)
+        or None otherwise) to be used as a validation set for any subsequent parameter fitting
     """
     if fit_learner:
         if ensure_probabilistic:
@@ -239,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
             # M-step: qs_pos is Ps+1(y=+1)
             qs = ps.mean(axis=0)

-            if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
+            if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
                 converged = True

             qs_prev_ = qs
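For context: the convergence test now names the module providing mae explicitly; the quantity is just the mean absolute error between two consecutive prevalence estimates, as in this illustrative stand-in.

import numpy as np

def mae(prevs, prevs_hat):
    # mean absolute difference between two prevalence vectors
    return np.abs(np.asarray(prevs) - np.asarray(prevs_hat)).mean()

# the EM loop stops once successive estimates are within epsilon, e.g.
# mae([0.3, 0.7], [0.29, 0.71]) == 0.01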
@@ -265,7 +273,8 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
         self.learner = learner

     def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
-        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
+                            f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
         self.learner, validation = training_helper(
             self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
         Px = self.soft_classify(validation.instances)
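For context (a sketch of the standard definition, not the class's internal code): HellingerDistanceY compares distributions of posterior probabilities, such as histograms of the Px computed above, via the Hellinger distance.

import numpy as np

def hellinger(p, q):
    # Hellinger distance between two discrete distributions p and q,
    # e.g. normalized histograms of posterior probabilities
    p, q = np.asarray(p), np.asarray(q)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2) / 2)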
@@ -304,15 +313,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
 class OneVsAll(AggregativeQuantifier):
+    """
+    Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
+    """

-    def __init__(self, binary_method, n_jobs=-1, **kwargs):
+    def __init__(self, binary_method, n_jobs=-1):
         self.binary_method = binary_method
         self.n_jobs = n_jobs
-        self.kwargs = kwargs

     def fit(self, data: LabelledCollection, **kwargs):
         assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
-        self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_}
+        assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
+        self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
         Parallel(n_jobs=self.n_jobs, backend='threading')(
             delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
         )
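For context: the l1-normalization mentioned in the new docstring rescales the independent per-class estimates so that they form a proper prevalence vector; a minimal sketch with a hypothetical input list.

import numpy as np

def normalize_prevalence(binary_estimates):
    # each binary quantifier estimates its own class prevalence in isolation;
    # dividing by the sum makes the estimates add up to 1
    estimates = np.asarray(binary_estimates, dtype=float)
    return estimates / estimates.sum()

# e.g. normalize_prevalence([0.5, 0.3, 0.4]) -> [0.4167, 0.25, 0.3333]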
@@ -332,10 +345,10 @@ class OneVsAll(AggregativeQuantifier):
         return sorted(self.class_method.keys())

     def set_params(self, **parameters):
-        self.kwargs=parameters
+        self.binary_method.set_params(**parameters)

     def get_params(self, deep=True):
-        return self.kwargs
+        return self.binary_method.get_params()

     def _delayed_binary_predict(self, c, learners, X):
         return learners[c].classify(X).mean()  # the mean is the estimation for the positive class prevalence
@@ -346,6 +359,12 @@ class OneVsAll(AggregativeQuantifier):
 class ExplicitLossMinimisation(AggregativeQuantifier):
+    """
+    A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
+    quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
+    This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+    Social Network Analysis and Mining 6(19), 1-22 (2016)
+    """

     def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
@@ -354,16 +373,9 @@ class ExplicitLossMinimisation(AggregativeQuantifier):
     def fit(self, data: LabelledCollection, fit_learner=True, *args):
         assert fit_learner, 'the method requires that fit_learner=True'
-        if data.binary:
-            self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
-        else:
-            self.learner = OneVsAll(
-                binary_method=ExplicitLossMinimisationBinary,
-                n_jobs=-1,
-                svmperf_base=self.svmperf_base,
-                loss=self.loss,
-                **self.kwargs
-            )
+        self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
+        if not data.binary:
+            self.learner = OneVsAll(self.learner, n_jobs=-1)
         return self.learner.fit(data, *args)

     def quantify(self, instances, *args):
@@ -393,6 +405,7 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
         return self.learner.predict(X)

 class SVMQ(ExplicitLossMinimisation):

     def __init__(self, svmperf_base, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)

quapy/method/base.py

@@ -1,5 +1,4 @@
 from abc import ABCMeta, abstractmethod
-import quapy as qp

 # Base Quantifier abstract class
@@ -7,7 +6,7 @@ import quapy as qp
 class BaseQuantifier(metaclass=ABCMeta):

     @abstractmethod
-    def fit(self, data: qp.LabelledCollection, *args): ...
+    def fit(self, data, *args): ...

     @abstractmethod
     def quantify(self, instances, *args): ...

test.py

@@ -2,23 +2,25 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
+from method.aggregative import OneVsAll

 # load a textual binary dataset and create a tfidf bag of words
+#from method.aggregative import OneVsAll, BaseQuantifier
 train_path = './datasets/reviews/kindle/train.txt'
 test_path = './datasets/reviews/kindle/test.txt'
-dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
-dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
-dataset.test = dataset.test.sampling(500, 0.6, 0.4)
-qp.preprocessing.text2tfidf(dataset, inplace=True)
-qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
+#dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
+#dataset.test = dataset.test.sampling(500, 0.6, 0.4)
+#qp.preprocessing.text2tfidf(dataset, inplace=True)
+#qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)

 # load a sparse matrix ternary dataset
-#train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
-#test_path = './datasets/twitter/test/sst.test.feature.txt'
-#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
-#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5)
-#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
+train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
+test_path = './datasets/twitter/test/sst.test.feature.txt'
+dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+dataset.training = dataset.training.sampling(500, 0.3, 0.4, 0.3)
+dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)

 # training a quantifier
 learner = LogisticRegression()
@@ -30,17 +32,23 @@ learner = LogisticRegression()
 # q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
 # q = qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000)
 # q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000)
-q = qp.method.aggregative.HDy(learner)
-q.fit(dataset.training)
+#model = qp.method.aggregative.HDy(learner)
+#
+model = qp.method.aggregative.HDy(learner)
+model = OneVsAll(model)
+print(model.get_params())
+model.fit(dataset.training)

 # estimating class prevalences
-prevalences_estim = q.quantify(dataset.test.instances)
+prevalences_estim = model.quantify(dataset.test.instances)
 prevalences_true = dataset.test.prevalence()

 # evaluation (one single prediction)
 error = qp.error.mae(prevalences_true, prevalences_estim)
-print(f'method {q.__class__.__name__}')
+print(f'method {model.__class__.__name__}')
 print(f'true prevalence {F.strprev(prevalences_true)}')
 print(f'estim prevalence {F.strprev(prevalences_estim)}')
 print(f'MAE={error:.3f}')