forked from moreo/QuaPy
commit 2361186a01 (parent 9c8d29156c): aggregation methods updated
@@ -1,4 +1,4 @@
-from .dataset import *
+from .data import *
 from . import functional
 from . import method
 from . import error
@@ -1,9 +1,10 @@
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from dataset.base import Dataset
+from data.base import Dataset
 from scipy.sparse import spmatrix
 from utils.util import parallelize
 from .base import LabelledCollection
+from tqdm import tqdm


 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
     :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
         consisting of lists of integer values representing indices.
     """
-    __check_type(dataset.training.instances, list, str)
-    __check_type(dataset.test.instances, list, str)
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)

     indexer = IndexTransformer(min_df=min_df, **kwargs)
     training_index = indexer.fit_transform(dataset.training.instances)
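Aside: the re-typed checks above mean index() now expects the raw documents as a numpy array of strings rather than a Python list. A minimal sketch of the conversion a caller would need; the variable name docs is illustrative, not part of this commit:

    import numpy as np

    docs = ['a first document', 'a second document']  # plain Python list of str
    docs = np.asarray(docs, dtype=object)             # what the new __check_type accepts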
@@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
         f'unexpected type of element (expected {container_type}, found {type(container)})'

-

 class IndexTransformer:

     def __init__(self, **kwargs):
@@ -140,7 +140,7 @@ class IndexTransformer:
         return self.fit(X).transform(X, n_jobs=n_jobs)

     def vocabulary_size(self):
-        return len(self.vocabulary_) + 1 # the reserved unk token
+        return len(self.vocabulary_)

     def add_word(self, word):
         if word in self.vocabulary_:
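Note the behavioural change: vocabulary_size() no longer adds one for a reserved unknown-word slot. Any caller that sizes a structure from it, such as an embedding table with a dedicated out-of-vocabulary row, now has to account for that row itself. A sketch, assuming vocabulary_ maps words to contiguous indices:

    vocabulary_ = {'the': 0, 'cat': 1}  # illustrative word-to-index map
    unk_id = len(vocabulary_)           # id reserved for unknown words
    num_rows = len(vocabulary_) + 1     # the +1 is no longer inside vocabulary_size()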
@@ -1,5 +1,6 @@
+from . import base
 from . import aggregative as agg
-from . import non_aggregative as nagg
+from . import non_aggregative


 AGGREGATIVE_METHODS = {
@@ -13,13 +14,10 @@ AGGREGATIVE_METHODS = {
 }

 NON_AGGREGATIVE_METHODS = {
-    nagg.MaximumLikelihoodPrevalenceEstimation
+    non_aggregative.MaximumLikelihoodPrevalenceEstimation
 }

 QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS


-# common alisases
-MLPE = nagg.MaximumLikelihoodPrevalenceEstimation
-

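Since the registries are plain Python sets of classes, the union above enumerates every quantifier the package exposes. A short sketch of how that registry might be consumed, assuming the module layout shown in this diff (quapy.method):

    from quapy.method import QUANTIFICATION_METHODS

    # list every registered quantifier class, aggregative or not
    for cls in sorted(QUANTIFICATION_METHODS, key=lambda c: c.__name__):
        print(cls.__name__)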
@@ -1,14 +1,14 @@
 import numpy as np
-from .base import *
-from ..error import mae
+from copy import deepcopy
 import functional as F
-from ..classification.svmperf import SVMperf
-from ..dataset import LabelledCollection
+import error
+from method.base import BaseQuantifier
+from quapy.classification.svmperf import SVMperf
+from quapy.data import LabelledCollection
 from sklearn.metrics import confusion_matrix
 from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
-
-
+from abc import abstractmethod


 # Abstract classes
@@ -23,6 +23,14 @@ class AggregativeQuantifier(BaseQuantifier):
     @abstractmethod
     def fit(self, data: LabelledCollection, fit_learner=True, *args): ...

+    @property
+    def learner(self):
+        return self.learner_
+
+    @learner.setter
+    def learner(self, value):
+        self.learner_ = value
+
     def classify(self, instances):
         return self.learner.predict(instances)

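The property pair added above routes every read and write of learner through the private attribute learner_, so wrappers and subclasses can rebind the underlying estimator at any point. A self-contained sketch of the mechanism; the class name is illustrative:

    class HasLearner:
        @property
        def learner(self):
            return self.learner_

        @learner.setter
        def learner(self, value):
            self.learner_ = value

    obj = HasLearner()
    obj.learner = 'any sklearn-like estimator'  # stored via the setter
    assert obj.learner is obj.learner_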
@@ -69,12 +77,12 @@ def training_helper(learner,
     Training procedure common to all Aggregative Quantifiers.
     :param learner: the learner to be fit
     :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
-    :param fit_learner: whether or not to fit the learner
+    :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
         learner is not probabilistic, then a CalibratedCV instance of it is trained)
     :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
     :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
-        or None otherwise)
+        or None otherwise) to be used as a validation set for any subsequent parameter fitting
     """
     if fit_learner:
         if ensure_probabilistic:
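Per the updated docstring, with train_val_split=0.6 the learner is fit on 60% of the data and the held-out 40% is returned for later validation. A rough equivalent using scikit-learn directly, since calling training_helper itself requires the package's LabelledCollection; X and y are toy stand-ins:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    X, y = np.random.rand(100, 5), np.random.randint(0, 2, 100)
    Xtr, Xval, ytr, yval = train_test_split(X, y, train_size=0.6, stratify=y)
    learner = LogisticRegression().fit(Xtr, ytr)  # fit on the 60% split
    # (Xval, yval) plays the role of the returned validation collection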
@@ -239,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
             # M-step: qs_pos is Ps+1(y=+1)
             qs = ps.mean(axis=0)

-            if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
+            if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
                 converged = True

             qs_prev_ = qs
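For orientation, the stopping rule reads: after at least ten iterations, stop once the mean absolute error between consecutive prevalence estimates drops below epsilon. A self-contained toy version; mae below is an illustrative stand-in for the module-level error.mae used in the diff:

    import numpy as np

    def mae(p, q):
        return np.abs(np.asarray(p) - np.asarray(q)).mean()

    epsilon, qs_prev_ = 1e-4, None
    for s in range(1000):
        qs = np.array([0.5 + 0.5/(s + 2), 0.5 - 0.5/(s + 2)])  # toy converging M-step
        if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s > 10:
            break
        qs_prev_ = qs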
@@ -265,7 +273,8 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
         self.learner = learner

     def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
-        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
+                            f'Use the class OneVsAll to enable {self.__class__.__name__} to work on single-label data.'
         self.learner, validation = training_helper(
             self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
         Px = self.soft_classify(validation.instances)
@@ -304,15 +313,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):


 class OneVsAll(AggregativeQuantifier):
+    """
+    Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
+    """

-    def __init__(self, binary_method, n_jobs=-1, **kwargs):
+    def __init__(self, binary_method, n_jobs=-1):
         self.binary_method = binary_method
         self.n_jobs = n_jobs
-        self.kwargs = kwargs

     def fit(self, data: LabelledCollection, **kwargs):
         assert not data.binary, f'{self.__class__.__name__} expects non-binary data'
-        self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_}
+        assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
+        self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
         Parallel(n_jobs=self.n_jobs, backend='threading')(
             delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
         )
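The l1-normalization mentioned in the new docstring, concretely: each per-class binary quantifier returns an independent positive-class prevalence, and dividing by the sum turns those into a proper distribution. The numbers are illustrative:

    import numpy as np

    per_class = np.array([0.20, 0.45, 0.15])   # independent estimates; need not sum to 1
    prevalences = per_class / per_class.sum()  # l1 normalization
    print(prevalences, prevalences.sum())      # a proper distribution summing to 1.0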
@@ -332,10 +345,10 @@ class OneVsAll(AggregativeQuantifier):
         return sorted(self.class_method.keys())

     def set_params(self, **parameters):
-        self.kwargs=parameters
+        self.binary_method.set_params(**parameters)

     def get_params(self, deep=True):
-        return self.kwargs
+        return self.binary_method.get_params()

     def _delayed_binary_predict(self, c, learners, X):
         return learners[c].classify(X).mean()  # the mean is the estimation for the positive class prevalence
@@ -346,6 +359,12 @@ class OneVsAll(AggregativeQuantifier):


 class ExplicitLossMinimisation(AggregativeQuantifier):
+    """
+    A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
+    quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
+    This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+    Social Network Analysis and Mining 6(19), 1–22 (2016)
+    """

     def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
@@ -354,16 +373,9 @@ class ExplicitLossMinimisation(AggregativeQuantifier):

     def fit(self, data: LabelledCollection, fit_learner=True, *args):
         assert fit_learner, 'the method requires that fit_learner=True'
-        if data.binary:
-            self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
-        else:
-            self.learner = OneVsAll(
-                binary_method=ExplicitLossMinimisationBinary,
-                n_jobs=-1,
-                svmperf_base=self.svmperf_base,
-                loss=self.loss,
-                **self.kwargs
-            )
+        self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
+        if not data.binary:
+            self.learner = OneVsAll(self.learner, n_jobs=-1)
         return self.learner.fit(data, *args)

     def quantify(self, instances, *args):
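This refactor also changes the OneVsAll contract: it now takes a ready-made quantifier instance and deep-copies it once per class (see the deepcopy in its fit above) instead of a class plus constructor kwargs. A sketch of that prototype pattern with an illustrative toy class:

    from copy import deepcopy

    class Proto:
        def __init__(self, c=1.0):
            self.c = c

    prototype = Proto(c=10.0)
    per_class = {label: deepcopy(prototype) for label in ('neg', 'neu', 'pos')}
    per_class['pos'].c = 0.1
    print(prototype.c, per_class['pos'].c)  # 10.0 0.1, the copies are independent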
@@ -393,6 +405,7 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
         return self.learner.predict(X)

+

 class SVMQ(ExplicitLossMinimisation):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
@@ -1,5 +1,4 @@
 from abc import ABCMeta, abstractmethod
-import quapy as qp


 # Base Quantifier abstract class
@@ -7,7 +6,7 @@ import quapy as qp
 class BaseQuantifier(metaclass=ABCMeta):

     @abstractmethod
-    def fit(self, data: qp.LabelledCollection, *args): ...
+    def fit(self, data, *args): ...

     @abstractmethod
     def quantify(self, instances, *args): ...
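Dropping the qp.LabelledCollection annotation unties method/base.py from the top-level package (avoiding a circular import) at the cost of a looser signature: fit now accepts any object. A minimal concrete subclass, sketched for illustration only:

    from abc import ABCMeta, abstractmethod

    class BaseQuantifier(metaclass=ABCMeta):
        @abstractmethod
        def fit(self, data, *args): ...

        @abstractmethod
        def quantify(self, instances, *args): ...

    class TrivialQuantifier(BaseQuantifier):
        # memorizes the training prevalence and returns it for any test set
        def fit(self, data, *args):
            self.prev_ = data.prevalence()  # data is assumed to expose prevalence()
            return self

        def quantify(self, instances, *args):
            return self.prev_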
38 test.py
@@ -2,23 +2,25 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
+from method.aggregative import OneVsAll

 # load a textual binary dataset and create a tfidf bag of words
+#from method.aggregative import OneVsAll, BaseQuantifier

 train_path = './datasets/reviews/kindle/train.txt'
 test_path = './datasets/reviews/kindle/test.txt'
-dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
-dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
-dataset.test = dataset.test.sampling(500, 0.6, 0.4)
-qp.preprocessing.text2tfidf(dataset, inplace=True)
-qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
+#dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
+#dataset.test = dataset.test.sampling(500, 0.6, 0.4)
+#qp.preprocessing.text2tfidf(dataset, inplace=True)
+#qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)

 # load a sparse matrix ternary dataset
-#train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
-#test_path = './datasets/twitter/test/sst.test.feature.txt'
-#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
-#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5)
-#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
+train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
+test_path = './datasets/twitter/test/sst.test.feature.txt'
+dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+dataset.training = dataset.training.sampling(500, 0.3, 0.4, 0.3)
+dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)

 # training a quantifier
 learner = LogisticRegression()
@@ -30,17 +32,23 @@ learner = LogisticRegression()
 # q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
 # q = qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000)
 # q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000)
-q = qp.method.aggregative.HDy(learner)
-q.fit(dataset.training)
+#model = qp.method.aggregative.HDy(learner)
+#

+model = qp.method.aggregative.HDy(learner)
+model = OneVsAll(model)
+print(model.get_params())
+
+model.fit(dataset.training)
+
 # estimating class prevalences
-prevalences_estim = q.quantify(dataset.test.instances)
+prevalences_estim = model.quantify(dataset.test.instances)
 prevalences_true = dataset.test.prevalence()

 # evaluation (one single prediction)
 error = qp.error.mae(prevalences_true, prevalences_estim)

-print(f'method {q.__class__.__name__}')
+print(f'method {model.__class__.__name__}')
 print(f'true prevalence {F.strprev(prevalences_true)}')
 print(f'estim prevalence {F.strprev(prevalences_estim)}')
 print(f'MAE={error:.3f}')