diff --git a/quapy/__init__.py b/quapy/__init__.py
index 59e21fe..701641d 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -1,4 +1,4 @@
-from .dataset import *
+from .data import *
 from . import functional
 from . import method
 from . import error
diff --git a/quapy/dataset/__init__.py b/quapy/data/__init__.py
similarity index 100%
rename from quapy/dataset/__init__.py
rename to quapy/data/__init__.py
diff --git a/quapy/dataset/base.py b/quapy/data/base.py
similarity index 100%
rename from quapy/dataset/base.py
rename to quapy/data/base.py
diff --git a/quapy/dataset/preprocessing.py b/quapy/data/preprocessing.py
similarity index 97%
rename from quapy/dataset/preprocessing.py
rename to quapy/data/preprocessing.py
index a6259b2..b08bcab 100644
--- a/quapy/dataset/preprocessing.py
+++ b/quapy/data/preprocessing.py
@@ -1,9 +1,10 @@
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from dataset.base import Dataset
+from data.base import Dataset
 from scipy.sparse import spmatrix
 from utils.util import parallelize
 from .base import LabelledCollection
+from tqdm import tqdm
 
 
 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
     :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True) consisting of
         lists of integer values representing indices.
     """
-    __check_type(dataset.training.instances, list, str)
-    __check_type(dataset.test.instances, list, str)
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)
 
     indexer = IndexTransformer(min_df=min_df, **kwargs)
     training_index = indexer.fit_transform(dataset.training.instances)
@@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
         f'unexpected type of element (expected {container_type}, found {type(container)})'
 
 
-
 class IndexTransformer:
 
     def __init__(self, **kwargs):
@@ -140,7 +140,7 @@ class IndexTransformer:
         return self.fit(X).transform(X, n_jobs=n_jobs)
 
     def vocabulary_size(self):
-        return len(self.vocabulary_) + 1 # the reserved unk token
+        return len(self.vocabulary_)
 
     def add_word(self, word):
         if word in self.vocabulary_:
diff --git a/quapy/dataset/reader.py b/quapy/data/reader.py
similarity index 100%
rename from quapy/dataset/reader.py
rename to quapy/data/reader.py
diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py
index df5cfd5..88acd16 100644
--- a/quapy/method/__init__.py
+++ b/quapy/method/__init__.py
@@ -1,5 +1,6 @@
+from . import base
 from . import aggregative as agg
-from . import non_aggregative as nagg
+from . import non_aggregative
 
 
 AGGREGATIVE_METHODS = {
@@ -13,13 +14,10 @@ AGGREGATIVE_METHODS = {
 }
 
 NON_AGGREGATIVE_METHODS = {
-    nagg.MaximumLikelihoodPrevalenceEstimation
+    non_aggregative.MaximumLikelihoodPrevalenceEstimation
 }
 
 QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS
 
-# common alisases
-MLPE = nagg.MaximumLikelihoodPrevalenceEstimation
-
 
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 0862588..99204ab 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -1,14 +1,14 @@
 import numpy as np
-from .base import *
-from ..error import mae
+from copy import deepcopy
 import functional as F
-from ..classification.svmperf import SVMperf
-from ..dataset import LabelledCollection
+import error
+from method.base import BaseQuantifier
+from quapy.classification.svmperf import SVMperf
+from quapy.data import LabelledCollection
 from sklearn.metrics import confusion_matrix
 from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
-
-
+from abc import abstractmethod
 
 
 # Abstract classes
@@ -23,6 +23,14 @@ class AggregativeQuantifier(BaseQuantifier):
     @abstractmethod
     def fit(self, data: LabelledCollection, fit_learner=True, *args): ...
 
+    @property
+    def learner(self):
+        return self.learner_
+
+    @learner.setter
+    def learner(self, value):
+        self.learner_ = value
+
     def classify(self, instances):
         return self.learner.predict(instances)
 
@@ -69,12 +77,12 @@ def training_helper(learner,
     Training procedure common to all Aggregative Quantifiers.
     :param learner: the learner to be fit
     :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
-    :param fit_learner: whether or not to fit the learner
+    :param fit_learner: whether or not to fit the learner (if False, the fitting step is bypassed)
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
         learner is not probabilistic, then a CalibratedCV instance of it is trained)
     :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
     :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
-        or None otherwise)
+        or None otherwise) to be used as a validation set for any subsequent parameter fitting
     """
     if fit_learner:
         if ensure_probabilistic:
@@ -239,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
             # M-step: qs_pos is Ps+1(y=+1)
             qs = ps.mean(axis=0)
 
-            if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
+            if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
                 converged = True
 
             qs_prev_ = qs
@@ -265,7 +273,8 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
         self.learner = learner
 
     def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
-        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
+                            f'Use the class OneVsAll to enable {self.__class__.__name__} to work on single-label data.'
         self.learner, validation = training_helper(
             self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
         Px = self.soft_classify(validation.instances)
@@ -304,15 +313,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
 
 
 class OneVsAll(AggregativeQuantifier):
+    """
+    Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
+    """
 
-    def __init__(self, binary_method, n_jobs=-1, **kwargs):
+    def __init__(self, binary_method, n_jobs=-1):
         self.binary_method = binary_method
         self.n_jobs = n_jobs
-        self.kwargs = kwargs
 
     def fit(self, data: LabelledCollection, **kwargs):
         assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
-        self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_}
+        assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
+        self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
         Parallel(n_jobs=self.n_jobs, backend='threading')(
             delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
         )
@@ -332,10 +345,10 @@ class OneVsAll(AggregativeQuantifier):
         return sorted(self.class_method.keys())
 
     def set_params(self, **parameters):
-        self.kwargs=parameters
+        self.binary_method.set_params(**parameters)
 
     def get_params(self, deep=True):
-        return self.kwargs
+        return self.binary_method.get_params()
 
     def _delayed_binary_predict(self, c, learners, X):
         return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence
@@ -346,6 +359,12 @@ class OneVsAll(AggregativeQuantifier):
 
 
 class ExplicitLossMinimisation(AggregativeQuantifier):
+    """
+    A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
+    quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
+    This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+    Social Network Analysis and Mining 6(19), 1–22 (2016)
+    """
 
     def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
@@ -354,16 +373,9 @@ class ExplicitLossMinimisation(AggregativeQuantifier):
 
     def fit(self, data: LabelledCollection, fit_learner=True, *args):
         assert fit_learner, 'the method requires that fit_learner=True'
-        if data.binary:
-            self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
-        else:
-            self.learner = OneVsAll(
-                binary_method=ExplicitLossMinimisationBinary,
-                n_jobs=-1,
-                svmperf_base=self.svmperf_base,
-                loss=self.loss,
-                **self.kwargs
-            )
+        self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
+        if not data.binary:
+            self.learner = OneVsAll(self.learner, n_jobs=-1)
         return self.learner.fit(data, *args)
 
     def quantify(self, instances, *args):
@@ -393,6 +405,7 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
         return self.learner.predict(X)
 
 
+
 class SVMQ(ExplicitLossMinimisation):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
diff --git a/quapy/method/base.py b/quapy/method/base.py
index bf7ff54..e65b45e 100644
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@@ -1,5 +1,4 @@
 from abc import ABCMeta, abstractmethod
-import quapy as qp
 
 
 # Base Quantifier abstract class
@@ -7,7 +6,7 @@ import quapy as qp
 class BaseQuantifier(metaclass=ABCMeta):
 
     @abstractmethod
-    def fit(self, data: qp.LabelledCollection, *args): ...
+    def fit(self, data, *args): ...
 
     @abstractmethod
     def quantify(self, instances, *args): ...
diff --git a/test.py b/test.py
index fe842a2..90167fe 100644
--- a/test.py
+++ b/test.py
@@ -2,23 +2,25 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
-
+from method.aggregative import OneVsAll
 
 # load a textual binary dataset and create a tfidf bag of words
+#from method.aggregative import OneVsAll, BaseQuantifier
+
 train_path = './datasets/reviews/kindle/train.txt'
 test_path = './datasets/reviews/kindle/test.txt'
-dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
-dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
-dataset.test = dataset.test.sampling(500, 0.6, 0.4)
-qp.preprocessing.text2tfidf(dataset, inplace=True)
-qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
+#dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
+#dataset.test = dataset.test.sampling(500, 0.6, 0.4)
+#qp.preprocessing.text2tfidf(dataset, inplace=True)
+#qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
 
 # load a sparse matrix ternary dataset
-#train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
-#test_path = './datasets/twitter/test/sst.test.feature.txt'
-#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
-#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5)
-#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
+train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
+test_path = './datasets/twitter/test/sst.test.feature.txt'
+dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+dataset.training = dataset.training.sampling(500, 0.3, 0.4, 0.3)
+dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
 
 # training a quantifier
 learner = LogisticRegression()
@@ -30,17 +32,23 @@ learner = LogisticRegression()
 # q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
 # q = qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000)
 # q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000)
 
-q = qp.method.aggregative.HDy(learner)
-q.fit(dataset.training)
+#model = qp.method.aggregative.HDy(learner)
+#
+
+model = qp.method.aggregative.HDy(learner)
+model = OneVsAll(model)
+print(model.get_params())
+
+model.fit(dataset.training)
 
 # estimating class prevalences
-prevalences_estim = q.quantify(dataset.test.instances)
+prevalences_estim = model.quantify(dataset.test.instances)
 prevalences_true = dataset.test.prevalence()
 
 # evaluation (one single prediction)
 error = qp.error.mae(prevalences_true, prevalences_estim)
-print(f'method {q.__class__.__name__}')
+print(f'method {model.__class__.__name__}')
 print(f'true prevalence {F.strprev(prevalences_true)}')
 print(f'estim prevalence {F.strprev(prevalences_estim)}')
 print(f'MAE={error:.3f}')
\ No newline at end of file
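
A minimal usage sketch of the refactored OneVsAll wrapper (illustrative, not part of the patch itself): after this change the wrapper receives a quantifier instance and deep-copies it once per class, instead of receiving a class plus **kwargs, and it delegates get_params/set_params to the wrapped binary quantifier. The snippet simply mirrors the updated test.py; the dataset paths and sampling prevalences are the values already used above.

    from sklearn.linear_model import LogisticRegression
    import quapy as qp
    from method.aggregative import OneVsAll

    # a ternary (single-label) dataset in sparse format, as loaded in test.py
    dataset = qp.Dataset.load('./datasets/twitter/train/sst.train+dev.feature.txt',
                              './datasets/twitter/test/sst.test.feature.txt',
                              qp.reader.from_sparse)

    # HDy is a binary quantifier; OneVsAll keeps one deep copy of it per class and
    # l1-normalizes the per-class estimates so that the prevalences sum up to 1
    model = OneVsAll(qp.method.aggregative.HDy(LogisticRegression()))
    print(model.get_params())  # delegated to the wrapped binary quantifier

    model.fit(dataset.training)
    estim_prevalences = model.quantify(dataset.test.instances)
    print(qp.error.mae(dataset.test.prevalence(), estim_prevalences))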