forked from moreo/QuaPy

QuaNet added, two examples of TextClassifiers added (CNN, LSTM)

commit d8e2f7556e (parent 3ec711c96e)
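The diff below adds a neural text-classification module (CNN and LSTM networks wrapped by NeuralClassifierTrainer) and a QuaNet quantifier trained on top of it. The following is a minimal usage sketch assembled from the test.py changes in this commit; the final quantify call and its variable name are illustrative assumptions, and the import paths assume the quapy package directory is on the path, as test.py does.

    import quapy as qp
    from classification.neural import NeuralClassifierTrainer, CNNnet

    qp.environ['SAMPLE_SIZE'] = 500

    # QuaNet works on word indices (not tfidf vectors), so the dataset is indexed first
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=False, min_df=5)
    qp.data.preprocessing.index(dataset, inplace=True)

    # a CNN classifier trained by NeuralClassifierTrainer, then wrapped by QuaNet
    learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes))
    model = qp.method.aggregative.QuaNet(learner, qp.environ['SAMPLE_SIZE'], device='cpu')
    model.fit(dataset.training)
    estim_prevalence = model.quantify(dataset.test.instances)  # assumed call; the visible hunks stop at fitting/evaluation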
TODO.txt (8 changed lines)
@@ -1,8 +1,12 @@
Documentation with sphinx
Add quantification_report (akin to classification_report from sklearn)
Add optimization - artificial sampling
Add quantification_report (akin to classification_report from sklearn) (?)
Add NAE, NRAE
Add "measures for evaluating ordinal"?
Document methods with paper references
The parallel training in svmperf seems not to work (not sure...)
In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the
negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as
an instance of single-label with 2 labels. Check
Add classnames to LabelledCollection ?
Check the overhead in OneVsAll for SVMperf-based (?)

@@ -1,5 +1,20 @@
from .data import *
from . import data
from .data import datasets
from . import functional
from . import method
from . import error
from . import evaluation
from method.aggregative import isaggregative, isprobabilistic


environ = {
    'SAMPLE_SIZE': None,
    'UNK_TOKEN': '[UNK]',
    'UNK_INDEX': 0,
    'PAD_TOKEN': '[PAD]',
    'PAD_INDEX': 1,
}


def isbinary(x):
    return data.isbinary(x) or method.aggregative.isbinary(x)
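The new qp.environ dictionary centralises package-wide settings that other hunks in this commit read directly: __check_eps in error.py falls back to environ['SAMPLE_SIZE'], and the padding collate function in the classification module uses environ['PAD_INDEX']. A minimal sketch of the expected caller-side setup (assuming the package is importable as quapy):

    import quapy as qp

    # must be set before evaluation-time error smoothing is used; otherwise
    # error.__check_eps raises a ValueError (see the error.py hunk below)
    qp.environ['SAMPLE_SIZE'] = 500

    # indexing conventions assumed by the padding collate function
    print(qp.environ['PAD_TOKEN'], qp.environ['PAD_INDEX'])  # [PAD] 1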
@ -0,0 +1,351 @@
|
|||
import os
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from sklearn.metrics import accuracy_score, f1_score
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
from tqdm import tqdm
|
||||
from data import LabelledCollection
|
||||
from util import EarlyStop
|
||||
import quapy as qp
|
||||
|
||||
|
||||
|
||||
class NeuralClassifierTrainer:
|
||||
|
||||
def __init__(self,
|
||||
net, # TextClassifierNet
|
||||
lr=1e-3,
|
||||
weight_decay=0,
|
||||
patience=10,
|
||||
epochs=200,
|
||||
batch_size=64,
|
||||
batch_size_test=512,
|
||||
padding_length=300,
|
||||
device='cpu',
|
||||
checkpointpath='../checkpoint/classifier_net.dat'):
|
||||
|
||||
super().__init__()
|
||||
|
||||
assert isinstance(net, TextClassifierNet), f'net is not an instance of {TextClassifierNet.__name__}'
|
||||
self.net = net
|
||||
self.vocab_size = self.net.vocabulary_size
|
||||
self.trainer_hyperparams={
|
||||
'lr': lr,
|
||||
'weight_decay': weight_decay,
|
||||
'patience': patience,
|
||||
'epochs': epochs,
|
||||
'batch_size': batch_size,
|
||||
'batch_size_test': batch_size_test,
|
||||
'padding_length': padding_length,
|
||||
'device': torch.device(device)
|
||||
}
|
||||
self.learner_hyperparams = self.net.get_params()
|
||||
|
||||
self.checkpointpath = checkpointpath
|
||||
self.classes_ = np.asarray([0, 1])
|
||||
|
||||
print(f'[NeuralNetwork running on {device}]')
|
||||
os.makedirs(Path(checkpointpath).parent, exist_ok=True)
|
||||
|
||||
def reset_net_params(self, vocab_size, n_classes):
|
||||
self.net = self.net.__class__(vocab_size, n_classes, **self.learner_hyperparams)
|
||||
self.net.xavier_uniform()
|
||||
|
||||
def get_params(self):
|
||||
return {**self.net.get_params(), **self.trainer_hyperparams}
|
||||
|
||||
def set_params(self, **params):
|
||||
trainer_hyperparams = self.trainer_hyperparams
|
||||
learner_hyperparams = self.net.get_params()
|
||||
for key, val in params.items():
|
||||
if key in trainer_hyperparams and key in learner_hyperparams:
|
||||
raise ValueError(f'the use of parameter {key} is ambiguous since it can refer to '
|
||||
f'a parameter of the Trainer or of the learner {self.net.__class__.__name__}')
|
||||
elif key not in trainer_hyperparams and key not in learner_hyperparams:
|
||||
raise ValueError(f'parameter {key} is not valid')
|
||||
|
||||
if key in trainer_hyperparams:
|
||||
trainer_hyperparams[key] = val
|
||||
else:
|
||||
learner_hyperparams[key] = val
|
||||
|
||||
self.trainer_hyperparams = trainer_hyperparams
|
||||
self.learner_hyperparams = learner_hyperparams
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return next(self.net.parameters()).device
|
||||
|
||||
def __update_progress_bar(self, pbar):
|
||||
pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={self.current_epoch} '
|
||||
f'tr-loss={self.status["tr"]["loss"]:.5f} '
|
||||
f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% '
|
||||
f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% '
|
||||
f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} '
|
||||
f'val-loss={self.status["va"]["loss"]:.5f} '
|
||||
f'val-acc={100 * self.status["va"]["acc"]:.2f}% '
|
||||
f'macroF1={100 * self.status["va"]["f1"]:.2f}%')
|
||||
|
||||
def _train_epoch(self, data, status, pbar):
|
||||
self.net.train()
|
||||
criterion = torch.nn.CrossEntropyLoss()
|
||||
losses, predictions, true_labels = [], [], []
|
||||
for xi, yi in data:
|
||||
self.optim.zero_grad()
|
||||
logits = self.net.forward(xi)
|
||||
loss = criterion(logits, yi)
|
||||
loss.backward()
|
||||
self.optim.step()
|
||||
losses.append(loss.item())
|
||||
preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1)
|
||||
|
||||
status["loss"] = np.mean(losses)
|
||||
predictions.extend(preds.tolist())
|
||||
true_labels.extend(yi.detach().cpu().numpy().tolist())
|
||||
status["acc"] = accuracy_score(true_labels, predictions)
|
||||
status["f1"] = f1_score(true_labels, predictions, average='macro')
|
||||
self.__update_progress_bar(pbar)
|
||||
|
||||
def _test_epoch(self, data, status, pbar):
|
||||
self.net.eval()
|
||||
criterion = torch.nn.CrossEntropyLoss()
|
||||
losses, predictions, true_labels = [], [], []
|
||||
with torch.no_grad():
|
||||
for xi, yi in data:
|
||||
logits = self.net.forward(xi)
|
||||
loss = criterion(logits, yi)
|
||||
losses.append(loss.item())
|
||||
preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1)
|
||||
predictions.extend(preds.tolist())
|
||||
true_labels.extend(yi.detach().cpu().numpy().tolist())
|
||||
|
||||
status["loss"] = np.mean(losses)
|
||||
status["acc"] = accuracy_score(true_labels, predictions)
|
||||
status["f1"] = f1_score(true_labels, predictions, average='macro')
|
||||
self.__update_progress_bar(pbar)
|
||||
|
||||
def fit(self, instances, labels, val_split=0.3):
|
||||
train, val = LabelledCollection(instances, labels).split_stratified(1-val_split)
|
||||
opt = self.trainer_hyperparams
|
||||
checkpoint = self.checkpointpath
|
||||
self.reset_net_params(self.vocab_size, train.n_classes)
|
||||
|
||||
train_generator = TorchDataset(train.instances, train.labels).asDataloader(
|
||||
opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])
|
||||
valid_generator = TorchDataset(val.instances, val.labels).asDataloader(
|
||||
opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device'])
|
||||
|
||||
self.status = {'tr': {'loss': -1, 'acc': -1, 'f1': -1},
|
||||
'va': {'loss': -1, 'acc': -1, 'f1': -1}}
|
||||
|
||||
self.optim = torch.optim.Adam(self.net.parameters(), lr=opt['lr'], weight_decay=opt['weight_decay'])
|
||||
self.early_stop = EarlyStop(opt['patience'], lower_is_better=False)
|
||||
|
||||
with tqdm(range(1, opt['epochs'] + 1)) as pbar:
|
||||
for self.current_epoch in pbar:
|
||||
self._train_epoch(train_generator, self.status['tr'], pbar)
|
||||
self._test_epoch(valid_generator, self.status['va'], pbar)
|
||||
|
||||
self.early_stop(self.status['va']['f1'], self.current_epoch)
|
||||
if self.early_stop.IMPROVED:
|
||||
torch.save(self.net.state_dict(), checkpoint)
|
||||
elif self.early_stop.STOP:
|
||||
print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} '
|
||||
f'for epoch {self.early_stop.best_epoch}')
|
||||
self.net.load_state_dict(torch.load(checkpoint))
|
||||
break
|
||||
|
||||
print('performing one training pass over the validation set...')
|
||||
self._train_epoch(valid_generator, self.status['tr'], pbar)
|
||||
print('[done]')
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, instances):
|
||||
return np.argmax(self.predict_proba(instances), axis=-1)
|
||||
|
||||
def predict_proba(self, instances):
|
||||
return self.net.predict_proba(instances)
|
||||
|
||||
def predict_probability_positive(self, instances):
|
||||
self.net.eval()
|
||||
opt = self.trainer_hyperparams
|
||||
with torch.no_grad():
|
||||
positive_probs = []
|
||||
for xi in TorchDataset(instances).asDataloader(
|
||||
opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']):
|
||||
positive_probs.append(self.net.predict_proba(xi))
|
||||
return np.concatenate(positive_probs)
|
||||
|
||||
def transform(self, instances):
|
||||
self.net.eval()
|
||||
embeddings = []
|
||||
with torch.no_grad():
|
||||
for xi in TorchDataset(instances).asDataloader(
|
||||
self.trainer_hyperparams['batch_size_test'], shuffle=False, pad_length=self.trainer_hyperparams['padding_length'], device=self.device):
|
||||
embeddings.append(self.net.document_embedding(xi).detach().cpu().numpy())
|
||||
return np.concatenate(embeddings)
|
||||
|
||||
|
||||
class TorchDataset(torch.utils.data.Dataset):
|
||||
|
||||
def __init__(self, instances, labels=None):
|
||||
self.instances = instances
|
||||
self.labels = labels
|
||||
|
||||
def __len__(self):
|
||||
return len(self.instances)
|
||||
|
||||
def __getitem__(self, index):
|
||||
return {'doc': self.instances[index], 'label': self.labels[index] if self.labels is not None else None}
|
||||
|
||||
def asDataloader(self, batch_size, shuffle, pad_length, device):
|
||||
def collate(batch):
|
||||
data = [torch.LongTensor(item['doc'][:pad_length]) for item in batch]
|
||||
data = pad_sequence(data, batch_first=True, padding_value=qp.environ['PAD_INDEX']).to(device)
|
||||
targets = [item['label'] for item in batch]
|
||||
if targets[0] is None:
|
||||
return data
|
||||
else:
|
||||
targets = torch.as_tensor(targets, dtype=torch.long).to(device)
|
||||
return [data, targets]
|
||||
|
||||
torchDataset = TorchDataset(self.instances, self.labels)
|
||||
return torch.utils.data.DataLoader(torchDataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)
|
||||
|
||||
|
||||
class TextClassifierNet(torch.nn.Module, metaclass=ABCMeta):
|
||||
|
||||
@abstractmethod
|
||||
def document_embedding(self, x): ...
|
||||
|
||||
def forward(self, x):
|
||||
doc_embedded = self.document_embedding(x)
|
||||
return self.output(doc_embedded)
|
||||
|
||||
def dimensions(self):
|
||||
return self.dim
|
||||
|
||||
def predict_proba(self, x):
|
||||
logits = self(x)
|
||||
return torch.softmax(logits, dim=-1).detach().cpu().numpy()
|
||||
|
||||
def xavier_uniform(self):
|
||||
for p in self.parameters():
|
||||
if p.dim() > 1 and p.requires_grad:
|
||||
torch.nn.init.xavier_uniform_(p)
|
||||
|
||||
@abstractmethod
|
||||
def get_params(self): ...
|
||||
|
||||
@property
|
||||
def vocabulary_size(self): ...
|
||||
|
||||
|
||||
class LSTMnet(TextClassifierNet):
|
||||
|
||||
def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, lstm_nlayers=1,
|
||||
drop_p=0.5):
|
||||
super().__init__()
|
||||
self.vocabulary_size_ = vocabulary_size
|
||||
self.n_classes = n_classes
|
||||
self.hyperparams={
|
||||
'embedding_size': embedding_size,
|
||||
'hidden_size': hidden_size,
|
||||
'repr_size': repr_size,
|
||||
'lstm_nlayers': lstm_nlayers,
|
||||
'drop_p': drop_p
|
||||
}
|
||||
|
||||
self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size)
|
||||
self.lstm = torch.nn.LSTM(embedding_size, hidden_size, lstm_nlayers, dropout=drop_p, batch_first=True)
|
||||
self.dropout = torch.nn.Dropout(drop_p)
|
||||
|
||||
self.dim = repr_size
|
||||
self.doc_embedder = torch.nn.Linear(hidden_size, self.dim)
|
||||
self.output = torch.nn.Linear(self.dim, n_classes)
|
||||
|
||||
def init_hidden(self, set_size):
|
||||
opt = self.hyperparams
|
||||
var_hidden = torch.zeros(opt['lstm_nlayers'], set_size, opt['hidden_size'])
|
||||
var_cell = torch.zeros(opt['lstm_nlayers'], set_size, opt['hidden_size'])
|
||||
if next(self.lstm.parameters()).is_cuda:
|
||||
var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda()
|
||||
return var_hidden, var_cell
|
||||
|
||||
def document_embedding(self, x):
|
||||
embedded = self.word_embedding(x)
|
||||
rnn_output, rnn_hidden = self.lstm(embedded, self.init_hidden(x.size()[0]))
|
||||
abstracted = self.dropout(F.relu(rnn_hidden[0][-1]))
|
||||
abstracted = self.doc_embedder(abstracted)
|
||||
return abstracted
|
||||
|
||||
def get_params(self):
|
||||
return self.hyperparams
|
||||
|
||||
@property
|
||||
def vocabulary_size(self):
|
||||
return self.vocabulary_size_
|
||||
|
||||
|
||||
class CNNnet(TextClassifierNet):
|
||||
|
||||
def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100,
|
||||
kernel_heights=[3, 5, 7], stride=1, padding=0, drop_p=0.5):
|
||||
super(CNNnet, self).__init__()
|
||||
|
||||
self.vocabulary_size_ = vocabulary_size
|
||||
self.n_classes = n_classes
|
||||
self.hyperparams={
|
||||
'embedding_size': embedding_size,
|
||||
'hidden_size': hidden_size,
|
||||
'repr_size': repr_size,
|
||||
'kernel_heights':kernel_heights,
|
||||
'stride': stride,
|
||||
'drop_p': drop_p
|
||||
}
|
||||
self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size)
|
||||
in_channels = 1
|
||||
self.conv1 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[0], embedding_size), stride, padding)
|
||||
self.conv2 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[1], embedding_size), stride, padding)
|
||||
self.conv3 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[2], embedding_size), stride, padding)
|
||||
self.dropout = nn.Dropout(drop_p)
|
||||
|
||||
self.dim = repr_size
|
||||
self.doc_embedder = torch.nn.Linear(len(kernel_heights) * hidden_size, self.dim)
|
||||
self.output = nn.Linear(self.dim, n_classes)
|
||||
|
||||
def conv_block(self, input, conv_layer):
|
||||
conv_out = conv_layer(input) # conv_out.size() = (batch_size, out_channels, dim, 1)
|
||||
activation = F.relu(conv_out.squeeze(3)) # activation.size() = (batch_size, out_channels, dim1)
|
||||
max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # maxpool_out.size() = (batch_size, out_channels)
|
||||
return max_out
|
||||
|
||||
def document_embedding(self, input):
|
||||
input = self.word_embedding(input)
|
||||
input = input.unsqueeze(1) # input.size() = (batch_size, 1, num_seq, embedding_length)
|
||||
|
||||
max_out1 = self.conv_block(input, self.conv1)
|
||||
max_out2 = self.conv_block(input, self.conv2)
|
||||
max_out3 = self.conv_block(input, self.conv3)
|
||||
|
||||
all_out = torch.cat((max_out1, max_out2, max_out3), 1) # all_out.size() = (batch_size, num_kernels*out_channels)
|
||||
abstracted = self.dropout(F.relu(all_out)) # (batch_size, num_kernels*out_channels)
|
||||
abstracted = self.doc_embedder(abstracted)
|
||||
return abstracted
|
||||
|
||||
def get_params(self):
|
||||
return self.hyperparams
|
||||
|
||||
@property
|
||||
def vocabulary_size(self):
|
||||
return self.vocabulary_size_
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@@ -158,6 +158,16 @@ class Dataset:
        test = LabelledCollection.load(test_path, loader_func)
        return Dataset(training, test)

    @property
    def vocabulary_size(self):
        return len(self.vocabulary)


def isbinary(data):
    if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
        return data.binary
    return False

@@ -5,6 +5,7 @@ from scipy.sparse import spmatrix
from util import parallelize
from .base import LabelledCollection
from tqdm import tqdm
import quapy as qp


def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):

@@ -114,6 +115,7 @@ class IndexTransformer:
        """
        self.vect = CountVectorizer(**kwargs)
        self.unk = -1  # a valid index is assigned after fit
        self.pad = -2  # a valid index is assigned after fit

    def fit(self, X):
        """

@@ -123,12 +125,13 @@ class IndexTransformer:
        self.vect.fit(X)
        self.analyzer = self.vect.build_analyzer()
        self.vocabulary_ = self.vect.vocabulary_
        self.unk = self.add_word('UNK')
        self.unk = self.add_word(qp.environ['UNK_TOKEN'], qp.environ['UNK_INDEX'])
        self.pad = self.add_word(qp.environ['PAD_TOKEN'], qp.environ['PAD_INDEX'])
        return self

    def transform(self, X, n_jobs=-1):
        # given the number of tasks and the number of jobs, generates the slices for the parallel threads
        assert self.unk > 0, 'transform called before fit'
        assert self.unk != -1, 'transform called before fit'
        indexed = parallelize(func=self.index, args=X, n_jobs=n_jobs)
        return np.asarray(indexed)

@@ -142,9 +145,22 @@ class IndexTransformer:
    def vocabulary_size(self):
        return len(self.vocabulary_)

    def add_word(self, word):
    def add_word(self, word, id=None, nogaps=True):
        if word in self.vocabulary_:
            raise ValueError(f'word {word} already in dictionary')
        self.vocabulary_[word] = len(self.vocabulary_)
        if id is None:
            # add the word with the next id
            self.vocabulary_[word] = len(self.vocabulary_)
        else:
            id2word = {id_: word_ for word_, id_ in self.vocabulary_.items()}
            if id in id2word:
                # the requested id is taken: displace its current owner to the next free id
                old_word = id2word[id]
                self.vocabulary_[word] = id
                del self.vocabulary_[old_word]
                self.add_word(old_word)
            elif nogaps:
                if id > self.vocabulary_size()+1:
                    raise ValueError(f'word {word} added with id {id}, while the current vocabulary size '
                                     f'is {self.vocabulary_size()}, and id gaps are not allowed')
        return self.vocabulary_[word]

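The fit method above now reserves the ids declared in qp.environ for the special tokens by calling add_word with an explicit id; when that id is already occupied, its current owner is displaced to the next free position. A small sketch of the intended effect (the import path is how modules reference each other inside the package; the example documents are arbitrary):

    from data.preprocessing import IndexTransformer

    it = IndexTransformer()
    it.fit(['some example documents', 'used only to build a small vocabulary'])
    # after fit, '[UNK]' owns index 0 and '[PAD]' owns index 1; the words that
    # previously held those ids were re-added at the end of the vocabulary
    print(it.vocabulary_['[UNK]'], it.vocabulary_['[PAD]'])  # 0 1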
@@ -1,11 +1,9 @@
from sklearn.metrics import f1_score
import numpy as np
import quapy as qp


SAMPLE_SIZE = None


def f1e(y_true, y_pred):
    return 1. - f1_score(y_true, y_pred, average='macro')

@@ -68,11 +66,12 @@ def smooth(p, eps):


def __check_eps(eps):
    sample_size = qp.environ['SAMPLE_SIZE']
    if eps is None:
        if SAMPLE_SIZE is None:
            raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set')
        if sample_size is None:
            raise ValueError('eps was not defined, and qp.environ["SAMPLE_SIZE"] was not set')
        else:
            eps = 1. / (2. * SAMPLE_SIZE)
            eps = 1. / (2. * sample_size)
    return eps

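A worked example of the fallback above: with qp.environ['SAMPLE_SIZE'] = 500, as test.py sets at the end of this commit, an error function called without an explicit eps ends up smoothing with eps = 1 / (2 * 500) = 0.001.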
@@ -289,6 +289,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
                converged = True

            qs_prev_ = qs
            s += 1

        if not converged:
            raise UserWarning('the method has reached the maximum number of iterations; it might have not converged')

@@ -443,6 +444,10 @@ class OneVsAll(AggregativeQuantifier):
            'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
            'predictions for each document (row) and class (columns)'
        prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
        #prevalences = []
        #for c in self.classes:
        #    prevalences.append(self._delayed_binary_aggregate(c, classif_predictions_bin))
        #prevalences = np.asarray(prevalences)
        return F.normalize_prevalence(prevalences)

    def quantify(self, X, *args):

@@ -477,4 +482,20 @@ class OneVsAll(AggregativeQuantifier):

    def _delayed_binary_fit(self, c, data, **kwargs):
        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
        self.dict_binary_quantifiers[c].fit(bindata, **kwargs)
        self.dict_binary_quantifiers[c].fit(bindata, **kwargs)


def isaggregative(model):
    return isinstance(model, AggregativeQuantifier)


def isprobabilistic(model):
    return isinstance(model, AggregativeProbabilisticQuantifier)


def isbinary(model):
    return isinstance(model, BinaryQuantifier)


from . import neural
QuaNet = neural.QuaNetTrainer
@ -0,0 +1,267 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
import torch
|
||||
from torch.nn import MSELoss
|
||||
from torch.nn.functional import relu
|
||||
from tqdm import tqdm
|
||||
from method.aggregative import *
|
||||
from util import EarlyStop
|
||||
|
||||
|
||||
class QuaNetTrainer(BaseQuantifier):
|
||||
|
||||
def __init__(self,
|
||||
learner,
|
||||
sample_size,
|
||||
n_epochs=500,
|
||||
tr_iter_per_poch=200,
|
||||
va_iter_per_poch=21,
|
||||
lr=1e-3,
|
||||
lstm_hidden_size=64,
|
||||
lstm_nlayers=1,
|
||||
ff_layers=[1024, 512],
|
||||
bidirectional=True,
|
||||
qdrop_p=0.5,
|
||||
patience=10, checkpointpath='../checkpoint/quanet.dat', device='cuda'):
|
||||
assert hasattr(learner, 'transform'), \
|
||||
f'the learner {learner.__class__.__name__} does not seem to be able to produce document embeddings ' \
|
||||
f'since it does not implement the method "transform"'
|
||||
assert hasattr(learner, 'predict_proba'), \
|
||||
f'the learner {learner.__class__.__name__} does not seem to be able to produce posterior probabilities ' \
|
||||
f'since it does not implement the method "predict_proba"'
|
||||
self.learner = learner
|
||||
self.sample_size = sample_size
|
||||
self.n_epochs = n_epochs
|
||||
self.tr_iter = tr_iter_per_poch
|
||||
self.va_iter = va_iter_per_poch
|
||||
self.lr = lr
|
||||
self.quanet_params = {
|
||||
'lstm_hidden_size': lstm_hidden_size,
|
||||
'lstm_nlayers': lstm_nlayers,
|
||||
'ff_layers': ff_layers,
|
||||
'bidirectional': bidirectional,
|
||||
'qdrop_p': qdrop_p
|
||||
}
|
||||
|
||||
self.patience = patience
|
||||
self.checkpointpath = checkpointpath
|
||||
os.makedirs(Path(checkpointpath).parent, exist_ok=True)
|
||||
self.device = torch.device(device)
|
||||
|
||||
self.__check_params_colision(self.quanet_params, self.learner.get_params())
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
||||
"""
|
||||
:param data: the training data on which to train QuaNet. If fit_learner=True, the data will be split in
|
||||
40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If
|
||||
fit_learner=False, the data will be split in 66/34 for training QuaNet and validating it, respectively.
|
||||
:param fit_learner: if true, trains the classifier on a split containing 40% of the data
|
||||
:param args: unused
|
||||
:return: self
|
||||
"""
|
||||
# split: 40% for training classification, 40% for training QuaNet, and 20% for validating QuaNet
|
||||
self.learner, unused_data = \
|
||||
training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, val_split=0.6)
|
||||
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
|
||||
|
||||
# compute the posterior probabilities of the instances
|
||||
valid_posteriors = self.learner.predict_proba(valid_data.instances)
|
||||
train_posteriors = self.learner.predict_proba(train_data.instances)
|
||||
|
||||
# turn instances' indexes into embeddings
|
||||
valid_data.instances = self.learner.transform(valid_data.instances)
|
||||
train_data.instances = self.learner.transform(train_data.instances)
|
||||
|
||||
# estimate the hard and soft stats tpr and fpr of the classifier
|
||||
self.tr_prev = data.prevalence()
|
||||
|
||||
self.quantifiers = [
|
||||
ClassifyAndCount(self.learner).fit(data, fit_learner=False),
|
||||
AdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
|
||||
ProbabilisticClassifyAndCount(self.learner).fit(data, fit_learner=False),
|
||||
ProbabilisticAdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
|
||||
ExpectationMaximizationQuantifier(self.learner).fit(data, fit_learner=False),
|
||||
]
|
||||
|
||||
self.status = {
|
||||
'tr-loss': -1,
|
||||
'va-loss': -1,
|
||||
}
|
||||
|
||||
self.quanet = QuaNetModule(
|
||||
doc_embedding_size=train_data.instances.shape[1],
|
||||
n_classes=data.n_classes,
|
||||
stats_size=len(self.quantifiers) * data.n_classes,
|
||||
**self.quanet_params
|
||||
).to(self.device)
|
||||
|
||||
self.optim = torch.optim.Adam(self.quanet.parameters(), lr=self.lr)
|
||||
early_stop = EarlyStop(self.patience, lower_is_better=True)
|
||||
|
||||
checkpoint = self.checkpointpath
|
||||
|
||||
for epoch_i in range(1, self.n_epochs):
|
||||
self.epoch(train_data, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True)
|
||||
self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False)
|
||||
|
||||
early_stop(self.status['va-loss'], epoch_i)
|
||||
if early_stop.IMPROVED:
|
||||
torch.save(self.quanet.state_dict(), checkpoint)
|
||||
elif early_stop.STOP:
|
||||
print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} '
|
||||
f'for epoch {early_stop.best_epoch}')
|
||||
self.quanet.load_state_dict(torch.load(checkpoint))
|
||||
self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=True)
|
||||
break
|
||||
|
||||
return self
|
||||
|
||||
def get_aggregative_estims(self, posteriors):
|
||||
label_predictions = np.argmax(posteriors, axis=-1)
|
||||
prevs_estim = []
|
||||
for quantifier in self.quantifiers:
|
||||
predictions = posteriors if isprobabilistic(quantifier) else label_predictions
|
||||
prevs_estim.append(quantifier.aggregate(predictions))
|
||||
return np.asarray(prevs_estim).flatten()
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
posteriors = self.learner.predict_proba(instances)
|
||||
embeddings = self.learner.transform(instances)
|
||||
quant_estims = self.get_aggregative_estims(posteriors)
|
||||
self.quanet.eval()
|
||||
with torch.no_grad():
|
||||
prevalence = self.quanet.forward(embeddings, posteriors, quant_estims).cpu().numpy().flatten()
|
||||
return prevalence
|
||||
|
||||
def epoch(self, data: LabelledCollection, posteriors, iterations, epoch, early_stop, train):
|
||||
mse_loss = MSELoss()
|
||||
prevpoints = F.get_nprevpoints_approximation(iterations, self.quanet.n_classes)
|
||||
|
||||
self.quanet.train(mode=train)
|
||||
losses = []
|
||||
pbar = tqdm(data.artificial_sampling_index_generator(self.sample_size, prevpoints))
|
||||
for it, index in enumerate(pbar):
|
||||
sample_data = data.sampling_from_index(index)
|
||||
sample_posteriors = posteriors[index]
|
||||
quant_estims = self.get_aggregative_estims(sample_posteriors)
|
||||
ptrue = torch.as_tensor([sample_data.prevalence()], dtype=torch.float, device=self.device)
|
||||
if train:
|
||||
self.optim.zero_grad()
|
||||
phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims)
|
||||
loss = mse_loss(phat, ptrue)
|
||||
loss.backward()
|
||||
self.optim.step()
|
||||
else:
|
||||
with torch.no_grad():
|
||||
phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims)
|
||||
loss = mse_loss(phat, ptrue)
|
||||
|
||||
losses.append(loss.item())
|
||||
|
||||
self.status['tr-loss' if train else 'va-loss'] = np.mean(losses[-10:])
|
||||
pbar.set_description(f'[QuaNet][{"training" if train else "validating"}] '
|
||||
f'epoch={epoch} [it={it}/{iterations}]\t'
|
||||
f'tr-loss={self.status["tr-loss"]:.5f} '
|
||||
f'val-loss={self.status["va-loss"]:.5f} '
|
||||
f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return {**self.learner.get_params(), **self.quanet_params}
|
||||
|
||||
def set_params(self, **parameters):
|
||||
learner_params={}
|
||||
for key, val in parameters.items():
|
||||
if key in self.quanet_params:
|
||||
self.quanet_params[key]=val
|
||||
else:
|
||||
learner_params[key] = val
|
||||
self.learner.set_params(**learner_params)
|
||||
|
||||
def __check_params_colision(self, quanet_params, learner_params):
|
||||
quanet_keys = set(quanet_params.keys())
|
||||
learner_keys = set(learner_params.keys())
|
||||
intersection = quanet_keys.intersection(learner_keys)
|
||||
if len(intersection) > 0:
|
||||
raise ValueError(f'the use of parameters {intersection} is ambiguous since those can refer to '
|
||||
f'the parameters of QuaNet or the learner {self.learner.__class__.__name__}')
|
||||
|
||||
|
||||
class QuaNetModule(torch.nn.Module):
|
||||
def __init__(self,
|
||||
doc_embedding_size,
|
||||
n_classes,
|
||||
stats_size,
|
||||
lstm_hidden_size=64,
|
||||
lstm_nlayers=1,
|
||||
ff_layers=[1024, 512],
|
||||
bidirectional=True,
|
||||
qdrop_p=0.5,
|
||||
order_by=None):
|
||||
super().__init__()
|
||||
|
||||
self.n_classes = n_classes
|
||||
self.order_by = order_by
|
||||
self.hidden_size = lstm_hidden_size
|
||||
self.nlayers = lstm_nlayers
|
||||
self.bidirectional = bidirectional
|
||||
self.ndirections = 2 if self.bidirectional else 1
|
||||
self.qdrop_p = qdrop_p
|
||||
self.lstm = torch.nn.LSTM(doc_embedding_size + n_classes, # +n_classes stands for the posterior probs. (concatenated)
|
||||
lstm_hidden_size, lstm_nlayers, bidirectional=bidirectional,
|
||||
dropout=qdrop_p, batch_first=True)
|
||||
self.dropout = torch.nn.Dropout(self.qdrop_p)
|
||||
|
||||
lstm_output_size = self.hidden_size * self.ndirections
|
||||
ff_input_size = lstm_output_size + stats_size
|
||||
prev_size = ff_input_size
|
||||
self.ff_layers = torch.nn.ModuleList()
|
||||
for lin_size in ff_layers:
|
||||
self.ff_layers.append(torch.nn.Linear(prev_size, lin_size))
|
||||
prev_size = lin_size
|
||||
self.output = torch.nn.Linear(prev_size, n_classes)
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return torch.device('cuda') if next(self.parameters()).is_cuda else torch.device('cpu')
|
||||
|
||||
def init_hidden(self):
|
||||
directions = 2 if self.bidirectional else 1
|
||||
var_hidden = torch.zeros(self.nlayers * directions, 1, self.hidden_size)
|
||||
var_cell = torch.zeros(self.nlayers * directions, 1, self.hidden_size)
|
||||
if next(self.lstm.parameters()).is_cuda:
|
||||
var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda()
|
||||
return var_hidden, var_cell
|
||||
|
||||
def forward(self, doc_embeddings, doc_posteriors, statistics):
|
||||
device = self.device
|
||||
doc_embeddings = torch.as_tensor(doc_embeddings, dtype=torch.float, device=device)
|
||||
doc_posteriors = torch.as_tensor(doc_posteriors, dtype=torch.float, device=device)
|
||||
statistics = torch.as_tensor(statistics, dtype=torch.float, device=device)
|
||||
|
||||
if self.order_by is not None:
|
||||
order = torch.argsort(doc_posteriors[:, self.order_by])
|
||||
doc_embeddings = doc_embeddings[order]
|
||||
doc_posteriors = doc_posteriors[order]
|
||||
|
||||
embeded_posteriors = torch.cat((doc_embeddings, doc_posteriors), dim=-1)
|
||||
|
||||
# the entire set represents only one instance in quapy contexts, and so the batch_size=1
|
||||
# the shape should be (1, number-of-instances, embedding-size + n_classes)
|
||||
embeded_posteriors = embeded_posteriors.unsqueeze(0)
|
||||
|
||||
_, (rnn_hidden,_) = self.lstm(embeded_posteriors, self.init_hidden())
|
||||
rnn_hidden = rnn_hidden.view(self.nlayers, self.ndirections, -1, self.hidden_size)
|
||||
quant_embedding = rnn_hidden[0].view(-1)
|
||||
quant_embedding = torch.cat((quant_embedding, statistics))
|
||||
|
||||
abstracted = quant_embedding.unsqueeze(0)
|
||||
for linear in self.ff_layers:
|
||||
abstracted = self.dropout(relu(linear(abstracted)))
|
||||
|
||||
logits = self.output(abstracted).view(1, -1)
|
||||
prevalence = torch.softmax(logits, -1)
|
||||
|
||||
return prevalence
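For orientation, QuaNetModule.forward treats the whole sample as a single batch element: it receives one embedding row and one posterior row per document plus a flat vector of aggregative estimates, and returns a (1, n_classes) prevalence distribution. A minimal shape-check sketch (the import path is an assumption; within this commit the module is the method/neural file added above):

    import numpy as np
    from method.neural import QuaNetModule

    n, emb_size, n_classes, n_quantifiers = 100, 100, 2, 5
    quanet = QuaNetModule(doc_embedding_size=emb_size, n_classes=n_classes,
                          stats_size=n_quantifiers * n_classes)
    prevalence = quanet(np.random.rand(n, emb_size),               # document embeddings
                        np.random.dirichlet([1, 1], size=n),       # posterior probabilities
                        np.random.rand(n_quantifiers * n_classes)) # aggregative estimates
    print(prevalence.shape)  # torch.Size([1, 2]); each row sums to 1 after the softmax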
@@ -1,4 +1,4 @@
from quapy import LabelledCollection
from data import LabelledCollection
from .base import BaseQuantifier


@@ -86,7 +86,7 @@ class GridSearchQ:
            self.n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, n_repetitions)
            eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)
            self.sout(f'setting n_prevpoints={self.n_prevpoints} so that the number of \n'
                      f'evaluations is {eval_computations} (<={eval_budget} eval_budget)')
                      f'evaluations ({eval_computations}) does not exceed the evaluation budget ({eval_budget})')
        elif eval_budget is None:
            self.n_prevpoints = n_prevpoints
            eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)

@@ -75,3 +75,26 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args):
        os.makedirs(str(Path(pickle_path).parent), exist_ok=True)
        pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
        return instance


class EarlyStop:

    def __init__(self, patience, lower_is_better=True):
        self.PATIENCE_LIMIT = patience
        self.better = lambda a,b: a<b if lower_is_better else a>b
        self.patience = patience
        self.best_score = None
        self.best_epoch = None
        self.STOP = False
        self.IMPROVED = False

    def __call__(self, watch_score, epoch):
        self.IMPROVED = (self.best_score is None or self.better(watch_score, self.best_score))
        if self.IMPROVED:
            self.best_score = watch_score
            self.best_epoch = epoch
            self.patience = self.PATIENCE_LIMIT
        else:
            self.patience -= 1
            if self.patience <= 0:
                self.STOP = True
test.py (43 changed lines)
@@ -4,37 +4,41 @@ import quapy as qp
import quapy.functional as F
import sys
import numpy as np
from classification.neural import NeuralClassifierTrainer, CNNnet
from quapy.model_selection import GridSearchQ

#qp.datasets.fetch_reviews('hp')
#qp.datasets.fetch_twitter('sst')
qp.environ['SAMPLE_SIZE'] = 500

#sys.exit()
from model_selection import GridSearchQ

SAMPLE_SIZE=500
binary = False
sample_size = qp.environ['SAMPLE_SIZE']
binary = True
svmperf_home = './svm_perf_quantification'

if binary:
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=False, min_df=5)
    qp.data.preprocessing.index(dataset, inplace=True)

else:
    dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
    # dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)

print('dataset loaded')
print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')


# training a quantifier
learner = LogisticRegression(max_iter=1000)
# learner = LogisticRegression(max_iter=1000)
# model = qp.method.aggregative.ClassifyAndCount(learner)
model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
# model = qp.method.aggregative.SVMQ(svmperf_home, C=1)

if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier):
learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes))
print(learner.get_params())
model = qp.method.aggregative.QuaNet(learner, sample_size, device='cpu')

if qp.isbinary(model) and not qp.isbinary(dataset):
    model = qp.method.aggregative.OneVsAll(model)


@@ -42,8 +46,9 @@ if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier):
# ----------------------------------------------------------------------------

print(f'fitting model {model.__class__.__name__}')
train, val = dataset.training.split_stratified(0.6)
model.fit(train, val_split=val)
#train, val = dataset.training.split_stratified(0.6)
#model.fit(train, val_split=val)
model.fit(dataset.training)

# estimating class prevalences
print('quantifying')

@@ -69,9 +74,9 @@ print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence
      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
      f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')

true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints)

qp.error.SAMPLE_SIZE = SAMPLE_SIZE
qp.error.SAMPLE_SIZE = sample_size
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR:
    score = error(true_prev, estim_prev)

@@ -80,12 +85,12 @@ for error in qp.error.QUANTIFICATION_ERROR:

# Model selection and Evaluation according to the artificial sampling protocol
# ----------------------------------------------------------------------------

sys.exit(0)
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}

model_selection = GridSearchQ(model,
                              param_grid=param_grid,
                              sample_size=SAMPLE_SIZE,
                              sample_size=sample_size,
                              eval_budget=max_evaluations//10,
                              error='mae',
                              refit=True,

@@ -98,7 +103,7 @@ print(f'param scores:')
for params, score in model_selection.param_scores_.items():
    print(f'\t{params}: {score:.5f}')

true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints)

print(f'After model selection: Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR: