From d8e2f7556e32a502ee258145580fa87d3374a0d7 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Tue, 29 Dec 2020 20:33:59 +0100 Subject: [PATCH] QuaNet added, two examples of TextClassifiers added (CNN, LSTM) --- TODO.txt | 8 +- quapy/__init__.py | 17 +- quapy/classification/neural.py | 351 ++++++++++++++++++++++++++++++++ quapy/data/base.py | 10 + quapy/data/preprocessing.py | 24 ++- quapy/error.py | 11 +- quapy/method/aggregative.py | 23 ++- quapy/method/neural.py | 267 ++++++++++++++++++++++++ quapy/method/non_aggregative.py | 2 +- quapy/model_selection.py | 2 +- quapy/util.py | 23 +++ test.py | 43 ++-- 12 files changed, 746 insertions(+), 35 deletions(-) create mode 100644 quapy/classification/neural.py create mode 100644 quapy/method/neural.py diff --git a/TODO.txt b/TODO.txt index 7837c70..70b4c44 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,8 +1,12 @@ Documentation with sphinx -Add quantification_report (akin to classification_report from sklearn) -Add optimization - artificial sampling +Add quantification_report (akin to classification_report from sklearn) (?) Add NAE, NRAE Add "measures for evaluating ordinal"? Document methods with paper references The parallel training in svmperf seems not to work (not sure...) +In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the +negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as +an instance of single-label with 2 labels. Check +Add classnames to LabelledCollection ? +Check the overhead in OneVsAll for SVMperf-based (?) diff --git a/quapy/__init__.py b/quapy/__init__.py index 19dc14e..e324da1 100644 --- a/quapy/__init__.py +++ b/quapy/__init__.py @@ -1,5 +1,20 @@ -from .data import * +from . import data +from .data import datasets from . import functional from . import method from . import error from . import evaluation +from method.aggregative import isaggregative, isprobabilistic + + +environ = { + 'SAMPLE_SIZE': None, + 'UNK_TOKEN': '[UNK]', + 'UNK_INDEX': 0, + 'PAD_TOKEN': '[PAD]', + 'PAD_INDEX': 1, +} + + +def isbinary(x): + return data.isbinary(x) or method.aggregative.isbinary(x) \ No newline at end of file diff --git a/quapy/classification/neural.py b/quapy/classification/neural.py new file mode 100644 index 0000000..5f8038d --- /dev/null +++ b/quapy/classification/neural.py @@ -0,0 +1,351 @@ +import os +from abc import ABCMeta, abstractmethod +from pathlib import Path +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from sklearn.metrics import accuracy_score, f1_score +from torch.nn.utils.rnn import pad_sequence +from tqdm import tqdm +from data import LabelledCollection +from util import EarlyStop +import quapy as qp + + + +class NeuralClassifierTrainer: + + def __init__(self, + net, # TextClassifierNet + lr=1e-3, + weight_decay=0, + patience=10, + epochs=200, + batch_size=64, + batch_size_test=512, + padding_length=300, + device='cpu', + checkpointpath='../checkpoint/classifier_net.dat'): + + super().__init__() + + assert isinstance(net, TextClassifierNet), f'net is not an instance of {TextClassifierNet.__name__}' + self.net = net + self.vocab_size = self.net.vocabulary_size + self.trainer_hyperparams={ + 'lr': lr, + 'weight_decay': weight_decay, + 'patience': patience, + 'epochs': epochs, + 'batch_size': batch_size, + 'batch_size_test': batch_size_test, + 'padding_length': padding_length, + 'device': torch.device(device) + } + self.learner_hyperparams = self.net.get_params() + + self.checkpointpath = checkpointpath + self.classes_ = np.asarray([0, 1]) + + print(f'[NeuralNetwork running on {device}]') + os.makedirs(Path(checkpointpath).parent, exist_ok=True) + + def reset_net_params(self, vocab_size, n_classes): + self.net = self.net.__class__(vocab_size, n_classes, **self.learner_hyperparams) + self.net.xavier_uniform() + + def get_params(self): + return {**self.net.get_params(), **self.trainer_hyperparams} + + def set_params(self, **params): + trainer_hyperparams = self.trainer_hyperparams + learner_hyperparams = self.net.get_params() + for key, val in params.items(): + if key in trainer_hyperparams and key in learner_hyperparams: + raise ValueError(f'the use of parameter {key} is ambiguous since it can refer to ' + f'a parameters of the Trainer or the learner {self.netclass.__name__}') + elif key not in trainer_hyperparams and key not in learner_hyperparams: + raise ValueError(f'parameter {key} is not valid') + + if key in trainer_hyperparams: + trainer_hyperparams[key] = val + else: + learner_hyperparams[key] = val + + self.trainer_hyperparams = trainer_hyperparams + self.learner_hyperparams = learner_hyperparams + + @property + def device(self): + return next(self.net.parameters()).device + + def __update_progress_bar(self, pbar): + pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={self.current_epoch} ' + f'tr-loss={self.status["tr"]["loss"]:.5f} ' + f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% ' + f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% ' + f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} ' + f'val-loss={self.status["va"]["loss"]:.5f} ' + f'val-acc={100 * self.status["va"]["acc"]:.2f}% ' + f'macroF1={100 * self.status["va"]["f1"]:.2f}%') + + def _train_epoch(self, data, status, pbar): + self.net.train() + criterion = torch.nn.CrossEntropyLoss() + losses, predictions, true_labels = [], [], [] + for xi, yi in data: + self.optim.zero_grad() + logits = self.net.forward(xi) + loss = criterion(logits, yi) + loss.backward() + self.optim.step() + losses.append(loss.item()) + preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1) + + status["loss"] = np.mean(losses) + predictions.extend(preds.tolist()) + true_labels.extend(yi.detach().cpu().numpy().tolist()) + status["acc"] = accuracy_score(true_labels, predictions) + status["f1"] = f1_score(true_labels, predictions, average='macro') + self.__update_progress_bar(pbar) + + def _test_epoch(self, data, status, pbar): + self.net.eval() + criterion = torch.nn.CrossEntropyLoss() + losses, predictions, true_labels = [], [], [] + with torch.no_grad(): + for xi, yi in data: + logits = self.net.forward(xi) + loss = criterion(logits, yi) + losses.append(loss.item()) + preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1) + predictions.extend(preds.tolist()) + true_labels.extend(yi.detach().cpu().numpy().tolist()) + + status["loss"] = np.mean(losses) + status["acc"] = accuracy_score(true_labels, predictions) + status["f1"] = f1_score(true_labels, predictions, average='macro') + self.__update_progress_bar(pbar) + + def fit(self, instances, labels, val_split=0.3): + train, val = LabelledCollection(instances, labels).split_stratified(1-val_split) + opt = self.trainer_hyperparams + checkpoint = self.checkpointpath + self.reset_net_params(self.vocab_size, train.n_classes) + + train_generator = TorchDataset(train.instances, train.labels).asDataloader( + opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device']) + valid_generator = TorchDataset(val.instances, val.labels).asDataloader( + opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']) + + self.status = {'tr': {'loss': -1, 'acc': -1, 'f1': -1}, + 'va': {'loss': -1, 'acc': -1, 'f1': -1}} + + self.optim = torch.optim.Adam(self.net.parameters(), lr=opt['lr'], weight_decay=opt['weight_decay']) + self.early_stop = EarlyStop(opt['patience'], lower_is_better=False) + + with tqdm(range(1, opt['epochs'] + 1)) as pbar: + for self.current_epoch in pbar: + self._train_epoch(train_generator, self.status['tr'], pbar) + self._test_epoch(valid_generator, self.status['va'], pbar) + + self.early_stop(self.status['va']['f1'], self.current_epoch) + if self.early_stop.IMPROVED: + torch.save(self.net.state_dict(), checkpoint) + elif self.early_stop.STOP: + print(f'training ended by patience exhasted; loading best model parameters in {checkpoint} ' + f'for epoch {self.early_stop.best_epoch}') + self.net.load_state_dict(torch.load(checkpoint)) + break + + print('performing one training pass over the validation set...') + self._train_epoch(valid_generator, self.status['tr'], pbar) + print('[done]') + + return self + + def predict(self, instances): + return np.argmax(self.predict_proba(instances), axis=-1) + + def predict_proba(self, instances): + return self.net.predict_proba(instances) + + def predict_probability_positive(self, instances): + self.net.eval() + opt = self.trainer_hyperparams + with torch.no_grad(): + positive_probs = [] + for xi in TorchDataset(instances).asDataloader( + opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']): + positive_probs.append(self.net.predict_proba(xi)) + return np.concatenate(positive_probs) + + def transform(self, instances): + self.net.eval() + embeddings = [] + with torch.no_grad(): + for xi in TorchDataset(instances).asDataloader( + self.batch_size_test, shuffle=False, pad_length=self.padding_length, device=self.device): + embeddings.append(self.net.document_embedding(xi).detach().cpu().numpy()) + return np.concatenate(embeddings) + + +class TorchDataset(torch.utils.data.Dataset): + + def __init__(self, instances, labels=None): + self.instances = instances + self.labels = labels + + def __len__(self): + return len(self.instances) + + def __getitem__(self, index): + return {'doc': self.instances[index], 'label': self.labels[index] if self.labels is not None else None} + + def asDataloader(self, batch_size, shuffle, pad_length, device): + def collate(batch): + data = [torch.LongTensor(item['doc'][:pad_length]) for item in batch] + data = pad_sequence(data, batch_first=True, padding_value=qp.environ['PAD_INDEX']).to(device) + targets = [item['label'] for item in batch] + if targets[0] is None: + return data + else: + targets = torch.as_tensor(targets, dtype=torch.long).to(device) + return [data, targets] + + torchDataset = TorchDataset(self.instances, self.labels) + return torch.utils.data.DataLoader(torchDataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate) + + +class TextClassifierNet(torch.nn.Module, metaclass=ABCMeta): + + @abstractmethod + def document_embedding(self, x): ... + + def forward(self, x): + doc_embedded = self.document_embedding(x) + return self.output(doc_embedded) + + def dimensions(self): + return self.dim + + def predict_proba(self, x): + logits = self(x) + return torch.softmax(logits).detach().cpu().numpy() + + def xavier_uniform(self): + for p in self.parameters(): + if p.dim() > 1 and p.requires_grad: + torch.nn.init.xavier_uniform_(p) + + @abstractmethod + def get_params(self): ... + + @property + def vocabulary_size(self): ... + + +class LSTMnet(TextClassifierNet): + + def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, lstm_nlayers=1, + drop_p=0.5): + super().__init__() + self.vocabulary_size_ = vocabulary_size + self.n_classes = n_classes + self.hyperparams={ + 'embedding_size': embedding_size, + 'hidden_size': hidden_size, + 'repr_size': repr_size, + 'lstm_nlayers': lstm_nlayers, + 'drop_p': drop_p + } + + self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size) + self.lstm = torch.nn.LSTM(embedding_size, hidden_size, lstm_nlayers, dropout=drop_p, batch_first=True) + self.dropout = torch.nn.Dropout(drop_p) + + self.dim = repr_size + self.doc_embedder = torch.nn.Linear(hidden_size, self.dim) + self.output = torch.nn.Linear(self.dim, n_classes) + + def init_hidden(self, set_size): + opt = self.hyperparams + var_hidden = torch.zeros(opt['lstm_nlayers'], set_size, opt['lstm_hidden_size']) + var_cell = torch.zeros(opt['lstm_nlayers'], set_size, opt['lstm_hidden_size']) + if next(self.lstm.parameters()).is_cuda: + var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda() + return var_hidden, var_cell + + def document_embedding(self, x): + embedded = self.word_embedding(x) + rnn_output, rnn_hidden = self.lstm(embedded, self.init_hidden(x.size()[0])) + abstracted = self.dropout(F.relu(rnn_hidden[0][-1])) + abstracted = self.doc_embedder(abstracted) + return abstracted + + def get_params(self): + return self.hyperparams + + @property + def vocabulary_size(self): + return self.vocabulary_size_ + + +class CNNnet(TextClassifierNet): + + def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, + kernel_heights=[3, 5, 7], stride=1, padding=0, drop_p=0.5): + super(CNNnet, self).__init__() + + self.vocabulary_size_ = vocabulary_size + self.n_classes = n_classes + self.hyperparams={ + 'embedding_size': embedding_size, + 'hidden_size': hidden_size, + 'repr_size': repr_size, + 'kernel_heights':kernel_heights, + 'stride': stride, + 'drop_p': drop_p + } + self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size) + in_channels = 1 + self.conv1 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[0], embedding_size), stride, padding) + self.conv2 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[1], embedding_size), stride, padding) + self.conv3 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[2], embedding_size), stride, padding) + self.dropout = nn.Dropout(drop_p) + + self.dim = repr_size + self.doc_embedder = torch.nn.Linear(len(kernel_heights) * hidden_size, self.dim) + self.output = nn.Linear(self.dim, n_classes) + + def conv_block(self, input, conv_layer): + conv_out = conv_layer(input) # conv_out.size() = (batch_size, out_channels, dim, 1) + activation = F.relu(conv_out.squeeze(3)) # activation.size() = (batch_size, out_channels, dim1) + max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # maxpool_out.size() = (batch_size, out_channels) + return max_out + + def document_embedding(self, input): + input = self.word_embedding(input) + input = input.unsqueeze(1) # input.size() = (batch_size, 1, num_seq, embedding_length) + + max_out1 = self.conv_block(input, self.conv1) + max_out2 = self.conv_block(input, self.conv2) + max_out3 = self.conv_block(input, self.conv3) + + all_out = torch.cat((max_out1, max_out2, max_out3), 1) # all_out.size() = (batch_size, num_kernels*out_channels) + abstracted = self.dropout(F.relu(all_out)) # (batch_size, num_kernels*out_channels) + abstracted = self.doc_embedder(abstracted) + return abstracted + + def get_params(self): + return self.hyperparams + + @property + def vocabulary_size(self): + return self.vocabulary_size_ + + + + + diff --git a/quapy/data/base.py b/quapy/data/base.py index eec4d80..6879a75 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -158,6 +158,16 @@ class Dataset: test = LabelledCollection.load(test_path, loader_func) return Dataset(training, test) + @property + def vocabulary_size(self): + return len(self.vocabulary) + + +def isbinary(data): + if isinstance(data, Dataset) or isinstance(data, LabelledCollection): + return data.binary + return False + diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py index 3c1c6c1..6206be0 100644 --- a/quapy/data/preprocessing.py +++ b/quapy/data/preprocessing.py @@ -5,6 +5,7 @@ from scipy.sparse import spmatrix from util import parallelize from .base import LabelledCollection from tqdm import tqdm +import quapy as qp def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs): @@ -114,6 +115,7 @@ class IndexTransformer: """ self.vect = CountVectorizer(**kwargs) self.unk = -1 # a valid index is assigned after fit + self.pad = -2 # a valid index is assigned after fit def fit(self, X): """ @@ -123,12 +125,13 @@ class IndexTransformer: self.vect.fit(X) self.analyzer = self.vect.build_analyzer() self.vocabulary_ = self.vect.vocabulary_ - self.unk = self.add_word('UNK') + self.unk = self.add_word(qp.environ['UNK_TOKEN'], qp.environ['UNK_INDEX']) + self.pad = self.add_word(qp.environ['PAD_TOKEN'], qp.environ['PAD_INDEX']) return self def transform(self, X, n_jobs=-1): # given the number of tasks and the number of jobs, generates the slices for the parallel threads - assert self.unk > 0, 'transform called before fit' + assert self.unk != -1, 'transform called before fit' indexed = parallelize(func=self.index, args=X, n_jobs=n_jobs) return np.asarray(indexed) @@ -142,9 +145,22 @@ class IndexTransformer: def vocabulary_size(self): return len(self.vocabulary_) - def add_word(self, word): + def add_word(self, word, id=None, nogaps=True): if word in self.vocabulary_: raise ValueError(f'word {word} already in dictionary') - self.vocabulary_[word] = len(self.vocabulary_) + if id is None: + # add the word with the next id + self.vocabulary_[word] = len(self.vocabulary_) + else: + id2word = {id_:word_ for word_, id_ in self.vocabulary_.items()} + if id in id2word: + old_word = id2word[id] + self.vocabulary_[word] = id + del self.vocabulary_[old_word] + self.add_word(old_word) + elif nogaps: + if id > self.vocabulary_size()+1: + raise ValueError(f'word {word} added with id {id}, while the current vocabulary size ' + f'is of {self.vocabulary_size()}, and id gaps are not allowed') return self.vocabulary_[word] diff --git a/quapy/error.py b/quapy/error.py index bd7adf2..fe6f3ec 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -1,11 +1,9 @@ from sklearn.metrics import f1_score import numpy as np +import quapy as qp -SAMPLE_SIZE = None - - def f1e(y_true, y_pred): return 1. - f1_score(y_true, y_pred, average='macro') @@ -68,11 +66,12 @@ def smooth(p, eps): def __check_eps(eps): + sample_size = qp.environ['SAMPLE_SIZE'] if eps is None: - if SAMPLE_SIZE is None: - raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set') + if sample_size is None: + raise ValueError('eps was not defined, and qp.environ["SAMPLE_SIZE"] was not set') else: - eps = 1. / (2. * SAMPLE_SIZE) + eps = 1. / (2. * sample_size) return eps diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 3b18090..8d18739 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -289,6 +289,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier): converged = True qs_prev_ = qs + s += 1 if not converged: raise UserWarning('the method has reached the maximum number of iterations; it might have not converged') @@ -443,6 +444,10 @@ class OneVsAll(AggregativeQuantifier): 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \ 'predictions for each document (row) and class (columns)' prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin) + #prevalences = [] + #for c in self.classes: + # prevalences.append(self._delayed_binary_aggregate(c, classif_predictions_bin)) + #prevalences = np.asarray(prevalences) return F.normalize_prevalence(prevalences) def quantify(self, X, *args): @@ -477,4 +482,20 @@ class OneVsAll(AggregativeQuantifier): def _delayed_binary_fit(self, c, data, **kwargs): bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) - self.dict_binary_quantifiers[c].fit(bindata, **kwargs) \ No newline at end of file + self.dict_binary_quantifiers[c].fit(bindata, **kwargs) + + +def isaggregative(model): + return isinstance(model, AggregativeQuantifier) + + +def isprobabilistic(model): + return isinstance(model, AggregativeProbabilisticQuantifier) + + +def isbinary(model): + return isinstance(model, BinaryQuantifier) + + +from . import neural +QuaNet = neural.QuaNetTrainer \ No newline at end of file diff --git a/quapy/method/neural.py b/quapy/method/neural.py new file mode 100644 index 0000000..0b4ff65 --- /dev/null +++ b/quapy/method/neural.py @@ -0,0 +1,267 @@ +import os +from pathlib import Path +import torch +from torch.nn import MSELoss +from torch.nn.functional import relu +from tqdm import tqdm +from method.aggregative import * +from util import EarlyStop + + +class QuaNetTrainer(BaseQuantifier): + + def __init__(self, + learner, + sample_size, + n_epochs=500, + tr_iter_per_poch=200, + va_iter_per_poch=21, + lr=1e-3, + lstm_hidden_size=64, + lstm_nlayers=1, + ff_layers=[1024, 512], + bidirectional=True, + qdrop_p=0.5, + patience=10, checkpointpath='../checkpoint/quanet.dat', device='cuda'): + assert hasattr(learner, 'transform'), \ + f'the learner {learner.__class__.__name__} does not seem to be able to produce document embeddings ' \ + f'since it does not implement the method "transform"' + assert hasattr(learner, 'predict_proba'), \ + f'the learner {learner.__class__.__name__} does not seem to be able to produce posterior probabilities ' \ + f'since it does not implement the method "predict_proba"' + self.learner = learner + self.sample_size = sample_size + self.n_epochs = n_epochs + self.tr_iter = tr_iter_per_poch + self.va_iter = va_iter_per_poch + self.lr = lr + self.quanet_params = { + 'lstm_hidden_size': lstm_hidden_size, + 'lstm_nlayers': lstm_nlayers, + 'ff_layers': ff_layers, + 'bidirectional': bidirectional, + 'qdrop_p': qdrop_p + } + + self.patience = patience + self.checkpointpath = checkpointpath + os.makedirs(Path(checkpointpath).parent, exist_ok=True) + self.device = torch.device(device) + + self.__check_params_colision(self.quanet_params, self.learner.get_params()) + + def fit(self, data: LabelledCollection, fit_learner=True, *args): + """ + :param data: the training data on which to train QuaNet. If fit_learner=True, the data will be split in + 40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If + fit_learner=False, the data will be split in 66/34 for training QuaNet and validating it, respectively. + :param fit_learner: if true, trains the classifier on a split containing 40% of the data + :param args: unused + :return: self + """ + # split: 40% for training classification, 40% for training quapy, and 20% for validating quapy + self.learner, unused_data = \ + training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, val_split=0.6) + train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20% + + # compute the posterior probabilities of the instances + valid_posteriors = self.learner.predict_proba(valid_data.instances) + train_posteriors = self.learner.predict_proba(train_data.instances) + + # turn instances' indexes into embeddings + valid_data.instances = self.learner.transform(valid_data.instances) + train_data.instances = self.learner.transform(train_data.instances) + + # estimate the hard and soft stats tpr and fpr of the classifier + self.tr_prev = data.prevalence() + + self.quantifiers = [ + ClassifyAndCount(self.learner).fit(data, fit_learner=False), + AdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False), + ProbabilisticClassifyAndCount(self.learner).fit(data, fit_learner=False), + ProbabilisticAdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False), + ExpectationMaximizationQuantifier(self.learner).fit(data, fit_learner=False), + ] + + self.status = { + 'tr-loss': -1, + 'va-loss': -1, + } + + self.quanet = QuaNetModule( + doc_embedding_size=train_data.instances.shape[1], + n_classes=data.n_classes, + stats_size=len(self.quantifiers) * data.n_classes, + **self.quanet_params + ).to(self.device) + + self.optim = torch.optim.Adam(self.quanet.parameters(), lr=self.lr) + early_stop = EarlyStop(self.patience, lower_is_better=True) + + checkpoint = self.checkpointpath + + for epoch_i in range(1, self.n_epochs): + self.epoch(train_data, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True) + self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False) + + early_stop(self.status['va-loss'], epoch_i) + if early_stop.IMPROVED: + torch.save(self.quanet.state_dict(), checkpoint) + elif early_stop.STOP: + print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} ' + f'for epoch {early_stop.best_epoch}') + self.quanet.load_state_dict(torch.load(checkpoint)) + self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=True) + break + + return self + + def get_aggregative_estims(self, posteriors): + label_predictions = np.argmax(posteriors, axis=-1) + prevs_estim = [] + for quantifier in self.quantifiers: + predictions = posteriors if isprobabilistic(quantifier) else label_predictions + prevs_estim.append(quantifier.aggregate(predictions)) + return np.asarray(prevs_estim).flatten() + + def quantify(self, instances, *args): + posteriors = self.learner.predict_proba(instances) + embeddings = self.learner.transform(instances) + quant_estims = self.get_aggregative_estims(posteriors) + self.quanet.eval() + with torch.no_grad(): + prevalence = self.quanet.forward(embeddings, posteriors, quant_estims).item() + return prevalence + + def epoch(self, data: LabelledCollection, posteriors, iterations, epoch, early_stop, train): + mse_loss = MSELoss() + prevpoints = F.get_nprevpoints_approximation(iterations, self.quanet.n_classes) + + self.quanet.train(mode=train) + losses = [] + pbar = tqdm(data.artificial_sampling_index_generator(self.sample_size, prevpoints)) + for it, index in enumerate(pbar): + sample_data = data.sampling_from_index(index) + sample_posteriors = posteriors[index] + quant_estims = self.get_aggregative_estims(sample_posteriors) + ptrue = torch.as_tensor([sample_data.prevalence()], dtype=torch.float, device=self.device) + if train: + self.optim.zero_grad() + phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims) + loss = mse_loss(phat, ptrue) + loss.backward() + self.optim.step() + else: + with torch.no_grad(): + phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims) + loss = mse_loss(phat, ptrue) + + losses.append(loss.item()) + + self.status['tr-loss' if train else 'va-loss'] = np.mean(losses[-10:]) + pbar.set_description(f'[QuaNet][{"training" if train else "validating"}] ' + f'epoch={epoch} [it={it}/{iterations}]\t' + f'tr-loss={self.status["tr-loss"]:.5f} ' + f'val-loss={self.status["va-loss"]:.5f} ' + f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}') + + def get_params(self, deep=True): + return {**self.learner.get_params(), **self.quanet_params} + + def set_params(self, **parameters): + learner_params={} + for key, val in parameters: + if key in self.quanet_params: + self.quanet_params[key]=val + else: + learner_params[key] = val + self.learner.set_params(**learner_params) + + def __check_params_colision(self, quanet_params, learner_params): + quanet_keys = set(quanet_params.keys()) + learner_keys = set(learner_params.keys()) + intersection = quanet_keys.intersection(learner_keys) + if len(intersection) > 0: + raise ValueError(f'the use of parameters {intersection} is ambiguous sine those can refer to ' + f'the parameters of QuaNet or the learner {self.learner.__class__.__name__}') + + +class QuaNetModule(torch.nn.Module): + def __init__(self, + doc_embedding_size, + n_classes, + stats_size, + lstm_hidden_size=64, + lstm_nlayers=1, + ff_layers=[1024, 512], + bidirectional=True, + qdrop_p=0.5, + order_by=None): + super().__init__() + + self.n_classes = n_classes + self.order_by = order_by + self.hidden_size = lstm_hidden_size + self.nlayers = lstm_nlayers + self.bidirectional = bidirectional + self.ndirections = 2 if self.bidirectional else 1 + self.qdrop_p = qdrop_p + self.lstm = torch.nn.LSTM(doc_embedding_size + n_classes, # +n_classes stands for the posterior probs. (concatenated) + lstm_hidden_size, lstm_nlayers, bidirectional=bidirectional, + dropout=qdrop_p, batch_first=True) + self.dropout = torch.nn.Dropout(self.qdrop_p) + + lstm_output_size = self.hidden_size * self.ndirections + ff_input_size = lstm_output_size + stats_size + prev_size = ff_input_size + self.ff_layers = torch.nn.ModuleList() + for lin_size in ff_layers: + self.ff_layers.append(torch.nn.Linear(prev_size, lin_size)) + prev_size = lin_size + self.output = torch.nn.Linear(prev_size, n_classes) + + @property + def device(self): + return torch.device('cuda') if next(self.parameters()).is_cuda else torch.device('cpu') + + def init_hidden(self): + directions = 2 if self.bidirectional else 1 + var_hidden = torch.zeros(self.nlayers * directions, 1, self.hidden_size) + var_cell = torch.zeros(self.nlayers * directions, 1, self.hidden_size) + if next(self.lstm.parameters()).is_cuda: + var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda() + return var_hidden, var_cell + + def forward(self, doc_embeddings, doc_posteriors, statistics): + device = self.device + doc_embeddings = torch.as_tensor(doc_embeddings, dtype=torch.float, device=device) + doc_posteriors = torch.as_tensor(doc_posteriors, dtype=torch.float, device=device) + statistics = torch.as_tensor(statistics, dtype=torch.float, device=device) + + if self.order_by is not None: + order = torch.argsort(doc_posteriors[:, self.order_by]) + doc_embeddings = doc_embeddings[order] + doc_posteriors = doc_posteriors[order] + + embeded_posteriors = torch.cat((doc_embeddings, doc_posteriors), dim=-1) + + # the entire set represents only one instance in quapy contexts, and so the batch_size=1 + # the shape should be (1, number-of-instances, embedding-size + 1) + embeded_posteriors = embeded_posteriors.unsqueeze(0) + + _, (rnn_hidden,_) = self.lstm(embeded_posteriors, self.init_hidden()) + rnn_hidden = rnn_hidden.view(self.nlayers, self.ndirections, -1, self.hidden_size) + quant_embedding = rnn_hidden[0].view(-1) + quant_embedding = torch.cat((quant_embedding, statistics)) + + abstracted = quant_embedding.unsqueeze(0) + for linear in self.ff_layers: + abstracted = self.dropout(relu(linear(abstracted))) + + logits = self.output(abstracted).view(1, -1) + prevalence = torch.softmax(logits, -1) + + return prevalence + + + diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py index 1bd7c89..a23ffce 100644 --- a/quapy/method/non_aggregative.py +++ b/quapy/method/non_aggregative.py @@ -1,4 +1,4 @@ -from quapy import LabelledCollection +from data import LabelledCollection from .base import BaseQuantifier diff --git a/quapy/model_selection.py b/quapy/model_selection.py index ba4187a..227987a 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -86,7 +86,7 @@ class GridSearchQ: self.n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, n_repetitions) eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions) self.sout(f'setting n_prevpoints={self.n_prevpoints} so that the number of \n' - f'evaluations is {eval_computations} (<={eval_budget} eval_budget)') + f'evaluations ({eval_computations}) does not exceed the evaluation budget ({eval_budget})') elif eval_budget is None: self.n_prevpoints = n_prevpoints eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions) diff --git a/quapy/util.py b/quapy/util.py index 899519a..bdfbfb9 100644 --- a/quapy/util.py +++ b/quapy/util.py @@ -75,3 +75,26 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args): os.makedirs(str(Path(pickle_path).parent), exist_ok=True) pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL) return instance + + +class EarlyStop: + + def __init__(self, patience, lower_is_better=True): + self.PATIENCE_LIMIT = patience + self.better = lambda a,b: ab + self.patience = patience + self.best_score = None + self.best_epoch = None + self.STOP = False + self.IMPROVED = False + + def __call__(self, watch_score, epoch): + self.IMPROVED = (self.best_score is None or self.better(watch_score, self.best_score)) + if self.IMPROVED: + self.best_score = watch_score + self.best_epoch = epoch + self.patience = self.PATIENCE_LIMIT + else: + self.patience -= 1 + if self.patience <= 0: + self.STOP = True \ No newline at end of file diff --git a/test.py b/test.py index 0ade026..e8d9ebc 100644 --- a/test.py +++ b/test.py @@ -4,37 +4,41 @@ import quapy as qp import quapy.functional as F import sys import numpy as np +from classification.neural import NeuralClassifierTrainer, CNNnet +from quapy.model_selection import GridSearchQ -#qp.datasets.fetch_reviews('hp') -#qp.datasets.fetch_twitter('sst') +qp.environ['SAMPLE_SIZE'] = 500 -#sys.exit() -from model_selection import GridSearchQ - -SAMPLE_SIZE=500 -binary = False +sample_size = qp.environ['SAMPLE_SIZE'] +binary = True svmperf_home = './svm_perf_quantification' if binary: - dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5) + dataset = qp.datasets.fetch_reviews('kindle', tfidf=False, min_df=5) + qp.data.preprocessing.index(dataset, inplace=True) else: dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True) # dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3) -print('dataset loaded') +print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}') + # training a quantifier -learner = LogisticRegression(max_iter=1000) +# learner = LogisticRegression(max_iter=1000) # model = qp.method.aggregative.ClassifyAndCount(learner) -model = qp.method.aggregative.AdjustedClassifyAndCount(learner) +# model = qp.method.aggregative.AdjustedClassifyAndCount(learner) # model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner) # model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner) # model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner) # model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100) # model = qp.method.aggregative.SVMQ(svmperf_home, C=1) -if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier): +learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes)) +print(learner.get_params()) +model = qp.method.aggregative.QuaNet(learner, sample_size, device='cpu') + +if qp.isbinary(model) and not qp.isbinary(dataset): model = qp.method.aggregative.OneVsAll(model) @@ -42,8 +46,9 @@ if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier): # ---------------------------------------------------------------------------- print(f'fitting model {model.__class__.__name__}') -train, val = dataset.training.split_stratified(0.6) -model.fit(train, val_split=val) +#train, val = dataset.training.split_stratified(0.6) +#model.fit(train, val_split=val) +model.fit(dataset.training) # estimating class prevalences print('quantifying') @@ -69,9 +74,9 @@ print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n' f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.') -true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints) +true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints) -qp.error.SAMPLE_SIZE = SAMPLE_SIZE +qp.error.SAMPLE_SIZE = sample_size print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)') for error in qp.error.QUANTIFICATION_ERROR: score = error(true_prev, estim_prev) @@ -80,12 +85,12 @@ for error in qp.error.QUANTIFICATION_ERROR: # Model selection and Evaluation according to the artificial sampling protocol # ---------------------------------------------------------------------------- - +sys.exit(0) param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]} model_selection = GridSearchQ(model, param_grid=param_grid, - sample_size=SAMPLE_SIZE, + sample_size=sample_size, eval_budget=max_evaluations//10, error='mae', refit=True, @@ -98,7 +103,7 @@ print(f'param scores:') for params, score in model_selection.param_scores_.items(): print(f'\t{params}: {score:.5f}') -true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints) +true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints) print(f'After model selection: Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)') for error in qp.error.QUANTIFICATION_ERROR: