
QuaNet added, plus two example TextClassifier networks (CNN, LSTM)

Alejandro Moreo Fernandez 2020-12-29 20:33:59 +01:00
parent 3ec711c96e
commit d8e2f7556e
12 changed files with 746 additions and 35 deletions
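A minimal usage sketch of the new components, adapted from the test.py changes included in this commit (the dataset choice and hyperparameters are illustrative, not prescriptive):

import quapy as qp
from classification.neural import NeuralClassifierTrainer, CNNnet

qp.environ['SAMPLE_SIZE'] = 500

# load a binary review dataset as sequences of token indices
dataset = qp.datasets.fetch_reviews('kindle', tfidf=False, min_df=5)
qp.data.preprocessing.index(dataset, inplace=True)

# wrap the new CNN text classifier in a trainer and plug it into QuaNet
learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes))
model = qp.method.aggregative.QuaNet(learner, qp.environ['SAMPLE_SIZE'], device='cpu')
model.fit(dataset.training)
estim_prevalence = model.quantify(dataset.test.instances)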

View File

@ -1,8 +1,12 @@
Documentation with sphinx
Add quantification_report (akin to classification_report from sklearn)
Add optimization - artificial sampling
Add quantification_report (akin to classification_report from sklearn) (?)
Add NAE, NRAE
Add "measures for evaluating ordinal"?
Document methods with paper references
The parallel training in svmperf seems not to work (not sure...)
In binary quantification (hp, kindle, imdb) we used F1 for the minority class (which for kindle and hp happens to be the
negative class). This is not covered by this new implementation, in which the binary case is not treated as such, but as
an instance of single-label quantification with 2 labels. Check
Add classnames to LabelledCollection ?
Check the overhead in OneVsAll for SVMperf-based (?)

View File

@ -1,5 +1,20 @@
from .data import *
from . import data
from .data import datasets
from . import functional
from . import method
from . import error
from . import evaluation
from method.aggregative import isaggregative, isprobabilistic
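# package-wide configuration: default sample size plus the special-token conventions used by the indexing and neural modules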
environ = {
'SAMPLE_SIZE': None,
'UNK_TOKEN': '[UNK]',
'UNK_INDEX': 0,
'PAD_TOKEN': '[PAD]',
'PAD_INDEX': 1,
}
def isbinary(x):
return data.isbinary(x) or method.aggregative.isbinary(x)

View File

@ -0,0 +1,351 @@
import os
from abc import ABCMeta, abstractmethod
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from data import LabelledCollection
from util import EarlyStop
import quapy as qp
class NeuralClassifierTrainer:
def __init__(self,
net, # TextClassifierNet
lr=1e-3,
weight_decay=0,
patience=10,
epochs=200,
batch_size=64,
batch_size_test=512,
padding_length=300,
device='cpu',
checkpointpath='../checkpoint/classifier_net.dat'):
super().__init__()
assert isinstance(net, TextClassifierNet), f'net is not an instance of {TextClassifierNet.__name__}'
self.net = net
self.vocab_size = self.net.vocabulary_size
self.trainer_hyperparams={
'lr': lr,
'weight_decay': weight_decay,
'patience': patience,
'epochs': epochs,
'batch_size': batch_size,
'batch_size_test': batch_size_test,
'padding_length': padding_length,
'device': torch.device(device)
}
self.learner_hyperparams = self.net.get_params()
self.checkpointpath = checkpointpath
self.classes_ = np.asarray([0, 1])
print(f'[NeuralNetwork running on {device}]')
os.makedirs(Path(checkpointpath).parent, exist_ok=True)
def reset_net_params(self, vocab_size, n_classes):
self.net = self.net.__class__(vocab_size, n_classes, **self.learner_hyperparams)
self.net.xavier_uniform()
def get_params(self):
return {**self.net.get_params(), **self.trainer_hyperparams}
def set_params(self, **params):
trainer_hyperparams = self.trainer_hyperparams
learner_hyperparams = self.net.get_params()
for key, val in params.items():
if key in trainer_hyperparams and key in learner_hyperparams:
raise ValueError(f'the use of parameter {key} is ambiguous since it can refer to '
f'a parameter of the Trainer or of the learner {self.net.__class__.__name__}')
elif key not in trainer_hyperparams and key not in learner_hyperparams:
raise ValueError(f'parameter {key} is not valid')
if key in trainer_hyperparams:
trainer_hyperparams[key] = val
else:
learner_hyperparams[key] = val
self.trainer_hyperparams = trainer_hyperparams
self.learner_hyperparams = learner_hyperparams
@property
def device(self):
return next(self.net.parameters()).device
def __update_progress_bar(self, pbar):
pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={self.current_epoch} '
f'tr-loss={self.status["tr"]["loss"]:.5f} '
f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% '
f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% '
f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} '
f'val-loss={self.status["va"]["loss"]:.5f} '
f'val-acc={100 * self.status["va"]["acc"]:.2f}% '
f'val-macroF1={100 * self.status["va"]["f1"]:.2f}%')
def _train_epoch(self, data, status, pbar):
self.net.train()
criterion = torch.nn.CrossEntropyLoss()
losses, predictions, true_labels = [], [], []
for xi, yi in data:
self.optim.zero_grad()
logits = self.net.forward(xi)
loss = criterion(logits, yi)
loss.backward()
self.optim.step()
losses.append(loss.item())
preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1)
status["loss"] = np.mean(losses)
predictions.extend(preds.tolist())
true_labels.extend(yi.detach().cpu().numpy().tolist())
status["acc"] = accuracy_score(true_labels, predictions)
status["f1"] = f1_score(true_labels, predictions, average='macro')
self.__update_progress_bar(pbar)
def _test_epoch(self, data, status, pbar):
self.net.eval()
criterion = torch.nn.CrossEntropyLoss()
losses, predictions, true_labels = [], [], []
with torch.no_grad():
for xi, yi in data:
logits = self.net.forward(xi)
loss = criterion(logits, yi)
losses.append(loss.item())
preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1)
predictions.extend(preds.tolist())
true_labels.extend(yi.detach().cpu().numpy().tolist())
status["loss"] = np.mean(losses)
status["acc"] = accuracy_score(true_labels, predictions)
status["f1"] = f1_score(true_labels, predictions, average='macro')
self.__update_progress_bar(pbar)
def fit(self, instances, labels, val_split=0.3):
train, val = LabelledCollection(instances, labels).split_stratified(1-val_split)
opt = self.trainer_hyperparams
checkpoint = self.checkpointpath
self.reset_net_params(self.vocab_size, train.n_classes)
train_generator = TorchDataset(train.instances, train.labels).asDataloader(
opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])
valid_generator = TorchDataset(val.instances, val.labels).asDataloader(
opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device'])
self.status = {'tr': {'loss': -1, 'acc': -1, 'f1': -1},
'va': {'loss': -1, 'acc': -1, 'f1': -1}}
self.optim = torch.optim.Adam(self.net.parameters(), lr=opt['lr'], weight_decay=opt['weight_decay'])
self.early_stop = EarlyStop(opt['patience'], lower_is_better=False)
with tqdm(range(1, opt['epochs'] + 1)) as pbar:
for self.current_epoch in pbar:
self._train_epoch(train_generator, self.status['tr'], pbar)
self._test_epoch(valid_generator, self.status['va'], pbar)
self.early_stop(self.status['va']['f1'], self.current_epoch)
if self.early_stop.IMPROVED:
torch.save(self.net.state_dict(), checkpoint)
elif self.early_stop.STOP:
print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} '
f'for epoch {self.early_stop.best_epoch}')
self.net.load_state_dict(torch.load(checkpoint))
break
print('performing one training pass over the validation set...')
self._train_epoch(valid_generator, self.status['tr'], pbar)
print('[done]')
return self
def predict(self, instances):
return np.argmax(self.predict_proba(instances), axis=-1)
def predict_proba(self, instances):
return self.net.predict_proba(instances)
def predict_probability_positive(self, instances):
self.net.eval()
opt = self.trainer_hyperparams
with torch.no_grad():
positive_probs = []
for xi in TorchDataset(instances).asDataloader(
opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']):
positive_probs.append(self.net.predict_proba(xi))
return np.concatenate(positive_probs)
def transform(self, instances):
self.net.eval()
embeddings = []
with torch.no_grad():
for xi in TorchDataset(instances).asDataloader(
self.trainer_hyperparams['batch_size_test'], shuffle=False, pad_length=self.trainer_hyperparams['padding_length'], device=self.device):
embeddings.append(self.net.document_embedding(xi).detach().cpu().numpy())
return np.concatenate(embeddings)
class TorchDataset(torch.utils.data.Dataset):
def __init__(self, instances, labels=None):
self.instances = instances
self.labels = labels
def __len__(self):
return len(self.instances)
def __getitem__(self, index):
return {'doc': self.instances[index], 'label': self.labels[index] if self.labels is not None else None}
def asDataloader(self, batch_size, shuffle, pad_length, device):
def collate(batch):
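# truncate each document to pad_length tokens and pad the batch to a common length using the PAD index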
data = [torch.LongTensor(item['doc'][:pad_length]) for item in batch]
data = pad_sequence(data, batch_first=True, padding_value=qp.environ['PAD_INDEX']).to(device)
targets = [item['label'] for item in batch]
if targets[0] is None:
return data
else:
targets = torch.as_tensor(targets, dtype=torch.long).to(device)
return [data, targets]
torchDataset = TorchDataset(self.instances, self.labels)
return torch.utils.data.DataLoader(torchDataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)
class TextClassifierNet(torch.nn.Module, metaclass=ABCMeta):
@abstractmethod
def document_embedding(self, x): ...
def forward(self, x):
doc_embedded = self.document_embedding(x)
return self.output(doc_embedded)
def dimensions(self):
return self.dim
def predict_proba(self, x):
logits = self(x)
return torch.softmax(logits, dim=-1).detach().cpu().numpy()
def xavier_uniform(self):
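# (re)initialize every trainable parameter tensor with more than one dimension using Xavier-uniform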
for p in self.parameters():
if p.dim() > 1 and p.requires_grad:
torch.nn.init.xavier_uniform_(p)
@abstractmethod
def get_params(self): ...
@property
def vocabulary_size(self): ...
class LSTMnet(TextClassifierNet):
def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, lstm_nlayers=1,
drop_p=0.5):
super().__init__()
self.vocabulary_size_ = vocabulary_size
self.n_classes = n_classes
self.hyperparams={
'embedding_size': embedding_size,
'hidden_size': hidden_size,
'repr_size': repr_size,
'lstm_nlayers': lstm_nlayers,
'drop_p': drop_p
}
self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size)
self.lstm = torch.nn.LSTM(embedding_size, hidden_size, lstm_nlayers, dropout=drop_p, batch_first=True)
self.dropout = torch.nn.Dropout(drop_p)
self.dim = repr_size
self.doc_embedder = torch.nn.Linear(hidden_size, self.dim)
self.output = torch.nn.Linear(self.dim, n_classes)
def init_hidden(self, set_size):
opt = self.hyperparams
var_hidden = torch.zeros(opt['lstm_nlayers'], set_size, opt['hidden_size'])
var_cell = torch.zeros(opt['lstm_nlayers'], set_size, opt['hidden_size'])
if next(self.lstm.parameters()).is_cuda:
var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda()
return var_hidden, var_cell
def document_embedding(self, x):
embedded = self.word_embedding(x)
rnn_output, rnn_hidden = self.lstm(embedded, self.init_hidden(x.size()[0]))
abstracted = self.dropout(F.relu(rnn_hidden[0][-1]))
abstracted = self.doc_embedder(abstracted)
return abstracted
def get_params(self):
return self.hyperparams
@property
def vocabulary_size(self):
return self.vocabulary_size_
class CNNnet(TextClassifierNet):
def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100,
kernel_heights=[3, 5, 7], stride=1, padding=0, drop_p=0.5):
super(CNNnet, self).__init__()
self.vocabulary_size_ = vocabulary_size
self.n_classes = n_classes
self.hyperparams={
'embedding_size': embedding_size,
'hidden_size': hidden_size,
'repr_size': repr_size,
'kernel_heights':kernel_heights,
'stride': stride,
'padding': padding,
'drop_p': drop_p
}
self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size)
in_channels = 1
self.conv1 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[0], embedding_size), stride, padding)
self.conv2 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[1], embedding_size), stride, padding)
self.conv3 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[2], embedding_size), stride, padding)
self.dropout = nn.Dropout(drop_p)
self.dim = repr_size
self.doc_embedder = torch.nn.Linear(len(kernel_heights) * hidden_size, self.dim)
self.output = nn.Linear(self.dim, n_classes)
def conv_block(self, input, conv_layer):
conv_out = conv_layer(input) # conv_out.size() = (batch_size, out_channels, dim, 1)
activation = F.relu(conv_out.squeeze(3)) # activation.size() = (batch_size, out_channels, dim1)
max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # maxpool_out.size() = (batch_size, out_channels)
return max_out
def document_embedding(self, input):
input = self.word_embedding(input)
input = input.unsqueeze(1) # input.size() = (batch_size, 1, num_seq, embedding_length)
max_out1 = self.conv_block(input, self.conv1)
max_out2 = self.conv_block(input, self.conv2)
max_out3 = self.conv_block(input, self.conv3)
all_out = torch.cat((max_out1, max_out2, max_out3), 1) # all_out.size() = (batch_size, num_kernels*out_channels)
abstracted = self.dropout(F.relu(all_out)) # (batch_size, num_kernels*out_channels)
abstracted = self.doc_embedder(abstracted)
return abstracted
def get_params(self):
return self.hyperparams
@property
def vocabulary_size(self):
return self.vocabulary_size_

View File

@ -158,6 +158,16 @@ class Dataset:
test = LabelledCollection.load(test_path, loader_func)
return Dataset(training, test)
@property
def vocabulary_size(self):
return len(self.vocabulary)
def isbinary(data):
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
return data.binary
return False

View File

@ -5,6 +5,7 @@ from scipy.sparse import spmatrix
from util import parallelize
from .base import LabelledCollection
from tqdm import tqdm
import quapy as qp
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@ -114,6 +115,7 @@ class IndexTransformer:
"""
self.vect = CountVectorizer(**kwargs)
self.unk = -1 # a valid index is assigned after fit
self.pad = -2 # a valid index is assigned after fit
def fit(self, X):
"""
@ -123,12 +125,13 @@ class IndexTransformer:
self.vect.fit(X)
self.analyzer = self.vect.build_analyzer()
self.vocabulary_ = self.vect.vocabulary_
self.unk = self.add_word('UNK')
self.unk = self.add_word(qp.environ['UNK_TOKEN'], qp.environ['UNK_INDEX'])
self.pad = self.add_word(qp.environ['PAD_TOKEN'], qp.environ['PAD_INDEX'])
return self
def transform(self, X, n_jobs=-1):
# given the number of tasks and the number of jobs, generates the slices for the parallel threads
assert self.unk > 0, 'transform called before fit'
assert self.unk != -1, 'transform called before fit'
indexed = parallelize(func=self.index, args=X, n_jobs=n_jobs)
return np.asarray(indexed)
@ -142,9 +145,22 @@ class IndexTransformer:
def vocabulary_size(self):
return len(self.vocabulary_)
def add_word(self, word):
def add_word(self, word, id=None, nogaps=True):
if word in self.vocabulary_:
raise ValueError(f'word {word} already in dictionary')
self.vocabulary_[word] = len(self.vocabulary_)
if id is None:
# add the word with the next id
self.vocabulary_[word] = len(self.vocabulary_)
else:
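# a specific id was requested: if it is already taken, assign it to the new word and re-insert the displaced word at the next free position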
id2word = {id_:word_ for word_, id_ in self.vocabulary_.items()}
if id in id2word:
old_word = id2word[id]
self.vocabulary_[word] = id
del self.vocabulary_[old_word]
self.add_word(old_word)
elif nogaps:
if id > self.vocabulary_size()+1:
raise ValueError(f'word {word} added with id {id}, while the current vocabulary size '
f'is of {self.vocabulary_size()}, and id gaps are not allowed')
return self.vocabulary_[word]

View File

@ -1,11 +1,9 @@
from sklearn.metrics import f1_score
import numpy as np
import quapy as qp
SAMPLE_SIZE = None
def f1e(y_true, y_pred):
return 1. - f1_score(y_true, y_pred, average='macro')
@ -68,11 +66,12 @@ def smooth(p, eps):
def __check_eps(eps):
sample_size = qp.environ['SAMPLE_SIZE']
if eps is None:
if SAMPLE_SIZE is None:
raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set')
if sample_size is None:
raise ValueError('eps was not defined, and qp.environ["SAMPLE_SIZE"] was not set')
else:
eps = 1. / (2. * SAMPLE_SIZE)
eps = 1. / (2. * sample_size)
return eps

View File

@ -289,6 +289,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
converged = True
qs_prev_ = qs
s += 1
if not converged:
raise UserWarning('the method has reached the maximum number of iterations; it might not have converged')
@ -443,6 +444,10 @@ class OneVsAll(AggregativeQuantifier):
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
'predictions for each document (row) and class (columns)'
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
#prevalences = []
#for c in self.classes:
# prevalences.append(self._delayed_binary_aggregate(c, classif_predictions_bin))
#prevalences = np.asarray(prevalences)
return F.normalize_prevalence(prevalences)
def quantify(self, X, *args):
@ -477,4 +482,20 @@ class OneVsAll(AggregativeQuantifier):
def _delayed_binary_fit(self, c, data, **kwargs):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
self.dict_binary_quantifiers[c].fit(bindata, **kwargs)
def isaggregative(model):
return isinstance(model, AggregativeQuantifier)
def isprobabilistic(model):
return isinstance(model, AggregativeProbabilisticQuantifier)
def isbinary(model):
return isinstance(model, BinaryQuantifier)
from . import neural
QuaNet = neural.QuaNetTrainer

quapy/method/neural.py (new file, 267 lines)
View File

@ -0,0 +1,267 @@
import os
from pathlib import Path
import torch
from torch.nn import MSELoss
from torch.nn.functional import relu
from tqdm import tqdm
from method.aggregative import *
from util import EarlyStop
class QuaNetTrainer(BaseQuantifier):
def __init__(self,
learner,
sample_size,
n_epochs=500,
tr_iter_per_poch=200,
va_iter_per_poch=21,
lr=1e-3,
lstm_hidden_size=64,
lstm_nlayers=1,
ff_layers=[1024, 512],
bidirectional=True,
qdrop_p=0.5,
patience=10, checkpointpath='../checkpoint/quanet.dat', device='cuda'):
assert hasattr(learner, 'transform'), \
f'the learner {learner.__class__.__name__} does not seem to be able to produce document embeddings ' \
f'since it does not implement the method "transform"'
assert hasattr(learner, 'predict_proba'), \
f'the learner {learner.__class__.__name__} does not seem to be able to produce posterior probabilities ' \
f'since it does not implement the method "predict_proba"'
self.learner = learner
self.sample_size = sample_size
self.n_epochs = n_epochs
self.tr_iter = tr_iter_per_poch
self.va_iter = va_iter_per_poch
self.lr = lr
self.quanet_params = {
'lstm_hidden_size': lstm_hidden_size,
'lstm_nlayers': lstm_nlayers,
'ff_layers': ff_layers,
'bidirectional': bidirectional,
'qdrop_p': qdrop_p
}
self.patience = patience
self.checkpointpath = checkpointpath
os.makedirs(Path(checkpointpath).parent, exist_ok=True)
self.device = torch.device(device)
self.__check_params_colision(self.quanet_params, self.learner.get_params())
def fit(self, data: LabelledCollection, fit_learner=True, *args):
"""
:param data: the training data on which to train QuaNet. If fit_learner=True, the data will be split in
40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If
fit_learner=False, the data will be split in 66/34 for training QuaNet and validating it, respectively.
:param fit_learner: if true, trains the classifier on a split containing 40% of the data
:param args: unused
:return: self
"""
# split: 40% for training the classifier, 40% for training QuaNet, and 20% for validating QuaNet
self.learner, unused_data = \
training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, val_split=0.6)
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
# compute the posterior probabilities of the instances
valid_posteriors = self.learner.predict_proba(valid_data.instances)
train_posteriors = self.learner.predict_proba(train_data.instances)
# turn instances' indexes into embeddings
valid_data.instances = self.learner.transform(valid_data.instances)
train_data.instances = self.learner.transform(train_data.instances)
# estimate the hard and soft stats tpr and fpr of the classifier
self.tr_prev = data.prevalence()
self.quantifiers = [
ClassifyAndCount(self.learner).fit(data, fit_learner=False),
AdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
ProbabilisticClassifyAndCount(self.learner).fit(data, fit_learner=False),
ProbabilisticAdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
ExpectationMaximizationQuantifier(self.learner).fit(data, fit_learner=False),
]
self.status = {
'tr-loss': -1,
'va-loss': -1,
}
self.quanet = QuaNetModule(
doc_embedding_size=train_data.instances.shape[1],
n_classes=data.n_classes,
stats_size=len(self.quantifiers) * data.n_classes,
**self.quanet_params
).to(self.device)
self.optim = torch.optim.Adam(self.quanet.parameters(), lr=self.lr)
early_stop = EarlyStop(self.patience, lower_is_better=True)
checkpoint = self.checkpointpath
for epoch_i in range(1, self.n_epochs):
self.epoch(train_data, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True)
self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False)
early_stop(self.status['va-loss'], epoch_i)
if early_stop.IMPROVED:
torch.save(self.quanet.state_dict(), checkpoint)
elif early_stop.STOP:
print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} '
f'for epoch {early_stop.best_epoch}')
self.quanet.load_state_dict(torch.load(checkpoint))
self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=True)
break
return self
def get_aggregative_estims(self, posteriors):
label_predictions = np.argmax(posteriors, axis=-1)
prevs_estim = []
for quantifier in self.quantifiers:
predictions = posteriors if isprobabilistic(quantifier) else label_predictions
prevs_estim.append(quantifier.aggregate(predictions))
return np.asarray(prevs_estim).flatten()
def quantify(self, instances, *args):
posteriors = self.learner.predict_proba(instances)
embeddings = self.learner.transform(instances)
quant_estims = self.get_aggregative_estims(posteriors)
self.quanet.eval()
with torch.no_grad():
prevalence = self.quanet.forward(embeddings, posteriors, quant_estims).cpu().numpy().flatten()
return prevalence
def epoch(self, data: LabelledCollection, posteriors, iterations, epoch, early_stop, train):
mse_loss = MSELoss()
prevpoints = F.get_nprevpoints_approximation(iterations, self.quanet.n_classes)
self.quanet.train(mode=train)
losses = []
pbar = tqdm(data.artificial_sampling_index_generator(self.sample_size, prevpoints))
for it, index in enumerate(pbar):
sample_data = data.sampling_from_index(index)
sample_posteriors = posteriors[index]
quant_estims = self.get_aggregative_estims(sample_posteriors)
ptrue = torch.as_tensor([sample_data.prevalence()], dtype=torch.float, device=self.device)
if train:
self.optim.zero_grad()
phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims)
loss = mse_loss(phat, ptrue)
loss.backward()
self.optim.step()
else:
with torch.no_grad():
phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims)
loss = mse_loss(phat, ptrue)
losses.append(loss.item())
self.status['tr-loss' if train else 'va-loss'] = np.mean(losses[-10:])
pbar.set_description(f'[QuaNet][{"training" if train else "validating"}] '
f'epoch={epoch} [it={it}/{iterations}]\t'
f'tr-loss={self.status["tr-loss"]:.5f} '
f'val-loss={self.status["va-loss"]:.5f} '
f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')
def get_params(self, deep=True):
return {**self.learner.get_params(), **self.quanet_params}
def set_params(self, **parameters):
learner_params={}
for key, val in parameters.items():
if key in self.quanet_params:
self.quanet_params[key]=val
else:
learner_params[key] = val
self.learner.set_params(**learner_params)
def __check_params_colision(self, quanet_params, learner_params):
quanet_keys = set(quanet_params.keys())
learner_keys = set(learner_params.keys())
intersection = quanet_keys.intersection(learner_keys)
if len(intersection) > 0:
raise ValueError(f'the use of parameters {intersection} is ambiguous since these can refer to '
f'the parameters of QuaNet or the learner {self.learner.__class__.__name__}')
class QuaNetModule(torch.nn.Module):
def __init__(self,
doc_embedding_size,
n_classes,
stats_size,
lstm_hidden_size=64,
lstm_nlayers=1,
ff_layers=[1024, 512],
bidirectional=True,
qdrop_p=0.5,
order_by=None):
super().__init__()
self.n_classes = n_classes
self.order_by = order_by
self.hidden_size = lstm_hidden_size
self.nlayers = lstm_nlayers
self.bidirectional = bidirectional
self.ndirections = 2 if self.bidirectional else 1
self.qdrop_p = qdrop_p
self.lstm = torch.nn.LSTM(doc_embedding_size + n_classes, # +n_classes stands for the posterior probs. (concatenated)
lstm_hidden_size, lstm_nlayers, bidirectional=bidirectional,
dropout=qdrop_p, batch_first=True)
self.dropout = torch.nn.Dropout(self.qdrop_p)
lstm_output_size = self.hidden_size * self.ndirections
ff_input_size = lstm_output_size + stats_size
prev_size = ff_input_size
self.ff_layers = torch.nn.ModuleList()
for lin_size in ff_layers:
self.ff_layers.append(torch.nn.Linear(prev_size, lin_size))
prev_size = lin_size
self.output = torch.nn.Linear(prev_size, n_classes)
@property
def device(self):
return torch.device('cuda') if next(self.parameters()).is_cuda else torch.device('cpu')
def init_hidden(self):
directions = 2 if self.bidirectional else 1
var_hidden = torch.zeros(self.nlayers * directions, 1, self.hidden_size)
var_cell = torch.zeros(self.nlayers * directions, 1, self.hidden_size)
if next(self.lstm.parameters()).is_cuda:
var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda()
return var_hidden, var_cell
def forward(self, doc_embeddings, doc_posteriors, statistics):
device = self.device
doc_embeddings = torch.as_tensor(doc_embeddings, dtype=torch.float, device=device)
doc_posteriors = torch.as_tensor(doc_posteriors, dtype=torch.float, device=device)
statistics = torch.as_tensor(statistics, dtype=torch.float, device=device)
if self.order_by is not None:
order = torch.argsort(doc_posteriors[:, self.order_by])
doc_embeddings = doc_embeddings[order]
doc_posteriors = doc_posteriors[order]
embeded_posteriors = torch.cat((doc_embeddings, doc_posteriors), dim=-1)
# the entire set represents only one instance in quapy contexts, and so the batch_size=1
# the shape should be (1, number-of-instances, embedding-size + n_classes)
embeded_posteriors = embeded_posteriors.unsqueeze(0)
_, (rnn_hidden,_) = self.lstm(embeded_posteriors, self.init_hidden())
rnn_hidden = rnn_hidden.view(self.nlayers, self.ndirections, -1, self.hidden_size)
quant_embedding = rnn_hidden[0].view(-1)
quant_embedding = torch.cat((quant_embedding, statistics))
abstracted = quant_embedding.unsqueeze(0)
for linear in self.ff_layers:
abstracted = self.dropout(relu(linear(abstracted)))
logits = self.output(abstracted).view(1, -1)
prevalence = torch.softmax(logits, -1)
return prevalence
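For orientation, the inputs QuaNetModule.forward expects, as inferred from the code above (a sketch; the variable names are illustrative):

# doc_embeddings: (n_instances, doc_embedding_size) array produced by learner.transform
# doc_posteriors: (n_instances, n_classes) posteriors produced by learner.predict_proba
# statistics: flat vector of length len(quantifiers) * n_classes with the aggregative prevalence estimates
prevalence = quanet_module(doc_embeddings, doc_posteriors, statistics)  # shape (1, n_classes), softmax-normalized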

View File

@ -1,4 +1,4 @@
from quapy import LabelledCollection
from data import LabelledCollection
from .base import BaseQuantifier

View File

@ -86,7 +86,7 @@ class GridSearchQ:
self.n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, n_repetitions)
eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)
self.sout(f'setting n_prevpoints={self.n_prevpoints} so that the number of \n'
f'evaluations is {eval_computations} (<={eval_budget} eval_budget)')
f'evaluations ({eval_computations}) does not exceed the evaluation budget ({eval_budget})')
elif eval_budget is None:
self.n_prevpoints = n_prevpoints
eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)

View File

@ -75,3 +75,26 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args):
os.makedirs(str(Path(pickle_path).parent), exist_ok=True)
pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
return instance
class EarlyStop:
def __init__(self, patience, lower_is_better=True):
self.PATIENCE_LIMIT = patience
self.better = lambda a,b: a<b if lower_is_better else a>b
self.patience = patience
self.best_score = None
self.best_epoch = None
self.STOP = False
self.IMPROVED = False
def __call__(self, watch_score, epoch):
self.IMPROVED = (self.best_score is None or self.better(watch_score, self.best_score))
if self.IMPROVED:
self.best_score = watch_score
self.best_epoch = epoch
self.patience = self.PATIENCE_LIMIT
else:
self.patience -= 1
if self.patience <= 0:
self.STOP = True
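A minimal sketch of the intended call pattern, mirroring how the trainers in this commit use EarlyStop (the validation scores below are made up):

early_stop = EarlyStop(patience=3, lower_is_better=False)
for epoch, val_f1 in enumerate([0.71, 0.74, 0.73, 0.72, 0.70], start=1):  # made-up validation F1 scores
    early_stop(val_f1, epoch)
    if early_stop.IMPROVED:
        pass  # e.g., save a checkpoint of the current model
    elif early_stop.STOP:
        break  # the best epoch is available as early_stop.best_epoch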

test.py (43 changed lines)
View File

@ -4,37 +4,41 @@ import quapy as qp
import quapy.functional as F
import sys
import numpy as np
from classification.neural import NeuralClassifierTrainer, CNNnet
from quapy.model_selection import GridSearchQ
#qp.datasets.fetch_reviews('hp')
#qp.datasets.fetch_twitter('sst')
qp.environ['SAMPLE_SIZE'] = 500
#sys.exit()
from model_selection import GridSearchQ
SAMPLE_SIZE=500
binary = False
sample_size = qp.environ['SAMPLE_SIZE']
binary = True
svmperf_home = './svm_perf_quantification'
if binary:
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
dataset = qp.datasets.fetch_reviews('kindle', tfidf=False, min_df=5)
qp.data.preprocessing.index(dataset, inplace=True)
else:
dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
# dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
print('dataset loaded')
print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')
# training a quantifier
learner = LogisticRegression(max_iter=1000)
# learner = LogisticRegression(max_iter=1000)
# model = qp.method.aggregative.ClassifyAndCount(learner)
model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
# model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier):
learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes))
print(learner.get_params())
model = qp.method.aggregative.QuaNet(learner, sample_size, device='cpu')
if qp.isbinary(model) and not qp.isbinary(dataset):
model = qp.method.aggregative.OneVsAll(model)
@ -42,8 +46,9 @@ if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier):
# ----------------------------------------------------------------------------
print(f'fitting model {model.__class__.__name__}')
train, val = dataset.training.split_stratified(0.6)
model.fit(train, val_split=val)
#train, val = dataset.training.split_stratified(0.6)
#model.fit(train, val_split=val)
model.fit(dataset.training)
# estimating class prevalences
print('quantifying')
@ -69,9 +74,9 @@ print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence
f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints)
qp.error.SAMPLE_SIZE = SAMPLE_SIZE
qp.error.SAMPLE_SIZE = sample_size
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR:
score = error(true_prev, estim_prev)
@ -80,12 +85,12 @@ for error in qp.error.QUANTIFICATION_ERROR:
# Model selection and Evaluation according to the artificial sampling protocol
# ----------------------------------------------------------------------------
sys.exit(0)
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
model_selection = GridSearchQ(model,
param_grid=param_grid,
sample_size=SAMPLE_SIZE,
sample_size=sample_size,
eval_budget=max_evaluations//10,
error='mae',
refit=True,
@ -98,7 +103,7 @@ print(f'param scores:')
for params, score in model_selection.param_scores_.items():
print(f'\t{params}: {score:.5f}')
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints)
print(f'After model selection: Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR: