forked from moreo/QuaPy

QuaNet added, two examples of TextClassifiers added (CNN, LSTM)

commit d8e2f7556e (parent 3ec711c96e)
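The diff below adds a neural text-classification module (CNN and LSTM networks wrapped by NeuralClassifierTrainer) and a QuaNet quantifier trained on top of it. The following is a minimal usage sketch assembled from the test.py changes in this commit; the final quantify call and its variable name are illustrative assumptions, and the import paths assume the quapy package directory is on the path, as test.py does.

    import quapy as qp
    from classification.neural import NeuralClassifierTrainer, CNNnet

    qp.environ['SAMPLE_SIZE'] = 500

    # QuaNet works on word indices (not tfidf vectors), so the dataset is indexed first
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=False, min_df=5)
    qp.data.preprocessing.index(dataset, inplace=True)

    # a CNN classifier trained by NeuralClassifierTrainer, then wrapped by QuaNet
    learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes))
    model = qp.method.aggregative.QuaNet(learner, qp.environ['SAMPLE_SIZE'], device='cpu')
    model.fit(dataset.training)
    estim_prevalence = model.quantify(dataset.test.instances)  # assumed call; the visible hunks stop at fitting/evaluation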
TODO.txt (8 changed lines)
@@ -1,8 +1,12 @@
Documentation with sphinx
Add quantification_report (akin to classification_report from sklearn)
Add optimization - artificial sampling
Add quantification_report (akin to classification_report from sklearn) (?)
Add NAE, NRAE
Add "measures for evaluating ordinal"?
Document methods with paper references
The parallel training in svmperf seems not to work (not sure...)
In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the
negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as
an instance of single-label with 2 labels. Check
Add classnames to LabelledCollection ?
Check the overhead in OneVsAll for SVMperf-based (?)

@@ -1,5 +1,20 @@
from .data import *
from . import data
from .data import datasets
from . import functional
from . import method
from . import error
from . import evaluation
from method.aggregative import isaggregative, isprobabilistic


environ = {
    'SAMPLE_SIZE': None,
    'UNK_TOKEN': '[UNK]',
    'UNK_INDEX': 0,
    'PAD_TOKEN': '[PAD]',
    'PAD_INDEX': 1,
}


def isbinary(x):
    return data.isbinary(x) or method.aggregative.isbinary(x)
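The new qp.environ dictionary centralises package-wide settings that other hunks in this commit read directly: __check_eps in error.py falls back to environ['SAMPLE_SIZE'], and the padding collate function in the classification module uses environ['PAD_INDEX']. A minimal sketch of the expected caller-side setup (assuming the package is importable as quapy):

    import quapy as qp

    # must be set before evaluation-time error smoothing is used; otherwise
    # error.__check_eps raises a ValueError (see the error.py hunk below)
    qp.environ['SAMPLE_SIZE'] = 500

    # indexing conventions assumed by the padding collate function
    print(qp.environ['PAD_TOKEN'], qp.environ['PAD_INDEX'])  # [PAD] 1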
@ -0,0 +1,351 @@
|
|||
import os
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from sklearn.metrics import accuracy_score, f1_score
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
from tqdm import tqdm
|
||||
from data import LabelledCollection
|
||||
from util import EarlyStop
|
||||
import quapy as qp
|
||||
|
||||
|
||||
|
||||
class NeuralClassifierTrainer:
|
||||
|
||||
def __init__(self,
|
||||
net, # TextClassifierNet
|
||||
lr=1e-3,
|
||||
weight_decay=0,
|
||||
patience=10,
|
||||
epochs=200,
|
||||
batch_size=64,
|
||||
batch_size_test=512,
|
||||
padding_length=300,
|
||||
device='cpu',
|
||||
checkpointpath='../checkpoint/classifier_net.dat'):
|
||||
|
||||
super().__init__()
|
||||
|
||||
assert isinstance(net, TextClassifierNet), f'net is not an instance of {TextClassifierNet.__name__}'
|
||||
self.net = net
|
||||
self.vocab_size = self.net.vocabulary_size
|
||||
self.trainer_hyperparams={
|
||||
'lr': lr,
|
||||
'weight_decay': weight_decay,
|
||||
'patience': patience,
|
||||
'epochs': epochs,
|
||||
'batch_size': batch_size,
|
||||
'batch_size_test': batch_size_test,
|
||||
'padding_length': padding_length,
|
||||
'device': torch.device(device)
|
||||
}
|
||||
self.learner_hyperparams = self.net.get_params()
|
||||
|
||||
self.checkpointpath = checkpointpath
|
||||
self.classes_ = np.asarray([0, 1])
|
||||
|
||||
print(f'[NeuralNetwork running on {device}]')
|
||||
os.makedirs(Path(checkpointpath).parent, exist_ok=True)
|
||||
|
||||
def reset_net_params(self, vocab_size, n_classes):
|
||||
self.net = self.net.__class__(vocab_size, n_classes, **self.learner_hyperparams)
|
||||
self.net.xavier_uniform()
|
||||
|
||||
def get_params(self):
|
||||
return {**self.net.get_params(), **self.trainer_hyperparams}
|
||||
|
||||
def set_params(self, **params):
|
||||
trainer_hyperparams = self.trainer_hyperparams
|
||||
learner_hyperparams = self.net.get_params()
|
||||
for key, val in params.items():
|
||||
if key in trainer_hyperparams and key in learner_hyperparams:
|
||||
raise ValueError(f'the use of parameter {key} is ambiguous since it can refer to '
|
||||
f'a parameter of the Trainer or of the learner {self.net.__class__.__name__}')
|
||||
elif key not in trainer_hyperparams and key not in learner_hyperparams:
|
||||
raise ValueError(f'parameter {key} is not valid')
|
||||
|
||||
if key in trainer_hyperparams:
|
||||
trainer_hyperparams[key] = val
|
||||
else:
|
||||
learner_hyperparams[key] = val
|
||||
|
||||
self.trainer_hyperparams = trainer_hyperparams
|
||||
self.learner_hyperparams = learner_hyperparams
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return next(self.net.parameters()).device
|
||||
|
||||
def __update_progress_bar(self, pbar):
|
||||
pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={self.current_epoch} '
|
||||
f'tr-loss={self.status["tr"]["loss"]:.5f} '
|
||||
f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% '
|
||||
f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% '
|
||||
f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} '
|
||||
f'val-loss={self.status["va"]["loss"]:.5f} '
|
||||
f'val-acc={100 * self.status["va"]["acc"]:.2f}% '
|
||||
f'macroF1={100 * self.status["va"]["f1"]:.2f}%')
|
||||
|
||||
def _train_epoch(self, data, status, pbar):
|
||||
self.net.train()
|
||||
criterion = torch.nn.CrossEntropyLoss()
|
||||
losses, predictions, true_labels = [], [], []
|
||||
for xi, yi in data:
|
||||
self.optim.zero_grad()
|
||||
logits = self.net.forward(xi)
|
||||
loss = criterion(logits, yi)
|
||||
loss.backward()
|
||||
self.optim.step()
|
||||
losses.append(loss.item())
|
||||
preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1)
|
||||
|
||||
status["loss"] = np.mean(losses)
|
||||
predictions.extend(preds.tolist())
|
||||
true_labels.extend(yi.detach().cpu().numpy().tolist())
|
||||
status["acc"] = accuracy_score(true_labels, predictions)
|
||||
status["f1"] = f1_score(true_labels, predictions, average='macro')
|
||||
self.__update_progress_bar(pbar)
|
||||
|
||||
def _test_epoch(self, data, status, pbar):
|
||||
self.net.eval()
|
||||
criterion = torch.nn.CrossEntropyLoss()
|
||||
losses, predictions, true_labels = [], [], []
|
||||
with torch.no_grad():
|
||||
for xi, yi in data:
|
||||
logits = self.net.forward(xi)
|
||||
loss = criterion(logits, yi)
|
||||
losses.append(loss.item())
|
||||
preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1)
|
||||
predictions.extend(preds.tolist())
|
||||
true_labels.extend(yi.detach().cpu().numpy().tolist())
|
||||
|
||||
status["loss"] = np.mean(losses)
|
||||
status["acc"] = accuracy_score(true_labels, predictions)
|
||||
status["f1"] = f1_score(true_labels, predictions, average='macro')
|
||||
self.__update_progress_bar(pbar)
|
||||
|
||||
def fit(self, instances, labels, val_split=0.3):
|
||||
train, val = LabelledCollection(instances, labels).split_stratified(1-val_split)
|
||||
opt = self.trainer_hyperparams
|
||||
checkpoint = self.checkpointpath
|
||||
self.reset_net_params(self.vocab_size, train.n_classes)
|
||||
|
||||
train_generator = TorchDataset(train.instances, train.labels).asDataloader(
|
||||
opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])
|
||||
valid_generator = TorchDataset(val.instances, val.labels).asDataloader(
|
||||
opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device'])
|
||||
|
||||
self.status = {'tr': {'loss': -1, 'acc': -1, 'f1': -1},
|
||||
'va': {'loss': -1, 'acc': -1, 'f1': -1}}
|
||||
|
||||
self.optim = torch.optim.Adam(self.net.parameters(), lr=opt['lr'], weight_decay=opt['weight_decay'])
|
||||
self.early_stop = EarlyStop(opt['patience'], lower_is_better=False)
|
||||
|
||||
with tqdm(range(1, opt['epochs'] + 1)) as pbar:
|
||||
for self.current_epoch in pbar:
|
||||
self._train_epoch(train_generator, self.status['tr'], pbar)
|
||||
self._test_epoch(valid_generator, self.status['va'], pbar)
|
||||
|
||||
self.early_stop(self.status['va']['f1'], self.current_epoch)
|
||||
if self.early_stop.IMPROVED:
|
||||
torch.save(self.net.state_dict(), checkpoint)
|
||||
elif self.early_stop.STOP:
|
||||
print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} '
|
||||
f'for epoch {self.early_stop.best_epoch}')
|
||||
self.net.load_state_dict(torch.load(checkpoint))
|
||||
break
|
||||
|
||||
print('performing one training pass over the validation set...')
|
||||
self._train_epoch(valid_generator, self.status['tr'], pbar)
|
||||
print('[done]')
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, instances):
|
||||
return np.argmax(self.predict_proba(instances), axis=-1)
|
||||
|
||||
def predict_proba(self, instances):
|
||||
return self.net.predict_proba(instances)
|
||||
|
||||
def predict_probability_positive(self, instances):
|
||||
self.net.eval()
|
||||
opt = self.trainer_hyperparams
|
||||
with torch.no_grad():
|
||||
positive_probs = []
|
||||
for xi in TorchDataset(instances).asDataloader(
|
||||
opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']):
|
||||
positive_probs.append(self.net.predict_proba(xi))
|
||||
return np.concatenate(positive_probs)
|
||||
|
||||
def transform(self, instances):
|
||||
self.net.eval()
|
||||
embeddings = []
|
||||
with torch.no_grad():
|
||||
for xi in TorchDataset(instances).asDataloader(
|
||||
self.trainer_hyperparams['batch_size_test'], shuffle=False, pad_length=self.trainer_hyperparams['padding_length'], device=self.device):
|
||||
embeddings.append(self.net.document_embedding(xi).detach().cpu().numpy())
|
||||
return np.concatenate(embeddings)
|
||||
|
||||
|
||||
class TorchDataset(torch.utils.data.Dataset):
|
||||
|
||||
def __init__(self, instances, labels=None):
|
||||
self.instances = instances
|
||||
self.labels = labels
|
||||
|
||||
def __len__(self):
|
||||
return len(self.instances)
|
||||
|
||||
def __getitem__(self, index):
|
||||
return {'doc': self.instances[index], 'label': self.labels[index] if self.labels is not None else None}
|
||||
|
||||
def asDataloader(self, batch_size, shuffle, pad_length, device):
|
||||
def collate(batch):
|
||||
data = [torch.LongTensor(item['doc'][:pad_length]) for item in batch]
|
||||
data = pad_sequence(data, batch_first=True, padding_value=qp.environ['PAD_INDEX']).to(device)
|
||||
targets = [item['label'] for item in batch]
|
||||
if targets[0] is None:
|
||||
return data
|
||||
else:
|
||||
targets = torch.as_tensor(targets, dtype=torch.long).to(device)
|
||||
return [data, targets]
|
||||
|
||||
torchDataset = TorchDataset(self.instances, self.labels)
|
||||
return torch.utils.data.DataLoader(torchDataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)
|
||||
|
||||
|
||||
class TextClassifierNet(torch.nn.Module, metaclass=ABCMeta):
|
||||
|
||||
@abstractmethod
|
||||
def document_embedding(self, x): ...
|
||||
|
||||
def forward(self, x):
|
||||
doc_embedded = self.document_embedding(x)
|
||||
return self.output(doc_embedded)
|
||||
|
||||
def dimensions(self):
|
||||
return self.dim
|
||||
|
||||
def predict_proba(self, x):
|
||||
logits = self(x)
|
||||
return torch.softmax(logits, dim=-1).detach().cpu().numpy()
|
||||
|
||||
def xavier_uniform(self):
|
||||
for p in self.parameters():
|
||||
if p.dim() > 1 and p.requires_grad:
|
||||
torch.nn.init.xavier_uniform_(p)
|
||||
|
||||
@abstractmethod
|
||||
def get_params(self): ...
|
||||
|
||||
@property
|
||||
def vocabulary_size(self): ...
|
||||
|
||||
|
||||
class LSTMnet(TextClassifierNet):
|
||||
|
||||
def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, lstm_nlayers=1,
|
||||
drop_p=0.5):
|
||||
super().__init__()
|
||||
self.vocabulary_size_ = vocabulary_size
|
||||
self.n_classes = n_classes
|
||||
self.hyperparams={
|
||||
'embedding_size': embedding_size,
|
||||
'hidden_size': hidden_size,
|
||||
'repr_size': repr_size,
|
||||
'lstm_nlayers': lstm_nlayers,
|
||||
'drop_p': drop_p
|
||||
}
|
||||
|
||||
self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size)
|
||||
self.lstm = torch.nn.LSTM(embedding_size, hidden_size, lstm_nlayers, dropout=drop_p, batch_first=True)
|
||||
self.dropout = torch.nn.Dropout(drop_p)
|
||||
|
||||
self.dim = repr_size
|
||||
self.doc_embedder = torch.nn.Linear(hidden_size, self.dim)
|
||||
self.output = torch.nn.Linear(self.dim, n_classes)
|
||||
|
||||
def init_hidden(self, set_size):
|
||||
opt = self.hyperparams
|
||||
var_hidden = torch.zeros(opt['lstm_nlayers'], set_size, opt['hidden_size'])
|
||||
var_cell = torch.zeros(opt['lstm_nlayers'], set_size, opt['hidden_size'])
|
||||
if next(self.lstm.parameters()).is_cuda:
|
||||
var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda()
|
||||
return var_hidden, var_cell
|
||||
|
||||
def document_embedding(self, x):
|
||||
embedded = self.word_embedding(x)
|
||||
rnn_output, rnn_hidden = self.lstm(embedded, self.init_hidden(x.size()[0]))
|
||||
abstracted = self.dropout(F.relu(rnn_hidden[0][-1]))
|
||||
abstracted = self.doc_embedder(abstracted)
|
||||
return abstracted
|
||||
|
||||
def get_params(self):
|
||||
return self.hyperparams
|
||||
|
||||
@property
|
||||
def vocabulary_size(self):
|
||||
return self.vocabulary_size_
|
||||
|
||||
|
||||
class CNNnet(TextClassifierNet):
|
||||
|
||||
def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100,
|
||||
kernel_heights=[3, 5, 7], stride=1, padding=0, drop_p=0.5):
|
||||
super(CNNnet, self).__init__()
|
||||
|
||||
self.vocabulary_size_ = vocabulary_size
|
||||
self.n_classes = n_classes
|
||||
self.hyperparams={
|
||||
'embedding_size': embedding_size,
|
||||
'hidden_size': hidden_size,
|
||||
'repr_size': repr_size,
|
||||
'kernel_heights':kernel_heights,
|
||||
'stride': stride,
|
||||
'drop_p': drop_p
|
||||
}
|
||||
self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size)
|
||||
in_channels = 1
|
||||
self.conv1 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[0], embedding_size), stride, padding)
|
||||
self.conv2 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[1], embedding_size), stride, padding)
|
||||
self.conv3 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[2], embedding_size), stride, padding)
|
||||
self.dropout = nn.Dropout(drop_p)
|
||||
|
||||
self.dim = repr_size
|
||||
self.doc_embedder = torch.nn.Linear(len(kernel_heights) * hidden_size, self.dim)
|
||||
self.output = nn.Linear(self.dim, n_classes)
|
||||
|
||||
def conv_block(self, input, conv_layer):
|
||||
conv_out = conv_layer(input) # conv_out.size() = (batch_size, out_channels, dim, 1)
|
||||
activation = F.relu(conv_out.squeeze(3)) # activation.size() = (batch_size, out_channels, dim1)
|
||||
max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # maxpool_out.size() = (batch_size, out_channels)
|
||||
return max_out
|
||||
|
||||
def document_embedding(self, input):
|
||||
input = self.word_embedding(input)
|
||||
input = input.unsqueeze(1) # input.size() = (batch_size, 1, num_seq, embedding_length)
|
||||
|
||||
max_out1 = self.conv_block(input, self.conv1)
|
||||
max_out2 = self.conv_block(input, self.conv2)
|
||||
max_out3 = self.conv_block(input, self.conv3)
|
||||
|
||||
all_out = torch.cat((max_out1, max_out2, max_out3), 1) # all_out.size() = (batch_size, num_kernels*out_channels)
|
||||
abstracted = self.dropout(F.relu(all_out)) # (batch_size, num_kernels*out_channels)
|
||||
abstracted = self.doc_embedder(abstracted)
|
||||
return abstracted
|
||||
|
||||
def get_params(self):
|
||||
return self.hyperparams
|
||||
|
||||
@property
|
||||
def vocabulary_size(self):
|
||||
return self.vocabulary_size_
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@@ -158,6 +158,16 @@ class Dataset:
        test = LabelledCollection.load(test_path, loader_func)
        return Dataset(training, test)

    @property
    def vocabulary_size(self):
        return len(self.vocabulary)


def isbinary(data):
    if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
        return data.binary
    return False

@@ -5,6 +5,7 @@ from scipy.sparse import spmatrix
from util import parallelize
from .base import LabelledCollection
from tqdm import tqdm
import quapy as qp


def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):

@@ -114,6 +115,7 @@ class IndexTransformer:
        """
        self.vect = CountVectorizer(**kwargs)
        self.unk = -1  # a valid index is assigned after fit
        self.pad = -2  # a valid index is assigned after fit

    def fit(self, X):
        """

@@ -123,12 +125,13 @@ class IndexTransformer:
        self.vect.fit(X)
        self.analyzer = self.vect.build_analyzer()
        self.vocabulary_ = self.vect.vocabulary_
        self.unk = self.add_word('UNK')
        self.unk = self.add_word(qp.environ['UNK_TOKEN'], qp.environ['UNK_INDEX'])
        self.pad = self.add_word(qp.environ['PAD_TOKEN'], qp.environ['PAD_INDEX'])
        return self

    def transform(self, X, n_jobs=-1):
        # given the number of tasks and the number of jobs, generates the slices for the parallel threads
        assert self.unk > 0, 'transform called before fit'
        assert self.unk != -1, 'transform called before fit'
        indexed = parallelize(func=self.index, args=X, n_jobs=n_jobs)
        return np.asarray(indexed)

@@ -142,9 +145,22 @@ class IndexTransformer:
    def vocabulary_size(self):
        return len(self.vocabulary_)

    def add_word(self, word):
    def add_word(self, word, id=None, nogaps=True):
        if word in self.vocabulary_:
            raise ValueError(f'word {word} already in dictionary')
        self.vocabulary_[word] = len(self.vocabulary_)
        if id is None:
            # add the word with the next id
            self.vocabulary_[word] = len(self.vocabulary_)
        else:
            id2word = {id_: word_ for word_, id_ in self.vocabulary_.items()}
            if id in id2word:
                # the requested id is taken: displace its current owner to the next free id
                old_word = id2word[id]
                self.vocabulary_[word] = id
                del self.vocabulary_[old_word]
                self.add_word(old_word)
            elif nogaps:
                if id > self.vocabulary_size()+1:
                    raise ValueError(f'word {word} added with id {id}, while the current vocabulary size '
                                     f'is {self.vocabulary_size()}, and id gaps are not allowed')
        return self.vocabulary_[word]

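The fit method above now reserves the ids declared in qp.environ for the special tokens by calling add_word with an explicit id; when that id is already occupied, its current owner is displaced to the next free position. A small sketch of the intended effect (the import path is how modules reference each other inside the package; the example documents are arbitrary):

    from data.preprocessing import IndexTransformer

    it = IndexTransformer()
    it.fit(['some example documents', 'used only to build a small vocabulary'])
    # after fit, '[UNK]' owns index 0 and '[PAD]' owns index 1; the words that
    # previously held those ids were re-added at the end of the vocabulary
    print(it.vocabulary_['[UNK]'], it.vocabulary_['[PAD]'])  # 0 1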
@@ -1,11 +1,9 @@
from sklearn.metrics import f1_score
import numpy as np
import quapy as qp


SAMPLE_SIZE = None


def f1e(y_true, y_pred):
    return 1. - f1_score(y_true, y_pred, average='macro')

@@ -68,11 +66,12 @@ def smooth(p, eps):


def __check_eps(eps):
    sample_size = qp.environ['SAMPLE_SIZE']
    if eps is None:
        if SAMPLE_SIZE is None:
            raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set')
        if sample_size is None:
            raise ValueError('eps was not defined, and qp.environ["SAMPLE_SIZE"] was not set')
        else:
            eps = 1. / (2. * SAMPLE_SIZE)
            eps = 1. / (2. * sample_size)
    return eps

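A worked example of the fallback above: with qp.environ['SAMPLE_SIZE'] = 500, as test.py sets at the end of this commit, an error function called without an explicit eps ends up smoothing with eps = 1 / (2 * 500) = 0.001.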
@@ -289,6 +289,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
                converged = True

            qs_prev_ = qs
            s += 1

        if not converged:
            raise UserWarning('the method has reached the maximum number of iterations; it might have not converged')

@@ -443,6 +444,10 @@ class OneVsAll(AggregativeQuantifier):
            'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
            'predictions for each document (row) and class (columns)'
        prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
        #prevalences = []
        #for c in self.classes:
        #    prevalences.append(self._delayed_binary_aggregate(c, classif_predictions_bin))
        #prevalences = np.asarray(prevalences)
        return F.normalize_prevalence(prevalences)

    def quantify(self, X, *args):

@@ -477,4 +482,20 @@ class OneVsAll(AggregativeQuantifier):

    def _delayed_binary_fit(self, c, data, **kwargs):
        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
        self.dict_binary_quantifiers[c].fit(bindata, **kwargs)
        self.dict_binary_quantifiers[c].fit(bindata, **kwargs)


def isaggregative(model):
    return isinstance(model, AggregativeQuantifier)


def isprobabilistic(model):
    return isinstance(model, AggregativeProbabilisticQuantifier)


def isbinary(model):
    return isinstance(model, BinaryQuantifier)


from . import neural
QuaNet = neural.QuaNetTrainer
@ -0,0 +1,267 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
import torch
|
||||
from torch.nn import MSELoss
|
||||
from torch.nn.functional import relu
|
||||
from tqdm import tqdm
|
||||
from method.aggregative import *
|
||||
from util import EarlyStop
|
||||
|
||||
|
||||
class QuaNetTrainer(BaseQuantifier):
|
||||
|
||||
def __init__(self,
|
||||
learner,
|
||||
sample_size,
|
||||
n_epochs=500,
|
||||
tr_iter_per_poch=200,
|
||||
va_iter_per_poch=21,
|
||||
lr=1e-3,
|
||||
lstm_hidden_size=64,
|
||||
lstm_nlayers=1,
|
||||
ff_layers=[1024, 512],
|
||||
bidirectional=True,
|
||||
qdrop_p=0.5,
|
||||
patience=10, checkpointpath='../checkpoint/quanet.dat', device='cuda'):
|
||||
assert hasattr(learner, 'transform'), \
|
||||
f'the learner {learner.__class__.__name__} does not seem to be able to produce document embeddings ' \
|
||||
f'since it does not implement the method "transform"'
|
||||
assert hasattr(learner, 'predict_proba'), \
|
||||
f'the learner {learner.__class__.__name__} does not seem to be able to produce posterior probabilities ' \
|
||||
f'since it does not implement the method "predict_proba"'
|
||||
self.learner = learner
|
||||
self.sample_size = sample_size
|
||||
self.n_epochs = n_epochs
|
||||
self.tr_iter = tr_iter_per_poch
|
||||
self.va_iter = va_iter_per_poch
|
||||
self.lr = lr
|
||||
self.quanet_params = {
|
||||
'lstm_hidden_size': lstm_hidden_size,
|
||||
'lstm_nlayers': lstm_nlayers,
|
||||
'ff_layers': ff_layers,
|
||||
'bidirectional': bidirectional,
|
||||
'qdrop_p': qdrop_p
|
||||
}
|
||||
|
||||
self.patience = patience
|
||||
self.checkpointpath = checkpointpath
|
||||
os.makedirs(Path(checkpointpath).parent, exist_ok=True)
|
||||
self.device = torch.device(device)
|
||||
|
||||
self.__check_params_colision(self.quanet_params, self.learner.get_params())
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
||||
"""
|
||||
:param data: the training data on which to train QuaNet. If fit_learner=True, the data will be split in
|
||||
40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If
|
||||
fit_learner=False, the data will be split in 66/34 for training QuaNet and validating it, respectively.
|
||||
:param fit_learner: if true, trains the classifier on a split containing 40% of the data
|
||||
:param args: unused
|
||||
:return: self
|
||||
"""
|
||||
# split: 40% for training classification, 40% for training QuaNet, and 20% for validating QuaNet
|
||||
self.learner, unused_data = \
|
||||
training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, val_split=0.6)
|
||||
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
|
||||
|
||||
# compute the posterior probabilities of the instances
|
||||
valid_posteriors = self.learner.predict_proba(valid_data.instances)
|
||||
train_posteriors = self.learner.predict_proba(train_data.instances)
|
||||
|
||||
# turn instances' indexes into embeddings
|
||||
valid_data.instances = self.learner.transform(valid_data.instances)
|
||||
train_data.instances = self.learner.transform(train_data.instances)
|
||||
|
||||
# estimate the hard and soft stats tpr and fpr of the classifier
|
||||
self.tr_prev = data.prevalence()
|
||||
|
||||
self.quantifiers = [
|
||||
ClassifyAndCount(self.learner).fit(data, fit_learner=False),
|
||||
AdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
|
||||
ProbabilisticClassifyAndCount(self.learner).fit(data, fit_learner=False),
|
||||
ProbabilisticAdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
|
||||
ExpectationMaximizationQuantifier(self.learner).fit(data, fit_learner=False),
|
||||
]
|
||||
|
||||
self.status = {
|
||||
'tr-loss': -1,
|
||||
'va-loss': -1,
|
||||
}
|
||||
|
||||
self.quanet = QuaNetModule(
|
||||
doc_embedding_size=train_data.instances.shape[1],
|
||||
n_classes=data.n_classes,
|
||||
stats_size=len(self.quantifiers) * data.n_classes,
|
||||
**self.quanet_params
|
||||
).to(self.device)
|
||||
|
||||
self.optim = torch.optim.Adam(self.quanet.parameters(), lr=self.lr)
|
||||
early_stop = EarlyStop(self.patience, lower_is_better=True)
|
||||
|
||||
checkpoint = self.checkpointpath
|
||||
|
||||
for epoch_i in range(1, self.n_epochs):
|
||||
self.epoch(train_data, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True)
|
||||
self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False)
|
||||
|
||||
early_stop(self.status['va-loss'], epoch_i)
|
||||
if early_stop.IMPROVED:
|
||||
torch.save(self.quanet.state_dict(), checkpoint)
|
||||
elif early_stop.STOP:
|
||||
print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} '
|
||||
f'for epoch {early_stop.best_epoch}')
|
||||
self.quanet.load_state_dict(torch.load(checkpoint))
|
||||
self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=True)
|
||||
break
|
||||
|
||||
return self
|
||||
|
||||
def get_aggregative_estims(self, posteriors):
|
||||
label_predictions = np.argmax(posteriors, axis=-1)
|
||||
prevs_estim = []
|
||||
for quantifier in self.quantifiers:
|
||||
predictions = posteriors if isprobabilistic(quantifier) else label_predictions
|
||||
prevs_estim.append(quantifier.aggregate(predictions))
|
||||
return np.asarray(prevs_estim).flatten()
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
posteriors = self.learner.predict_proba(instances)
|
||||
embeddings = self.learner.transform(instances)
|
||||
quant_estims = self.get_aggregative_estims(posteriors)
|
||||
self.quanet.eval()
|
||||
with torch.no_grad():
|
||||
prevalence = self.quanet.forward(embeddings, posteriors, quant_estims).cpu().numpy().flatten()
|
||||
return prevalence
|
||||
|
||||
def epoch(self, data: LabelledCollection, posteriors, iterations, epoch, early_stop, train):
|
||||
mse_loss = MSELoss()
|
||||
prevpoints = F.get_nprevpoints_approximation(iterations, self.quanet.n_classes)
|
||||
|
||||
self.quanet.train(mode=train)
|
||||
losses = []
|
||||
pbar = tqdm(data.artificial_sampling_index_generator(self.sample_size, prevpoints))
|
||||
for it, index in enumerate(pbar):
|
||||
sample_data = data.sampling_from_index(index)
|
||||
sample_posteriors = posteriors[index]
|
||||
quant_estims = self.get_aggregative_estims(sample_posteriors)
|
||||
ptrue = torch.as_tensor([sample_data.prevalence()], dtype=torch.float, device=self.device)
|
||||
if train:
|
||||
self.optim.zero_grad()
|
||||
phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims)
|
||||
loss = mse_loss(phat, ptrue)
|
||||
loss.backward()
|
||||
self.optim.step()
|
||||
else:
|
||||
with torch.no_grad():
|
||||
phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims)
|
||||
loss = mse_loss(phat, ptrue)
|
||||
|
||||
losses.append(loss.item())
|
||||
|
||||
self.status['tr-loss' if train else 'va-loss'] = np.mean(losses[-10:])
|
||||
pbar.set_description(f'[QuaNet][{"training" if train else "validating"}] '
|
||||
f'epoch={epoch} [it={it}/{iterations}]\t'
|
||||
f'tr-loss={self.status["tr-loss"]:.5f} '
|
||||
f'val-loss={self.status["va-loss"]:.5f} '
|
||||
f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return {**self.learner.get_params(), **self.quanet_params}
|
||||
|
||||
def set_params(self, **parameters):
|
||||
learner_params={}
|
||||
for key, val in parameters.items():
|
||||
if key in self.quanet_params:
|
||||
self.quanet_params[key]=val
|
||||
else:
|
||||
learner_params[key] = val
|
||||
self.learner.set_params(**learner_params)
|
||||
|
||||
def __check_params_colision(self, quanet_params, learner_params):
|
||||
quanet_keys = set(quanet_params.keys())
|
||||
learner_keys = set(learner_params.keys())
|
||||
intersection = quanet_keys.intersection(learner_keys)
|
||||
if len(intersection) > 0:
|
||||
raise ValueError(f'the use of parameters {intersection} is ambiguous since those can refer to '
|
||||
f'the parameters of QuaNet or the learner {self.learner.__class__.__name__}')
|
||||
|
||||
|
||||
class QuaNetModule(torch.nn.Module):
|
||||
def __init__(self,
|
||||
doc_embedding_size,
|
||||
n_classes,
|
||||
stats_size,
|
||||
lstm_hidden_size=64,
|
||||
lstm_nlayers=1,
|
||||
ff_layers=[1024, 512],
|
||||
bidirectional=True,
|
||||
qdrop_p=0.5,
|
||||
order_by=None):
|
||||
super().__init__()
|
||||
|
||||
self.n_classes = n_classes
|
||||
self.order_by = order_by
|
||||
self.hidden_size = lstm_hidden_size
|
||||
self.nlayers = lstm_nlayers
|
||||
self.bidirectional = bidirectional
|
||||
self.ndirections = 2 if self.bidirectional else 1
|
||||
self.qdrop_p = qdrop_p
|
||||
self.lstm = torch.nn.LSTM(doc_embedding_size + n_classes, # +n_classes stands for the posterior probs. (concatenated)
|
||||
lstm_hidden_size, lstm_nlayers, bidirectional=bidirectional,
|
||||
dropout=qdrop_p, batch_first=True)
|
||||
self.dropout = torch.nn.Dropout(self.qdrop_p)
|
||||
|
||||
lstm_output_size = self.hidden_size * self.ndirections
|
||||
ff_input_size = lstm_output_size + stats_size
|
||||
prev_size = ff_input_size
|
||||
self.ff_layers = torch.nn.ModuleList()
|
||||
for lin_size in ff_layers:
|
||||
self.ff_layers.append(torch.nn.Linear(prev_size, lin_size))
|
||||
prev_size = lin_size
|
||||
self.output = torch.nn.Linear(prev_size, n_classes)
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return torch.device('cuda') if next(self.parameters()).is_cuda else torch.device('cpu')
|
||||
|
||||
def init_hidden(self):
|
||||
directions = 2 if self.bidirectional else 1
|
||||
var_hidden = torch.zeros(self.nlayers * directions, 1, self.hidden_size)
|
||||
var_cell = torch.zeros(self.nlayers * directions, 1, self.hidden_size)
|
||||
if next(self.lstm.parameters()).is_cuda:
|
||||
var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda()
|
||||
return var_hidden, var_cell
|
||||
|
||||
def forward(self, doc_embeddings, doc_posteriors, statistics):
|
||||
device = self.device
|
||||
doc_embeddings = torch.as_tensor(doc_embeddings, dtype=torch.float, device=device)
|
||||
doc_posteriors = torch.as_tensor(doc_posteriors, dtype=torch.float, device=device)
|
||||
statistics = torch.as_tensor(statistics, dtype=torch.float, device=device)
|
||||
|
||||
if self.order_by is not None:
|
||||
order = torch.argsort(doc_posteriors[:, self.order_by])
|
||||
doc_embeddings = doc_embeddings[order]
|
||||
doc_posteriors = doc_posteriors[order]
|
||||
|
||||
embeded_posteriors = torch.cat((doc_embeddings, doc_posteriors), dim=-1)
|
||||
|
||||
# the entire set represents only one instance in quapy contexts, and so the batch_size=1
|
||||
# the shape should be (1, number-of-instances, embedding-size + n_classes)
|
||||
embeded_posteriors = embeded_posteriors.unsqueeze(0)
|
||||
|
||||
_, (rnn_hidden,_) = self.lstm(embeded_posteriors, self.init_hidden())
|
||||
rnn_hidden = rnn_hidden.view(self.nlayers, self.ndirections, -1, self.hidden_size)
|
||||
quant_embedding = rnn_hidden[0].view(-1)
|
||||
quant_embedding = torch.cat((quant_embedding, statistics))
|
||||
|
||||
abstracted = quant_embedding.unsqueeze(0)
|
||||
for linear in self.ff_layers:
|
||||
abstracted = self.dropout(relu(linear(abstracted)))
|
||||
|
||||
logits = self.output(abstracted).view(1, -1)
|
||||
prevalence = torch.softmax(logits, -1)
|
||||
|
||||
return prevalence
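For orientation, QuaNetModule.forward treats the whole sample as a single batch element: it receives one embedding row and one posterior row per document plus a flat vector of aggregative estimates, and returns a (1, n_classes) prevalence distribution. A minimal shape-check sketch (the import path is an assumption; within this commit the module is the method/neural file added above):

    import numpy as np
    from method.neural import QuaNetModule

    n, emb_size, n_classes, n_quantifiers = 100, 100, 2, 5
    quanet = QuaNetModule(doc_embedding_size=emb_size, n_classes=n_classes,
                          stats_size=n_quantifiers * n_classes)
    prevalence = quanet(np.random.rand(n, emb_size),               # document embeddings
                        np.random.dirichlet([1, 1], size=n),       # posterior probabilities
                        np.random.rand(n_quantifiers * n_classes)) # aggregative estimates
    print(prevalence.shape)  # torch.Size([1, 2]); each row sums to 1 after the softmax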
@@ -1,4 +1,4 @@
from quapy import LabelledCollection
from data import LabelledCollection
from .base import BaseQuantifier


@@ -86,7 +86,7 @@ class GridSearchQ:
            self.n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, n_repetitions)
            eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)
            self.sout(f'setting n_prevpoints={self.n_prevpoints} so that the number of \n'
                      f'evaluations is {eval_computations} (<={eval_budget} eval_budget)')
                      f'evaluations ({eval_computations}) does not exceed the evaluation budget ({eval_budget})')
        elif eval_budget is None:
            self.n_prevpoints = n_prevpoints
            eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)

@@ -75,3 +75,26 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args):
        os.makedirs(str(Path(pickle_path).parent), exist_ok=True)
        pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
        return instance


class EarlyStop:

    def __init__(self, patience, lower_is_better=True):
        self.PATIENCE_LIMIT = patience
        self.better = lambda a,b: a<b if lower_is_better else a>b
        self.patience = patience
        self.best_score = None
        self.best_epoch = None
        self.STOP = False
        self.IMPROVED = False

    def __call__(self, watch_score, epoch):
        self.IMPROVED = (self.best_score is None or self.better(watch_score, self.best_score))
        if self.IMPROVED:
            self.best_score = watch_score
            self.best_epoch = epoch
            self.patience = self.PATIENCE_LIMIT
        else:
            self.patience -= 1
            if self.patience <= 0:
                self.STOP = True
test.py (43 changed lines)
@@ -4,37 +4,41 @@ import quapy as qp
import quapy.functional as F
import sys
import numpy as np
from classification.neural import NeuralClassifierTrainer, CNNnet
from quapy.model_selection import GridSearchQ

#qp.datasets.fetch_reviews('hp')
#qp.datasets.fetch_twitter('sst')
qp.environ['SAMPLE_SIZE'] = 500

#sys.exit()
from model_selection import GridSearchQ

SAMPLE_SIZE=500
binary = False
sample_size = qp.environ['SAMPLE_SIZE']
binary = True
svmperf_home = './svm_perf_quantification'

if binary:
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=False, min_df=5)
    qp.data.preprocessing.index(dataset, inplace=True)

else:
    dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
    # dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)

print('dataset loaded')
print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')


# training a quantifier
learner = LogisticRegression(max_iter=1000)
# learner = LogisticRegression(max_iter=1000)
# model = qp.method.aggregative.ClassifyAndCount(learner)
model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
# model = qp.method.aggregative.SVMQ(svmperf_home, C=1)

if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier):
learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes))
print(learner.get_params())
model = qp.method.aggregative.QuaNet(learner, sample_size, device='cpu')

if qp.isbinary(model) and not qp.isbinary(dataset):
    model = qp.method.aggregative.OneVsAll(model)


@@ -42,8 +46,9 @@ if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier):
# ----------------------------------------------------------------------------

print(f'fitting model {model.__class__.__name__}')
train, val = dataset.training.split_stratified(0.6)
model.fit(train, val_split=val)
#train, val = dataset.training.split_stratified(0.6)
#model.fit(train, val_split=val)
model.fit(dataset.training)

# estimating class prevalences
print('quantifying')

@@ -69,9 +74,9 @@ print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence
      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
      f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')

true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints)

qp.error.SAMPLE_SIZE = SAMPLE_SIZE
qp.error.SAMPLE_SIZE = sample_size
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR:
    score = error(true_prev, estim_prev)

@@ -80,12 +85,12 @@ for error in qp.error.QUANTIFICATION_ERROR:

# Model selection and Evaluation according to the artificial sampling protocol
# ----------------------------------------------------------------------------

sys.exit(0)
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}

model_selection = GridSearchQ(model,
                              param_grid=param_grid,
                              sample_size=SAMPLE_SIZE,
                              sample_size=sample_size,
                              eval_budget=max_evaluations//10,
                              error='mae',
                              refit=True,

@@ -98,7 +103,7 @@ print(f'param scores:')
for params, score in model_selection.param_scores_.items():
    print(f'\t{params}: {score:.5f}')

true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints)

print(f'After model selection: Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR: