refactoring

Alejandro Moreo Fernandez 2021-02-04 10:26:05 +01:00
parent c0c116fd66
commit 1cd9ec251a
9 changed files with 417 additions and 330 deletions

View File

@@ -4,3 +4,12 @@ b) some "simplified" ones, which are worse than Ruder's because I removed that layer
I also saw that it improved with l2(phi(x)), so I have left it that way
Now I am going to try adding that additional layer as the last step in phi(x) <-- running
Then I want to try imposing the regularization on all the layers before classification...
The l2 part is a requirement of supervised contrastive learning (SCL)
The problem in applying SCL is understanding what the "crop" means for text, and in particular for AA. It could simply
be equivalent to "fragment", i.e., one type of inductive bias is that a fragment of a text by an author
should have a representation similar to another fragment of the same text. We need to understand how to generate them,
so that the fragments are characterizing (which probably means imposing a certain minimum length).
We also need to understand how to handle the overlaps between fragments (a minimal sketch follows below).
One idea for a title would be: "AA is to Classification as SCL is to SAV", or AA = Classif - SCL + SAV
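
A minimal sketch of the "fragment-as-crop" idea, just to fix terminology; the helper and its
parameters (min_len, stride) are illustrative assumptions, not code that exists in the repo:

def text_fragments(tokens, min_len=100, stride=50):
    """Return overlapping windows of at least min_len tokens; short texts yield themselves."""
    if len(tokens) <= min_len:
        return [tokens]
    starts = range(0, len(tokens) - min_len + 1, stride)
    return [tokens[s:s + min_len] for s in starts]

# under SCL, two fragments of the same text (or of the same author) would form a positive pair,
# while fragments from different authors would act as negatives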

View File

@@ -1,3 +1,31 @@
Recap Feb. 2021:
- Adapt everything to test a classic neural training for AA (i.e., projector+classifier training) vs. applying Supervised
Contrastive Learning (SCL) as a pretraining step for solving SAV, and then training a linear classifier with
the projector network frozen. Reassess the work in terms of SAV and make connections with KTA and SVM (see the
KTA sketch at the end of this recap). Maybe claim that SCL+SVM is the way to go.
- Compare (Attribution):
- S.Ruder systems
- My system (projector+classifier layer) as a reimplementation of S.Ruder's systems
- Projector trained via SCL + Classifier layer trained alone.
- Projector trained via SCL + SVM Classifier.
- Projector trained via KTA + SVM Classifier.
- Compare (SAV):
- My system (projector+binary-classifier layer)
- Projector trained via SCL + Binary Classifier layer trained alone.
- Projector trained via SCL + SVM Classifier.
- Projector trained via KTA + SVM Classifier.
- Other systems (maybe Diff-Vectors, maybe Impostors, maybe distance-based)
- Additional experiments:
- show the kernel matrix
Future:
- Also test on general TC? There are some datasets in torchtext that could simplify things... but that would
blur the idea of SCL-SAV
Code:
- redo dataset in terms of pytorch's data_loader
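
As a reference for the KTA-based variants above, a minimal sketch of (uncentred) kernel-target alignment
between the kernel induced by the projector and the ideal same-author kernel; numpy-only and illustrative,
not the repo's KernelAlignmentLoss:

import numpy as np

def kernel_target_alignment(phi, y):
    # phi: (n, d) projected documents; y: (n,) author labels
    K = phi @ phi.T                                # kernel induced by the projector
    Y = (y[:, None] == y[None, :]).astype(float)   # ideal kernel: 1 iff same author
    return (K * Y).sum() / (np.linalg.norm(K) * np.linalg.norm(Y))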
---------------------
Things to clarify:
about the network:
@@ -23,4 +51,6 @@ maybe I have to review the validation of the sav-loss; since it is batched, it m
SAV: how should the range of k(xi,xj) be interpreted? how to decide the threshold value for returning -1 or +1?
I guess the best thing to do is to learn a simple threshold, one feed-forward 1-to-1
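
A minimal sketch of that 1-to-1 feed-forward threshold; the module name and training details are
illustrative assumptions, not existing code:

import torch
import torch.nn as nn

class SAVThreshold(nn.Module):
    # learns w*k + b on the kernel value k(xi, xj), i.e., a learned decision threshold
    def __init__(self):
        super().__init__()
        self.ff = nn.Linear(1, 1)

    def forward(self, k):          # k: (n,) kernel values for the candidate pairs
        return self.ff(k.view(-1, 1)).view(-1)

# train with nn.BCEWithLogitsLoss() against same-author labels in {0, 1};
# at test time return +1 when the logit is > 0 and -1 otherwise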
plot the kernel matrix as an imshow, with rows/cols arranged by authors, and check whether the KTA that SCL yields
is better than that obtained with traditional training for attribution.
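
A possible sketch of that plot (matplotlib, illustrative; assumes the projections phi and labels y are at hand):

import numpy as np
import matplotlib.pyplot as plt

def plot_kernel_matrix(phi, y, path='../kernel.png'):
    order = np.argsort(y)                  # group rows/cols by author
    K = phi[order] @ phi[order].T
    plt.imshow(K, cmap='viridis')
    plt.colorbar()
    plt.title('k(xi, xj), rows/cols arranged by author')
    plt.savefig(path)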

View File

@@ -1,14 +0,0 @@
#!/bin/bash
conda activate torch
dataset=enron
for authors in 10 50 ; do
for alpha in 1 0.999 0.99 0.9 0.5 ; do
python main.py --dataset $dataset -A $authors -s 0 -o ../results_$dataset.csv --alpha $alpha
done
done
dataset=imdb62
for alpha in 1 0.999 0.99 0.9 0.5 ; do
python main.py --dataset $dataset -A -1 -s 0 -o ../results_$dataset.csv --alpha $alpha
done

View File

@@ -9,17 +9,13 @@ import pickle

class LabelledCorpus:

    def __init__(self, documents, labels):
-       if not isinstance(documents, np.ndarray): documents = np.asarray(documents, dtype=str)
-       if not isinstance(labels, np.ndarray): labels = np.asarray(labels)
+       if not isinstance(documents, np.ndarray):
+           documents = np.asarray(documents, dtype=object)  # dtype=str occupies too much in memory and is not needed
+       if not isinstance(labels, np.ndarray):
+           labels = np.asarray(labels)
        self.data = documents
        self.target = labels

-   def _tolist(self):
-       self.data = self.data.tolist()
-
-   def _toarray(self):
-       self.data = np.asarray(self.data, dtype=str)
-
    def __len__(self):
        return len(self.data)

@@ -41,17 +37,11 @@ class AuthorshipDataset(ABC):
        if pickle_path and os.path.exists(pickle_path):
            print(f'loading dataset image in {pickle_path}')
            dataset = pickle.load(open(pickle_path, 'rb'))
-           dataset.train._toarray()
-           dataset.test._toarray()
        else:
            dataset = loader(**kwargs)
            if pickle_path:
                print(f'dumping dataset in {pickle_path} for faster load')
-               dataset.train._tolist()
-               dataset.test._tolist()
                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
-               dataset.train._toarray()
-               dataset.test._toarray()
        return dataset

    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
@@ -62,13 +52,9 @@ class AuthorshipDataset(ABC):
        np.random.seed(random_state)

        self._check_n_authors(n_authors, n_open_set_authors)
        self.train, self.test, self.target_names = self._fetch_and_split()
        self._assure_docs_by_author(docs_by_author)
        self._reduce_authors_documents(n_authors, docs_by_author, n_open_set_authors)
        self._remove_label_gaps()

        super().__init__()

View File

@@ -18,7 +18,7 @@ class Imdb62(AuthorshipDataset):

    def _fetch_and_split(self):
        file = open(self.data_path,'rt', encoding= "utf-8").readlines()
        splits = [line.split('\t') for line in file]
-       reviews = np.asarray([split[4]+' '+split[5] for split in splits])
+       reviews = [split[4]+' '+split[5] for split in splits]

        authors=[]
        authors_ids = dict()

View File

@@ -19,7 +19,6 @@ class Victorian(AuthorshipDataset):
            csv_reader = csv.reader(file, delimiter = ',')
            next(csv_reader)
            for row in csv_reader:
-               # if row[0]!='text':
                data.append(row[0])
                labels.append(int(row[1]))

View File

@@ -5,7 +5,7 @@ from data.fetch_blogs import Blogs
from data.fetch_imdb62 import Imdb62
from data.fetch_enron_mail import EnronMail
from index import Index
-from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
+from model.classifiers import AuthorshipAttributionClassifier  #, SameAuthorClassifier, FullAuthorClassifier
from data.fetch_victorian import Victorian
from evaluation import evaluation
import torch
@@ -16,11 +16,7 @@ import os
import sys

-def main(opt):
-    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-    print(f'running on {device}')
-
+def load_dataset(opt):
    # dataset load
    if opt.dataset == 'enron':
        loader = EnronMail
@@ -39,13 +35,24 @@ def main(opt)
    pickle_path = None
    if opt.pickle:
        pickle_path = f'{opt.pickle}/{dataset_name}.pickle'
-   dataset = AuthorshipDataset.load(loader,
+   dataset = AuthorshipDataset.load(
+       loader,
        pickle_path=pickle_path,
        data_path=data_path,
        n_authors=opt.authors,
        docs_by_author=opt.documents,
        random_state=opt.seed
    )
+   return dataset_name, dataset
+
+
+def main(opt):
+   device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+   print(f'running on {device}')
+
+   dataset_name, dataset = load_dataset(opt)

    # dataset indexing
    Xtr, ytr = dataset.train.data, dataset.train.target
@@ -61,12 +68,6 @@ def main(opt):
    pad_index = index.add_word('PADTOKEN')
    print(f'vocabulary size={index.vocabulary_size()}')

-   #shuffle1 = np.random.permutation(Xte.shape[0])
-   #shuffle2 = np.random.permutation(Xte.shape[0])
-   #x1, y1 = Xte[shuffle1], yte[shuffle1]
-   #x2, y2 = Xte[shuffle2], yte[shuffle2]
-   #paired_y = y1==y2

    # attribution
    print('Attribution')
    phi = Phi(
@@ -93,6 +94,13 @@ def main(opt):
    else:
        method = opt.name

+   cls.supervised_contrastive_learning(Xtr, ytr,
+       batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
+       log=f'{opt.log}/{method}-{dataset_name}.csv',
+       checkpointpath=opt.checkpoint)
+   sys.exit(0)

    # train
    val_microf1 = cls.fit(Xtr, ytr,
        batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
@@ -154,7 +162,7 @@ if __name__ == '__main__':
    parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=250)
    parser.add_argument('-A', '--authors', help='Number of authors (-1 to select all)', type=int, default=-1)
    parser.add_argument('-D', '--documents', help='Number of documents per author (-1 to select all)', type=int, default=-1)
-   parser.add_argument('-s', '--seed', help='Random seed', type=int, default=-1)
+   parser.add_argument('-s', '--seed', help='Random seed', type=int, default=0)
    parser.add_argument('-o', '--output', help='File where to write test results', default='../results.csv')
    parser.add_argument('-l', '--log', help='Log dir where to output training and validation losses', default='../log')
    parser.add_argument('-P', '--pickle', help='If specified, pickles a copy of the dataset for faster reload. '

View File

@@ -6,8 +6,11 @@ from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
+from losses import SupConLoss1View
from model.early_stop import EarlyStop
from model.layers import FFProjection
+from torch.utils.data import DataLoader


class AuthorshipAttributionClassifier(nn.Module):
@@ -17,33 +20,35 @@ class AuthorshipAttributionClassifier(nn.Module):
        self.ff = FFProjection(input_size=projector.output_size,
                               hidden_sizes=[],
                               output_size=num_authors).to(device)
-       self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device)
+       self.pad_index = pad_index
+       self.pad_length = pad_length
        self.device = device

    def fit(self, X, y, batch_size, epochs, patience=10, lr=0.001, val_prop=0.1, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
        assert 0 <= alpha <= 1, 'wrong range, alpha must be in [0,1]'
        early_stop = EarlyStop(patience)
-       batcher = Batch(batch_size=batch_size, n_epochs=epochs)
        #batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=X.shape[0]//batch_size)
-       batcher_val = Batch(batch_size=batch_size, n_epochs=epochs, shuffle=False)
        criterion = torch.nn.CrossEntropyLoss().to(self.device)
        savcriterion = torch.nn.BCEWithLogitsLoss().to(self.device)
        optim = torch.optim.Adam(self.parameters(), lr=lr)

        X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y)
+       tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
+       val_data = IndexedDataset(Xval, yval, self.pad_length, self.pad_index, self.device)

        with open(log, 'wt') as foo:
            print()
            foo.write('epoch\ttr-loss\tval-loss\tval-acc\tval-Mf1\tval-mf1\n')
            tr_loss, val_loss = -1, -1
-           pbar = tqdm(range(1, batcher.n_epochs+1))
+           pbar = tqdm(range(1, epochs + 1))
            for epoch in pbar:
                # training
                self.train()
                losses, attr_losses, sav_losses = [], [], []
-               for xi, yi in batcher.epoch(X, y):
+               for xi, yi in tr_data.asDataLoader(batch_size, shuffle=True):
                    optim.zero_grad()
-                   xi = self.padder.transform(xi)
                    phi = self.projector(xi)

                    loss_attr = loss_sav = 0
@@ -93,9 +98,11 @@ class AuthorshipAttributionClassifier(nn.Module):

                # validation
                self.eval()
+               with torch.no_grad():
                    predictions, losses = [], []
-                   for xi, yi in batcher_val.epoch(Xval, yval):
-                       xi = self.padder.transform(xi)
+                   # for xi, yi in batcher_val.epoch(Xval, yval):
+                   for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
+                       # xi = self.padder.transform(xi)
                        logits = self.forward(xi)
                        loss = criterion(logits, torch.as_tensor(yi).to(self.device))
                        losses.append(loss.item())

@@ -120,12 +127,78 @@ class AuthorshipAttributionClassifier(nn.Module):
        self.load_state_dict(torch.load(checkpointpath))
        return early_stop.best_score
    def supervised_contrastive_learning(self, X, y, batch_size, epochs, patience=10, lr=0.001, val_prop=0.1, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
        assert 0 <= alpha <= 1, 'wrong range, alpha must be in [0,1]'
        early_stop = EarlyStop(patience)
        criterion = SupConLoss1View().to(self.device)
        optim = torch.optim.Adam(self.parameters(), lr=lr)

        X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y)
        tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
        val_data = IndexedDataset(Xval, yval, self.pad_length, self.pad_index, self.device)

        with open(log, 'wt') as foo:
            print()
            foo.write('epoch\ttr-loss\tval-loss\tval-acc\tval-Mf1\tval-mf1\n')
            tr_loss, val_loss = -1, -1
            pbar = tqdm(range(1, epochs + 1))
            for epoch in pbar:
                # training
                self.train()
                losses = []
                for xi, yi in tr_data.asDataLoader(batch_size, shuffle=True):
                    optim.zero_grad()
                    phi = self.projector(xi)
                    contrastive_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
                    contrastive_loss.backward()
                    optim.step()
                    losses.append(contrastive_loss.item())
                tr_loss = np.mean(losses)
                pbar.set_description(f'training epoch={epoch} '
                                     f'loss={tr_loss:.5f} '
                                     f'val_loss={val_loss:.5f} '
                                     f'patience={early_stop.patience}/{early_stop.patience_limit}')

                # validation
                # self.eval()
                # with torch.no_grad:
                #     predictions, losses = [], []
                #     for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
                #         phi = self.projector(xi)
                #         contrastive_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
                #
                #         logits = self.forward(xi)
                #         loss = criterion(logits, torch.as_tensor(yi).to(self.device))
                #         losses.append(loss.item())
                #         logits = nn.functional.log_softmax(logits, dim=1)
                #         prediction = tensor2numpy(torch.argmax(logits, dim=1).view(-1))
                #         predictions.append(prediction)
                #     val_loss = np.mean(losses)
                #     predictions = np.concatenate(predictions)
                #     acc = accuracy_score(yval, predictions)
                #     macrof1 = f1_score(yval, predictions, average='macro')
                #     microf1 = f1_score(yval, predictions, average='micro')
                #
                #     foo.write(f'{epoch}\t{tr_loss:.8f}\t{val_loss:.8f}\t{acc:.3f}\t{macrof1:.3f}\t{microf1:.3f}\n')
                #     foo.flush()
                #     early_stop(microf1, epoch)
                #     if early_stop.IMPROVED:
                #         torch.save(self.state_dict(), checkpointpath)
                #     elif early_stop.STOP:
                #         break

        print(f'training ended; loading best model parameters in {checkpointpath} for epoch {early_stop.best_epoch}')
        self.load_state_dict(torch.load(checkpointpath))
        return early_stop.best_score
    def predict(self, x, batch_size=100):
        self.eval()
-       batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
+       te_data = IndexedDataset(x, None, self.pad_length, self.pad_index, self.device)
        predictions = []
-       for xi in tqdm(batcher.epoch(x), desc='test'):
-           xi = self.padder.transform(xi)
+       with torch.no_grad():
+           for xi in te_data.asDataLoader(batch_size, shuffle=False):
                logits = self.forward(xi)
                logits = nn.functional.log_softmax(logits, dim=1)
                prediction = tensor2numpy(torch.argmax(logits, dim=1).view(-1))
@@ -168,134 +241,133 @@ def choose_sav_pairs(y, npairs):

# class SameAuthorClassifier(nn.Module):
#     def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
#         super(SameAuthorClassifier, self).__init__()
#         self.projector = projector.to(device)
#         self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device)
#         self.device = device
#
#     def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100):
#         self.train()
#         batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch)
#         optim = torch.optim.Adam(self.parameters(), lr=lr)
#
#         pbar = tqdm(range(batcher.n_epochs))
#         for epoch in pbar:
#             losses = []
#             for xi, yi in batcher.epoch(X, y):
#                 optim.zero_grad()
#                 xi = self.padder.transform(xi)
#                 phi = self.projector(xi)
#                 #normalize phi to have norm 1? maybe better as the last step of projector
#                 kernel = torch.matmul(phi, phi.T)
#                 ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
#                 loss = KernelAlignmentLoss(kernel, ideal_kernel)
#                 loss.backward()
#                 #clip_gradient(model)
#                 optim.step()
#                 losses.append(loss.item())
#             pbar.set_description(f'training epoch={epoch} loss={np.mean(losses):.5f}')
#
#     def predict(self, x, z, batch_size=100):
#         self.eval()
#         batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
#         predictions = []
#         for xi, zi in tqdm(batcher.epoch(x, z), desc='test'):
#             xi = self.padder.transform(xi)
#             zi = self.padder.transform(zi)
#             inners = self.forward(xi, zi)
#             prediction = tensor2numpy(inners) > 0.5  # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}?
#             predictions.append(prediction)
#         return np.concatenate(predictions)
#
#     def forward(self, x, z):
#         assert x.shape == z.shape, 'shape mismatch between matrices x and z'
#         phi_x = self.projector(x)
#         phi_z = self.projector(z)
#         rows, cols = phi_x.shape
#         pairwise_inners = torch.bmm(phi_x.view(rows, 1, cols), phi_z.view(rows, cols, 1)).squeeze()
#         return pairwise_inners


# class FullAuthorClassifier(nn.Module):
#     def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
#         super(FullAuthorClassifier, self).__init__()
#         self.projector = projector.to(device)
#         self.ff = FFProjection(input_size=projector.space_dimensions(),
#                                hidden_sizes=[1024],
#                                output_size=num_authors).to(device)
#         self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device)
#         self.device = device
#
#     def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100):
#         self.train()
#         batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch)
#         criterion = torch.nn.CrossEntropyLoss().to(self.device)
#         optim = torch.optim.Adam(self.parameters(), lr=lr)
#         alpha = 0.5
#
#         pbar = tqdm(range(batcher.n_epochs))
#         for epoch in pbar:
#             losses, sav_losses, attr_losses = [], [], []
#             for xi, yi in batcher.epoch(X, y):
#                 optim.zero_grad()
#                 xi = self.padder.transform(xi)
#                 phi = self.projector(xi)
#                 #normalize phi to have norm 1? maybe better as the last step of projector
#
#                 #sav-loss
#                 kernel = torch.matmul(phi, phi.T)
#                 ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
#                 sav_loss = KernelAlignmentLoss(kernel, ideal_kernel)
#                 sav_losses.append(sav_loss.item())
#
#                 #attr-loss
#                 logits = self.ff(phi)
#                 attr_loss = criterion(logits, torch.as_tensor(yi).to(self.device))
#                 attr_losses.append(attr_loss.item())
#
#                 #loss
#                 loss = (alpha)*sav_loss + (1-alpha)*attr_loss
#                 losses.append(loss.item())
#
#                 loss.backward()
#                 #clip_gradient(model)
#                 optim.step()
#             pbar.set_description(
#                 f'training epoch={epoch} '
#                 f'sav-loss={np.mean(sav_losses):.5f} '
#                 f'attr-loss={np.mean(attr_losses):.5f} '
#                 f'loss={np.mean(losses):.5f}'
#             )
#
#     def predict_sav(self, x, z, batch_size=100):
#         self.eval()
#         batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
#         predictions = []
#         for xi, zi in tqdm(batcher.epoch(x, z), desc='test'):
#             xi = self.padder.transform(xi)
#             zi = self.padder.transform(zi)
#             phi_xi = self.projector(xi)
#             phi_zi = self.projector(zi)
#             rows, cols = phi_xi.shape
#             pairwise_inners = torch.bmm(phi_xi.view(rows, 1, cols), phi_zi.view(rows, cols, 1)).squeeze()
#             prediction = tensor2numpy(pairwise_inners) > 0.5  # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}?
#             predictions.append(prediction)
#         return np.concatenate(predictions)
#
#     def predict_labels(self, x, batch_size=100):
#         self.eval()
#         batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
#         predictions = []
#         for xi in tqdm(batcher.epoch(x), desc='test'):
#             xi = self.padder.transform(xi)
#             phi = self.projector(xi)
#             logits = self.ff(phi)
#             prediction = tensor2numpy(torch.argmax(logits, dim=1).view(-1))
#             predictions.append(prediction)
#         return np.concatenate(predictions)
#def KernelAlignmentLoss(K, Y):
#    n_el = K.shape[0]*K.shape[1]
@@ -304,92 +376,89 @@ class FullAuthorClassifier(nn.Module):
#    return loss


class Batch:
    def __init__(self, batch_size, n_epochs=1, shuffle=True):
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.shuffle = shuffle
        self.current_epoch = 0

    def epoch(self, *args):
        lengths = list(map(len, args))
        assert max(lengths) == min(lengths), 'inconsistent sizes in args'
        n_batches = math.ceil(lengths[0] / self.batch_size)
        offset = 0
        if self.shuffle:
            index = np.random.permutation(len(args[0]))
            args = [arg[index] for arg in args]
        for b in range(n_batches):
            batch_idx = slice(offset, offset+self.batch_size)
            batch = [arg[batch_idx] for arg in args]
            yield batch if len(batch) > 1 else batch[0]
            offset += self.batch_size
        self.current_epoch += 1


# class TwoClassBatch:
#     """
#     given a X and y (multi-label) produces batches of elements of X, y for two classes (e.g., c1, c2)
#     of equal size, i.e., the batch is [(x1,c1), ..., (xn,c1), (xn+1,c2), ..., (x2n,c2)]
#     """
#     def __init__(self, batch_size, n_epochs, steps_per_epoch):
#         self.batch_size = batch_size
#         self.n_epochs = n_epochs
#         self.steps_per_epoch = steps_per_epoch
#         self.current_epoch = 0
#         if self.batch_size % 2 != 0:
#             raise ValueError('warning, batch size is not even')
#
#     def epoch(self, X, y):
#         n_el = len(y)
#         assert X.shape[0] == n_el, 'inconsistent sizes in X, y'
#         classes = np.unique(y)
#         groups = {ci: X[y==ci] for ci in classes}
#         class_prevalences = [len(groups[ci])/n_el for ci in classes]
#         n_choices = self.batch_size // 2
#
#         for b in range(self.steps_per_epoch):
#             class1, class2 = np.random.choice(classes, p=class_prevalences, size=2, replace=False)
#             X1 = np.random.choice(groups[class1], size=n_choices)
#             X2 = np.random.choice(groups[class2], size=n_choices)
#             X_batch = np.concatenate([X1,X2])
#             y_batch = np.repeat([class1, class2], repeats=[n_choices,n_choices])
#             yield X_batch, y_batch
#         self.current_epoch += 1


class Padding:
    def __init__(self, pad_index, max_length, dynamic=True, pad_at_end=True, device='cpu'):
        """
        :param pad_index: the index representing the PAD token
        :param max_length: the length that defines the padding
        :param dynamic: if True (default) pads at min(max_length, max_local_length) where max_local_length is the
            length of the longest example
        :param pad_at_end: if True, the pad tokens are added at the end of the lists, if otherwise they are added
            at the beginning
        """
        self.pad = pad_index
        self.max_length = max_length
        self.dynamic = dynamic
        self.pad_at_end = pad_at_end
        self.device = device

    def transform(self, X):
        """
        :param X: a list of lists of indexes (integers)
        :return: a ndarray of shape (n,m) where n is the number of elements in X and m is the pad length (the maximum
            in elements of X if dynamic, or self.max_length if otherwise)
        """
        X = [x[:self.max_length] for x in X]
        lengths = list(map(len, X))
        pad_length = min(max(lengths), self.max_length) if self.dynamic else self.max_length
        if self.pad_at_end:
            padded = [x + [self.pad] * (pad_length - x_len) for x, x_len in zip(X, lengths)]
        else:
            padded = [[self.pad] * (pad_length - x_len) + x for x, x_len in zip(X, lengths)]
        return torch.from_numpy(np.asarray(padded, dtype=int)).to(self.device)


def tensor2numpy(t):
    return t.to('cpu').detach().numpy()


# ------------
class IndexedDataset(torch.utils.data.Dataset):
    def __init__(self, X, y, MAX_LENGTH, padindex, device, pad_at_end=False):
        self.X = X
        self.y = y
        self.MAX_LENGTH = MAX_LENGTH
        self.padindex = padindex
        self.device = device
        self.pad_at_end = pad_at_end

    def __len__(self):
        return len(self.X)

    @property
    def islabelled(self):
        return self.y is not None

    def __getitem__(self, index):
        if self.islabelled:
            return self.X[index], self.y[index]
        else:
            return self.X[index]

    def collate_pad_fn(self, batch):
        """
        :param batch: a list of lists of indexes (integers)
        :return: a torch.tensor of shape (n,m) where n is the number of elements in X_batch and m is the pad length
            (the maximum in elements of X_batch)
        """
        if self.islabelled:
            X, y = list(zip(*batch))
        else:
            X = batch
        lengths = list(map(len, X))
        pad_length = min(max(lengths), self.MAX_LENGTH)
        X = [x[:pad_length] for x in X]
        if self.pad_at_end:
            padded = [x + [self.padindex] * (pad_length - x_len) for x, x_len in zip(X, lengths)]
        else:
            padded = [[self.padindex] * (pad_length - x_len) + x for x, x_len in zip(X, lengths)]
        X = torch.from_numpy(np.asarray(padded, dtype=int)).to(self.device)
        if self.islabelled:
            y = torch.from_numpy(np.asarray(y)).to(self.device)
            return X, y
        else:
            return X

    def asDataLoader(self, batch_size, shuffle):
        return torch.utils.data.DataLoader(self, batch_size=batch_size, shuffle=shuffle, collate_fn=self.collate_pad_fn)
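
For reference, a minimal usage sketch of the new IndexedDataset path (the values and the model name are illustrative):

# data = IndexedDataset(Xtr, ytr, MAX_LENGTH=500, padindex=pad_index, device='cuda')
# for xi, yi in data.asDataLoader(batch_size=64, shuffle=True):
#     logits = model(xi)   # batches arrive already padded and moved to the device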

View File

@@ -70,36 +70,36 @@ class FFProjection(nn.Module):

# deprecated
# class RNNProjection(nn.Module):
#     def __init__(self, vocab_size, hidden_size, output_size, device='cpu'):
#         super(RNNProjection, self).__init__()
#         self.output_size = output_size
#         self.hidden_size = hidden_size
#         self.vocab_size = vocab_size
#         self.num_layers=1
#         self.num_directions=1
#         self.device = device
#
#         self.embedding = nn.Embedding(vocab_size, hidden_size).to(device)
#         self.rnn = nn.GRU(
#             input_size=hidden_size,
#             hidden_size=hidden_size,
#             num_layers=self.num_layers,
#             bidirectional=(self.num_directions == 2),
#             batch_first=True
#         ).to(device)
#         self.projection = nn.Linear(self.num_layers * self.num_directions * self.hidden_size, output_size).to(device)
#
#     def init_hidden(self, batch_size):
#         return torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(self.device)
#
#     def forward(self, x):
#         batch_size = x.shape[0]
#         x = self.embedding(x)
#         output, hn = self.rnn(x, self.init_hidden(batch_size))
#         hn = hn.view(self.num_layers, self.num_directions, batch_size, self.hidden_size)
#         hn = hn.permute(2, 0, 1, 3).reshape(batch_size, -1)
#         return self.projection(hn)
#
#     def space_dimensions(self):
#         return self.output_size