diff --git a/src/data/fetch_imdb62.py b/src/data/fetch_imdb62.py
new file mode 100644
index 0000000..7e747c1
--- /dev/null
+++ b/src/data/fetch_imdb62.py
@@ -0,0 +1,45 @@
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+from data.AuthorshipDataset import AuthorshipDataset, LabelledCorpus
+
+
+class Imdb62(AuthorshipDataset):
+
+    TEST_SIZE = 0.30
+    NUM_AUTHORS = 62
+    NUM_DOCS_BY_AUTHOR = int(1000 - (1000 * TEST_SIZE))
+
+    def __init__(self, data_path='../data/imdb62/imdb62.txt', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
+        super().__init__(data_path, n_authors, docs_by_author, n_open_set_authors, random_state)
+
+
+    def _fetch_and_split(self):
+        with open(self.data_path, 'rt', encoding='utf-8') as fin:
+            lines = fin.readlines()
+        splits = [line.split('\t') for line in lines]
+        reviews = np.asarray([split[4] + ' ' + split[5] for split in splits])
+
+        authors = []
+        authors_ids = dict()
+        for s in splits:
+            author_key = s[1]
+            if author_key not in authors_ids:
+                authors_ids[author_key] = len(authors_ids)
+            author_id = authors_ids[author_key]
+            authors.append(author_id)
+        authors = np.array(authors)
+
+        authors_names = sorted(np.unique(authors))
+
+        train_data, test_data, train_labels, test_labels = \
+            train_test_split(reviews, authors, test_size=Imdb62.TEST_SIZE, stratify=authors)
+
+        return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), authors_names
+
+
+    def _check_n_authors(self, n_authors, n_open_set_authors):
+        if n_authors == -1: return
+        elif n_authors + n_open_set_authors > Imdb62.NUM_AUTHORS:
+            raise ValueError(f'Too many authors requested. Max is {Imdb62.NUM_AUTHORS}')
+
diff --git a/src/main.py b/src/main.py
index dd1f443..4f1adc2 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,47 +1,14 @@
 import numpy as np
+
+from data.fetch_imdb62 import Imdb62
 from index import Index
-from model.model import RNNProjection, AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
+from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
 from data.fetch_victorian import Victorian
 from evaluation import eval
 import torch
+from model.transformations import CNNProjection
+import sys
 
-from model.cnn import CNNProjection
-
-if torch.cuda.is_available():
-    device = torch.device('cuda')
-else:
-    device = torch.device('cpu')
-print(f'running on {device}')
-
-dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
-Xtr, ytr = dataset.train.data, dataset.train.target
-Xte, yte = dataset.test.data, dataset.test.target
-A = np.unique(ytr)
-
-#X = X[:100]
-#y = y[:100]
-#Xte = Xte[:100]
-#yte = yte[:100]
-
-#X = [
-#    "esto, es una primera prueba",
-#    "esto: es una segunda prueba un poco más larga",
-#    "vamos ahi con la tercera! 
a ver", -# "una cuarta prueba con otro trozo de texto" -#] -#y = [0,0,1,1] - - -index = Index(analyzer='char') -Xtr = index.fit_transform(Xtr) -Xte = index.transform(Xte) -pad_index = index.add_word('PADTOKEN') - -shuffle1 = np.random.permutation(Xte.shape[0]) -shuffle2 = np.random.permutation(Xte.shape[0]) -x1, y1 = Xte[shuffle1], yte[shuffle1] -x2, y2 = Xte[shuffle2], yte[shuffle2] -paired_y = y1==y2 hidden_size=128 channels_out=128 @@ -50,13 +17,38 @@ kernel_sizes=[3,5,7,11,13] pad_length=1000 batch_size=64 n_epochs=256 -""" -hidden_size=16 -output_size=32 -pad_length=100 -batch_size=10 -n_epochs=2 -""" +bigrams=True + +#hidden_size=16 +#output_size=32 +#pad_length=100 +#batch_size=10 +#n_epochs=20 + +if torch.cuda.is_available(): + device = torch.device('cuda') +else: + device = torch.device('cpu') +print(f'running on {device}') + +#dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25) +dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=-1, docs_by_author=-1) +Xtr, ytr = dataset.train.data, dataset.train.target +Xte, yte = dataset.test.data, dataset.test.target +A = np.unique(ytr) +print(f'num authors={len(A)}') + +index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1)) +Xtr = index.fit_transform(Xtr) +Xte = index.transform(Xte) +pad_index = index.add_word('PADTOKEN') +print(f'vocabulary size={index.vocabulary_size()}') + +#shuffle1 = np.random.permutation(Xte.shape[0]) +#shuffle2 = np.random.permutation(Xte.shape[0]) +#x1, y1 = Xte[shuffle1], yte[shuffle1] +#x2, y2 = Xte[shuffle2], yte[shuffle2] +#paired_y = y1==y2 # attribution print('Attribution') diff --git a/src/model.py b/src/model.py deleted file mode 100644 index 9f37389..0000000 --- a/src/model.py +++ /dev/null @@ -1,311 +0,0 @@ -import numpy as np -import torch -import torch.nn as nn -from tqdm import tqdm -import math - - -def tensor2numpy(t,device): - if device=='cpu': - return t.detach().numpy() - else: - return t.cpu().detach().numpy() - - -class AuthorshipAttributionClassifier(nn.Module): - def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'): - super(AuthorshipAttributionClassifier, self).__init__() - self.projector = projector.to(device) - self.label = nn.Linear(self.projector.space_dimensions(), num_authors).to(device) - self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False) - self.device=device - - def fit(self, X, y, batch_size, epochs, lr=0.001): - self.train() - batcher = Batch(batch_size=batch_size, n_epochs=epochs) - criterion = torch.nn.CrossEntropyLoss().to(self.device) - optim = torch.optim.Adam(self.parameters(), lr=lr) - - pbar = tqdm(range(batcher.n_epochs)) - for epoch in pbar: - losses = [] - for xi, yi in batcher.epoch(X, y): - optim.zero_grad() - xi = self.padder.transform(xi) - logits = self.forward(xi) - loss = criterion(logits, torch.as_tensor(yi).to(self.device)) - loss.backward() - #clip_gradient(model) - optim.step() - losses.append(loss.item()) - pbar.set_description(f'training epoch={epoch} loss={np.mean(losses):.5f}') - - def predict(self, x, batch_size=100): - self.eval() - batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False) - predictions = [] - for xi in tqdm(batcher.epoch(x), desc='test'): - xi = self.padder.transform(xi) - logits = self.forward(xi) - prediction = tensor2numpy(torch.argmax(logits, dim=1).view(-1), self.device) - predictions.append(prediction) - return np.concatenate(predictions) - - 
def forward(self, x): - phi = self.projector(x) - return self.label(phi) - - -class SameAuthorClassifier(nn.Module): - def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'): - super(SameAuthorClassifier, self).__init__() - self.projector = projector.to(device) - self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False) - self.device = device - - def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100): - self.train() - batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch) - optim = torch.optim.Adam(self.parameters(), lr=lr) - - pbar = tqdm(range(batcher.n_epochs)) - for epoch in pbar: - losses = [] - for xi, yi in batcher.epoch(X, y): - optim.zero_grad() - xi = self.padder.transform(xi) - phi = self.projector(xi) - #normalize phi to have norm 1? maybe better as the last step of projector - kernel = torch.matmul(phi, phi.T) - ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device) - loss = KernelAlignmentLoss(kernel, ideal_kernel) - loss.backward() - #clip_gradient(model) - optim.step() - losses.append(loss.item()) - pbar.set_description(f'training epoch={epoch} loss={np.mean(losses):.5f}') - - def predict(self, x, z, batch_size=100): - self.eval() - batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False) - predictions = [] - for xi, zi in tqdm(batcher.epoch(x, z), desc='test'): - xi = self.padder.transform(xi) - zi = self.padder.transform(zi) - inners = self.forward(xi, zi) - prediction = tensor2numpy(inners, device=self.device) > 0.5 # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}? - predictions.append(prediction) - return np.concatenate(predictions) - - def forward(self, x, z): - assert x.shape == z.shape, 'shape mismatch between matrices x and z' - phi_x = self.projector(x) - phi_z = self.projector(z) - rows, cols = phi_x.shape - pairwise_inners = torch.bmm(phi_x.view(rows, 1, cols), phi_z.view(rows, cols, 1)).squeeze() - return pairwise_inners - - -class FullAuthorClassifier(nn.Module): - def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'): - super(FullAuthorClassifier, self).__init__() - self.projector = projector.to(device) - self.label = nn.Linear(self.projector.space_dimensions(), num_authors).to(device) - self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False) - self.device = device - - def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100): - self.train() - batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch) - criterion = torch.nn.CrossEntropyLoss().to(self.device) - optim = torch.optim.Adam(self.parameters(), lr=lr) - alpha = 0.5 - - pbar = tqdm(range(batcher.n_epochs)) - for epoch in pbar: - losses, sav_losses, attr_losses = [], [], [] - for xi, yi in batcher.epoch(X, y): - optim.zero_grad() - xi = self.padder.transform(xi) - phi = self.projector(xi) - #normalize phi to have norm 1? 
maybe better as the last step of projector - - #sav-loss - kernel = torch.matmul(phi, phi.T) - ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device) - sav_loss = KernelAlignmentLoss(kernel, ideal_kernel) - sav_losses.append(sav_loss.item()) - - #attr-loss - logits = self.label(phi) - attr_loss = criterion(logits, torch.as_tensor(yi).to(self.device)) - attr_losses.append(attr_loss.item()) - - #loss - loss = (alpha)*sav_loss + (1-alpha)*attr_loss - losses.append(loss.item()) - - loss.backward() - #clip_gradient(model) - optim.step() - pbar.set_description( - f'training epoch={epoch} ' - f'sav-loss={np.mean(sav_losses):.5f} ' - f'attr-loss={np.mean(attr_losses):.5f} ' - f'loss={np.mean(losses):.5f}' - ) - - def predict_sav(self, x, z, batch_size=100): - self.eval() - batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False) - predictions = [] - for xi, zi in tqdm(batcher.epoch(x, z), desc='test'): - xi = self.padder.transform(xi) - zi = self.padder.transform(zi) - phi_xi = self.projector(xi) - phi_zi = self.projector(zi) - rows, cols = phi_xi.shape - pairwise_inners = torch.bmm(phi_xi.view(rows, 1, cols), phi_zi.view(rows, cols, 1)).squeeze() - prediction = tensor2numpy(pairwise_inners, device=self.device) > 0.5 # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}? - predictions.append(prediction) - return np.concatenate(predictions) - - def predict_labels(self, x, batch_size=100): - self.eval() - batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False) - predictions = [] - for xi in tqdm(batcher.epoch(x), desc='test'): - xi = self.padder.transform(xi) - phi = self.projector(xi) - logits = self.label(phi) - prediction =tensor2numpy( torch.argmax(logits, dim=1).view(-1), device=self.device) - predictions.append(prediction) - return np.concatenate(predictions) - - -def KernelAlignmentLoss(K, Y): - n_el = K.shape[0]*K.shape[1] - loss = torch.norm(K - Y, p='fro') # in Nello's paper this is different - loss = loss / n_el # this is in order to factor out the accumulation which is only due to the size - return loss - - -class RNNProjection(nn.Module): - def __init__(self, vocab_size, hidden_size, output_size, device='cpu'): - super(RNNProjection, self).__init__() - self.output_size = output_size - self.hidden_size = hidden_size - self.vocab_size = vocab_size - self.num_layers=1 - self.num_directions=1 - self.device=device - - self.embedding = nn.Embedding(vocab_size, hidden_size).to(device) - self.rnn = nn.GRU( - input_size=hidden_size, - hidden_size=hidden_size, - num_layers=self.num_layers, - bidirectional=(self.num_directions == 2), - batch_first=True - ).to(device) - self.projection = nn.Linear(self.num_layers * self.num_directions * self.hidden_size, output_size).to(device) - - def init_hidden(self, batch_size): - return torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(self.device) - - def forward(self, input): - x = torch.as_tensor(input).to(self.device) - batch_size = x.shape[0] - x = self.embedding(x) - output, hn = self.rnn(x, self.init_hidden(batch_size)) - hn = hn.view(self.num_layers, self.num_directions, batch_size, self.hidden_size) - hn = hn.permute(2, 0, 1, 3).reshape(batch_size, -1) - return self.projection(hn) - - def space_dimensions(self): - return self.output_size - - -class Batch: - def __init__(self, batch_size, n_epochs, shuffle=True): - self.batch_size = batch_size - self.n_epochs = n_epochs - self.shuffle = shuffle - self.current_epoch = 0 - - def epoch(self, *args): 
- lengths = list(map(len, args)) - assert max(lengths) == min(lengths), 'inconsistent sizes in args' - n_batches = math.ceil(lengths[0] / self.batch_size) - offset = 0 - if self.shuffle: - index = np.random.permutation(len(args[0])) - args = [arg[index] for arg in args] - for b in range(n_batches): - batch_idx = slice(offset, offset+self.batch_size) - batch = [arg[batch_idx] for arg in args] - yield batch if len(batch) > 1 else batch[0] - offset += self.batch_size - self.current_epoch += 1 - - -class TwoClassBatch: - """ - given a X and y (multi-label) produces batches of elements of X, y for two classes (e.g., c1, c2) - of equal size, i.e., the batch is [(x1,c1), ..., (xn,c1), (xn+1,c2), ..., (x2n,c2)] - """ - def __init__(self, batch_size, n_epochs, steps_per_epoch): - self.batch_size = batch_size - self.n_epochs = n_epochs - self.steps_per_epoch = steps_per_epoch - self.current_epoch = 0 - if self.batch_size % 2 != 0: - raise ValueError('warning, batch size is not even') - - def epoch(self, X, y): - n_el = len(y) - assert X.shape[0] == n_el, 'inconsistent sizes in X, y' - classes = np.unique(y) - groups = {ci: X[y==ci] for ci in classes} - class_prevalences = [len(groups[ci])/n_el for ci in classes] - n_choices = self.batch_size // 2 - - for b in range(self.steps_per_epoch): - class1, class2 = np.random.choice(classes, p=class_prevalences, size=2, replace=False) - X1 = np.random.choice(groups[class1], size=n_choices) - X2 = np.random.choice(groups[class2], size=n_choices) - X_batch = np.concatenate([X1,X2]) - y_batch = np.repeat([class1, class2], repeats=[n_choices,n_choices]) - yield X_batch, y_batch - self.current_epoch += 1 - - -class Padding: - def __init__(self, pad_index, max_length, dynamic=True, pad_at_end=True): - """ - :param pad_index: the index representing the PAD token - :param max_length: the length that defines the padding - :param dynamic: if True (default) pads at min(max_length, max_local_length) where max_local_length is the - length of the longest example - :param pad_at_end: if True, the pad tokens are added at the end of the lists, if otherwise they are added - at the beginning - """ - self.pad = pad_index - self.max_length = max_length - self.dynamic = dynamic - self.pad_at_end = pad_at_end - - def transform(self, X): - """ - :param X: a list of lists of indexes (integers) - :return: a ndarray of shape (n,m) where n is the number of elements in X and m is the pad length (the maximum - in elements of X if dynamic, or self.max_length if otherwise) - """ - X = [x[:self.max_length] for x in X] - lengths = list(map(len, X)) - pad_length = min(max(lengths), self.max_length) if self.dynamic else self.max_length - if self.pad_at_end: - padded = [x + [self.pad] * (pad_length - x_len) for x, x_len in zip(X, lengths)] - else: - padded = [[self.pad] * (pad_length - x_len) + x for x, x_len in zip(X, lengths)] - return np.asarray(padded, dtype=int) diff --git a/src/model/model.py b/src/model/classifiers.py similarity index 80% rename from src/model/model.py rename to src/model/classifiers.py index 997bb48..572ba42 100644 --- a/src/model/model.py +++ b/src/model/classifiers.py @@ -3,12 +3,7 @@ import torch import torch.nn as nn from tqdm import tqdm import math - - -def tensor2numpy(t, device): - if device == 'cpu': - t = t.cpu() - return t.detach().numpy() +from sklearn.model_selection import train_test_split class AuthorshipAttributionClassifier(nn.Module): @@ -18,28 +13,48 @@ class AuthorshipAttributionClassifier(nn.Module): self.ff = 
FFProjection(input_size=projector.space_dimensions(), hidden_sizes=[1024], output_size=num_authors).to(device) - self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False) + self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device) self.device = device - def fit(self, X, y, batch_size, epochs, lr=0.001): - self.train() + def fit(self, X, y, batch_size, epochs, lr=0.001, val_prop=0.2, log='../log/tmp.csv'): batcher = Batch(batch_size=batch_size, n_epochs=epochs) criterion = torch.nn.CrossEntropyLoss().to(self.device) optim = torch.optim.Adam(self.parameters(), lr=lr) - pbar = tqdm(range(batcher.n_epochs)) - for epoch in pbar: - losses = [] - for xi, yi in batcher.epoch(X, y): - optim.zero_grad() - xi = self.padder.transform(xi) - logits = self.forward(torch.as_tensor(xi).to(self.device)) - loss = criterion(logits, torch.as_tensor(yi).to(self.device)) - loss.backward() - #clip_gradient(model) - optim.step() - losses.append(loss.item()) - pbar.set_description(f'training epoch={epoch} loss={np.mean(losses):.5f}') + X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y) + + with open(log, 'wt') as foo: + foo.write('epoch\ttr-loss\tval-loss\n') + tr_loss, val_loss = -1, -1 + pbar = tqdm(range(1,batcher.n_epochs+1)) + for epoch in pbar: + # training + self.train() + losses = [] + for xi, yi in batcher.epoch(X, y): + optim.zero_grad() + loss = self._compute_loss(xi, yi, criterion) + loss.backward() + #clip_gradient(model) + optim.step() + losses.append(loss.item()) + tr_loss = np.mean(losses) + pbar.set_description(f'training epoch={epoch} loss={tr_loss:.5f} val_loss={val_loss:.5f}') + + # validation + self.eval() + losses = [] + for xi, yi in batcher.epoch(Xval, yval): + loss = self._compute_loss(xi, yi, criterion) + losses.append(loss.item()) + val_loss = np.mean(losses) + + foo.write(f'{epoch}\t{tr_loss:.8f}\t{val_loss:.8f}\n') + + def _compute_loss(self, x, y, criterion): + x = self.padder.transform(x) + logits = self.forward(x) + return criterion(logits, torch.as_tensor(y).to(self.device)) def predict(self, x, batch_size=100): self.eval() @@ -47,8 +62,8 @@ class AuthorshipAttributionClassifier(nn.Module): predictions = [] for xi in tqdm(batcher.epoch(x), desc='test'): xi = self.padder.transform(xi) - logits = self.forward(torch.as_tensor(xi).to(self.device)) - prediction = tensor2numpy(torch.argmax(logits, dim=1).view(-1), self.device) + logits = self.forward(xi) + prediction = tensor2numpy(torch.argmax(logits, dim=1).view(-1)) predictions.append(prediction) return np.concatenate(predictions) @@ -61,7 +76,7 @@ class SameAuthorClassifier(nn.Module): def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'): super(SameAuthorClassifier, self).__init__() self.projector = projector.to(device) - self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False) + self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device) self.device = device def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100): @@ -94,7 +109,7 @@ class SameAuthorClassifier(nn.Module): xi = self.padder.transform(xi) zi = self.padder.transform(zi) inners = self.forward(xi, zi) - prediction = tensor2numpy(inners, device=self.device) > 0.5 # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}? + prediction = tensor2numpy(inners) > 0.5 # is this correct? 
should it be > 0 and the ideal kernel in field {-1,+1}? predictions.append(prediction) return np.concatenate(predictions) @@ -114,7 +129,7 @@ class FullAuthorClassifier(nn.Module): self.ff = FFProjection(input_size=projector.space_dimensions(), hidden_sizes=[1024], output_size=num_authors).to(device) - self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False) + self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device) self.device = device def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100): @@ -169,7 +184,7 @@ class FullAuthorClassifier(nn.Module): phi_zi = self.projector(zi) rows, cols = phi_xi.shape pairwise_inners = torch.bmm(phi_xi.view(rows, 1, cols), phi_zi.view(rows, cols, 1)).squeeze() - prediction = tensor2numpy(pairwise_inners, device=self.device) > 0.5 # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}? + prediction = tensor2numpy(pairwise_inners) > 0.5 # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}? predictions.append(prediction) return np.concatenate(predictions) @@ -181,7 +196,7 @@ class FullAuthorClassifier(nn.Module): xi = self.padder.transform(xi) phi = self.projector(xi) logits = self.ff(phi) - prediction = tensor2numpy( torch.argmax(logits, dim=1).view(-1), device=self.device) + prediction = tensor2numpy( torch.argmax(logits, dim=1).view(-1)) predictions.append(prediction) return np.concatenate(predictions) @@ -209,41 +224,6 @@ class FFProjection(nn.Module): x = self.ff[-1](x) return x -class RNNProjection(nn.Module): - def __init__(self, vocab_size, hidden_size, output_size, device='cpu'): - super(RNNProjection, self).__init__() - self.output_size = output_size - self.hidden_size = hidden_size - self.vocab_size = vocab_size - self.num_layers=1 - self.num_directions=1 - self.device=device - - self.embedding = nn.Embedding(vocab_size, hidden_size).to(device) - self.rnn = nn.GRU( - input_size=hidden_size, - hidden_size=hidden_size, - num_layers=self.num_layers, - bidirectional=(self.num_directions == 2), - batch_first=True - ).to(device) - self.projection = nn.Linear(self.num_layers * self.num_directions * self.hidden_size, output_size).to(device) - - def init_hidden(self, batch_size): - return torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(self.device) - - def forward(self, input): - x = torch.as_tensor(input).to(self.device) - batch_size = x.shape[0] - x = self.embedding(x) - output, hn = self.rnn(x, self.init_hidden(batch_size)) - hn = hn.view(self.num_layers, self.num_directions, batch_size, self.hidden_size) - hn = hn.permute(2, 0, 1, 3).reshape(batch_size, -1) - return self.projection(hn) - - def space_dimensions(self): - return self.output_size - class Batch: def __init__(self, batch_size, n_epochs, shuffle=True): @@ -300,7 +280,7 @@ class TwoClassBatch: class Padding: - def __init__(self, pad_index, max_length, dynamic=True, pad_at_end=True): + def __init__(self, pad_index, max_length, dynamic=True, pad_at_end=True, device='cpu'): """ :param pad_index: the index representing the PAD token :param max_length: the length that defines the padding @@ -313,6 +293,7 @@ class Padding: self.max_length = max_length self.dynamic = dynamic self.pad_at_end = pad_at_end + self.device = device def transform(self, X): """ @@ -327,4 +308,8 @@ class Padding: padded = [x + [self.pad] * (pad_length - x_len) for x, x_len in zip(X, lengths)] else: padded = [[self.pad] * (pad_length - 
x_len) + x for x, x_len in zip(X, lengths)] - return np.asarray(padded, dtype=int) + return torch.from_numpy(np.asarray(padded, dtype=int)).to(self.device) + + +def tensor2numpy(t): + return t.to('cpu').detach().numpy() \ No newline at end of file diff --git a/src/model/transformation.py b/src/model/transformation.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/model/cnn.py b/src/model/transformations.py similarity index 56% rename from src/model/cnn.py rename to src/model/transformations.py index 56cea80..fbbc2cd 100644 --- a/src/model/cnn.py +++ b/src/model/transformations.py @@ -44,5 +44,40 @@ class CNNProjection(nn.Module): logit = self.fc1(x) # (N, C) return logit + def space_dimensions(self): + return self.output_size + + +class RNNProjection(nn.Module): + def __init__(self, vocab_size, hidden_size, output_size, device='cpu'): + super(RNNProjection, self).__init__() + self.output_size = output_size + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.num_layers=1 + self.num_directions=1 + self.device = device + + self.embedding = nn.Embedding(vocab_size, hidden_size).to(device) + self.rnn = nn.GRU( + input_size=hidden_size, + hidden_size=hidden_size, + num_layers=self.num_layers, + bidirectional=(self.num_directions == 2), + batch_first=True + ).to(device) + self.projection = nn.Linear(self.num_layers * self.num_directions * self.hidden_size, output_size).to(device) + + def init_hidden(self, batch_size): + return torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(self.device) + + def forward(self, x): + batch_size = x.shape[0] + x = self.embedding(x) + output, hn = self.rnn(x, self.init_hidden(batch_size)) + hn = hn.view(self.num_layers, self.num_directions, batch_size, self.hidden_size) + hn = hn.permute(2, 0, 1, 3).reshape(batch_size, -1) + return self.projection(hn) + def space_dimensions(self): return self.output_size \ No newline at end of file
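
Note: a minimal end-to-end sketch of how the relocated modules are meant to fit together, based only on the signatures visible in this diff. The Index and AuthorshipDataset internals are not shown here, paths and hyperparameters are illustrative, and RNNProjection is used because the CNNProjection constructor is not visible in this diff.

# usage sketch (illustrative values; Index API as exercised in src/main.py above)
import numpy as np
import torch

from data.fetch_imdb62 import Imdb62
from index import Index
from model.transformations import RNNProjection
from model.classifiers import AuthorshipAttributionClassifier

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = Imdb62(data_path='../data/imdb62/imdb62.txt')
Xtr, ytr = dataset.train.data, dataset.train.target
Xte, yte = dataset.test.data, dataset.test.target
n_authors = len(np.unique(ytr))

index = Index(analyzer='char', ngram_range=(1, 1))
Xtr = index.fit_transform(Xtr)
Xte = index.transform(Xte)
pad_index = index.add_word('PADTOKEN')

projector = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=128, output_size=128, device=device)
cls = AuthorshipAttributionClassifier(projector, num_authors=n_authors, pad_index=pad_index, pad_length=1000, device=device)
cls.fit(Xtr, ytr, batch_size=64, epochs=256, log='../log/tmp.csv')
yte_pred = cls.predict(Xte)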
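Note: the ideal-kernel construction used in SameAuthorClassifier and FullAuthorClassifier, np.outer(1 + yi, 1 / (yi + 1)) == 1, is simply a same-author indicator matrix in {0,1}; the snippet below checks this and shows an equivalent integer-only form that avoids the exact float comparison (which can fail for some label values, e.g. 49 * (1 / 49) != 1.0 in double precision). Whether the targets should instead live in {-1,+1} (as the in-code question asks) is left open here.

import numpy as np

y = np.array([0, 0, 3, 1, 3])

# form used in the diff: (1 + yi) * 1/(1 + yj) == 1  iff  yi == yj
ideal_kernel = 1 * (np.outer(1 + y, 1 / (y + 1)) == 1)

# equivalent integer-only form, immune to float rounding
indicator = 1 * (y[:, None] == y[None, :])

assert (ideal_kernel == indicator).all()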
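Note: the classifiers configure Padding with dynamic=True and pad_at_end=False, and transform now returns a tensor already moved to the target device. A small illustration of that behaviour (values are arbitrary):

from model.classifiers import Padding

padder = Padding(pad_index=0, max_length=5, dynamic=True, pad_at_end=False, device='cpu')
batch = [[7, 8, 9], [4, 5], [1, 2, 3, 4, 5, 6]]
print(padder.transform(batch))
# tensor([[0, 0, 7, 8, 9],
#         [0, 0, 0, 4, 5],
#         [1, 2, 3, 4, 5]])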