diff --git a/Notes.txt b/Notes.txt
new file mode 100644
index 0000000..c3c1044
--- /dev/null
+++ b/Notes.txt
@@ -0,0 +1,6 @@
+So far I have two sets of experiments:
+a) some that are better than Ruder's, with one extra classification layer (that is, there is phi(x) and then two layers)
+b) some "simplified" ones that are worse than Ruder's because I have removed that additional layer
+I also saw that results improved with l2(phi(x)), so I have left it that way
+Now I am going to try adding that additional layer as the last step in phi(x) <-- running
+Then I want to try imposing the regularization on all layers before the classification...
\ No newline at end of file
diff --git a/README.md b/README.md
index d8673f2..a8a2080 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,3 @@
-# kernel_authorship
+# A Kernel-Target Alignment Regularization for Authorship Analysis
+
diff --git a/TODO.txt b/TODO.txt
new file mode 100644
index 0000000..7521cbd
--- /dev/null
+++ b/TODO.txt
@@ -0,0 +1,30 @@
+Things to clarify:
+maybe I have to review the validation of the sav-loss; since it is batched, it might always be checking the same
+    submatrices for alignment, and those may be mostly positive or mostly near an identity?
+maybe the sav-loss is something that makes sense to impose, as a regularization, across many of the last layers, and not
+    only the last one?
+process the datasets and leave the choice as a generic parameter
+padding could start at any random point between [0, length_i-pad_length] (see the sketch below this diff)
+    - in training, pad to the shortest
+    - in test, pad to the largest
+save and restore checkpoints
+should phi(x) be normalized? if so:
+    - better as the last step of phi?
+    - better outside phi, just before the Gram matrix computation?
+should the single-label classifier have some sort of non-linearity from phi(x) to the labels?
+SAV: how should the range of k(xi,xj) be interpreted? how to decide the value threshold for returning -1 or +1?
+    I guess the best thing to do is to learn a simple threshold, one feed-forward 1-to-1
+is the TwoClassBatch the best way?
+are the contributions of the two losses comparable? or does one contribute far more than the other?
+what is the best representation for inputs? char-based? ngram-based? word-based? or a multichannel one?
+    I think this is irrelevant for the paper
+not clear whether the single-label classifier should work out a ff on top of the intermediate representation, or should
+    instead work directly on the representations with one simple linear projection; not clear either whether the kernel
+    should be computed on any further elaboration of the intermediate representation... the thing is that the KTA regularization
+    is imposing unimodality (documents from the same author should point in a single direction), while working out another
+    representation for the single-label classifier could instead relax this and attribute to the same author vectors that
+    come from a multimodal distribution. No... this "unimodality" should exist anyway in the last layer. Indeed, I am starting
+    to think that the optimum of any classifier should already impose something similar to the KTA criterion in the
+    last layer... Is this redundant?
+not clear whether we should define the loss as in "On kernel target alignment", i.e., with the Frobenius inner product in the
+    numerator (and the sign flipped to minimize), or as the ||K-Y||_F norm. What about the denominator (now, the normalization factor is n**2)? (see the sketch below this diff)
\ No newline at end of file
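
A minimal sketch of the random-offset padding idea from the TODO above; the helper name random_crop and the pad_index default are illustrative and not taken from this repository:

    import numpy as np

    def random_crop(doc_ids, pad_length, pad_index=0):
        # short documents are padded up to pad_length with the padding index
        if len(doc_ids) <= pad_length:
            return doc_ids + [pad_index] * (pad_length - len(doc_ids))
        # long documents: take a window of pad_length tokens starting at a random offset
        # in [0, len(doc_ids) - pad_length]
        start = np.random.randint(0, len(doc_ids) - pad_length + 1)
        return doc_ids[start:start + pad_length]

At test time a fixed window (e.g., always starting at 0) would keep predictions deterministic.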
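
For the last TODO item, a sketch contrasting the two candidate loss definitions (the function names are illustrative; only the second mirrors what KernelAlignmentLoss in classifiers.py currently computes). The alignment of Cristianini et al. puts the Frobenius inner product <K, Y>_F in the numerator and is already normalized by ||K||_F * ||Y||_F, so it needs no extra n**2 factor and is maximized (hence the sign flip); the alternative is the normalized distance ||K - Y||_F / n**2.

    import torch

    def neg_alignment_loss(K, Y, eps=1e-8):
        # "On kernel target alignment": A(K, Y) = <K, Y>_F / (||K||_F * ||Y||_F), in [-1, 1];
        # the sign is flipped so that maximizing alignment becomes a minimization objective
        alignment = (K * Y).sum() / (torch.norm(K, p='fro') * torch.norm(Y, p='fro') + eps)
        return -alignment

    def frobenius_loss(K, Y):
        # the ||K - Y||_F variant; dividing by the number of entries removes the batch-size dependence
        n_el = K.shape[0] * K.shape[1]
        return torch.norm(K - Y, p='fro') / n_el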
diff --git a/experiments.sh b/experiments.sh
new file mode 100644
index 0000000..7158f75
--- /dev/null
+++ b/experiments.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+conda activate torch
+
+dataset=enron
+for authors in 10 50 ; do
+  for alpha in 1 0.999 0.99 0.9 0.5 ; do
+    python main.py --dataset $dataset -A $authors -s 0 -o ../results_$dataset.csv --alpha $alpha
+  done
+done
+
+dataset=imdb62
+for alpha in 1 0.999 0.99 0.9 0.5 ; do
+  python main.py --dataset $dataset -A -1 -s 0 -o ../results_$dataset.csv --alpha $alpha
+done
diff --git a/src/main.py b/src/main.py
index fe2bbd3..8685737 100644
--- a/src/main.py
+++ b/src/main.py
@@ -76,11 +76,11 @@ def main(opt):
         kernel_sizes=opt.kernelsizes,
         dropout=0.5
     ).to(device)
+    print(phi)
     cls = AuthorshipAttributionClassifier(
         phi, num_authors=A.size, pad_index=pad_index, pad_length=opt.pad, device=device
     )
-    print(cls)

     if opt.name == 'auto':
diff --git a/src/model/classifiers.py b/src/model/classifiers.py
index 93948e4..b2c0515 100644
--- a/src/model/classifiers.py
+++ b/src/model/classifiers.py
@@ -7,6 +7,7 @@ import math
 from sklearn.model_selection import train_test_split
 
 from model.early_stop import EarlyStop
+from model.transformations import FFProjection
 
 
 class AuthorshipAttributionClassifier(nn.Module):
@@ -55,8 +56,11 @@ class AuthorshipAttributionClassifier(nn.Module):
                 loss_attr_value = loss_attr.item()
 
                 if alpha < 1:
+                    # todo: optimize (compute only the upper triangle)
                     kernel = torch.matmul(phi, phi.T)
                     ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
+                    # todo: maybe the KALoss should take into consideration the class balance (a pair of
+                    #  negative examples is far more likely than a positive pair)
                     loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
                     loss_sav_value = loss_sav.item()
 
@@ -254,26 +258,10 @@ class FullAuthorClassifier(nn.Module):
 
 def KernelAlignmentLoss(K, Y):
     n_el = K.shape[0]*K.shape[1]
     loss = torch.norm(K - Y, p='fro')  # in Nello's paper this is different
-    loss = loss / n_el # this is in order to factor out the accumulation which is only due to the size
+    loss = loss / n_el  # factor out the accumulation that is only due to the matrix size
     return loss
 
 
-class FFProjection(nn.Module):
-    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
-        super(FFProjection, self).__init__()
-        sizes = [input_size] + hidden_sizes + [output_size]
-        self.ff = nn.ModuleList([
-            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
-        ])
-        self.activation = activation
-        self.dropout = nn.Dropout(p=dropout)
-
-    def forward(self, x):
-        for linear in self.ff[:-1]:
-            x = self.dropout(self.activation(linear(x)))
-        x = self.ff[-1](x)
-        return x
-
 
 class Batch:
     def __init__(self, batch_size, n_epochs=1, shuffle=True):
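
One possible way to address the balance todo in the hunk above, sketched here rather than added to the code: build the ideal kernel by direct label comparison (equivalent to the np.outer(1 + yi, 1 / (yi + 1)) == 1 trick) and average the error separately over same-author and different-author pairs, so that the far more numerous negative pairs do not dominate. The function name and the equal 0.5/0.5 weighting are assumptions.

    import torch

    def balanced_kernel_alignment_loss(K, yi):
        yi = torch.as_tensor(yi, device=K.device)
        # 1 for same-author pairs (including the diagonal), 0 otherwise
        ideal = (yi.unsqueeze(0) == yi.unsqueeze(1)).float()
        pos = ideal.bool()
        neg = ~pos
        err = (K - ideal) ** 2
        # average the squared error separately over the two pair types, then combine with equal weight
        # (assumes the batch contains both same-author and different-author pairs)
        return 0.5 * (err[pos].mean() + err[neg].mean())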
diff --git a/src/model/transformations.py b/src/model/transformations.py
index 53e7ab1..4380213 100644
--- a/src/model/transformations.py
+++ b/src/model/transformations.py
@@ -13,47 +13,61 @@ class CNNProjection(nn.Module):
         self.convs1 = nn.ModuleList(
             [nn.Conv2d(channels_in, channels_out, (K, embedding_dim)) for K in kernel_sizes]
         )
-        '''
-        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
-        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
-        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
-        '''
         self.dropout = nn.Dropout(dropout)
-        self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
+        # self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
+        self.fc = FFProjection(input_size=len(kernel_sizes) * channels_out,
+                               hidden_sizes=[1024],
+                               output_size=out_size,
+                               activation=nn.functional.relu,
+                               dropout=dropout)
         self.output_size = out_size
 
+    def convolve(self, x):
+        x = x.unsqueeze(1)  # (N, Ci, W, D)
+        x = [self.conv_and_pool(x, conv) for conv in self.convs1]  # [(N, Co), ...]*len(Ks)
+        x = torch.cat(x, 1)
+        return x
+
     def conv_and_pool(self, x, conv):
         x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
         x = F.max_pool1d(x, x.size(2)).squeeze(2)
         return x
 
-    def forward(self, x):
-        x = self.embed(x)  # (N, W, D)
-        x = x.unsqueeze(1)  # (N, Ci, W, D)
-        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
-        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
-        x = torch.cat(x, 1)
-
-        '''
-        x1 = self.conv_and_pool(x, self.conv13)  # (N, Co)
-        x2 = self.conv_and_pool(x, self.conv14)  # (N, Co)
-        x3 = self.conv_and_pool(x, self.conv15)  # (N, Co)
-        x = torch.cat((x1, x2, x3), 1)  # (N, len(Ks)*Co)
-        '''
-
-        x = F.relu(self.fc1(x))  # (N, C)
-
+    def l2norm(self, x):
         norm = x.norm(p=2, dim=1, keepdim=True)
         x = x.div(norm.expand_as(x))
+        return x
 
-        x = self.dropout(x)  # (N, len(Ks)*Co)
-
+    def forward(self, x):
+        x = self.embed(x)  # (N, W, D)
+        x = self.convolve(x)  # (N, len(Ks)*Co)
+        x = self.fc(x)
+        # x = F.relu(self.fc1(x))  # (N, C)
+        # x = self.dropout(x)
+        x = self.l2norm(x)
         return x
 
     def space_dimensions(self):
         return self.output_size
 
 
+class FFProjection(nn.Module):
+    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
+        super(FFProjection, self).__init__()
+        sizes = [input_size] + hidden_sizes + [output_size]
+        self.ff = nn.ModuleList([
+            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
+        ])
+        self.activation = activation
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x):
+        for linear in self.ff[:-1]:
+            x = self.dropout(self.activation(linear(x)))
+        x = self.ff[-1](x)
+        return x
+
+
 class RNNProjection(nn.Module):
     def __init__(self, vocab_size, hidden_size, output_size, device='cpu'):
         super(RNNProjection, self).__init__()
diff --git a/src/tools/gen_tables.py b/src/tools/gen_tables.py
new file mode 100644
index 0000000..72e6947
--- /dev/null
+++ b/src/tools/gen_tables.py
@@ -0,0 +1,14 @@
+import pandas as pd
+from glob import glob
+
+filedir = '../../results_*.csv'
+
+df = [pd.read_csv(file, sep='\t') for file in glob(filedir)]
+df = pd.concat(df)
+
+df[['dataset', 'authors', 'docs', 'seed']] = df.Dataset.str.split('_', expand=True)
+df = df.drop(columns='Dataset')
+
+pv = df.pivot_table(index=['dataset', 'authors', 'docs', 'Method'], values=['microF1', 'val_microF1'])
+
+print(pv)
\ No newline at end of file
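
On the normalization question from TODO.txt and the l2norm step that now closes CNNProjection.forward: with l2-normalized phi, the batch Gram matrix contains cosine similarities bounded in [-1, 1], which is commensurate with the {0, 1} ideal kernel; without it, the entries are unbounded dot products. A standalone sketch using random tensors, not repository code:

    import torch
    import torch.nn.functional as F

    phi = torch.randn(8, 32)                  # stand-in for a batch of phi(x) vectors
    phi_n = F.normalize(phi, p=2, dim=1)      # same effect as CNNProjection.l2norm
    K_raw = phi @ phi.T                       # unbounded dot products
    K_cos = phi_n @ phi_n.T                   # cosine similarities in [-1, 1], diagonal exactly 1
    print(K_raw.abs().max().item(), K_cos.abs().max().item())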