From acb38d4aae77441f4addda57f09d1e376c4a3a45 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Thu, 23 Jul 2020 14:29:00 +0200
Subject: [PATCH] kernel loss based on BCE

---
 src/main.py              |  1 -
 src/model/classifiers.py | 68 ++++++++++++++++++++++++++++++++--------
 src/model/layers.py      | 17 ++--------
 3 files changed, 57 insertions(+), 29 deletions(-)

diff --git a/src/main.py b/src/main.py
index 756d6a2..1bed730 100644
--- a/src/main.py
+++ b/src/main.py
@@ -81,7 +81,6 @@ def main(opt):
                         activation=nn.functional.relu,
                         dropout=0.5,
                         activate_last=True),
-        #norm=L2Norm()
     ).to(device)
 
     cls = AuthorshipAttributionClassifier(
diff --git a/src/model/classifiers.py b/src/model/classifiers.py
index c54e2b0..e69a919 100644
--- a/src/model/classifiers.py
+++ b/src/model/classifiers.py
@@ -27,6 +27,7 @@ class AuthorshipAttributionClassifier(nn.Module):
         #batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=X.shape[0]//batch_size)
         batcher_val = Batch(batch_size=batch_size, n_epochs=epochs, shuffle=False)
         criterion = torch.nn.CrossEntropyLoss().to(self.device)
+        savcriterion = torch.nn.BCEWithLogitsLoss().to(self.device)
         optim = torch.optim.Adam(self.parameters(), lr=lr)
 
         X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y)
@@ -53,15 +54,25 @@ class AuthorshipAttributionClassifier(nn.Module):
                     loss_attr_value = loss_attr.item()
 
                 if alpha < 1:
-                    phi = F.normalize(phi)
-                    # todo: optimize (only upper diagonal)
-                    kernel = torch.matmul(phi, phi.T)
-                    ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
+                    # choose a balanced number of positive (same-author) and negative (different-author) pairs
+                    idx1, idx2, sav_labels = choose_sav_pairs(yi, npairs=batch_size)
+
+                    phi1 = phi[idx1]
+                    phi2 = phi[idx2]
+                    cross = torch.bmm(phi1.unsqueeze(1), phi2.unsqueeze(2)).squeeze()  # pairwise dot products of the (L2-normalized) embeddings, used as logits
+                    loss_sav = savcriterion(cross.unsqueeze(0), torch.as_tensor(sav_labels).float().unsqueeze(0).to(self.device))
+                    loss_sav_value = loss_sav.item()
+
+                    # a binary cross-entropy criterion replaces the kernel-target alignment (KTA) loss below -- let's see how it works
+
+                    ## todo: optimize (only upper diagonal)
+                    #kernel = torch.matmul(phi, phi.T)
+                    #ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
                     # todo: maybe the KALoss should take into consideration the balance (it is more likely to have
                     #  a pair of negative examples than positives)
-                    loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
-                    loss_sav_value = loss_sav.item()
+                    #loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
+                    #loss_sav_value = loss_sav.item()
 
                 loss = loss_attr*alpha + loss_sav*(1.-alpha)
@@ -77,8 +88,7 @@ class AuthorshipAttributionClassifier(nn.Module):
                       f'attr-loss={np.mean(attr_losses):.5f} '
                       f'sav-loss={np.mean(sav_losses):.5f} '
                       f'val_loss={val_loss:.5f} '
-                      f'patience={early_stop.patience}/{early_stop.patience_limit}'
-                      )
+                      f'patience={early_stop.patience}/{early_stop.patience_limit}')
 
             # validation
             self.eval()
@@ -126,6 +136,38 @@
         return self.ff(phi)
 
 
+def choose_sav_pairs(y, npairs):
+    n = len(y)
+    y = y+1  # reindex from [0..n_classes-1] to [1..n_classes] for convenience
+    same_author = (np.outer(y, 1/y) == 1)
+    triu = np.triu_indices(n, k=1)
+    same_author_nodup = same_author[triu]
+    idxi, idxj = triu
+
+    posi, negi = idxi[same_author_nodup], idxi[~same_author_nodup]
+    posj, negj = idxj[same_author_nodup], idxj[~same_author_nodup]
+    num_pos = same_author_nodup.sum()
+    num_neg = len(same_author_nodup)-num_pos  # == len(negj)
+
+    # balanced sampling: npairs//2 positives and npairs//2 negatives (with replacement if too few are available)
+    pos_take = np.random.choice(np.arange(num_pos), npairs//2, replace=num_pos < npairs//2)
+    posi, posj = posi[pos_take], posj[pos_take]
+
+    neg_take = np.random.choice(np.arange(num_neg), npairs//2, replace=num_neg < npairs//2)
+    negi, negj = negi[neg_take], negj[neg_take]
+
+    idx1 = np.concatenate([posi, negi])
+    idx2 = np.concatenate([posj, negj])
+    savlabels = np.array([1]*len(posi) + [0]*len(negi))
+
+    print(f'generated {len(posi)} positive and {len(negi)} negative pairs')
+    return idx1, idx2, savlabels
+
+
+
+
+
 class SameAuthorClassifier(nn.Module):
     def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
         super(SameAuthorClassifier, self).__init__()
@@ -255,11 +297,11 @@ class FullAuthorClassifier(nn.Module):
         return np.concatenate(predictions)
 
 
-def KernelAlignmentLoss(K, Y):
-    n_el = K.shape[0]*K.shape[1]
-    loss = torch.norm(K - Y, p='fro')  # in Nello's paper this is different
-    loss = loss / n_el  # this is in order to factor out the accumulation which is only due to the size
-    return loss
+#def KernelAlignmentLoss(K, Y):
+#    n_el = K.shape[0]*K.shape[1]
+#    loss = torch.norm(K - Y, p='fro')  # in Nello's paper this is different
+#    loss = loss / n_el  # this is in order to factor out the accumulation which is only due to the size
+#    return loss
 
 
 
diff --git a/src/model/layers.py b/src/model/layers.py
index 3e2b87d..b8b568f 100644
--- a/src/model/layers.py
+++ b/src/model/layers.py
@@ -6,17 +6,16 @@ import torch.nn.functional as F
 
 
 class Phi(nn.Module):
-    def __init__(self, cnn, ff, norm=None):
+    def __init__(self, cnn, ff):
         super(Phi, self).__init__()
         self.cnn = cnn
         self.ff = ff
-        #self.norm = norm
         self.output_size = self.ff.output_size
 
     def forward(self, x):
         x = self.cnn(x)
         x = self.ff(x)
-        #x = self.norm(x)
+        x = F.normalize(x, p=2, dim=-1)
         return x
 
 
@@ -48,18 +47,6 @@ class CNNProjection(nn.Module):
         return x
 
 
-class L2Norm(nn.Module):
-    def __init__(self, p=2, dim=-1):
-        super(L2Norm, self).__init__()
-        self.p=p
-        self.dim=dim
-
-    def forward(self, x):
-        norm = x.norm(p=self.p, dim=self.dim, keepdim=True)
-        x = x.div(norm.expand_as(x))
-        return x
-
-
 class FFProjection(nn.Module):
     def __init__(self, input_size, hidden_sizes, output_size,
                  activation=nn.functional.relu, dropout=0.5, activate_last=False):
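A minimal usage sketch of the new BCE-based same-author-verification (SAV) loss introduced by this patch (illustrative only, not part of the patch). It assumes src/ is on the Python path so that choose_sav_pairs can be imported from model.classifiers, and it uses random embeddings and author labels purely for illustration; in the patch the embeddings come from Phi, which now L2-normalizes its output, so the pairwise dot products are cosine similarities used as logits.

    import numpy as np
    import torch
    import torch.nn.functional as F

    from model.classifiers import choose_sav_pairs  # assumes src/ is on PYTHONPATH

    batch_size, emb_dim, n_authors = 64, 32, 5
    yi = np.random.randint(0, n_authors, size=batch_size)        # author ids for the batch (dummy data)
    phi = F.normalize(torch.randn(batch_size, emb_dim), dim=-1)  # stand-in for Phi(x) outputs

    # balanced same-author / different-author pairs drawn from the batch
    idx1, idx2, sav_labels = choose_sav_pairs(yi, npairs=batch_size)

    # dot products of unit-norm embeddings serve as logits for the pair classifier
    cross = torch.bmm(phi[idx1].unsqueeze(1), phi[idx2].unsqueeze(2)).squeeze()

    savcriterion = torch.nn.BCEWithLogitsLoss()
    loss_sav = savcriterion(cross, torch.as_tensor(sav_labels).float())
    print(loss_sav.item())

Compared with the kernel-target alignment loss it replaces, this formulation controls the positive/negative balance explicitly through the sampled pairs, which is the concern raised in the patch's own todo comment about the KALoss.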