From fe5bcfe61bf715a9d0f8817eafbd76520ca2189f Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Tue, 28 Apr 2020 11:18:52 +0200
Subject: [PATCH] first commit

---
 src/data/AuthorshipDataset.py | 141 ++++++++++++++++
 src/data/fetch_victorian.py   |  35 ++++
 src/evaluation.py             |   9 +
 src/index.py                  |  52 ++++++
 src/main.py                   |  81 +++++++++
 src/model.py                  | 303 ++++++++++++++++++++++++++++++++++
 6 files changed, 621 insertions(+)
 create mode 100644 src/data/AuthorshipDataset.py
 create mode 100644 src/data/fetch_victorian.py
 create mode 100644 src/evaluation.py
 create mode 100644 src/index.py
 create mode 100644 src/main.py
 create mode 100644 src/model.py

diff --git a/src/data/AuthorshipDataset.py b/src/data/AuthorshipDataset.py
new file mode 100644
index 0000000..e3b33cc
--- /dev/null
+++ b/src/data/AuthorshipDataset.py
@@ -0,0 +1,141 @@
+from abc import ABC, abstractmethod
+import random
+import numpy as np
+from collections import Counter
+
+
+class LabelledCorpus:
+
+    def __init__(self, documents, labels):
+        if not isinstance(documents, np.ndarray): documents = np.asarray(documents)
+        if not isinstance(labels, np.ndarray): labels = np.asarray(labels)
+        self.data = documents
+        self.target = labels
+
+    def __len__(self):
+        return len(self.data)
+
+    @classmethod
+    def filter(cls, labelled_corpus, to_drop):
+        sel_data, sel_target = [], []
+        for i in range(len(labelled_corpus)):
+            if labelled_corpus.target[i] not in to_drop:
+                sel_data.append(labelled_corpus.data[i])
+                sel_target.append(labelled_corpus.target[i])
+        return LabelledCorpus(sel_data, sel_target)
+
+
+class AuthorshipDataset(ABC):
+
+    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
+        self.data_path = data_path
+
+        random.seed(random_state)
+        np.random.seed(random_state)
+
+        self._check_n_authors(n_authors, n_open_set_authors)
+
+        self.train, self.test, self.target_names = self._fetch_and_split()
+
+        self._assure_docs_by_author(docs_by_author)
+
+        self._reduce_authors_documents(n_authors, docs_by_author, n_open_set_authors)
+
+        self._remove_label_gaps()
+
+        super().__init__()
+
+    @abstractmethod
+    def _fetch_and_split(self):
+        pass
+
+    @abstractmethod
+    def _check_n_authors(self, n_authors, n_open_set_authors):
+        pass
+
+    def _reduce_authors_documents(self, n_authors, n_docs_by_author, n_open_set_authors):
+
+        if n_authors != -1 or n_docs_by_author != -1:
+            # training data only (test contains all examples by author)
+            if n_docs_by_author != -1:
+                docs_by_author = self.group_by(self.train.data, self.train.target)
+                train_labels, train_data = [], []
+                for author, documents in docs_by_author.items():
+                    if n_docs_by_author > len(documents):
+                        continue
+                    selected_docs = random.sample(documents, n_docs_by_author)
+                    train_labels.extend([author] * n_docs_by_author)
+                    train_data.extend(selected_docs)
+
+                self.train = LabelledCorpus(train_data, train_labels)
+
+            if n_authors == -1:
+                selected_authors = self.target_names
+            else:
+                selected_authors = random.sample(self.target_names, n_authors + n_open_set_authors)
+            self.test = self.extract_documents_from_authors(self.test, selected_authors)
+            self.train = self.extract_documents_from_authors(self.train, selected_authors)
+        else:
+            selected_authors = np.unique(self.train.target)
+
+        if n_open_set_authors > 0:
+            self.train, self.test, self.test_out = self.disjoint_train_test_authors(
+                self.train, self.test, n_open_set_authors, selected_authors
+            )
+        else:
+            self.test_out = None
+
+    # reindex labels so that the unique labels are equal to range(#num_different_authors)
+    # and unique training labels are range(#num_different_training_authors)
+    def _remove_label_gaps(self):
+
+        # reindex the training labels first, so that they contain no gaps
+        unique_labels = np.unique(self.train.target)
+        recode = {old: new for old, new in zip(unique_labels, range(len(unique_labels)))}
+        self.train.target = np.array([recode[l] for l in self.train.target])
+        self.test.target = np.array([recode[l] for l in self.test.target])
+
+        # test_out labels (if requested) contain additional authors
+        if self.test_out is not None:
+            for l in np.unique(self.test_out.target):
+                if l not in recode:
+                    recode[l] = len(recode)
+            self.test_out.target = np.array([recode[l] for l in self.test_out.target])
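+
+    # Illustrative note (not in the original code): if the training labels are [3, 7, 7, 12],
+    # the recoding map is {3: 0, 7: 1, 12: 2} and the train/test targets become [0, 1, 1, 2];
+    # an open-set author 9 appearing only in test_out would then be appended as 9 -> 3.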
+
+    def group_by(self, docs, authors):
+        return {i: docs[authors == i].tolist() for i in np.unique(authors)}
+
+    def extract_documents_from_authors(self, labelled_docs, authors):
+        X, y = labelled_docs.data, labelled_docs.target
+        if not isinstance(X, np.ndarray): X = np.asarray(X)
+        if not isinstance(y, np.ndarray): y = np.asarray(y)
+        idx = np.logical_or.reduce([y == i for i in authors])
+        return LabelledCorpus(X[idx], y[idx])
+
+    def disjoint_train_test_authors(self, train, test, n_open_test_authors, selected_authors):
+        train_authors, test_authors = selected_authors[n_open_test_authors:], selected_authors[:n_open_test_authors]
+
+        train = self.extract_documents_from_authors(train, train_authors)
+        test_in = self.extract_documents_from_authors(test, train_authors)
+        test_out = self.extract_documents_from_authors(test, test_authors)
+
+        return train, test_in, test_out
+
+    def _assure_docs_by_author(self, docs_by_author):
+        if docs_by_author == -1:
+            return
+
+        author_doc_count = Counter(self.train.target)
+        to_remove = frozenset([id for id, count in author_doc_count.most_common() if count < docs_by_author])
+        if len(to_remove) > 0:
+            self.train = LabelledCorpus.filter(self.train, to_remove)
+            self.test = LabelledCorpus.filter(self.test, to_remove)
+            self.target_names = sorted(set(self.target_names) - to_remove)
+
diff --git a/src/data/fetch_victorian.py b/src/data/fetch_victorian.py
new file mode 100644
index 0000000..8d0456e
--- /dev/null
+++ b/src/data/fetch_victorian.py
@@ -0,0 +1,35 @@
+import numpy as np
+import csv
+from sklearn.model_selection import train_test_split
+from data.AuthorshipDataset import AuthorshipDataset, LabelledCorpus
+
+
+class Victorian(AuthorshipDataset):
+
+    TEST_SIZE = 0.30
+
+    def __init__(self, data_path='../data/victoria', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
+        super().__init__(data_path, n_authors, docs_by_author, n_open_set_authors, random_state)
+
+    def _fetch_and_split(self):
+
+        data, labels = [], []
+
+        with open(f'{self.data_path}/Gungor_2018_VictorianAuthorAttribution_data-train.csv', 'r', encoding="latin-1") as file:
+            csv_reader = csv.reader(file, delimiter=',')
+            next(csv_reader)  # skip the header row
+            for row in csv_reader:
+                # if row[0]!='text':
+                data.append(row[0])
+                labels.append(int(row[1]))
+
+        target_names = sorted(np.unique(labels))
+
+        train_data, test_data, train_labels, test_labels = \
+            train_test_split(data, labels, test_size=Victorian.TEST_SIZE, stratify=labels)
+
+        return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names
+
+    def _check_n_authors(self, n_authors, n_open_set_authors):
+        pass
\ No newline at end of file
diff --git a/src/evaluation.py b/src/evaluation.py
new file mode 100644
index 0000000..6b018db
--- /dev/null
+++ b/src/evaluation.py
@@ -0,0 +1,9 @@
+from sklearn.metrics import f1_score, accuracy_score
+
+
+def eval(y_true, y_pred):
+    acc = accuracy_score(y_true, y_pred)
+    f1 = f1_score(y_true, y_pred, average='macro')
+    print(f'acc={acc * 100:.2f}%')
+    print(f'macro-f1={f1:.2f}')
+    return acc, f1
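+
+
+# Illustrative usage (toy values, not from any experiment):
+# >>> eval([0, 1, 1], [0, 1, 0])
+# acc=66.67%
+# macro-f1=0.67
+# The call returns the pair (acc, f1), here (2/3, 2/3).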
diff --git a/src/index.py b/src/index.py
new file mode 100644
index 0000000..545240a
--- /dev/null
+++ b/src/index.py
@@ -0,0 +1,52 @@
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+from tqdm import tqdm
+
+
+class Index:
+    def __init__(self, **kwargs):
+        """
+        :param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_
+        """
+        self.vect = CountVectorizer(**kwargs)
+        self.unk = -1  # a valid index is assigned after fit
+
+    def fit(self, X):
+        """
+        :param X: a list of strings
+        :return: self
+        """
+        self.vect.fit(X)
+        self.analyzer = self.vect.build_analyzer()
+        self.vocabulary = self.vect.vocabulary_
+        self.unk = self.add_word('UNKTOKEN')
+        return self
+
+    def transform(self, X):
+        assert self.unk > 0, 'transform called before fit'
+        return np.asarray([
+            [self.vocabulary.get(word, self.unk) for word in self.analyzer(doc)]
+            for doc in tqdm(X, desc='indexing')
+        ])
+
+    def fit_transform(self, X):
+        return self.fit(X).transform(X)
+
+    def vocabulary_size(self):
+        return len(self.vocabulary) + 1  # the reserved unk token
+
+    def add_word(self, word):
+        if word in self.vocabulary:
+            raise ValueError(f'word {word} already in dictionary')
+        self.vocabulary[word] = len(self.vocabulary)
+        return self.vocabulary[word]
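+
+
+# Illustrative usage (assumes CountVectorizer's default lowercasing; ids follow the sorted vocabulary):
+# >>> index = Index(analyzer='char')
+# >>> index.fit_transform(['abc', 'abd']).tolist()
+# [[0, 1, 2], [0, 1, 3]]        # a=0, b=1, c=2, d=3; 'UNKTOKEN' is appended with id 4
+# >>> index.transform(['axe']).tolist()
+# [[0, 4, 4]]                   # unseen characters ('x', 'e') fall back to the unk id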
a ver", +# "una cuarta prueba con otro trozo de texto" +#] +#y = [0,0,1,1] + + +index = Index(analyzer='char') +Xtr = index.fit_transform(Xtr) +Xte = index.transform(Xte) +pad_index = index.add_word('PADTOKEN') + +shuffle1 = np.random.permutation(Xte.shape[0]) +shuffle2 = np.random.permutation(Xte.shape[0]) +x1, y1 = Xte[shuffle1], yte[shuffle1] +x2, y2 = Xte[shuffle2], yte[shuffle2] +paired_y = y1==y2 + +hidden_size=64 +output_size=128 +pad_length=1000 +batch_size=50 +n_epochs=10 + +hidden_size=16 +output_size=32 +pad_length=100 +batch_size=10 +n_epochs=2 + + +# attribution +print('Attribution') +phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size) +cls = AuthorshipAttributionClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device) +cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs) +yte_ = cls.predict(Xte) +eval(yte, yte_) + +# verification +print('Verification') +phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size) +cls = SameAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device) +cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs) +paired_y_ = cls.predict(x1,x2) +eval(paired_y, paired_y_) + +# attribution & verification +print('Attribution & Verification') +phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size) +cls = FullAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device) +cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs) +yte_ = cls.predict_labels(Xte) +eval(yte, yte_) +paired_y_ = cls.predict_sav(x1,x2) +eval(paired_y, paired_y_) diff --git a/src/model.py b/src/model.py new file mode 100644 index 0000000..49ce2f2 --- /dev/null +++ b/src/model.py @@ -0,0 +1,303 @@ +import numpy as np +import torch +import torch.nn as nn +from tqdm import tqdm +import math + + +class AuthorshipAttributionClassifier(nn.Module): + def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'): + super(AuthorshipAttributionClassifier, self).__init__() + self.projector = projector.to(device) + self.label = nn.Linear(self.projector.space_dimensions(), num_authors).to(device) + self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False) + self.device=device + + def fit(self, X, y, batch_size, epochs, lr=0.001): + self.train() + batcher = Batch(batch_size=batch_size, n_epochs=epochs) + criterion = torch.nn.CrossEntropyLoss().to(self.device) + optim = torch.optim.Adam(self.parameters(), lr=lr) + + pbar = tqdm(range(batcher.n_epochs)) + for epoch in pbar: + losses = [] + for xi, yi in batcher.epoch(X, y): + optim.zero_grad() + xi = self.padder.transform(xi) + logits = self.forward(xi) + loss = criterion(logits, torch.as_tensor(yi)) + loss.backward() + #clip_gradient(model) + optim.step() + losses.append(loss.item()) + pbar.set_description(f'training epoch={epoch} loss={np.mean(losses):.5f}') + + def predict(self, x, batch_size=100): + self.eval() + batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False) + predictions = [] + for xi in tqdm(batcher.epoch(x), desc='test'): + xi = self.padder.transform(xi) + logits = self.forward(xi) + prediction = torch.argmax(logits, dim=1).view(-1).detach().numpy() + predictions.append(prediction) + return np.concatenate(predictions) + + def forward(self, x): + phi = self.projector(x) + return 
+
+
+class SameAuthorClassifier(nn.Module):
+    def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
+        super(SameAuthorClassifier, self).__init__()
+        self.projector = projector.to(device)
+        self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False)
+        self.device = device
+
+    def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100):
+        self.train()
+        batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch)
+        optim = torch.optim.Adam(self.parameters(), lr=lr)
+
+        pbar = tqdm(range(batcher.n_epochs))
+        for epoch in pbar:
+            losses = []
+            for xi, yi in batcher.epoch(X, y):
+                optim.zero_grad()
+                xi = self.padder.transform(xi)
+                phi = self.projector(xi)
+                #normalize phi to have norm 1? maybe better as the last step of projector
+                kernel = torch.matmul(phi, phi.T)
+                ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
+                loss = KernelAlignmentLoss(kernel, ideal_kernel)
+                loss.backward()
+                #clip_gradient(model)
+                optim.step()
+                losses.append(loss.item())
+            pbar.set_description(f'training epoch={epoch} loss={np.mean(losses):.5f}')
+
+    def predict(self, x, z, batch_size=100):
+        self.eval()
+        batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
+        predictions = []
+        for xi, zi in tqdm(batcher.epoch(x, z), desc='test'):
+            xi = self.padder.transform(xi)
+            zi = self.padder.transform(zi)
+            inners = self.forward(xi, zi)
+            prediction = inners.detach().cpu().numpy() > 0.5  # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}?
+            predictions.append(prediction)
+        return np.concatenate(predictions)
+
+    def forward(self, x, z):
+        assert x.shape == z.shape, 'shape mismatch between matrices x and z'
+        phi_x = self.projector(x)
+        phi_z = self.projector(z)
+        rows, cols = phi_x.shape
+        pairwise_inners = torch.bmm(phi_x.view(rows, 1, cols), phi_z.view(rows, cols, 1)).squeeze()
+        return pairwise_inners
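+
+
+# Note on the ideal kernel used in fit() above: np.outer(1 + yi, 1 / (yi + 1)) == 1 holds exactly
+# where two labels coincide, since (1 + a) / (1 + b) == 1 iff a == b. For yi = [0, 0, 1] the outer
+# product is [[1, 1, 0.5], [1, 1, 0.5], [2, 2, 1]], so the ideal kernel is the same-author
+# indicator matrix [[1, 1, 0], [1, 1, 0], [0, 0, 1]].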
+
+
+class FullAuthorClassifier(nn.Module):
+    def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
+        super(FullAuthorClassifier, self).__init__()
+        self.projector = projector.to(device)
+        self.label = nn.Linear(self.projector.space_dimensions(), num_authors).to(device)
+        self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False)
+        self.device = device
+
+    def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100):
+        self.train()
+        batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch)
+        criterion = torch.nn.CrossEntropyLoss().to(self.device)
+        optim = torch.optim.Adam(self.parameters(), lr=lr)
+        alpha = 0.5
+
+        pbar = tqdm(range(batcher.n_epochs))
+        for epoch in pbar:
+            losses, sav_losses, attr_losses = [], [], []
+            for xi, yi in batcher.epoch(X, y):
+                optim.zero_grad()
+                xi = self.padder.transform(xi)
+                phi = self.projector(xi)
+                #normalize phi to have norm 1? maybe better as the last step of projector
+
+                #sav-loss
+                kernel = torch.matmul(phi, phi.T)
+                ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
+                sav_loss = KernelAlignmentLoss(kernel, ideal_kernel)
+                sav_losses.append(sav_loss.item())
+
+                #attr-loss
+                logits = self.label(phi)
+                attr_loss = criterion(logits, torch.as_tensor(yi).to(self.device))
+                attr_losses.append(attr_loss.item())
+
+                #loss
+                loss = (alpha)*sav_loss + (1-alpha)*attr_loss
+                losses.append(loss.item())
+
+                loss.backward()
+                #clip_gradient(model)
+                optim.step()
+            pbar.set_description(
+                f'training epoch={epoch} '
+                f'sav-loss={np.mean(sav_losses):.5f} '
+                f'attr-loss={np.mean(attr_losses):.5f} '
+                f'loss={np.mean(losses):.5f}'
+            )
+
+    def predict_sav(self, x, z, batch_size=100):
+        self.eval()
+        batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
+        predictions = []
+        for xi, zi in tqdm(batcher.epoch(x, z), desc='test'):
+            xi = self.padder.transform(xi)
+            zi = self.padder.transform(zi)
+            phi_xi = self.projector(xi)
+            phi_zi = self.projector(zi)
+            rows, cols = phi_xi.shape
+            pairwise_inners = torch.bmm(phi_xi.view(rows, 1, cols), phi_zi.view(rows, cols, 1)).squeeze()
+            prediction = pairwise_inners.detach().cpu().numpy() > 0.5  # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}?
+            predictions.append(prediction)
+        return np.concatenate(predictions)
+
+    def predict_labels(self, x, batch_size=100):
+        self.eval()
+        batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
+        predictions = []
+        for xi in tqdm(batcher.epoch(x), desc='test'):
+            xi = self.padder.transform(xi)
+            phi = self.projector(xi)
+            logits = self.label(phi)
+            prediction = torch.argmax(logits, dim=1).view(-1).detach().cpu().numpy()
+            predictions.append(prediction)
+        return np.concatenate(predictions)
+
+
+def KernelAlignmentLoss(K, Y):
+    n_el = K.shape[0]*K.shape[1]
+    loss = torch.norm(K - Y, p='fro')  # in Nello's paper this is different
+    loss = loss / n_el  # this is in order to factor out the accumulation which is only due to the size
+    return loss
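+
+
+# Note: "Nello's paper" above presumably refers to the kernel-target alignment of Cristianini et al.,
+# A(K, Y) = <K, Y>_F / (||K||_F * ||Y||_F), a normalized similarity that is maximized; the loss here
+# instead minimizes the Frobenius distance ||K - Y||_F, divided by the number of kernel entries.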
+
+
+class RNNProjection(nn.Module):
+    def __init__(self, vocab_size, hidden_size, output_size):
+        super(RNNProjection, self).__init__()
+        self.output_size = output_size
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.num_layers = 1
+        self.num_directions = 1
+
+        self.embedding = nn.Embedding(vocab_size, hidden_size)
+        self.rnn = nn.GRU(
+            input_size=hidden_size,
+            hidden_size=hidden_size,
+            num_layers=self.num_layers,
+            bidirectional=(self.num_directions == 2),
+            batch_first=True
+        )
+        self.projection = nn.Linear(self.num_layers * self.num_directions * self.hidden_size, output_size)
+
+    def init_hidden(self, batch_size):
+        return torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size)
+
+    def forward(self, input):
+        device = self.embedding.weight.device  # run on whatever device the module was moved to
+        x = torch.as_tensor(input).to(device)
+        batch_size = x.shape[0]
+        x = self.embedding(x)
+        output, hn = self.rnn(x, self.init_hidden(batch_size).to(device))
+        hn = hn.view(self.num_layers, self.num_directions, batch_size, self.hidden_size)
+        hn = hn.permute(2, 0, 1, 3).reshape(batch_size, -1)
+        return self.projection(hn)
+
+    def space_dimensions(self):
+        return self.output_size
+
+
+class Batch:
+    def __init__(self, batch_size, n_epochs, shuffle=True):
+        self.batch_size = batch_size
+        self.n_epochs = n_epochs
+        self.shuffle = shuffle
+        self.current_epoch = 0
+
+    def epoch(self, *args):
+        lengths = list(map(len, args))
+        assert max(lengths) == min(lengths), 'inconsistent sizes in args'
+        n_batches = math.ceil(lengths[0] / self.batch_size)
+        offset = 0
+        if self.shuffle:
+            index = np.random.permutation(len(args[0]))
+            args = [arg[index] for arg in args]
+        for b in range(n_batches):
+            batch_idx = slice(offset, offset + self.batch_size)
+            batch = [arg[batch_idx] for arg in args]
+            yield batch if len(batch) > 1 else batch[0]
+            offset += self.batch_size
+        self.current_epoch += 1
+
+
+class TwoClassBatch:
+    """
+    given X and y (single-label, multiclass) produces batches of elements of X, y for two classes (e.g., c1, c2)
+    of equal size, i.e., the batch is [(x1,c1), ..., (xn,c1), (xn+1,c2), ..., (x2n,c2)]
+    """
+    def __init__(self, batch_size, n_epochs, steps_per_epoch):
+        self.batch_size = batch_size
+        self.n_epochs = n_epochs
+        self.steps_per_epoch = steps_per_epoch
+        self.current_epoch = 0
+        if self.batch_size % 2 != 0:
+            raise ValueError('batch size must be even')
+
+    def epoch(self, X, y):
+        n_el = len(y)
+        assert X.shape[0] == n_el, 'inconsistent sizes in X, y'
+        classes = np.unique(y)
+        groups = {ci: X[y==ci] for ci in classes}
+        class_prevalences = [len(groups[ci])/n_el for ci in classes]
+        n_choices = self.batch_size // 2
+
+        for b in range(self.steps_per_epoch):
+            class1, class2 = np.random.choice(classes, p=class_prevalences, size=2, replace=False)
+            X1 = np.random.choice(groups[class1], size=n_choices)
+            X2 = np.random.choice(groups[class2], size=n_choices)
+            X_batch = np.concatenate([X1, X2])
+            y_batch = np.repeat([class1, class2], repeats=[n_choices, n_choices])
+            yield X_batch, y_batch
+        self.current_epoch += 1
+
+
+class Padding:
+    def __init__(self, pad_index, max_length, dynamic=True, pad_at_end=True):
+        """
+        :param pad_index: the index representing the PAD token
+        :param max_length: the length that defines the padding
+        :param dynamic: if True (default) pads at min(max_length, max_local_length) where max_local_length is the
+            length of the longest example
+        :param pad_at_end: if True, the pad tokens are added at the end of the lists, if otherwise they are added
+            at the beginning
+        """
+        self.pad = pad_index
+        self.max_length = max_length
+        self.dynamic = dynamic
+        self.pad_at_end = pad_at_end
+
+    def transform(self, X):
+        """
+        :param X: a list of lists of indexes (integers)
+        :return: an ndarray of shape (n,m) where n is the number of elements in X and m is the pad length (the maximum
+            in elements of X if dynamic, or self.max_length if otherwise)
+        """
+        X = [x[:self.max_length] for x in X]
+        lengths = list(map(len, X))
+        pad_length = min(max(lengths), self.max_length) if self.dynamic else self.max_length
+        if self.pad_at_end:
+            padded = [x + [self.pad] * (pad_length - x_len) for x, x_len in zip(X, lengths)]
+        else:
+            padded = [[self.pad] * (pad_length - x_len) + x for x, x_len in zip(X, lengths)]
+        return np.asarray(padded, dtype=int)
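+
+
+# Illustrative example: with pad_index=0, max_length=5, dynamic=True and pad_at_end=False,
+# >>> Padding(pad_index=0, max_length=5, dynamic=True, pad_at_end=False).transform([[3, 4], [5, 6, 7, 8, 9, 10]])
+# array([[0, 0, 0, 3, 4],
+#        [5, 6, 7, 8, 9]])
+# (the long example is truncated to max_length and the short one is left-padded with the pad index)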