From 076f96518d3e416bdb01eda261ad0d0d899eb9b6 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Mon, 15 Feb 2021 16:27:19 +0100
Subject: [PATCH] Refactor dataset caching into a generic pickled_resource
 helper; pass an explicit validation set to fit()

---
 src/data/AuthorshipDataset.py | 15 -----------
 src/main.py                   | 48 +++++++++++++++++++----------------
 src/model/classifiers.py      | 13 ++--------
 src/util.py                   | 17 +++++++++++++
 4 files changed, 45 insertions(+), 48 deletions(-)

diff --git a/src/data/AuthorshipDataset.py b/src/data/AuthorshipDataset.py
index 5934033..2dd09fe 100644
--- a/src/data/AuthorshipDataset.py
+++ b/src/data/AuthorshipDataset.py
@@ -2,8 +2,6 @@ from abc import ABC, abstractmethod
 import random
 import numpy as np
 from collections import Counter
-import os
-import pickle
 
 
 class LabelledCorpus:
@@ -31,19 +29,6 @@ class LabelledCorpus:
 
 class AuthorshipDataset(ABC):
 
-    @classmethod
-    def load(cls, loader, pickle_path=None, **kwargs):
-        #assert isinstance(loader, AuthorshipDataset), 'unknown loader'
-        if pickle_path and os.path.exists(pickle_path):
-            print(f'loading dataset image in {pickle_path}')
-            dataset = pickle.load(open(pickle_path, 'rb'))
-        else:
-            dataset = loader(**kwargs)
-            if pickle_path:
-                print(f'dumping dataset in {pickle_path} for faster load')
-                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
-        return dataset
-
     def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
         self.data_path = data_path
         self.n_authors = n_authors
diff --git a/src/main.py b/src/main.py
index b2317f4..c4e9b9a 100644
--- a/src/main.py
+++ b/src/main.py
@@ -14,22 +14,29 @@ from evaluation import evaluation
 import torch
 import torch.nn as nn
 from model.layers import *
-from util import create_path_if_not_exists
+from util import create_path_if_not_exists, pickled_resource
 import os
 import sys
 
 
+
 def load_dataset(opt):
+    kwargs = {
+        'n_authors': opt.authors,
+        'docs_by_author': opt.documents,
+        'random_state': opt.seed
+    }
+
     # dataset load
     if opt.dataset == 'enron':
-        loader = EnronMail
         data_path = '../../authorship_analysis/data/enron_mail_20150507/maildir/*'
+        loader = EnronMail
     elif opt.dataset == 'imdb62':
-        loader = Imdb62
         data_path = '../../authorship_analysis/data/imdb62/imdb62.txt'
+        loader = Imdb62
     elif opt.dataset == 'victorian':
         loader = Victorian
-        data_path='../../authorship_analysis/data/victoria'
+        data_path = '../../authorship_analysis/data/victoria'
     elif opt.dataset == 'blogs':
         loader = Blogs
         data_path = '../../authorship_analysis/data/blogs'
@@ -38,14 +45,9 @@ def load_dataset(opt):
     pickle_path = None
     if opt.pickle:
         pickle_path = f'{opt.pickle}/{dataset_name}.pickle'
-    dataset = AuthorshipDataset.load(
-        loader,
-        pickle_path=pickle_path,
-        data_path=data_path,
-        n_authors=opt.authors,
-        docs_by_author=opt.documents,
-        random_state=opt.seed
-    )
+
+    dataset = pickled_resource(pickle_path, loader, data_path, **kwargs)
+
     return dataset_name, dataset
 
 
@@ -57,7 +59,7 @@
                           channels_out=opt.chout,
                           kernel_sizes=opt.kernelsizes),
         ff=FFProjection(input_size=len(opt.kernelsizes) * opt.chout,
-                        hidden_sizes=[],
+                        hidden_sizes=[512],
                         output_size=opt.repr,
                         activation=nn.functional.relu,
                         dropout=0.5,
@@ -110,8 +112,7 @@
                              checkpointpath=opt.checkpoint)
 
         # svm_experiment(cls.project(Xtr), ytr, cls.project(Xte), yte, foo, 'svm-pre')
-        svm_experiment(cls.project_kernel(Xtr), ytr, cls.project_kernel(Xte), yte, foo, 'svm-kernel')
-
+        Xtr_svm, Xte_svm = cls.project_kernel(Xtr), cls.project_kernel(Xte)
 
         val_microf1 = cls.train_linear_classifier(Xtr_, ytr_, Xval_, yval_,
                                                  batch_size=opt.batchsize, epochs=opt.epochs, lr=opt.lr,
@@ -124,7 +125,7 @@
         acc, macrof1, microf1 = evaluation(yte, yte_)
         foo.write(f'sav(fix)-lin(trained) network prediction: acc={acc:.3f} macrof1={macrof1:.3f} microf1={microf1:.3f}\n')
 
-        val_microf1 = cls.fit(Xtr, ytr,
+        val_microf1 = cls.fit(Xtr_, ytr_, Xval_, yval_,
                               batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
                               log=f'{opt.log}/{method}-{dataset_name}.csv',
                               checkpointpath=opt.checkpoint
@@ -139,7 +140,7 @@
         print('training end-to-end without self-supervision init')
         cls, phi = instantiate_model(A, index, pad_index, device)
         # train
-        val_microf1 = cls.fit(Xtr, ytr,
+        val_microf1 = cls.fit(Xtr_, ytr_, Xval_, yval_,
                               batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
                               log=f'{opt.log}/{method}-{dataset_name}.csv',
                               checkpointpath=opt.checkpoint
@@ -149,6 +150,8 @@
         print('end-to-end (w/o self-supervised initialization) network prediction')
         acc, macrof1, microf1 = evaluation(yte, yte_)
 
+        svm_experiment(Xtr_svm, ytr, Xte_svm, yte, foo, 'svm-kernel')
+
     # results = Results(opt.output)
     # results.add(dataset_name, method, acc, macrof1, microf1, val_microf1)
 
@@ -186,6 +189,7 @@
     def close(self):
         self.foo.close()
 
+
 def svm_experiment(Xtr, ytr, Xte, yte, foo, name):
     svm = GridSearchCV(
         LinearSVC(), param_grid={'C': np.logspace(-2, 3, 6), 'class_weight': ['balanced', None]}, n_jobs=-1
@@ -199,13 +203,13 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='CNN with KTA regularization')
-    parser.add_argument('-H', '--hidden', help='Hidden/embedding size', type=int, default=32)
+    parser.add_argument('-H', '--hidden', help='Hidden/embedding size', type=int, default=16)
     parser.add_argument('-c', '--chout', help='Channels output size', type=int, default=128)
-    parser.add_argument('-r', '--repr', help='Projection size (phi)', type=int, default=2048)
+    parser.add_argument('-r', '--repr', help='Projection size (phi)', type=int, default=256)
     parser.add_argument('-k', '--kernelsizes', help='Size of the convolutional kernels', nargs='+', default=[6,7,8])
     parser.add_argument('-p', '--pad', help='Pad length', type=int, default=3000)
-    parser.add_argument('-b', '--batchsize', help='Batch size', type=int, default=250)
-    parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=250)
+    parser.add_argument('-b', '--batchsize', help='Batch size', type=int, default=100)
+    parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=500)
     parser.add_argument('-A', '--authors', help='Number of authors (-1 to select all)', type=int, default=-1)
    parser.add_argument('-D', '--documents', help='Number of documents per author (-1 to select all)', type=int, default=-1)
     parser.add_argument('-s', '--seed', help='Random seed', type=int, default=0)
@@ -215,7 +219,7 @@
                         'This parameter indicates a directory, the name of the pickle is '
                         'derived automatically.', default='../pickles')
     parser.add_argument('-a', '--alpha', help='Controls the loss as attr-loss(alpha) + sav-loss(1-alpha)', type=float, default=1.)
-    parser.add_argument('--lr', help='Learning rate', type=float, default=0.01)
+    parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
     parser.add_argument('--checkpoint', help='Path where to dump model parameters', default='../checkpoint/model.dat')
     parser.add_argument('-n', '--name', help='Name of the model', default='auto')
     requiredNamed = parser.add_argument_group('required named arguments')
diff --git a/src/model/classifiers.py b/src/model/classifiers.py
index a9723e0..3fce2f1 100644
--- a/src/model/classifiers.py
+++ b/src/model/classifiers.py
@@ -30,7 +30,7 @@ class AuthorshipAttributionClassifier(nn.Module):
             if p.dim() > 1 and p.requires_grad:
                 nn.init.xavier_uniform_(p)
 
-    def fit(self, X, y, batch_size, epochs, patience=50, lr=0.001, val_prop=0.1, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
+    def fit(self, X, y, Xval, yval, batch_size, epochs, patience=20, lr=0.001, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
         assert 0 <= alpha <= 1, 'wrong range, alpha must be in [0,1]'
 
         early_stop = EarlyStop(patience)
@@ -39,8 +39,6 @@
         savcriterion = torch.nn.BCEWithLogitsLoss().to(self.device)
         optim = torch.optim.Adam(self.parameters(), lr=lr)
 
-        X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y)
-
         tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
         val_data = IndexedDataset(Xval, yval, self.pad_length, self.pad_index, self.device)
 
@@ -99,16 +97,14 @@
                       f'loss={tr_loss:.5f} '
                       f'attr-loss={np.mean(attr_losses):.5f} '
                       f'sav-loss={np.mean(sav_losses):.5f} '
-                      f'val_loss={val_loss:.5f} val_acc={acc:.4f} macrof1={macrof1:.4f} microf1={microf1:.4f}'
+                      f'val_loss={val_loss:.5f} val_acc={acc:.4f} macrof1={macrof1:.4f} microf1={microf1:.4f} '
                       f'patience={early_stop.patience}/{early_stop.patience_limit}')
 
             # validation
             self.eval()
             with torch.no_grad():
                 predictions, losses = [], []
-                # for xi, yi in batcher_val.epoch(Xval, yval):
                 for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
-                    # xi = self.padder.transform(xi)
                     logits = self.forward(xi)
                     loss = criterion(logits, torch.as_tensor(yi).to(self.device))
                     losses.append(loss.item())
@@ -137,7 +133,6 @@
         early_stop = EarlyStop(patience, lower_is_better=True)
 
         criterion = SupConLoss1View().to(self.device)
-        # criterion = SupConLoss1ViewCrossEntropy().to(self.device)
         optim = torch.optim.Adam(self.parameters(), lr=lr)
 
         tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
@@ -153,14 +148,11 @@
             self.train()
             losses, pos_losses, neg_losses = [], [], []
             for xi, yi in tr_data.asDataLoader(batch_size, shuffle=True):
-            #while True:
                 optim.zero_grad()
                 phi = self.projector(xi)
                 phi = self.linear_proj(phi)
                 phi = F.normalize(phi, p=2, dim=-1)
-                #contrastive_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
                 contrastive_loss, neg_loss, pos_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
-                #contrastive_loss = neg_loss+pos_loss
                 contrastive_loss.backward()
                 optim.step()
                 losses.append(contrastive_loss.item())
@@ -180,7 +172,6 @@
                 for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
                     phi = self.projector(xi)
                     contrastive_loss, neg_loss, pos_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
-                    #contrastive_loss = neg_loss + pos_loss
                     losses.append((neg_loss + pos_loss).item())
                     neg_losses_val.append(neg_loss.item())
                     pos_losses_val.append(pos_loss.item())
diff --git a/src/util.py b/src/util.py
index 7bb10b4..f0c2173 100644
--- a/src/util.py
+++ b/src/util.py
@@ -3,6 +3,9 @@ import os
 from joblib import Parallel, delayed
 import multiprocessing
 import itertools
+import pickle
+from pathlib import Path
+
 
 
 def create_path_if_not_exists(file):
@@ -27,3 +30,17 @@ def parallelize(func, args, n_jobs):
     )
     return list(itertools.chain.from_iterable(results))
 
+
+def pickled_resource(pickle_path: str, generation_func: callable, *args, **kwargs):
+    """Caches the output of generation_func in pickle_path; later calls load the pickle instead of regenerating the resource."""
+    if pickle_path is None:
+        return generation_func(*args, **kwargs)
+    else:
+        if os.path.exists(pickle_path):
+            return pickle.load(open(pickle_path, 'rb'))
+        else:
+            instance = generation_func(*args, **kwargs)
+            os.makedirs(str(Path(pickle_path).parent), exist_ok=True)
+            pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+            return instance
+
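
Usage note: a minimal sketch of how the new pickled_resource helper is meant
to be called. The cache path mirrors the --pickle default in src/main.py and
the data_path comes from load_dataset; the Imdb62 import path and the
argument values are illustrative assumptions, since only the call site in
load_dataset appears in this patch.

    from util import pickled_resource
    from data.fetch_imdb62 import Imdb62  # hypothetical import path

    # First run: Imdb62(data_path, **kwargs) executes and its result is
    # pickled to '../pickles/imdb62.pickle'; later runs load the pickle
    # and skip the (slow) dataset construction entirely.
    dataset = pickled_resource(
        '../pickles/imdb62.pickle',                          # None disables caching
        Imdb62,                                              # generation_func
        '../../authorship_analysis/data/imdb62/imdb62.txt',  # forwarded as *args (data_path)
        n_authors=-1, docs_by_author=-1, random_state=0,     # forwarded as **kwargs
    )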