commit 076f96518d (main)
parent b1160c5336
Author: Alejandro Moreo Fernandez
Date:   2021-02-15 16:27:19 +01:00

4 changed files with 45 additions and 48 deletions

View File

@@ -2,8 +2,6 @@ from abc import ABC, abstractmethod
 import random
 import numpy as np
 from collections import Counter
-import os
-import pickle


 class LabelledCorpus:
@@ -31,19 +29,6 @@ class LabelledCorpus:

 class AuthorshipDataset(ABC):

-    @classmethod
-    def load(cls, loader, pickle_path=None, **kwargs):
-        #assert isinstance(loader, AuthorshipDataset), 'unknown loader'
-        if pickle_path and os.path.exists(pickle_path):
-            print(f'loading dataset image in {pickle_path}')
-            dataset = pickle.load(open(pickle_path, 'rb'))
-        else:
-            dataset = loader(**kwargs)
-            if pickle_path:
-                print(f'dumping dataset in {pickle_path} for faster load')
-                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
-        return dataset
-
     def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
         self.data_path = data_path
         self.n_authors = n_authors

View File

@@ -14,22 +14,29 @@ from evaluation import evaluation
 import torch
 import torch.nn as nn
 from model.layers import *
-from util import create_path_if_not_exists
+from util import create_path_if_not_exists, pickled_resource
 import os
 import sys


 def load_dataset(opt):
+    kwargs={
+        'n_authors': opt.authors,
+        'docs_by_author': opt.documents,
+        'random_state': opt.seed
+    }
+
     # dataset load
     if opt.dataset == 'enron':
-        loader = EnronMail
         data_path = '../../authorship_analysis/data/enron_mail_20150507/maildir/*'
+        loader = EnronMail
     elif opt.dataset == 'imdb62':
-        loader = Imdb62
         data_path = '../../authorship_analysis/data/imdb62/imdb62.txt'
+        loader = Imdb62
     elif opt.dataset == 'victorian':
         loader = Victorian
-        data_path='../../authorship_analysis/data/victoria'
+        data_path = '../../authorship_analysis/data/victoria'
     elif opt.dataset == 'blogs':
         loader = Blogs
         data_path = '../../authorship_analysis/data/blogs'
@@ -38,14 +45,9 @@ def load_dataset(opt):
     pickle_path = None
     if opt.pickle:
        pickle_path = f'{opt.pickle}/{dataset_name}.pickle'
-    dataset = AuthorshipDataset.load(
-        loader,
-        pickle_path=pickle_path,
-        data_path=data_path,
-        n_authors=opt.authors,
-        docs_by_author=opt.documents,
-        random_state=opt.seed
-    )
+    dataset = pickled_resource(pickle_path, loader, data_path, **kwargs)

     return dataset_name, dataset
@@ -57,7 +59,7 @@ def instantiate_model(A, index, pad_index, device):
                         channels_out=opt.chout,
                         kernel_sizes=opt.kernelsizes),
        ff=FFProjection(input_size=len(opt.kernelsizes) * opt.chout,
-                       hidden_sizes=[],
+                       hidden_sizes=[512],
                        output_size=opt.repr,
                        activation=nn.functional.relu,
                        dropout=0.5,
@@ -110,8 +112,7 @@ def main(opt):
                              checkpointpath=opt.checkpoint)

        # svm_experiment(cls.project(Xtr), ytr, cls.project(Xte), yte, foo, 'svm-pre')
-       svm_experiment(cls.project_kernel(Xtr), ytr, cls.project_kernel(Xte), yte, foo, 'svm-kernel')
+       Xtr_svm, Xte_svm = cls.project_kernel(Xtr), cls.project_kernel(Xte)

        val_microf1 = cls.train_linear_classifier(Xtr_, ytr_, Xval_, yval_,
                                                  batch_size=opt.batchsize, epochs=opt.epochs, lr=opt.lr,
@@ -124,7 +125,7 @@ def main(opt):
        acc, macrof1, microf1 = evaluation(yte, yte_)
        foo.write(f'sav(fix)-lin(trained) network prediction: acc={acc:.3f} macrof1={macrof1:.3f} microf1={microf1:.3f}\n')

-       val_microf1 = cls.fit(Xtr, ytr,
+       val_microf1 = cls.fit(Xtr_, ytr_, Xval_, yval_,
                              batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
                              log=f'{opt.log}/{method}-{dataset_name}.csv',
                              checkpointpath=opt.checkpoint
@@ -139,7 +140,7 @@ def main(opt):
        print('training end-to-end without self-supervision init')
        cls, phi = instantiate_model(A, index, pad_index, device)
        # train
-       val_microf1 = cls.fit(Xtr, ytr,
+       val_microf1 = cls.fit(Xtr_, ytr_, Xval_, yval_,
                              batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
                              log=f'{opt.log}/{method}-{dataset_name}.csv',
                              checkpointpath=opt.checkpoint
@@ -149,6 +150,8 @@ def main(opt):
        print('end-to-end (w/o self-supervised initialization) network prediction')
        acc, macrof1, microf1 = evaluation(yte, yte_)
+       svm_experiment(Xtr_svm, ytr, Xte_svm, yte, foo, 'svm-kernel')
+
    # results = Results(opt.output)
    # results.add(dataset_name, method, acc, macrof1, microf1, val_microf1)
@@ -186,6 +189,7 @@ class Results:
    def close(self):
        self.foo.close()


def svm_experiment(Xtr, ytr, Xte, yte, foo, name):
    svm = GridSearchCV(
        LinearSVC(), param_grid={'C': np.logspace(-2, 3, 6), 'class_weight': ['balanced', None]}, n_jobs=-1
@@ -199,13 +203,13 @@ def svm_experiment(Xtr, ytr, Xte, yte, foo, name):

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CNN with KTA regularization')
-   parser.add_argument('-H', '--hidden', help='Hidden/embedding size', type=int, default=32)
+   parser.add_argument('-H', '--hidden', help='Hidden/embedding size', type=int, default=16)
    parser.add_argument('-c', '--chout', help='Channels output size', type=int, default=128)
-   parser.add_argument('-r', '--repr', help='Projection size (phi)', type=int, default=2048)
+   parser.add_argument('-r', '--repr', help='Projection size (phi)', type=int, default=256)
    parser.add_argument('-k', '--kernelsizes', help='Size of the convolutional kernels', nargs='+', default=[6,7,8])
    parser.add_argument('-p', '--pad', help='Pad length', type=int, default=3000)
-   parser.add_argument('-b', '--batchsize', help='Batch size', type=int, default=250)
-   parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=250)
+   parser.add_argument('-b', '--batchsize', help='Batch size', type=int, default=100)
+   parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=500)
    parser.add_argument('-A', '--authors', help='Number of authors (-1 to select all)', type=int, default=-1)
    parser.add_argument('-D', '--documents', help='Number of documents per author (-1 to select all)', type=int, default=-1)
    parser.add_argument('-s', '--seed', help='Random seed', type=int, default=0)
@@ -215,7 +219,7 @@ if __name__ == '__main__':
                             'This parameter indicates a directory, the name of the pickle is '
                             'derived automatically.', default='../pickles')
    parser.add_argument('-a', '--alpha', help='Controls the loss as attr-loss(alpha) + sav-loss(1-alpha)', type=float, default=1.)
-   parser.add_argument('--lr', help='Learning rate', type=float, default=0.01)
+   parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
    parser.add_argument('--checkpoint', help='Path where to dump model parameters', default='../checkpoint/model.dat')
    parser.add_argument('-n', '--name', help='Name of the model', default='auto')
    requiredNamed = parser.add_argument_group('required named arguments')

View File

@@ -30,7 +30,7 @@ class AuthorshipAttributionClassifier(nn.Module):
            if p.dim() > 1 and p.requires_grad:
                nn.init.xavier_uniform_(p)

-   def fit(self, X, y, batch_size, epochs, patience=50, lr=0.001, val_prop=0.1, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
+   def fit(self, X, y, Xval, yval, batch_size, epochs, patience=20, lr=0.001, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
        assert 0 <= alpha <= 1, 'wrong range, alpha must be in [0,1]'
        early_stop = EarlyStop(patience)
@@ -39,8 +39,6 @@ class AuthorshipAttributionClassifier(nn.Module):
        savcriterion = torch.nn.BCEWithLogitsLoss().to(self.device)
        optim = torch.optim.Adam(self.parameters(), lr=lr)

-       X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y)
-
        tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
        val_data = IndexedDataset(Xval, yval, self.pad_length, self.pad_index, self.device)
@@ -99,16 +97,14 @@ class AuthorshipAttributionClassifier(nn.Module):
                      f'loss={tr_loss:.5f} '
                      f'attr-loss={np.mean(attr_losses):.5f} '
                      f'sav-loss={np.mean(sav_losses):.5f} '
-                     f'val_loss={val_loss:.5f} val_acc={acc:.4f} macrof1={macrof1:.4f} microf1={microf1:.4f}'
+                     f'val_loss={val_loss:.5f} val_acc={acc:.4f} macrof1={macrof1:.4f} microf1={microf1:.4f} '
                      f'patience={early_stop.patience}/{early_stop.patience_limit}')

            # validation
            self.eval()
            with torch.no_grad():
                predictions, losses = [], []
-               # for xi, yi in batcher_val.epoch(Xval, yval):
                for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
-                   # xi = self.padder.transform(xi)
                    logits = self.forward(xi)
                    loss = criterion(logits, torch.as_tensor(yi).to(self.device))
                    losses.append(loss.item())
@@ -137,7 +133,6 @@ class AuthorshipAttributionClassifier(nn.Module):
        early_stop = EarlyStop(patience, lower_is_better=True)
        criterion = SupConLoss1View().to(self.device)
-       # criterion = SupConLoss1ViewCrossEntropy().to(self.device)
        optim = torch.optim.Adam(self.parameters(), lr=lr)

        tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
@@ -153,14 +148,11 @@ class AuthorshipAttributionClassifier(nn.Module):
            self.train()
            losses, pos_losses, neg_losses = [], [], []
            for xi, yi in tr_data.asDataLoader(batch_size, shuffle=True):
-               #while True:
                optim.zero_grad()
                phi = self.projector(xi)
                phi = self.linear_proj(phi)
                phi = F.normalize(phi, p=2, dim=-1)
-               #contrastive_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
                contrastive_loss, neg_loss, pos_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
-               #contrastive_loss = neg_loss+pos_loss
                contrastive_loss.backward()
                optim.step()
                losses.append(contrastive_loss.item())
@@ -180,7 +172,6 @@ class AuthorshipAttributionClassifier(nn.Module):
                for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
                    phi = self.projector(xi)
                    contrastive_loss, neg_loss, pos_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
-                   #contrastive_loss = neg_loss + pos_loss
                    losses.append((neg_loss + pos_loss).item())
                    neg_losses_val.append(neg_loss.item())
                    pos_losses_val.append(pos_loss.item())
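
Note on the new fit signature: the stratified train_test_split that fit() used to perform internally (val_prop=0.1) was removed, so callers must now supply the validation set themselves, as main.py does with Xtr_, ytr_, Xval_, yval_. Below is a minimal caller-side sketch, assuming the same stratified 10% hold-out that the old default implied; cls, Xtr and ytr stand for the classifier instance and the encoded training data from main.py, and the hyperparameter values are just the new argparse defaults.

from sklearn.model_selection import train_test_split

# hypothetical caller-side hold-out replacing the split fit() used to do internally;
# test_size=0.1 and stratify mirror the removed val_prop=0.1 / stratify=y defaults
Xtr_, Xval_, ytr_, yval_ = train_test_split(Xtr, ytr, test_size=0.1, stratify=ytr)

val_microf1 = cls.fit(Xtr_, ytr_, Xval_, yval_,
                      batch_size=100, epochs=500, alpha=1., lr=0.001,
                      log='../log/tmp.csv',
                      checkpointpath='../checkpoint/model.dat')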

View File

@@ -3,6 +3,8 @@ import os
 from joblib import Parallel, delayed
 import multiprocessing
 import itertools
+import pickle


 def create_path_if_not_exists(file):
@@ -27,3 +29,18 @@ def parallelize(func, args, n_jobs):
     )
     return list(itertools.chain.from_iterable(results))


+def pickled_resource(pickle_path: str, generation_func: callable, *args, **kwargs):
+    if pickle_path is None:
+        return generation_func(*args, **kwargs)
+    else:
+        if os.path.exists(pickle_path):
+            return pickle.load(open(pickle_path, 'rb'))
+        else:
+            instance = generation_func(*args, **kwargs)
+            os.makedirs(str(Path(pickle_path).parent), exist_ok=True)
+            pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+            return instance
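
The new pickled_resource helper generalizes the caching that the removed AuthorshipDataset.load performed: given a path it loads the pickle if present, otherwise it runs the generation function and dumps the result; given None it simply calls the function. A minimal usage sketch follows; expensive_build and the demo path are hypothetical stand-ins, not part of the commit, while in load_dataset the generation function is the dataset loader itself (e.g. pickled_resource(pickle_path, loader, data_path, **kwargs)).

from util import pickled_resource

def expensive_build(n):
    # stand-in for a costly loader such as Imdb62(data_path, **kwargs)
    return list(range(n))

# first call computes the object and caches it at the given path
data = pickled_resource('../pickles/demo.pickle', expensive_build, 1000)
# a second call with the same path skips expensive_build and unpickles the cached object
data_again = pickled_resource('../pickles/demo.pickle', expensive_build, 1000)
# passing None disables caching and always recomputes
data_fresh = pickled_resource(None, expensive_build, 1000)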