commit 076f96518d (parent: b1160c5336)
This commit is contained in: main
@@ -2,8 +2,6 @@ from abc import ABC, abstractmethod
 import random
 import numpy as np
 from collections import Counter
-import os
-import pickle


 class LabelledCorpus:
@@ -31,19 +29,6 @@ class LabelledCorpus:

 class AuthorshipDataset(ABC):

-    @classmethod
-    def load(cls, loader, pickle_path=None, **kwargs):
-        #assert isinstance(loader, AuthorshipDataset), 'unknown loader'
-        if pickle_path and os.path.exists(pickle_path):
-            print(f'loading dataset image in {pickle_path}')
-            dataset = pickle.load(open(pickle_path, 'rb'))
-        else:
-            dataset = loader(**kwargs)
-            if pickle_path:
-                print(f'dumping dataset in {pickle_path} for faster load')
-                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
-        return dataset
-
     def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
         self.data_path = data_path
         self.n_authors = n_authors
src/main.py (48 changed lines)
@@ -14,22 +14,29 @@ from evaluation import evaluation
 import torch
 import torch.nn as nn
 from model.layers import *
-from util import create_path_if_not_exists
+from util import create_path_if_not_exists, pickled_resource
 import os
 import sys


 def load_dataset(opt):
+    kwargs={
+        'n_authors': opt.authors,
+        'docs_by_author': opt.documents,
+        'random_state': opt.seed
+    }
+
     # dataset load
     if opt.dataset == 'enron':
-        loader = EnronMail
         data_path = '../../authorship_analysis/data/enron_mail_20150507/maildir/*'
+        loader = EnronMail
     elif opt.dataset == 'imdb62':
-        loader = Imdb62
         data_path = '../../authorship_analysis/data/imdb62/imdb62.txt'
+        loader = Imdb62
     elif opt.dataset == 'victorian':
         loader = Victorian
-        data_path='../../authorship_analysis/data/victoria'
+        data_path = '../../authorship_analysis/data/victoria'
     elif opt.dataset == 'blogs':
         loader = Blogs
         data_path = '../../authorship_analysis/data/blogs'
@@ -38,14 +45,9 @@ def load_dataset(opt):
     pickle_path = None
     if opt.pickle:
         pickle_path = f'{opt.pickle}/{dataset_name}.pickle'
-    dataset = AuthorshipDataset.load(
-        loader,
-        pickle_path=pickle_path,
-        data_path=data_path,
-        n_authors=opt.authors,
-        docs_by_author=opt.documents,
-        random_state=opt.seed
-    )
+
+    dataset = pickled_resource(pickle_path, loader, data_path, **kwargs)

     return dataset_name, dataset
@@ -57,7 +59,7 @@ def instantiate_model(A, index, pad_index, device):
                       channels_out=opt.chout,
                       kernel_sizes=opt.kernelsizes),
         ff=FFProjection(input_size=len(opt.kernelsizes) * opt.chout,
-                        hidden_sizes=[],
+                        hidden_sizes=[512],
                         output_size=opt.repr,
                         activation=nn.functional.relu,
                         dropout=0.5,
@@ -110,8 +112,7 @@ def main(opt):
                                     checkpointpath=opt.checkpoint)

         # svm_experiment(cls.project(Xtr), ytr, cls.project(Xte), yte, foo, 'svm-pre')
-        svm_experiment(cls.project_kernel(Xtr), ytr, cls.project_kernel(Xte), yte, foo, 'svm-kernel')
-
+        Xtr_svm, Xte_svm = cls.project_kernel(Xtr), cls.project_kernel(Xte)

         val_microf1 = cls.train_linear_classifier(Xtr_, ytr_, Xval_, yval_,
                                                   batch_size=opt.batchsize, epochs=opt.epochs, lr=opt.lr,
@@ -124,7 +125,7 @@ def main(opt):
         acc, macrof1, microf1 = evaluation(yte, yte_)
         foo.write(f'sav(fix)-lin(trained) network prediction: acc={acc:.3f} macrof1={macrof1:.3f} microf1={microf1:.3f}\n')

-        val_microf1 = cls.fit(Xtr, ytr,
+        val_microf1 = cls.fit(Xtr_, ytr_, Xval_, yval_,
                               batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
                               log=f'{opt.log}/{method}-{dataset_name}.csv',
                               checkpointpath=opt.checkpoint
@@ -139,7 +140,7 @@ def main(opt):
         print('training end-to-end without self-supervision init')
         cls, phi = instantiate_model(A, index, pad_index, device)
         # train
-        val_microf1 = cls.fit(Xtr, ytr,
+        val_microf1 = cls.fit(Xtr_, ytr_, Xval_, yval_,
                               batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
                               log=f'{opt.log}/{method}-{dataset_name}.csv',
                               checkpointpath=opt.checkpoint
@@ -149,6 +150,8 @@ def main(opt):
         print('end-to-end (w/o self-supervised initialization) network prediction')
         acc, macrof1, microf1 = evaluation(yte, yte_)

+        svm_experiment(Xtr_svm, ytr, Xte_svm, yte, foo, 'svm-kernel')
+
     # results = Results(opt.output)
     # results.add(dataset_name, method, acc, macrof1, microf1, val_microf1)

@@ -186,6 +189,7 @@ class Results:
     def close(self):
         self.foo.close()

+
 def svm_experiment(Xtr, ytr, Xte, yte, foo, name):
     svm = GridSearchCV(
         LinearSVC(), param_grid={'C': np.logspace(-2, 3, 6), 'class_weight': ['balanced', None]}, n_jobs=-1
@@ -199,13 +203,13 @@ def svm_experiment(Xtr, ytr, Xte, yte, foo, name):

 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='CNN with KTA regularization')
-    parser.add_argument('-H', '--hidden', help='Hidden/embedding size', type=int, default=32)
+    parser.add_argument('-H', '--hidden', help='Hidden/embedding size', type=int, default=16)
     parser.add_argument('-c', '--chout', help='Channels output size', type=int, default=128)
-    parser.add_argument('-r', '--repr', help='Projection size (phi)', type=int, default=2048)
+    parser.add_argument('-r', '--repr', help='Projection size (phi)', type=int, default=256)
     parser.add_argument('-k', '--kernelsizes', help='Size of the convolutional kernels', nargs='+', default=[6,7,8])
     parser.add_argument('-p', '--pad', help='Pad length', type=int, default=3000)
-    parser.add_argument('-b', '--batchsize', help='Batch size', type=int, default=250)
-    parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=250)
+    parser.add_argument('-b', '--batchsize', help='Batch size', type=int, default=100)
+    parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=500)
     parser.add_argument('-A', '--authors', help='Number of authors (-1 to select all)', type=int, default=-1)
     parser.add_argument('-D', '--documents', help='Number of documents per author (-1 to select all)', type=int, default=-1)
     parser.add_argument('-s', '--seed', help='Random seed', type=int, default=0)
@@ -215,7 +219,7 @@ if __name__ == '__main__':
                              'This parameter indicates a directory, the name of the pickle is '
                              'derived automatically.', default='../pickles')
     parser.add_argument('-a', '--alpha', help='Controls the loss as attr-loss(alpha) + sav-loss(1-alpha)', type=float, default=1.)
-    parser.add_argument('--lr', help='Learning rate', type=float, default=0.01)
+    parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
     parser.add_argument('--checkpoint', help='Path where to dump model parameters', default='../checkpoint/model.dat')
     parser.add_argument('-n', '--name', help='Name of the model', default='auto')
     requiredNamed = parser.add_argument_group('required named arguments')
@@ -30,7 +30,7 @@ class AuthorshipAttributionClassifier(nn.Module):
             if p.dim() > 1 and p.requires_grad:
                 nn.init.xavier_uniform_(p)

-    def fit(self, X, y, batch_size, epochs, patience=50, lr=0.001, val_prop=0.1, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
+    def fit(self, X, y, Xval, yval, batch_size, epochs, patience=20, lr=0.001, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
         assert 0 <= alpha <= 1, 'wrong range, alpha must be in [0,1]'
         early_stop = EarlyStop(patience)

@@ -39,8 +39,6 @@ class AuthorshipAttributionClassifier(nn.Module):
         savcriterion = torch.nn.BCEWithLogitsLoss().to(self.device)
         optim = torch.optim.Adam(self.parameters(), lr=lr)

-        X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y)
-
         tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
         val_data = IndexedDataset(Xval, yval, self.pad_length, self.pad_index, self.device)

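
Since fit() no longer carves out its own validation split, callers are expected to pass one in explicitly. A minimal sketch of the new calling convention, assuming Xtr, ytr, cls and opt as in main.py (the stratified split below is illustrative, not part of this commit):

from sklearn.model_selection import train_test_split

# hold out a stratified validation split before calling the new fit() signature
Xtr_, Xval_, ytr_, yval_ = train_test_split(Xtr, ytr, test_size=0.1, stratify=ytr)
val_microf1 = cls.fit(Xtr_, ytr_, Xval_, yval_,
                      batch_size=opt.batchsize, epochs=opt.epochs,
                      alpha=opt.alpha, lr=opt.lr)
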
@@ -99,16 +97,14 @@ class AuthorshipAttributionClassifier(nn.Module):
                   f'loss={tr_loss:.5f} '
                   f'attr-loss={np.mean(attr_losses):.5f} '
                   f'sav-loss={np.mean(sav_losses):.5f} '
-                  f'val_loss={val_loss:.5f} val_acc={acc:.4f} macrof1={macrof1:.4f} microf1={microf1:.4f}'
+                  f'val_loss={val_loss:.5f} val_acc={acc:.4f} macrof1={macrof1:.4f} microf1={microf1:.4f} '
+                  f'patience={early_stop.patience}/{early_stop.patience_limit}')

             # validation
             self.eval()
             with torch.no_grad():
                 predictions, losses = [], []
-                # for xi, yi in batcher_val.epoch(Xval, yval):
                 for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
-                    # xi = self.padder.transform(xi)
                     logits = self.forward(xi)
                     loss = criterion(logits, torch.as_tensor(yi).to(self.device))
                     losses.append(loss.item())
@@ -137,7 +133,6 @@ class AuthorshipAttributionClassifier(nn.Module):
         early_stop = EarlyStop(patience, lower_is_better=True)

         criterion = SupConLoss1View().to(self.device)
-        # criterion = SupConLoss1ViewCrossEntropy().to(self.device)
         optim = torch.optim.Adam(self.parameters(), lr=lr)

         tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
@@ -153,14 +148,11 @@ class AuthorshipAttributionClassifier(nn.Module):
             self.train()
             losses, pos_losses, neg_losses = [], [], []
             for xi, yi in tr_data.asDataLoader(batch_size, shuffle=True):
-            #while True:
                 optim.zero_grad()
                 phi = self.projector(xi)
                 phi = self.linear_proj(phi)
                 phi = F.normalize(phi, p=2, dim=-1)
-                #contrastive_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
                 contrastive_loss, neg_loss, pos_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
-                #contrastive_loss = neg_loss+pos_loss
                 contrastive_loss.backward()
                 optim.step()
                 losses.append(contrastive_loss.item())
@@ -180,7 +172,6 @@ class AuthorshipAttributionClassifier(nn.Module):
                 for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
                     phi = self.projector(xi)
                     contrastive_loss, neg_loss, pos_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
-                    #contrastive_loss = neg_loss + pos_loss
                     losses.append((neg_loss + pos_loss).item())
                     neg_losses_val.append(neg_loss.item())
                     pos_losses_val.append(pos_loss.item())
src/util.py (17 changed lines)
@@ -3,6 +3,8 @@ import os
 from joblib import Parallel, delayed
 import multiprocessing
 import itertools
+import pickle
+


 def create_path_if_not_exists(file):
@@ -27,3 +29,18 @@ def parallelize(func, args, n_jobs):
     )
     return list(itertools.chain.from_iterable(results))

+
+def pickled_resource(pickle_path: str, generation_func: callable, *args, **kwargs):
+    if pickle_path is None:
+        return generation_func(*args, **kwargs)
+    else:
+        if os.path.exists(pickle_path):
+            return pickle.load(open(pickle_path, 'rb'))
+        else:
+            instance = generation_func(*args, **kwargs)
+            os.makedirs(str(Path(pickle_path).parent), exist_ok=True)
+            pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+            return instance
+
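
Usage sketch for the new helper (illustrative, not part of the commit; the toy generation function and path are assumptions — main.py calls it as pickled_resource(pickle_path, loader, data_path, **kwargs)):

from util import pickled_resource

def build_squares(n):
    # stands in for an expensive loader such as Imdb62(data_path, **kwargs)
    return [i * i for i in range(n)]

# first call computes and pickles the result; later calls just load the pickle
squares = pickled_resource('../pickles/squares.pickle', build_squares, 10_000)
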