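"""Authorship attribution training and evaluation script.

Loads one of the supported datasets (enron, imdb62, victorian, blogs), indexes the texts
as sequences of character ids, trains a CNN-based attribution classifier (optionally
mixing in a same-author-verification loss weighted by --alpha), evaluates it on the test
split, and appends the scores as a tab-separated row to the results file.
"""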
import argparse
import os
import sys

import numpy as np
import torch
import torch.nn as nn

from data.AuthorshipDataset import AuthorshipDataset
from data.fetch_blogs import Blogs
from data.fetch_enron_mail import EnronMail
from data.fetch_imdb62 import Imdb62
from data.fetch_victorian import Victorian
from evaluation import evaluation
from index import Index
from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
from model.layers import *
from util import create_path_if_not_exists


def main(opt):

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print(f'running on {device}')

    # dataset load
    if opt.dataset == 'enron':
        loader = EnronMail
        data_path = '../../authorship_analysis/data/enron_mail_20150507/maildir/*'
    elif opt.dataset == 'imdb62':
        loader = Imdb62
        data_path = '../../authorship_analysis/data/imdb62/imdb62.txt'
    elif opt.dataset == 'victorian':
        loader = Victorian
        data_path = '../../authorship_analysis/data/victoria'
    elif opt.dataset == 'blogs':
        loader = Blogs
        data_path = '../../authorship_analysis/data/blogs'

    dataset_name = f'{loader.__name__}_A{opt.authors}_D{opt.documents}_S{opt.seed}'
    pickle_path = None
    if opt.pickle:
        pickle_path = f'{opt.pickle}/{dataset_name}.pickle'
    dataset = AuthorshipDataset.load(loader,
                                     pickle_path=pickle_path,
                                     data_path=data_path,
                                     n_authors=opt.authors,
                                     docs_by_author=opt.documents,
                                     random_state=opt.seed)

    # dataset indexing
    Xtr, ytr = dataset.train.data, dataset.train.target
    Xte, yte = dataset.test.data, dataset.test.target
    A = np.unique(ytr)
    print(f'num authors={len(A)}')
    print(f'ntr = {len(Xtr)} nte = {len(Xte)}')

    bigrams = False
    index = Index(analyzer='char', ngram_range=(2, 2) if bigrams else (1, 1))
    Xtr = index.fit_transform(Xtr)
    Xte = index.transform(Xte)
    pad_index = index.add_word('PADTOKEN')  # extra vocabulary entry reserved for padding
    print(f'vocabulary size={index.vocabulary_size()}')

    # shuffle1 = np.random.permutation(Xte.shape[0])
    # shuffle2 = np.random.permutation(Xte.shape[0])
    # x1, y1 = Xte[shuffle1], yte[shuffle1]
    # x2, y2 = Xte[shuffle2], yte[shuffle2]
    # paired_y = y1 == y2

    # attribution
    print('Attribution')
    # phi: document encoder (character CNN followed by a feed-forward projection)
    phi = Phi(
        cnn=CNNProjection(
            vocabulary_size=index.vocabulary_size(),
            embedding_dim=opt.hidden,
            channels_out=opt.chout,
            kernel_sizes=opt.kernelsizes),
        ff=FFProjection(input_size=len(opt.kernelsizes) * opt.chout,
                        hidden_sizes=[1024],
                        output_size=opt.repr,
                        activation=nn.functional.relu,
                        dropout=0.5,
                        activate_last=True),
    ).to(device)

    cls = AuthorshipAttributionClassifier(
        phi, num_authors=A.size, pad_index=pad_index, pad_length=opt.pad, device=device
    )
    print(cls)

    if opt.name == 'auto':
        method = f'{phi.__class__.__name__}_alpha{opt.alpha}'
    else:
        method = opt.name

    # train
    val_microf1 = cls.fit(Xtr, ytr,
                          batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
                          log=f'{opt.log}/{method}-{dataset_name}.csv',
                          checkpointpath=opt.checkpoint)

    # test
    yte_ = cls.predict(Xte)
    acc, macrof1, microf1 = evaluation(yte, yte_)

    results = Results(opt.output)
    results.add(dataset_name, method, acc, macrof1, microf1, val_microf1)
    results.close()


    # verification
    # print('Verification')
    # phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
    # cls = SameAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
    # cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
    # paired_y_ = cls.predict(x1, x2)
    # eval(paired_y, paired_y_)

    # attribution & verification
    # print('Attribution & Verification')
    # phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
    # cls = FullAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
    # cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
    # yte_ = cls.predict_labels(Xte)
    # eval(yte, yte_)
    # paired_y_ = cls.predict_sav(x1, x2)
    # eval(paired_y, paired_y_)


class Results:
    """Appends one tab-separated row of scores per run to a shared results file."""

    def __init__(self, path):
        addheader = not os.path.exists(path)
        self.file = open(path, 'at')
        if addheader:
            self.add('Dataset', 'Method', 'Accuracy', 'MacroF1', 'microF1', 'val_microF1')

    def add(self, dataset, method, acc, macrof1, microf1, val_microF1):
        self.file.write(f'{dataset}\t{method}\t{acc}\t{macrof1}\t{microf1}\t{val_microF1}\n')
        self.file.flush()

    def close(self):
        self.file.close()


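# Example invocation (the script name and dataset locations are assumptions that depend
# on your checkout; the flags are the ones defined below):
#   python main.py --dataset imdb62 -A 10 -D 100 -s 1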
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CNN with KTA regularization')
    parser.add_argument('-H', '--hidden', help='Hidden/embedding size', type=int, default=32)
    parser.add_argument('-c', '--chout', help='Channels output size', type=int, default=128)
    parser.add_argument('-r', '--repr', help='Projection size (phi)', type=int, default=1024)
    parser.add_argument('-k', '--kernelsizes', help='Sizes of the convolutional kernels', nargs='+', type=int, default=[6, 7, 8])
    parser.add_argument('-p', '--pad', help='Pad length', type=int, default=3000)
    parser.add_argument('-b', '--batchsize', help='Batch size', type=int, default=50)
    parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=250)
    parser.add_argument('-A', '--authors', help='Number of authors (-1 to select all)', type=int, default=-1)
    parser.add_argument('-D', '--documents', help='Number of documents per author (-1 to select all)', type=int, default=-1)
    parser.add_argument('-s', '--seed', help='Random seed', type=int, default=-1)
    parser.add_argument('-o', '--output', help='File where to write test results', default='../results.csv')
    parser.add_argument('-l', '--log', help='Log dir where to output training and validation losses', default='../log')
    parser.add_argument('-P', '--pickle', help='If specified, pickles a copy of the dataset for faster reload. '
                                               'This parameter indicates a directory; the name of the pickle is '
                                               'derived automatically.', default='../pickles')
    parser.add_argument('-a', '--alpha', help='Controls the loss as attr-loss(alpha) + sav-loss(1-alpha)', type=float, default=1.0)
    parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
    parser.add_argument('--checkpoint', help='Path where to dump model parameters', default='../checkpoint/model.dat')
    parser.add_argument('-n', '--name', help='Name of the model', default='auto')
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument('-d', '--dataset', help='Name of the dataset', required=True, type=str)
    opt = parser.parse_args()

    assert opt.dataset in ['enron', 'imdb62', 'blogs', 'victorian'], 'unknown dataset'

    create_path_if_not_exists(opt.output)
    create_path_if_not_exists(opt.log)
    create_path_if_not_exists(opt.checkpoint)
    if opt.pickle is not None:
        create_path_if_not_exists(opt.pickle)

    main(opt)