"""Trains and evaluates a CNN-based authorship attribution classifier
(CNN with KTA regularization) on the EnronMail, Imdb62, or Victorian datasets."""
import argparse
import os

import numpy as np
import torch

from data.AuthorshipDataset import AuthorshipDataset
from data.fetch_imdb62 import Imdb62
from data.fetch_enron_mail import EnronMail
from data.fetch_victorian import Victorian
from index import Index
from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
from model.transformations import CNNProjection
from evaluation import evaluation
from util import create_path_if_not_exists


def main(opt):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print(f'running on {device}')

    # dataset load
    if opt.dataset == 'enron':
        loader = EnronMail
        data_path = '../../authorship_analysis/data/enron_mail_20150507/maildir/*'
    elif opt.dataset == 'imdb62':
        loader = Imdb62
        data_path = '../../authorship_analysis/data/imdb62/imdb62.txt'
    elif opt.dataset == 'victorian':
        loader = Victorian
        data_path = '../../authorship_analysis/data/victoria'
    else:
        raise ValueError(f'unknown dataset {opt.dataset}')

    dataset_name = f'{loader.__name__}_A{opt.authors}_D{opt.documents}_S{opt.seed}'
    pickle_path = None
    if opt.pickle:
        pickle_path = f'{opt.pickle}/{dataset_name}.pickle'
    dataset = AuthorshipDataset.load(loader,
                                     pickle_path=pickle_path,
                                     data_path=data_path,
                                     n_authors=opt.authors,
                                     docs_by_author=opt.documents,
                                     random_state=opt.seed)

    # dataset indexing: map documents to sequences of character (n-gram) ids
    Xtr, ytr = dataset.train.data, dataset.train.target
    Xte, yte = dataset.test.data, dataset.test.target
    A = np.unique(ytr)
    print(f'num authors={len(A)}')
    print(f'ntr = {len(Xtr)} nte = {len(Xte)}')

    bigrams = False
    index = Index(analyzer='char', ngram_range=(2, 2) if bigrams else (1, 1))
    Xtr = index.fit_transform(Xtr)
    Xte = index.transform(Xte)
    pad_index = index.add_word('PADTOKEN')
    print(f'vocabulary size={index.vocabulary_size()}')

    # random pairings for the (currently disabled) same-author verification task
    #shuffle1 = np.random.permutation(Xte.shape[0])
    #shuffle2 = np.random.permutation(Xte.shape[0])
    #x1, y1 = Xte[shuffle1], yte[shuffle1]
    #x2, y2 = Xte[shuffle2], yte[shuffle2]
    #paired_y = y1 == y2

    # attribution
    print('Attribution')
    phi = CNNProjection(
        vocabulary_size=index.vocabulary_size(),
        embedding_dim=opt.hidden,
        out_size=opt.repr,
        channels_out=opt.chout,
        kernel_sizes=opt.kernelsizes,
        dropout=0.5
    ).to(device)
    cls = AuthorshipAttributionClassifier(
        phi, num_authors=A.size, pad_index=pad_index, pad_length=opt.pad, device=device
    )

    if opt.name == 'auto':
        method = f'{phi.__class__.__name__}_alpha{opt.alpha}'
    else:
        method = opt.name

    # train
    val_microf1 = cls.fit(Xtr, ytr,
                          batch_size=opt.batchsize,
                          epochs=opt.epochs,
                          alpha=opt.alpha,
                          lr=opt.lr,
                          log=f'{opt.log}/{method}-{dataset_name}.csv',
                          checkpointpath=opt.checkpoint)

    # test
    yte_ = cls.predict(Xte)
    acc, macrof1, microf1 = evaluation(yte, yte_)
    results = Results(opt.output)
    results.add(dataset_name, method, acc, macrof1, microf1, val_microf1)
    results.close()

    # verification
    #print('Verification')
    #phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
    #cls = SameAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
    #cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
    #paired_y_ = cls.predict(x1, x2)
    #eval(paired_y, paired_y_)

    # attribution & verification
    #print('Attribution & Verification')
    #phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
    #cls = FullAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
    #cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
    #yte_ = cls.predict_labels(Xte)
    #eval(yte, yte_)
    #paired_y_ = cls.predict_sav(x1, x2)
    #eval(paired_y, paired_y_)


class Results:
    """Appends test results to a TSV file, writing the header on first creation."""

    def __init__(self, path):
        addheader = not os.path.exists(path)
        self.file = open(path, 'at')
        if addheader:
            self.add('Dataset', 'Method', 'Accuracy', 'MacroF1', 'microF1', 'val_microF1')

    def add(self, dataset, method, acc, macrof1, microf1, val_microf1):
        self.file.write(f'{dataset}\t{method}\t{acc}\t{macrof1}\t{microf1}\t{val_microf1}\n')
        self.file.flush()

    def close(self):
        self.file.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CNN with KTA regularization')
    parser.add_argument('-H', '--hidden', help='Hidden/embedding size', type=int, default=32)
    parser.add_argument('-c', '--chout', help='Channels output size', type=int, default=128)
    parser.add_argument('-r', '--repr', help='Projection size (phi)', type=int, default=1024)
    parser.add_argument('-k', '--kernelsizes', help='Sizes of the convolutional kernels', nargs='+', type=int,
                        default=[6, 7, 8])
    parser.add_argument('-p', '--pad', help='Pad length', type=int, default=3000)
    parser.add_argument('-b', '--batchsize', help='Batch size', type=int, default=50)
    parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=250)
    parser.add_argument('-A', '--authors', help='Number of authors (-1 to select all)', type=int, default=-1)
    parser.add_argument('-D', '--documents', help='Number of documents per author (-1 to select all)', type=int,
                        default=-1)
    parser.add_argument('-s', '--seed', help='Random seed', type=int, default=-1)
    parser.add_argument('-o', '--output', help='File where to write test results', default='../results.csv')
    parser.add_argument('-l', '--log', help='Log dir where to output training and validation losses', default='../log')
    parser.add_argument('-P', '--pickle', help='If specified, pickles a copy of the dataset for faster reload. '
                                               'This parameter indicates a directory; the name of the pickle is '
                                               'derived automatically.', default='../pickles')
    parser.add_argument('-a', '--alpha', help='Controls the loss as attr-loss(alpha) + sav-loss(1-alpha)', type=float,
                        default=1.)
    parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
    parser.add_argument('--checkpoint', help='Path where to dump model parameters', default='../checkpoint/model.dat')
    parser.add_argument('-n', '--name', help='Name of the model', default='auto')
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument('-d', '--dataset', help='Name of the dataset', required=True, type=str)
    opt = parser.parse_args()

    assert opt.dataset in ['enron', 'imdb62', 'victorian'], 'unknown dataset'

    create_path_if_not_exists(opt.output)
    create_path_if_not_exists(opt.log)
    create_path_if_not_exists(opt.checkpoint)
    if opt.pickle is not None:
        create_path_if_not_exists(opt.pickle)

    main(opt)
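
# Example invocation (the file name `main.py` is hypothetical, and the dataset
# paths hard-coded in main() are assumed to exist locally). Only -d/--dataset
# is required; every other flag falls back to the defaults declared above:
#
#   python main.py -d imdb62 -A 10 -D 100 -s 1 -e 100 -a 1.0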