import numpy as np
import torch

from data.fetch_imdb62 import Imdb62
from data.fetch_victorian import Victorian
from index import Index
from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
from model.transformations import CNNProjection
#from model.transformations import RNNProjection  # needed only if the RNN variants below are re-enabled (import path assumed)
from evaluation import evaluation

# hyperparameters
hidden_size = 32        # embedding dimension fed to the projection network
channels_out = 128      # number of CNN feature maps per kernel size
output_size = 1024      # dimension of the projected document representation
kernel_sizes = [4, 5, 6]
pad_length = 3000       # target length for padding the encoded documents
batch_size = 50
n_epochs = 256
bigrams = False         # if True, index character bigrams instead of unigrams

# smaller settings (presumably for quick test runs)
#hidden_size = 16
#output_size = 32
#pad_length = 100
#batch_size = 10
#n_epochs = 20

# run on GPU when available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'running on {device}')

# load the dataset (IMDb62 by default; Victorian can be used instead)
#dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=5, docs_by_author=25)

Xtr, ytr = dataset.train.data, dataset.train.target
Xte, yte = dataset.test.data, dataset.test.target
A = np.unique(ytr)
print(f'num authors={len(A)}')

# build the character-level index (unigrams or bigrams) and encode the documents
index = Index(analyzer='char', ngram_range=(2, 2) if bigrams else (1, 1))
Xtr = index.fit_transform(Xtr)
Xte = index.transform(Xte)
pad_index = index.add_word('PADTOKEN')
print(f'vocabulary size={index.vocabulary_size()}')

# random test-set pairs for the verification experiments below:
# paired_y[i] is True iff the i-th pair of documents shares an author
#shuffle1 = np.random.permutation(Xte.shape[0])
#shuffle2 = np.random.permutation(Xte.shape[0])
#x1, y1 = Xte[shuffle1], yte[shuffle1]
#x2, y2 = Xte[shuffle2], yte[shuffle2]
#paired_y = y1 == y2

# attribution
print('Attribution')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
phi = CNNProjection(vocabulary_size=index.vocabulary_size(), embedding_dim=hidden_size, out_size=output_size,
                    channels_out=channels_out, kernel_sizes=kernel_sizes, dropout=0.5).to(device)
cls = AuthorshipAttributionClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
yte_ = cls.predict(Xte)
evaluation(yte, yte_)

# verification
#print('Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = SameAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#paired_y_ = cls.predict(x1, x2)
#evaluation(paired_y, paired_y_)

# attribution & verification
#print('Attribution & Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = FullAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#yte_ = cls.predict_labels(Xte)
#evaluation(yte, yte_)
#paired_y_ = cls.predict_sav(x1, x2)
#evaluation(paired_y, paired_y_)
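
# ---------------------------------------------------------------------------
# Illustrative sketch (not used by the experiment above): the commented-out
# verification blocks rely on randomly paired test documents, where a pair is
# labelled positive iff both documents were written by the same author. The
# helper below is hypothetical; it only packages the shuffle1/shuffle2
# construction from the commented code into a reusable function, and the
# `seed` argument is an addition for reproducibility, not part of the
# original script.
# ---------------------------------------------------------------------------
def make_verification_pairs(X, y, seed=None):
    """Return (x1, x2, paired_y): two random alignments of the documents in X,
    with paired_y[i]=True iff x1[i] and x2[i] share an author."""
    rng = np.random.default_rng(seed)
    shuffle1 = rng.permutation(len(y))
    shuffle2 = rng.permutation(len(y))
    x1, y1 = X[shuffle1], y[shuffle1]
    x2, y2 = X[shuffle2], y[shuffle2]
    return x1, x2, (y1 == y2)

# example usage, if the verification experiment is re-enabled:
#x1, x2, paired_y = make_verification_pairs(Xte, yte, seed=0)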