79 lines
2.8 KiB
Python
79 lines
2.8 KiB
Python
import numpy as np
|
|
|
|
from data.fetch_imdb62 import Imdb62
|
|
from index import Index
|
|
from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
|
|
from data.fetch_victorian import Victorian
|
|
from evaluation import evaluation
|
|
import torch
|
|
from model.transformations import CNNProjection
|
|
import sys
|
|
|
|
|
|
# --- Experiment hyperparameters -------------------------------------------
hidden_size = 32          # passed to CNNProjection as `embedding_dim`
channels_out = 128        # passed to CNNProjection as `channels_out`
output_size = 1024        # passed to CNNProjection as `out_size`
kernel_sizes = [4, 5, 6]  # passed to CNNProjection as `kernel_sizes`
pad_length = 3000         # passed to the classifier as `pad_length`
batch_size = 50           # mini-batch size given to `cls.fit`
n_epochs = 256            # number of epochs given to `cls.fit`
bigrams = False           # True -> char n-gram range (2, 2), False -> (1, 1)

# Alternative, smaller configuration (disabled; presumably for quick runs).
#hidden_size=16
#output_size=32
#pad_length=100
#batch_size=10
#n_epochs=20
|
|
|
|
# Select the compute device: CUDA when torch reports it as available,
# otherwise the CPU. The chosen device is reported on stdout.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'running on {device}')
|
|
|
|
# --- Data -------------------------------------------------------------------
# Load the corpus and unpack its train/test splits. The Victorian corpus is
# kept as a disabled alternative to Imdb62.
#dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
dataset = Imdb62(
    data_path='../../authorship_analysis/data/imdb62/imdb62.txt',
    n_authors=5,
    docs_by_author=25,
)
Xtr, ytr = dataset.train.data, dataset.train.target
Xte, yte = dataset.test.data, dataset.test.target

# Distinct author labels present in the training targets.
A = np.unique(ytr)
print(f'num authors={len(A)}')
|
|
|
|
# --- Indexing ---------------------------------------------------------------
# Build a character-level index: bigrams when `bigrams` is set, otherwise
# unigrams. The index is fitted on the training documents only and then
# applied to the test documents.
index = Index(
    analyzer='char',
    ngram_range=(2, 2) if bigrams else (1, 1),
)
Xtr = index.fit_transform(Xtr)
Xte = index.transform(Xte)

# Register the padding token after fitting; `pad_index` is later handed to
# the classifier together with `pad_length`.
pad_index = index.add_word('PADTOKEN')
print(f'vocabulary size={index.vocabulary_size()}')
|
|
|
|
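# Disabled: random document pairs for the (also disabled) verification
# experiments further down. Two independent permutations of the test set are
# aligned element-wise; `paired_y` marks pairs whose author labels match.
#shuffle1 = np.random.permutation(Xte.shape[0])
#shuffle2 = np.random.permutation(Xte.shape[0])
#x1, y1 = Xte[shuffle1], yte[shuffle1]
#x2, y2 = Xte[shuffle2], yte[shuffle2]
#paired_y = y1==y2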
|
|
|
|
# --- Attribution ------------------------------------------------------------
# Train an authorship-attribution classifier on top of a CNN projection and
# evaluate its predictions on the test split.
print('Attribution')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
phi = CNNProjection(
    vocabulary_size=index.vocabulary_size(),
    embedding_dim=hidden_size,
    out_size=output_size,
    channels_out=channels_out,
    kernel_sizes=kernel_sizes,
    dropout=0.5,
).to(device)  # move the projection's parameters to the selected device
cls = AuthorshipAttributionClassifier(
    phi,
    num_authors=A.size,
    pad_index=pad_index,
    pad_length=pad_length,
    device=device,
)
cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
yte_ = cls.predict(Xte)
evaluation(yte, yte_)
|
|
|
|
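# verification (disabled)
# NOTE(review): before re-enabling, (1) restore the commented-out pair
# construction that defines x1, x2 and paired_y, (2) import RNNProjection,
# which is not among this file's visible imports, and (3) check the `eval`
# calls: as written they resolve to the builtin, whereas the attribution
# section uses `evaluation`.
#print('Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = SameAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#paired_y_ = cls.predict(x1,x2)
#eval(paired_y, paired_y_)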
|
|
|
|
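# attribution & verification (disabled)
# NOTE(review): before re-enabling, restore the commented-out pair
# construction (x1, x2, paired_y), import RNNProjection (not among this
# file's visible imports), and check the `eval` calls: as written they
# resolve to the builtin, whereas the attribution section uses `evaluation`.
#print('Attribution & Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = FullAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#yte_ = cls.predict_labels(Xte)
#eval(yte, yte_)
#paired_y_ = cls.predict_sav(x1,x2)
#eval(paired_y, paired_y_)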
|