# kernel_authorship/src/main.py

import sys

import numpy as np
import torch

from data.fetch_imdb62 import Imdb62
from data.fetch_victorian import Victorian
from evaluation import evaluation
from index import Index
from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
from model.transformations import CNNProjection

# hyperparameters
hidden_size = 32
channels_out = 128
output_size = 1024
kernel_sizes = [4, 5, 6]
pad_length = 3000
batch_size = 50
n_epochs = 256
bigrams = False

# smaller configuration, handy for quick debugging runs
#hidden_size=16
#output_size=32
#pad_length=100
#batch_size=10
#n_epochs=20

# run on GPU if available, otherwise fall back to CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'running on {device}')
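
# load the train/test split of the corpus; Imdb62 is used here,
# with the Victorian corpus kept as a commented-out alternative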
#dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=5, docs_by_author=25)
Xtr, ytr = dataset.train.data, dataset.train.target
Xte, yte = dataset.test.data, dataset.test.target
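
# the label space: authors appearing in the training set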
A = np.unique(ytr)
print(f'num authors={len(A)}')
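
# encode each document as a sequence of character (or character-bigram) ids
# and reserve a dedicated padding symbol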
index = Index(analyzer='char', ngram_range=(2, 2) if bigrams else (1, 1))
Xtr = index.fit_transform(Xtr)
Xte = index.transform(Xte)
pad_index = index.add_word('PADTOKEN')
print(f'vocabulary size={index.vocabulary_size()}')
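
# random pairings of test documents, used by the (currently disabled) verification
# experiments below; paired_y marks whether the two documents of a pair share the author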
#shuffle1 = np.random.permutation(Xte.shape[0])
#shuffle2 = np.random.permutation(Xte.shape[0])
#x1, y1 = Xte[shuffle1], yte[shuffle1]
#x2, y2 = Xte[shuffle2], yte[shuffle2]
#paired_y = y1==y2

# attribution: assign each test document to one of the known authors
print('Attribution')
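# phi: the text-projection network (a CNN here; an RNN variant is left commented out)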
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
phi = CNNProjection(vocabulary_size=index.vocabulary_size(), embedding_dim=hidden_size, out_size=output_size, channels_out=channels_out, kernel_sizes=kernel_sizes, dropout=0.5).to(device)
cls = AuthorshipAttributionClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
yte_ = cls.predict(Xte)
evaluation(yte, yte_)

# verification: decide whether two test documents were written by the same author
#print('Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = SameAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#paired_y_ = cls.predict(x1,x2)
#evaluation(paired_y, paired_y_)

# attribution & verification: a single model trained jointly for both tasks
#print('Attribution & Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = FullAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#yte_ = cls.predict_labels(Xte)
#evaluation(yte, yte_)
#paired_y_ = cls.predict_sav(x1,x2)
#evaluation(paired_y, paired_y_)