import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

import numpy as np
from tqdm import tqdm
import torch
from scipy.sparse import vstack, issparse
from joblib import Parallel, delayed
import multiprocessing
import itertools


def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
    """
    Indexes (i.e., replaces word strings with numerical indexes) a list of string documents
    :param data: list of string documents
    :param vocab: a fixed mapping [str]->[int] from words to indexes
    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be
        retained because they are contained in a pre-trained embedding set known in advance)
    :param analyzer: the preprocessor in charge of transforming a document string into a sequence of word strings
    :param unk_index: the index of the 'unknown token', i.e., a symbol standing in for all words that cannot be kept
    :param out_of_vocabulary: an incremental mapping [str]->[int] from words to indexes that will index all those
        words that are not in the original vocab but are in known_words
    :return: the list of indexed documents
    """
    indexes = []
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0
    out_count = 0
    pbar = tqdm(data, desc='indexing documents')
    for text in pbar:
        words = analyzer(text)
        index = []
        for word in words:
            if word in vocab:
                idx = vocab[word]
            else:
                if word in known_words:
                    # first occurrence of a known out-of-vocabulary word: assign it the next free index
                    if word not in out_of_vocabulary:
                        out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
                    idx = out_of_vocabulary[word]
                    out_count += 1
                else:
                    idx = unk_index
                    unk_count += 1
            index.append(idx)
        indexes.append(index)
        knw_count += len(index)
        if knw_count > 0:  # avoid a division by zero on a leading run of empty documents
            pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
                                 f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
    return indexes
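

# Example usage of index() (a minimal sketch, not part of the original module; the
# vocabulary, known-word set, and document below are hypothetical):
#
#   vocab = {'<unk>': 0, 'the': 1, 'cat': 2, 'sat': 3}
#   oov = {}
#   idx = index(['the cat sat on the mat'], vocab, known_words={'mat'},
#               analyzer=str.split, unk_index=0, out_of_vocabulary=oov)
#   # idx == [[1, 2, 3, 0, 1, 4]]  ('on' -> unk, 'mat' -> new index 4 stored in oov)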

def define_pad_length(index_list):
    # heuristic: pad to the mean document length plus one standard deviation
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths) + np.std(lengths))

def pad(index_list, pad_index, max_pad_length=None):
    # pads every document to the length of the longest one (optionally capped at
    # max_pad_length); padding symbols are prepended and longer documents are truncated;
    # note that index_list is modified in place
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
    return index_list
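

# Example (hypothetical toy values):
#
#   pad([[1, 2, 3], [4]], pad_index=0)       # -> [[1, 2, 3], [0, 0, 4]]
#   define_pad_length([[1]*10, [1]*20])      # -> 20, i.e., int(mean 15 + std 5)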

def get_word_list(word2index1, word2index2=None):  # TODO: redo
    def extract_word_list(word2index):
        # words sorted by their index, so that position i holds the word with index i
        return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])]

    word_list = extract_word_list(word2index1)
    if word2index2 is not None:
        word_list += extract_word_list(word2index2)
    return word_list

def batchify(index_list, labels, batchsize, pad_index, device, target_long=False, max_pad_length=500):
    # generator of (inputs, targets) batches; each batch is padded independently
    nsamples = len(index_list)
    nbatches = nsamples // batchsize + 1 * (nsamples % batchsize > 0)
    for b in range(nbatches):
        batch = index_list[b*batchsize:(b+1)*batchsize]
        batch_labels = labels[b*batchsize:(b+1)*batchsize]
        if issparse(batch_labels):
            batch_labels = batch_labels.toarray()
        batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
        batch = torch.LongTensor(batch)
        # long targets for single-label losses (e.g., cross-entropy), float otherwise (e.g., BCE)
        totype = torch.LongTensor if target_long else torch.FloatTensor
        target = totype(batch_labels)
        yield batch.to(device), target.to(device)
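

# Example training loop (a sketch; train_index, y_train, model, criterion, and
# optimizer are hypothetical, not part of this module):
#
#   for X, y in batchify(train_index, y_train, batchsize=64, pad_index=0, device='cuda'):
#       optimizer.zero_grad()
#       loss = criterion(model(X), y)
#       loss.backward()
#       optimizer.step()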

def batchify_unlabelled(index_list, batchsize, pad_index, device, max_pad_length=500):
    # same as batchify, but yields input batches only (no targets)
    nsamples = len(index_list)
    nbatches = nsamples // batchsize + 1 * (nsamples % batchsize > 0)
    for b in range(nbatches):
        batch = index_list[b*batchsize:(b+1)*batchsize]
        batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
        batch = torch.LongTensor(batch)
        yield batch.to(device)

def clip_gradient(model, clip_value=1e-1):
    # clamps every gradient component to [-clip_value, clip_value]
    # (value clipping, not norm clipping)
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)
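

# Typically called between the backward pass and the optimizer step (a sketch;
# loss and optimizer are hypothetical):
#
#   loss.backward()
#   clip_gradient(model, clip_value=0.1)
#   optimizer.step()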

def predict(logits, classification_type='singlelabel'):
    if classification_type == 'multilabel':
        # independent binary decision for each class
        prediction = torch.sigmoid(logits) > 0.5
    elif classification_type == 'singlelabel':
        # the single most likely class
        prediction = torch.argmax(logits, dim=1).view(-1, 1)
    else:
        raise ValueError(f'unknown classification type: {classification_type}')
    return prediction.detach().cpu().numpy()
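

# Example (logits is a hypothetical [batch, n_classes] tensor produced by a model):
#
#   y_pred = predict(logits)                                    # -> int ndarray [batch, 1]
#   y_pred = predict(logits, classification_type='multilabel')  # -> bool ndarray [batch, n_classes]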

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def get_parallel_slices(n_tasks, n_jobs=-1):
    # splits range(n_tasks) into n_jobs contiguous slices; the last job absorbs the remainder
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
    batch = int(n_tasks / n_jobs)
    remainder = n_tasks % n_jobs
    return [slice(job*batch, (job+1)*batch + (remainder if job == n_jobs - 1 else 0))
            for job in range(n_jobs)]
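

# Example:
#
#   get_parallel_slices(n_tasks=10, n_jobs=3)
#   # -> [slice(0, 3), slice(3, 6), slice(6, 10)]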

def tokenize_job(documents, tokenizer, max_tokens, job):
    # tokenizes one chunk of documents, truncating each document to max_tokens tokens
    return [tokenizer(d)[:max_tokens] for d in tqdm(documents, desc=f'tokenizing [job: {job}]')]

def tokenize_parallel(documents, tokenizer, max_tokens, n_jobs=-1):
    # tokenizes the documents in parallel, dispatching one contiguous chunk to each job
    slices = get_parallel_slices(n_tasks=len(documents), n_jobs=n_jobs)
    tokens = Parallel(n_jobs=n_jobs)(
        delayed(tokenize_job)(
            documents[slice_i], tokenizer, max_tokens, job
        )
        for job, slice_i in enumerate(slices)
    )
    # flatten the per-job lists back into a single list, preserving document order
    return list(itertools.chain.from_iterable(tokens))
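

# Example (a sketch; docs is a hypothetical list of strings, and str.split stands in
# for a real tokenizer):
#
#   tokens = tokenize_parallel(docs, tokenizer=str.split, max_tokens=500, n_jobs=4)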