QuaPy/MultiLabel/util/common.py

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
from tqdm import tqdm
import torch
from scipy.sparse import vstack, issparse
from joblib import Parallel, delayed
import multiprocessing
import itertools


def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
    """
    Indexes (i.e., replaces word strings with numerical indexes) a list of string documents
    :param data: list of string documents
    :param vocab: a fixed mapping [str]->[int] of words to indexes
    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be
        retained because they are nonetheless contained in a pre-trained embedding set that we know in advance)
    :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
    :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
    :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words
        that are not in the original vocab but that are in the known_words
    :return: a list of lists of integer indexes, one per document
    """
    indexes = []
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0
    out_count = 0
    pbar = tqdm(data, desc=f'indexing documents')
    for text in pbar:
        words = analyzer(text)
        index = []
        for word in words:
            if word in vocab:
                idx = vocab[word]
            else:
                if word in known_words:
                    if word not in out_of_vocabulary:
                        out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
                    idx = out_of_vocabulary[word]
                    out_count += 1
                else:
                    idx = unk_index
                    unk_count += 1
            index.append(idx)
        indexes.append(index)
        knw_count += len(index)
        pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
                             f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
    return indexes
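
# Illustrative usage sketch (not part of the original module); the toy vocabulary, known-word set and
# analyzer below are assumptions, chosen only to show how in-vocab, known-OOV and unknown words are mapped:
#
#   vocab = {'cat': 0, 'dog': 1, '[UNK]': 2}
#   known_words = {'fox'}   # not in vocab, but assumed covered by pre-trained embeddings
#   oov = {}                # filled in place by index()
#   docs = ['cat dog fox zebra']
#   index(docs, vocab, known_words, analyzer=str.split, unk_index=2, out_of_vocabulary=oov)
#   # -> [[0, 1, 3, 2]]  ('fox' gets the new id 3 = len(vocab) + 0, 'zebra' falls back to unk_index)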


def define_pad_length(index_list):
    # heuristic pad length: mean document length plus one standard deviation
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths) + np.std(lengths))


def pad(index_list, pad_index, max_pad_length=None):
    # left-pads every document with pad_index up to the longest document (or max_pad_length, if shorter),
    # truncating longer documents; the list is modified in place and also returned
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
    return index_list
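
# Illustrative sketch (not part of the original module) of the padding behaviour on toy inputs:
#
#   pad([[1, 2], [3, 4, 5, 6]], pad_index=0, max_pad_length=3)
#   # -> [[0, 1, 2], [3, 4, 5]]   (short documents are left-padded, long ones truncated)
#   define_pad_length([[1, 2], [1, 2, 3, 4]])
#   # -> 4   (mean length 3 + std 1)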


def get_word_list(word2index1, word2index2=None):  # TODO: redo
    # returns the words of one (or two concatenated) vocabularies, sorted by their index
    def extract_word_list(word2index):
        return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])]

    word_list = extract_word_list(word2index1)
    if word2index2 is not None:
        word_list += extract_word_list(word2index2)
    return word_list
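
# Illustrative sketch (not part of the original module):
#
#   get_word_list({'b': 1, 'a': 0}, {'c': 2})
#   # -> ['a', 'b', 'c']   (words sorted by their index, second vocabulary appended)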


def batchify(index_list, labels, batchsize, pad_index, device, target_long=False, max_pad_length=500):
    # yields (padded document batch, label batch) pairs as tensors already moved to the requested device
    nsamples = len(index_list)
    nbatches = nsamples // batchsize + 1*(nsamples % batchsize > 0)
    for b in range(nbatches):
        batch = index_list[b*batchsize:(b+1)*batchsize]
        batch_labels = labels[b*batchsize:(b+1)*batchsize]
        if issparse(batch_labels):
            batch_labels = batch_labels.toarray()
        batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
        batch = torch.LongTensor(batch)
        totype = torch.LongTensor if target_long else torch.FloatTensor
        target = totype(batch_labels)
        yield batch.to(device), target.to(device)
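
# Illustrative training-loop sketch (not part of the original module); `model`, `criterion`, `optim`,
# `train_index` and `train_labels` are hypothetical placeholders showing how the generator is typically consumed:
#
#   for X, y in batchify(train_index, train_labels, batchsize=64, pad_index=0, device='cuda'):
#       optim.zero_grad()
#       loss = criterion(model(X), y)
#       loss.backward()
#       clip_gradient(model)
#       optim.step()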


def batchify_unlabelled(index_list, batchsize, pad_index, device, max_pad_length=500):
    # same as batchify, but yields only the (padded) document batches, without labels
    nsamples = len(index_list)
    nbatches = nsamples // batchsize + 1*(nsamples % batchsize > 0)
    for b in range(nbatches):
        batch = index_list[b*batchsize:(b+1)*batchsize]
        batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
        batch = torch.LongTensor(batch)
        yield batch.to(device)


def clip_gradient(model, clip_value=1e-1):
    # clamps every gradient component to the range [-clip_value, clip_value]
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)


def predict(logits, classification_type='singlelabel'):
    if classification_type == 'multilabel':
        prediction = torch.sigmoid(logits) > 0.5
    elif classification_type == 'singlelabel':
        prediction = torch.argmax(logits, dim=1).view(-1, 1)
    else:
        raise ValueError(f'unknown classification type {classification_type}')
    return prediction.detach().cpu().numpy()
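
# Illustrative sketch (not part of the original module): with multilabel logits, each output is
# thresholded independently after a sigmoid, e.g.
#
#   predict(torch.tensor([[2.0, -1.0, 0.3]]), classification_type='multilabel')
#   # -> array([[ True, False,  True]])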


def count_parameters(model):
    # total number of trainable parameters in the model
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
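
# Illustrative sketch (not part of the original module), using a toy torch module:
#
#   count_parameters(torch.nn.Linear(10, 3))
#   # -> 33   (a 3x10 weight matrix plus 3 bias terms)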


def get_parallel_slices(n_tasks, n_jobs=-1):
    # splits the range [0, n_tasks) into n_jobs contiguous slices; the last slice absorbs the remainder
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
    batch = int(n_tasks / n_jobs)
    remainder = n_tasks % n_jobs
    return [slice(job*batch, (job+1)*batch + (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)]
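
# Illustrative sketch (not part of the original module) of how the work is split:
#
#   get_parallel_slices(n_tasks=10, n_jobs=4)
#   # -> [slice(0, 2, None), slice(2, 4, None), slice(4, 6, None), slice(6, 10, None)]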


def tokenize_job(documents, tokenizer, max_tokens, job):
    return [tokenizer(d)[:max_tokens] for d in tqdm(documents, desc=f'tokenizing [job: {job}]')]


def tokenize_parallel(documents, tokenizer, max_tokens, n_jobs=-1):
    slices = get_parallel_slices(n_tasks=len(documents), n_jobs=n_jobs)
    tokens = Parallel(n_jobs=n_jobs)(
        delayed(tokenize_job)(
            documents[slice_i], tokenizer, max_tokens, job
        )
        for job, slice_i in enumerate(slices)
    )
    return list(itertools.chain.from_iterable(tokens))
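
# Illustrative usage sketch (not part of the original module); `str.split` stands in for any tokenizer:
#
#   docs = ['a first document', 'a second, slightly longer document']
#   tokenize_parallel(docs, tokenizer=str.split, max_tokens=3, n_jobs=2)
#   # -> [['a', 'first', 'document'], ['a', 'second,', 'slightly']]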