forked from moreo/QuaPy

generating features from RoBERTa, testing them on Amazon data
parent d949c77317
commit 464bd60c7c
@@ -1,6 +1,11 @@
 import numpy as np
 
 
+def smoothness(p):
+    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))
+
+
+
 def _check_arrays(prevs):
     prevs = np.asarray(prevs)
     if prevs.ndim==1:
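A quick illustration (not part of the commit) of what the new smoothness function measures: it is half the sum of squared second-order differences of a prevalence vector, so flat prevalence vectors score 0 and spiky ones score high. A minimal standalone sketch:

import numpy as np

def smoothness(p):
    # half the sum of squared discrete second derivatives of p
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

print(smoothness(np.array([0.2, 0.2, 0.2, 0.2, 0.2])))  # 0.0: a uniform prevalence is perfectly smooth
print(smoothness(np.array([0.0, 0.0, 1.0, 0.0, 0.0])))  # 3.0: a single spike scores high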
@@ -0,0 +1,100 @@ (new file)
import sys
import numpy as np
import datasets
import torch.cuda
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer
from datasets import list_datasets, list_metrics, load_dataset, Dataset, DatasetDict, load_metric
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import pandas as pd
import csv


def tokenize_function(example):
    tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else 256)
    return tokens


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return {
        'macro-f1': f1_score(labels, preds, average='macro'),
        'micro-f1': f1_score(labels, preds, average='micro'),
    }


if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    n_args = len(sys.argv)
    assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'

    datapath = sys.argv[1]  # './data/Books/training_data.txt'
    checkpoint = sys.argv[2]  # e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
    modelout = checkpoint+'-finetuned'

    # load the training set, and extract a held-out validation split of 1000 documents (stratified)
    df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df['labels'].to_frame()
    X_train, X_val = train_test_split(df, stratify=labels, test_size=1000, random_state=1)
    num_labels = len(pd.unique(labels['labels']))

    features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
    train = Dataset.from_pandas(df=X_train, split='train', features=features)
    validation = Dataset.from_pandas(df=X_val, split='validation', features=features)

    dataset = DatasetDict({
        'train': train.select(range(500)) if debug else train,
        'validation': validation.select(range(500)) if debug else validation
    })

    # tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    print(tokenized_datasets)
    print(tokenized_datasets['train'][0]['labels'])
    print(tokenized_datasets['train'][0]['review'])
    print(tokenized_datasets['train'][0]['input_ids'])
    print(len(tokenized_datasets['train'][0]['input_ids']))
    # print(tokenized_datasets['train'][0]['token_type_ids'])
    # print(tokenized_datasets['train'][0]['attention_mask'])

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

    # fine-tuning
    training_args = TrainingArguments(
        modelout,
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # eval_steps=10,
        save_total_limit=1,
        load_best_model_at_end=True
    )
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
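As a side note (not part of the commit), the two scores returned by compute_metrics above differ only in how they average over the rating classes; a toy check with made-up predictions:

import numpy as np
from sklearn.metrics import f1_score

labels = np.array([0, 0, 1, 2, 2, 2])
preds = np.array([0, 1, 1, 2, 2, 0])
print(f1_score(labels, preds, average='macro'))  # ~0.66: unweighted mean of the per-class F1 scores
print(f1_score(labels, preds, average='micro'))  # ~0.67: global F1, equal to accuracy in single-label classification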
@@ -0,0 +1,11 @@ (new file)
#!/bin/bash
set -x

#conda activate torch

transformer=roberta-base

#python3 finetune_bert.py ./data/Books/training_data.txt $transformer
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned last
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned average
PYTHONPATH=.:.. python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned posteriors
@@ -8,25 +8,34 @@ from Ordinal.main import quantifiers
 from Ordinal.tabular import Table
 
 domain = 'Books-tfidf'
+domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
+domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
 prot = 'app'
 outpath = f'./tables/{domain}/{prot}/results.tex'
 
 resultpath = join('./results', domain, prot)
+resultpath_bertlast = join('./results', domain_bert_last, prot)
+resultpath_bertave = join('./results', domain_bert_ave, prot)
 
 methods = [qname for qname, *_ in quantifiers()]
-# methods += [m+'-r' for m in methods]
+methods_Rlast = [m+'-RoBERTa-last' for m in methods]
+methods_Rave = [m+'-RoBERTa-average' for m in methods]
+methods = methods + methods_Rlast + methods_Rave
+methods += [m+'-r' for m in methods]
 
 table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)
 
+resultfiles = list(glob(f'{resultpath}/*.csv')) + list(glob(f'{resultpath_bertlast}/*.csv')) + list(glob(f'{resultpath_bertave}/*.csv'))
 
-for resultfile in glob(f'{resultpath}/*.csv'):
+for resultfile in resultfiles:
     df = pd.read_csv(resultfile)
     nmd = df['nmd'].values
     resultname = Path(resultfile).name
     method, drift, *other = resultname.replace('.csv', '').split('.')
     if other:
-        continue
         method += '-r'
+    if method not in methods:
+        continue
 
     table.add(drift, method, nmd)
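For context (an illustration with a hypothetical filename): the loop above expects result files named '<method>.<drift>.csv'; any extra dot-separated field ends up in other and triggers the '-r' suffix before the method name is matched against the methods list.

resultname = 'PACC(LR)-RoBERTa-average.low.csv'
method, drift, *other = resultname.replace('.csv', '').split('.')
print(method, drift, other)  # PACC(LR)-RoBERTa-average low []  -> no extra field, method name kept as is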
@@ -0,0 +1,145 @@ (new file)
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from os.path import join
import os
import shutil
from tqdm import tqdm

from Ordinal.utils import load_samples_folder, load_single_sample_as_csv


def tokenize_function(example):
    tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else None, return_tensors='pt')
    return {
        'input_ids': tokens.input_ids.cuda(),
        'attention_mask': tokens.attention_mask.cuda()
    }


def save_samples_as_txt(tensors, labels, path):
    vectors = tensors
    labels = labels.values
    vec_lab = np.hstack([labels, vectors])
    n_cols = vectors.shape[1]
    np.savetxt(path, vec_lab, fmt=['%d']+['%f']*n_cols)


def transform_sample(instances, labels, outpath, batch_size=50):
    ndocs = len(labels)
    batches = ndocs // batch_size
    assert ndocs % batches == 0, 'fragmented last bach not supported'

    transformations = []
    for batch_id in range(0, ndocs, batch_size):

        batch_instances = instances[batch_id:batch_id + batch_size]

        tokenized_dataset = tokenize_function(batch_instances)
        out = model(**tokenized_dataset, output_hidden_states=True)

        if generation_mode == 'posteriors':
            logits = out.logits
            posteriors = torch.softmax(logits, dim=-1)
            transformed = posteriors
        elif generation_mode == 'last':
            hidden_states = out.hidden_states
            last_layer_cls = hidden_states[-1][:, 0, :]
            transformed = last_layer_cls
        elif generation_mode == 'average':
            hidden_states = out.hidden_states
            hidden_states = torch.stack(hidden_states)
            all_layer_cls = hidden_states[:, :, 0, :]
            average_cls = torch.mean(all_layer_cls, dim=0)
            transformed = average_cls
        else:
            raise NotImplementedError()

        transformations.append(transformed.cpu().numpy())

    transformations = np.vstack(transformations)
    save_samples_as_txt(transformations, labels, outpath)


def transform_folder_samples(protocol, splitname):
    in_folder = join(datapath, domain, protocol, splitname)
    out_folder = join(datapath, outname, protocol, splitname)
    total = 1000 if splitname.startswith('dev') else 5000

    for i, (instances, labels) in tqdm(enumerate(
            load_samples_folder(in_folder, load_fn=load_single_sample_as_csv)), desc=f'{protocol} {splitname}', total=total):
        transform_sample(instances, labels, outpath=join(out_folder, f'{i}.txt'))


def get_best_checkpoint(checkpointdir):
    from glob import glob
    steps = []
    for folder in glob(f'{checkpointdir}/checkpoint-*'):
        step=int(folder.split('checkpoint-')[1])
        steps.append(step)
    assert len(steps) <= 2, 'unexpected number of steps, only two where expected (the best one and the last one)'
    choosen = f'{checkpointdir}/checkpoint-{min(steps)}'
    return choosen


if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    checkpoint='roberta-base-finetuned'
    generation_mode = 'posteriors'

    # n_args = len(sys.argv)
    # assert n_args==3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
    #     '\tgeneration-mode: last (last layer), ave (average pooling), or posteriors (posterior probabilities)'

    # checkpoint = sys.argv[1] #e.g., 'bert-base-uncased'
    # generation_mode = sys.argv[2] # e.g., 'last'

    assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'

    checkpoint = get_best_checkpoint(checkpoint)

    num_labels = 5

    datapath = './data'
    domain = 'Books'
    protocols = ['app'] # ['app', 'npp']

    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_model'
    outname = domain + f'-{checkpoint}-{generation_mode}'

    with torch.no_grad():
        print('loading', checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

        os.makedirs(join(datapath, outname), exist_ok=True)

        print('transforming the training set')
        instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
        transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
        print('[done]')

        for protocol in protocols:
            in_path = join(datapath, domain, protocol)
            out_path = join(datapath, outname, protocol)
            os.makedirs(out_path, exist_ok=True)
            os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
            os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
            shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
            shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))

            print('processing', protocol)
            transform_folder_samples(protocol, 'dev_samples')
            transform_folder_samples(protocol, 'test_samples')
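A minimal sketch (not part of the commit, and using the public 'roberta-base' weights rather than the fine-tuned checkpoint) of the tensors the three generation modes above extract from a classification model run with output_hidden_states=True:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tok = AutoTokenizer.from_pretrained('roberta-base')
mod = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=5)

with torch.no_grad():
    batch = tok(['great book', 'awful book'], padding=True, return_tensors='pt')
    out = mod(**batch, output_hidden_states=True)

posteriors = torch.softmax(out.logits, dim=-1)                    # 'posteriors': (2, 5) class probabilities
last_cls = out.hidden_states[-1][:, 0, :]                         # 'last': (2, 768) first-token embedding of the last layer
avg_cls = torch.stack(out.hidden_states)[:, :, 0, :].mean(dim=0)  # 'average': (2, 768) first-token embedding averaged over all layers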
@@ -9,7 +9,7 @@ from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
 from quapy.data import LabelledCollection
 from os.path import join
 import os
-from utils import load_samples, load_samples_pkl
+from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
 from evaluation import nmd, mnmd
 from time import time
 import pickle
@@ -25,22 +25,6 @@ import mord
 # add drift='all'
 
 
-def load_test_samples():
-    ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
-    ids = set(ids)
-    pklpath = join(datapath, domain, protocol, 'test_samples')
-    for sample in tqdm(load_samples_pkl(pklpath, filter=ids), total=len(ids)):
-        yield sample.instances, sample.prevalence()
-
-
-def load_dev_samples():
-    ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
-    ids = set(ids)
-    pklpath = join(datapath, domain, protocol, 'dev_samples')
-    for sample in tqdm(load_samples_pkl(pklpath, filter=ids), total=len(ids)):
-        yield sample.instances, sample.prevalence()
-
-
 def quantifiers():
     params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
     # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
@@ -58,20 +42,19 @@ def quantifiers():
 
     # with order-aware classifiers
     # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
-    yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
-    yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
-    yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
-    yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
+    #yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
+    #yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
+    #yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
+    #yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
     #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
-    yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
+    #yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
     # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)
 
     # regression-based ordinal regression (see https://pythonhosted.org/mord/)
     # I am using my implementation, which caters for predict_proba (linear distance to the two closest classes, 0 in the rest)
     # the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with my wrapper classes for having the nclasses_; those do
     # not implement predict_proba nor decision_score
-    yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
-    yield 'CC-bal(SVR)', CC(RegressorClassifier()), params_SVR
+    #yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
     #yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
     # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
     # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
@@ -82,6 +65,7 @@ def quantifiers():
 
 def run_experiment(params):
     qname, q, param_grid, drift = params
+    qname += posfix
     resultfile = join(resultpath, f'{qname}.{drift}.csv')
     if os.path.exists(resultfile):
         print(f'result file {resultfile} already exists: continue')
@@ -89,6 +73,22 @@ def run_experiment(params):
 
     print(f'fitting {qname} for {drift}-drift')
 
+    def load_test_samples():
+        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
+        ids = set(ids)
+        folderpath = join(datapath, domain, protocol, 'test_samples')
+        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
+            yield sample.instances, sample.prevalence()
+
+    def load_dev_samples():
+        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
+        ids = set(ids)
+        folderpath = join(datapath, domain, protocol, 'dev_samples')
+        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
+            yield sample.instances, sample.prevalence()
+
     q = qp.model_selection.GridSearchQ(
         q,
         param_grid,
@@ -125,17 +125,29 @@ def run_experiment(params):
 
 
 if __name__ == '__main__':
+    #preprocessing = 'roberta.last'
+    preprocessing = 'roberta.average'
+    #preprocessing = 'tfidf'
+    if preprocessing=='tfidf':
         domain = 'Books-tfidf'
+        posfix = ''
+    elif preprocessing=='roberta.last':
+        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
+        posfix = '-RoBERTa-last'
+    elif preprocessing=='roberta.average':
+        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
+        posfix = '-RoBERTa-average'
+    load_sample_fn = load_single_sample_pkl
     datapath = './data'
     protocol = 'app'
     resultpath = join('./results', domain, protocol)
     os.makedirs(resultpath, exist_ok=True)
 
-    train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
+    train = load_sample_fn(join(datapath, domain), 'training_data')
 
     with open(join(resultpath, 'hyper.txt'), 'at') as foo:
-        for drift in ['low', 'mid', 'high', 'all']:
-            params = [(*qs, drift) for qs in quantifiers()]
+        #for drift in [f'smooth{i}' for i in range(5)] + ['all']:
+        params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
         hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
         for h in hypers:
             if h is not None:
@@ -1,7 +1,7 @@
 import numpy as np
 import quapy as qp
-from Ordinal.evaluation import nmd
-from Ordinal.utils import load_samples_pkl
+from evaluation import nmd
+from Ordinal.utils import load_samples_folder, load_single_sample_pkl
 from quapy.data import LabelledCollection
 import pickle
 import os
@@ -13,7 +13,8 @@ def partition_by_drift(split, training_prevalence):
     assert split in ['dev', 'test'], 'invalid split name'
     total=1000 if split=='dev' else 5000
     drifts = []
-    for sample in tqdm(load_samples_pkl(join(datapath, domain, 'app', f'{split}_samples')), total=total):
+    folderpath = join(datapath, domain, 'app', f'{split}_samples')
+    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
         drifts.append(nmd(training_prevalence, sample.prevalence()))
     drifts = np.asarray(drifts)
     order = np.argsort(drifts)
@@ -34,7 +35,7 @@ def partition_by_drift(split, training_prevalence):
     print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')
 
 
-domain = 'Books-tfidf'
+domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
 datapath = './data'
 
 training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
@@ -0,0 +1,36 @@ (new file)
import numpy as np
from Ordinal.evaluation import smoothness
from Ordinal.utils import load_samples_folder, load_single_sample_pkl

from os.path import join
from tqdm import tqdm


def partition_by_smoothness(split):
    assert split in ['dev', 'test'], 'invalid split name'
    total=1000 if split=='dev' else 5000
    smooths = []
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
        smooths.append(smoothness(sample.prevalence()))
    smooths = np.asarray(smooths)
    order = np.argsort(smooths)
    nD = len(order)
    low2high_smooth = np.array_split(order, 5)
    all_drift = np.arange(nD)
    for i, smooth_idx in enumerate(low2high_smooth):
        block = smooths[smooth_idx]
        print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
        np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), smooth_idx)
    np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), all_drift)


#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'

#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))

partition_by_smoothness('dev')
partition_by_smoothness('test')
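A toy illustration (values invented) of how partition_by_smoothness splits sample ids into five blocks of increasing smoothness score via np.argsort and np.array_split:

import numpy as np

smooths = np.array([0.9, 0.1, 0.5, 0.3, 0.7, 0.2, 0.8, 0.4, 0.6, 0.0])
order = np.argsort(smooths)                          # sample ids sorted by increasing smoothness score
low2high_smooth = np.array_split(order, 5)           # five equally sized blocks of ids
print([list(block) for block in low2high_smooth])    # [[9, 1], [5, 3], [7, 2], [8, 4], [6, 0]]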
@@ -4,7 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from os.path import join
 import os
 import pickle
-from utils import load_samples
+from utils import load_samples_raw
 from tqdm import tqdm
 import shutil
 
@@ -40,7 +40,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
 
 
 def transform_folder_samples(protocol, splitname):
-    for i, sample in tqdm(enumerate(load_samples(join(datapath, domain, protocol, splitname), classes=train.classes_))):
+    for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
         sample.instances = tfidf.transform(sample.instances)
         pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
 
@@ -0,0 +1,47 @@ (new file)
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import *
from tqdm import tqdm
import shutil


vector_generation = 'average'

datapath = './data'
domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
outname = domain.replace('-finetuned', '-finetuned-pkl')

protocol = 'app'

print('pickling npy txt files')
print('from:', join(datapath, domain))
print('to', join(datapath, outname))
print('for protocol:', protocol)

os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, protocol), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))


train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


def transform_folder_samples(protocol, splitname):
    folder_dir=join(datapath, domain, protocol, splitname)
    for i, sample in tqdm(enumerate(load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_))):
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


transform_folder_samples(protocol, 'dev_samples')
transform_folder_samples(protocol, 'test_samples')
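For reference, a small sketch (filename hypothetical) of the plain-text vector format shared by save_samples_as_txt in the vector-generation script and load_simple_sample_npytxt in utils: one row per document, the integer label first, then the feature values.

import numpy as np

vectors = np.random.rand(3, 4)                   # e.g., 3 documents with 4 features each
labels = np.array([[0], [2], [4]])               # ordinal labels go in the first column
np.savetxt('sample.txt', np.hstack([labels, vectors]), fmt=['%d'] + ['%f'] * 4)

yX = np.loadtxt('sample.txt')
y, X = yX[:, 0].astype(np.int32), yX[:, 1:]      # what load_simple_sample_npytxt recovers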
@@ -0,0 +1,54 @@ (new file)
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples_raw
from tqdm import tqdm
import shutil


datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'

def save_preprocessing_info(transformer):
    with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:
        foo.write(f'{str(transformer)}\n')


os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
os.makedirs(join(datapath, outname, 'npp'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt'))


tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)

train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


def transform_folder_samples(protocol, splitname):
    for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
        sample.instances = tfidf.transform(sample.instances)
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('npp', 'dev_samples')
transform_folder_samples('npp', 'test_samples')
@@ -1,42 +1,64 @@
-import quapy as qp
-from quapy.data import LabelledCollection
+import numpy as np
 from glob import glob
+from json import load
 import os
 from os.path import join
 import pickle
 
 
-def load_samples(path_dir, classes):
-    nsamples = len(glob(join(path_dir, f'*.txt')))
-    for id in range(nsamples):
-        yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
-
-
-def load_samples_as_csv(path_dir, debug=False):
 import pandas as pd
 import csv
 import datasets
 from datasets import Dataset
+import quapy as qp
+from quapy.data import LabelledCollection
 
 
-    nsamples = len(glob(join(path_dir, f'*.txt')))
-    for id in range(nsamples):
-        df = pd.read_csv(join(path_dir, f'{id}.txt'), sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
+def load_simple_sample_npytxt(parentdir, filename, classes=None):
+    samplepath = join(parentdir, filename+'.txt')
+    yX = np.loadtxt(samplepath)
+    X = yX[:,1:]
+    y = yX[:,0].astype(np.int32)
+    return LabelledCollection(instances=X, labels=y, classes_=classes)
+
+
+def load_simple_sample_raw(parentdir, filename, classes=None):
+    samplepath = join(parentdir, filename+'.txt')
+    return LabelledCollection.load(samplepath, loader_func=qp.data.reader.from_text, classes=classes)
+
+
+def load_single_sample_as_csv(parentdir, filename):
+    samplepath = join(parentdir, filename+'.txt')
+    df = pd.read_csv(samplepath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
     labels = df.pop('labels').to_frame()
-    X = df
 
     features = datasets.Features({'review': datasets.Value('string')})
-    if debug:
-        sample = Dataset.from_pandas(df=X, features=features).select(range(50))
-        labels = labels[:50]
-    else:
-        sample = Dataset.from_pandas(df=X, features=features)
+    sample = Dataset.from_pandas(df=df, features=features)
 
-    yield sample, labels
+    return sample, labels
 
 
-def load_samples_pkl(path_dir, filter=None):
-    nsamples = len(glob(join(path_dir, f'*.pkl')))
+def load_single_sample_pkl(parentdir, filename):
+    return pickle.load(open(join(parentdir, filename+'.pkl'), 'rb'))
+
+
+# def load_samples_npytxt(path_dir, filter=None, classes=None):
+#     return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
+
+
+# def load_samples_raw(path_dir, filter=None, classes=None):
+#     return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, load_fn_kwargs={'classes': classes})
+
+
+# def load_samples_as_csv(path_dir, filter=None):
+#     return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)
+
+
+# def load_samples_pkl(path_dir, filter=None):
+#     return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)
+
+
+def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
+    nsamples = len(glob(join(path_dir, f'*')))
     for id in range(nsamples):
         if (filter is None) or id in filter:
-            yield pickle.load(open(join(path_dir, f'{id}.pkl'), 'rb'))
+            yield load_fn(path_dir, f'{id}', **load_fn_kwargs)
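A hypothetical usage sketch of the refactored loader above (path and ids are illustrative): the load_fn argument decides how each numbered sample file in a folder is materialised.

from Ordinal.utils import load_samples_folder, load_single_sample_pkl

folderpath = './data/Books-tfidf/app/dev_samples'
for sample in load_samples_folder(folderpath, filter={0, 3, 7}, load_fn=load_single_sample_pkl):
    print(sample.prevalence())   # each sample is a pickled LabelledCollection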