def smoothness(p):
    """Return the (non-)smoothness score of a prevalence vector.

    The score is half the sum of the squared discrete second differences
    of consecutive triples, i.e. ``0.5 * sum((-p[i-1] + 2*p[i] - p[i+1])**2)``.
    A vector whose values change linearly from class to class scores 0.

    :param p: array-like of numeric values. A 2-D input is processed along
        its first axis, yielding one score per column (this mirrors what the
        previous element-wise implementation did for arrays).
    :return: the score; 0.0 when `p` has fewer than three entries, since no
        triple exists.
    """
    p = np.asarray(p, dtype=float)
    # np.diff(n=2) gives p[i+1] - 2*p[i] + p[i-1]; the sign is irrelevant
    # because every term is squared.
    second_diff = np.diff(p, n=2, axis=0)
    return 0.5 * np.sum(second_diff ** 2, axis=0)
names=['labels', 'review'], quoting=csv.QUOTE_NONE) + labels = df['labels'].to_frame() + X_train, X_val = train_test_split(df, stratify=labels, test_size=1000, random_state=1) + num_labels = len(pd.unique(labels['labels'])) + + features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')}) + train = Dataset.from_pandas(df=X_train, split='train', features=features) + validation = Dataset.from_pandas(df=X_val, split='validation', features=features) + + dataset = DatasetDict({ + 'train': train.select(range(500)) if debug else train, + 'validation': validation.select(range(500)) if debug else validation + }) + + # tokenize the dataset + tokenizer = AutoTokenizer.from_pretrained(checkpoint) + tokenized_datasets = dataset.map(tokenize_function, batched=True) + + print(tokenized_datasets) + print(tokenized_datasets['train'][0]['labels']) + print(tokenized_datasets['train'][0]['review']) + print(tokenized_datasets['train'][0]['input_ids']) + print(len(tokenized_datasets['train'][0]['input_ids'])) + # print(tokenized_datasets['train'][0]['token_type_ids']) + # print(tokenized_datasets['train'][0]['attention_mask']) + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda() + + # fine-tuning + training_args = TrainingArguments( + modelout, + learning_rate=2e-5, + num_train_epochs=5, + weight_decay=0.01, + evaluation_strategy='epoch', + save_strategy='epoch', + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + # eval_steps=10, + save_total_limit=1, + load_best_model_at_end=True + ) + trainer = Trainer( + model, + args=training_args, + train_dataset=tokenized_datasets['train'], + eval_dataset=tokenized_datasets['validation'], + data_collator=DataCollatorWithPadding(tokenizer), + tokenizer=tokenizer, + compute_metrics=compute_metrics + ) + + trainer.train() + + + + + + diff --git a/Ordinal/finetuning_batch.sh b/Ordinal/finetuning_batch.sh new file mode 100755 index 
0000000..f11e2d8 --- /dev/null +++ b/Ordinal/finetuning_batch.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -x + +#conda activate torch + +transformer=roberta-base + +#python3 finetune_bert.py ./data/Books/training_data.txt $transformer +#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned last +#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned average +PYTHONPATH=.:.. python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned posteriors diff --git a/Ordinal/gen_tables.py b/Ordinal/gen_tables.py index e57517b..bac0227 100644 --- a/Ordinal/gen_tables.py +++ b/Ordinal/gen_tables.py @@ -8,25 +8,34 @@ from Ordinal.main import quantifiers from Ordinal.tabular import Table domain = 'Books-tfidf' +domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last' +domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average' prot = 'app' outpath = f'./tables/{domain}/{prot}/results.tex' resultpath = join('./results', domain, prot) +resultpath_bertlast = join('./results', domain_bert_last, prot) +resultpath_bertave = join('./results', domain_bert_ave, prot) methods = [qname for qname, *_ in quantifiers()] -# methods += [m+'-r' for m in methods] +methods_Rlast = [m+'-RoBERTa-last' for m in methods] +methods_Rave = [m+'-RoBERTa-average' for m in methods] +methods = methods + methods_Rlast + methods_Rave +methods += [m+'-r' for m in methods] table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4) +resultfiles = list(glob(f'{resultpath}/*.csv')) + list(glob(f'{resultpath_bertlast}/*.csv')) + list(glob(f'{resultpath_bertave}/*.csv')) -for resultfile in glob(f'{resultpath}/*.csv'): +for resultfile in resultfiles: df = pd.read_csv(resultfile) nmd = df['nmd'].values resultname = Path(resultfile).name method, drift, *other = resultname.replace('.csv', '').split('.') if other: - continue method += '-r' + if method not in methods: + continue table.add(drift, method, nmd) diff 
def save_samples_as_txt(tensors, labels, path):
    """Write a sample to `path` as plain text, one document per row.

    Each row holds the label (formatted as an integer) followed by the
    components of the document vector (formatted with ``%f``).

    :param tensors: 2-D array-like of shape (n_docs, n_features).
    :param labels: the n_docs labels; a single-column DataFrame (as before),
        a Series, a NumPy array or a plain list are all accepted.
    :param path: destination file, passed to ``np.savetxt``.
    :raises ValueError: if the number of labels differs from the number of
        rows in `tensors`.
    """
    vectors = np.asarray(tensors)
    # Force a column vector so that 1-D label containers can be stacked too;
    # the previous `labels.values` only worked for a single-column DataFrame.
    label_column = np.asarray(labels).reshape(-1, 1)
    if label_column.shape[0] != vectors.shape[0]:
        raise ValueError(
            f'got {label_column.shape[0]} labels for {vectors.shape[0]} vectors'
        )
    vec_lab = np.hstack([label_column, vectors])
    n_cols = vectors.shape[1]
    np.savetxt(path, vec_lab, fmt=['%d'] + ['%f'] * n_cols)
def get_best_checkpoint(checkpointdir):
    """Return the path of the lowest-numbered ``checkpoint-<step>`` entry in `checkpointdir`.

    At most two checkpoints are expected (the best one and the last one).
    The last one has the highest step by definition, so the lowest step is
    taken to be the best one.
    NOTE(review): this relies on how the fine-tuning run pruned its
    checkpoints -- confirm against the training configuration.

    :param checkpointdir: directory containing the ``checkpoint-<step>`` entries.
    :return: ``'<checkpointdir>/checkpoint-<lowest step>'``.
    :raises ValueError: if no ``checkpoint-<step>`` entry is found, or if more
        than two are found.
    """
    from glob import glob
    import os

    prefix = 'checkpoint-'
    steps = []
    for folder in glob(f'{checkpointdir}/{prefix}*'):
        # Parse the entry name only: the parent path may itself contain
        # 'checkpoint-', which used to confuse a split on the whole path.
        suffix = os.path.basename(os.path.normpath(folder))[len(prefix):]
        if suffix.isdigit():  # ignore entries such as 'checkpoint-1188-last'
            steps.append(int(suffix))

    if not steps:
        raise ValueError(f'no checkpoint-<step> entry found in {checkpointdir!r}')
    if len(steps) > 2:
        raise ValueError(
            f'unexpected number of checkpoints in {checkpointdir!r}: found {len(steps)}, '
            'at most two were expected (the best one and the last one)'
        )
    return f'{checkpointdir}/checkpoint-{min(steps)}'
tokenizer = AutoTokenizer.from_pretrained(checkpoint) + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda() + + os.makedirs(join(datapath, outname), exist_ok=True) + + print('transforming the training set') + instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data') + transform_sample(instances, labels, join(datapath, outname, 'training_data.txt')) + print('[done]') + + for protocol in protocols: + in_path = join(datapath, domain, protocol) + out_path = join(datapath, outname, protocol) + os.makedirs(out_path, exist_ok=True) + os.makedirs(join(out_path, 'dev_samples'), exist_ok=True) + os.makedirs(join(out_path, 'test_samples'), exist_ok=True) + shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt')) + shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt')) + + print('processing', protocol) + transform_folder_samples(protocol, 'dev_samples') + transform_folder_samples(protocol, 'test_samples') + + + + + + diff --git a/Ordinal/main.py b/Ordinal/main.py index 6fd2e5a..cd01ddd 100644 --- a/Ordinal/main.py +++ b/Ordinal/main.py @@ -9,7 +9,7 @@ from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy from quapy.data import LabelledCollection from os.path import join import os -from utils import load_samples, load_samples_pkl +from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl from evaluation import nmd, mnmd from time import time import pickle @@ -25,22 +25,6 @@ import mord # add drift='all' -def load_test_samples(): - ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy')) - ids = set(ids) - pklpath = join(datapath, domain, protocol, 'test_samples') - for sample in tqdm(load_samples_pkl(pklpath, filter=ids), total=len(ids)): - yield sample.instances, sample.prevalence() - - -def load_dev_samples(): - ids = np.load(join(datapath, domain, protocol, 
f'{drift}drift.dev.id.npy')) - ids = set(ids) - pklpath = join(datapath, domain, protocol, 'dev_samples') - for sample in tqdm(load_samples_pkl(pklpath, filter=ids), total=len(ids)): - yield sample.instances, sample.prevalence() - - def quantifiers(): params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']} # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']} @@ -58,21 +42,20 @@ def quantifiers(): # with order-aware classifiers # threshold-based ordinal regression (see https://pythonhosted.org/mord/) - yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR - yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR - yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR - yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR + #yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR + #yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR + #yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR + #yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR - yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR + #yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.) 
# regression-based ordinal regression (see https://pythonhosted.org/mord/) # I am using my implementation, which caters for predict_proba (linear distance to the two closest classes, 0 in the rest) # the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with my wrapper classes for having the nclasses_; those do # not implement predict_proba nor decision_score - yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR - yield 'CC-bal(SVR)', CC(RegressorClassifier()), params_SVR - # yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR + #yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR + #yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR # yield 'PACC(SVR)', PACC(RegressorClassifier()), params_SVR @@ -82,6 +65,7 @@ def quantifiers(): def run_experiment(params): qname, q, param_grid, drift = params + qname += posfix resultfile = join(resultpath, f'{qname}.{drift}.csv') if os.path.exists(resultfile): print(f'result file {resultfile} already exists: continue') @@ -89,6 +73,22 @@ def run_experiment(params): print(f'fitting {qname} for {drift}-drift') + + def load_test_samples(): + ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy')) + ids = set(ids) + folderpath = join(datapath, domain, protocol, 'test_samples') + for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)): + yield sample.instances, sample.prevalence() + + + def load_dev_samples(): + ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy')) + ids = set(ids) + folderpath = join(datapath, domain, protocol, 'dev_samples') + for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)): + yield sample.instances, sample.prevalence() + q = qp.model_selection.GridSearchQ( q, param_grid, @@ -125,22 +125,34 @@ def run_experiment(params): 
if __name__ == '__main__': - domain = 'Books-tfidf' + #preprocessing = 'roberta.last' + preprocessing = 'roberta.average' + #preprocessing = 'tfidf' + if preprocessing=='tfidf': + domain = 'Books-tfidf' + posfix = '' + elif preprocessing=='roberta.last': + domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last' + posfix = '-RoBERTa-last' + elif preprocessing=='roberta.average': + domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average' + posfix = '-RoBERTa-average' + load_sample_fn = load_single_sample_pkl datapath = './data' protocol = 'app' resultpath = join('./results', domain, protocol) os.makedirs(resultpath, exist_ok=True) - train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb')) + train = load_sample_fn(join(datapath, domain), 'training_data') with open(join(resultpath, 'hyper.txt'), 'at') as foo: - for drift in ['low', 'mid', 'high', 'all']: - params = [(*qs, drift) for qs in quantifiers()] - hypers = qp.util.parallel(run_experiment, params, n_jobs=-2) - for h in hypers: - if h is not None: - foo.write(h) - foo.write('\n') + #for drift in [f'smooth{i}' for i in range(5)] + ['all']: + params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']] + hypers = qp.util.parallel(run_experiment, params, n_jobs=-2) + for h in hypers: + if h is not None: + foo.write(h) + foo.write('\n') diff --git a/Ordinal/partition_dataset_by_shift.py b/Ordinal/partition_dataset_by_shift.py index fea213d..3b7aaa7 100644 --- a/Ordinal/partition_dataset_by_shift.py +++ b/Ordinal/partition_dataset_by_shift.py @@ -1,7 +1,7 @@ import numpy as np import quapy as qp -from Ordinal.evaluation import nmd -from Ordinal.utils import load_samples_pkl +from evaluation import nmd +from Ordinal.utils import load_samples_folder, load_single_sample_pkl from quapy.data import LabelledCollection import pickle import os @@ -13,7 +13,8 @@ def partition_by_drift(split, training_prevalence): assert split in ['dev', 'test'], 'invalid 
split name' total=1000 if split=='dev' else 5000 drifts = [] - for sample in tqdm(load_samples_pkl(join(datapath, domain, 'app', f'{split}_samples')), total=total): + folderpath = join(datapath, domain, 'app', f'{split}_samples') + for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total): drifts.append(nmd(training_prevalence, sample.prevalence())) drifts = np.asarray(drifts) order = np.argsort(drifts) @@ -34,7 +35,7 @@ def partition_by_drift(split, training_prevalence): print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}') -domain = 'Books-tfidf' +domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average' datapath = './data' training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb')) diff --git a/Ordinal/partition_dataset_by_smoothness.py b/Ordinal/partition_dataset_by_smoothness.py new file mode 100644 index 0000000..616ba5b --- /dev/null +++ b/Ordinal/partition_dataset_by_smoothness.py @@ -0,0 +1,36 @@ +import numpy as np +from Ordinal.evaluation import smoothness +from Ordinal.utils import load_samples_folder, load_single_sample_pkl + +from os.path import join +from tqdm import tqdm + + +def partition_by_smoothness(split): + assert split in ['dev', 'test'], 'invalid split name' + total=1000 if split=='dev' else 5000 + smooths = [] + folderpath = join(datapath, domain, 'app', f'{split}_samples') + for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total): + smooths.append(smoothness(sample.prevalence())) + smooths = np.asarray(smooths) + order = np.argsort(smooths) + nD = len(order) + low2high_smooth = np.array_split(order, 5) + all_drift = np.arange(nD) + for i, smooth_idx in enumerate(low2high_smooth): + block = smooths[smooth_idx] + print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}') + np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), 
smooth_idx) + np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), all_drift) + + +#domain = 'Books-tfidf' +domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average' +datapath = './data' + +#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb')) + +partition_by_smoothness('dev') +partition_by_smoothness('test') + diff --git a/Ordinal/preprocess_dataset.py b/Ordinal/preprocess_dataset.py index 38f24df..0e273c4 100644 --- a/Ordinal/preprocess_dataset.py +++ b/Ordinal/preprocess_dataset.py @@ -4,7 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer from os.path import join import os import pickle -from utils import load_samples +from utils import load_samples_raw from tqdm import tqdm import shutil @@ -40,7 +40,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic def transform_folder_samples(protocol, splitname): - for i, sample in tqdm(enumerate(load_samples(join(datapath, domain, protocol, splitname), classes=train.classes_))): + for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))): sample.instances = tfidf.transform(sample.instances) pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) diff --git a/Ordinal/preprocess_dataset_npytxt2pkl.py b/Ordinal/preprocess_dataset_npytxt2pkl.py new file mode 100644 index 0000000..eb37894 --- /dev/null +++ b/Ordinal/preprocess_dataset_npytxt2pkl.py @@ -0,0 +1,47 @@ +import quapy as qp +from quapy.data import LabelledCollection +from sklearn.feature_extraction.text import TfidfVectorizer +from os.path import join +import os +import pickle +from utils import * +from tqdm import tqdm +import shutil + + +vector_generation = 'average' + +datapath = './data' +domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}' +outname = domain.replace('-finetuned', '-finetuned-pkl') + +protocol = 'app' + 
+print('pickling npy txt files') +print('from:', join(datapath, domain)) +print('to', join(datapath, outname)) +print('for protocol:', protocol) + +os.makedirs(join(datapath, outname), exist_ok=True) +os.makedirs(join(datapath, outname, protocol), exist_ok=True) +os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True) +os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True) +shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt')) +shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt')) + + +train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5)) +pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) + + +def transform_folder_samples(protocol, splitname): + folder_dir=join(datapath, domain, protocol, splitname) + for i, sample in tqdm(enumerate(load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_))): + pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) + + +transform_folder_samples(protocol, 'dev_samples') +transform_folder_samples(protocol, 'test_samples') + + + diff --git a/Ordinal/preprocess_dataset_tfidf.py b/Ordinal/preprocess_dataset_tfidf.py new file mode 100644 index 0000000..0e273c4 --- /dev/null +++ b/Ordinal/preprocess_dataset_tfidf.py @@ -0,0 +1,54 @@ +import quapy as qp +from quapy.data import LabelledCollection +from sklearn.feature_extraction.text import TfidfVectorizer +from os.path import join +import os +import pickle +from utils import load_samples_raw +from tqdm import tqdm +import shutil + + +datapath = './data' +domain = 'Books' +outname = domain + '-tfidf' + +def save_preprocessing_info(transformer): + with open(join(datapath, outname, 'prep-info.txt'), 
'wt') as foo: + foo.write(f'{str(transformer)}\n') + + +os.makedirs(join(datapath, outname), exist_ok=True) +os.makedirs(join(datapath, outname, 'app'), exist_ok=True) +os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True) +os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True) +shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt')) +shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt')) +os.makedirs(join(datapath, outname, 'npp'), exist_ok=True) +os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True) +os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True) +shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt')) +shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt')) + + +tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5) + +train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text) +train.instances = tfidf.fit_transform(train.instances) +save_preprocessing_info(tfidf) +pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) + + +def transform_folder_samples(protocol, splitname): + for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))): + sample.instances = tfidf.transform(sample.instances) + pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) + + +transform_folder_samples('app', 'dev_samples') +transform_folder_samples('app', 'test_samples') +transform_folder_samples('npp', 'dev_samples') +transform_folder_samples('npp', 'test_samples') + + + diff --git 
def load_simple_sample_npytxt(parentdir, filename, classes=None):
    """Load a sample stored as a numeric text matrix into a LabelledCollection.

    The file ``<parentdir>/<filename>.txt`` is read with ``np.loadtxt``; the
    first column is taken as the (integer) label and the remaining columns as
    the document vector.

    :param parentdir: directory containing the file.
    :param filename: file name without the ``.txt`` extension.
    :param classes: forwarded to ``LabelledCollection`` as ``classes_``.
    :return: a ``LabelledCollection`` with the vectors as instances.
    """
    samplepath = join(parentdir, filename + '.txt')
    # ndmin=2 keeps a file holding a single document as a (1, n_cols) matrix;
    # without it np.loadtxt returns a 1-D array and the column slicing fails.
    yX = np.loadtxt(samplepath, ndmin=2)
    X = yX[:, 1:]
    y = yX[:, 0].astype(np.int32)
    return LabelledCollection(instances=X, labels=y, classes_=classes)
def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
    """Lazily load the numbered samples stored in `path_dir`, in increasing id order.

    A sample is any entry of `path_dir` whose name, without its extension, is
    a non-negative integer (``0.txt``, ``1.pkl``, ...). Other entries are
    ignored, so stray files no longer inflate the number of samples.

    :param path_dir: folder containing the samples.
    :param filter: optional container of integer ids; when given, only the
        samples whose id is in it are loaded.
    :param load_fn: callable invoked as
        ``load_fn(path_dir, '<id>', **load_fn_kwargs)``; it receives the file
        name without extension and is responsible for adding it.
    :param load_fn_kwargs: extra keyword arguments forwarded to `load_fn`.
    :return: a generator yielding whatever `load_fn` returns for each sample.
    :raises TypeError: on first iteration, if `load_fn` is not provided.
    """
    if load_fn is None:
        raise TypeError('load_fn is required: pass the function that loads a single sample')

    sample_ids = set()  # a set, so that '0.txt' and '0.pkl' count as one sample
    for entry in glob(join(path_dir, '*')):
        stem = os.path.splitext(os.path.basename(entry))[0]
        # keep canonical integers only: load_fn gets str(id), so a name like
        # '007' could not be mapped back to its file
        if stem.isdigit() and str(int(stem)) == stem:
            sample_ids.add(int(stem))

    for sample_id in sorted(sample_ids):
        if filter is None or sample_id in filter:
            yield load_fn(path_dir, f'{sample_id}', **load_fn_kwargs)