18 changed files with 261 additions and 1569 deletions
--- a/Ordinal/build_Amazon_datasets.py
+++ b/Ordinal/build_Amazon_datasets.py
@ -1,17 +1,14 @@
 import gzip
 import quapy as qp
 from Ordinal.utils import load_simple_sample_raw
 from quapy.data import LabelledCollection
 import quapy.functional as F
 import os
 from os.path import join
 from pathlib import Path
 import numpy as np
 datadir = '/mnt/1T/Datasets/Amazon/reviews'
 outdir  = './data/'
 real_prev_path = './data/Books-real-prevalence-by-product_votes1_reviews100.csv'
 domain = 'Books'
 seed = 7
@ -21,6 +18,13 @@ te_size = 1000
 nval = 1000
 nte = 5000
 # domain = 'Gift_Cards'
 # tr_size = 200
 # val_size = 100
 # te_size = 100
 # nval = 20
 # nte = 40
 def from_gz_text(path, encoding='utf-8', class2int=True):
    """
@ -66,6 +70,7 @@ def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
            write_txt_sample(sample, join(outdir, f'{i}.txt'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
 def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
@ -75,69 +80,37 @@ def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
 def gen_samples_real_prevalences(real_prevalences, pool: LabelledCollection, sample_size, outdir, prevpath_out):
    """
    Draw one sample from `pool` for every prevalence vector in `real_prevalences`.

    Each sample is stored as `<outdir>/<i>.txt` (via `write_txt_sample`) and the prevalence actually
    obtained in the sample is appended, with three decimals, as a row of the csv file `prevpath_out`.

    :param real_prevalences: iterable of prevalence vectors, one per sample to generate
    :param pool: the collection the samples are drawn from
    :param sample_size: number of instances per sample
    :param outdir: folder receiving the sample files (created if missing)
    :param prevpath_out: path of the csv file listing the prevalence of every generated sample
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath_out, 'wt') as prevfile:
        header = ','.join(f'{c}' for c in pool.classes_)
        prevfile.write('id,' + header + '\n')
        for sample_id, target_prev in enumerate(real_prevalences):
            # the last component is not passed on; presumably `sampling` derives it from the others -- confirm
            drawn = pool.sampling(sample_size, *target_prev[:-1])
            write_txt_sample(drawn, join(outdir, f'{sample_id}.txt'))
            observed = ','.join(f'{p:.3f}' for p in drawn.prevalence())
            prevfile.write(f'{sample_id},' + observed + '\n')
 fullpath = join(datadir,domain)+'.txt.gz'
-# fullpath = join(datadir,domain)+'.txt.gz'
+data = LabelledCollection.load(fullpath, from_gz_text)
-#
+print(len(data))
-# data = LabelledCollection.load(fullpath, from_gz_text)
+print(data.classes_)
-# print(len(data))
+print(data.prevalence())
 # print(data.classes_)
 # print(data.prevalence())
 with qp.util.temp_seed(seed):
-    # train, rest = data.split_stratified(train_prop=tr_size)
+    train, rest = data.split_stratified(train_prop=tr_size)
-    #
+
-    # devel, test = rest.split_stratified(train_prop=0.5)
+    devel, test = rest.split_stratified(train_prop=0.5)
-    # print(len(train))
+    print(len(train))
-    # print(len(devel))
+    print(len(devel))
-    # print(len(test))
+    print(len(test))
-    #
+
    domaindir = join(outdir, domain)
-    # write_txt_sample(train, join(domaindir, 'training_data.txt'))
+    write_txt_sample(train, join(domaindir, 'training_data.txt'))
-    # write_txt_sample(devel, join(domaindir, 'development_data.txt'))
+    write_txt_sample(devel, join(domaindir, 'development_data.txt'))
-    # write_txt_sample(test, join(domaindir, 'test_data.txt'))
+    write_txt_sample(test, join(domaindir, 'test_data.txt'))
-    # this part is to be used when the partitions have already been created, in order to avoid re-generating them
+    gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
-    train = load_simple_sample_raw(domaindir, 'training_data')
+                    prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
-    devel = load_simple_sample_raw(domaindir, 'development_data')
+    gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
-    test = load_simple_sample_raw(domaindir, 'test_data')
+                    prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
-    # gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
+    gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
-    #                 prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
+                    prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
-    # gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
+    gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
-    #                 prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
+                    prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
    # gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
    #                 prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
    # gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
    #                 prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
    # this part generates samples based on real prevalences (in this case, prevalences of sets of book reviews
    # grouped by product). It loads the real prevalences (computed elsewhere), and randomly extracts 5000 for test
    # and 1000 for val (disjoint). The samples are then drawn according to those prevalences
    assert os.path.exists(real_prev_path), f'real prevalence file does not seem to exist...'
    real_prevalences = np.genfromtxt(real_prev_path, delimiter='\t')
    nrows = real_prevalences.shape[0]
    rand_sel = np.random.permutation(nrows)
    real_prevalences_val = real_prevalences[rand_sel[:nval]]
    real_prevalences_te  = real_prevalences[rand_sel[nval:nval+nte]]
    gen_samples_real_prevalences(real_prevalences_val, devel, sample_size=val_size, outdir=join(domaindir, 'real', 'dev_samples'),
                    prevpath_out=join(domaindir, 'real', 'dev_prevalences.txt'))
    gen_samples_real_prevalences(real_prevalences_te, test, sample_size=te_size, outdir=join(domaindir, 'real', 'test_samples'),
                    prevpath_out=join(domaindir, 'real', 'test_prevalences.txt'))
--- a/Ordinal/build_Telescope_datasets.py
+++ b/Ordinal/build_Telescope_datasets.py
@ -1,116 +0,0 @@
 import gzip
 import quapy as qp
 import numpy as np
 import pandas as pd
 from quapy.data import LabelledCollection
 import quapy.functional as F
 import os
 from os.path import join
 from pathlib import Path
 import pickle
 datadir = '../OrdinalQuantification'
 outdir  = './data/'
 domain = 'fact'
 seed = 7
 tr_size = 20000
 val_size = 1000
 te_size = 1000
 nval = 1000
 nte = 5000
 def from_csv(path):
    """
    Load a csv file and return the pair (X, y).

    y holds the ordinal class of every row, obtained by binning the 'log10_energy' column;
    X holds every column but the first one, as a float32 matrix.

    :param path: path to the csv file
    :return: tuple (X, y)
    """
    frame = pd.read_csv(path)
    # divide the continuous labels into ordered classes: the bin edges are the inner points of a
    # 0.15-spaced grid starting at 2.4 (the first and the last grid points are discarded)
    grid = np.arange(start=2.4, stop=4.2, step=0.15)
    energy_boundaries = grid[1:-1]
    # note: omitting the dtype will result in a single instance having a different class
    log_energy = np.array(frame['log10_energy'], dtype=np.float32)
    y = np.digitize(log_energy, energy_boundaries)
    # obtain a matrix of shape (n_samples, n_features)
    feature_columns = frame.iloc[:, 1:].to_numpy()
    X = feature_columns.astype(np.float32)
    return X, y
 def write_pkl(sample: LabelledCollection, path):
    """
    Pickle `sample` into the file `path`, creating the parent folders if they do not exist.

    :param sample: the object to serialize
    :param path: destination file; it is overwritten if it already exists
    """
    os.makedirs(Path(path).parent, exist_ok=True)
    # the context manager guarantees that the file is flushed and closed even if pickling fails
    with open(path, 'wb') as fout:
        pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
 def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """
    Generate `nsamples` samples from `pool`, each at a prevalence vector returned by
    `F.uniform_simplex_sampling`.

    Each sample is pickled as `<outdir>/<i>.pkl` and the prevalence actually obtained in the sample is
    appended, with three decimals, as a row of the csv file `prevpath`.

    :param pool: the collection the samples are drawn from
    :param nsamples: number of samples to generate
    :param sample_size: number of instances per sample
    :param outdir: folder receiving the sample files (created if missing)
    :param prevpath: path of the csv file listing the prevalence of every generated sample
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        header = ','.join(f'{c}' for c in pool.classes_)
        prevfile.write('id,' + header + '\n')
        target_prevalences = F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)
        for sample_id, target_prev in enumerate(target_prevalences):
            drawn = pool.sampling(sample_size, *target_prev)
            write_pkl(drawn, join(outdir, f'{sample_id}.pkl'))
            observed = ','.join(f'{p:.3f}' for p in drawn.prevalence())
            prevfile.write(f'{sample_id},' + observed + '\n')
 def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """
    Generate `nsamples` samples from `pool` using its `natural_sampling_generator`.

    Each sample is pickled as `<outdir>/<i>.pkl` and its prevalence is appended, with three decimals,
    as a row of the csv file `prevpath`.

    :param pool: the collection the samples are drawn from
    :param nsamples: number of samples to generate
    :param sample_size: number of instances per sample
    :param outdir: folder receiving the sample files (created if missing)
    :param prevpath: path of the csv file listing the prevalence of every generated sample
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        header = ','.join(f'{c}' for c in pool.classes_)
        prevfile.write('id,' + header + '\n')
        sample_stream = pool.natural_sampling_generator(sample_size, repeats=nsamples)
        for sample_id, drawn in enumerate(sample_stream):
            write_pkl(drawn, join(outdir, f'{sample_id}.pkl'))
            observed = ','.join(f'{p:.3f}' for p in drawn.prevalence())
            prevfile.write(f'{sample_id},' + observed + '\n')
 # load the whole dataset; from_csv turns the continuous energy values into ordinal classes
 fullpath = join(datadir,domain, 'fact_wobble.csv')
 data = LabelledCollection.load(fullpath, from_csv)
 # discard every row that contains a non-finite feature value: first the NaNs, then the infinities.
 # One pass per kind is enough, since after removing the offending rows none of that kind can remain.
 for is_invalid, kind in ((np.isnan, 'nan'), (np.isinf, 'inf')):
    invalid_mask = is_invalid(data.instances)
    if invalid_mask.any():
        rows, _ = np.where(invalid_mask)
        data.instances = np.delete(data.instances, rows, axis=0)
        data.labels = np.delete(data.labels, rows, axis=0)
        print(f'deleted {kind} rows')
 print(len(data))
 print(data.classes_)
 print(data.prevalence())
 # everything random below runs under a fixed seed so that the generated dataset is reproducible
 with qp.util.temp_seed(seed):
    # NOTE(review): tr_size is an absolute number of instances passed as train_prop; presumably
    # split_stratified accepts counts as well as proportions -- confirm
    train, rest = data.split_stratified(train_prop=tr_size)
    # the remaining instances are split evenly into the development and the test pools
    devel, test = rest.split_stratified(train_prop=0.5)
    print(len(train))
    print(len(devel))
    print(len(test))
    domaindir = join(outdir, domain)
    write_pkl(train, join(domaindir, 'training_data.pkl'))
    write_pkl(devel, join(domaindir, 'development_data.pkl'))
    write_pkl(test, join(domaindir, 'test_data.pkl'))
    # samples generated with the APP protocol (see gen_samples_APP)
    gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
                    prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
    gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
                    prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
    # samples generated with the NPP protocol (see gen_samples_NPP)
    gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
                    prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
    gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
                    prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
--- a/Ordinal/evaluation.py
+++ b/Ordinal/evaluation.py
@ -1,11 +1,6 @@
 import numpy as np
 # smoothing approximation
 def smoothness(p):
    """
    Return half the sum of the squared second differences of `p`.

    For every three consecutive values (a, b, c) of `p` the term (-a + 2*b - c)**2 is accumulated;
    a sequence with fewer than three values yields 0.0.

    :param p: a sequence of numbers supporting slicing (e.g., a list or a 1-d array)
    :return: the smoothness value
    """
    consecutive_triples = zip(p[:-2], p[1:-1], p[2:])
    squared_second_diffs = ((-left + 2*mid - right)**2 for left, mid, right in consecutive_triples)
    return 0.5 * sum(squared_second_diffs)
 def _check_arrays(prevs):
    prevs = np.asarray(prevs)
    if prevs.ndim==1:
@ -13,7 +8,6 @@ def _check_arrays(prevs):
    return prevs
 # mean normalized match distance
 def mnmd(prevs, prevs_hat):
    prevs = _check_arrays(prevs)
    prevs_hat = _check_arrays(prevs_hat)
@ -23,7 +17,6 @@ def mnmd(prevs, prevs_hat):
    return np.mean(nmds)
 # normalized match distance
 def nmd(prev, prev_hat):
    """
    Normalized match distance: the value of `mdpa(prev, prev_hat)` scaled by 1/(n-1),
    where n is the number of entries of `prev`.

    :param prev: true prevalence values
    :param prev_hat: estimated prevalence values
    :return: the normalized distance
    """
    n_classes = len(prev)
    scale = 1./(n_classes-1)
    return scale*mdpa(prev, prev_hat)
--- a/Ordinal/experiments_lr_vs_ordlr.py
+++ b/Ordinal/experiments_lr_vs_ordlr.py
@ -1,150 +0,0 @@
 import numpy as np
 import quapy as qp
 import os
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import StandardScaler
 from Ordinal.model import RegressionQuantification, LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge
 from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
 from os.path import join
 from utils import load_samples_folder, load_single_sample_pkl
 from evaluation import nmd, mnmd
 from tqdm import tqdm
 """
 This script generates all results from Table 1 in the paper, i.e., all results comparing quantifiers equipped with
 standard logistic regression against quantifiers equipped with order-aware classifiers
 """
 def quantifiers():
    """
    Generate the triples (name, quantifier, hyperparameter grid) of all the methods to be tested.

    The methods are yielded in this order: the five quantifiers built on top of logistic regression,
    the same five on top of each threshold-based ordinal classifier (AT, SE, IT), and finally CC and ACC
    on top of each regression-based classifier (LAD, OrdinalRidge).
    """
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize':[True,False]}

    all_families = (('CC', CC), ('PCC', PCC), ('ACC', ACC), ('PACC', PACC), ('SLD', EMQ))
    # only CC and ACC are paired with the regression-based classifiers
    cc_and_acc = (('CC', CC), ('ACC', ACC))

    # baselines
    for family, aggregative in all_families:
        yield f'{family}(LR)', aggregative(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    threshold_based = (('OLR-AT', LogisticAT), ('OLR-SE', LogisticSE), ('OLR-IT', LogisticIT))
    for tag, classifier in threshold_based:
        for family, aggregative in all_families:
            yield f'{family}({tag})', aggregative(classifier()), params_OLR

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    regression_based = (('LAD', LAD, params_SVR), ('ORidge', OrdinalRidge, params_Ridge))
    for tag, regressor, grid in regression_based:
        for family, aggregative in cc_and_acc:
            yield f'{family}({tag})', aggregative(regressor()), grid
 def run_experiment(params):
    """
    Run model selection, evaluation, and the regression-based correction for one quantifier.

    The function relies on names defined in the `__main__` section of this script: posfix, resultpath,
    datapath, domain, protocol, load_sample_fn, train, and (only when posfix == '-std') zscore.

    :param params: a triple (qname, q, param_grid), as generated by `quantifiers()`
    :return: a tab-separated line with the method name, the best hyperparameters and the best score found
        by the grid search, or None if the result file of this method already exists
    """
    qname, q, param_grid = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.all.csv')
    # results are cached on disk: a method whose result file exists is not run again
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None
    print(f'fitting {qname} for all-drift')
    # generator of (instances, prevalence) pairs for all the test samples
    def load_test_samples():
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=5000):
            if posfix == '-std':
                # apply the standardization that was fitted on the training set
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()
    # generator of (instances, prevalence) pairs for all the development samples
    def load_dev_samples():
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=1000):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()
    # model selection: the grid is explored using the development samples, with mnmd as the error
    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        # 2 hours, assuming the timeout is expressed in seconds -- TODO confirm
        timeout=60*60*2,
        verbose=True).fit(train)
    hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}'
    print('[done]')
    # evaluation of the selected model on the test samples, in terms of nmd
    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)
    print('[learning regressor-based adjustment]')
    # the best model is wrapped in a RegressionQuantification that is given the development samples;
    # the wrapper is evaluated on the same test samples and its report goes to a separate '.reg.csv' file
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)
    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.all.reg.csv')
    report.to_csv(resultfile, index=False)
    return hyperparams
 if __name__ == '__main__':
    # dataset to run the experiments on (a sub-folder of datapath); alternative setups are kept commented out
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    #domain = 'Books-tfidf'
    posfix = ''
    # domain = 'fact'
    # posfix = '-std'  # set to '' to avoid standardization
    # posfix = ''
    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)
    train = load_sample_fn(join(datapath, domain), 'training_data')
    # optional standardization: the scaler is fitted on the training set and later reused by run_experiment
    if posfix=='-std':
        zscore = StandardScaler()
        train.instances = zscore.fit_transform(train.instances)
    # the file is opened in append mode, so the hyperparameters of previous runs are preserved
    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        hypers = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3)
        for h in hypers:
            # run_experiment returns None for the methods that had already been computed
            if h is not None:
                foo.write(h)
                foo.write('\n')
--- a/Ordinal/finetune_bert.py
+++ b/Ordinal/finetune_bert.py
@ -1,105 +0,0 @@
 import csv
 import sys
 import datasets
 import numpy as np
 import pandas as pd
 import torch.cuda
 from datasets import Dataset, DatasetDict
 from sklearn.metrics import f1_score
 from sklearn.model_selection import train_test_split
 from transformers import AutoModelForSequenceClassification
 from transformers import AutoTokenizer, DataCollatorWithPadding
 from transformers import Trainer
 from transformers import TrainingArguments
 """
 This script fine-tunes a pre-trained language model on a given textual training set.
 The training goes for a maximum of 5 epochs, but stores the model parameters of the best performing epoch according
 to the validation loss in a hold-out val split of 1000 documents (stratified).
 We used it with RoBERTa in the training set of the Amazon-OQ-BK domain, i.e.:
 $> python3 finetune_bert.py ./data/Books/training_data.txt roberta-base
 """
 def tokenize_function(example):
    """
    Tokenize the 'review' field of `example` with the module-level `tokenizer`.

    Sequences are padded to the maximum length and truncated; the maximum length is 64 when the
    module-level flag `debug` is set, and 256 otherwise.
    """
    if debug:
        length_cap = 64
    else:
        length_cap = 256
    return tokenizer(example['review'], padding='max_length', truncation=True, max_length=length_cap)
 def compute_metrics(eval_preds):
    """
    Compute macro- and micro-averaged F1 from a pair (logits, labels).

    The predicted class of every item is the argmax of its logits along the last axis.

    :param eval_preds: a pair (logits, labels)
    :return: a dict with the keys 'macro-f1' and 'micro-f1'
    """
    logits, gold = eval_preds
    predicted = np.argmax(logits, axis=-1)
    scores = {}
    scores['macro-f1'] = f1_score(gold, predicted, average='macro')
    scores['micro-f1'] = f1_score(gold, predicted, average='micro')
    return scores
 if __name__ == '__main__':
    # when True, shorter sequences and only 500 documents per split are used (see below and tokenize_function)
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'
    # datapath = './data/Books/training_data.txt'
    # checkpoint = 'roberta-base'
    n_args = len(sys.argv)
    assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'
    datapath = sys.argv[1]  # './data/Books/training_data.txt'
    checkpoint = sys.argv[2]  #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
    # output folder of the trainer (first positional argument of TrainingArguments)
    modelout = checkpoint+'-val-finetuned'
    # load the training set (tab-separated, one "<label>\t<review>" pair per line, no quoting), and extract
    # a held-out stratified validation split.
    # NOTE(review): test_size=.25 holds out 25% of the rows, which amounts to 1000 documents only if the
    # training file contains 4000 rows -- confirm
    df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df['labels'].to_frame()
    X_train, X_val = train_test_split(df, stratify=labels, test_size=.25, random_state=1)
    num_labels = len(pd.unique(labels['labels']))
    features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
    train = Dataset.from_pandas(df=X_train, split='train', features=features)
    validation = Dataset.from_pandas(df=X_val, split='validation', features=features)
    dataset = DatasetDict({
        'train': train.select(range(500)) if debug else train,
        'validation': validation.select(range(500)) if debug else validation
    })
    # tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()
    # fine-tuning: evaluation and checkpointing take place once per epoch
    training_args = TrainingArguments(
        modelout,
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # eval_steps=10,
        save_total_limit=1,
        load_best_model_at_end=True
    )
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()
--- a/Ordinal/gen_tables_amazon.py
+++ b/Ordinal/gen_tables_amazon.py
@ -1,70 +0,0 @@
 import pandas as pd
 from os.path import join
 import os
 from glob import glob
 from pathlib import Path
 from Ordinal.main import quantifiers
 from Ordinal.tabular import Table
 """
 This script generates some tables for Amazon-OQ-BK (for internal use only)
 """
 # domains (sub-folders of ./results) whose result files are collected into a single table
 domain = 'Books-tfidf'
 domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
 domain_bert_ave  = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
 domain_bert_post  = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
 prot = 'app'
 outpath = f'./tables/{domain}/{prot}/results.tex'
 resultpath = join('./results', domain, prot)
 resultpath_bertlast = join('./results', domain_bert_last, prot)
 resultpath_bertave = join('./results', domain_bert_ave, prot)
 resultpath_bertpost = join('./results', domain_bert_post, prot)
 # the rows of the table: every method name, plus its three RoBERTa-based variants
 methods = [qname for qname, *_ in quantifiers()]
 methods += ['SLD(LR)-agg']
 methods_Rlast = [m+'-RoBERTa-last' for m in methods]
 methods_Rave = [m+'-RoBERTa-average' for m in methods]
 methods_Rpost = [m+'-RoBERTa-posteriors' for m in methods]
 methods = methods + methods_Rlast + methods_Rave + methods_Rpost
 # methods += [m+'-r' for m in methods]
 table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)
 resultfiles = list(glob(f'{resultpath}/*.csv')) \
              + list(glob(f'{resultpath_bertlast}/*.csv')) \
              + list(glob(f'{resultpath_bertave}/*.csv')) \
              + list(glob(f'{resultpath_bertpost}/*.csv'))
 for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    # result files are named <method>.<drift>[.<extra>].csv
    resultname = Path(resultfile).name
    method, drift, *other = resultname.replace('.csv', '').split('.')
    # any extra name component marks a variant of the method, which is tagged with the suffix '-r'
    if other:
        method += '-r'
    # results of methods that are not rows of the table are ignored
    if method not in methods:
        continue
    table.add(drift, method, nmd)
 os.makedirs(Path(outpath).parent, exist_ok=True)
 # every backslash of the LaTeX code is escaped explicitly: sequences such as "\h" or "\e" are invalid
 # escapes that Python only tolerates with a warning
 tabular = """
    \\resizebox{\\textwidth}{!}{%
            \\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \\hline
            """
 tabular += table.latexTabularT(average=False)
 tabular += """
    \\end{tabular}%
    }"""
 print('saving table in', outpath)
 with open(outpath, 'wt') as foo:
    foo.write(tabular)
    foo.write('\n')
 print('[done]')
--- a/Ordinal/gen_tables_telescope.py
+++ b/Ordinal/gen_tables_telescope.py
@ -1,82 +0,0 @@
 import pandas as pd
 from os.path import join
 import os
 from glob import glob
 from pathlib import Path
 from Ordinal.experiments_lr_vs_ordlr import quantifiers
 from Ordinal.tabular import Table
 """
 This script generates some tables for Fact-OQ (for internal use only)
 """
 # domain (sub-folder of ./results) the table is generated for; alternatives are kept commented out
 #domain = 'fact'
 #domain = 'Books-tfidf'
 domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
 prot = 'app'
 outpath = f'./tables/{domain}/{prot}/results.tex'
 resultpath = join('./results', domain, prot)
 # when True, the variants computed on standardized data (suffix '-std') are reported instead
 withstd=False
 methods = [qname for qname, *_ in quantifiers()]
 if withstd:
    methods = [m+'-std' for m in methods]
 #methods = methods + methods_variant
 # methods += [m+'-r' for m in methods]
 # the table is indexed by quantifier family and by classifier variant
 quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
 # method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD']
 method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']
 if withstd:
    method_variants = [m+'-std' for m in method_variants]
 print('families:', quantifiers_families)
 print('variants', method_variants)
 # the backslash of the LaTeX command is escaped explicitly ("\m" is an invalid escape sequence)
 table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4,
              color=False, show_rel_to=0, missing_str='\\multicolumn{1}{c}{---}', clean_zero=True)
 # only the files whose name ends with ').all.csv' are considered
 resultfiles = list(glob(f'{resultpath}/*).all.csv'))
 for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    # result files are named <method>.<drift>[.<extra>].csv
    resultname = Path(resultfile).name
    method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average','').split('.')
    if drift!='all':
        continue
    if other:
        method += '-r'
    if method not in methods:
        continue
    # a method name has the form <family>(<variant>)
    family, variant = method.split('(')
    variant = variant.replace(')', '')
    if variant not in method_variants:
        continue
    table.add(family, variant, nmd)
 os.makedirs(Path(outpath).parent, exist_ok=True)
 tabular = """
    \\resizebox{\\textwidth}{!}{%
            \\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """} 
            \\toprule            
            """
 tabular += table.latexTabularT(average=False)
 # "\\end" is written with an escaped backslash ("\e" is an invalid escape sequence)
 tabular += """
    \\end{tabular}%
    }"""
 print('saving table in', outpath)
 with open(outpath, 'wt') as foo:
    foo.write(tabular)
    foo.write('\n')
 print('[done]')
--- a/Ordinal/generate_bert_vectors_npytxt.py
+++ b/Ordinal/generate_bert_vectors_npytxt.py
@ -1,152 +0,0 @@
 import sys
 import numpy as np
 import torch
 from torch.utils.data import DataLoader
 from transformers import AutoTokenizer
 from transformers import AutoModelForSequenceClassification
 from os.path import join
 import os
 import shutil
 from tqdm import tqdm
 from Ordinal.utils import load_samples_folder, load_single_sample_as_csv
 """
 This script takes a pre-trained model (a fine-tuned one) and generates numerical representations for all
 samples in the dataset. The representations are saved in npy-txt plain format.
 """
 def tokenize_function(example):
    """
    Tokenize the 'review' field of `example` with the module-level `tokenizer` and move the resulting
    tensors to the GPU.

    The maximum length is 64 when the module-level flag `debug` is set; otherwise None is passed.

    :return: a dict with the keys 'input_ids' and 'attention_mask'
    """
    length_cap = 64 if debug else None
    encoded = tokenizer(
        example['review'], padding='max_length', truncation=True, max_length=length_cap, return_tensors='pt'
    )
    model_inputs = {}
    model_inputs['input_ids'] = encoded.input_ids.cuda()
    model_inputs['attention_mask'] = encoded.attention_mask.cuda()
    return model_inputs
 def save_samples_as_txt(tensors, labels, path):
    """
    Save the labels and the vectors of a sample as a plain-text matrix.

    Every row of the file contains the label (formatted as an integer) followed by the components of
    the vector (formatted with '%f').

    :param tensors: matrix with one vector per row
    :param labels: object exposing the labels through `.values` (the result is stacked column-wise with
        the vectors, so it is assumed to be a column with one row per vector)
    :param path: destination file
    """
    label_column = labels.values
    labelled_rows = np.hstack([label_column, tensors])
    column_formats = ['%d'] + ['%f'] * tensors.shape[1]
    np.savetxt(path, labelled_rows, fmt=column_formats)
 def transform_sample(instances, labels, outpath, batch_size=50):
    """
    Convert the documents of a sample into vectors with the module-level `model`, and save them (together
    with the labels) in `outpath` via `save_samples_as_txt`.

    The kind of vector depends on the module-level variable `generation_mode`:
    - 'posteriors': the softmax of the logits
    - 'last': the vector at position 0 of the last hidden layer
    - 'average': the average, across all the hidden layers, of the vectors at position 0

    :param instances: the documents of the sample; they are sliced in batches and passed to `tokenize_function`
    :param labels: the labels of the documents
    :param outpath: destination file
    :param batch_size: number of documents processed in each call to the model
    :raises AssertionError: if the number of documents is not a multiple of `batch_size`
    :raises NotImplementedError: if `generation_mode` is not one of the modes listed above
    """
    ndocs = len(labels)
    # the remainder has to be computed against the batch size itself; dividing by the number of batches
    # lets fragmented batches through and fails with ZeroDivisionError when ndocs < batch_size
    assert ndocs % batch_size == 0, 'fragmented last bach not supported'
    transformations = []
    for batch_id in range(0, ndocs, batch_size):
        batch_instances = instances[batch_id:batch_id + batch_size]
        tokenized_dataset = tokenize_function(batch_instances)
        out = model(**tokenized_dataset, output_hidden_states=True)
        if generation_mode == 'posteriors':
            logits = out.logits
            posteriors = torch.softmax(logits, dim=-1)
            transformed = posteriors
        elif generation_mode == 'last':
            hidden_states = out.hidden_states
            # position 0 of the sequence, taken from the last layer
            last_layer_cls = hidden_states[-1][:, 0, :]
            transformed = last_layer_cls
        elif generation_mode == 'average':
            hidden_states = out.hidden_states
            # the layers are stacked along a new first axis, which is then averaged out
            hidden_states = torch.stack(hidden_states)
            all_layer_cls = hidden_states[:, :, 0, :]
            average_cls = torch.mean(all_layer_cls, dim=0)
            transformed = average_cls
        else:
            raise NotImplementedError()
        transformations.append(transformed.cpu().numpy())
    transformations = np.vstack(transformations)
    save_samples_as_txt(transformations, labels, outpath)
 def transform_folder_samples(protocol, splitname, skip=0):
    """
    Transform all the samples of the folder `<datapath>/<domain>/<protocol>/<splitname>` and write them,
    one file per sample, in `<datapath>/<outname>/<protocol>/<splitname>` (datapath, domain, and outname
    are module-level variables).

    :param protocol: name of the sampling protocol (a sub-folder of the domain)
    :param splitname: name of the split folder; the progress bar assumes 1000 samples if the name starts
        with 'dev', and 5000 otherwise
    :param skip: samples whose index is lower than this value are not transformed
    """
    source_dir = join(datapath, domain, protocol, splitname)
    target_dir = join(datapath, outname, protocol, splitname)
    if splitname.startswith('dev'):
        expected = 1000
    else:
        expected = 5000
    sample_stream = enumerate(load_samples_folder(source_dir, load_fn=load_single_sample_as_csv))
    for idx, (instances, labels) in tqdm(sample_stream, desc=f'{protocol} {splitname}', total=expected):
        if idx < skip:
            continue
        transform_sample(instances, labels, outpath=join(target_dir, f'{idx}.txt'))
 def get_best_checkpoint(checkpointdir):
    """
    Return the path of the `checkpoint-<step>` sub-folder of `checkpointdir` with the smallest step.

    At most two checkpoints are expected in the folder (according to the assertion message, the best one
    and the last one); the one with the smallest step is chosen.

    :param checkpointdir: folder containing the `checkpoint-<step>` sub-folders
    :return: the path `<checkpointdir>/checkpoint-<smallest step>`
    :raises ValueError: if no checkpoint is found, or if the step of a checkpoint is not an integer
    :raises AssertionError: if more than two checkpoints are found
    """
    from glob import glob
    steps = []
    for folder in glob(f'{checkpointdir}/checkpoint-*'):
        # the step is whatever follows the LAST occurrence of 'checkpoint-', so that a parent folder
        # containing 'checkpoint-' in its name does not interfere
        step = int(folder.rsplit('checkpoint-', 1)[-1])
        steps.append(step)
    if not steps:
        raise ValueError(f'no checkpoint found in {checkpointdir}')
    assert len(steps) <= 2, 'unexpected number of steps, only two where expected (the best one and the last one)'
    choosen = f'{checkpointdir}/checkpoint-{min(steps)}'
    print(f'choosen checkpoint is {choosen}')
    return choosen
 if __name__ == '__main__':
    # when True, tokenize_function truncates the sequences to 64 tokens
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'
    #checkpoint='roberta-base-val-finetuned'
    #generation_mode = 'average'
    n_args = len(sys.argv)
    # the usage message names the modes exactly as they are validated below ('average', not 'ave')
    assert n_args==3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
                      '\tgeneration-mode: last (last layer), average (average pooling), or posteriors (posterior probabilities)'
    checkpoint = sys.argv[1]  #e.g., 'bert-base-uncased'
    generation_mode = sys.argv[2]  # e.g., 'last'
    assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'
    # from here on, checkpoint is the path of the selected checkpoint-<step> sub-folder
    checkpoint = get_best_checkpoint(checkpoint)
    num_labels = 5
    datapath = './data'
    domain = 'Books'
    protocols = ['real']  # ['app', 'npp']
    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_model'
    # name of the output dataset: it embeds the checkpoint path and the generation mode
    outname = domain + f'-{checkpoint}-{generation_mode}'
    # no gradients are needed, since the model is only used for inference
    with torch.no_grad():
        print('loading', checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()
        os.makedirs(join(datapath, outname), exist_ok=True)
        print('transforming the training set')
        instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
        transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
        print('[done]')
        for protocol in protocols:
            in_path = join(datapath, domain, protocol)
            out_path = join(datapath, outname, protocol)
            os.makedirs(out_path, exist_ok=True)
            os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
            os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
            # the prevalence files are copied verbatim from the input dataset
            shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
            shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))
            print('processing', protocol)
            transform_folder_samples(protocol, 'dev_samples')
            transform_folder_samples(protocol, 'test_samples')
--- a/Ordinal/inspect_dataset.py
+++ b/Ordinal/inspect_dataset.py
@ -0,0 +1,16 @@
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.data.reader import from_text
 from quapy.functional import strprev
 # quick inspection of a training set: prints its size and its class prevalence
 category = 'Books'
 datadir = './data'
 training_path = f'{datadir}/{category}/training_data.txt'
 # the training file is read with quapy's from_text loader
 data = LabelledCollection.load(training_path, loader_func=from_text)
 print(len(data))
 print(strprev(data.prevalence()))
--- a/Ordinal/main.py
+++ b/Ordinal/main.py
@ -3,154 +3,87 @@ from sklearn.linear_model import LogisticRegression
 import quapy as qp
 import numpy as np
-from Ordinal.model import OrderedLogisticRegression, LogisticAT
+from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier
-from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
+from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
 from quapy.data import LabelledCollection
 from os.path import join
-import os
+from utils import load_samples, load_samples_pkl
 from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
 from evaluation import nmd, mnmd
 from time import time
 import pickle
 from tqdm import tqdm
-import mord
+
-
+domain = 'Books-tfidf'
-
+datapath = './data'
-
+protocol = 'app'
-def quantifiers():
+drift = 'high'
-    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
+
-    # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
+train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
-    params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
+
-    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
+
-    # params_SVR = {'C': np.logspace(0, 1, 2)}
+def load_test_samples():
-
+    ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
-    # baselines
+    ids = set(ids)
-    yield 'CC(LR)', CC(LogisticRegression()), params_LR
+    for sample in tqdm(load_samples_pkl(join(datapath, domain, protocol, 'test_samples'), filter=ids), total=len(ids)):
-    yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
+        yield sample.instances, sample.prevalence()
-    yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
+
-    yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
+
-    #yield 'HDy(LR)', HDy(LogisticRegression()), params_LR
+def load_dev_samples():
-    yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR
+    ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
-
+    ids = set(ids)
-    # with order-aware classifiers
+    for sample in tqdm(load_samples_pkl(join(datapath, domain, protocol, 'dev_samples'), filter=ids), total=len(ids)):
-    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
+        yield sample.instances, sample.prevalence()
-    #yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
+
-    #yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
+
-    #yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
+print('fitting the quantifier')
-    #yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
+
-    #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
+# q = EMQ(LogisticRegression(class_weight='balanced'))
-    #yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
+# q = PACC(LogisticRegression(class_weight='balanced'))
-    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)
+q = PACC(OrderedLogisticRegression())
-
+# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
-    # regression-based ordinal regression (see https://pythonhosted.org/mord/) 
+# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
-    # I am using my implementation, which caters for predict_proba (linear distance to the two closest classes, 0 in the rest)
+# q = ACC(RegressorClassifier())
-    # the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with my wrapper classes for having the nclasses_; those do
+
-    # not implement predict_proba nor decision_score
+param_grid = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
-    #yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
+# param_grid = {'C': np.logspace(-3,3,14)}
-    #yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
+# param_grid = {'alpha':np.logspace(-8, 6, 15)}
-    # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
+
-    # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
+# q = qp.model_selection.GridSearchQ(
-    # yield 'PACC(SVR)', PACC(RegressorClassifier()), params_SVR
+#     q,
-    #yield 'HDy(SVR)', HDy(RegressorClassifier()), params_SVR
+#     param_grid,
-    # yield 'SLD(SVR)', EMQ(RegressorClassifier()), params_SVR
+#     1000,
-
+#     'gen',
-
+#     error=mnmd,
-def run_experiment(params):
+#     val_split=load_dev_samples,
-    qname, q, param_grid, drift = params
+#     n_jobs=-1,
-    qname += posfix
+#     refit=False,
-    resultfile = join(resultpath, f'{qname}.{drift}.csv')
+#     verbose=True)
-    if os.path.exists(resultfile):
+
-        print(f'result file {resultfile} already exists: continue')
+q.fit(train)
-        return None
+
-
+# q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
-    print(f'fitting {qname} for {drift}-drift')
+# q.fit(None)
-
+
-
+print('[done]')
-    def load_test_samples():
+
-        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
+report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
-        ids = set(ids)
+mean_nmd = report['nmd'].mean()
-        folderpath = join(datapath, domain, protocol, 'test_samples')
+std_nmd = report['nmd'].std()
-        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
+print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
-            yield sample.instances, sample.prevalence()
+
-
+q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
-
+q.fit(None)
-    def load_dev_samples():
+
-        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
+report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
-        ids = set(ids)
+mean_nmd = report['nmd'].mean()
-        folderpath = join(datapath, domain, protocol, 'dev_samples')
+std_nmd = report['nmd'].std()
-        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
+print(f'[regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
-            yield sample.instances, sample.prevalence()
+
-
+# drift='high'
-    q = qp.model_selection.GridSearchQ(
+# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
-        q,
+# mean_nmd = report['nmd'].mean()
-        param_grid,
+# std_nmd = report['nmd'].std()
-        sample_size=1000,
+# print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        verbose=True).fit(train)
    hyperparams = f'{qname}\t{drift}\t{q.best_params_}'
    print('[done]')
    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)
    print('[learning regressor-based adjustment]')
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)
    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.{drift}.reg.csv')
    report.to_csv(resultfile, index=False)
    return hyperparams
 if __name__ == '__main__':
    # Feature representation the experiments are run on; must be a key of `preprocessing_options`.
    #preprocessing = 'roberta.last'
    preprocessing = 'roberta.average'
    # preprocessing = 'roberta.posteriors'
    #preprocessing = 'tfidf'
    # preprocessing -> (dataset folder below `datapath`, suffix that run_experiment appends to the method name)
    preprocessing_options = {
        'tfidf': ('Books-tfidf', ''),
        'roberta.last': ('Books-roberta-base-finetuned-pkl/checkpoint-1188-last', '-RoBERTa-last'),
        'roberta.average': ('Books-roberta-base-finetuned-pkl/checkpoint-1188-average', '-RoBERTa-average'),
        'roberta.posteriors': ('Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors', '-RoBERTa-posteriors'),
    }
    if preprocessing not in preprocessing_options:
        # fail right away instead of with a NameError on `domain` a few lines below
        raise ValueError(
            f'unknown preprocessing {preprocessing!r}; valid values are {sorted(preprocessing_options)}'
        )
    domain, posfix = preprocessing_options[preprocessing]
    # The names below are module-level globals that run_experiment reads.
    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)
    train = load_sample_fn(join(datapath, domain), 'training_data')
    # The selected hyperparameters are appended ('at') to hyper.txt, so earlier runs are preserved.
    with open(join(resultpath, 'hyper.txt'), 'at') as hyper_file:
        #for drift in [f'smooth{i}' for i in range(5)] + ['all']:
        # one experiment per (quantifier, drift level) pair
        params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
        hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
        for h in hypers:
            # run_experiment returns None for experiments whose result file already existed
            if h is not None:
                hyper_file.write(h)
                hyper_file.write('\n')
--- a/Ordinal/model.py
+++ b/Ordinal/model.py
@ -1,11 +1,14 @@
-import mord
+from copy import deepcopy
 import numpy as np
 from scipy.sparse import issparse
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.decomposition import TruncatedSVD
-from sklearn.linear_model import Ridge
+from sklearn.linear_model import LogisticRegression, Ridge
-from sklearn.svm import LinearSVR
+from scipy.sparse import issparse
-from sklearn.utils.class_weight import compute_class_weight
+from sklearn.multiclass import OneVsRestClassifier
 from sklearn.multioutput import MultiOutputRegressor
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import LinearSVR, SVR
 from statsmodels.miscmodels.ordinal_model import OrderedModel
@ -33,21 +36,112 @@ class OrderedLogisticRegression:
        return self.res_prob.model.predict(self.res_prob.params, exog=X)
-class LAD(BaseEstimator, ClassifierMixin):
+class StackedClassifier:  # aka Funnelling Monolingual
-    def __init__(self, C=1.0, class_weight=None):
+    def __init__(self, base_estimator=LogisticRegression()):
-        self.C = C
+        if not hasattr(base_estimator, 'predict_proba'):
-        self.class_weight = class_weight
+            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        # self.base = deepcopy(OneVsRestClassifier(base_estimator))
        # self.meta = deepcopy(OneVsRestClassifier(base_estimator))
        self.base = deepcopy(base_estimator)
        self.meta = deepcopy(base_estimator)
        self.norm = StandardScaler()
-    def fit(self, X, y, sample_weight=None):
+    def fit(self, X, y):
-        self.regressor = LinearSVR(C=self.C)
+        self.base.fit(X, y)
        P = self.base.predict_proba(X)
        P = self.norm.fit_transform(P)
        self.meta.fit(P, y)
        return self
    def predict(self, X):
        """
        Predict labels with the meta-classifier, which is fed with the posterior probabilities
        that the base classifier returns for X after rescaling them with the fitted scaler.
        """
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict(P)
    def predict_proba(self, X):
        """
        Return the posterior probabilities of the meta-classifier, which is fed with the posterior
        probabilities that the base classifier returns for X after rescaling them with the fitted scaler.
        """
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict_proba(P)
 class RegressionQuantification:
    """
    Wraps a quantifier and post-corrects its prevalence estimates with a regressor.

    The regressor is trained on pairs (estimated prevalence, true prevalence), obtained by running the
    base quantifier on every sample yielded by `val_samples_generator`.

    :param base_quantifier: object exposing `fit`, `quantify`, `get_params` and `set_params`
    :param regression: 'ridge', 'svr', or an object exposing `fit(X, y)` and `predict(X)`
    :param val_samples_generator: zero-argument callable yielding (instances, prevalence) pairs
    :param norm: passed as `normalize` to Ridge; only used when regression='ridge'
    """
    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):
        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                # NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2;
                # this branch needs an older scikit-learn release
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        # Alternative regressors that were tried (kept for reference):
        # self.reg = MultiTaskLassoCV(normalize=norm)
        # self.reg = KernelRidge(kernel='rbf')
        # self.reg = LassoLarsCV(normalize=norm)
        # self.reg = MultiTaskElasticNetCV(normalize=norm) <- good
        # self.reg = LinearRegression(normalize=norm) # <- good
        # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(SGDRegressor())  # slow, does not work
        self.regression = regression
        self.val_samples_generator = val_samples_generator
        # self.norm = StandardScaler()
        # self.covs = covs
    def generate_validation_samples(self):
        """
        Build the training set of the regressor from the validation samples.

        :return: tuple (Xs, ys) of arrays; row i of Xs is the prevalence estimated by the base quantifier
            for the i-th validation sample and row i of ys is the prevalence yielded with that sample
        """
        Xs, ys = [], []
        for instances, prevalence in self.val_samples_generator():
            ys.append(prevalence)
            Xs.append(self.base_quantifier.quantify(instances))
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        return Xs, ys
    def fit(self, data):
        """
        Fit the base quantifier (unless `data` is None) and then the correcting regressor.

        :param data: training data for the base quantifier; pass None if it is already fitted
        :return: self
        :raises ValueError: if no `val_samples_generator` was given at construction time
        """
        if self.val_samples_generator is None:
            # checked before anything else so that a (possibly expensive) fit of the base
            # quantifier is not wasted
            raise ValueError('a val_samples_generator is required to fit the regression-based correction')
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        # Xs = self.norm.fit_transform(Xs)
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self
    def quantify(self, instances):
        """
        Estimate the prevalence of `instances` and correct the estimate with the fitted regressor.

        :param instances: the sample to quantify, passed as-is to the base quantifier
        :return: 1-d array with the regressor output rescaled to sum up to 1
        """
        Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
        # Xs = self.norm.transform(Xs)
        Xs = self.reg.predict(Xs)
        # Xs = self.norm.inverse_transform(Xs)
        # NOTE(review): the regressor output is not clipped, so negative entries (or a zero sum)
        # would pass through this rescaling — confirm whether clipping is wanted
        adjusted = Xs / Xs.sum()
        # adjusted = np.clip(Xs, 0, 1)
        adjusted = adjusted.flatten()
        return adjusted
    def get_params(self, deep=True):
        """Return the hyperparameters of the base quantifier (`deep` is accepted but not used)."""
        return self.base_quantifier.get_params()
    def set_params(self, **params):
        """
        Set hyperparameters on the base quantifier.

        :return: self, so that calls can be chained as with scikit-learn estimators
        """
        self.base_quantifier.set_params(**params)
        return self
 class RegressorClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.regressor = LinearSVR()
        # self.regressor = SVR()
        # self.regressor = Ridge(normalize=True)
-        classes = sorted(np.unique(y))
+
-        self.nclasses = len(classes)
+
-        if self.class_weight == 'balanced':
+    def fit(self, X, y):
-            class_weight = compute_class_weight('balanced', classes=classes, y=y)
+        self.nclasses = len(np.unique(y))
-            sample_weight = class_weight[y]
+        self.regressor.fit(X, y)
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self
    def predict(self, X):
@ -57,20 +151,13 @@ class LAD(BaseEstimator, ClassifierMixin):
        c[c>(self.nclasses-1)]=self.nclasses-1
        return c.astype(np.int)
-    # def predict_proba(self, X):
+    def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist
    def decision_function(self, X):
        """
        Turn the output of the regressor into one score per class.

        The regressed value r is clipped to [0, nC-1]; class k then receives the score max(0, 1 - |k - r|),
        so only the classes that are closer than 1 to the prediction get a non-zero score.

        :param X: instances to score, passed as-is to the underlying regressor
        :return: array with one row per instance and one column per class
        """
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        r = np.clip(r, 0, nC - 1)
        # |k - r| for every class index k (columns) and every prediction r (rows)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
        invdist = 1 - dists
        invdist[invdist < 0] = 0
        return invdist
    @property
@ -78,118 +165,8 @@ class LAD(BaseEstimator, ClassifierMixin):
        return np.arange(self.nclasses)
    def get_params(self, deep=True):
-        return {'C':self.C, 'class_weight': self.class_weight}
+        return self.regressor.get_params()
    def set_params(self, **params):
-        self.C = params['C']
+        self.regressor.set_params(**params)
        self.class_weight = params['class_weight']
 class OrdinalRidge(BaseEstimator, ClassifierMixin):
    """
    Ordinal classifier that fits a Ridge regressor on the labels and rounds its predictions to a class index.

    :param alpha: passed to Ridge as `alpha`
    :param class_weight: None, or 'balanced' to weight every training instance by the balanced weight
        of its class
    :param normalize: passed to Ridge as `normalize`
    """
    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize
    def fit(self, X, y, sample_weight=None):
        """
        Fit the underlying regressor.

        :param X: training instances
        :param y: training labels
        :param sample_weight: optional per-instance weights; replaced by the balanced class weights
            when class_weight='balanced'
        :return: self
        """
        # NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2;
        # this call needs an older scikit-learn release
        self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            # indexing by y assumes the labels are the integers 0..nclasses-1 — TODO confirm
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self
    def predict(self, X):
        """Round the regressed values to the nearest class index, clipped to [0, nclasses-1]."""
        r = self.regressor.predict(X)
        # builtin `int` instead of the alias `np.int`, which was removed in NumPy 1.24
        return np.clip(np.round(r), 0, self.nclasses - 1).astype(int)
    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist
    def decision_function(self, X):
        """
        Score every class k as 1 - |k - r|, r being the regressed value of the instance.

        Neither r nor the scores are clipped here, so scores can be negative.

        :return: array with one row per instance and one column per class
        """
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
        invdist = 1 - dists
        return invdist
    @property
    def classes_(self):
        """Class indices 0..nclasses-1 (available once `fit` has been called)."""
        return np.arange(self.nclasses)
    def get_params(self, deep=True):
        """Return the hyperparameters as a dict (`deep` is accepted but not used)."""
        return {'alpha':self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}
    def set_params(self, **params):
        """
        Update the hyperparameters present in `params`; other hyperparameters keep their value.

        Accepting a subset is needed for grid searches that explore only some of the hyperparameters
        (e.g., only `alpha` and `class_weight`).

        :return: self
        """
        for name in ('alpha', 'class_weight', 'normalize'):
            if name in params:
                setattr(self, name, params[name])
        return self
 # with order-aware classifiers
 # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
 class LogisticAT(mord.LogisticAT):
    """Thin wrapper around mord.LogisticAT that adds a `class_weight` option (None or 'balanced')."""
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)
    def fit(self, X, y, sample_weight=None):
        """
        Fit the model; with class_weight='balanced' every instance is weighted by the balanced
        weight of its class, replacing any `sample_weight` passed by the caller.
        """
        balanced = self.class_weight == 'balanced'
        if balanced:
            labels = sorted(np.unique(y))
            weight_of_class = compute_class_weight('balanced', classes=labels, y=y)
            sample_weight = weight_of_class[y]
        return super().fit(X, y, sample_weight=sample_weight)
 class LogisticSE(mord.LogisticSE):
    """Thin wrapper around mord.LogisticSE that adds a `class_weight` option (None or 'balanced')."""
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)
    def fit(self, X, y, sample_weight=None):
        """
        Fit the model; with class_weight='balanced' every instance is weighted by the balanced
        weight of its class, replacing any `sample_weight` passed by the caller.
        """
        balanced = self.class_weight == 'balanced'
        if balanced:
            labels = sorted(np.unique(y))
            weight_of_class = compute_class_weight('balanced', classes=labels, y=y)
            sample_weight = weight_of_class[y]
        return super().fit(X, y, sample_weight=sample_weight)
 class LogisticIT(mord.LogisticIT):
    """Thin wrapper around mord.LogisticIT that adds a `class_weight` option (None or 'balanced')."""
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)
    def fit(self, X, y, sample_weight=None):
        """
        Fit the model; with class_weight='balanced' every instance is weighted by the balanced
        weight of its class, replacing any `sample_weight` passed by the caller.
        """
        balanced = self.class_weight == 'balanced'
        if balanced:
            labels = sorted(np.unique(y))
            weight_of_class = compute_class_weight('balanced', classes=labels, y=y)
            sample_weight = weight_of_class[y]
        return super().fit(X, y, sample_weight=sample_weight)
 # regression-based ordinal regression (see https://pythonhosted.org/mord/)
 # class LAD(mord.LAD):
 #     def fit(self, X, y):
 #         self.classes_ = sorted(np.unique(y))
 #         return super().fit(X, y)
 # class OrdinalRidge(mord.OrdinalRidge):
 #     def fit(self, X, y):
 #         self.classes_ = sorted(np.unique(y))
 #         return super().fit(X, y)
--- a/Ordinal/partition_dataset_by_shift.py
+++ b/Ordinal/partition_dataset_by_shift.py
@ -1,7 +1,7 @@
 import numpy as np
 import quapy as qp
-from evaluation import nmd
+from Ordinal.evaluation import nmd
-from Ordinal.utils import load_samples_folder, load_single_sample_pkl
+from Ordinal.utils import load_samples_pkl
 from quapy.data import LabelledCollection
 import pickle
 import os
@ -9,39 +9,28 @@ from os.path import join
 from tqdm import tqdm
 """
 This script generates a partition of a dataset in terms of "shift".
 The partition is only carried out by generating index vectors. 
 """
 def partition_by_drift(split, training_prevalence):
    assert split in ['dev', 'test'], 'invalid split name'
    total=1000 if split=='dev' else 5000
    drifts = []
-    folderpath = join(datapath, domain, 'app', f'{split}_samples')
+    for sample in tqdm(load_samples_pkl(join(datapath, domain, 'app', f'{split}_samples')), total=total):
    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
        drifts.append(nmd(training_prevalence, sample.prevalence()))
    drifts = np.asarray(drifts)
    order = np.argsort(drifts)
    nD = len(order)
    low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
    all_drift = np.arange(nD)
    np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
    np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
    np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
    np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift)
    lows = drifts[low_drift]
    mids = drifts[mid_drift]
    highs = drifts[high_drift]
    all = drifts[all_drift]
    print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
    print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
    print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
    print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')
-domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
+domain = 'Books-tfidf'
 datapath = './data'
 training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
--- a/Ordinal/partition_dataset_by_smoothness.py
+++ b/Ordinal/partition_dataset_by_smoothness.py
@ -1,41 +0,0 @@
 import numpy as np
 from Ordinal.evaluation import smoothness
 from Ordinal.utils import load_samples_folder, load_single_sample_pkl
 from os.path import join
 from tqdm import tqdm
 """
 This script generates a partition of a dataset in terms of "smoothness".
 The partition is only carried out by generating index vectors. 
 """
 def partition_by_smoothness(split):
    """
    Split the 'app' samples of `split` into 5 blocks of increasing smoothness and store their indexes.

    The smoothness of every sample prevalence is computed, the sample indexes are sorted by that value and
    divided into 5 blocks; block i is saved as 'smooth{i}.{split}.id.npy' and the vector with all indexes
    as 'all.{split}.id.npy', both in the 'app' folder of the (module-level) dataset.

    :param split: 'dev' or 'test'
    """
    assert split in ['dev', 'test'], 'invalid split name'
    # number of samples expected in the split (only used for the progress bar)
    expected = 5000 if split != 'dev' else 1000
    app_dir = join(datapath, domain, 'app')
    samples = load_samples_folder(join(app_dir, f'{split}_samples'), load_fn=load_single_sample_pkl)
    smooths = np.asarray([smoothness(sample.prevalence()) for sample in tqdm(samples, total=expected)])
    ranking = np.argsort(smooths)
    for i, smooth_idx in enumerate(np.array_split(ranking, 5)):
        block = smooths[smooth_idx]
        print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
        np.save(join(app_dir, f'smooth{i}.{split}.id.npy'), smooth_idx)
    np.save(join(app_dir, f'all.{split}.id.npy'), np.arange(len(ranking)))
 #domain = 'Books-tfidf'
 domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
 datapath = './data'
 #training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
 partition_by_smoothness('dev')
 partition_by_smoothness('test')
--- a/Ordinal/preprocess_dataset_raw2tfidf.py
+++ b/Ordinal/preprocess_dataset_raw2tfidf.py
@ -1,20 +1,14 @@
 import quapy as qp
 from Ordinal.utils import load_simple_sample_raw
 from quapy.data import LabelledCollection
 from sklearn.feature_extraction.text import TfidfVectorizer
 from os.path import join
 import os
 import pickle
 from utils import load_samples
 from tqdm import tqdm
 import shutil
 """
 This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into tfidf vectors.
 """
 datapath = './data'
 domain = 'Books'
 outname = domain + '-tfidf'
@ -46,7 +40,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic
 def transform_folder_samples(protocol, splitname):
-    for i, sample in tqdm(enumerate(load_simple_sample_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
+    for i, sample in tqdm(enumerate(load_samples(join(datapath, domain, protocol, splitname), classes=train.classes_))):
        sample.instances = tfidf.transform(sample.instances)
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
--- a/Ordinal/preprocess_dataset_npytxt2pkl.py
+++ b/Ordinal/preprocess_dataset_npytxt2pkl.py
@ -1,51 +0,0 @@
 import quapy as qp
 from quapy.data import LabelledCollection
 from sklearn.feature_extraction.text import TfidfVectorizer
 from os.path import join
 import os
 import pickle
 from utils import *
 from tqdm import tqdm
 import shutil
 """
 This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into dense vectors
 extracted from a pretrained model (here we use the RoBERTa fine-tuned on the training set)
 Three vector generation modes are available: posteriors, last, average
 """
 vector_generation = 'posteriors'
 datapath = './data'
 domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
 outname = domain.replace('-finetuned', '-finetuned-pkl')
 protocol = 'app'
 print('pickling npy txt files')
 print('from:', join(datapath, domain))
 print('to', join(datapath, outname))
 print('for protocol:', protocol)
 os.makedirs(join(datapath, outname), exist_ok=True)
 os.makedirs(join(datapath, outname, protocol), exist_ok=True)
 os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True)
 os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
 shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
 shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))
 train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
 pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
 def transform_folder_samples(protocol, splitname):
    """
    Pickle every sample of a split, so that sample i ends up in '<outname>/<protocol>/<splitname>/<i>.pkl'.

    :param protocol: name of the protocol folder (below the dataset folder) the samples are read from
    :param splitname: name of the samples folder within the protocol folder
    """
    folder_dir=join(datapath, domain, protocol, splitname)
    samples = load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_)
    for i, sample in tqdm(enumerate(samples)):
        # the context manager closes every output file; the handles were previously left open
        with open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
 transform_folder_samples(protocol, 'dev_samples')
 transform_folder_samples(protocol, 'test_samples')
--- a/Ordinal/tabular.py
+++ b/Ordinal/tabular.py
@ -1,374 +0,0 @@
 import numpy as np
 import itertools
 from scipy.stats import ttest_ind_from_stats, wilcoxon
 class Table:
    """
    Results table indexed by (benchmark, method) that accumulates raw scores and renders them as LaTeX.

    Raw values are stored per cell via `add`. When needed, `compute` derives a set of per-cell arrays kept in
    `self.map` ('fill', 'mean', 'std', 'nobs', 'rank', 'color', 'ttest'), and, optionally, a one-row table of
    per-method averages in `self.average`. Derived data are recomputed lazily: any modification calls `touch`,
    and readers call `update`.
    """
    VALID_TESTS = [None, "wilcoxon", "ttest"]
    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='wilcoxon', prec_mean=3,
                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                 color=True, show_rel_to=-1):
        """
        :param benchmarks: sequence of row names
        :param methods: sequence of column names
        :param lower_is_better: whether smaller means rank better (affects ranking, coloring and the choice of the
            best method in the statistical test)
        :param significance_test: one of `VALID_TESTS`; None disables the statistical test
        :param prec_mean: number of decimals used to print the mean
        :param clean_zero: if True, a leading ' 0.' is printed as '.'
        :param show_std: if True, the standard deviation is printed after the mean
        :param prec_std: number of decimals used to print the standard deviation
        :param average: if True, a row with per-method averages is computed
        :param missing: value returned by `get` for cells without values
        :param missing_str: string printed by `latexCell` for cells without values
        :param color: if True, a cell-color command is appended to each printed cell
        :param show_rel_to: column index of a reference method; other cells show the relative difference (in %)
            w.r.t. it instead of the std. The value -1 disables this
        """
        assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
        self.benchmarks = np.asarray(benchmarks)
        self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}
        self.methods = np.asarray(methods)
        self.method_index = {col: j for j, col in enumerate(methods)}
        self.map = {}
        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
        self._addmap('values', dtype=object)
        self.lower_is_better = lower_is_better
        self.ttest = significance_test
        self.prec_mean = prec_mean
        self.clean_zero = clean_zero
        self.show_std = show_std
        self.prec_std = prec_std
        self.add_average = average
        self.missing = missing
        self.missing_str = missing_str
        self.color = color
        self.show_rel_to = show_rel_to
        self.touch()
    @property
    def nbenchmarks(self):
        """Number of rows."""
        return len(self.benchmarks)
    @property
    def nmethods(self):
        """Number of columns."""
        return len(self.methods)
    def touch(self):
        """Mark the derived maps as outdated."""
        self._modif = True
    def update(self):
        """Recompute the derived maps only if the table was modified since the last computation."""
        if self._modif:
            self.compute()
    def _getfilled(self):
        """Return the (row, col) index pairs of the cells that hold values (requires the 'fill' map)."""
        return np.argwhere(self.map['fill'])
    @property
    def values(self):
        """The (#benchmarks, #methods) object array holding the raw values of each cell (None if empty)."""
        return self.map['values']
    def _indexes(self):
        """Iterate over all (row, col) index pairs."""
        return itertools.product(range(self.nbenchmarks), range(self.nmethods))
    def _addmap(self, map, dtype, func=None):
        """
        Allocate `self.map[map]` as an uninitialized (#benchmarks, #methods) array and, if `func` is given,
        set each entry to `func(raw values of the cell)`. The 'fill' map is evaluated on every cell, any other
        map only on the filled ones.
        """
        self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
        if func is None:
            return
        target = self.map[map]
        indexes = self._indexes() if map == 'fill' else self._getfilled()
        for i, j in indexes:
            target[i, j] = func(self.values[i, j])
    def _addrank(self):
        """Rank (1 = best) the filled cells of every row according to their mean."""
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
            if not self.lower_is_better:
                ranked_cols_idx = ranked_cols_idx[::-1]
            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)
    def _addcolor(self):
        """Assign a cell color to every filled cell, from the position of its mean within the row's range."""
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if filled_cols_idx.size == 0:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            minval = min(col_means)
            maxval = max(col_means)
            norm = (maxval - minval)
            for col_idx in filled_cols_idx:
                val = self.map['mean'][i, col_idx]
                # all means equal (or a single method): use the middle of the scale
                normval = (val - minval) / norm if norm > 0 else 0.5
                if self.lower_is_better:
                    normval = 1 - normval
                self.map['color'][i, col_idx] = color_red2green_01(normval)
    def _run_ttest(self, row, col1, col2):
        """p-value of scipy's `ttest_ind_from_stats` on the summary statistics of two cells of a row."""
        mean1 = self.map['mean'][row, col1]
        std1 = self.map['std'][row, col1]
        nobs1 = self.map['nobs'][row, col1]
        mean2 = self.map['mean'][row, col2]
        std2 = self.map['std'][row, col2]
        nobs2 = self.map['nobs'][row, col2]
        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
        return p_val
    def _run_wilcoxon(self, row, col1, col2):
        """p-value of scipy's `wilcoxon` on the raw values of two cells of a row."""
        values1 = self.map['values'][row, col1]
        values2 = self.map['values'][row, col2]
        _, p_val = wilcoxon(values1, values2)
        return p_val
    def _add_statistical_test(self):
        """
        For every row, compare each filled method against the best one and store the outcome of
        `pval_interpretation` in the 'ttest' map. Does nothing if no test was requested.
        """
        if self.ttest is None:
            return
        self.some_similar = [False] * self.nmethods
        # the best method has the smallest mean when lower is better, and the largest one otherwise
        pick_best = np.argmin if self.lower_is_better else np.argmax
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if len(filled_cols_idx) <= 1:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            best_pos = filled_cols_idx[pick_best(col_means)]
            for j in filled_cols_idx:
                if j == best_pos:
                    continue
                if self.ttest == 'ttest':
                    p_val = self._run_ttest(i, best_pos, j)
                else:
                    p_val = self._run_wilcoxon(i, best_pos, j)
                pval_outcome = pval_interpretation(p_val)
                self.map['ttest'][i, j] = pval_outcome
                if pval_outcome != 'Diff':
                    self.some_similar[j] = True
    def compute(self):
        """Recompute all derived maps (and the averages, if enabled) from the raw values."""
        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
        self._addmap('mean', dtype=float, func=np.mean)
        self._addmap('std', dtype=float, func=np.std)
        self._addmap('nobs', dtype=float, func=len)
        self._addmap('rank', dtype=int, func=None)
        self._addmap('color', dtype=object, func=None)
        self._addmap('ttest', dtype=object, func=None)
        self._addmap('latex', dtype=object, func=None)
        self._addrank()
        self._addcolor()
        self._add_statistical_test()
        if self.add_average:
            self._addave()
        self._modif = False
    def _is_column_full(self, col):
        """Whether the method `col` has values for all the benchmarks."""
        return all(self.map['fill'][:, self.method_index[col]])
    def _addave(self):
        """
        Build `self.average`, a one-row table ('ave') with one cell per method. Only methods with values in
        every benchmark get an average: the per-benchmark means for the 'ttest' mode, or the concatenation of
        all the raw values otherwise.
        """
        # the formatting options (including color and clean_zero) are inherited so that the average
        # cells are printed consistently with the rest of the table
        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
                    show_std=self.show_std, clean_zero=self.clean_zero, color=self.color)
        for col in self.methods:
            values = None
            if self._is_column_full(col):
                if self.ttest == 'ttest':
                    values = np.asarray(self.map['mean'][:, self.method_index[col]])
                else:  # wilcoxon
                    values = np.concatenate(self.values[:, self.method_index[col]])
            ave.add('ave', col, values)
        self.average = ave
    def add(self, benchmark, method, values):
        """
        Add values to the cell (benchmark, method). New values are appended to the ones already stored;
        a scalar is treated as a one-element array, and None leaves the stored values untouched.
        """
        if values is not None:
            values = np.asarray(values)
            if values.ndim == 0:
                values = values.flatten()
        rid, cid = self._coordinates(benchmark, method)
        if self.map['values'][rid, cid] is None:
            self.map['values'][rid, cid] = values
        elif values is not None:
            self.map['values'][rid, cid] = np.concatenate([self.map['values'][rid, cid], values])
        self.touch()
    def get(self, benchmark, method, attr='mean'):
        """
        Return the entry of the map `attr` for the cell (benchmark, method), or `self.missing` if the cell has
        no values or the entry is None or NaN.
        """
        self.update()
        assert attr in self.map, f'unknown attribute {attr}'
        rid, cid = self._coordinates(benchmark, method)
        if not self.map['fill'][rid, cid]:
            return self.missing
        v = self.map[attr][rid, cid]
        if v is None or (isinstance(v, float) and np.isnan(v)):
            return self.missing
        return v
    def _coordinates(self, benchmark, method):
        """Translate the names (benchmark, method) into the array indexes (row, col)."""
        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
        assert method in self.method_index, f'method {method} out of range'
        rid = self.benchmark_index[benchmark]
        cid = self.method_index[method]
        return rid, cid
    def get_average(self, method, attr='mean'):
        """Return the entry `attr` of the average cell of `method`, or None if averages are disabled."""
        self.update()
        if self.add_average:
            return self.average.get('ave', method, attr=attr)
        return None
    def get_color(self, benchmark, method):
        """Return the color command of a cell, or '' if there is none."""
        color = self.get(benchmark, method, attr='color')
        if color is None:
            return ''
        return color
    def latexCell(self, benchmark, method):
        """
        Render one cell as LaTeX: the mean (in bold if it ranks first or if the statistical test labels it as
        'Sim' or 'Same' w.r.t. the best one), optionally followed by the std or by the relative difference
        w.r.t. the reference column, and by the cell color. Cells without values render as `missing_str`.
        """
        self.update()
        i, j = self._coordinates(benchmark, method)
        if not self.map['fill'][i, j]:
            return self.missing_str
        mean = self.map['mean'][i, j]
        cell = f" {mean:.{self.prec_mean}f}"
        if self.clean_zero:
            cell = cell.replace(' 0.', '.')
        isbest = self.map['rank'][i, j] == 1
        if self.ttest is not None:
            # results that are not clearly different from the best one are highlighted as well
            if self.map['ttest'][i, j] in ['Sim', 'Same']:
                isbest = True
        if isbest:
            cell = "\\textbf{" + cell.strip() + "}\\;"
        else:
            cell += '\\; '
        # no significance marker is printed (results similar to the best one are shown in bold instead)
        stat = ''
        std = ''
        if self.show_std:
            std = self.map['std'][i, j]
            std = f" {std:.{self.prec_std}f}"
            if self.clean_zero:
                std = std.replace(' 0.', '.')
            std = f" \\pm {std:{self.prec_std}}"
        relto = ''
        if self.show_rel_to != -1 and j != self.show_rel_to:
            ref_ave = self.map['mean'][i, self.show_rel_to]
            rel = 100*(mean-ref_ave)/ref_ave
            if abs(rel) < 0.1:
                relto = '(\\approx)'
            else:
                plussign = '+' if rel > 0 else ''  # already plugs the '-' sign
                relto = f'({plussign}{rel:.1f}\\%)'
            # the relative difference replaces the std
            std = ''
        if stat != '' or std != '' or relto != '':
            cell = f'{cell}${stat}{std}{relto}$'
        if self.color:
            cell_color = self.map['color'][i, j]
            # the color can be None (e.g., for NaN means); skip it instead of failing
            if cell_color is not None:
                cell += ' ' + cell_color
        return cell
    def latexTabular(self, benchmark_replace=None, method_replace=None, average=True):
        """
        Render the body of a tabular with benchmarks as rows and methods as columns.

        :param benchmark_replace: optional dict mapping benchmark names to the labels to print
        :param method_replace: optional dict mapping method names to the labels to print
        :param average: whether to append the row of averages (ignored if the table does not compute averages)
        """
        benchmark_replace = {} if benchmark_replace is None else benchmark_replace
        method_replace = {} if method_replace is None else method_replace
        self.update()
        tab = ' & '
        tab += ' & '.join([method_replace.get(col, col) for col in self.methods])
        tab += ' \\\\\\hline\n'
        for row in self.benchmarks:
            rowname = benchmark_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRow(row)
        if average and self.add_average:
            tab += '\\hline\n'
            tab += 'Average & '
            tab += self.latexAverage()
        return tab
    def latexTabularT(self, benchmark_replace=None, method_replace=None, average=True, side=False):
        """
        Render the body of the transposed tabular (methods as rows, benchmarks as columns).

        :param benchmark_replace: optional dict mapping benchmark names to the labels to print
        :param method_replace: optional dict mapping method names to the labels to print
        :param average: whether to append the column of averages (ignored if the table does not compute averages)
        :param side: if True, the column headers are wrapped in a 'side' command
        """
        benchmark_replace = {} if benchmark_replace is None else benchmark_replace
        method_replace = {} if method_replace is None else method_replace
        self.update()
        show_average = average and self.add_average
        def withside(label):
            return '\\side{'+label+'}' if side else label
        def center(label):
            return '\\multicolumn{1}{c}{'+label+'}'
        tab = ' & '
        tab += ' & '.join([center(withside(benchmark_replace.get(col, col))) for col in self.benchmarks])
        if show_average:
            tab += ' & ' + withside('Ave')
        tab += ' \\\\\\midrule\n'
        for row in self.methods:
            rowname = method_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRowT(row, endl='')
            if show_average:
                tab += ' & '
                tab += self.average.latexCell('ave', row)
            tab += '\\\\\n'
        tab += '\\bottomrule'
        return tab
    def latexRow(self, benchmark, endl='\\\\\\hline\n'):
        """Render all the cells of a benchmark (one per method), followed by `endl`."""
        s = [self.latexCell(benchmark, col) for col in self.methods]
        s = ' & '.join(s)
        s += ' ' + endl
        return s
    def latexRowT(self, method, endl='\\\\\\hline\n'):
        """Render all the cells of a method (one per benchmark), followed by `endl`."""
        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
        s = ' & '.join(s)
        s += ' ' + endl
        return s
    def latexAverage(self, endl='\\\\\\hline\n'):
        """Render the row of averages, or return None if averages are disabled."""
        if self.add_average:
            self.update()
            return self.average.latexRow('ave', endl=endl)
        return None
    def getRankTable(self):
        """Return a new table, with the same rows and columns, holding the rank of every filled cell."""
        # make sure the 'fill' and 'rank' maps exist before reading them
        self.update()
        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=0, average=True)
        for rid, cid in self._getfilled():
            row = self.benchmarks[rid]
            col = self.methods[cid]
            t.add(row, col, self.get(row, col, 'rank'))
        t.compute()
        return t
    def dropMethods(self, methods):
        """Remove the given methods (columns), keeping the raw values of the remaining ones."""
        drop_index = [self.method_index[m] for m in methods]
        new_methods = np.delete(self.methods, drop_index)
        new_index = {col: j for j, col in enumerate(new_methods)}
        self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
        self.methods = new_methods
        self.method_index = new_index
        self.touch()
 def pval_interpretation(p_val):
    """
    Translate a p-value into a label: 'Diff' if p_val <= 0.005, 'Sim' if 0.005 < p_val <= 0.05, and 'Same' if
    p_val > 0.05. A value that satisfies none of the comparisons (e.g., NaN) yields None.
    """
    if p_val <= 0.005:
        return 'Diff'
    if p_val <= 0.05:
        return 'Sim'
    if p_val > 0.05:
        return 'Same'
    return None
 def color_red2green_01(val, maxtone=50):
    """
    Map a value in [0,1] to a LaTeX cell-color command ranging from red (0) to green (1).

    :param val: value in [0,1]; NaN returns None
    :param maxtone: color intensity used at the extremes of the range (the intensity is 0 at val=0.5)
    :return: a string of the form '<backslash>cellcolor{<red|green>!<tone>}', or None if `val` is NaN
    """
    if np.isnan(val): return None
    assert 0 <= val <= 1, f'val {val} out of range [0,1]'
    # rescale to [-1,1]
    val = val * 2 - 1
    if val < 0:
        color = 'red'
        tone = maxtone * (-val)
    else:
        color = 'green'
        tone = maxtone * val
    # the backslash is escaped explicitly ('\c' is an invalid escape sequence)
    return '\\cellcolor{' + color + f'!{int(tone)}' + '}'
--- a/Ordinal/utils.py
+++ b/Ordinal/utils.py
@ -1,64 +1,22 @@
-import numpy as np
+import quapy as qp
 from quapy.data import LabelledCollection
 from glob import glob
 from json import load
 import os
 from os.path import join
 import pickle
 import pandas as pd
 import csv
 import datasets
 from datasets import Dataset
 import quapy as qp
 from quapy.data import LabelledCollection
-
+def load_samples(path_dir, classes):
-def load_simple_sample_npytxt(parentdir, filename, classes=None):
+    nsamples = len(glob(join(path_dir, f'*.txt')))
    samplepath = join(parentdir, filename+'.txt')
    yX = np.loadtxt(samplepath)
    X = yX[:,1:]
    y = yX[:,0].astype(np.int32)
    return LabelledCollection(instances=X, labels=y, classes_=classes)
 def load_simple_sample_raw(parentdir, filename, classes=None):
    """
    Load the file `<parentdir>/<filename>.txt` as a LabelledCollection, reading it with quapy's
    `from_text` reader.

    :param parentdir: directory containing the sample
    :param filename: name of the sample file, without the '.txt' extension
    :param classes: forwarded to `LabelledCollection.load`
    """
    txt_path = join(parentdir, filename + '.txt')
    text_reader = qp.data.reader.from_text
    return LabelledCollection.load(txt_path, loader_func=text_reader, classes=classes)
 def load_single_sample_as_csv(parentdir, filename):
    """
    Read the tab-separated file `<parentdir>/<filename>.txt` (columns: labels, review; no quoting) and
    return a pair `(sample, labels)`: a `datasets.Dataset` holding the 'review' column as strings, and a
    one-column DataFrame with the labels.
    """
    frame = pd.read_csv(
        join(parentdir, filename + '.txt'),
        sep='\t',
        names=['labels', 'review'],
        quoting=csv.QUOTE_NONE,
    )
    # the labels are detached from the frame, which keeps only the reviews
    labels = frame.pop('labels').to_frame()
    review_schema = datasets.Features({'review': datasets.Value('string')})
    return Dataset.from_pandas(df=frame, features=review_schema), labels
 def load_single_sample_pkl(parentdir, filename):
    """
    Unpickle and return the object stored in `<parentdir>/<filename>.pkl`.

    :param parentdir: directory containing the pickle file
    :param filename: name of the file, without the '.pkl' extension
    """
    # NOTE: pickle can execute arbitrary code while loading; only use it on trusted files.
    # The context manager closes the file once it has been read.
    with open(join(parentdir, filename+'.pkl'), 'rb') as fin:
        return pickle.load(fin)
 # def load_samples_npytxt(path_dir, filter=None, classes=None):
 #     return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
 # def load_samples_raw(path_dir, filter=None, classes=None):
 #     return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, load_fn_kwargs={'classes': classes})
 # def load_samples_as_csv(path_dir, filter=None):
 #     return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)
 # def load_samples_pkl(path_dir, filter=None):
 #     return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)
 def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
    nsamples = len(glob(join(path_dir, f'*')))
    for id in range(nsamples):
-        if (filter is None) or id in filter:
+        yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
-            yield load_fn(path_dir, f'{id}', **load_fn_kwargs)
+
 def load_samples_pkl(path_dir, filter=None):
    """
    Generate the objects pickled in `path_dir` as `0.pkl`, `1.pkl`, ..., in increasing order of id.

    The number of samples is taken to be the number of '*.pkl' files in the folder, so files are expected
    to be named with consecutive ids starting at 0.

    :param path_dir: directory containing the pickle files
    :param filter: optional container of ids; if given, only the samples whose id is in it are loaded
    """
    nsamples = len(glob(join(path_dir, '*.pkl')))
    for sample_id in range(nsamples):
        if filter is not None and sample_id not in filter:
            continue
        # NOTE: pickle can execute arbitrary code while loading; only use it on trusted files.
        # The context manager closes each file once it has been read.
        with open(join(path_dir, f'{sample_id}.pkl'), 'rb') as fin:
            sample = pickle.load(fin)
        yield sample
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -183,7 +183,7 @@ def _training_helper(learner,
            if not hasattr(learner, 'predict_proba'):
                print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
                      f'The learner will be calibrated.')
-                learner = CalibratedClassifierCV(learner, cv=5, ensemble=True)
+                learner = CalibratedClassifierCV(learner, cv=5)
        if val_split is not None:
            if isinstance(val_split, float):
                if not (0 < val_split < 1):
@ -470,7 +470,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
    def fit(self, data: LabelledCollection, fit_learner=True):
        self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
-        self.train_prevalence = F.prevalence_from_labels(data.labels, data.classes_)        
+        self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
        return self
    def aggregate(self, classif_posteriors, epsilon=EPSILON):