Compare commits

...

7 Commits

18 changed files with 1570 additions and 262 deletions

View File

@ -1,14 +1,17 @@
import gzip
import quapy as qp
from Ordinal.utils import load_simple_sample_raw
from quapy.data import LabelledCollection
import quapy.functional as F
import os
from os.path import join
from pathlib import Path
import numpy as np
datadir = '/mnt/1T/Datasets/Amazon/reviews'
outdir = './data/'
real_prev_path = './data/Books-real-prevalence-by-product_votes1_reviews100.csv'
domain = 'Books'
seed = 7
@ -18,13 +21,6 @@ te_size = 1000
nval = 1000
nte = 5000
# domain = 'Gift_Cards'
# tr_size = 200
# val_size = 100
# te_size = 100
# nval = 20
# nte = 40
def from_gz_text(path, encoding='utf-8', class2int=True):
"""
@ -70,7 +66,6 @@ def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
write_txt_sample(sample, join(outdir, f'{i}.txt'))
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
os.makedirs(outdir, exist_ok=True)
with open(prevpath, 'wt') as prevfile:
@ -80,37 +75,69 @@ def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
def gen_samples_real_prevalences(real_prevalences, pool: LabelledCollection, sample_size, outdir, prevpath_out):
    """Draw one sample from `pool` for every prevalence vector in `real_prevalences`.

    Each sample is written as `<outdir>/<i>.txt`; the realized prevalence of every
    sample is logged as CSV (3 decimals) in `prevpath_out`, with class names as header.
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath_out, 'wt') as prevfile:
        header = 'id,' + ','.join(f'{c}' for c in pool.classes_)
        prevfile.write(header + '\n')
        for idx, target_prev in enumerate(real_prevalences):
            # drops the last component — presumably redundant since prevalences
            # sum to 1; confirm against the format of the real-prevalence file
            drawn = pool.sampling(sample_size, *target_prev[:-1])
            write_txt_sample(drawn, join(outdir, f'{idx}.txt'))
            realized = ','.join(f'{p:.3f}' for p in drawn.prevalence())
            prevfile.write(f'{idx},' + realized + '\n')
fullpath = join(datadir,domain)+'.txt.gz'
data = LabelledCollection.load(fullpath, from_gz_text)
print(len(data))
print(data.classes_)
print(data.prevalence())
# fullpath = join(datadir,domain)+'.txt.gz'
#
# data = LabelledCollection.load(fullpath, from_gz_text)
# print(len(data))
# print(data.classes_)
# print(data.prevalence())
with qp.util.temp_seed(seed):
train, rest = data.split_stratified(train_prop=tr_size)
devel, test = rest.split_stratified(train_prop=0.5)
print(len(train))
print(len(devel))
print(len(test))
# train, rest = data.split_stratified(train_prop=tr_size)
#
# devel, test = rest.split_stratified(train_prop=0.5)
# print(len(train))
# print(len(devel))
# print(len(test))
#
domaindir = join(outdir, domain)
write_txt_sample(train, join(domaindir, 'training_data.txt'))
write_txt_sample(devel, join(domaindir, 'development_data.txt'))
write_txt_sample(test, join(domaindir, 'test_data.txt'))
# write_txt_sample(train, join(domaindir, 'training_data.txt'))
# write_txt_sample(devel, join(domaindir, 'development_data.txt'))
# write_txt_sample(test, join(domaindir, 'test_data.txt'))
gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
# this part is to be used when the partitions have already been created, in order to avoid re-generating them
train = load_simple_sample_raw(domaindir, 'training_data')
devel = load_simple_sample_raw(domaindir, 'development_data')
test = load_simple_sample_raw(domaindir, 'test_data')
gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
# gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
# prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
# gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
# prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
# gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
# prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
# gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
# prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
# this part generates samples based on real prevalences (in this case, prevalences of sets of books reviews
# grouped by product). It loads the real prevalences (computed elsewhere), and randomly extracts 5000 for test
# and 1000 for val (disjoint). Then realize the samplings
assert os.path.exists(real_prev_path), f'real prevalence file does not seem to exist...'
real_prevalences = np.genfromtxt(real_prev_path, delimiter='\t')
nrows = real_prevalences.shape[0]
rand_sel = np.random.permutation(nrows)
real_prevalences_val = real_prevalences[rand_sel[:nval]]
real_prevalences_te = real_prevalences[rand_sel[nval:nval+nte]]
gen_samples_real_prevalences(real_prevalences_val, devel, sample_size=val_size, outdir=join(domaindir, 'real', 'dev_samples'),
prevpath_out=join(domaindir, 'real', 'dev_prevalences.txt'))
gen_samples_real_prevalences(real_prevalences_te, test, sample_size=te_size, outdir=join(domaindir, 'real', 'test_samples'),
prevpath_out=join(domaindir, 'real', 'test_prevalences.txt'))

View File

@ -0,0 +1,116 @@
import gzip
import quapy as qp
import numpy as np
import pandas as pd
from quapy.data import LabelledCollection
import quapy.functional as F
import os
from os.path import join
from pathlib import Path
import pickle
datadir = '../OrdinalQuantification'   # root folder of the raw FACT data
outdir = './data/'                     # destination of the generated splits/samples
domain = 'fact'
seed = 7          # seed for the stratified splits (used via qp.util.temp_seed)
tr_size = 20000   # number of training documents
val_size = 1000   # documents per validation sample
te_size = 1000    # documents per test sample
nval = 1000       # number of validation samples to generate
nte = 5000        # number of test samples to generate
def from_csv(path):
    """Load the FACT csv file at `path` and return (X, y).

    The continuous 'log10_energy' column (first column) is discretized into
    ordered classes by binning; every remaining column becomes a float32 feature.
    """
    df = pd.read_csv(path)
    # bin edges: interior points of the [2.4, 4.2) grid with 0.15 spacing
    bin_edges = np.arange(start=2.4, stop=4.2, step=0.15)[1:-1]
    # cast to float32 BEFORE digitizing; with the default dtype a single
    # instance ends up in a different class (original author's note)
    energies = np.array(df['log10_energy'], dtype=np.float32)
    y = np.digitize(energies, bin_edges)
    # all columns but the first are features: matrix of shape (n_samples, n_features)
    X = df.iloc[:, 1:].to_numpy().astype(np.float32)
    return X, y
def write_pkl(sample: "LabelledCollection", path):
    """Pickle `sample` to `path`, creating parent directories as needed.

    Uses the highest pickle protocol. Fixes two issues of the original:
    the file handle was leaked (inline open() never closed), and the eager
    annotation required quapy at import time (now a string annotation).
    """
    os.makedirs(Path(path).parent, exist_ok=True)
    with open(path, 'wb') as fout:
        pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """Generate `nsamples` APP samples (uniformly drawn prevalence vectors) from `pool`.

    Samples are pickled as `<outdir>/<i>.pkl`; the realized prevalence of every
    sample is logged as CSV in `prevpath`, with class names as header.
    """
    os.makedirs(outdir, exist_ok=True)
    simplex_draws = F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for idx, target in enumerate(simplex_draws):
            drawn = pool.sampling(sample_size, *target)
            write_pkl(drawn, join(outdir, f'{idx}.pkl'))
            realized = ','.join(f'{p:.3f}' for p in drawn.prevalence())
            prevfile.write(f'{idx},' + realized + '\n')
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """Generate `nsamples` NPP samples (natural prevalence) from `pool`.

    Samples are pickled as `<outdir>/<i>.pkl`; the realized prevalence of every
    sample is logged as CSV in `prevpath`, with class names as header.
    """
    os.makedirs(outdir, exist_ok=True)
    sample_stream = pool.natural_sampling_generator(sample_size, repeats=nsamples)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for idx, drawn in enumerate(sample_stream):
            write_pkl(drawn, join(outdir, f'{idx}.pkl'))
            realized = ','.join(f'{p:.3f}' for p in drawn.prevalence())
            prevfile.write(f'{idx},' + realized + '\n')
# ---- main script: load the FACT data, clean it, split it, generate samples ----
fullpath = join(datadir, domain, 'fact_wobble.csv')

data = LabelledCollection.load(fullpath, from_csv)

# remove rows containing NaN values
# (the original ran this identical check twice; one pass suffices, since
# np.delete removes every offending row at once and the second pass was a no-op)
if np.isnan(data.instances).any():
    rows, cols = np.where(np.isnan(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted nan rows')

# remove rows containing infinite values
if np.isinf(data.instances).any():
    rows, cols = np.where(np.isinf(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted inf rows')

print(len(data))
print(data.classes_)
print(data.prevalence())

# stratified train/dev/test split, reproducible via the fixed seed
with qp.util.temp_seed(seed):
    train, rest = data.split_stratified(train_prop=tr_size)
    devel, test = rest.split_stratified(train_prop=0.5)
    print(len(train))
    print(len(devel))
    print(len(test))

# NOTE(review): the rendered diff lost indentation, so whether the sample
# generation below ran inside temp_seed (affecting reproducibility of the
# random samplings) cannot be told from this view — confirm against the repo
domaindir = join(outdir, domain)

write_pkl(train, join(domaindir, 'training_data.pkl'))
write_pkl(devel, join(domaindir, 'development_data.pkl'))
write_pkl(test, join(domaindir, 'test_data.pkl'))

gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
                prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
                prevpath=join(domaindir, 'app', 'test_prevalences.txt'))

gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
                prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
                prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))

View File

@ -1,6 +1,11 @@
import numpy as np
# smoothing approximation
def smoothness(p):
    """Quadratic smoothness penalty of the sequence `p`.

    Computes 0.5 * sum over all interior positions i of
    (2*p[i] - p[i-1] - p[i+1])**2, i.e. squared second-order differences.
    Returns 0.0 for sequences with fewer than three elements.
    """
    total = 0.0
    for left, mid, right in zip(p, p[1:], p[2:]):
        second_diff = 2 * mid - left - right
        total += second_diff ** 2
    return 0.5 * total
def _check_arrays(prevs):
prevs = np.asarray(prevs)
if prevs.ndim==1:
@ -8,6 +13,7 @@ def _check_arrays(prevs):
return prevs
# mean normalized match distance
def mnmd(prevs, prevs_hat):
prevs = _check_arrays(prevs)
prevs_hat = _check_arrays(prevs_hat)
@ -17,6 +23,7 @@ def mnmd(prevs, prevs_hat):
return np.mean(nmds)
# normalized match distance
def nmd(prev, prev_hat):
    """Normalized match distance: MDPA between the two prevalence vectors,
    scaled by 1/(n-1) where n is the number of (ordered) classes."""
    n_classes = len(prev)
    normalizer = 1. / (n_classes - 1)
    return normalizer * mdpa(prev, prev_hat)

View File

@ -0,0 +1,150 @@
import numpy as np
import quapy as qp
import os
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from Ordinal.model import RegressionQuantification, LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
from os.path import join
from utils import load_samples_folder, load_single_sample_pkl
from evaluation import nmd, mnmd
from tqdm import tqdm
"""
This script generates all results from Table 1 in the paper, i.e., all results comparing quantifiers equipped with
standard logistic regression against quantifiers equipped with order-aware classifiers
"""
def quantifiers():
    """Yield (name, quantifier, hyperparameter-grid) triples for Table 1.

    Pairs each quantification method (CC/PCC/ACC/PACC/SLD) with a standard
    logistic regression and with order-aware classifiers from mord
    (threshold-based OLR variants and regression-based LAD/Ridge).
    """
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # NOTE(review): recent sklearn versions removed Ridge's 'normalize' parameter — confirm the pinned sklearn version
    params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize':[True,False]}

    # baselines: plain (order-unaware) logistic regression
    yield 'CC(LR)', CC(LogisticRegression()), params_LR
    yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
    yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
    yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
    yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR

    yield 'CC(OLR-SE)', CC(LogisticSE()), params_OLR
    yield 'PCC(OLR-SE)', PCC(LogisticSE()), params_OLR
    yield 'ACC(OLR-SE)', ACC(LogisticSE()), params_OLR
    yield 'PACC(OLR-SE)', PACC(LogisticSE()), params_OLR
    yield 'SLD(OLR-SE)', EMQ(LogisticSE()), params_OLR

    yield 'CC(OLR-IT)', CC(LogisticIT()), params_OLR
    yield 'PCC(OLR-IT)', PCC(LogisticIT()), params_OLR
    yield 'ACC(OLR-IT)', ACC(LogisticIT()), params_OLR
    yield 'PACC(OLR-IT)', PACC(LogisticIT()), params_OLR
    yield 'SLD(OLR-IT)', EMQ(LogisticIT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    yield 'CC(LAD)', CC(LAD()), params_SVR
    yield 'ACC(LAD)', ACC(LAD()), params_SVR
    yield 'CC(ORidge)', CC(OrdinalRidge()), params_Ridge
    yield 'ACC(ORidge)', ACC(OrdinalRidge()), params_Ridge
def run_experiment(params):
    """Model-select (GridSearchQ on the dev samples) and evaluate one quantifier.

    `params` is a (name, quantifier, param_grid) triple as yielded by quantifiers().
    Relies on module-level globals set in __main__: posfix, resultpath, datapath,
    domain, protocol, load_sample_fn, train, and (when posfix=='-std') zscore.
    Returns a tab-separated string with the chosen hyperparameters, or None when
    the result file already exists (results are cached on disk).
    """
    qname, q, param_grid = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.all.csv')
    if os.path.exists(resultfile):
        # skip quantifiers that were already evaluated in a previous run
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for all-drift')

    def load_test_samples():
        # generator over the 5000 test samples (optionally z-score standardized)
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=5000):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()

    def load_dev_samples():
        # generator over the 1000 dev samples (optionally z-score standardized)
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=1000):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()

    # hyperparameter search optimizing mean NMD on the dev samples; 2h timeout
    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        timeout=60*60*2,
        verbose=True).fit(train)
    hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}'
    print('[done]')

    # evaluate the best configuration on the test samples (NMD)
    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    # second pass: regression-based correction on top of the best quantifier
    print('[learning regressor-based adjustment]')
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.all.reg.csv')
    report.to_csv(resultfile, index=False)

    return hyperparams
if __name__ == '__main__':
    # which dataset/representation to run; posfix '-std' standardizes features
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    #domain = 'Books-tfidf'
    posfix = ''
    # domain = 'fact'
    # posfix = '-std'  # set to '' to avoid standardization
    # posfix = ''

    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')
    if posfix=='-std':
        # fit the z-score scaler on the training set; run_experiment applies it
        # to the dev/test samples through the module-level `zscore`
        zscore = StandardScaler()
        train.instances = zscore.fit_transform(train.instances)

    # run all quantifiers in parallel; log the chosen hyperparameters (appending)
    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        hypers = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3)
        for h in hypers:
            if h is not None:
                foo.write(h)
                foo.write('\n')

105
Ordinal/finetune_bert.py Normal file
View File

@ -0,0 +1,105 @@
import csv
import sys
import datasets
import numpy as np
import pandas as pd
import torch.cuda
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer
from transformers import TrainingArguments
"""
This script fine-tunes a pre-trained language model on a given textual training set.
The training goes for a maximum of 5 epochs, but stores the model parameters of the best performing epoch according
to the validation loss in a hold-out val split of 1000 documents (stratified).
We used it with RoBERTa in the training set of the Amazon-OQ-BK domain, i.e.:
$> python3 finetune_bert.py ./data/Books/training_data.txt roberta-base
"""
def tokenize_function(example):
    """Tokenize the 'review' field with the module-level `tokenizer`,
    padding/truncating to a fixed length (64 tokens in debug mode, 256 otherwise)."""
    max_len = 64 if debug else 256
    return tokenizer(example['review'], padding='max_length', truncation=True, max_length=max_len)
def compute_metrics(eval_preds):
    """Compute macro/micro F1 from a (logits, labels) pair, as expected by the HF Trainer."""
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    scores = {}
    for average in ('macro', 'micro'):
        scores[f'{average}-f1'] = f1_score(labels, predictions, average=average)
    return scores
if __name__ == '__main__':
    debug = False  # when True, trains on 500-doc subsets with 64-token sequences
    assert torch.cuda.is_available(), 'cuda is not available'

    # datapath = './data/Books/training_data.txt'
    # checkpoint = 'roberta-base'
    n_args = len(sys.argv)
    assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'

    datapath = sys.argv[1]  # './data/Books/training_data.txt'
    checkpoint = sys.argv[2]  #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
    modelout = checkpoint+'-val-finetuned'

    # load the training set, and extract a held-out validation split of 1000 documents (stratified)
    # NOTE(review): test_size=.25 yields a 25% split, not 1000 documents as the module docstring says — confirm
    df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df['labels'].to_frame()
    X_train, X_val = train_test_split(df, stratify=labels, test_size=.25, random_state=1)
    num_labels = len(pd.unique(labels['labels']))

    features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
    train = Dataset.from_pandas(df=X_train, split='train', features=features)
    validation = Dataset.from_pandas(df=X_val, split='validation', features=features)

    dataset = DatasetDict({
        'train': train.select(range(500)) if debug else train,
        'validation': validation.select(range(500)) if debug else validation
    })

    # tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

    # fine-tuning: evaluate/save every epoch, keep only the best checkpoint
    training_args = TrainingArguments(
        modelout,
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # eval_steps=10,
        save_total_limit=1,
        load_best_model_at_end=True
    )

    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

View File

@ -0,0 +1,70 @@
import pandas as pd
from os.path import join
import os
from glob import glob
from pathlib import Path
from Ordinal.main import quantifiers
from Ordinal.tabular import Table
"""
This script generates some tables for Amazon-OQ-BK (for internal use only)
"""
# ---- configuration: which result folders feed the Amazon-OQ-BK table ----
domain = 'Books-tfidf'
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
domain_bert_post = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'

resultpath = join('./results', domain, prot)
resultpath_bertlast = join('./results', domain_bert_last, prot)
resultpath_bertave = join('./results', domain_bert_ave, prot)
resultpath_bertpost = join('./results', domain_bert_post, prot)

# table rows: every quantifier name, repeated for each RoBERTa representation
methods = [qname for qname, *_ in quantifiers()]
methods += ['SLD(LR)-agg']
methods_Rlast = [m+'-RoBERTa-last' for m in methods]
methods_Rave = [m+'-RoBERTa-average' for m in methods]
methods_Rpost = [m+'-RoBERTa-posteriors' for m in methods]
methods = methods + methods_Rlast + methods_Rave + methods_Rpost
# methods += [m+'-r' for m in methods]

table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)

# collect every result csv and add its nmd scores to the table
resultfiles = list(glob(f'{resultpath}/*.csv')) \
              + list(glob(f'{resultpath_bertlast}/*.csv')) \
              + list(glob(f'{resultpath_bertave}/*.csv')) \
              + list(glob(f'{resultpath_bertpost}/*.csv'))

for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name
    # filenames look like <method>.<drift>[.reg].csv; the optional third part
    # marks the regressor-corrected variant, tagged '-r'
    method, drift, *other = resultname.replace('.csv', '').split('.')
    if other:
        method += '-r'
    if method not in methods:
        continue
    table.add(drift, method, nmd)

os.makedirs(Path(outpath).parent, exist_ok=True)

tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \hline
"""
tabular += table.latexTabularT(average=False)
tabular += """
\end{tabular}%
}"""

print('saving table in', outpath)
with open(outpath, 'wt') as foo:
    foo.write(tabular)
    foo.write('\n')
print('[done]')

View File

@ -0,0 +1,82 @@
import pandas as pd
from os.path import join
import os
from glob import glob
from pathlib import Path
from Ordinal.experiments_lr_vs_ordlr import quantifiers
from Ordinal.tabular import Table
"""
This script generates some tables for Fact-OQ (for internal use only)
"""
# ---- configuration: which result folder feeds the LR-vs-OrdLR table ----
#domain = 'fact'
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'
resultpath = join('./results', domain, prot)
withstd=False  # when True, tabulate the '-std' (standardized-features) runs

methods = [qname for qname, *_ in quantifiers()]
if withstd:
    methods = [m+'-std' for m in methods]
#methods = methods + methods_variant
# methods += [m+'-r' for m in methods]

# table layout: one row per quantifier family, one column per classifier variant
quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
# method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD']
method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']
if withstd:
    method_variants = [m+'-std' for m in method_variants]

print('families:', quantifiers_families)
print('variants', method_variants)

table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4,
              color=False, show_rel_to=0, missing_str='\multicolumn{1}{c}{---}', clean_zero=True)

# only '<family>(<variant>).all.csv' files (all-drift results) are tabulated
resultfiles = list(glob(f'{resultpath}/*).all.csv'))

for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name
    method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average','').split('.')
    if drift!='all':
        continue
    if other:
        method += '-r'
    if method not in methods:
        continue
    # split 'FAMILY(VARIANT)' into its two components
    family, variant = method.split('(')
    variant = variant.replace(')', '')
    if variant not in method_variants:
        continue
    table.add(family, variant, nmd)

os.makedirs(Path(outpath).parent, exist_ok=True)

tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """}
\\toprule
"""
tabular += table.latexTabularT(average=False)
tabular += """
\end{tabular}%
}"""

print('saving table in', outpath)
with open(outpath, 'wt') as foo:
    foo.write(tabular)
    foo.write('\n')
print('[done]')

View File

@ -0,0 +1,152 @@
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from os.path import join
import os
import shutil
from tqdm import tqdm
from Ordinal.utils import load_samples_folder, load_single_sample_as_csv
"""
This scripts takes a pre-trained model (a fine-tuned one) and generates numerical representations for all
samples in the dataset. The representations are saved in npy-txt plain format.
"""
def tokenize_function(example):
    """Tokenize the 'review' field (module-level `tokenizer`) and move the
    resulting input tensors to the GPU."""
    max_len = 64 if debug else None
    encoded = tokenizer(example['review'], padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')
    return {
        'input_ids': encoded.input_ids.cuda(),
        'attention_mask': encoded.attention_mask.cuda()
    }
def save_samples_as_txt(tensors, labels, path):
    """Save a (label, features...) matrix in plain text: one '%d' label column
    followed by one '%f' column per feature dimension.

    `labels` must expose `.values` (a one-column pandas object).
    """
    label_col = labels.values
    stacked = np.hstack([label_col, tensors])
    feat_dims = tensors.shape[1]
    col_formats = ['%d'] + ['%f'] * feat_dims
    np.savetxt(path, stacked, fmt=col_formats)
def transform_sample(instances, labels, outpath, batch_size=50):
    """Run the (module-level) `model` over `instances` in batches and save the
    chosen representation together with `labels` in npy-txt format.

    The representation is selected by the module-level `generation_mode`:
    'posteriors' (softmax over logits), 'last' ([CLS] of the last layer) or
    'average' ([CLS] averaged across all layers); anything else raises
    NotImplementedError.
    """
    ndocs = len(labels)
    # require an exact division into batches: the original asserted
    # `ndocs % (ndocs // batch_size) == 0`, which fails to reject fragmented
    # last batches (e.g. 120 docs with batch_size=50 passed the check)
    assert ndocs % batch_size == 0, 'fragmented last batch not supported'

    transformations = []
    for batch_id in range(0, ndocs, batch_size):
        batch_instances = instances[batch_id:batch_id + batch_size]
        tokenized_dataset = tokenize_function(batch_instances)
        out = model(**tokenized_dataset, output_hidden_states=True)

        if generation_mode == 'posteriors':
            # posterior probabilities over the classes
            logits = out.logits
            posteriors = torch.softmax(logits, dim=-1)
            transformed = posteriors
        elif generation_mode == 'last':
            # [CLS] embedding of the last hidden layer
            hidden_states = out.hidden_states
            last_layer_cls = hidden_states[-1][:, 0, :]
            transformed = last_layer_cls
        elif generation_mode == 'average':
            # [CLS] embedding averaged across all layers
            hidden_states = out.hidden_states
            hidden_states = torch.stack(hidden_states)
            all_layer_cls = hidden_states[:, :, 0, :]
            average_cls = torch.mean(all_layer_cls, dim=0)
            transformed = average_cls
        else:
            raise NotImplementedError()
        transformations.append(transformed.cpu().numpy())

    transformations = np.vstack(transformations)
    save_samples_as_txt(transformations, labels, outpath)
def transform_folder_samples(protocol, splitname, skip=0):
    """Transform every sample under datapath/domain/protocol/splitname and write
    the results under the mirror folder named by the module-level `outname`.

    :param skip: index of the first sample to process (lets interrupted runs resume)
    """
    in_folder = join(datapath, domain, protocol, splitname)
    out_folder = join(datapath, outname, protocol, splitname)
    # dev splits hold 1000 samples, test splits 5000 (used only for the progress bar)
    total = 1000 if splitname.startswith('dev') else 5000

    for i, (instances, labels) in tqdm(enumerate(
            load_samples_folder(in_folder, load_fn=load_single_sample_as_csv)), desc=f'{protocol} {splitname}', total=total):
        if i>= skip:
            transform_sample(instances, labels, outpath=join(out_folder, f'{i}.txt'))
def get_best_checkpoint(checkpointdir):
    """Return the path of the best checkpoint inside `checkpointdir`.

    The fine-tuning was run with save_total_limit=1 and load_best_model_at_end=True,
    so at most two 'checkpoint-<step>' folders remain; the best model is assumed to
    be the one with the smaller step — TODO confirm this holds for every run.
    Fixes typos in the original's runtime messages ('where'->'were', 'choosen'->'chosen').
    """
    from glob import glob
    steps = []
    for folder in glob(f'{checkpointdir}/checkpoint-*'):
        step = int(folder.split('checkpoint-')[1])
        steps.append(step)

    assert len(steps) <= 2, 'unexpected number of steps, only two were expected (the best one and the last one)'
    chosen = f'{checkpointdir}/checkpoint-{min(steps)}'
    print(f'chosen checkpoint is {chosen}')
    return chosen
if __name__ == '__main__':
    debug = False  # when True, tokenization truncates to 64 tokens
    assert torch.cuda.is_available(), 'cuda is not available'

    #checkpoint='roberta-base-val-finetuned'
    #generation_mode = 'ave'

    n_args = len(sys.argv)
    # fixed the usage text: the accepted mode (checked below) is 'average', not 'ave'
    assert n_args==3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
                      '\tgeneration-mode: last (last layer), average (average pooling), or posteriors (posterior probabilities)'

    checkpoint = sys.argv[1]  #e.g., 'bert-base-uncased'
    generation_mode = sys.argv[2]  # e.g., 'last'

    assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'

    # resolve the concrete checkpoint-<step> folder to load
    checkpoint = get_best_checkpoint(checkpoint)

    num_labels = 5
    datapath = './data'
    domain = 'Books'
    protocols = ['real']  # ['app', 'npp']

    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_model'

    # output folder mirrors the input, tagged with checkpoint and mode
    outname = domain + f'-{checkpoint}-{generation_mode}'

    with torch.no_grad():
        print('loading', checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

        os.makedirs(join(datapath, outname), exist_ok=True)

        print('transforming the training set')
        instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
        transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
        print('[done]')

        for protocol in protocols:
            in_path = join(datapath, domain, protocol)
            out_path = join(datapath, outname, protocol)
            os.makedirs(out_path, exist_ok=True)
            os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
            os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
            # the prevalence logs are representation-independent: copy them over
            shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
            shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))

            print('processing', protocol)
            transform_folder_samples(protocol, 'dev_samples')
            transform_folder_samples(protocol, 'test_samples')

View File

@ -1,16 +0,0 @@
import quapy as qp
from quapy.data import LabelledCollection
from quapy.data.reader import from_text
from quapy.functional import strprev
category = 'Books'
datadir = './data'
training_path = f'{datadir}/{category}/training_data.txt'
data = LabelledCollection.load(training_path, loader_func=from_text)
print(len(data))
print(strprev(data.prevalence()))

View File

@ -3,87 +3,154 @@ from sklearn.linear_model import LogisticRegression
import quapy as qp
import numpy as np
from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
from Ordinal.model import OrderedLogisticRegression, LogisticAT
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection
from os.path import join
from utils import load_samples, load_samples_pkl
import os
from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
from evaluation import nmd, mnmd
from time import time
import pickle
from tqdm import tqdm
domain = 'Books-tfidf'
datapath = './data'
protocol = 'app'
drift = 'high'
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
def load_test_samples():
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
ids = set(ids)
for sample in tqdm(load_samples_pkl(join(datapath, domain, protocol, 'test_samples'), filter=ids), total=len(ids)):
yield sample.instances, sample.prevalence()
def load_dev_samples():
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
ids = set(ids)
for sample in tqdm(load_samples_pkl(join(datapath, domain, protocol, 'dev_samples'), filter=ids), total=len(ids)):
yield sample.instances, sample.prevalence()
print('fitting the quantifier')
# q = EMQ(LogisticRegression(class_weight='balanced'))
# q = PACC(LogisticRegression(class_weight='balanced'))
q = PACC(OrderedLogisticRegression())
# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
# q = ACC(RegressorClassifier())
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
# param_grid = {'C': np.logspace(-3,3,14)}
# param_grid = {'alpha':np.logspace(-8, 6, 15)}
# q = qp.model_selection.GridSearchQ(
# q,
# param_grid,
# 1000,
# 'gen',
# error=mnmd,
# val_split=load_dev_samples,
# n_jobs=-1,
# refit=False,
# verbose=True)
q.fit(train)
# q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
# q.fit(None)
print('[done]')
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
mean_nmd = report['nmd'].mean()
std_nmd = report['nmd'].std()
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
q.fit(None)
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
mean_nmd = report['nmd'].mean()
std_nmd = report['nmd'].std()
print(f'[regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
# drift='high'
# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
# mean_nmd = report['nmd'].mean()
# std_nmd = report['nmd'].std()
# print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
import mord
def quantifiers():
    """Yield (name, quantifier, hyperparameter-grid) triples to evaluate.

    Active entries pair CC/PCC/ACC/PACC/SLD with plain logistic regression;
    the order-aware and SVR-based variants are currently commented out.
    """
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # params_SVR = {'C': np.logspace(0, 1, 2)}

    # baselines
    yield 'CC(LR)', CC(LogisticRegression()), params_LR
    yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
    yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
    yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
    #yield 'HDy(LR)', HDy(LogisticRegression()), params_LR
    yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    #yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    #yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    #yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    #yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
    #yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    # I am using my implementation, which caters for predict_proba (linear distance to the two closest classes, 0 in the rest)
    # the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with my wrapper classes for having the nclasses_; those do
    # not implement predict_proba nor decision_score
    #yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
    #yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
    # yield 'PACC(SVR)', PACC(RegressorClassifier()), params_SVR
    #yield 'HDy(SVR)', HDy(RegressorClassifier()), params_SVR
    # yield 'SLD(SVR)', EMQ(RegressorClassifier()), params_SVR
def run_experiment(params):
    """
    Runs model selection and evaluation for one (quantifier, drift-level)
    configuration, writing a test report (and a regression-corrected report)
    to csv under resultpath.

    :param params: tuple (method-name, quantifier, hyperparameter-grid, drift-level)
    :return: a tab-separated string with the selected hyperparameters, or None
        if the result file already exists (experiment skipped)
    """
    qname, q, param_grid, drift = params
    qname += posfix  # module-level suffix identifying the preprocessing used
    resultfile = join(resultpath, f'{qname}.{drift}.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for {drift}-drift')

    def load_test_samples():
        # yields (instances, prevalence) for the test samples of this drift level
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    def load_dev_samples():
        # yields (instances, prevalence) for the validation samples of this drift level
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    # hyperparameter search minimizing mean NMD on the validation samples
    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        verbose=True).fit(train)

    hyperparams = f'{qname}\t{drift}\t{q.best_params_}'

    print('[done]')

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    # second pass: learn a regression-based correction of the predicted
    # prevalences on the validation samples, then re-evaluate on test
    print('[learning regressor-based adjustment]')
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)
    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.{drift}.reg.csv')
    report.to_csv(resultfile, index=False)

    return hyperparams
if __name__ == '__main__':
    # choose the document representation to run the experiments with
    #preprocessing = 'roberta.last'
    preprocessing = 'roberta.average'
    # preprocessing = 'roberta.posteriors'
    #preprocessing = 'tfidf'

    if preprocessing=='tfidf':
        domain = 'Books-tfidf'
        posfix = ''
    elif preprocessing=='roberta.last':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
        posfix = '-RoBERTa-last'
    elif preprocessing=='roberta.average':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
        posfix = '-RoBERTa-average'
    elif preprocessing=='roberta.posteriors':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
        posfix = '-RoBERTa-posteriors'
    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    # run every (quantifier, drift-level) combination in parallel, logging the
    # hyperparameters selected for each completed experiment
    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        #for drift in [f'smooth{i}' for i in range(5)] + ['all']:
        params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
        hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
        for h in hypers:
            if h is not None:
                foo.write(h)
                foo.write('\n')

View File

@ -1,14 +1,11 @@
from copy import deepcopy
import mord
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.utils.class_weight import compute_class_weight
from statsmodels.miscmodels.ordinal_model import OrderedModel
@ -36,112 +33,21 @@ class OrderedLogisticRegression:
return self.res_prob.model.predict(self.res_prob.params, exog=X)
class StackedClassifier: # aka Funnelling Monolingual
def __init__(self, base_estimator=LogisticRegression()):
if not hasattr(base_estimator, 'predict_proba'):
print('the estimator does not seem to be probabilistic: calibrating')
base_estimator = CalibratedClassifierCV(base_estimator)
# self.base = deepcopy(OneVsRestClassifier(base_estimator))
# self.meta = deepcopy(OneVsRestClassifier(base_estimator))
self.base = deepcopy(base_estimator)
self.meta = deepcopy(base_estimator)
self.norm = StandardScaler()
class LAD(BaseEstimator, ClassifierMixin):
def __init__(self, C=1.0, class_weight=None):
self.C = C
self.class_weight = class_weight
def fit(self, X, y):
self.base.fit(X, y)
P = self.base.predict_proba(X)
P = self.norm.fit_transform(P)
self.meta.fit(P, y)
return self
def predict(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict(P)
def predict_proba(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict_proba(P)
class RegressionQuantification:
    """
    Post-hoc correction of a quantifier's output: a multi-output regressor is
    trained to map the base quantifier's predicted prevalence vectors (on
    validation samples) onto the true prevalence vectors.
    """
    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):
        # base_quantifier: the (possibly already fitted) quantifier to correct
        # regression: 'ridge', 'svr', or a pre-built regressor instance
        # val_samples_generator: callable yielding (instances, prevalence) pairs
        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        # self.reg = MultiTaskLassoCV(normalize=norm)
        # self.reg = KernelRidge(kernel='rbf')
        # self.reg = LassoLarsCV(normalize=norm)
        # self.reg = MultiTaskElasticNetCV(normalize=norm) <- bien
        #self.reg = LinearRegression(normalize=norm) # <- bien
        # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm))  # <- bastante bien, incluso sin norm
        # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False))  # <- bastante bien, incluso sin norm
        # self.reg = MultiOutputRegressor(SGDRegressor())  # lento, no va
        self.regression = regression
        self.val_samples_generator = val_samples_generator
        # self.norm = StandardScaler()
        # self.covs = covs

    def generate_validation_samples(self):
        # builds the regression training set: X = predicted prevalences,
        # y = true prevalences, one row per validation sample
        Xs, ys = [], []
        for instances, prevalence in self.val_samples_generator():
            ys.append(prevalence)
            Xs.append(self.base_quantifier.quantify(instances))
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        return Xs, ys

    def fit(self, data):
        """
        Fits the base quantifier on `data` (skipped when data is None, i.e. the
        quantifier is assumed already fitted) and then fits the regressor.
        """
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        # Xs = self.norm.fit_transform(Xs)
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self

    def quantify(self, instances):
        # corrects the base prediction via the regressor and re-normalizes it
        # so that the output sums to 1 (a valid prevalence vector)
        Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
        # Xs = self.norm.transform(Xs)
        Xs = self.reg.predict(Xs)
        # Xs = self.norm.inverse_transform(Xs)
        adjusted = Xs / Xs.sum()
        # adjusted = np.clip(Xs, 0, 1)
        adjusted = adjusted.flatten()
        return adjusted

    def get_params(self, deep=True):
        # delegates hyperparameters to the base quantifier
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        self.base_quantifier.set_params(**params)
class RegressorClassifier(BaseEstimator, ClassifierMixin):
def __init__(self):
self.regressor = LinearSVR()
def fit(self, X, y, sample_weight=None):
self.regressor = LinearSVR(C=self.C)
# self.regressor = SVR()
# self.regressor = Ridge(normalize=True)
def fit(self, X, y):
self.nclasses = len(np.unique(y))
self.regressor.fit(X, y)
classes = sorted(np.unique(y))
self.nclasses = len(classes)
if self.class_weight == 'balanced':
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
self.regressor.fit(X, y, sample_weight=sample_weight)
return self
def predict(self, X):
@ -151,13 +57,20 @@ class RegressorClassifier(BaseEstimator, ClassifierMixin):
c[c>(self.nclasses-1)]=self.nclasses-1
return c.astype(np.int)
def predict_proba(self, X):
# def predict_proba(self, X):
# r = self.regressor.predict(X)
# nC = len(self.classes_)
# r = np.clip(r, 0, nC - 1)
# dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
# invdist = 1 - dists
# invdist[invdist < 0] = 0
# return invdist
def decision_function(self, X):
r = self.regressor.predict(X)
nC = len(self.classes_)
r = np.clip(r, 0, nC - 1)
dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
invdist = 1 - dists
invdist[invdist < 0] = 0
return invdist
@property
@ -165,8 +78,118 @@ class RegressorClassifier(BaseEstimator, ClassifierMixin):
return np.arange(self.nclasses)
def get_params(self, deep=True):
return self.regressor.get_params()
return {'C':self.C, 'class_weight': self.class_weight}
def set_params(self, **params):
self.regressor.set_params(**params)
self.C = params['C']
self.class_weight = params['class_weight']
class OrdinalRidge(BaseEstimator, ClassifierMixin):
    """
    Ordinal 'classifier' built on top of a Ridge regressor: the regressor is
    fitted on the integer labels, and predictions are obtained by rounding and
    clipping the regression output into [0, nclasses-1].

    :param alpha: Ridge regularization strength
    :param class_weight: None or 'balanced' (emulated via sample weights)
    :param normalize: forwarded to Ridge
    """

    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize

    def fit(self, X, y, sample_weight=None):
        """Fits the underlying Ridge regressor; 'balanced' class_weight is
        converted into per-instance sample weights (overriding sample_weight)."""
        # NOTE(review): Ridge's `normalize` parameter is deprecated and removed
        # in recent scikit-learn versions; keeping it for backward compatibility,
        # but migrating to a StandardScaler pipeline should be considered.
        self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            # NOTE(review): indexing class_weight with y assumes labels are the
            # consecutive integers 0..nclasses-1 -- confirm against callers
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Rounds the regression output and clips it to the valid label range."""
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c < 0] = 0
        c[c > (self.nclasses - 1)] = self.nclasses - 1
        # fix: `np.int` was a deprecated alias removed in NumPy 1.24; use the
        # builtin int (same resulting dtype on all supported platforms)
        return c.astype(int)

    def decision_function(self, X):
        # per-class score = 1 - |class index - regression output|; classes more
        # than one unit away get negative scores (unlike RegressorClassifier,
        # this version neither clips r nor zeroes negatives -- presumably
        # intentional for decision scores; TODO confirm)
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        # classes are assumed to be the consecutive integers 0..nclasses-1
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'alpha': self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}

    def set_params(self, **params):
        self.alpha = params['alpha']
        self.class_weight = params['class_weight']
        self.normalize = params['normalize']
# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
class LogisticAT(mord.LogisticAT):
    """
    mord's All-Threshold ordinal logistic regression, extended with an optional
    'balanced' class_weight emulated through per-instance sample weights.
    """
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticAT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        # when 'balanced', the computed class weights override any user-supplied
        # sample_weight
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            # NOTE(review): indexing class_weight with y assumes labels are the
            # consecutive integers 0..n_classes-1 -- confirm against callers
            sample_weight = class_weight[y]
        return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)
class LogisticSE(mord.LogisticSE):
    """
    mord's LogisticSE ordinal regression variant, extended with an optional
    'balanced' class_weight emulated through per-instance sample weights.
    """
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticSE, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        # when 'balanced', the computed class weights override any user-supplied
        # sample_weight
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            # NOTE(review): indexing class_weight with y assumes labels are the
            # consecutive integers 0..n_classes-1 -- confirm against callers
            sample_weight = class_weight[y]
        return super(LogisticSE, self).fit(X, y, sample_weight=sample_weight)
class LogisticIT(mord.LogisticIT):
    """
    mord's Immediate-Threshold ordinal logistic regression, extended with an
    optional 'balanced' class_weight emulated through per-instance sample weights.
    """
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticIT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        # when 'balanced', the computed class weights override any user-supplied
        # sample_weight
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            # NOTE(review): indexing class_weight with y assumes labels are the
            # consecutive integers 0..n_classes-1 -- confirm against callers
            sample_weight = class_weight[y]
        return super(LogisticIT, self).fit(X, y, sample_weight=sample_weight)
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
# class LAD(mord.LAD):
# def fit(self, X, y):
# self.classes_ = sorted(np.unique(y))
# return super().fit(X, y)
# class OrdinalRidge(mord.OrdinalRidge):
# def fit(self, X, y):
# self.classes_ = sorted(np.unique(y))
# return super().fit(X, y)

View File

@ -1,7 +1,7 @@
import numpy as np
import quapy as qp
from Ordinal.evaluation import nmd
from Ordinal.utils import load_samples_pkl
from evaluation import nmd
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from quapy.data import LabelledCollection
import pickle
import os
@ -9,28 +9,39 @@ from os.path import join
from tqdm import tqdm
"""
This script generates a partition of a dataset in terms of "shift".
The partition is only carried out by generating index vectors.
"""
def partition_by_drift(split, training_prevalence):
assert split in ['dev', 'test'], 'invalid split name'
total=1000 if split=='dev' else 5000
drifts = []
for sample in tqdm(load_samples_pkl(join(datapath, domain, 'app', f'{split}_samples')), total=total):
folderpath = join(datapath, domain, 'app', f'{split}_samples')
for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
drifts.append(nmd(training_prevalence, sample.prevalence()))
drifts = np.asarray(drifts)
order = np.argsort(drifts)
nD = len(order)
low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
all_drift = np.arange(nD)
np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift)
lows = drifts[low_drift]
mids = drifts[mid_drift]
highs = drifts[high_drift]
all = drifts[all_drift]
print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')
domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
datapath = './data'
training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))

View File

@ -0,0 +1,41 @@
import numpy as np
from Ordinal.evaluation import smoothness
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from os.path import join
from tqdm import tqdm
"""
This script generates a partition of a dataset in terms of "smoothness".
The partition is only carried out by generating index vectors.
"""
def partition_by_smoothness(split):
    """
    Computes the smoothness of each sample's prevalence in the given split,
    sorts the samples by smoothness, splits them into 5 equal-sized blocks
    (low to high), and saves the index vector of each block (plus an 'all'
    vector covering every sample) as .npy files.

    :param split: either 'dev' or 'test'
    """
    assert split in ['dev', 'test'], 'invalid split name'
    total=1000 if split=='dev' else 5000  # expected number of samples (tqdm total only)
    smooths = []
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
        smooths.append(smoothness(sample.prevalence()))
    smooths = np.asarray(smooths)
    order = np.argsort(smooths)
    nD = len(order)
    low2high_smooth = np.array_split(order, 5)
    all_drift = np.arange(nD)
    for i, smooth_idx in enumerate(low2high_smooth):
        block = smooths[smooth_idx]
        print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
        np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), smooth_idx)
    np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), all_drift)
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'
#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
partition_by_smoothness('dev')
partition_by_smoothness('test')

View File

@ -0,0 +1,51 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import *
from tqdm import tqdm
import shutil
"""
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into dense vectors
extracted from a pretrained model (here we use the RoBERTa fine-tuned on the training set)
Three vector generation modes are available: posteriors, last, average
"""
vector_generation = 'posteriors'
datapath = './data'
domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
outname = domain.replace('-finetuned', '-finetuned-pkl')
protocol = 'app'
print('pickling npy txt files')
print('from:', join(datapath, domain))
print('to', join(datapath, outname))
print('for protocol:', protocol)
os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, protocol), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))
train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
def transform_folder_samples(protocol, splitname):
    """
    Converts every npy/txt sample of the given split into a pickled
    LabelledCollection, written under the output domain folder.

    :param protocol: sampling protocol subfolder (e.g. 'app')
    :param splitname: 'dev_samples' or 'test_samples'
    """
    folder_dir = join(datapath, domain, protocol, splitname)
    for i, sample in tqdm(enumerate(load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_))):
        # fix: use a context manager so each output file handle is closed
        # deterministically (the previous version leaked the open file)
        with open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
transform_folder_samples(protocol, 'dev_samples')
transform_folder_samples(protocol, 'test_samples')

View File

@ -1,14 +1,20 @@
import quapy as qp
from Ordinal.utils import load_simple_sample_raw
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples
from tqdm import tqdm
import shutil
"""
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into tfidf vectors.
"""
datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'
@ -40,7 +46,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic
def transform_folder_samples(protocol, splitname):
for i, sample in tqdm(enumerate(load_samples(join(datapath, domain, protocol, splitname), classes=train.classes_))):
for i, sample in tqdm(enumerate(load_simple_sample_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
sample.instances = tfidf.transform(sample.instances)
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)

374
Ordinal/tabular.py Normal file
View File

@ -0,0 +1,374 @@
import numpy as np
import itertools
from scipy.stats import ttest_ind_from_stats, wilcoxon
class Table:
    """
    Accumulates per-(benchmark, method) result vectors and renders them as
    LaTeX tables, with optional per-row ranking, red-to-green cell coloring,
    statistical significance testing (Wilcoxon or t-test against the per-row
    best), and an extra per-column average row.
    """
    VALID_TESTS = [None, "wilcoxon", "ttest"]

    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='wilcoxon', prec_mean=3,
                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                 color=True, show_rel_to=-1):
        assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'

        self.benchmarks = np.asarray(benchmarks)
        self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}

        self.methods = np.asarray(methods)
        self.method_index = {col: j for j, col in enumerate(methods)}

        self.map = {}
        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
        self._addmap('values', dtype=object)
        self.lower_is_better = lower_is_better
        self.ttest = significance_test
        self.prec_mean = prec_mean
        self.clean_zero = clean_zero
        self.show_std = show_std
        self.prec_std = prec_std
        self.add_average = average
        self.missing = missing
        self.missing_str = missing_str
        self.color = color
        self.show_rel_to = show_rel_to

        self.touch()

    @property
    def nbenchmarks(self):
        # number of table rows
        return len(self.benchmarks)

    @property
    def nmethods(self):
        # number of table columns
        return len(self.methods)

    def touch(self):
        # marks cached statistics as stale; they are lazily recomputed by update()
        self._modif = True

    def update(self):
        if self._modif:
            self.compute()

    def _getfilled(self):
        # (row, col) coordinates of the cells that actually contain values
        return np.argwhere(self.map['fill'])

    @property
    def values(self):
        return self.map['values']

    def _indexes(self):
        return itertools.product(range(self.nbenchmarks), range(self.nmethods))

    def _addmap(self, map, dtype, func=None):
        # allocates a (#rows,#cols) array under self.map[map]; when func is
        # given, fills each (filled) cell by applying func to the cell's values
        self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
        if func is None:
            return
        m = self.map[map]
        f = func
        indexes = self._indexes() if map == 'fill' else self._getfilled()
        for i, j in indexes:
            m[i, j] = f(self.values[i, j])

    def _addrank(self):
        # per-row 1-based rank of each filled column, ordered by mean value
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
            if not self.lower_is_better:
                ranked_cols_idx = ranked_cols_idx[::-1]
            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)

    def _addcolor(self):
        # assigns a red-to-green LaTeX cell color per row, normalized to the
        # row's min-max range (the best value gets the greenest tone)
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if filled_cols_idx.size == 0:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            minval = min(col_means)
            maxval = max(col_means)
            for col_idx in filled_cols_idx:
                val = self.map['mean'][i, col_idx]
                norm = (maxval - minval)
                if norm > 0:
                    normval = (val - minval) / norm
                else:
                    normval = 0.5  # all values equal: neutral tone
                if self.lower_is_better:
                    normval = 1 - normval
                self.map['color'][i, col_idx] = color_red2green_01(normval)

    def _run_ttest(self, row, col1, col2):
        # two-sample t-test computed from the cached summary statistics
        mean1 = self.map['mean'][row, col1]
        std1 = self.map['std'][row, col1]
        nobs1 = self.map['nobs'][row, col1]
        mean2 = self.map['mean'][row, col2]
        std2 = self.map['std'][row, col2]
        nobs2 = self.map['nobs'][row, col2]
        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
        return p_val

    def _run_wilcoxon(self, row, col1, col2):
        # paired Wilcoxon signed-rank test on the raw value vectors
        values1 = self.map['values'][row, col1]
        values2 = self.map['values'][row, col2]
        _, p_val = wilcoxon(values1, values2)
        return p_val

    def _add_statistical_test(self):
        # compares every filled column against the per-row best one and stores
        # the interpretation ('Diff'/'Sim'/'Same') in self.map['ttest']
        if self.ttest is None:
            return
        self.some_similar = [False] * self.nmethods
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if len(filled_cols_idx) <= 1:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            best_pos = filled_cols_idx[np.argmin(col_means)]
            for j in filled_cols_idx:
                if j == best_pos:
                    continue
                if self.ttest == 'ttest':
                    p_val = self._run_ttest(i, best_pos, j)
                else:
                    p_val = self._run_wilcoxon(i, best_pos, j)
                pval_outcome = pval_interpretation(p_val)
                self.map['ttest'][i, j] = pval_outcome
                if pval_outcome != 'Diff':
                    self.some_similar[j] = True

    def compute(self):
        # (re)derives every cached per-cell statistic from self.map['values']
        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
        self._addmap('mean', dtype=float, func=np.mean)
        self._addmap('std', dtype=float, func=np.std)
        self._addmap('nobs', dtype=float, func=len)
        self._addmap('rank', dtype=int, func=None)
        self._addmap('color', dtype=object, func=None)
        self._addmap('ttest', dtype=object, func=None)
        self._addmap('latex', dtype=object, func=None)
        self._addrank()
        self._addcolor()
        self._add_statistical_test()
        if self.add_average:
            self._addave()
        self._modif = False

    def _is_column_full(self, col):
        return all(self.map['fill'][:, self.method_index[col]])

    def _addave(self):
        # builds a single-row helper Table holding the per-column averages
        # (only for columns with a value in every benchmark)
        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
                    show_std=self.show_std)
        for col in self.methods:
            values = None
            if self._is_column_full(col):
                if self.ttest == 'ttest':
                    values = np.asarray(self.map['mean'][:, self.method_index[col]])
                else:  # wilcoxon
                    values = np.concatenate(self.values[:, self.method_index[col]])
            ave.add('ave', col, values)
        self.average = ave

    def add(self, benchmark, method, values):
        # appends result values to cell (benchmark, method); repeated calls
        # concatenate onto the existing values
        if values is not None:
            values = np.asarray(values)
            if values.ndim == 0:
                values = values.flatten()
        rid, cid = self._coordinates(benchmark, method)
        if self.map['values'][rid, cid] is None:
            self.map['values'][rid, cid] = values
        elif values is not None:
            self.map['values'][rid, cid] = np.concatenate([self.map['values'][rid, cid], values])
        self.touch()

    def get(self, benchmark, method, attr='mean'):
        # returns the requested cached statistic for a cell, or self.missing
        # when the cell is empty or the statistic is undefined
        self.update()
        assert attr in self.map, f'unknwon attribute {attr}'
        rid, cid = self._coordinates(benchmark, method)
        if self.map['fill'][rid, cid]:
            v = self.map[attr][rid, cid]
            if v is None or (isinstance(v, float) and np.isnan(v)):
                return self.missing
            return v
        else:
            return self.missing

    def _coordinates(self, benchmark, method):
        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
        assert method in self.method_index, f'method {method} out of range'
        rid = self.benchmark_index[benchmark]
        cid = self.method_index[method]
        return rid, cid

    def get_average(self, method, attr='mean'):
        self.update()
        if self.add_average:
            return self.average.get('ave', method, attr=attr)
        return None

    def get_color(self, benchmark, method):
        color = self.get(benchmark, method, attr='color')
        if color is None:
            return ''
        return color

    def latexCell(self, benchmark, method):
        # renders one table cell: formatted mean, bold if (statistically tied
        # with the) best, optional std, optional relative difference, color
        self.update()
        i, j = self._coordinates(benchmark, method)
        if self.map['fill'][i, j] == False:
            return self.missing_str

        mean = self.map['mean'][i, j]
        l = f" {mean:.{self.prec_mean}f}"
        if self.clean_zero:
            l = l.replace(' 0.', '.')

        isbest = self.map['rank'][i, j] == 1

        if self.ttest is not None:  # and self.some_similar[j]:
            test_label = self.map['ttest'][i, j]
            if test_label in ['Sim', 'Same']:
                isbest = True

        if isbest:
            l = "\\textbf{" + l.strip() + "}\;"
        else:
            l += '\; '

        stat = ''
        # this is commented because we are putting in textbf all results that are similar to the best one
        # if self.ttest is not None:  # and self.some_similar[j]:
        #     test_label = self.map['ttest'][i, j]
        #     if test_label == 'Sim':
        #         stat = '^{\dag\phantom{\dag}}'
        #     elif test_label == 'Same':
        #         stat = '^{\ddag}'
        #     elif isbest or test_label == 'Diff':
        #         stat = '^{\phantom{\ddag}}'

        std = ''
        if self.show_std:
            std = self.map['std'][i, j]
            std = f" {std:.{self.prec_std}f}"
            if self.clean_zero:
                std = std.replace(' 0.', '.')
            std = f" \pm {std:{self.prec_std}}"

        relto = ''
        if self.show_rel_to != -1:
            # show the relative % difference w.r.t. the reference column
            if j != self.show_rel_to:
                ref_ave = self.map['mean'][i, self.show_rel_to]
                rel = 100*(mean-ref_ave)/ref_ave
                if abs(rel) < 0.1:
                    relto=f'(\\approx)'
                else:
                    plussign = '+' if rel>0 else ''  # already plugs the '-' sign
                    relto=f'({plussign}{rel:.1f}\%)'
                std = ''

        if stat != '' or std != '' or relto != '':
            l = f'{l}${stat}{std}{relto}$'

        if self.color:
            l += ' ' + self.map['color'][i, j]

        return l

    def latexTabular(self, benchmark_replace={}, method_replace={}, average=True):
        # standard layout: benchmarks as rows, methods as columns
        tab = ' & '
        tab += ' & '.join([method_replace.get(col, col) for col in self.methods])
        tab += ' \\\\\hline\n'
        for row in self.benchmarks:
            rowname = benchmark_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRow(row)

        if average:
            tab += '\hline\n'
            tab += 'Average & '
            tab += self.latexAverage()
        return tab

    def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
        # transposed layout: methods as rows, benchmarks as columns
        def withside(label):
            return '\side{'+label+'}' if side else label

        def center(label):
            return '\multicolumn{1}{c}{'+label+'}'

        tab = ' & '
        tab += ' & '.join([center(withside(benchmark_replace.get(col, col))) for col in self.benchmarks])
        if average:
            tab += ' & ' + withside('Ave')
        # tab += ' \\\\\hline\n'
        tab += ' \\\\\midrule\n'
        for row in self.methods:
            rowname = method_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRowT(row, endl='')
            if average:
                tab += ' & '
                tab += self.average.latexCell('ave', row)
            # tab += '\\\\\hline\n'
            tab += '\\\\\n'
        tab += '\\bottomrule'
        return tab

    def latexRow(self, benchmark, endl='\\\\\hline\n'):
        s = [self.latexCell(benchmark, col) for col in self.methods]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexRowT(self, method, endl='\\\\\hline\n'):
        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexAverage(self, endl='\\\\\hline\n'):
        # returns None when add_average is False
        if self.add_average:
            return self.average.latexRow('ave', endl=endl)

    def getRankTable(self):
        # builds a new Table holding per-row ranks instead of raw values
        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=0, average=True)
        for rid, cid in self._getfilled():
            row = self.benchmarks[rid]
            col = self.methods[cid]
            t.add(row, col, self.get(row, col, 'rank'))
        t.compute()
        return t

    def dropMethods(self, methods):
        # removes the given method columns (the remaining keep their order)
        drop_index = [self.method_index[m] for m in methods]
        new_methods = np.delete(self.methods, drop_index)
        new_index = {col: j for j, col in enumerate(new_methods)}

        self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
        self.methods = new_methods
        self.method_index = new_index
        self.touch()
def pval_interpretation(p_val):
    """
    Maps a p-value onto a coarse significance label: 'Diff' (p <= 0.005),
    'Sim' (0.005 < p <= 0.05), or 'Same' (p > 0.05).
    """
    if p_val <= 0.005:
        return 'Diff'
    if p_val <= 0.05:
        return 'Sim'
    return 'Same'
def color_red2green_01(val, maxtone=50):
    """
    Returns a LaTeX \\cellcolor command shading a value in [0, 1] from red
    (val=0) through neutral (val=0.5) to green (val=1); returns None for NaN.

    :param val: a float in [0, 1]
    :param maxtone: the maximum color intensity (at val=0 or val=1)
    """
    if np.isnan(val):
        return None
    assert 0 <= val <= 1, f'val {val} out of range [0,1]'

    signed = 2 * val - 1  # rescale to [-1, 1]; the sign selects the color
    color, intensity = ('red', -signed) if signed < 0 else ('green', signed)
    return '\\cellcolor{' + color + f'!{int(maxtone * intensity)}' + '}'

View File

@ -1,22 +1,64 @@
import quapy as qp
from quapy.data import LabelledCollection
import numpy as np
from glob import glob
from json import load
import os
from os.path import join
import pickle
import pandas as pd
import csv
import datasets
from datasets import Dataset
import quapy as qp
from quapy.data import LabelledCollection
def load_samples(path_dir, classes):
nsamples = len(glob(join(path_dir, f'*.txt')))
def load_simple_sample_npytxt(parentdir, filename, classes=None):
    """
    Loads a sample stored as a whitespace-separated numeric text matrix whose
    first column holds the labels and whose remaining columns hold the
    (dense) feature vectors.

    :param parentdir: directory containing the sample
    :param filename: file name without the '.txt' extension
    :param classes: optional explicit class list forwarded to LabelledCollection
    :return: a LabelledCollection
    """
    samplepath = join(parentdir, filename+'.txt')
    yX = np.loadtxt(samplepath)
    X = yX[:,1:]
    y = yX[:,0].astype(np.int32)
    return LabelledCollection(instances=X, labels=y, classes_=classes)
def load_simple_sample_raw(parentdir, filename, classes=None):
    """
    Loads a raw-text sample (parsed by quapy's from_text reader) as a
    LabelledCollection.

    :param parentdir: directory containing the sample
    :param filename: file name without the '.txt' extension
    :param classes: optional explicit class list forwarded to the loader
    :return: a LabelledCollection
    """
    samplepath = join(parentdir, filename+'.txt')
    return LabelledCollection.load(samplepath, loader_func=qp.data.reader.from_text, classes=classes)
def load_single_sample_as_csv(parentdir, filename):
    """
    Loads a tab-separated sample (label <TAB> review text per line) as a
    HuggingFace Dataset with a single 'review' column, plus its labels.

    :param parentdir: directory containing the sample
    :param filename: file name without the '.txt' extension
    :return: (Dataset, single-column DataFrame with the labels)
    """
    samplepath = join(parentdir, filename+'.txt')
    df = pd.read_csv(samplepath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df.pop('labels').to_frame()

    features = datasets.Features({'review': datasets.Value('string')})
    sample = Dataset.from_pandas(df=df, features=features)

    return sample, labels
def load_single_sample_pkl(parentdir, filename):
    """
    Loads a pickled sample stored as <parentdir>/<filename>.pkl.

    :param parentdir: directory containing the pickle file
    :param filename: file name without the '.pkl' extension
    :return: the unpickled object
    """
    # fix: use a context manager so the file handle is closed deterministically
    # (the previous version leaked the open file object)
    with open(join(parentdir, filename + '.pkl'), 'rb') as fin:
        return pickle.load(fin)
# def load_samples_npytxt(path_dir, filter=None, classes=None):
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
# def load_samples_raw(path_dir, filter=None, classes=None):
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, load_fn_kwargs={'classes': classes})
# def load_samples_as_csv(path_dir, filter=None):
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)
# def load_samples_pkl(path_dir, filter=None):
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)
def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
nsamples = len(glob(join(path_dir, f'*')))
for id in range(nsamples):
yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
def load_samples_pkl(path_dir, filter=None):
nsamples = len(glob(join(path_dir, f'*.pkl')))
for id in range(nsamples):
if filter is not None:
if id not in filter:
continue
yield pickle.load(open(join(path_dir, f'{id}.pkl'), 'rb'))
if (filter is None) or id in filter:
yield load_fn(path_dir, f'{id}', **load_fn_kwargs)

View File

@ -183,7 +183,7 @@ def _training_helper(learner,
if not hasattr(learner, 'predict_proba'):
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
f'The learner will be calibrated.')
learner = CalibratedClassifierCV(learner, cv=5)
learner = CalibratedClassifierCV(learner, cv=5, ensemble=True)
if val_split is not None:
if isinstance(val_split, float):
if not (0 < val_split < 1):
@ -470,7 +470,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
def fit(self, data: LabelledCollection, fit_learner=True):
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
self.train_prevalence = F.prevalence_from_labels(data.labels, data.classes_)
return self
def aggregate(self, classif_posteriors, epsilon=EPSILON):