Compare commits
7 Commits
5df355a4e1
...
72c63fff09
| Author | SHA1 | Date |
|---|---|---|
|
|
72c63fff09 | |
|
|
85abaf2ba2 | |
|
|
b4c3e57343 | |
|
|
464bd60c7c | |
|
|
d949c77317 | |
|
|
ad64dfe2a0 | |
|
|
b2e161480e |
|
|
@ -1,14 +1,17 @@
|
|||
import gzip
|
||||
import quapy as qp
|
||||
from Ordinal.utils import load_simple_sample_raw
|
||||
from quapy.data import LabelledCollection
|
||||
import quapy.functional as F
|
||||
import os
|
||||
from os.path import join
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
|
||||
|
||||
datadir = '/mnt/1T/Datasets/Amazon/reviews'
|
||||
outdir = './data/'
|
||||
real_prev_path = './data/Books-real-prevalence-by-product_votes1_reviews100.csv'
|
||||
domain = 'Books'
|
||||
seed = 7
|
||||
|
||||
|
|
@ -18,13 +21,6 @@ te_size = 1000
|
|||
nval = 1000
|
||||
nte = 5000
|
||||
|
||||
# domain = 'Gift_Cards'
|
||||
# tr_size = 200
|
||||
# val_size = 100
|
||||
# te_size = 100
|
||||
# nval = 20
|
||||
# nte = 40
|
||||
|
||||
|
||||
def from_gz_text(path, encoding='utf-8', class2int=True):
|
||||
"""
|
||||
|
|
@ -70,7 +66,6 @@ def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
|
|||
write_txt_sample(sample, join(outdir, f'{i}.txt'))
|
||||
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
||||
|
||||
|
||||
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
with open(prevpath, 'wt') as prevfile:
|
||||
|
|
@ -80,37 +75,69 @@ def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
|
|||
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
||||
|
||||
|
||||
def gen_samples_real_prevalences(real_prevalences, pool: LabelledCollection, sample_size, outdir, prevpath_out):
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
with open(prevpath_out, 'wt') as prevfile:
|
||||
prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
|
||||
for i, prev in enumerate(real_prevalences):
|
||||
sample = pool.sampling(sample_size, *prev[:-1])
|
||||
write_txt_sample(sample, join(outdir, f'{i}.txt'))
|
||||
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
||||
|
||||
fullpath = join(datadir,domain)+'.txt.gz'
|
||||
|
||||
data = LabelledCollection.load(fullpath, from_gz_text)
|
||||
print(len(data))
|
||||
print(data.classes_)
|
||||
print(data.prevalence())
|
||||
# fullpath = join(datadir,domain)+'.txt.gz'
|
||||
#
|
||||
# data = LabelledCollection.load(fullpath, from_gz_text)
|
||||
# print(len(data))
|
||||
# print(data.classes_)
|
||||
# print(data.prevalence())
|
||||
|
||||
with qp.util.temp_seed(seed):
|
||||
train, rest = data.split_stratified(train_prop=tr_size)
|
||||
|
||||
devel, test = rest.split_stratified(train_prop=0.5)
|
||||
print(len(train))
|
||||
print(len(devel))
|
||||
print(len(test))
|
||||
|
||||
# train, rest = data.split_stratified(train_prop=tr_size)
|
||||
#
|
||||
# devel, test = rest.split_stratified(train_prop=0.5)
|
||||
# print(len(train))
|
||||
# print(len(devel))
|
||||
# print(len(test))
|
||||
#
|
||||
domaindir = join(outdir, domain)
|
||||
|
||||
write_txt_sample(train, join(domaindir, 'training_data.txt'))
|
||||
write_txt_sample(devel, join(domaindir, 'development_data.txt'))
|
||||
write_txt_sample(test, join(domaindir, 'test_data.txt'))
|
||||
# write_txt_sample(train, join(domaindir, 'training_data.txt'))
|
||||
# write_txt_sample(devel, join(domaindir, 'development_data.txt'))
|
||||
# write_txt_sample(test, join(domaindir, 'test_data.txt'))
|
||||
|
||||
gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
|
||||
prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
|
||||
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
|
||||
prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
|
||||
# this part is to be used when the partitions have already been created, in order to avoid re-generating them
|
||||
train = load_simple_sample_raw(domaindir, 'training_data')
|
||||
devel = load_simple_sample_raw(domaindir, 'development_data')
|
||||
test = load_simple_sample_raw(domaindir, 'test_data')
|
||||
|
||||
gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
|
||||
prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
|
||||
gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
|
||||
prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
|
||||
# gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
|
||||
# prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
|
||||
# gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
|
||||
# prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
|
||||
|
||||
# gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
|
||||
# prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
|
||||
# gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
|
||||
# prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
|
||||
|
||||
|
||||
# this part generates samples based on real prevalences (in this case, prevalences of sets of books reviews
|
||||
# groupped by product). It loads the real prevalences (computed elsewhere), and randomly extract 5000 for test
|
||||
# and 1000 for val (disjoint). Then realize the samplings
|
||||
|
||||
assert os.path.exists(real_prev_path), f'real prevalence file does not seem to exist...'
|
||||
real_prevalences = np.genfromtxt(real_prev_path, delimiter='\t')
|
||||
|
||||
nrows = real_prevalences.shape[0]
|
||||
rand_sel = np.random.permutation(nrows)
|
||||
real_prevalences_val = real_prevalences[rand_sel[:nval]]
|
||||
real_prevalences_te = real_prevalences[rand_sel[nval:nval+nte]]
|
||||
|
||||
gen_samples_real_prevalences(real_prevalences_val, devel, sample_size=val_size, outdir=join(domaindir, 'real', 'dev_samples'),
|
||||
prevpath_out=join(domaindir, 'real', 'dev_prevalences.txt'))
|
||||
gen_samples_real_prevalences(real_prevalences_te, test, sample_size=te_size, outdir=join(domaindir, 'real', 'test_samples'),
|
||||
prevpath_out=join(domaindir, 'real', 'test_prevalences.txt'))
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,116 @@
|
|||
import gzip
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from quapy.data import LabelledCollection
|
||||
import quapy.functional as F
|
||||
import os
|
||||
from os.path import join
|
||||
from pathlib import Path
|
||||
import pickle
|
||||
|
||||
|
||||
datadir = '../OrdinalQuantification'
|
||||
outdir = './data/'
|
||||
domain = 'fact'
|
||||
seed = 7
|
||||
|
||||
tr_size = 20000
|
||||
val_size = 1000
|
||||
te_size = 1000
|
||||
nval = 1000
|
||||
nte = 5000
|
||||
|
||||
|
||||
def from_csv(path):
|
||||
df = pd.read_csv(path)
|
||||
|
||||
# divide the continuous labels into ordered classes
|
||||
energy_boundaries = np.arange(start=2.4, stop=4.2, step=0.15)[1:-1]
|
||||
y = np.digitize(np.array(df['log10_energy'], dtype=np.float32), energy_boundaries)
|
||||
|
||||
# note: omitting the dtype will result in a single instance having a different class
|
||||
|
||||
# obtain a matrix of shape (n_samples, n_features)
|
||||
X = df.iloc[:, 1:].to_numpy().astype(np.float32)
|
||||
return X, y
|
||||
|
||||
|
||||
def write_pkl(sample: LabelledCollection, path):
|
||||
os.makedirs(Path(path).parent, exist_ok=True)
|
||||
pickle.dump(sample, open(path, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
with open(prevpath, 'wt') as prevfile:
|
||||
prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
|
||||
for i, prev in enumerate(F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)):
|
||||
sample = pool.sampling(sample_size, *prev)
|
||||
write_pkl(sample, join(outdir, f'{i}.pkl'))
|
||||
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
||||
|
||||
|
||||
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
with open(prevpath, 'wt') as prevfile:
|
||||
prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
|
||||
for i, sample in enumerate(pool.natural_sampling_generator(sample_size, repeats=nsamples)):
|
||||
write_pkl(sample, join(outdir, f'{i}.pkl'))
|
||||
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
||||
|
||||
|
||||
|
||||
fullpath = join(datadir,domain, 'fact_wobble.csv')
|
||||
|
||||
data = LabelledCollection.load(fullpath, from_csv)
|
||||
|
||||
if np.isnan(data.instances).any():
|
||||
rows, cols = np.where(np.isnan(data.instances))
|
||||
data.instances = np.delete(data.instances, rows, axis=0)
|
||||
data.labels = np.delete(data.labels, rows, axis=0)
|
||||
print('deleted nan rows')
|
||||
|
||||
if np.isnan(data.instances).any():
|
||||
rows, cols = np.where(np.isnan(data.instances))
|
||||
data.instances = np.delete(data.instances, rows, axis=0)
|
||||
data.labels = np.delete(data.labels, rows, axis=0)
|
||||
print('deleted nan rows')
|
||||
|
||||
if np.isinf(data.instances).any():
|
||||
rows, cols = np.where(np.isinf(data.instances))
|
||||
data.instances = np.delete(data.instances, rows, axis=0)
|
||||
data.labels = np.delete(data.labels, rows, axis=0)
|
||||
print('deleted inf rows')
|
||||
|
||||
|
||||
print(len(data))
|
||||
print(data.classes_)
|
||||
print(data.prevalence())
|
||||
|
||||
with qp.util.temp_seed(seed):
|
||||
train, rest = data.split_stratified(train_prop=tr_size)
|
||||
|
||||
devel, test = rest.split_stratified(train_prop=0.5)
|
||||
print(len(train))
|
||||
print(len(devel))
|
||||
print(len(test))
|
||||
|
||||
domaindir = join(outdir, domain)
|
||||
|
||||
write_pkl(train, join(domaindir, 'training_data.pkl'))
|
||||
write_pkl(devel, join(domaindir, 'development_data.pkl'))
|
||||
write_pkl(test, join(domaindir, 'test_data.pkl'))
|
||||
|
||||
gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
|
||||
prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
|
||||
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
|
||||
prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
|
||||
|
||||
gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
|
||||
prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
|
||||
gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
|
||||
prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,6 +1,11 @@
|
|||
import numpy as np
|
||||
|
||||
|
||||
# smoothing approximation
|
||||
def smoothness(p):
|
||||
return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))
|
||||
|
||||
|
||||
def _check_arrays(prevs):
|
||||
prevs = np.asarray(prevs)
|
||||
if prevs.ndim==1:
|
||||
|
|
@ -8,6 +13,7 @@ def _check_arrays(prevs):
|
|||
return prevs
|
||||
|
||||
|
||||
# mean normalized match distance
|
||||
def mnmd(prevs, prevs_hat):
|
||||
prevs = _check_arrays(prevs)
|
||||
prevs_hat = _check_arrays(prevs_hat)
|
||||
|
|
@ -17,6 +23,7 @@ def mnmd(prevs, prevs_hat):
|
|||
return np.mean(nmds)
|
||||
|
||||
|
||||
# normalized match distance
|
||||
def nmd(prev, prev_hat):
|
||||
n = len(prev)
|
||||
return (1./(n-1))*mdpa(prev, prev_hat)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,150 @@
|
|||
import numpy as np
|
||||
import quapy as qp
|
||||
import os
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from Ordinal.model import RegressionQuantification, LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge
|
||||
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
|
||||
from os.path import join
|
||||
from utils import load_samples_folder, load_single_sample_pkl
|
||||
from evaluation import nmd, mnmd
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
"""
|
||||
This script generates all results from Table 1 in the paper, i.e., all results comparing quantifiers equipped with
|
||||
standard logistic regression against quantifiers equipped with order-aware classifiers
|
||||
"""
|
||||
|
||||
def quantifiers():
|
||||
params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
|
||||
params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
|
||||
params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
|
||||
params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize':[True,False]}
|
||||
|
||||
# baselines
|
||||
yield 'CC(LR)', CC(LogisticRegression()), params_LR
|
||||
yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
|
||||
yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
|
||||
yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
|
||||
yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR
|
||||
|
||||
# with order-aware classifiers
|
||||
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
|
||||
yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
|
||||
yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
|
||||
yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
|
||||
yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
|
||||
yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
|
||||
|
||||
yield 'CC(OLR-SE)', CC(LogisticSE()), params_OLR
|
||||
yield 'PCC(OLR-SE)', PCC(LogisticSE()), params_OLR
|
||||
yield 'ACC(OLR-SE)', ACC(LogisticSE()), params_OLR
|
||||
yield 'PACC(OLR-SE)', PACC(LogisticSE()), params_OLR
|
||||
yield 'SLD(OLR-SE)', EMQ(LogisticSE()), params_OLR
|
||||
|
||||
yield 'CC(OLR-IT)', CC(LogisticIT()), params_OLR
|
||||
yield 'PCC(OLR-IT)', PCC(LogisticIT()), params_OLR
|
||||
yield 'ACC(OLR-IT)', ACC(LogisticIT()), params_OLR
|
||||
yield 'PACC(OLR-IT)', PACC(LogisticIT()), params_OLR
|
||||
yield 'SLD(OLR-IT)', EMQ(LogisticIT()), params_OLR
|
||||
# other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)
|
||||
|
||||
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
|
||||
yield 'CC(LAD)', CC(LAD()), params_SVR
|
||||
yield 'ACC(LAD)', ACC(LAD()), params_SVR
|
||||
yield 'CC(ORidge)', CC(OrdinalRidge()), params_Ridge
|
||||
yield 'ACC(ORidge)', ACC(OrdinalRidge()), params_Ridge
|
||||
|
||||
|
||||
def run_experiment(params):
|
||||
qname, q, param_grid = params
|
||||
qname += posfix
|
||||
resultfile = join(resultpath, f'{qname}.all.csv')
|
||||
if os.path.exists(resultfile):
|
||||
print(f'result file {resultfile} already exists: continue')
|
||||
return None
|
||||
|
||||
print(f'fitting {qname} for all-drift')
|
||||
|
||||
|
||||
def load_test_samples():
|
||||
folderpath = join(datapath, domain, protocol, 'test_samples')
|
||||
for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=5000):
|
||||
if posfix == '-std':
|
||||
sample.instances = zscore.transform(sample.instances)
|
||||
yield sample.instances, sample.prevalence()
|
||||
|
||||
|
||||
def load_dev_samples():
|
||||
folderpath = join(datapath, domain, protocol, 'dev_samples')
|
||||
for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=1000):
|
||||
if posfix == '-std':
|
||||
sample.instances = zscore.transform(sample.instances)
|
||||
yield sample.instances, sample.prevalence()
|
||||
|
||||
q = qp.model_selection.GridSearchQ(
|
||||
q,
|
||||
param_grid,
|
||||
sample_size=1000,
|
||||
protocol='gen',
|
||||
error=mnmd,
|
||||
val_split=load_dev_samples,
|
||||
n_jobs=-1,
|
||||
refit=False,
|
||||
timeout=60*60*2,
|
||||
verbose=True).fit(train)
|
||||
|
||||
hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}'
|
||||
|
||||
print('[done]')
|
||||
|
||||
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||
mean_nmd = report['nmd'].mean()
|
||||
std_nmd = report['nmd'].std()
|
||||
print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||
report.to_csv(resultfile, index=False)
|
||||
|
||||
print('[learning regressor-based adjustment]')
|
||||
q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
|
||||
q.fit(None)
|
||||
|
||||
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||
mean_nmd = report['nmd'].mean()
|
||||
std_nmd = report['nmd'].std()
|
||||
print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||
resultfile = join(resultpath, f'{qname}.all.reg.csv')
|
||||
report.to_csv(resultfile, index=False)
|
||||
|
||||
return hyperparams
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
|
||||
#domain = 'Books-tfidf'
|
||||
posfix = ''
|
||||
|
||||
# domain = 'fact'
|
||||
# posfix = '-std' # set to '' to avoid standardization
|
||||
# posfix = ''
|
||||
|
||||
load_sample_fn = load_single_sample_pkl
|
||||
datapath = './data'
|
||||
protocol = 'app'
|
||||
resultpath = join('./results', domain, protocol)
|
||||
os.makedirs(resultpath, exist_ok=True)
|
||||
|
||||
train = load_sample_fn(join(datapath, domain), 'training_data')
|
||||
|
||||
if posfix=='-std':
|
||||
zscore = StandardScaler()
|
||||
train.instances = zscore.fit_transform(train.instances)
|
||||
|
||||
with open(join(resultpath, 'hyper.txt'), 'at') as foo:
|
||||
hypers = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3)
|
||||
for h in hypers:
|
||||
if h is not None:
|
||||
foo.write(h)
|
||||
foo.write('\n')
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
import csv
|
||||
import sys
|
||||
import datasets
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch.cuda
|
||||
from datasets import Dataset, DatasetDict
|
||||
from sklearn.metrics import f1_score
|
||||
from sklearn.model_selection import train_test_split
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
from transformers import AutoTokenizer, DataCollatorWithPadding
|
||||
from transformers import Trainer
|
||||
from transformers import TrainingArguments
|
||||
|
||||
|
||||
"""
|
||||
This script fine-tunes a pre-trained language model on a given textual training set.
|
||||
The training goes for a maximum of 5 epochs, but stores the model parameters of the best performing epoch according
|
||||
to the validation loss in a hold-out val split of 1000 documents (stratified).
|
||||
|
||||
We used it with RoBERTa in the training set of the Amazon-OQ-BK domain, i.e.:
|
||||
$> python3 ./data/Books/training_data.txt roberta-base
|
||||
"""
|
||||
|
||||
|
||||
def tokenize_function(example):
|
||||
tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else 256)
|
||||
return tokens
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
logits, labels = eval_preds
|
||||
preds = np.argmax(logits, axis=-1)
|
||||
return {
|
||||
'macro-f1': f1_score(labels, preds, average='macro'),
|
||||
'micro-f1': f1_score(labels, preds, average='micro'),
|
||||
}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
debug = False
|
||||
assert torch.cuda.is_available(), 'cuda is not available'
|
||||
|
||||
# datapath = './data/Books/training_data.txt'
|
||||
# checkpoint = 'roberta-base'
|
||||
n_args = len(sys.argv)
|
||||
assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'
|
||||
|
||||
datapath = sys.argv[1] # './data/Books/training_data.txt'
|
||||
checkpoint = sys.argv[2] #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
|
||||
|
||||
modelout = checkpoint+'-val-finetuned'
|
||||
|
||||
# load the training set, and extract a held-out validation split of 1000 documents (stratified)
|
||||
df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
|
||||
labels = df['labels'].to_frame()
|
||||
X_train, X_val = train_test_split(df, stratify=labels, test_size=.25, random_state=1)
|
||||
num_labels = len(pd.unique(labels['labels']))
|
||||
|
||||
features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
|
||||
train = Dataset.from_pandas(df=X_train, split='train', features=features)
|
||||
validation = Dataset.from_pandas(df=X_val, split='validation', features=features)
|
||||
|
||||
dataset = DatasetDict({
|
||||
'train': train.select(range(500)) if debug else train,
|
||||
'validation': validation.select(range(500)) if debug else validation
|
||||
})
|
||||
|
||||
# tokenize the dataset
|
||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()
|
||||
|
||||
# fine-tuning
|
||||
training_args = TrainingArguments(
|
||||
modelout,
|
||||
learning_rate=2e-5,
|
||||
num_train_epochs=5,
|
||||
weight_decay=0.01,
|
||||
evaluation_strategy='epoch',
|
||||
save_strategy='epoch',
|
||||
per_device_train_batch_size=16,
|
||||
per_device_eval_batch_size=16,
|
||||
# eval_steps=10,
|
||||
save_total_limit=1,
|
||||
load_best_model_at_end=True
|
||||
)
|
||||
trainer = Trainer(
|
||||
model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_datasets['train'],
|
||||
eval_dataset=tokenized_datasets['validation'],
|
||||
data_collator=DataCollatorWithPadding(tokenizer),
|
||||
tokenizer=tokenizer,
|
||||
compute_metrics=compute_metrics
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
import pandas as pd
|
||||
from os.path import join
|
||||
import os
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
|
||||
from Ordinal.main import quantifiers
|
||||
from Ordinal.tabular import Table
|
||||
|
||||
"""
|
||||
This script generates some tables for Amazon-OQ-BK (for internal use only)
|
||||
"""
|
||||
|
||||
domain = 'Books-tfidf'
|
||||
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
|
||||
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
|
||||
domain_bert_post = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
|
||||
prot = 'app'
|
||||
outpath = f'./tables/{domain}/{prot}/results.tex'
|
||||
|
||||
resultpath = join('./results', domain, prot)
|
||||
resultpath_bertlast = join('./results', domain_bert_last, prot)
|
||||
resultpath_bertave = join('./results', domain_bert_ave, prot)
|
||||
resultpath_bertpost = join('./results', domain_bert_post, prot)
|
||||
|
||||
methods = [qname for qname, *_ in quantifiers()]
|
||||
methods += ['SLD(LR)-agg']
|
||||
methods_Rlast = [m+'-RoBERTa-last' for m in methods]
|
||||
methods_Rave = [m+'-RoBERTa-average' for m in methods]
|
||||
methods_Rpost = [m+'-RoBERTa-posteriors' for m in methods]
|
||||
methods = methods + methods_Rlast + methods_Rave + methods_Rpost
|
||||
# methods += [m+'-r' for m in methods]
|
||||
|
||||
table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)
|
||||
|
||||
resultfiles = list(glob(f'{resultpath}/*.csv')) \
|
||||
+ list(glob(f'{resultpath_bertlast}/*.csv')) \
|
||||
+ list(glob(f'{resultpath_bertave}/*.csv')) \
|
||||
+ list(glob(f'{resultpath_bertpost}/*.csv'))
|
||||
|
||||
for resultfile in resultfiles:
|
||||
df = pd.read_csv(resultfile)
|
||||
nmd = df['nmd'].values
|
||||
resultname = Path(resultfile).name
|
||||
method, drift, *other = resultname.replace('.csv', '').split('.')
|
||||
if other:
|
||||
method += '-r'
|
||||
if method not in methods:
|
||||
continue
|
||||
|
||||
table.add(drift, method, nmd)
|
||||
|
||||
os.makedirs(Path(outpath).parent, exist_ok=True)
|
||||
|
||||
tabular = """
|
||||
\\resizebox{\\textwidth}{!}{%
|
||||
\\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \hline
|
||||
"""
|
||||
tabular += table.latexTabularT(average=False)
|
||||
tabular += """
|
||||
\end{tabular}%
|
||||
}"""
|
||||
|
||||
print('saving table in', outpath)
|
||||
with open(outpath, 'wt') as foo:
|
||||
foo.write(tabular)
|
||||
foo.write('\n')
|
||||
|
||||
print('[done]')
|
||||
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
import pandas as pd
|
||||
from os.path import join
|
||||
import os
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
|
||||
from Ordinal.experiments_lr_vs_ordlr import quantifiers
|
||||
from Ordinal.tabular import Table
|
||||
|
||||
"""
|
||||
This script generates some tables for Fact-OQ (for internal use only)
|
||||
"""
|
||||
|
||||
#domain = 'fact'
|
||||
#domain = 'Books-tfidf'
|
||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
|
||||
prot = 'app'
|
||||
outpath = f'./tables/{domain}/{prot}/results.tex'
|
||||
|
||||
resultpath = join('./results', domain, prot)
|
||||
|
||||
withstd=False
|
||||
|
||||
methods = [qname for qname, *_ in quantifiers()]
|
||||
if withstd:
|
||||
methods = [m+'-std' for m in methods]
|
||||
#methods = methods + methods_variant
|
||||
# methods += [m+'-r' for m in methods]
|
||||
|
||||
quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
|
||||
# method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD']
|
||||
method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']
|
||||
if withstd:
|
||||
method_variants = [m+'-std' for m in method_variants]
|
||||
|
||||
print('families:', quantifiers_families)
|
||||
print('variants', method_variants)
|
||||
table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4,
|
||||
color=False, show_rel_to=0, missing_str='\multicolumn{1}{c}{---}', clean_zero=True)
|
||||
|
||||
resultfiles = list(glob(f'{resultpath}/*).all.csv'))
|
||||
|
||||
for resultfile in resultfiles:
|
||||
df = pd.read_csv(resultfile)
|
||||
nmd = df['nmd'].values
|
||||
resultname = Path(resultfile).name
|
||||
|
||||
method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average','').split('.')
|
||||
if drift!='all':
|
||||
continue
|
||||
if other:
|
||||
method += '-r'
|
||||
if method not in methods:
|
||||
continue
|
||||
|
||||
family, variant = method.split('(')
|
||||
variant = variant.replace(')', '')
|
||||
if variant not in method_variants:
|
||||
continue
|
||||
table.add(family, variant, nmd)
|
||||
|
||||
os.makedirs(Path(outpath).parent, exist_ok=True)
|
||||
|
||||
tabular = """
|
||||
\\resizebox{\\textwidth}{!}{%
|
||||
|
||||
\\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """}
|
||||
\\toprule
|
||||
"""
|
||||
|
||||
tabular += table.latexTabularT(average=False)
|
||||
tabular += """
|
||||
\end{tabular}%
|
||||
}"""
|
||||
|
||||
print('saving table in', outpath)
|
||||
with open(outpath, 'wt') as foo:
|
||||
foo.write(tabular)
|
||||
foo.write('\n')
|
||||
|
||||
print('[done]')
|
||||
|
||||
|
|
@ -0,0 +1,152 @@
|
|||
import sys
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import AutoTokenizer
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
from os.path import join
|
||||
import os
|
||||
import shutil
|
||||
from tqdm import tqdm
|
||||
|
||||
from Ordinal.utils import load_samples_folder, load_single_sample_as_csv
|
||||
|
||||
|
||||
"""
|
||||
This scripts takes a pre-trained model (a fine-tuned one) and generates numerical representations for all
|
||||
samples in the dataset. The representations are saved in npy-txt plain format.
|
||||
"""
|
||||
|
||||
|
||||
def tokenize_function(example):
|
||||
tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else None, return_tensors='pt')
|
||||
return {
|
||||
'input_ids': tokens.input_ids.cuda(),
|
||||
'attention_mask': tokens.attention_mask.cuda()
|
||||
}
|
||||
|
||||
|
||||
def save_samples_as_txt(tensors, labels, path):
|
||||
vectors = tensors
|
||||
labels = labels.values
|
||||
vec_lab = np.hstack([labels, vectors])
|
||||
n_cols = vectors.shape[1]
|
||||
np.savetxt(path, vec_lab, fmt=['%d']+['%f']*n_cols)
|
||||
|
||||
|
||||
def transform_sample(instances, labels, outpath, batch_size=50):
|
||||
ndocs = len(labels)
|
||||
batches = ndocs // batch_size
|
||||
assert ndocs % batches == 0, 'fragmented last bach not supported'
|
||||
|
||||
transformations = []
|
||||
for batch_id in range(0, ndocs, batch_size):
|
||||
|
||||
batch_instances = instances[batch_id:batch_id + batch_size]
|
||||
|
||||
tokenized_dataset = tokenize_function(batch_instances)
|
||||
out = model(**tokenized_dataset, output_hidden_states=True)
|
||||
|
||||
if generation_mode == 'posteriors':
|
||||
logits = out.logits
|
||||
posteriors = torch.softmax(logits, dim=-1)
|
||||
transformed = posteriors
|
||||
elif generation_mode == 'last':
|
||||
hidden_states = out.hidden_states
|
||||
last_layer_cls = hidden_states[-1][:, 0, :]
|
||||
transformed = last_layer_cls
|
||||
elif generation_mode == 'average':
|
||||
hidden_states = out.hidden_states
|
||||
hidden_states = torch.stack(hidden_states)
|
||||
all_layer_cls = hidden_states[:, :, 0, :]
|
||||
average_cls = torch.mean(all_layer_cls, dim=0)
|
||||
transformed = average_cls
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
transformations.append(transformed.cpu().numpy())
|
||||
|
||||
transformations = np.vstack(transformations)
|
||||
save_samples_as_txt(transformations, labels, outpath)
|
||||
|
||||
|
||||
def transform_folder_samples(protocol, splitname, skip=0):
|
||||
in_folder = join(datapath, domain, protocol, splitname)
|
||||
out_folder = join(datapath, outname, protocol, splitname)
|
||||
total = 1000 if splitname.startswith('dev') else 5000
|
||||
|
||||
for i, (instances, labels) in tqdm(enumerate(
|
||||
load_samples_folder(in_folder, load_fn=load_single_sample_as_csv)), desc=f'{protocol} {splitname}', total=total):
|
||||
if i>= skip:
|
||||
transform_sample(instances, labels, outpath=join(out_folder, f'{i}.txt'))
|
||||
|
||||
|
||||
def get_best_checkpoint(checkpointdir):
|
||||
from glob import glob
|
||||
steps = []
|
||||
for folder in glob(f'{checkpointdir}/checkpoint-*'):
|
||||
step=int(folder.split('checkpoint-')[1])
|
||||
steps.append(step)
|
||||
assert len(steps) <= 2, 'unexpected number of steps, only two where expected (the best one and the last one)'
|
||||
choosen = f'{checkpointdir}/checkpoint-{min(steps)}'
|
||||
print(f'choosen checkpoint is {choosen}')
|
||||
return choosen
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Entry point: convert the raw Amazon-OQ-BK samples into dense vectors using a
    # fine-tuned transformer checkpoint, for one of three generation modes.
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    argc = len(sys.argv)
    assert argc==3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
                    '\tgeneration-mode: last (last layer), ave (average pooling), or posteriors (posterior probabilities)'

    checkpoint = sys.argv[1]       # e.g., 'bert-base-uncased'
    generation_mode = sys.argv[2]  # e.g., 'last'

    # only fine-tuned models make sense here; the raw pretrained head is untrained
    assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'
    checkpoint = get_best_checkpoint(checkpoint)

    num_labels = 5
    datapath = './data'
    domain = 'Books'
    protocols = ['real']  # ['app', 'npp']

    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_model'
    outname = domain + f'-{checkpoint}-{generation_mode}'

    with torch.no_grad():
        print('loading', checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

        os.makedirs(join(datapath, outname), exist_ok=True)

        print('transforming the training set')
        instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
        transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
        print('[done]')

        for protocol in protocols:
            in_path = join(datapath, domain, protocol)
            out_path = join(datapath, outname, protocol)
            os.makedirs(out_path, exist_ok=True)
            os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
            os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
            # prevalence files are copied verbatim; only the instances are re-encoded
            shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
            shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))

            print('processing', protocol)
            transform_folder_samples(protocol, 'dev_samples')
            transform_folder_samples(protocol, 'test_samples')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.data.reader import from_text
|
||||
from quapy.functional import strprev
|
||||
|
||||
# Sanity check: load the raw training split of one category and report its size
# and class prevalence.
cat = 'Books'
root = './data'

train_file = f'{root}/{cat}/training_data.txt'

collection = LabelledCollection.load(train_file, loader_func=from_text)

print(len(collection))
print(strprev(collection.prevalence()))
|
||||
|
||||
|
||||
217
Ordinal/main.py
217
Ordinal/main.py
|
|
@ -3,87 +3,154 @@ from sklearn.linear_model import LogisticRegression
|
|||
import quapy as qp
|
||||
import numpy as np
|
||||
|
||||
from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier
|
||||
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
|
||||
from Ordinal.model import OrderedLogisticRegression, LogisticAT
|
||||
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
|
||||
from quapy.data import LabelledCollection
|
||||
from os.path import join
|
||||
from utils import load_samples, load_samples_pkl
|
||||
import os
|
||||
from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
|
||||
from evaluation import nmd, mnmd
|
||||
from time import time
|
||||
import pickle
|
||||
from tqdm import tqdm
|
||||
|
||||
domain = 'Books-tfidf'
|
||||
datapath = './data'
|
||||
protocol = 'app'
|
||||
drift = 'high'
|
||||
|
||||
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
|
||||
|
||||
|
||||
def load_test_samples():
    """Yield (instances, prevalence) pairs for the test samples of the selected drift bin."""
    sample_ids = set(np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy')))
    sample_gen = load_samples_pkl(join(datapath, domain, protocol, 'test_samples'), filter=sample_ids)
    for sample in tqdm(sample_gen, total=len(sample_ids)):
        yield sample.instances, sample.prevalence()
|
||||
|
||||
|
||||
def load_dev_samples():
    """Yield (instances, prevalence) pairs for the dev samples of the selected drift bin."""
    sample_ids = set(np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy')))
    sample_gen = load_samples_pkl(join(datapath, domain, protocol, 'dev_samples'), filter=sample_ids)
    for sample in tqdm(sample_gen, total=len(sample_ids)):
        yield sample.instances, sample.prevalence()
|
||||
|
||||
|
||||
print('fitting the quantifier')
|
||||
|
||||
# q = EMQ(LogisticRegression(class_weight='balanced'))
|
||||
# q = PACC(LogisticRegression(class_weight='balanced'))
|
||||
q = PACC(OrderedLogisticRegression())
|
||||
# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
|
||||
# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
|
||||
# q = ACC(RegressorClassifier())
|
||||
|
||||
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
|
||||
# param_grid = {'C': np.logspace(-3,3,14)}
|
||||
# param_grid = {'alpha':np.logspace(-8, 6, 15)}
|
||||
|
||||
# q = qp.model_selection.GridSearchQ(
|
||||
# q,
|
||||
# param_grid,
|
||||
# 1000,
|
||||
# 'gen',
|
||||
# error=mnmd,
|
||||
# val_split=load_dev_samples,
|
||||
# n_jobs=-1,
|
||||
# refit=False,
|
||||
# verbose=True)
|
||||
|
||||
q.fit(train)
|
||||
|
||||
# q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
||||
# q.fit(None)
|
||||
|
||||
print('[done]')
|
||||
|
||||
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||
mean_nmd = report['nmd'].mean()
|
||||
std_nmd = report['nmd'].std()
|
||||
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||
|
||||
q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
||||
q.fit(None)
|
||||
|
||||
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||
mean_nmd = report['nmd'].mean()
|
||||
std_nmd = report['nmd'].std()
|
||||
print(f'[regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||
|
||||
# drift='high'
|
||||
# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||
# mean_nmd = report['nmd'].mean()
|
||||
# std_nmd = report['nmd'].std()
|
||||
# print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||
import mord
|
||||
|
||||
|
||||
|
||||
def quantifiers():
    """Yield (name, quantifier, hyperparameter-grid) triples to be evaluated.

    Only the order-agnostic LR-based baselines are currently active; the
    order-aware variants (threshold-based OLR from mord, and regression-based
    SVR with a custom wrapper providing predict_proba/decision_function) are
    disabled pending further experimentation.
    """
    lr_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    olr_grid = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}  # for the disabled OLR variants
    svr_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}      # for the disabled SVR variants

    # baselines (order-agnostic quantifiers over plain logistic regression)
    yield 'CC(LR)', CC(LogisticRegression()), lr_grid
    yield 'PCC(LR)', PCC(LogisticRegression()), lr_grid
    yield 'ACC(LR)', ACC(LogisticRegression()), lr_grid
    yield 'PACC(LR)', PACC(LogisticRegression()), lr_grid
    yield 'SLD(LR)', EMQ(LogisticRegression()), lr_grid
|
||||
|
||||
|
||||
def run_experiment(params):
    """Run model selection and evaluation for one (quantifier, drift) configuration.

    Results are written as CSV under `resultpath`; if the result file already
    exists the experiment is skipped. A second, regression-corrected evaluation
    is also produced (suffix `.reg.csv`).

    :param params: tuple (qname, q, param_grid, drift)
    :return: a tab-separated string with the selected hyperparameters, or None
        if the experiment was skipped
    """
    qname, q, param_grid, drift = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.{drift}.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for {drift}-drift')

    def load_test_samples():
        # generator over the test samples belonging to this drift bin
        ids = set(np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy')))
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    def load_dev_samples():
        # generator over the dev samples belonging to this drift bin
        ids = set(np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy')))
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        verbose=True).fit(train)

    hyperparams = f'{qname}\t{drift}\t{q.best_params_}'

    print('[done]')

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    print('[learning regressor-based adjustment]')
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.{drift}.reg.csv')
    report.to_csv(resultfile, index=False)

    return hyperparams
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Selects the input representation; this fixes both the data folder and the
    # suffix appended to the method names in the result files.
    preprocessing = 'roberta.average'
    if preprocessing=='tfidf':
        domain = 'Books-tfidf'
        posfix = ''
    elif preprocessing=='roberta.last':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
        posfix = '-RoBERTa-last'
    elif preprocessing=='roberta.average':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
        posfix = '-RoBERTa-average'
    elif preprocessing=='roberta.posteriors':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
        posfix = '-RoBERTa-posteriors'
    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    # run every (quantifier, drift-bin) combination in parallel and log the
    # selected hyperparameters
    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
        hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
        for h in hypers:
            if h is not None:
                foo.write(h)
                foo.write('\n')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
257
Ordinal/model.py
257
Ordinal/model.py
|
|
@ -1,14 +1,11 @@
|
|||
from copy import deepcopy
|
||||
import mord
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import LogisticRegression, Ridge
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.multioutput import MultiOutputRegressor
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import LinearSVR, SVR
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import Ridge
|
||||
from sklearn.svm import LinearSVR
|
||||
from sklearn.utils.class_weight import compute_class_weight
|
||||
from statsmodels.miscmodels.ordinal_model import OrderedModel
|
||||
|
||||
|
||||
|
|
@ -36,112 +33,21 @@ class OrderedLogisticRegression:
|
|||
return self.res_prob.model.predict(self.res_prob.params, exog=X)
|
||||
|
||||
|
||||
class StackedClassifier: # aka Funnelling Monolingual
|
||||
def __init__(self, base_estimator=LogisticRegression()):
|
||||
if not hasattr(base_estimator, 'predict_proba'):
|
||||
print('the estimator does not seem to be probabilistic: calibrating')
|
||||
base_estimator = CalibratedClassifierCV(base_estimator)
|
||||
# self.base = deepcopy(OneVsRestClassifier(base_estimator))
|
||||
# self.meta = deepcopy(OneVsRestClassifier(base_estimator))
|
||||
self.base = deepcopy(base_estimator)
|
||||
self.meta = deepcopy(base_estimator)
|
||||
self.norm = StandardScaler()
|
||||
class LAD(BaseEstimator, ClassifierMixin):
|
||||
def __init__(self, C=1.0, class_weight=None):
|
||||
self.C = C
|
||||
self.class_weight = class_weight
|
||||
|
||||
def fit(self, X, y):
|
||||
self.base.fit(X, y)
|
||||
P = self.base.predict_proba(X)
|
||||
P = self.norm.fit_transform(P)
|
||||
self.meta.fit(P, y)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
P = self.base.predict_proba(X)
|
||||
P = self.norm.transform(P)
|
||||
return self.meta.predict(P)
|
||||
|
||||
def predict_proba(self, X):
|
||||
P = self.base.predict_proba(X)
|
||||
P = self.norm.transform(P)
|
||||
return self.meta.predict_proba(P)
|
||||
|
||||
|
||||
class RegressionQuantification:
|
||||
def __init__(self,
|
||||
base_quantifier,
|
||||
regression='svr',
|
||||
val_samples_generator=None,
|
||||
norm=True):
|
||||
|
||||
self.base_quantifier = base_quantifier
|
||||
if isinstance(regression, str):
|
||||
assert regression in ['ridge', 'svr'], 'unknown regression model'
|
||||
if regression == 'ridge':
|
||||
self.reg = Ridge(normalize=norm)
|
||||
elif regression == 'svr':
|
||||
self.reg = MultiOutputRegressor(LinearSVR())
|
||||
else:
|
||||
self.reg = regression
|
||||
# self.reg = MultiTaskLassoCV(normalize=norm)
|
||||
# self.reg = KernelRidge(kernel='rbf')
|
||||
# self.reg = LassoLarsCV(normalize=norm)
|
||||
# self.reg = MultiTaskElasticNetCV(normalize=norm) <- bien
|
||||
#self.reg = LinearRegression(normalize=norm) # <- bien
|
||||
# self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- bastante bien, incluso sin norm
|
||||
# self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- bastante bien, incluso sin norm
|
||||
# self.reg = MultiOutputRegressor(SGDRegressor()) # lento, no va
|
||||
self.regression = regression
|
||||
self.val_samples_generator = val_samples_generator
|
||||
# self.norm = StandardScaler()
|
||||
# self.covs = covs
|
||||
|
||||
def generate_validation_samples(self):
|
||||
Xs, ys = [], []
|
||||
for instances, prevalence in self.val_samples_generator():
|
||||
ys.append(prevalence)
|
||||
Xs.append(self.base_quantifier.quantify(instances))
|
||||
Xs = np.asarray(Xs)
|
||||
ys = np.asarray(ys)
|
||||
return Xs, ys
|
||||
|
||||
def fit(self, data):
|
||||
print('fitting quantifier')
|
||||
if data is not None:
|
||||
self.base_quantifier.fit(data)
|
||||
print('generating val samples')
|
||||
Xs, ys = self.generate_validation_samples()
|
||||
# Xs = self.norm.fit_transform(Xs)
|
||||
print('fitting regressor')
|
||||
self.reg.fit(Xs, ys)
|
||||
print('[done]')
|
||||
return self
|
||||
|
||||
def quantify(self, instances):
|
||||
Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
|
||||
# Xs = self.norm.transform(Xs)
|
||||
Xs = self.reg.predict(Xs)
|
||||
# Xs = self.norm.inverse_transform(Xs)
|
||||
adjusted = Xs / Xs.sum()
|
||||
# adjusted = np.clip(Xs, 0, 1)
|
||||
adjusted = adjusted.flatten()
|
||||
return adjusted
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return self.base_quantifier.get_params()
|
||||
|
||||
def set_params(self, **params):
|
||||
self.base_quantifier.set_params(**params)
|
||||
|
||||
|
||||
class RegressorClassifier(BaseEstimator, ClassifierMixin):
|
||||
def __init__(self):
|
||||
self.regressor = LinearSVR()
|
||||
def fit(self, X, y, sample_weight=None):
|
||||
self.regressor = LinearSVR(C=self.C)
|
||||
# self.regressor = SVR()
|
||||
# self.regressor = Ridge(normalize=True)
|
||||
|
||||
|
||||
def fit(self, X, y):
|
||||
self.nclasses = len(np.unique(y))
|
||||
self.regressor.fit(X, y)
|
||||
classes = sorted(np.unique(y))
|
||||
self.nclasses = len(classes)
|
||||
if self.class_weight == 'balanced':
|
||||
class_weight = compute_class_weight('balanced', classes=classes, y=y)
|
||||
sample_weight = class_weight[y]
|
||||
self.regressor.fit(X, y, sample_weight=sample_weight)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
|
|
@ -151,13 +57,20 @@ class RegressorClassifier(BaseEstimator, ClassifierMixin):
|
|||
c[c>(self.nclasses-1)]=self.nclasses-1
|
||||
return c.astype(np.int)
|
||||
|
||||
def predict_proba(self, X):
|
||||
# def predict_proba(self, X):
|
||||
# r = self.regressor.predict(X)
|
||||
# nC = len(self.classes_)
|
||||
# r = np.clip(r, 0, nC - 1)
|
||||
# dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
|
||||
# invdist = 1 - dists
|
||||
# invdist[invdist < 0] = 0
|
||||
# return invdist
|
||||
|
||||
def decision_function(self, X):
|
||||
r = self.regressor.predict(X)
|
||||
nC = len(self.classes_)
|
||||
r = np.clip(r, 0, nC - 1)
|
||||
dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
|
||||
invdist = 1 - dists
|
||||
invdist[invdist < 0] = 0
|
||||
return invdist
|
||||
|
||||
@property
|
||||
|
|
@ -165,8 +78,118 @@ class RegressorClassifier(BaseEstimator, ClassifierMixin):
|
|||
return np.arange(self.nclasses)
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return self.regressor.get_params()
|
||||
return {'C':self.C, 'class_weight': self.class_weight}
|
||||
|
||||
def set_params(self, **params):
|
||||
self.regressor.set_params(**params)
|
||||
self.C = params['C']
|
||||
self.class_weight = params['class_weight']
|
||||
|
||||
|
||||
class OrdinalRidge(BaseEstimator, ClassifierMixin):
    """Ridge regression wrapped as an ordinal classifier.

    The continuous prediction is rounded and clipped into the label range
    [0, nclasses-1]. Assumes labels are consecutive integers starting at 0
    (class weights are indexed directly with y) -- TODO confirm against callers.
    """

    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize

    def fit(self, X, y, sample_weight=None):
        # NOTE(review): Ridge(normalize=...) was removed in scikit-learn 1.2;
        # confirm the pinned sklearn version still accepts this parameter.
        self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            # emulate class_weight='balanced' via per-instance sample weights
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c<0]=0
        c[c>(self.nclasses-1)]=self.nclasses-1
        # np.int was removed in NumPy 1.24; the builtin int is the correct alias
        return c.astype(int)

    def decision_function(self, X):
        # inverse linear distance to each class index (negative far from a class);
        # note the raw prediction is NOT clipped here -- TODO confirm intended
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'alpha':self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}

    def set_params(self, **params):
        self.alpha = params['alpha']
        self.class_weight = params['class_weight']
        self.normalize = params['normalize']
|
||||
|
||||
|
||||
# with order-aware classifiers
|
||||
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
|
||||
class LogisticAT(mord.LogisticAT):
    """All-threshold ordinal logistic regression (mord) with optional balanced class weighting."""

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        # mord has no class_weight parameter, so 'balanced' is emulated through
        # per-instance sample weights; assumes y holds consecutive ints from 0
        if self.class_weight == 'balanced':
            weights = compute_class_weight('balanced', classes=sorted(np.unique(y)), y=y)
            sample_weight = weights[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
class LogisticSE(mord.LogisticSE):
    """Squared-error ordinal logistic regression (mord) with optional balanced class weighting."""

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        # mord has no class_weight parameter, so 'balanced' is emulated through
        # per-instance sample weights; assumes y holds consecutive ints from 0
        if self.class_weight == 'balanced':
            weights = compute_class_weight('balanced', classes=sorted(np.unique(y)), y=y)
            sample_weight = weights[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
class LogisticIT(mord.LogisticIT):
    """Immediate-threshold ordinal logistic regression (mord) with optional balanced class weighting."""

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        # mord has no class_weight parameter, so 'balanced' is emulated through
        # per-instance sample weights; assumes y holds consecutive ints from 0
        if self.class_weight == 'balanced':
            weights = compute_class_weight('balanced', classes=sorted(np.unique(y)), y=y)
            sample_weight = weights[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
|
||||
# class LAD(mord.LAD):
|
||||
# def fit(self, X, y):
|
||||
# self.classes_ = sorted(np.unique(y))
|
||||
# return super().fit(X, y)
|
||||
|
||||
|
||||
# class OrdinalRidge(mord.OrdinalRidge):
|
||||
# def fit(self, X, y):
|
||||
# self.classes_ = sorted(np.unique(y))
|
||||
# return super().fit(X, y)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import numpy as np
|
||||
import quapy as qp
|
||||
from Ordinal.evaluation import nmd
|
||||
from Ordinal.utils import load_samples_pkl
|
||||
from evaluation import nmd
|
||||
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
|
||||
from quapy.data import LabelledCollection
|
||||
import pickle
|
||||
import os
|
||||
|
|
@ -9,28 +9,39 @@ from os.path import join
|
|||
from tqdm import tqdm
|
||||
|
||||
|
||||
"""
|
||||
This scripts generates a partition of a dataset in terms of "shift".
|
||||
The partition is only carried out by generating index vectors.
|
||||
"""
|
||||
|
||||
|
||||
def partition_by_drift(split, training_prevalence):
    """Partition the samples of a split into low/mid/high drift thirds.

    Drift is measured as the NMD between the training prevalence and each
    sample's prevalence. Only index vectors (.npy files) are generated; the
    samples themselves are not moved.

    :param split: either 'dev' or 'test'
    :param training_prevalence: prevalence vector of the training collection
    """
    assert split in ['dev', 'test'], 'invalid split name'
    total = 1000 if split=='dev' else 5000  # used only for the progress bar
    drifts = []
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
        drifts.append(nmd(training_prevalence, sample.prevalence()))
    drifts = np.asarray(drifts)
    order = np.argsort(drifts)
    nD = len(order)
    # split the drift-sorted sample ids into three equal-sized bins
    low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
    all_drift = np.arange(nD)
    np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
    np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
    np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
    np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift)
    lows = drifts[low_drift]
    mids = drifts[mid_drift]
    highs = drifts[high_drift]
    alls = drifts[all_drift]  # renamed from `all`, which shadowed the builtin
    print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
    print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
    print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
    print(f'all drift: interval [{alls.min():.4f}, {alls.max():.4f}] mean: {alls.mean():.4f}')
|
||||
|
||||
|
||||
domain = 'Books-tfidf'
|
||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
|
||||
datapath = './data'
|
||||
|
||||
training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
|
||||
|
|
|
|||
|
|
@ -0,0 +1,41 @@
|
|||
import numpy as np
|
||||
from Ordinal.evaluation import smoothness
|
||||
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
"""
|
||||
This scripts generates a partition of a dataset in terms of "smoothness".
|
||||
The partition is only carried out by generating index vectors.
|
||||
"""
|
||||
|
||||
|
||||
def partition_by_smoothness(split):
    """Partition the samples of a split into 5 blocks of increasing prevalence smoothness.

    Only index vectors (.npy files) are generated; the samples themselves are
    not moved.

    :param split: either 'dev' or 'test'
    """
    assert split in ['dev', 'test'], 'invalid split name'
    total = 1000 if split=='dev' else 5000  # used only for the progress bar
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    smooths = []
    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
        smooths.append(smoothness(sample.prevalence()))
    smooths = np.asarray(smooths)
    ranking = np.argsort(smooths)
    nD = len(ranking)
    # 5 equal-sized blocks, ordered from least to most smooth
    low2high_smooth = np.array_split(ranking, 5)
    all_idx = np.arange(nD)
    for i, smooth_idx in enumerate(low2high_smooth):
        block = smooths[smooth_idx]
        print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
        np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), smooth_idx)
    np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), all_idx)
|
||||
|
||||
|
||||
#domain = 'Books-tfidf'
|
||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
|
||||
datapath = './data'
|
||||
|
||||
#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
|
||||
|
||||
partition_by_smoothness('dev')
|
||||
partition_by_smoothness('test')
|
||||
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from os.path import join
|
||||
import os
|
||||
import pickle
|
||||
from utils import *
|
||||
from tqdm import tqdm
|
||||
import shutil
|
||||
|
||||
"""
|
||||
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into dense vectors
|
||||
extracted from a pretrained model (here we use the RoBERTa fine-tuned on the training set)
|
||||
Three vector generation modes are available: posteriors, last, average
|
||||
"""
|
||||
|
||||
vector_generation = 'posteriors'
|
||||
|
||||
datapath = './data'
|
||||
domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
|
||||
outname = domain.replace('-finetuned', '-finetuned-pkl')
|
||||
|
||||
protocol = 'app'
|
||||
|
||||
print('pickling npy txt files')
|
||||
print('from:', join(datapath, domain))
|
||||
print('to', join(datapath, outname))
|
||||
print('for protocol:', protocol)
|
||||
|
||||
os.makedirs(join(datapath, outname), exist_ok=True)
|
||||
os.makedirs(join(datapath, outname, protocol), exist_ok=True)
|
||||
os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True)
|
||||
os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
|
||||
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
|
||||
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))
|
||||
|
||||
train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
|
||||
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
def transform_folder_samples(protocol, splitname):
    """Pickle every npy/txt sample of the given protocol/split into the output folder.

    :param protocol: sampling protocol subfolder (e.g., 'app')
    :param splitname: either 'dev_samples' or 'test_samples'
    """
    folder_dir = join(datapath, domain, protocol, splitname)
    samples = load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_)
    for i, sample in tqdm(enumerate(samples)):
        # use a context manager so the file handle is closed (the original
        # open() call leaked one descriptor per sample)
        with open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
transform_folder_samples(protocol, 'dev_samples')
|
||||
transform_folder_samples(protocol, 'test_samples')
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,14 +1,20 @@
|
|||
import quapy as qp
|
||||
from Ordinal.utils import load_simple_sample_raw
|
||||
from quapy.data import LabelledCollection
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from os.path import join
|
||||
import os
|
||||
import pickle
|
||||
from utils import load_samples
|
||||
from tqdm import tqdm
|
||||
import shutil
|
||||
|
||||
|
||||
|
||||
"""
|
||||
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into tfidf vectors.
|
||||
"""
|
||||
|
||||
|
||||
datapath = './data'
|
||||
domain = 'Books'
|
||||
outname = domain + '-tfidf'
|
||||
|
|
@ -40,7 +46,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic
|
|||
|
||||
|
||||
def transform_folder_samples(protocol, splitname):
|
||||
for i, sample in tqdm(enumerate(load_samples(join(datapath, domain, protocol, splitname), classes=train.classes_))):
|
||||
for i, sample in tqdm(enumerate(load_simple_sample_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
|
||||
sample.instances = tfidf.transform(sample.instances)
|
||||
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
|
@ -0,0 +1,374 @@
|
|||
import numpy as np
|
||||
import itertools
|
||||
from scipy.stats import ttest_ind_from_stats, wilcoxon
|
||||
|
||||
|
||||
class Table:
    """
    A benchmarks-by-methods table of experimental results.

    Each cell accumulates an array of observed values (one per run); derived
    statistics (mean, std, rank, cell colour, significance-test outcome) are
    (re)computed lazily whenever the table was modified. The table can render
    itself as LaTeX, optionally with an extra average row/column, significance
    marks and red-to-green colouring of the cells.
    """

    # None disables significance testing; otherwise one of the two scipy tests.
    VALID_TESTS = [None, "wilcoxon", "ttest"]

    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='wilcoxon', prec_mean=3,
                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                 color=True, show_rel_to=-1):
        """
        :param benchmarks: row labels
        :param methods: column labels
        :param lower_is_better: if True, smaller means rank first
        :param significance_test: one of VALID_TESTS
        :param prec_mean: decimal digits for the mean in LaTeX cells
        :param clean_zero: drop the leading '0' of '0.xxx' values
        :param show_std: append the standard deviation to LaTeX cells
        :param prec_std: decimal digits for the std
        :param average: maintain an auxiliary one-row table of column averages
        :param missing: value returned for unfilled cells
        :param missing_str: LaTeX placeholder for unfilled cells
        :param color: colour LaTeX cells from red (worst) to green (best)
        :param show_rel_to: column index to report relative differences against
            (-1 disables)
        """
        assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'

        self.benchmarks = np.asarray(benchmarks)
        self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}

        self.methods = np.asarray(methods)
        self.method_index = {col: j for j, col in enumerate(methods)}

        self.map = {}
        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
        self._addmap('values', dtype=object)
        self.lower_is_better = lower_is_better
        self.ttest = significance_test
        self.prec_mean = prec_mean
        self.clean_zero = clean_zero
        self.show_std = show_std
        self.prec_std = prec_std
        self.add_average = average
        self.missing = missing
        self.missing_str = missing_str
        self.color = color
        self.show_rel_to = show_rel_to

        self.touch()

    @property
    def nbenchmarks(self):
        return len(self.benchmarks)

    @property
    def nmethods(self):
        return len(self.methods)

    def touch(self):
        """Mark the table as modified so statistics are recomputed on next access."""
        self._modif = True

    def update(self):
        """Recompute derived statistics if the table was modified since last compute."""
        if self._modif:
            self.compute()

    def _getfilled(self):
        """Return the (row, col) coordinates of all filled cells."""
        return np.argwhere(self.map['fill'])

    @property
    def values(self):
        return self.map['values']

    def _indexes(self):
        """Iterate over every (row, col) coordinate of the table."""
        return itertools.product(range(self.nbenchmarks), range(self.nmethods))

    def _addmap(self, map, dtype, func=None):
        """Allocate the named per-cell map and, if `func` is given, fill it by
        applying `func` to each cell's raw values (all cells for 'fill',
        filled cells otherwise)."""
        self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
        if func is None:
            return
        m = self.map[map]
        f = func
        indexes = self._indexes() if map == 'fill' else self._getfilled()
        for i, j in indexes:
            m[i, j] = f(self.values[i, j])

    def _addrank(self):
        """Rank the filled cells of each row by mean (1 = best)."""
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
            if not self.lower_is_better:
                ranked_cols_idx = ranked_cols_idx[::-1]
            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)

    def _addcolor(self):
        """Assign each filled cell a red-to-green colour by normalizing its mean
        within the row's [min, max] range (best end is green)."""
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if filled_cols_idx.size == 0:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            minval = min(col_means)
            maxval = max(col_means)
            for col_idx in filled_cols_idx:
                val = self.map['mean'][i, col_idx]
                norm = (maxval - minval)
                if norm > 0:
                    normval = (val - minval) / norm
                else:
                    # all means equal in this row: neutral midpoint
                    normval = 0.5
                if self.lower_is_better:
                    normval = 1 - normval
                self.map['color'][i, col_idx] = color_red2green_01(normval)

    def _run_ttest(self, row, col1, col2):
        """Two-sample t-test (from summary statistics) between two cells; returns the p-value."""
        mean1 = self.map['mean'][row, col1]
        std1 = self.map['std'][row, col1]
        nobs1 = self.map['nobs'][row, col1]
        mean2 = self.map['mean'][row, col2]
        std2 = self.map['std'][row, col2]
        nobs2 = self.map['nobs'][row, col2]
        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
        return p_val

    def _run_wilcoxon(self, row, col1, col2):
        """Wilcoxon signed-rank test between two cells' paired values; returns the p-value."""
        values1 = self.map['values'][row, col1]
        values2 = self.map['values'][row, col2]
        _, p_val = wilcoxon(values1, values2)
        return p_val

    def _add_statistical_test(self):
        """Compare every filled cell of each row against the row's best cell and
        store the interpreted outcome ('Diff'/'Sim'/'Same') in map['ttest']."""
        if self.ttest is None:
            return
        self.some_similar = [False] * self.nmethods
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if len(filled_cols_idx) <= 1:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            best_pos = filled_cols_idx[np.argmin(col_means)]

            for j in filled_cols_idx:
                if j == best_pos:
                    continue
                if self.ttest == 'ttest':
                    p_val = self._run_ttest(i, best_pos, j)
                else:
                    p_val = self._run_wilcoxon(i, best_pos, j)

                pval_outcome = pval_interpretation(p_val)
                self.map['ttest'][i, j] = pval_outcome
                if pval_outcome != 'Diff':
                    self.some_similar[j] = True

    def compute(self):
        """Recompute all derived per-cell maps from the raw values."""
        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
        self._addmap('mean', dtype=float, func=np.mean)
        self._addmap('std', dtype=float, func=np.std)
        self._addmap('nobs', dtype=float, func=len)
        self._addmap('rank', dtype=int, func=None)
        self._addmap('color', dtype=object, func=None)
        self._addmap('ttest', dtype=object, func=None)
        self._addmap('latex', dtype=object, func=None)
        self._addrank()
        self._addcolor()
        self._add_statistical_test()
        if self.add_average:
            self._addave()
        self._modif = False

    def _is_column_full(self, col):
        """True if every benchmark has values for the given method."""
        return all(self.map['fill'][:, self.method_index[col]])

    def _addave(self):
        """Build the auxiliary one-row 'ave' table holding per-method averages.
        For the t-test the averages are taken over per-benchmark means; for
        wilcoxon all raw values are pooled."""
        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
                    show_std=self.show_std)
        for col in self.methods:
            values = None
            if self._is_column_full(col):
                if self.ttest == 'ttest':
                    values = np.asarray(self.map['mean'][:, self.method_index[col]])
                else:  # wilcoxon
                    values = np.concatenate(self.values[:, self.method_index[col]])
            ave.add('ave', col, values)
        self.average = ave

    def add(self, benchmark, method, values):
        """Append observed `values` to the (benchmark, method) cell (concatenating
        with any previously stored values)."""
        if values is not None:
            values = np.asarray(values)
            if values.ndim == 0:
                values = values.flatten()
        rid, cid = self._coordinates(benchmark, method)
        if self.map['values'][rid, cid] is None:
            self.map['values'][rid, cid] = values
        elif values is not None:
            self.map['values'][rid, cid] = np.concatenate([self.map['values'][rid, cid], values])
        self.touch()

    def get(self, benchmark, method, attr='mean'):
        """Return the requested derived attribute of a cell, or `self.missing`
        when the cell is unfilled or the attribute is None/NaN."""
        self.update()
        assert attr in self.map, f'unknown attribute {attr}'
        rid, cid = self._coordinates(benchmark, method)
        if self.map['fill'][rid, cid]:
            v = self.map[attr][rid, cid]
            if v is None or (isinstance(v, float) and np.isnan(v)):
                return self.missing
            return v
        else:
            return self.missing

    def _coordinates(self, benchmark, method):
        """Map (benchmark, method) labels to (row, col) indices."""
        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
        assert method in self.method_index, f'method {method} out of range'
        rid = self.benchmark_index[benchmark]
        cid = self.method_index[method]
        return rid, cid

    def get_average(self, method, attr='mean'):
        """Return the column-average attribute for a method, or None when the
        average table is disabled."""
        self.update()
        if self.add_average:
            return self.average.get('ave', method, attr=attr)
        return None

    def get_color(self, benchmark, method):
        """Return the LaTeX colour command for a cell ('' when unavailable)."""
        color = self.get(benchmark, method, attr='color')
        if color is None:
            return ''
        return color

    def latexCell(self, benchmark, method):
        """Render a single cell as LaTeX: mean (bold when best or statistically
        indistinguishable from the best), optional std / relative difference,
        and optional cell colour."""
        self.update()
        i, j = self._coordinates(benchmark, method)
        if not self.map['fill'][i, j]:
            return self.missing_str

        mean = self.map['mean'][i, j]
        l = f" {mean:.{self.prec_mean}f}"
        if self.clean_zero:
            l = l.replace(' 0.', '.')

        isbest = self.map['rank'][i, j] == 1
        if self.ttest is not None:  # and self.some_similar[j]:
            test_label = self.map['ttest'][i, j]
            if test_label in ['Sim', 'Same']:
                isbest = True

        if isbest:
            l = "\\textbf{" + l.strip() + "}\;"
        else:
            l += '\; '

        # `stat` is kept for significance superscripts; currently unused because
        # all results similar to the best one are rendered in boldface instead.
        stat = ''

        std = ''
        if self.show_std:
            std = self.map['std'][i, j]
            std = f" {std:.{self.prec_std}f}"
            if self.clean_zero:
                std = std.replace(' 0.', '.')
            std = f" \pm {std:{self.prec_std}}"

        relto = ''
        if self.show_rel_to != -1:
            if j != self.show_rel_to:
                ref_ave = self.map['mean'][i, self.show_rel_to]
                rel = 100*(mean-ref_ave)/ref_ave
                if abs(rel) < 0.1:
                    relto = f'(\\approx)'
                else:
                    plussign = '+' if rel > 0 else ''  # the '-' sign is already plugged by the formatter
                    relto = f'({plussign}{rel:.1f}\%)'
                std = ''

        if stat != '' or std != '' or relto != '':
            l = f'{l}${stat}{std}{relto}$'

        if self.color:
            l += ' ' + self.map['color'][i, j]

        return l

    def latexTabular(self, benchmark_replace={}, method_replace={}, average=True):
        """Render the full table (benchmarks as rows) as a LaTeX tabular body.
        The replace dicts map labels to display names; they are never mutated."""
        tab = ' & '
        tab += ' & '.join([method_replace.get(col, col) for col in self.methods])
        tab += ' \\\\\hline\n'
        for row in self.benchmarks:
            rowname = benchmark_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRow(row)

        if average:
            tab += '\hline\n'
            tab += 'Average & '
            tab += self.latexAverage()
        return tab

    def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
        """Render the transposed table (methods as rows) as a LaTeX tabular body;
        `side` rotates the column headers with \\side{...}."""
        def withside(label):
            return '\side{'+label+'}' if side else label

        def center(label):
            return '\multicolumn{1}{c}{'+label+'}'

        tab = ' & '
        tab += ' & '.join([center(withside(benchmark_replace.get(col, col))) for col in self.benchmarks])
        if average:
            tab += ' & ' + withside('Ave')
        # tab += ' \\\\\hline\n'
        tab += ' \\\\\midrule\n'
        for row in self.methods:
            rowname = method_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRowT(row, endl='')
            if average:
                tab += ' & '
                tab += self.average.latexCell('ave', row)
            # tab += '\\\\\hline\n'
            tab += '\\\\\n'
        tab += '\\bottomrule'
        return tab

    def latexRow(self, benchmark, endl='\\\\\hline\n'):
        """Render one benchmark row (all methods) as LaTeX."""
        s = [self.latexCell(benchmark, col) for col in self.methods]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexRowT(self, method, endl='\\\\\hline\n'):
        """Render one method row (all benchmarks) of the transposed table."""
        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexAverage(self, endl='\\\\\hline\n'):
        """Render the average row; returns None when averaging is disabled."""
        if self.add_average:
            return self.average.latexRow('ave', endl=endl)

    def getRankTable(self):
        """Return a new Table holding the per-benchmark ranks instead of values."""
        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=0, average=True)
        for rid, cid in self._getfilled():
            row = self.benchmarks[rid]
            col = self.methods[cid]
            t.add(row, col, self.get(row, col, 'rank'))
        t.compute()
        return t

    def dropMethods(self, methods):
        """Remove the given method columns (and their values) from the table."""
        drop_index = [self.method_index[m] for m in methods]
        new_methods = np.delete(self.methods, drop_index)
        new_index = {col: j for j, col in enumerate(new_methods)}

        self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
        self.methods = new_methods
        self.method_index = new_index
        self.touch()
||||
|
||||
|
||||
def pval_interpretation(p_val):
    """Interpret a significance-test p-value.

    Returns 'Diff' when p <= 0.005, 'Sim' when 0.005 < p <= 0.05, and 'Same'
    when p > 0.05 (i.e., no evidence of a difference).
    """
    if p_val <= 0.005:
        return 'Diff'
    elif 0.005 < p_val <= 0.05:
        return 'Sim'
    elif 0.05 < p_val:
        return 'Same'
|
||||
|
||||
|
||||
def color_red2green_01(val, maxtone=50):
    """Map a score in [0, 1] to a LaTeX cellcolor command, shading from red
    (val near 0) to green (val near 1); NaN yields None (no colour)."""
    if np.isnan(val): return None
    assert 0 <= val <= 1, f'val {val} out of range [0,1]'

    # rescale to [-1,1]
    signed = val * 2 - 1
    if signed < 0:
        hue, tone = 'red', maxtone * (-signed)
    else:
        hue, tone = 'green', maxtone * signed
    return '\cellcolor{' + hue + f'!{int(tone)}' + '}'
|
||||
|
|
@ -1,22 +1,64 @@
|
|||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
import numpy as np
|
||||
from glob import glob
|
||||
from json import load
|
||||
import os
|
||||
from os.path import join
|
||||
import pickle
|
||||
import pandas as pd
|
||||
import csv
|
||||
import datasets
|
||||
from datasets import Dataset
|
||||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
|
||||
|
||||
def load_samples(path_dir, classes):
|
||||
nsamples = len(glob(join(path_dir, f'*.txt')))
|
||||
|
||||
def load_simple_sample_npytxt(parentdir, filename, classes=None):
    """Load one sample stored as a whitespace-separated numeric matrix in
    ``<parentdir>/<filename>.txt``; column 0 holds the integer labels and the
    remaining columns the features. Returns a LabelledCollection."""
    matrix = np.loadtxt(join(parentdir, filename + '.txt'))
    labels = matrix[:, 0].astype(np.int32)
    features = matrix[:, 1:]
    return LabelledCollection(instances=features, labels=labels, classes_=classes)
|
||||
|
||||
|
||||
def load_simple_sample_raw(parentdir, filename, classes=None):
    """Load one raw-text sample ``<parentdir>/<filename>.txt`` using quapy's
    plain-text reader; returns a LabelledCollection."""
    return LabelledCollection.load(
        join(parentdir, filename + '.txt'),
        loader_func=qp.data.reader.from_text,
        classes=classes)
|
||||
|
||||
|
||||
def load_single_sample_as_csv(parentdir, filename):
    """Read the tab-separated sample ``<parentdir>/<filename>.txt`` (columns:
    label, review text) and return it as a huggingface Dataset of reviews plus
    a one-column pandas DataFrame of labels."""
    samplepath = join(parentdir, filename + '.txt')
    frame = pd.read_csv(samplepath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = frame.pop('labels').to_frame()

    schema = datasets.Features({'review': datasets.Value('string')})
    sample = Dataset.from_pandas(df=frame, features=schema)

    return sample, labels
|
||||
|
||||
|
||||
def load_single_sample_pkl(parentdir, filename):
    """Load and return the pickled sample ``<parentdir>/<filename>.pkl``.

    Fix: the file handle was previously leaked (``pickle.load(open(...))``);
    it is now closed deterministically with a context manager.
    """
    with open(join(parentdir, filename + '.pkl'), 'rb') as fin:
        return pickle.load(fin)
|
||||
|
||||
|
||||
# def load_samples_npytxt(path_dir, filter=None, classes=None):
|
||||
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
|
||||
|
||||
|
||||
# def load_samples_raw(path_dir, filter=None, classes=None):
|
||||
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, load_fn_kwargs={'classes': classes})
|
||||
|
||||
|
||||
# def load_samples_as_csv(path_dir, filter=None):
|
||||
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)
|
||||
|
||||
|
||||
# def load_samples_pkl(path_dir, filter=None):
|
||||
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)
|
||||
|
||||
|
||||
def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
    """Lazily yield the samples stored in `path_dir` as files named ``0.*``,
    ``1.*``, ... ``(n-1).*``.

    :param path_dir: directory containing one file per sample
    :param filter: optional collection of sample ids to keep (None keeps all)
    :param load_fn: callable ``load_fn(path_dir, filename, **load_fn_kwargs)``
        that loads one sample given the directory and the bare (extensionless)
        filename
    :param load_fn_kwargs: extra keyword arguments forwarded to `load_fn`

    Fix: this reconstructs the current version of the function from the diff;
    stale removed lines (which referenced an undefined ``classes`` and the old
    ``load_samples_pkl``) are dropped, and the pointless f-string in the glob
    pattern is removed.
    """
    nsamples = len(glob(join(path_dir, '*')))
    for id in range(nsamples):
        if (filter is None) or id in filter:
            yield load_fn(path_dir, f'{id}', **load_fn_kwargs)
|
||||
|
|
|
|||
|
|
@ -183,7 +183,7 @@ def _training_helper(learner,
|
|||
if not hasattr(learner, 'predict_proba'):
|
||||
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
|
||||
f'The learner will be calibrated.')
|
||||
learner = CalibratedClassifierCV(learner, cv=5)
|
||||
learner = CalibratedClassifierCV(learner, cv=5, ensemble=True)
|
||||
if val_split is not None:
|
||||
if isinstance(val_split, float):
|
||||
if not (0 < val_split < 1):
|
||||
|
|
@ -470,7 +470,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
|
|||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True):
|
||||
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
||||
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
|
||||
self.train_prevalence = F.prevalence_from_labels(data.labels, data.classes_)
|
||||
return self
|
||||
|
||||
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
||||
|
|
|
|||
Loading…
Reference in New Issue