Compare commits
No commits in common. "72c63fff094d0207c6547c0995a96348b91226a2" and "5df355a4e1f65484ececc78923dbdcedf01028c1" have entirely different histories.
72c63fff09
...
5df355a4e1
|
|
@ -1,17 +1,14 @@
|
||||||
import gzip
|
import gzip
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from Ordinal.utils import load_simple_sample_raw
|
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
import quapy.functional as F
|
import quapy.functional as F
|
||||||
import os
|
import os
|
||||||
from os.path import join
|
from os.path import join
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
datadir = '/mnt/1T/Datasets/Amazon/reviews'
|
datadir = '/mnt/1T/Datasets/Amazon/reviews'
|
||||||
outdir = './data/'
|
outdir = './data/'
|
||||||
real_prev_path = './data/Books-real-prevalence-by-product_votes1_reviews100.csv'
|
|
||||||
domain = 'Books'
|
domain = 'Books'
|
||||||
seed = 7
|
seed = 7
|
||||||
|
|
||||||
|
|
@ -21,6 +18,13 @@ te_size = 1000
|
||||||
nval = 1000
|
nval = 1000
|
||||||
nte = 5000
|
nte = 5000
|
||||||
|
|
||||||
|
# domain = 'Gift_Cards'
|
||||||
|
# tr_size = 200
|
||||||
|
# val_size = 100
|
||||||
|
# te_size = 100
|
||||||
|
# nval = 20
|
||||||
|
# nte = 40
|
||||||
|
|
||||||
|
|
||||||
def from_gz_text(path, encoding='utf-8', class2int=True):
|
def from_gz_text(path, encoding='utf-8', class2int=True):
|
||||||
"""
|
"""
|
||||||
|
|
@ -66,6 +70,7 @@ def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
|
||||||
write_txt_sample(sample, join(outdir, f'{i}.txt'))
|
write_txt_sample(sample, join(outdir, f'{i}.txt'))
|
||||||
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
||||||
|
|
||||||
|
|
||||||
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
|
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
|
||||||
os.makedirs(outdir, exist_ok=True)
|
os.makedirs(outdir, exist_ok=True)
|
||||||
with open(prevpath, 'wt') as prevfile:
|
with open(prevpath, 'wt') as prevfile:
|
||||||
|
|
@ -75,69 +80,37 @@ def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
|
||||||
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
||||||
|
|
||||||
|
|
||||||
def gen_samples_real_prevalences(real_prevalences, pool: LabelledCollection, sample_size, outdir, prevpath_out):
    """
    Draw one sample from `pool` for each prevalence vector in `real_prevalences`.

    Every sample is stored as `<outdir>/<i>.txt` (through `write_txt_sample`), and the prevalence values
    observed in that sample are written, with three decimals, as one row of the csv file `prevpath_out`.

    :param real_prevalences: iterable of prevalence vectors; the last component of each vector is not passed
        to `pool.sampling`
    :param pool: collection the samples are drawn from
    :param sample_size: number of instances requested for each sample
    :param outdir: folder where the samples are written (created if it does not exist)
    :param prevpath_out: path of the csv file listing the prevalence of every generated sample
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath_out, 'wt') as prevfile:
        # header: one column per class, preceded by the sample id
        class_names = [f'{c}' for c in pool.classes_]
        prevfile.write('id,' + ','.join(class_names) + '\n')

        for sample_id, target_prev in enumerate(real_prevalences):
            sample = pool.sampling(sample_size, *target_prev[:-1])
            sample_path = join(outdir, f'{sample_id}.txt')
            write_txt_sample(sample, sample_path)

            observed = [f'{p:.3f}' for p in sample.prevalence()]
            prevfile.write(f'{sample_id},' + ','.join(observed) + '\n')
|
|
||||||
|
|
||||||
|
fullpath = join(datadir,domain)+'.txt.gz'
|
||||||
|
|
||||||
# fullpath = join(datadir,domain)+'.txt.gz'
|
data = LabelledCollection.load(fullpath, from_gz_text)
|
||||||
#
|
print(len(data))
|
||||||
# data = LabelledCollection.load(fullpath, from_gz_text)
|
print(data.classes_)
|
||||||
# print(len(data))
|
print(data.prevalence())
|
||||||
# print(data.classes_)
|
|
||||||
# print(data.prevalence())
|
|
||||||
|
|
||||||
with qp.util.temp_seed(seed):
|
with qp.util.temp_seed(seed):
|
||||||
# train, rest = data.split_stratified(train_prop=tr_size)
|
train, rest = data.split_stratified(train_prop=tr_size)
|
||||||
#
|
|
||||||
# devel, test = rest.split_stratified(train_prop=0.5)
|
devel, test = rest.split_stratified(train_prop=0.5)
|
||||||
# print(len(train))
|
print(len(train))
|
||||||
# print(len(devel))
|
print(len(devel))
|
||||||
# print(len(test))
|
print(len(test))
|
||||||
#
|
|
||||||
domaindir = join(outdir, domain)
|
domaindir = join(outdir, domain)
|
||||||
|
|
||||||
# write_txt_sample(train, join(domaindir, 'training_data.txt'))
|
write_txt_sample(train, join(domaindir, 'training_data.txt'))
|
||||||
# write_txt_sample(devel, join(domaindir, 'development_data.txt'))
|
write_txt_sample(devel, join(domaindir, 'development_data.txt'))
|
||||||
# write_txt_sample(test, join(domaindir, 'test_data.txt'))
|
write_txt_sample(test, join(domaindir, 'test_data.txt'))
|
||||||
|
|
||||||
# this part is to be used when the partitions have already been created, in order to avoid re-generating them
|
gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
|
||||||
train = load_simple_sample_raw(domaindir, 'training_data')
|
prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
|
||||||
devel = load_simple_sample_raw(domaindir, 'development_data')
|
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
|
||||||
test = load_simple_sample_raw(domaindir, 'test_data')
|
prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
|
||||||
|
|
||||||
# gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
|
gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
|
||||||
# prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
|
prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
|
||||||
# gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
|
gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
|
||||||
# prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
|
prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
|
||||||
|
|
||||||
# gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
|
|
||||||
# prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
|
|
||||||
# gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
|
|
||||||
# prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
|
|
||||||
|
|
||||||
|
|
||||||
# this part generates samples based on real prevalences (in this case, prevalences of sets of books reviews
|
|
||||||
# grouped by product). It loads the real prevalences (computed elsewhere), and randomly extracts 5000 for test
|
|
||||||
# and 1000 for val (disjoint). It then draws the samples according to those prevalences
|
|
||||||
|
|
||||||
assert os.path.exists(real_prev_path), f'real prevalence file does not seem to exist...'
|
|
||||||
real_prevalences = np.genfromtxt(real_prev_path, delimiter='\t')
|
|
||||||
|
|
||||||
nrows = real_prevalences.shape[0]
|
|
||||||
rand_sel = np.random.permutation(nrows)
|
|
||||||
real_prevalences_val = real_prevalences[rand_sel[:nval]]
|
|
||||||
real_prevalences_te = real_prevalences[rand_sel[nval:nval+nte]]
|
|
||||||
|
|
||||||
gen_samples_real_prevalences(real_prevalences_val, devel, sample_size=val_size, outdir=join(domaindir, 'real', 'dev_samples'),
|
|
||||||
prevpath_out=join(domaindir, 'real', 'dev_prevalences.txt'))
|
|
||||||
gen_samples_real_prevalences(real_prevalences_te, test, sample_size=te_size, outdir=join(domaindir, 'real', 'test_samples'),
|
|
||||||
prevpath_out=join(domaindir, 'real', 'test_prevalences.txt'))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,116 +0,0 @@
|
||||||
import gzip
|
|
||||||
import quapy as qp
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
from quapy.data import LabelledCollection
|
|
||||||
import quapy.functional as F
|
|
||||||
import os
|
|
||||||
from os.path import join
|
|
||||||
from pathlib import Path
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
|
|
||||||
datadir = '../OrdinalQuantification'
|
|
||||||
outdir = './data/'
|
|
||||||
domain = 'fact'
|
|
||||||
seed = 7
|
|
||||||
|
|
||||||
tr_size = 20000
|
|
||||||
val_size = 1000
|
|
||||||
te_size = 1000
|
|
||||||
nval = 1000
|
|
||||||
nte = 5000
|
|
||||||
|
|
||||||
|
|
||||||
def from_csv(path):
    """
    Read the csv file at `path` and return the pair (X, y).

    The continuous values of column 'log10_energy' are discretized into ordered classes; all columns but the
    first one are returned as the feature matrix.

    :param path: path to the csv file
    :return: a tuple (X, y) where X is a float32 matrix of shape (n_samples, n_features) and y is the array
        with the class index of every row
    """
    frame = pd.read_csv(path)

    # class boundaries: the regular grid from 2.4 to 4.2 (step 0.15) without its first and last points
    grid = np.arange(start=2.4, stop=4.2, step=0.15)
    energy_boundaries = grid[1:-1]

    # note: omitting the dtype will result in a single instance having a different class
    log_energy = np.array(frame['log10_energy'], dtype=np.float32)
    y = np.digitize(log_energy, energy_boundaries)

    # every column except the first one is taken as a feature
    feature_columns = frame.iloc[:, 1:]
    X = feature_columns.to_numpy().astype(np.float32)

    return X, y
|
|
||||||
|
|
||||||
|
|
||||||
def write_pkl(sample: LabelledCollection, path):
    """
    Pickle `sample` into the file `path`, creating the parent folder if needed.

    :param sample: the object to serialize
    :param path: destination file; it is overwritten if it already exists
    """
    os.makedirs(Path(path).parent, exist_ok=True)
    # the context manager guarantees the handle is flushed and closed even if pickling fails
    with open(path, 'wb') as fout:
        pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
|
|
||||||
|
|
||||||
|
|
||||||
def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """
    Generate one sample for each prevalence vector returned by `F.uniform_simplex_sampling`.

    Every sample is pickled as `<outdir>/<i>.pkl` and the prevalence values observed in it are written, with
    three decimals, as one row of the csv file `prevpath`.

    :param pool: collection the samples are drawn from
    :param nsamples: number of prevalence vectors requested (passed as `size`)
    :param sample_size: number of instances requested for each sample
    :param outdir: folder where the samples are written (created if it does not exist)
    :param prevpath: path of the csv file listing the prevalence of every generated sample
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        # header: one column per class, preceded by the sample id
        class_names = [f'{c}' for c in pool.classes_]
        prevfile.write('id,' + ','.join(class_names) + '\n')

        target_prevs = F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)
        for sample_id, target_prev in enumerate(target_prevs):
            sample = pool.sampling(sample_size, *target_prev)
            write_pkl(sample, join(outdir, f'{sample_id}.pkl'))

            observed = [f'{p:.3f}' for p in sample.prevalence()]
            prevfile.write(f'{sample_id},' + ','.join(observed) + '\n')
|
|
||||||
|
|
||||||
|
|
||||||
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """
    Store the samples produced by `pool.natural_sampling_generator`.

    Every sample is pickled as `<outdir>/<i>.pkl` and the prevalence values observed in it are written, with
    three decimals, as one row of the csv file `prevpath`.

    :param pool: collection the samples are drawn from
    :param nsamples: number of samples requested (passed as `repeats`)
    :param sample_size: number of instances requested for each sample
    :param outdir: folder where the samples are written (created if it does not exist)
    :param prevpath: path of the csv file listing the prevalence of every generated sample
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        # header: one column per class, preceded by the sample id
        class_names = [f'{c}' for c in pool.classes_]
        prevfile.write('id,' + ','.join(class_names) + '\n')

        generated = pool.natural_sampling_generator(sample_size, repeats=nsamples)
        for sample_id, sample in enumerate(generated):
            write_pkl(sample, join(outdir, f'{sample_id}.pkl'))

            observed = [f'{p:.3f}' for p in sample.prevalence()]
            prevfile.write(f'{sample_id},' + ','.join(observed) + '\n')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
fullpath = join(datadir,domain, 'fact_wobble.csv')
|
|
||||||
|
|
||||||
data = LabelledCollection.load(fullpath, from_csv)
|
|
||||||
|
|
||||||
# Remove every instance (row) that contains a NaN. This check used to appear twice in a row; the second copy
# could never fire because the first one had already removed all NaN rows, so only one is kept.
if np.isnan(data.instances).any():
    rows, _ = np.where(np.isnan(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted nan rows')

# Remove every remaining instance (row) that contains an infinite value.
if np.isinf(data.instances).any():
    rows, _ = np.where(np.isinf(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted inf rows')
|
|
||||||
|
|
||||||
|
|
||||||
print(len(data))
|
|
||||||
print(data.classes_)
|
|
||||||
print(data.prevalence())
|
|
||||||
|
|
||||||
with qp.util.temp_seed(seed):
|
|
||||||
train, rest = data.split_stratified(train_prop=tr_size)
|
|
||||||
|
|
||||||
devel, test = rest.split_stratified(train_prop=0.5)
|
|
||||||
print(len(train))
|
|
||||||
print(len(devel))
|
|
||||||
print(len(test))
|
|
||||||
|
|
||||||
domaindir = join(outdir, domain)
|
|
||||||
|
|
||||||
write_pkl(train, join(domaindir, 'training_data.pkl'))
|
|
||||||
write_pkl(devel, join(domaindir, 'development_data.pkl'))
|
|
||||||
write_pkl(test, join(domaindir, 'test_data.pkl'))
|
|
||||||
|
|
||||||
gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
|
|
||||||
prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
|
|
||||||
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
|
|
||||||
prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
|
|
||||||
|
|
||||||
gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
|
|
||||||
prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
|
|
||||||
gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
|
|
||||||
prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,11 +1,6 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
# smoothing approximation
|
|
||||||
def smoothness(p):
    """
    Return half the sum of the squared second-order differences of `p`.

    For every three consecutive values (a, b, c) of `p` the term (-a + 2*b - c)**2 is accumulated; the
    result is 0.5 times the accumulated total (0.0 when `p` has fewer than three elements).

    :param p: a sequence of values supporting slicing
    :return: the smoothness value
    """
    total = 0
    for left, center, right in zip(p[:-2], p[1:-1], p[2:]):
        second_diff = -left + 2*center - right
        total = total + second_diff**2
    return 0.5 * total
|
|
||||||
|
|
||||||
|
|
||||||
def _check_arrays(prevs):
|
def _check_arrays(prevs):
|
||||||
prevs = np.asarray(prevs)
|
prevs = np.asarray(prevs)
|
||||||
if prevs.ndim==1:
|
if prevs.ndim==1:
|
||||||
|
|
@ -13,7 +8,6 @@ def _check_arrays(prevs):
|
||||||
return prevs
|
return prevs
|
||||||
|
|
||||||
|
|
||||||
# mean normalized match distance
|
|
||||||
def mnmd(prevs, prevs_hat):
|
def mnmd(prevs, prevs_hat):
|
||||||
prevs = _check_arrays(prevs)
|
prevs = _check_arrays(prevs)
|
||||||
prevs_hat = _check_arrays(prevs_hat)
|
prevs_hat = _check_arrays(prevs_hat)
|
||||||
|
|
@ -23,7 +17,6 @@ def mnmd(prevs, prevs_hat):
|
||||||
return np.mean(nmds)
|
return np.mean(nmds)
|
||||||
|
|
||||||
|
|
||||||
# normalized match distance
|
|
||||||
def nmd(prev, prev_hat):
|
def nmd(prev, prev_hat):
|
||||||
n = len(prev)
|
n = len(prev)
|
||||||
return (1./(n-1))*mdpa(prev, prev_hat)
|
return (1./(n-1))*mdpa(prev, prev_hat)
|
||||||
|
|
|
||||||
|
|
@ -1,150 +0,0 @@
|
||||||
import numpy as np
|
|
||||||
import quapy as qp
|
|
||||||
import os
|
|
||||||
from sklearn.linear_model import LogisticRegression
|
|
||||||
from sklearn.preprocessing import StandardScaler
|
|
||||||
from Ordinal.model import RegressionQuantification, LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge
|
|
||||||
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
|
|
||||||
from os.path import join
|
|
||||||
from utils import load_samples_folder, load_single_sample_pkl
|
|
||||||
from evaluation import nmd, mnmd
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
This script generates all results from Table 1 in the paper, i.e., all results comparing quantifiers equipped with
|
|
||||||
standard logistic regression against quantifiers equipped with order-aware classifiers
|
|
||||||
"""
|
|
||||||
|
|
||||||
def quantifiers():
    """
    Generate every configuration of the experiment.

    Each item is a tuple (name, quantifier, param_grid): the name of the method, a freshly instantiated
    quantifier wrapping a freshly instantiated classifier, and the hyperparameter grid to explore for it.
    """
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize':[True,False]}

    # (name, quantifier class, classifier class, grid); objects are only instantiated when the row is yielded
    configurations = [
        # baselines
        ('CC(LR)', CC, LogisticRegression, params_LR),
        ('PCC(LR)', PCC, LogisticRegression, params_LR),
        ('ACC(LR)', ACC, LogisticRegression, params_LR),
        ('PACC(LR)', PACC, LogisticRegression, params_LR),
        ('SLD(LR)', EMQ, LogisticRegression, params_LR),

        # with order-aware classifiers
        # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
        ('CC(OLR-AT)', CC, LogisticAT, params_OLR),
        ('PCC(OLR-AT)', PCC, LogisticAT, params_OLR),
        ('ACC(OLR-AT)', ACC, LogisticAT, params_OLR),
        ('PACC(OLR-AT)', PACC, LogisticAT, params_OLR),
        ('SLD(OLR-AT)', EMQ, LogisticAT, params_OLR),

        ('CC(OLR-SE)', CC, LogisticSE, params_OLR),
        ('PCC(OLR-SE)', PCC, LogisticSE, params_OLR),
        ('ACC(OLR-SE)', ACC, LogisticSE, params_OLR),
        ('PACC(OLR-SE)', PACC, LogisticSE, params_OLR),
        ('SLD(OLR-SE)', EMQ, LogisticSE, params_OLR),

        ('CC(OLR-IT)', CC, LogisticIT, params_OLR),
        ('PCC(OLR-IT)', PCC, LogisticIT, params_OLR),
        ('ACC(OLR-IT)', ACC, LogisticIT, params_OLR),
        ('PACC(OLR-IT)', PACC, LogisticIT, params_OLR),
        ('SLD(OLR-IT)', EMQ, LogisticIT, params_OLR),
        # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

        # regression-based ordinal regression (see https://pythonhosted.org/mord/)
        ('CC(LAD)', CC, LAD, params_SVR),
        ('ACC(LAD)', ACC, LAD, params_SVR),
        ('CC(ORidge)', CC, OrdinalRidge, params_Ridge),
        ('ACC(ORidge)', ACC, OrdinalRidge, params_Ridge),
    ]

    for name, quantifier_class, classifier_class, param_grid in configurations:
        yield name, quantifier_class(classifier_class()), param_grid
|
|
||||||
|
|
||||||
|
|
||||||
def run_experiment(params):
    """
    Run the grid search, the test evaluation and the regression-based correction for one quantifier.

    The function relies on the module-level settings `posfix`, `resultpath`, `datapath`, `domain`, `protocol`,
    `load_sample_fn` and `train` (and on `zscore` when `posfix == '-std'`). Two reports are saved in
    `resultpath`: `<qname>.all.csv` and `<qname>.all.reg.csv`.

    :param params: a tuple (qname, q, param_grid)
    :return: a tab-separated line with the method name, the best hyperparameters and the best score, or None
        when `<qname>.all.csv` already exists (nothing is computed in that case)
    """
    qname, q, param_grid = params
    qname += posfix

    resultfile = join(resultpath, f'{qname}.all.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for all-drift')

    def iter_samples(split_folder, total):
        # yields (instances, prevalence) for every sample stored in the given folder of the current protocol
        folderpath = join(datapath, domain, protocol, split_folder)
        stored = load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn)
        for sample in tqdm(stored, total=total):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()

    def load_test_samples():
        yield from iter_samples('test_samples', 5000)

    def load_dev_samples():
        yield from iter_samples('dev_samples', 1000)

    def evaluate(model):
        # error report on the test samples, together with the mean and std of its 'nmd' column
        report = qp.evaluation.gen_prevalence_report(model, gen_fn=load_test_samples, error_metrics=[nmd])
        return report, report['nmd'].mean(), report['nmd'].std()

    search = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        timeout=60*60*2,
        verbose=True)
    q = search.fit(train)

    hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}'

    print('[done]')

    report, mean_nmd, std_nmd = evaluate(q)
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    print('[learning regressor-based adjustment]')
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)

    report, mean_nmd, std_nmd = evaluate(q)
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.all.reg.csv')
    report.to_csv(resultfile, index=False)

    return hyperparams
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
|
|
||||||
#domain = 'Books-tfidf'
|
|
||||||
posfix = ''
|
|
||||||
|
|
||||||
# domain = 'fact'
|
|
||||||
# posfix = '-std' # set to '' to avoid standardization
|
|
||||||
# posfix = ''
|
|
||||||
|
|
||||||
load_sample_fn = load_single_sample_pkl
|
|
||||||
datapath = './data'
|
|
||||||
protocol = 'app'
|
|
||||||
resultpath = join('./results', domain, protocol)
|
|
||||||
os.makedirs(resultpath, exist_ok=True)
|
|
||||||
|
|
||||||
train = load_sample_fn(join(datapath, domain), 'training_data')
|
|
||||||
|
|
||||||
if posfix=='-std':
|
|
||||||
zscore = StandardScaler()
|
|
||||||
train.instances = zscore.fit_transform(train.instances)
|
|
||||||
|
|
||||||
with open(join(resultpath, 'hyper.txt'), 'at') as foo:
|
|
||||||
hypers = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3)
|
|
||||||
for h in hypers:
|
|
||||||
if h is not None:
|
|
||||||
foo.write(h)
|
|
||||||
foo.write('\n')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,105 +0,0 @@
|
||||||
import csv
|
|
||||||
import sys
|
|
||||||
import datasets
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import torch.cuda
|
|
||||||
from datasets import Dataset, DatasetDict
|
|
||||||
from sklearn.metrics import f1_score
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
from transformers import AutoModelForSequenceClassification
|
|
||||||
from transformers import AutoTokenizer, DataCollatorWithPadding
|
|
||||||
from transformers import Trainer
|
|
||||||
from transformers import TrainingArguments
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
This script fine-tunes a pre-trained language model on a given textual training set.
|
|
||||||
The training goes for a maximum of 5 epochs, but stores the model parameters of the best performing epoch according
|
|
||||||
to the validation loss in a hold-out val split of 1000 documents (stratified).
|
|
||||||
|
|
||||||
We used it with RoBERTa in the training set of the Amazon-OQ-BK domain, i.e.:
|
|
||||||
$> python3 <this-script>.py ./data/Books/training_data.txt roberta-base
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def tokenize_function(example):
    """
    Tokenize the 'review' field of `example` with the module-level `tokenizer`.

    Texts are padded to the maximum length and truncated; the maximum length is 64 when the module-level flag
    `debug` is set and 256 otherwise.

    :param example: a mapping with a 'review' entry
    :return: whatever `tokenizer` returns for that entry
    """
    max_length = 64 if debug else 256
    return tokenizer(example['review'], padding='max_length', truncation=True, max_length=max_length)
|
|
||||||
|
|
||||||
|
|
||||||
def compute_metrics(eval_preds):
    """
    Compute macro- and micro-averaged F1 from a (logits, labels) pair.

    The predicted class of every item is the argmax of its logits along the last axis.

    :param eval_preds: a pair (logits, labels)
    :return: a dict with keys 'macro-f1' and 'micro-f1'
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    scores['macro-f1'] = f1_score(labels, predictions, average='macro')
    scores['micro-f1'] = f1_score(labels, predictions, average='micro')
    return scores
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
debug = False
|
|
||||||
assert torch.cuda.is_available(), 'cuda is not available'
|
|
||||||
|
|
||||||
# datapath = './data/Books/training_data.txt'
|
|
||||||
# checkpoint = 'roberta-base'
|
|
||||||
n_args = len(sys.argv)
|
|
||||||
assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'
|
|
||||||
|
|
||||||
datapath = sys.argv[1] # './data/Books/training_data.txt'
|
|
||||||
checkpoint = sys.argv[2] #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
|
|
||||||
|
|
||||||
modelout = checkpoint+'-val-finetuned'
|
|
||||||
|
|
||||||
# load the training set, and extract a held-out validation split of 1000 documents (stratified)
|
|
||||||
df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
|
|
||||||
labels = df['labels'].to_frame()
|
|
||||||
X_train, X_val = train_test_split(df, stratify=labels, test_size=.25, random_state=1)
|
|
||||||
num_labels = len(pd.unique(labels['labels']))
|
|
||||||
|
|
||||||
features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
|
|
||||||
train = Dataset.from_pandas(df=X_train, split='train', features=features)
|
|
||||||
validation = Dataset.from_pandas(df=X_val, split='validation', features=features)
|
|
||||||
|
|
||||||
dataset = DatasetDict({
|
|
||||||
'train': train.select(range(500)) if debug else train,
|
|
||||||
'validation': validation.select(range(500)) if debug else validation
|
|
||||||
})
|
|
||||||
|
|
||||||
# tokenize the dataset
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
|
||||||
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
|
||||||
|
|
||||||
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()
|
|
||||||
|
|
||||||
# fine-tuning
|
|
||||||
training_args = TrainingArguments(
|
|
||||||
modelout,
|
|
||||||
learning_rate=2e-5,
|
|
||||||
num_train_epochs=5,
|
|
||||||
weight_decay=0.01,
|
|
||||||
evaluation_strategy='epoch',
|
|
||||||
save_strategy='epoch',
|
|
||||||
per_device_train_batch_size=16,
|
|
||||||
per_device_eval_batch_size=16,
|
|
||||||
# eval_steps=10,
|
|
||||||
save_total_limit=1,
|
|
||||||
load_best_model_at_end=True
|
|
||||||
)
|
|
||||||
trainer = Trainer(
|
|
||||||
model,
|
|
||||||
args=training_args,
|
|
||||||
train_dataset=tokenized_datasets['train'],
|
|
||||||
eval_dataset=tokenized_datasets['validation'],
|
|
||||||
data_collator=DataCollatorWithPadding(tokenizer),
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
compute_metrics=compute_metrics
|
|
||||||
)
|
|
||||||
|
|
||||||
trainer.train()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,70 +0,0 @@
|
||||||
import pandas as pd
|
|
||||||
from os.path import join
|
|
||||||
import os
|
|
||||||
from glob import glob
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from Ordinal.main import quantifiers
|
|
||||||
from Ordinal.tabular import Table
|
|
||||||
|
|
||||||
"""
|
|
||||||
This script generates some tables for Amazon-OQ-BK (for internal use only)
|
|
||||||
"""
|
|
||||||
|
|
||||||
domain = 'Books-tfidf'
|
|
||||||
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
|
|
||||||
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
|
|
||||||
domain_bert_post = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
|
|
||||||
prot = 'app'
|
|
||||||
outpath = f'./tables/{domain}/{prot}/results.tex'
|
|
||||||
|
|
||||||
resultpath = join('./results', domain, prot)
|
|
||||||
resultpath_bertlast = join('./results', domain_bert_last, prot)
|
|
||||||
resultpath_bertave = join('./results', domain_bert_ave, prot)
|
|
||||||
resultpath_bertpost = join('./results', domain_bert_post, prot)
|
|
||||||
|
|
||||||
methods = [qname for qname, *_ in quantifiers()]
|
|
||||||
methods += ['SLD(LR)-agg']
|
|
||||||
methods_Rlast = [m+'-RoBERTa-last' for m in methods]
|
|
||||||
methods_Rave = [m+'-RoBERTa-average' for m in methods]
|
|
||||||
methods_Rpost = [m+'-RoBERTa-posteriors' for m in methods]
|
|
||||||
methods = methods + methods_Rlast + methods_Rave + methods_Rpost
|
|
||||||
# methods += [m+'-r' for m in methods]
|
|
||||||
|
|
||||||
table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)
|
|
||||||
|
|
||||||
resultfiles = list(glob(f'{resultpath}/*.csv')) \
|
|
||||||
+ list(glob(f'{resultpath_bertlast}/*.csv')) \
|
|
||||||
+ list(glob(f'{resultpath_bertave}/*.csv')) \
|
|
||||||
+ list(glob(f'{resultpath_bertpost}/*.csv'))
|
|
||||||
|
|
||||||
for resultfile in resultfiles:
|
|
||||||
df = pd.read_csv(resultfile)
|
|
||||||
nmd = df['nmd'].values
|
|
||||||
resultname = Path(resultfile).name
|
|
||||||
method, drift, *other = resultname.replace('.csv', '').split('.')
|
|
||||||
if other:
|
|
||||||
method += '-r'
|
|
||||||
if method not in methods:
|
|
||||||
continue
|
|
||||||
|
|
||||||
table.add(drift, method, nmd)
|
|
||||||
|
|
||||||
os.makedirs(Path(outpath).parent, exist_ok=True)
|
|
||||||
|
|
||||||
# Assemble the LaTeX code of the table. Every backslash is escaped explicitly: "\hline" and "\end" used to
# rely on invalid escape sequences ("\h", "\e"), which Python reports with a warning and plans to reject.
# The resulting text is unchanged.
tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \\hline
"""
tabular += table.latexTabularT(average=False)
tabular += """
\\end{tabular}%
}"""
|
|
||||||
|
|
||||||
print('saving table in', outpath)
|
|
||||||
with open(outpath, 'wt') as foo:
|
|
||||||
foo.write(tabular)
|
|
||||||
foo.write('\n')
|
|
||||||
|
|
||||||
print('[done]')
|
|
||||||
|
|
||||||
|
|
@ -1,82 +0,0 @@
|
||||||
import pandas as pd
|
|
||||||
from os.path import join
|
|
||||||
import os
|
|
||||||
from glob import glob
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from Ordinal.experiments_lr_vs_ordlr import quantifiers
|
|
||||||
from Ordinal.tabular import Table
|
|
||||||
|
|
||||||
"""
|
|
||||||
This script generates some tables for Fact-OQ (for internal use only)
|
|
||||||
"""
|
|
||||||
|
|
||||||
#domain = 'fact'
|
|
||||||
#domain = 'Books-tfidf'
|
|
||||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
|
|
||||||
prot = 'app'
|
|
||||||
outpath = f'./tables/{domain}/{prot}/results.tex'
|
|
||||||
|
|
||||||
resultpath = join('./results', domain, prot)
|
|
||||||
|
|
||||||
withstd=False
|
|
||||||
|
|
||||||
methods = [qname for qname, *_ in quantifiers()]
|
|
||||||
if withstd:
|
|
||||||
methods = [m+'-std' for m in methods]
|
|
||||||
#methods = methods + methods_variant
|
|
||||||
# methods += [m+'-r' for m in methods]
|
|
||||||
|
|
||||||
quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
|
|
||||||
# method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD']
|
|
||||||
method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']
|
|
||||||
if withstd:
|
|
||||||
method_variants = [m+'-std' for m in method_variants]
|
|
||||||
|
|
||||||
print('families:', quantifiers_families)
|
|
||||||
print('variants', method_variants)
|
|
||||||
# NOTE: the backslash of "\multicolumn" is escaped explicitly; "\m" is an invalid escape sequence that Python
# reports with a warning. The value passed as missing_str is unchanged.
table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4,
              color=False, show_rel_to=0, missing_str='\\multicolumn{1}{c}{---}', clean_zero=True)
|
|
||||||
|
|
||||||
resultfiles = list(glob(f'{resultpath}/*).all.csv'))
|
|
||||||
|
|
||||||
for resultfile in resultfiles:
|
|
||||||
df = pd.read_csv(resultfile)
|
|
||||||
nmd = df['nmd'].values
|
|
||||||
resultname = Path(resultfile).name
|
|
||||||
|
|
||||||
method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average','').split('.')
|
|
||||||
if drift!='all':
|
|
||||||
continue
|
|
||||||
if other:
|
|
||||||
method += '-r'
|
|
||||||
if method not in methods:
|
|
||||||
continue
|
|
||||||
|
|
||||||
family, variant = method.split('(')
|
|
||||||
variant = variant.replace(')', '')
|
|
||||||
if variant not in method_variants:
|
|
||||||
continue
|
|
||||||
table.add(family, variant, nmd)
|
|
||||||
|
|
||||||
os.makedirs(Path(outpath).parent, exist_ok=True)
|
|
||||||
|
|
||||||
# Assemble the LaTeX code of the table. Every backslash is escaped explicitly: "\end" used to rely on an
# invalid escape sequence ("\e"), which Python reports with a warning and plans to reject.
# The resulting text is unchanged.
tabular = """
\\resizebox{\\textwidth}{!}{%

\\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """}
\\toprule
"""

tabular += table.latexTabularT(average=False)
tabular += """
\\end{tabular}%
}"""
|
|
||||||
|
|
||||||
print('saving table in', outpath)
|
|
||||||
with open(outpath, 'wt') as foo:
|
|
||||||
foo.write(tabular)
|
|
||||||
foo.write('\n')
|
|
||||||
|
|
||||||
print('[done]')
|
|
||||||
|
|
||||||
|
|
@ -1,152 +0,0 @@
|
||||||
import sys
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from torch.utils.data import DataLoader
|
|
||||||
from transformers import AutoTokenizer
|
|
||||||
from transformers import AutoModelForSequenceClassification
|
|
||||||
from os.path import join
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from Ordinal.utils import load_samples_folder, load_single_sample_as_csv
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
This script takes a pre-trained model (a fine-tuned one) and generates numerical representations for all
|
|
||||||
samples in the dataset. The representations are saved in npy-txt plain format.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def tokenize_function(example):
    """Tokenize the ``'review'`` field of *example* and move the result to the GPU.

    Relies on the module-level ``tokenizer`` and ``debug`` names. Sequences are padded
    to ``max_length`` and truncated; when ``debug`` is truthy the length is capped at
    64 tokens, otherwise no explicit cap is passed.

    :param example: object indexable by the key ``'review'``
        (presumably a batch of raw review texts -- confirm against the caller)
    :return: dict with the CUDA tensors ``input_ids`` and ``attention_mask``
    """
    encoded = tokenizer(
        example['review'],
        padding='max_length',
        truncation=True,
        max_length=64 if debug else None,
        return_tensors='pt',
    )
    input_ids = encoded.input_ids.cuda()
    attention_mask = encoded.attention_mask.cuda()
    return {'input_ids': input_ids, 'attention_mask': attention_mask}
|
|
||||||
|
|
||||||
|
|
||||||
def save_samples_as_txt(tensors, labels, path):
    """Write a labelled sample to a plain-text file, one document per row.

    Every row contains the label (written as an integer) followed by the components
    of the corresponding vector (written as floats).

    :param tensors: 2-D array-like with one row per document
    :param labels: the labels, one per document; a pandas Series/DataFrame (its
        ``.values`` are used) or any array-like
    :param path: destination file
    """
    vectors = np.asarray(tensors)
    # np.hstack cannot join a 1-D label vector with a 2-D matrix, so the labels are
    # forced into a single column whatever their original shape, (n,) or (n, 1)
    label_column = np.asarray(getattr(labels, 'values', labels)).reshape(-1, 1)
    vec_lab = np.hstack([label_column, vectors])
    n_cols = vectors.shape[1]
    np.savetxt(path, vec_lab, fmt=['%d'] + ['%f'] * n_cols)
|
|
||||||
|
|
||||||
|
|
||||||
def transform_sample(instances, labels, outpath, batch_size=50):
    """Turn one sample into vectors with the loaded model and save them to *outpath*.

    The sample is processed in consecutive slices of ``batch_size`` documents. Each
    slice is tokenized with ``tokenize_function`` and fed to the module-level ``model``.
    What is kept from the model output depends on the module-level ``generation_mode``:

    * ``'posteriors'``: softmax of the logits;
    * ``'last'``: the vector at token position 0 of the last hidden state;
    * ``'average'``: the vector at token position 0, averaged across all hidden states.

    The former guard checked ``ndocs % (ndocs // batch_size)``, i.e. divisibility by
    the number of batches rather than by the batch size, and divided by zero whenever
    the sample was smaller than one batch. Slicing already yields a shorter final
    batch, so a partial last batch is simply processed.

    NOTE(review): the outputs are converted with ``.cpu().numpy()``, so this is
    presumably meant to be called with gradients disabled -- confirm against the caller.

    :param instances: the documents; must support slicing
    :param labels: the labels, one per document
    :param outpath: file in which the labelled vectors are written
    :param batch_size: number of documents sent to the model at once
    :raises ValueError: if the sample is empty or ``batch_size`` is not positive
    :raises NotImplementedError: if ``generation_mode`` is not one of the modes above
    """
    ndocs = len(labels)
    if ndocs == 0:
        raise ValueError('cannot transform an empty sample')
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    transformations = []
    for batch_id in range(0, ndocs, batch_size):
        batch_instances = instances[batch_id:batch_id + batch_size]

        tokenized_dataset = tokenize_function(batch_instances)
        out = model(**tokenized_dataset, output_hidden_states=True)

        if generation_mode == 'posteriors':
            transformed = torch.softmax(out.logits, dim=-1)
        elif generation_mode == 'last':
            # index 0 on the token axis of the last hidden state
            transformed = out.hidden_states[-1][:, 0, :]
        elif generation_mode == 'average':
            # stack the hidden states on a new leading axis, keep token position 0,
            # and average over that leading axis
            all_layer_cls = torch.stack(out.hidden_states)[:, :, 0, :]
            transformed = torch.mean(all_layer_cls, dim=0)
        else:
            raise NotImplementedError()

        transformations.append(transformed.cpu().numpy())

    transformations = np.vstack(transformations)
    save_samples_as_txt(transformations, labels, outpath)
|
|
||||||
|
|
||||||
|
|
||||||
def transform_folder_samples(protocol, splitname, skip=0):
    """Transform every sample of one split of a protocol, one output file per sample.

    Samples are read from ``<datapath>/<domain>/<protocol>/<splitname>`` and written,
    as ``<index>.txt``, to ``<datapath>/<outname>/<protocol>/<splitname>``
    (``datapath``, ``domain`` and ``outname`` are module-level names).

    :param protocol: name of the protocol sub-folder
    :param splitname: name of the split sub-folder; names starting with ``'dev'`` set
        the progress-bar total to 1000, any other name to 5000
    :param skip: samples whose index is lower than this value are read but not
        transformed
    """
    source_dir = join(datapath, domain, protocol, splitname)
    target_dir = join(datapath, outname, protocol, splitname)

    if splitname.startswith('dev'):
        expected = 1000
    else:
        expected = 5000

    samples = load_samples_folder(source_dir, load_fn=load_single_sample_as_csv)
    progress = tqdm(enumerate(samples), desc=f'{protocol} {splitname}', total=expected)
    for index, (instances, labels) in progress:
        if index < skip:
            continue
        transform_sample(instances, labels, outpath=join(target_dir, f'{index}.txt'))
|
|
||||||
|
|
||||||
|
|
||||||
def get_best_checkpoint(checkpointdir):
    """Return the path of the ``checkpoint-<step>`` entry with the lowest step.

    At most two checkpoints are expected in *checkpointdir* (the best one and the last
    one, as the assertion message states); the one with the smallest step number is
    taken as the best.

    :param checkpointdir: folder containing the ``checkpoint-<step>`` entries
    :return: ``'<checkpointdir>/checkpoint-<lowest step>'``
    :raises ValueError: if no ``checkpoint-*`` entry is found
    """
    from glob import glob

    # the step is what follows the LAST 'checkpoint-' in the path, so that a parent
    # folder whose own name contains 'checkpoint-' does not break the parsing
    steps = [int(folder.split('checkpoint-')[-1]) for folder in glob(f'{checkpointdir}/checkpoint-*')]
    if not steps:
        raise ValueError(f'no checkpoint-* folder found in {checkpointdir}')
    assert len(steps) <= 2, 'unexpected number of steps, only two where expected (the best one and the last one)'
    choosen = f'{checkpointdir}/checkpoint-{min(steps)}'
    print(f'choosen checkpoint is {choosen}')
    return choosen
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Command-line driver: <checkpoint> <generation-mode>.
    # Loads the selected checkpoint of a fine-tuned model and writes, under
    # <datapath>/<domain>-<checkpoint>-<generation_mode>, the vectors of the training
    # set and of the dev/test samples of every protocol listed below.
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    n_args = len(sys.argv)
    # the usage text names the modes exactly as they are validated further down
    # ('average', not 'ave')
    assert n_args==3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
                      '\tgeneration-mode: last (last layer), average (average pooling), or posteriors (posterior probabilities)'

    checkpoint = sys.argv[1]  # e.g., 'roberta-base-val-finetuned'
    generation_mode = sys.argv[2]  # e.g., 'last'

    assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'

    checkpoint = get_best_checkpoint(checkpoint)

    num_labels = 5

    datapath = './data'
    domain = 'Books'
    protocols = ['real']  # ['app', 'npp']

    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_model'
    outname = domain + f'-{checkpoint}-{generation_mode}'

    # inference only: the model outputs are converted to numpy arrays downstream
    with torch.no_grad():
        print('loading', checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

        os.makedirs(join(datapath, outname), exist_ok=True)

        print('transforming the training set')
        instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
        transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
        print('[done]')

        for protocol in protocols:
            in_path = join(datapath, domain, protocol)
            out_path = join(datapath, outname, protocol)
            os.makedirs(out_path, exist_ok=True)
            os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
            os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
            # the prevalence files are copied unchanged next to the transformed samples
            shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
            shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))

            print('processing', protocol)
            transform_folder_samples(protocol, 'dev_samples')
            transform_folder_samples(protocol, 'test_samples')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,16 @@
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.data.reader import from_text
|
||||||
|
from quapy.functional import strprev
|
||||||
|
|
||||||
|
# Quick inspection of the training set of one category: prints the number of
# documents and the class prevalence.
category = 'Books'
datadir = './data'
training_path = '/'.join([datadir, category, 'training_data.txt'])

data = LabelledCollection.load(training_path, loader_func=from_text)

n_documents = len(data)
print(n_documents)
class_prevalence = strprev(data.prevalence())
print(class_prevalence)
|
||||||
|
|
||||||
|
|
||||||
217
Ordinal/main.py
217
Ordinal/main.py
|
|
@ -3,154 +3,87 @@ from sklearn.linear_model import LogisticRegression
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from Ordinal.model import OrderedLogisticRegression, LogisticAT
|
from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier
|
||||||
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
|
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from os.path import join
|
from os.path import join
|
||||||
import os
|
from utils import load_samples, load_samples_pkl
|
||||||
from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
|
|
||||||
from evaluation import nmd, mnmd
|
from evaluation import nmd, mnmd
|
||||||
from time import time
|
from time import time
|
||||||
import pickle
|
import pickle
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import mord
|
|
||||||
|
domain = 'Books-tfidf'
|
||||||
|
datapath = './data'
|
||||||
|
protocol = 'app'
|
||||||
def quantifiers():
|
drift = 'high'
|
||||||
params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
|
|
||||||
# params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
|
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
|
||||||
params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
|
|
||||||
params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
|
|
||||||
# params_SVR = {'C': np.logspace(0, 1, 2)}
|
def load_test_samples():
|
||||||
|
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
|
||||||
# baselines
|
ids = set(ids)
|
||||||
yield 'CC(LR)', CC(LogisticRegression()), params_LR
|
for sample in tqdm(load_samples_pkl(join(datapath, domain, protocol, 'test_samples'), filter=ids), total=len(ids)):
|
||||||
yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
|
yield sample.instances, sample.prevalence()
|
||||||
yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
|
|
||||||
yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
|
|
||||||
#yield 'HDy(LR)', HDy(LogisticRegression()), params_LR
|
def load_dev_samples():
|
||||||
yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR
|
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
|
||||||
|
ids = set(ids)
|
||||||
# with order-aware classifiers
|
for sample in tqdm(load_samples_pkl(join(datapath, domain, protocol, 'dev_samples'), filter=ids), total=len(ids)):
|
||||||
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
|
yield sample.instances, sample.prevalence()
|
||||||
#yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
|
|
||||||
#yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
|
|
||||||
#yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
|
print('fitting the quantifier')
|
||||||
#yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
|
|
||||||
#yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
|
# q = EMQ(LogisticRegression(class_weight='balanced'))
|
||||||
#yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
|
# q = PACC(LogisticRegression(class_weight='balanced'))
|
||||||
# other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)
|
q = PACC(OrderedLogisticRegression())
|
||||||
|
# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
|
||||||
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
|
# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
|
||||||
# I am using my implementation, which caters for predict_proba (linear distance to the two closest classes, 0 in the rest)
|
# q = ACC(RegressorClassifier())
|
||||||
# the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with my wrapper classes for having the nclasses_; those do
|
|
||||||
# not implement predict_proba nor decision_score
|
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
|
||||||
#yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
|
# param_grid = {'C': np.logspace(-3,3,14)}
|
||||||
#yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
|
# param_grid = {'alpha':np.logspace(-8, 6, 15)}
|
||||||
# yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
|
|
||||||
# yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
|
# q = qp.model_selection.GridSearchQ(
|
||||||
# yield 'PACC(SVR)', PACC(RegressorClassifier()), params_SVR
|
# q,
|
||||||
#yield 'HDy(SVR)', HDy(RegressorClassifier()), params_SVR
|
# param_grid,
|
||||||
# yield 'SLD(SVR)', EMQ(RegressorClassifier()), params_SVR
|
# 1000,
|
||||||
|
# 'gen',
|
||||||
|
# error=mnmd,
|
||||||
def run_experiment(params):
|
# val_split=load_dev_samples,
|
||||||
qname, q, param_grid, drift = params
|
# n_jobs=-1,
|
||||||
qname += posfix
|
# refit=False,
|
||||||
resultfile = join(resultpath, f'{qname}.{drift}.csv')
|
# verbose=True)
|
||||||
if os.path.exists(resultfile):
|
|
||||||
print(f'result file {resultfile} already exists: continue')
|
q.fit(train)
|
||||||
return None
|
|
||||||
|
# q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
||||||
print(f'fitting {qname} for {drift}-drift')
|
# q.fit(None)
|
||||||
|
|
||||||
|
print('[done]')
|
||||||
def load_test_samples():
|
|
||||||
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
|
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||||
ids = set(ids)
|
mean_nmd = report['nmd'].mean()
|
||||||
folderpath = join(datapath, domain, protocol, 'test_samples')
|
std_nmd = report['nmd'].std()
|
||||||
for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
|
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||||
yield sample.instances, sample.prevalence()
|
|
||||||
|
q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
||||||
|
q.fit(None)
|
||||||
def load_dev_samples():
|
|
||||||
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
|
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||||
ids = set(ids)
|
mean_nmd = report['nmd'].mean()
|
||||||
folderpath = join(datapath, domain, protocol, 'dev_samples')
|
std_nmd = report['nmd'].std()
|
||||||
for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
|
print(f'[regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||||
yield sample.instances, sample.prevalence()
|
|
||||||
|
# drift='high'
|
||||||
q = qp.model_selection.GridSearchQ(
|
# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||||
q,
|
# mean_nmd = report['nmd'].mean()
|
||||||
param_grid,
|
# std_nmd = report['nmd'].std()
|
||||||
sample_size=1000,
|
# print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||||
protocol='gen',
|
|
||||||
error=mnmd,
|
|
||||||
val_split=load_dev_samples,
|
|
||||||
n_jobs=-1,
|
|
||||||
refit=False,
|
|
||||||
verbose=True).fit(train)
|
|
||||||
|
|
||||||
hyperparams = f'{qname}\t{drift}\t{q.best_params_}'
|
|
||||||
|
|
||||||
print('[done]')
|
|
||||||
|
|
||||||
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
|
||||||
mean_nmd = report['nmd'].mean()
|
|
||||||
std_nmd = report['nmd'].std()
|
|
||||||
print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
|
|
||||||
report.to_csv(resultfile, index=False)
|
|
||||||
|
|
||||||
print('[learning regressor-based adjustment]')
|
|
||||||
q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
|
|
||||||
q.fit(None)
|
|
||||||
|
|
||||||
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
|
||||||
mean_nmd = report['nmd'].mean()
|
|
||||||
std_nmd = report['nmd'].std()
|
|
||||||
print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
|
|
||||||
resultfile = join(resultpath, f'{qname}.{drift}.reg.csv')
|
|
||||||
report.to_csv(resultfile, index=False)
|
|
||||||
|
|
||||||
return hyperparams
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
#preprocessing = 'roberta.last'
|
|
||||||
preprocessing = 'roberta.average'
|
|
||||||
# preprocessing = 'roberta.posteriors'
|
|
||||||
#preprocessing = 'tfidf'
|
|
||||||
if preprocessing=='tfidf':
|
|
||||||
domain = 'Books-tfidf'
|
|
||||||
posfix = ''
|
|
||||||
elif preprocessing=='roberta.last':
|
|
||||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
|
|
||||||
posfix = '-RoBERTa-last'
|
|
||||||
elif preprocessing=='roberta.average':
|
|
||||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
|
|
||||||
posfix = '-RoBERTa-average'
|
|
||||||
elif preprocessing=='roberta.posteriors':
|
|
||||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
|
|
||||||
posfix = '-RoBERTa-posteriors'
|
|
||||||
load_sample_fn = load_single_sample_pkl
|
|
||||||
datapath = './data'
|
|
||||||
protocol = 'app'
|
|
||||||
resultpath = join('./results', domain, protocol)
|
|
||||||
os.makedirs(resultpath, exist_ok=True)
|
|
||||||
|
|
||||||
train = load_sample_fn(join(datapath, domain), 'training_data')
|
|
||||||
|
|
||||||
with open(join(resultpath, 'hyper.txt'), 'at') as foo:
|
|
||||||
#for drift in [f'smooth{i}' for i in range(5)] + ['all']:
|
|
||||||
params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
|
|
||||||
hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
|
|
||||||
for h in hypers:
|
|
||||||
if h is not None:
|
|
||||||
foo.write(h)
|
|
||||||
foo.write('\n')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
255
Ordinal/model.py
255
Ordinal/model.py
|
|
@ -1,11 +1,14 @@
|
||||||
import mord
|
from copy import deepcopy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy.sparse import issparse
|
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
from sklearn.decomposition import TruncatedSVD
|
from sklearn.decomposition import TruncatedSVD
|
||||||
from sklearn.linear_model import Ridge
|
from sklearn.linear_model import LogisticRegression, Ridge
|
||||||
from sklearn.svm import LinearSVR
|
from scipy.sparse import issparse
|
||||||
from sklearn.utils.class_weight import compute_class_weight
|
from sklearn.multiclass import OneVsRestClassifier
|
||||||
|
from sklearn.multioutput import MultiOutputRegressor
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.svm import LinearSVR, SVR
|
||||||
from statsmodels.miscmodels.ordinal_model import OrderedModel
|
from statsmodels.miscmodels.ordinal_model import OrderedModel
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -33,21 +36,112 @@ class OrderedLogisticRegression:
|
||||||
return self.res_prob.model.predict(self.res_prob.params, exog=X)
|
return self.res_prob.model.predict(self.res_prob.params, exog=X)
|
||||||
|
|
||||||
|
|
||||||
class LAD(BaseEstimator, ClassifierMixin):
|
class StackedClassifier: # aka Funnelling Monolingual
|
||||||
def __init__(self, C=1.0, class_weight=None):
|
def __init__(self, base_estimator=LogisticRegression()):
|
||||||
self.C = C
|
if not hasattr(base_estimator, 'predict_proba'):
|
||||||
self.class_weight = class_weight
|
print('the estimator does not seem to be probabilistic: calibrating')
|
||||||
|
base_estimator = CalibratedClassifierCV(base_estimator)
|
||||||
|
# self.base = deepcopy(OneVsRestClassifier(base_estimator))
|
||||||
|
# self.meta = deepcopy(OneVsRestClassifier(base_estimator))
|
||||||
|
self.base = deepcopy(base_estimator)
|
||||||
|
self.meta = deepcopy(base_estimator)
|
||||||
|
self.norm = StandardScaler()
|
||||||
|
|
||||||
def fit(self, X, y, sample_weight=None):
|
def fit(self, X, y):
|
||||||
self.regressor = LinearSVR(C=self.C)
|
self.base.fit(X, y)
|
||||||
|
P = self.base.predict_proba(X)
|
||||||
|
P = self.norm.fit_transform(P)
|
||||||
|
self.meta.fit(P, y)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def predict(self, X):
|
||||||
|
P = self.base.predict_proba(X)
|
||||||
|
P = self.norm.transform(P)
|
||||||
|
return self.meta.predict(P)
|
||||||
|
|
||||||
|
def predict_proba(self, X):
|
||||||
|
P = self.base.predict_proba(X)
|
||||||
|
P = self.norm.transform(P)
|
||||||
|
return self.meta.predict_proba(P)
|
||||||
|
|
||||||
|
|
||||||
|
class RegressionQuantification:
|
||||||
|
def __init__(self,
|
||||||
|
base_quantifier,
|
||||||
|
regression='svr',
|
||||||
|
val_samples_generator=None,
|
||||||
|
norm=True):
|
||||||
|
|
||||||
|
self.base_quantifier = base_quantifier
|
||||||
|
if isinstance(regression, str):
|
||||||
|
assert regression in ['ridge', 'svr'], 'unknown regression model'
|
||||||
|
if regression == 'ridge':
|
||||||
|
self.reg = Ridge(normalize=norm)
|
||||||
|
elif regression == 'svr':
|
||||||
|
self.reg = MultiOutputRegressor(LinearSVR())
|
||||||
|
else:
|
||||||
|
self.reg = regression
|
||||||
|
# self.reg = MultiTaskLassoCV(normalize=norm)
|
||||||
|
# self.reg = KernelRidge(kernel='rbf')
|
||||||
|
# self.reg = LassoLarsCV(normalize=norm)
|
||||||
|
# self.reg = MultiTaskElasticNetCV(normalize=norm) <- bien
|
||||||
|
#self.reg = LinearRegression(normalize=norm) # <- bien
|
||||||
|
# self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- bastante bien, incluso sin norm
|
||||||
|
# self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- bastante bien, incluso sin norm
|
||||||
|
# self.reg = MultiOutputRegressor(SGDRegressor()) # lento, no va
|
||||||
|
self.regression = regression
|
||||||
|
self.val_samples_generator = val_samples_generator
|
||||||
|
# self.norm = StandardScaler()
|
||||||
|
# self.covs = covs
|
||||||
|
|
||||||
|
def generate_validation_samples(self):
|
||||||
|
Xs, ys = [], []
|
||||||
|
for instances, prevalence in self.val_samples_generator():
|
||||||
|
ys.append(prevalence)
|
||||||
|
Xs.append(self.base_quantifier.quantify(instances))
|
||||||
|
Xs = np.asarray(Xs)
|
||||||
|
ys = np.asarray(ys)
|
||||||
|
return Xs, ys
|
||||||
|
|
||||||
|
def fit(self, data):
|
||||||
|
print('fitting quantifier')
|
||||||
|
if data is not None:
|
||||||
|
self.base_quantifier.fit(data)
|
||||||
|
print('generating val samples')
|
||||||
|
Xs, ys = self.generate_validation_samples()
|
||||||
|
# Xs = self.norm.fit_transform(Xs)
|
||||||
|
print('fitting regressor')
|
||||||
|
self.reg.fit(Xs, ys)
|
||||||
|
print('[done]')
|
||||||
|
return self
|
||||||
|
|
||||||
|
def quantify(self, instances):
|
||||||
|
Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
|
||||||
|
# Xs = self.norm.transform(Xs)
|
||||||
|
Xs = self.reg.predict(Xs)
|
||||||
|
# Xs = self.norm.inverse_transform(Xs)
|
||||||
|
adjusted = Xs / Xs.sum()
|
||||||
|
# adjusted = np.clip(Xs, 0, 1)
|
||||||
|
adjusted = adjusted.flatten()
|
||||||
|
return adjusted
|
||||||
|
|
||||||
|
def get_params(self, deep=True):
|
||||||
|
return self.base_quantifier.get_params()
|
||||||
|
|
||||||
|
def set_params(self, **params):
|
||||||
|
self.base_quantifier.set_params(**params)
|
||||||
|
|
||||||
|
|
||||||
|
class RegressorClassifier(BaseEstimator, ClassifierMixin):
|
||||||
|
def __init__(self):
|
||||||
|
self.regressor = LinearSVR()
|
||||||
# self.regressor = SVR()
|
# self.regressor = SVR()
|
||||||
# self.regressor = Ridge(normalize=True)
|
# self.regressor = Ridge(normalize=True)
|
||||||
classes = sorted(np.unique(y))
|
|
||||||
self.nclasses = len(classes)
|
|
||||||
if self.class_weight == 'balanced':
|
def fit(self, X, y):
|
||||||
class_weight = compute_class_weight('balanced', classes=classes, y=y)
|
self.nclasses = len(np.unique(y))
|
||||||
sample_weight = class_weight[y]
|
self.regressor.fit(X, y)
|
||||||
self.regressor.fit(X, y, sample_weight=sample_weight)
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def predict(self, X):
|
def predict(self, X):
|
||||||
|
|
@ -57,20 +151,13 @@ class LAD(BaseEstimator, ClassifierMixin):
|
||||||
c[c>(self.nclasses-1)]=self.nclasses-1
|
c[c>(self.nclasses-1)]=self.nclasses-1
|
||||||
return c.astype(np.int)
|
return c.astype(np.int)
|
||||||
|
|
||||||
# def predict_proba(self, X):
|
def predict_proba(self, X):
|
||||||
# r = self.regressor.predict(X)
|
|
||||||
# nC = len(self.classes_)
|
|
||||||
# r = np.clip(r, 0, nC - 1)
|
|
||||||
# dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
|
|
||||||
# invdist = 1 - dists
|
|
||||||
# invdist[invdist < 0] = 0
|
|
||||||
# return invdist
|
|
||||||
|
|
||||||
def decision_function(self, X):
|
|
||||||
r = self.regressor.predict(X)
|
r = self.regressor.predict(X)
|
||||||
nC = len(self.classes_)
|
nC = len(self.classes_)
|
||||||
|
r = np.clip(r, 0, nC - 1)
|
||||||
dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
|
dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
|
||||||
invdist = 1 - dists
|
invdist = 1 - dists
|
||||||
|
invdist[invdist < 0] = 0
|
||||||
return invdist
|
return invdist
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
@ -78,118 +165,8 @@ class LAD(BaseEstimator, ClassifierMixin):
|
||||||
return np.arange(self.nclasses)
|
return np.arange(self.nclasses)
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
def get_params(self, deep=True):
|
||||||
return {'C':self.C, 'class_weight': self.class_weight}
|
return self.regressor.get_params()
|
||||||
|
|
||||||
def set_params(self, **params):
|
def set_params(self, **params):
|
||||||
self.C = params['C']
|
self.regressor.set_params(**params)
|
||||||
self.class_weight = params['class_weight']
|
|
||||||
|
|
||||||
|
|
||||||
class OrdinalRidge(BaseEstimator, ClassifierMixin):
    """Ridge regressor used as a classifier by rounding its predictions to a class index.

    Classes are reported as ``0 .. nclasses-1`` (see ``classes_``).
    NOTE(review): the balanced weighting indexes the weight vector directly with ``y``,
    which presumably requires the training labels to be exactly those integers -- confirm.

    :param alpha: regularization strength handed to ``Ridge``
    :param class_weight: ``None`` or ``'balanced'``; the latter derives per-sample
        weights from ``compute_class_weight``
    :param normalize: forwarded to ``Ridge`` only when truthy
    """

    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize

    def fit(self, X, y, sample_weight=None):
        """Fit the underlying regressor on (X, y) and return ``self``.

        When ``class_weight == 'balanced'`` the given ``sample_weight`` is replaced by
        the balanced weight of each sample's class.
        """
        ridge_kwargs = {'alpha': self.alpha}
        if self.normalize:
            # only forwarded on request, so that the default configuration does not
            # depend on Ridge accepting a `normalize` argument
            ridge_kwargs['normalize'] = self.normalize
        self.regressor = Ridge(**ridge_kwargs)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Return the regression output rounded and clipped to a valid class index."""
        r = self.regressor.predict(X)
        c = np.clip(np.round(r), 0, self.nclasses - 1)
        # builtin int: the `np.int` alias is no longer available in current NumPy
        return c.astype(int)

    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist

    def decision_function(self, X):
        """Return, per sample and class, one minus the distance between the regression
        output and the class index (values are neither clipped nor normalized)."""
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'alpha': self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}

    def set_params(self, **params):
        """Update the given hyper-parameters and return ``self``.

        Only the keys that are present are updated, so a grid that tunes a subset of
        the parameters (e.g. just ``alpha`` and ``class_weight``) is accepted; other
        keys are ignored.
        """
        for name in ('alpha', 'class_weight', 'normalize'):
            if name in params:
                setattr(self, name, params[name])
        return self
|
|
||||||
|
|
||||||
|
|
||||||
# with order-aware classifiers
|
|
||||||
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
|
|
||||||
class LogisticAT(mord.LogisticAT):
    """``mord.LogisticAT`` extended with an optional ``class_weight='balanced'`` mode.

    :param alpha: passed to the parent constructor
    :param class_weight: ``None`` (no reweighting) or ``'balanced'``
    """

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the parent model; in balanced mode the sample weights are replaced by
        the balanced weight of each sample's class (the weight vector is indexed with
        ``y``)."""
        if self.class_weight == 'balanced':
            labels = sorted(np.unique(y))
            per_class = compute_class_weight('balanced', classes=labels, y=y)
            sample_weight = per_class[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
|
||||||
|
|
||||||
|
|
||||||
class LogisticSE(mord.LogisticSE):
    """``mord.LogisticSE`` extended with an optional ``class_weight='balanced'`` mode.

    :param alpha: passed to the parent constructor
    :param class_weight: ``None`` (no reweighting) or ``'balanced'``
    """

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the parent model; in balanced mode the sample weights are replaced by
        the balanced weight of each sample's class (the weight vector is indexed with
        ``y``)."""
        if self.class_weight == 'balanced':
            labels = sorted(np.unique(y))
            per_class = compute_class_weight('balanced', classes=labels, y=y)
            sample_weight = per_class[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
|
||||||
|
|
||||||
|
|
||||||
class LogisticIT(mord.LogisticIT):
    """``mord.LogisticIT`` extended with an optional ``class_weight='balanced'`` mode.

    :param alpha: passed to the parent constructor
    :param class_weight: ``None`` (no reweighting) or ``'balanced'``
    """

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the parent model; in balanced mode the sample weights are replaced by
        the balanced weight of each sample's class (the weight vector is indexed with
        ``y``)."""
        if self.class_weight == 'balanced':
            labels = sorted(np.unique(y))
            per_class = compute_class_weight('balanced', classes=labels, y=y)
            sample_weight = per_class[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
|
||||||
|
|
||||||
|
|
||||||
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
|
|
||||||
# class LAD(mord.LAD):
|
|
||||||
# def fit(self, X, y):
|
|
||||||
# self.classes_ = sorted(np.unique(y))
|
|
||||||
# return super().fit(X, y)
|
|
||||||
|
|
||||||
|
|
||||||
# class OrdinalRidge(mord.OrdinalRidge):
|
|
||||||
# def fit(self, X, y):
|
|
||||||
# self.classes_ = sorted(np.unique(y))
|
|
||||||
# return super().fit(X, y)
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from evaluation import nmd
|
from Ordinal.evaluation import nmd
|
||||||
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
|
from Ordinal.utils import load_samples_pkl
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
import pickle
|
import pickle
|
||||||
import os
|
import os
|
||||||
|
|
@ -9,39 +9,28 @@ from os.path import join
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
This script generates a partition of a dataset in terms of "shift".
|
|
||||||
The partition is only carried out by generating index vectors.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def partition_by_drift(split, training_prevalence):
|
def partition_by_drift(split, training_prevalence):
|
||||||
assert split in ['dev', 'test'], 'invalid split name'
|
assert split in ['dev', 'test'], 'invalid split name'
|
||||||
total=1000 if split=='dev' else 5000
|
total=1000 if split=='dev' else 5000
|
||||||
drifts = []
|
drifts = []
|
||||||
folderpath = join(datapath, domain, 'app', f'{split}_samples')
|
for sample in tqdm(load_samples_pkl(join(datapath, domain, 'app', f'{split}_samples')), total=total):
|
||||||
for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
|
|
||||||
drifts.append(nmd(training_prevalence, sample.prevalence()))
|
drifts.append(nmd(training_prevalence, sample.prevalence()))
|
||||||
drifts = np.asarray(drifts)
|
drifts = np.asarray(drifts)
|
||||||
order = np.argsort(drifts)
|
order = np.argsort(drifts)
|
||||||
nD = len(order)
|
nD = len(order)
|
||||||
low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
|
low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
|
||||||
all_drift = np.arange(nD)
|
|
||||||
np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
|
np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
|
||||||
np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
|
np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
|
||||||
np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
|
np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
|
||||||
np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift)
|
|
||||||
lows = drifts[low_drift]
|
lows = drifts[low_drift]
|
||||||
mids = drifts[mid_drift]
|
mids = drifts[mid_drift]
|
||||||
highs = drifts[high_drift]
|
highs = drifts[high_drift]
|
||||||
all = drifts[all_drift]
|
|
||||||
print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
|
print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
|
||||||
print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
|
print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
|
||||||
print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
|
print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
|
||||||
print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')
|
|
||||||
|
|
||||||
|
|
||||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
|
domain = 'Books-tfidf'
|
||||||
datapath = './data'
|
datapath = './data'
|
||||||
|
|
||||||
training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
|
training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
|
||||||
|
|
|
||||||
|
|
@ -1,41 +0,0 @@
|
||||||
import numpy as np
|
|
||||||
from Ordinal.evaluation import smoothness
|
|
||||||
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
|
|
||||||
from os.path import join
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
This script generates a partition of a dataset in terms of "smoothness".
|
|
||||||
The partition is only carried out by generating index vectors.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def partition_by_smoothness(split):
    """
    Partition the APP samples of a split into five blocks of increasing smoothness.

    Every pickled sample found in ``<datapath>/<domain>/app/<split>_samples`` is scored
    with ``smoothness`` applied to its prevalence vector. The sample indexes, sorted by
    that score, are split into five contiguous blocks; each block is saved as
    ``smooth<i>.<split>.id.npy`` and the full index range as ``all.<split>.id.npy``.
    Only index vectors are written; the samples themselves are left untouched.

    :param split: either ``'dev'`` or ``'test'``
    """
    assert split in ['dev', 'test'], 'invalid split name'

    # only used as the length hint of the progress bar
    expected = 1000 if split == 'dev' else 5000
    app_dir = join(datapath, domain, 'app')
    samples = load_samples_folder(join(app_dir, f'{split}_samples'), load_fn=load_single_sample_pkl)
    smooths = np.asarray([smoothness(sample.prevalence()) for sample in tqdm(samples, total=expected)])

    # sample ids sorted from the lowest to the highest smoothness value
    ranking = np.argsort(smooths)
    for i, smooth_idx in enumerate(np.array_split(ranking, 5)):
        block = smooths[smooth_idx]
        print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
        np.save(join(app_dir, f'smooth{i}.{split}.id.npy'), smooth_idx)

    np.save(join(app_dir, f'all.{split}.id.npy'), np.arange(len(ranking)))
|
|
||||||
|
|
||||||
|
|
||||||
#domain = 'Books-tfidf'
|
|
||||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
|
|
||||||
datapath = './data'
|
|
||||||
|
|
||||||
#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
|
|
||||||
|
|
||||||
partition_by_smoothness('dev')
|
|
||||||
partition_by_smoothness('test')
|
|
||||||
|
|
||||||
|
|
@ -1,20 +1,14 @@
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from Ordinal.utils import load_simple_sample_raw
|
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from os.path import join
|
from os.path import join
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
|
from utils import load_samples
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into tfidf vectors.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
datapath = './data'
|
datapath = './data'
|
||||||
domain = 'Books'
|
domain = 'Books'
|
||||||
outname = domain + '-tfidf'
|
outname = domain + '-tfidf'
|
||||||
|
|
@ -46,7 +40,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic
|
||||||
|
|
||||||
|
|
||||||
def transform_folder_samples(protocol, splitname):
|
def transform_folder_samples(protocol, splitname):
|
||||||
for i, sample in tqdm(enumerate(load_simple_sample_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
|
for i, sample in tqdm(enumerate(load_samples(join(datapath, domain, protocol, splitname), classes=train.classes_))):
|
||||||
sample.instances = tfidf.transform(sample.instances)
|
sample.instances = tfidf.transform(sample.instances)
|
||||||
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
|
@ -1,51 +0,0 @@
|
||||||
import quapy as qp
|
|
||||||
from quapy.data import LabelledCollection
|
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
||||||
from os.path import join
|
|
||||||
import os
|
|
||||||
import pickle
|
|
||||||
from utils import *
|
|
||||||
from tqdm import tqdm
|
|
||||||
import shutil
|
|
||||||
|
|
||||||
"""
|
|
||||||
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into dense vectors
|
|
||||||
extracted from a pretrained model (here we use the RoBERTa fine-tuned on the training set)
|
|
||||||
Three vector generation modes are available: posteriors, last, average
|
|
||||||
"""
|
|
||||||
|
|
||||||
vector_generation = 'posteriors'
|
|
||||||
|
|
||||||
datapath = './data'
|
|
||||||
domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
|
|
||||||
outname = domain.replace('-finetuned', '-finetuned-pkl')
|
|
||||||
|
|
||||||
protocol = 'app'
|
|
||||||
|
|
||||||
print('pickling npy txt files')
|
|
||||||
print('from:', join(datapath, domain))
|
|
||||||
print('to', join(datapath, outname))
|
|
||||||
print('for protocol:', protocol)
|
|
||||||
|
|
||||||
os.makedirs(join(datapath, outname), exist_ok=True)
|
|
||||||
os.makedirs(join(datapath, outname, protocol), exist_ok=True)
|
|
||||||
os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True)
|
|
||||||
os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
|
|
||||||
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
|
|
||||||
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))
|
|
||||||
|
|
||||||
train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
|
|
||||||
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
|
||||||
|
|
||||||
|
|
||||||
def transform_folder_samples(protocol, splitname):
    """
    Re-serialize every npy-txt sample of a split as an individual pickle file.

    Samples are read from ``<datapath>/<domain>/<protocol>/<splitname>`` and written,
    in iteration order, to ``<datapath>/<outname>/<protocol>/<splitname>/<i>.pkl``.

    :param protocol: name of the sampling protocol folder (the script uses ``'app'``)
    :param splitname: name of the samples folder (e.g. ``'dev_samples'``)
    """
    folder_dir = join(datapath, domain, protocol, splitname)
    out_dir = join(datapath, outname, protocol, splitname)
    samples = load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_)
    for i, sample in tqdm(enumerate(samples)):
        # close each output file explicitly instead of leaking one handle per sample
        with open(join(out_dir, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
|
|
||||||
|
|
||||||
|
|
||||||
transform_folder_samples(protocol, 'dev_samples')
|
|
||||||
transform_folder_samples(protocol, 'test_samples')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,374 +0,0 @@
|
||||||
import numpy as np
|
|
||||||
import itertools
|
|
||||||
from scipy.stats import ttest_ind_from_stats, wilcoxon
|
|
||||||
|
|
||||||
|
|
||||||
class Table:
    """
    Table of experimental results indexed by (benchmark, method) that can be rendered as LaTeX.

    Each cell stores the raw values added for one benchmark (row) and one method (column).
    Derived per-cell statistics (fill mask, mean, std, number of observations, rank, cell
    color and outcome of the significance test against the best method of the row) are
    recomputed lazily whenever the table has been modified.

    :param benchmarks: row names
    :param methods: column names
    :param lower_is_better: whether smaller means rank better
    :param significance_test: one of ``VALID_TESTS`` (``None`` disables the test)
    :param prec_mean: number of decimals used to print the mean
    :param clean_zero: if True, prints ``.123`` instead of `` 0.123``
    :param show_std: whether to print the standard deviation next to the mean
    :param prec_std: number of decimals used to print the standard deviation
    :param average: whether to maintain an additional one-row table with per-method averages
    :param missing: value returned by :meth:`get` for empty cells
    :param missing_str: string printed in LaTeX for empty cells
    :param color: whether to append a ``\\cellcolor`` command to each LaTeX cell
    :param show_rel_to: index of a reference column; when different from -1, cells of the
        other columns show their relative variation (in %) with respect to that column
    """

    VALID_TESTS = [None, "wilcoxon", "ttest"]

    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='wilcoxon', prec_mean=3,
                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                 color=True, show_rel_to=-1):
        assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'

        self.benchmarks = np.asarray(benchmarks)
        self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}

        self.methods = np.asarray(methods)
        self.method_index = {col: j for j, col in enumerate(methods)}

        self.map = {}
        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
        self._addmap('values', dtype=object)
        self.lower_is_better = lower_is_better
        self.ttest = significance_test
        self.prec_mean = prec_mean
        self.clean_zero = clean_zero
        self.show_std = show_std
        self.prec_std = prec_std
        self.add_average = average
        self.missing = missing
        self.missing_str = missing_str
        self.color = color
        self.show_rel_to = show_rel_to

        self.touch()

    @property
    def nbenchmarks(self):
        """Number of rows."""
        return len(self.benchmarks)

    @property
    def nmethods(self):
        """Number of columns."""
        return len(self.methods)

    def touch(self):
        """Mark the table as modified so that derived maps are recomputed on next access."""
        self._modif = True

    def update(self):
        """Recompute the derived maps if the table was modified since the last computation."""
        if self._modif:
            self.compute()

    def _getfilled(self):
        """Return the (row, col) coordinates of the non-empty cells."""
        return np.argwhere(self.map['fill'])

    @property
    def values(self):
        """The (#rows, #cols) object array holding the raw values of each cell."""
        return self.map['values']

    def _indexes(self):
        """Iterate over all (row, col) coordinates of the table."""
        return itertools.product(range(self.nbenchmarks), range(self.nmethods))

    def _addmap(self, map, dtype, func=None):
        """
        Create the (#rows, #cols) array ``self.map[map]`` and, if ``func`` is given, fill it
        with ``func(values)`` for each cell. The 'fill' map is evaluated on every cell; any
        other map is evaluated on filled cells only (the rest is left uninitialized).
        """
        self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
        if func is None:
            return
        m = self.map[map]
        indexes = self._indexes() if map == 'fill' else self._getfilled()
        for i, j in indexes:
            m[i, j] = func(self.values[i, j])

    def _addrank(self):
        """Assign, row by row, ranks 1..k to the k filled cells according to their mean."""
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
            if not self.lower_is_better:
                ranked_cols_idx = ranked_cols_idx[::-1]
            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)

    def _addcolor(self):
        """Assign, row by row, a red-to-green cell color from the min-max normalized mean."""
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if filled_cols_idx.size == 0:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            minval = min(col_means)
            maxval = max(col_means)
            norm = maxval - minval
            for col_idx in filled_cols_idx:
                val = self.map['mean'][i, col_idx]
                if norm > 0:
                    normval = (val - minval) / norm
                else:
                    # all means coincide: use the neutral tone
                    normval = 0.5
                if self.lower_is_better:
                    normval = 1 - normval
                self.map['color'][i, col_idx] = color_red2green_01(normval)

    def _run_ttest(self, row, col1, col2):
        """p-value of the t-test computed from the summary statistics of two cells of a row."""
        mean1 = self.map['mean'][row, col1]
        std1 = self.map['std'][row, col1]
        nobs1 = self.map['nobs'][row, col1]
        mean2 = self.map['mean'][row, col2]
        std2 = self.map['std'][row, col2]
        nobs2 = self.map['nobs'][row, col2]
        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
        return p_val

    def _run_wilcoxon(self, row, col1, col2):
        """p-value of the Wilcoxon signed-rank test on the raw values of two cells of a row."""
        values1 = self.map['values'][row, col1]
        values2 = self.map['values'][row, col2]
        _, p_val = wilcoxon(values1, values2)
        return p_val

    def _add_statistical_test(self):
        """
        Compare, row by row, every filled cell against the best cell of the row and store
        the interpretation of the p-value ('Diff', 'Sim' or 'Same') in ``self.map['ttest']``.
        """
        if self.ttest is None:
            return
        self.some_similar = [False] * self.nmethods
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if len(filled_cols_idx) <= 1:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            # the best method is the one ranked first, which depends on lower_is_better
            # (it used to be always the lowest mean, even when higher was better)
            select_best = np.argmin if self.lower_is_better else np.argmax
            best_pos = filled_cols_idx[select_best(col_means)]

            for j in filled_cols_idx:
                if j == best_pos:
                    continue
                if self.ttest == 'ttest':
                    p_val = self._run_ttest(i, best_pos, j)
                else:
                    p_val = self._run_wilcoxon(i, best_pos, j)

                pval_outcome = pval_interpretation(p_val)
                self.map['ttest'][i, j] = pval_outcome
                if pval_outcome != 'Diff':
                    self.some_similar[j] = True

    def compute(self):
        """Recompute every derived map (and the average table, if requested)."""
        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
        self._addmap('mean', dtype=float, func=np.mean)
        self._addmap('std', dtype=float, func=np.std)
        self._addmap('nobs', dtype=float, func=len)
        self._addmap('rank', dtype=int, func=None)
        self._addmap('color', dtype=object, func=None)
        self._addmap('ttest', dtype=object, func=None)
        self._addmap('latex', dtype=object, func=None)
        self._addrank()
        self._addcolor()
        self._add_statistical_test()
        if self.add_average:
            self._addave()
        self._modif = False

    def _is_column_full(self, col):
        """Whether the method ``col`` has values for every benchmark."""
        return all(self.map['fill'][:, self.method_index[col]])

    def _addave(self):
        """Build the one-row table of averages; only fully filled columns get a value."""
        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
                    show_std=self.show_std)
        for col in self.methods:
            values = None
            if self._is_column_full(col):
                if self.ttest == 'ttest':
                    # the t-test branch averages the per-benchmark means
                    values = np.asarray(self.map['mean'][:, self.method_index[col]])
                else:  # wilcoxon
                    # otherwise all the raw values of the column are pooled together
                    values = np.concatenate(self.values[:, self.method_index[col]])
            ave.add('ave', col, values)
        self.average = ave

    def add(self, benchmark, method, values):
        """
        Add values to a cell; values added to an already filled cell are appended.

        :param benchmark: row name
        :param method: column name
        :param values: a scalar, an array-like of values, or None (which adds nothing)
        """
        if values is not None:
            values = np.asarray(values)
            if values.ndim == 0:
                # turn a scalar into a 1-element array so that it can be concatenated
                values = values.flatten()
        rid, cid = self._coordinates(benchmark, method)
        if self.map['values'][rid, cid] is None:
            self.map['values'][rid, cid] = values
        elif values is not None:
            self.map['values'][rid, cid] = np.concatenate([self.map['values'][rid, cid], values])
        self.touch()

    def get(self, benchmark, method, attr='mean'):
        """
        Return the attribute ``attr`` (a key of ``self.map``) of a cell, or ``self.missing``
        if the cell is empty or the attribute is None or NaN.
        """
        self.update()
        assert attr in self.map, f'unknwon attribute {attr}'
        rid, cid = self._coordinates(benchmark, method)
        if not self.map['fill'][rid, cid]:
            return self.missing
        v = self.map[attr][rid, cid]
        if v is None or (isinstance(v, float) and np.isnan(v)):
            return self.missing
        return v

    def _coordinates(self, benchmark, method):
        """Translate (benchmark, method) names into (row, col) indexes."""
        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
        assert method in self.method_index, f'method {method} out of range'
        rid = self.benchmark_index[benchmark]
        cid = self.method_index[method]
        return rid, cid

    def get_average(self, method, attr='mean'):
        """Return the attribute of the average cell of a method, or None if averages are disabled."""
        self.update()
        if self.add_average:
            return self.average.get('ave', method, attr=attr)
        return None

    def get_color(self, benchmark, method):
        """Return the LaTeX color command of a cell, or '' if it has none."""
        color = self.get(benchmark, method, attr='color')
        if color is None:
            return ''
        return color

    def latexCell(self, benchmark, method):
        """
        Render one cell as LaTeX: the mean (in bold if it is the best of the row or not
        significantly different from it), optionally followed by the standard deviation or
        by the relative variation with respect to the reference column, and by the cell color.
        """
        self.update()
        i, j = self._coordinates(benchmark, method)
        if not self.map['fill'][i, j]:
            return self.missing_str

        mean = self.map['mean'][i, j]
        l = f" {mean:.{self.prec_mean}f}"
        if self.clean_zero:
            l = l.replace(' 0.', '.')

        isbest = self.map['rank'][i, j] == 1
        if self.ttest is not None:
            # results that are not significantly different from the best one are also in bold
            test_label = self.map['ttest'][i, j]
            if test_label in ['Sim', 'Same']:
                isbest = True

        if isbest:
            l = "\\textbf{" + l.strip() + "}\\;"
        else:
            l += '\\; '

        # no significance mark is printed, since similar results are already shown in bold
        stat = ''

        std = ''
        if self.show_std:
            std = self.map['std'][i, j]
            std = f" {std:.{self.prec_std}f}"
            if self.clean_zero:
                std = std.replace(' 0.', '.')
            std = f" \\pm {std:{self.prec_std}}"

        relto = ''
        if self.show_rel_to != -1 and j != self.show_rel_to:
            # the reference mean is only meaningful if the reference cell is filled, and the
            # relative variation is undefined for a zero reference
            if self.map['fill'][i, self.show_rel_to]:
                ref_ave = self.map['mean'][i, self.show_rel_to]
                if ref_ave != 0:
                    rel = 100 * (mean - ref_ave) / ref_ave
                    if abs(rel) < 0.1:
                        relto = '(\\approx)'
                    else:
                        plussign = '+' if rel > 0 else ''  # negative values already carry the '-' sign
                        relto = f'({plussign}{rel:.1f}\\%)'
            # NOTE(review): the original indentation of this statement was lost; it is assumed
            # to apply to every non-reference column (the relative variation replaces the std)
            std = ''

        if stat != '' or std != '' or relto != '':
            l = f'{l}${stat}{std}{relto}$'

        if self.color:
            cell_color = self.map['color'][i, j]
            # the color may be None (e.g., for a NaN mean), in which case nothing is appended
            if cell_color is not None:
                l += ' ' + cell_color

        return l

    def latexTabular(self, benchmark_replace=None, method_replace=None, average=True):
        """
        Render the table body (benchmarks as rows, methods as columns) as LaTeX.

        :param benchmark_replace: optional dict mapping benchmark names to the names to display
        :param method_replace: optional dict mapping method names to the names to display
        :param average: whether to append the average row (ignored if the table was created
            with ``average=False``)
        """
        benchmark_replace = {} if benchmark_replace is None else benchmark_replace
        method_replace = {} if method_replace is None else method_replace
        self.update()

        tab = ' & '
        tab += ' & '.join([method_replace.get(col, col) for col in self.methods])
        tab += ' \\\\\\hline\n'
        for row in self.benchmarks:
            rowname = benchmark_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRow(row)

        if average and self.add_average:
            tab += '\\hline\n'
            tab += 'Average & '
            tab += self.latexAverage()
        return tab

    def latexTabularT(self, benchmark_replace=None, method_replace=None, average=True, side=False):
        """
        Render the transposed table body (methods as rows, benchmarks as columns) as LaTeX.

        :param benchmark_replace: optional dict mapping benchmark names to the names to display
        :param method_replace: optional dict mapping method names to the names to display
        :param average: whether to append the average column (ignored if the table was created
            with ``average=False``)
        :param side: whether to wrap the column headers in a ``\\side{}`` command
        """
        benchmark_replace = {} if benchmark_replace is None else benchmark_replace
        method_replace = {} if method_replace is None else method_replace
        self.update()
        average = average and self.add_average

        def withside(label):
            return '\\side{' + label + '}' if side else label

        def center(label):
            return '\\multicolumn{1}{c}{' + label + '}'

        tab = ' & '
        tab += ' & '.join([center(withside(benchmark_replace.get(col, col))) for col in self.benchmarks])
        if average:
            tab += ' & ' + withside('Ave')
        tab += ' \\\\\\midrule\n'
        for row in self.methods:
            rowname = method_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRowT(row, endl='')
            if average:
                tab += ' & '
                tab += self.average.latexCell('ave', row)
            tab += '\\\\\n'
        tab += '\\bottomrule'
        return tab

    def latexRow(self, benchmark, endl='\\\\\\hline\n'):
        """Render the cells of a benchmark (one per method) joined by ' & ' and ended by ``endl``."""
        s = [self.latexCell(benchmark, col) for col in self.methods]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexRowT(self, method, endl='\\\\\\hline\n'):
        """Render the cells of a method (one per benchmark) joined by ' & ' and ended by ``endl``."""
        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexAverage(self, endl='\\\\\\hline\n'):
        """Render the average row, or return None if the table was created with ``average=False``."""
        self.update()
        if self.add_average:
            return self.average.latexRow('ave', endl=endl)
        return None

    def getRankTable(self):
        """Return a new table, with the same rows and columns, holding the rank of every filled cell."""
        # make sure the 'fill' and 'rank' maps exist and are up to date
        self.update()
        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=0, average=True)
        for rid, cid in self._getfilled():
            row = self.benchmarks[rid]
            col = self.methods[cid]
            t.add(row, col, self.get(row, col, 'rank'))
        t.compute()
        return t

    def dropMethods(self, methods):
        """Remove the given methods (columns) from the table."""
        drop_index = [self.method_index[m] for m in methods]
        new_methods = np.delete(self.methods, drop_index)
        new_index = {col: j for j, col in enumerate(new_methods)}

        # keep only the value columns of the surviving methods (looked up in the old index)
        self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
        self.methods = new_methods
        self.method_index = new_index
        self.touch()
|
|
||||||
|
|
||||||
|
|
||||||
def pval_interpretation(p_val):
    """
    Map a p-value onto a label describing how two results compare.

    :param p_val: the p-value of a statistical test
    :return: 'Diff' if p_val <= 0.005, 'Sim' if 0.005 < p_val <= 0.05, 'Same' if
        p_val > 0.05, and None if no comparison holds (e.g., for NaN)
    """
    if p_val <= 0.005:
        return 'Diff'
    if p_val <= 0.05:
        return 'Sim'
    if p_val > 0.05:
        return 'Same'
    # reached only when every comparison is False (NaN)
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def color_red2green_01(val, maxtone=50):
    """
    Translate a value in [0,1] into a LaTeX ``\\cellcolor`` command, from red (0) to green (1).

    :param val: a value in [0,1], or NaN
    :param maxtone: the color intensity used at both extremes (val=0 and val=1)
    :return: a string like ``\\cellcolor{green!25}``, or None if val is NaN
    """
    if np.isnan(val):
        return None
    assert 0 <= val <= 1, f'val {val} out of range [0,1]'

    # rescale to [-1,1]; the sign selects the color and the magnitude selects the tone
    val = val * 2 - 1
    if val < 0:
        color = 'red'
        tone = maxtone * (-val)
    else:
        color = 'green'
        tone = maxtone * val
    # the backslash is escaped explicitly ('\c' is an invalid escape sequence)
    return '\\cellcolor{' + color + f'!{int(tone)}' + '}'
|
|
||||||
|
|
@ -1,64 +1,22 @@
|
||||||
import numpy as np
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
from glob import glob
|
from glob import glob
|
||||||
from json import load
|
|
||||||
import os
|
import os
|
||||||
from os.path import join
|
from os.path import join
|
||||||
import pickle
|
import pickle
|
||||||
import pandas as pd
|
|
||||||
import csv
|
|
||||||
import datasets
|
|
||||||
from datasets import Dataset
|
|
||||||
import quapy as qp
|
|
||||||
from quapy.data import LabelledCollection
|
|
||||||
|
|
||||||
|
|
||||||
|
def load_samples(path_dir, classes):
|
||||||
def load_simple_sample_npytxt(parentdir, filename, classes=None):
|
nsamples = len(glob(join(path_dir, f'*.txt')))
|
||||||
samplepath = join(parentdir, filename+'.txt')
|
|
||||||
yX = np.loadtxt(samplepath)
|
|
||||||
X = yX[:,1:]
|
|
||||||
y = yX[:,0].astype(np.int32)
|
|
||||||
return LabelledCollection(instances=X, labels=y, classes_=classes)
|
|
||||||
|
|
||||||
|
|
||||||
def load_simple_sample_raw(parentdir, filename, classes=None):
    """
    Load the sample stored in ``<parentdir>/<filename>.txt`` as a LabelledCollection,
    reading the file with ``qp.data.reader.from_text``.

    :param parentdir: folder containing the sample
    :param filename: name of the file, without the '.txt' extension
    :param classes: passed on to ``LabelledCollection.load``
    :return: the value returned by ``LabelledCollection.load``
    """
    return LabelledCollection.load(
        join(parentdir, filename + '.txt'),
        loader_func=qp.data.reader.from_text,
        classes=classes,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def load_single_sample_as_csv(parentdir, filename):
    """
    Load the tab-separated sample ``<parentdir>/<filename>.txt`` (columns: labels, review).

    :param parentdir: folder containing the sample
    :param filename: name of the file, without the '.txt' extension
    :return: a tuple ``(sample, labels)``, where ``sample`` is a ``datasets.Dataset`` built
        from the 'review' column and ``labels`` is a one-column DataFrame with the labels
    """
    frame = pd.read_csv(
        join(parentdir, filename + '.txt'),
        sep='\t',
        names=['labels', 'review'],
        quoting=csv.QUOTE_NONE,
    )
    # pop removes the labels from the frame, so that only the reviews reach the Dataset
    labels = frame.pop('labels').to_frame()
    review_schema = datasets.Features({'review': datasets.Value('string')})
    return Dataset.from_pandas(df=frame, features=review_schema), labels
|
|
||||||
|
|
||||||
|
|
||||||
def load_single_sample_pkl(parentdir, filename):
    """
    Load the object pickled in ``<parentdir>/<filename>.pkl``.

    :param parentdir: folder containing the pickle file
    :param filename: name of the file, without the '.pkl' extension
    :return: the unpickled object
    """
    # NOTE: pickle can execute arbitrary code; only load files generated by trusted scripts
    with open(join(parentdir, filename + '.pkl'), 'rb') as fin:
        return pickle.load(fin)
|
|
||||||
|
|
||||||
|
|
||||||
# def load_samples_npytxt(path_dir, filter=None, classes=None):
|
|
||||||
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
|
|
||||||
|
|
||||||
|
|
||||||
# def load_samples_raw(path_dir, filter=None, classes=None):
|
|
||||||
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, load_fn_kwargs={'classes': classes})
|
|
||||||
|
|
||||||
|
|
||||||
# def load_samples_as_csv(path_dir, filter=None):
|
|
||||||
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)
|
|
||||||
|
|
||||||
|
|
||||||
# def load_samples_pkl(path_dir, filter=None):
|
|
||||||
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)
|
|
||||||
|
|
||||||
|
|
||||||
def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
|
|
||||||
nsamples = len(glob(join(path_dir, f'*')))
|
|
||||||
for id in range(nsamples):
|
for id in range(nsamples):
|
||||||
if (filter is None) or id in filter:
|
yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
|
||||||
yield load_fn(path_dir, f'{id}', **load_fn_kwargs)
|
|
||||||
|
|
||||||
|
def load_samples_pkl(path_dir, filter=None):
    """
    Lazily load the pickled samples ``0.pkl, 1.pkl, ...`` stored in a folder, in numeric order.

    The number of samples is taken from the number of '*.pkl' files found in the folder,
    which are therefore expected to be named with consecutive ids starting from 0.

    :param path_dir: folder containing the pickle files
    :param filter: optional container of sample ids; if given, only these ids are loaded
    :return: a generator yielding the unpickled samples
    """
    nsamples = len(glob(join(path_dir, '*.pkl')))
    for sample_id in range(nsamples):
        if filter is not None and sample_id not in filter:
            continue
        # close each file as soon as it is read instead of leaking one handle per sample
        with open(join(path_dir, f'{sample_id}.pkl'), 'rb') as fin:
            sample = pickle.load(fin)
        yield sample
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -183,7 +183,7 @@ def _training_helper(learner,
|
||||||
if not hasattr(learner, 'predict_proba'):
|
if not hasattr(learner, 'predict_proba'):
|
||||||
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
|
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
|
||||||
f'The learner will be calibrated.')
|
f'The learner will be calibrated.')
|
||||||
learner = CalibratedClassifierCV(learner, cv=5, ensemble=True)
|
learner = CalibratedClassifierCV(learner, cv=5)
|
||||||
if val_split is not None:
|
if val_split is not None:
|
||||||
if isinstance(val_split, float):
|
if isinstance(val_split, float):
|
||||||
if not (0 < val_split < 1):
|
if not (0 < val_split < 1):
|
||||||
|
|
@ -470,7 +470,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True):
|
def fit(self, data: LabelledCollection, fit_learner=True):
|
||||||
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
||||||
self.train_prevalence = F.prevalence_from_labels(data.labels, data.classes_)
|
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue