# QuaPy/Ordinal/main.py
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import numpy as np
from Ordinal.model import OrderedLogisticRegression, LogisticAT
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection
from os.path import join
import os
from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
from evaluation import nmd, mnmd
from time import time
import pickle
from tqdm import tqdm
import mord
def quantifiers():
    """Yield (name, quantifier, hyperparameter-grid) triples to be evaluated.

    The active configurations are the classic aggregative quantifiers (CC,
    PCC, ACC, PACC, SLD) built on a nominal LogisticRegression. Order-aware
    variants (threshold-based and regression-based ordinal regression, see
    https://pythonhosted.org/mord/) are kept below as commented-out options.
    """
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # params_SVR = {'C': np.logspace(0, 1, 2)}

    # baselines: order-unaware quantifiers on top of a standard LR classifier
    for tag, method in (('CC', CC), ('PCC', PCC), ('ACC', ACC), ('PACC', PACC)):
        yield f'{tag}(LR)', method(LogisticRegression()), params_LR
    #yield 'HDy(LR)', HDy(LogisticRegression()), params_LR
    yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    #yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    #yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    #yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    #yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
    #yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    # I am using my implementation, which caters for predict_proba (linear distance
    # to the two closest classes, 0 in the rest); the alternative implementations
    # OrdinalRidge(alpha=1.0) and LAD(C=1.0) (with wrapper classes exposing
    # nclasses_) implement neither predict_proba nor decision_score
    #yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
    #yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
    # yield 'PACC(SVR)', PACC(RegressorClassifier()), params_SVR
    #yield 'HDy(SVR)', HDy(RegressorClassifier()), params_SVR
    # yield 'SLD(SVR)', EMQ(RegressorClassifier()), params_SVR
def run_experiment(params):
    """Run one (quantifier, drift-regime) experiment end to end.

    params is a 4-tuple (qname, q, param_grid, drift):
      - qname: display name of the quantifier (posfix for the current
        representation is appended here)
      - q: the quantifier instance to tune
      - param_grid: hyperparameter grid for model selection
      - drift: drift-regime label used to pick the dev/test sample ids

    Relies on module-level globals set in __main__: posfix, resultpath,
    datapath, domain, protocol, load_sample_fn, train.

    Returns a tab-separated string 'qname<TAB>drift<TAB>best_params' with the
    selected hyperparameters, or None if the result file already exists
    (experiment skipped).
    """
    qname, q, param_grid, drift = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.{drift}.csv')
    # results are cached on disk: skip the experiment if already computed
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for {drift}-drift')

    def load_test_samples():
        # generator of (instances, prevalence) pairs for the test samples
        # whose ids belong to the current drift regime
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    def load_dev_samples():
        # same as load_test_samples but drawing from the dev split
        # (used as validation generator during model selection)
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    # hyperparameter search: minimizes mean NMD over the dev-sample generator
    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        verbose=True).fit(train)
    hyperparams = f'{qname}\t{drift}\t{q.best_params_}'
    print('[done]')

    # evaluate the selected model on the test generator and persist per-sample NMD
    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    # second pass: learn a regression-based correction on top of the best model
    # NOTE(review): RegressionQuantification is not among the imports visible in
    # this file — confirm it is imported elsewhere or this raises a NameError
    print('[learning regressor-based adjustment]')
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.{drift}.reg.csv')
    report.to_csv(resultfile, index=False)

    return hyperparams
if __name__ == '__main__':
    # Choose the document representation; each choice fixes the data folder
    # (domain) and the suffix (posfix) appended to result-file names.
    #preprocessing = 'roberta.last'
    preprocessing = 'roberta.average'
    # preprocessing = 'roberta.posteriors'
    #preprocessing = 'tfidf'

    if preprocessing=='tfidf':
        domain = 'Books-tfidf'
        posfix = ''
    elif preprocessing=='roberta.last':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
        posfix = '-RoBERTa-last'
    elif preprocessing=='roberta.average':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
        posfix = '-RoBERTa-average'
    elif preprocessing=='roberta.posteriors':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
        posfix = '-RoBERTa-posteriors'
    else:
        # without this guard a typo above would leave domain/posfix unbound
        # and surface later as an opaque NameError
        raise ValueError(f'unknown preprocessing scheme: {preprocessing}')

    # all domains above are stored as pickled samples
    # NOTE(review): load_simple_sample_npytxt is imported but unused — confirm
    # whether the 'tfidf' branch was meant to use it instead
    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    # run every (quantifier, drift-regime) combination in parallel and append
    # the selected hyperparameters of each completed run to hyper.txt
    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        #for drift in [f'smooth{i}' for i in range(5)] + ['all']:
        params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
        hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
        for h in hypers:
            if h is not None:  # None marks experiments skipped via result cache
                foo.write(h)
                foo.write('\n')