"""Run quantification experiments on the UCI datasets shipped with QuaPy.

For every combination of optimisation loss, UCI dataset and quantifier yielded
by :func:`quantification_models`, the script runs a 5-fold cross-validation,
optionally tunes the quantifier with ``GridSearchQ``, evaluates it on samples
drawn from the test fold with the ``APP`` protocol, and pickles the outcome of
each fold into the results directory given on the command line.
"""
from copy import deepcopy
import quapy as qp
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from quapy.classification.methods import LowRankLogisticRegression
from quapy.method.meta import QuaNet
from quapy.protocol import APP
from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE, T50, X
from quapy.method.meta import EHDy
import numpy as np
import os
import pickle
import itertools
import argparse
import torch
import shutil


N_JOBS = -1
CUDA_N_JOBS = 2
ENSEMBLE_N_JOBS = -1

qp.environ['SAMPLE_SIZE'] = 100


def newLR():
    """Return a fresh, unfitted logistic regression classifier."""
    return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)


def calibratedLR():
    """Return a fresh logistic regression wrapped in ``CalibratedClassifierCV``."""
    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))


# Hyperparameter grids explored during model selection.
__C_range = np.logspace(-3, 3, 7)
lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
svmperf_params = {'classifier__C': __C_range}


def quantification_models():
    """Yield ``(name, quantifier, param_grid)`` triples for every method under test."""
    yield 'acc', ACC(newLR()), lr_params
    yield 'T50', T50(newLR()), lr_params
    yield 'X', X(newLR()), lr_params
    yield 'MAX', MAX(newLR()), lr_params
    yield 'MS', MS(newLR()), lr_params
    yield 'MS2', MS2(newLR()), lr_params


def evaluate_experiment(true_prevalences, estim_prevalences):
    """Print the MAE and MRAE between true and estimated prevalences."""
    print('\nEvaluation Metrics:\n' + '=' * 22)
    for eval_measure in [qp.error.mae, qp.error.mrae]:
        err = eval_measure(true_prevalences, estim_prevalences)
        print(f'\t{eval_measure.__name__}={err:.4f}')
    print()


def result_path(path, dataset_name, model_name, run, optim_loss):
    """Build the pickle file path that stores the result of one fold."""
    return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl')


def is_already_computed(dataset_name, model_name, run, optim_loss):
    """Tell whether the result file of this fold already exists.

    Reads the results directory from the module-level ``args`` namespace,
    which is only defined when the file is executed as a script.
    """
    return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss))


def save_results(dataset_name, model_name, run, optim_loss, *results):
    """Pickle ``results`` (as a tuple) into the result file of this fold.

    Reads the results directory from the module-level ``args`` namespace and
    creates the parent directory of the result file beforehand.
    """
    rpath = result_path(args.results, dataset_name, model_name, run, optim_loss)
    qp.util.create_parent_dir(rpath)
    with open(rpath, 'wb') as foo:
        pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)


def run(experiment):
    """Run the 5-fold experiment described by ``experiment``.

    :param experiment: a tuple ``(optim_loss, dataset_name, (model_name, model,
        hyperparams))``; ``hyperparams`` may be ``None`` to skip model selection.

    Datasets 'acute.a', 'acute.b' and 'iris.1' are skipped. Folds whose result
    file already exists are not recomputed.
    """
    optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
    if dataset_name in ['acute.a', 'acute.b', 'iris.1']:
        return

    collection = qp.datasets.fetch_UCILabelledCollection(dataset_name)
    # The loop index is called `fold` so that it does not shadow this function's name.
    for fold, data in enumerate(qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1)):
        if is_already_computed(dataset_name, model_name, run=fold, optim_loss=optim_loss):
            print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} run={fold+1}/5 already computed.')
            continue

        print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={fold+1}/5')

        # model selection (hyperparameter optimization for a quantification-oriented loss)
        train, test = data.train_test
        train, val = train.split_stratified()
        if hyperparams is not None:
            model_selection = qp.model_selection.GridSearchQ(
                deepcopy(model),
                param_grid=hyperparams,
                protocol=APP(val, n_prevalences=21, repeats=25),
                error=optim_loss,
                refit=True,
                timeout=60*60,
                verbose=True
            )
            # Fit on `train` only: `val` was carved out of the training fold, so
            # fitting on `data.training` would score each configuration on data
            # it has already been trained on.
            # NOTE(review): this relies on refit=True re-adding the validation
            # split when the best model is retrained — confirm against the
            # installed QuaPy version.
            model_selection.fit(train)
            model = model_selection.best_model()
            best_params = model_selection.best_params_
        else:
            # No model selection takes place, so the whole training fold is used.
            model.fit(data.training)
            best_params = {}

        # model evaluation
        true_prevalences, estim_prevalences = qp.evaluation.prediction(
            model,
            protocol=APP(test, n_prevalences=21, repeats=100)
        )
        test_true_prevalence = data.test.prevalence()

        evaluate_experiment(true_prevalences, estim_prevalences)
        save_results(dataset_name, model_name, fold, optim_loss,
                     true_prevalences, estim_prevalences,
                     data.training.prevalence(), test_true_prevalence,
                     best_params)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run quantification experiments on the UCI datasets')
    parser.add_argument('--results', metavar='RESULT_PATH', type=str, default='results_tmp',
                        help='path to the directory where to store the results')
    parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
                        help='path to the directory with svmperf')
    # Optional: the final clean-up below used to read this attribute although
    # it was never declared, which raised AttributeError once all runs finished.
    parser.add_argument('--checkpointdir', metavar='PATH', type=str, default=None,
                        help='path to a checkpoint directory to remove once all experiments are done')
    args = parser.parse_args()

    print(f'Result folder: {args.results}')
    np.random.seed(0)

    qp.environ['SVMPERF_HOME'] = args.svmperfpath

    optim_losses = ['mae']
    datasets = qp.datasets.UCI_DATASETS

    models = quantification_models()
    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)

    # Only clean up when a checkpoint directory was explicitly requested.
    if args.checkpointdir is not None:
        shutil.rmtree(args.checkpointdir, ignore_errors=True)