"""Quick experiment: error-by-drift plot for several quantifiers on one UCI dataset.

The script fits CC, ACC, HDy and HDx on every dataset in ``DATASETS``,
collects true/estimated prevalences, caches the results through
``qp.util.pickled_resource`` and finally plots the error-by-drift diagram
for a single selected dataset.
"""
from copy import deepcopy

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC  # only referenced by the commented-out alternative classifier

import quapy as qp
from method.non_aggregative import DMx
from protocol import APP
from quapy.method.aggregative import CC, ACC, DMy

qp.environ['SAMPLE_SIZE'] = 100
DATASETS = qp.datasets.UCI_DATASETS[10:]


def fit_eval_task(args):
    """Fit one quantifier on ``train`` and evaluate it on samples drawn from ``test``.

    Takes a single tuple so that it can be dispatched through
    ``qp.util.parallel``.

    :param args: tuple ``(model_name, model, train, test)``
    :return: tuple ``(model_name, true_prev, estim_prev)`` where the two
        prevalence values are whatever ``qp.evaluation.prediction`` returns
        for the 100 repeats of the ``APP`` protocol
    """
    model_name, model, train, test = args
    with qp.util.temp_seed(0):
        # work on a private copy so the model object handed in is never fitted
        model = deepcopy(model)
        model.fit(train)
        true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
        return model_name, true_prev, estim_prev


def gen_data():
    """Run every quantifier on every dataset in ``DATASETS``.

    :return: four parallel lists ``(method_names, true_prevs, estim_prevs,
        tr_prevs)`` with one entry per (dataset, method) pair, in the order
        the results come back from ``qp.util.parallel``
    """

    def base_classifier():
        return LogisticRegression()
        #return LinearSVC(class_weight='balanced')

    def models():
        # NOTE: the number of models yielded here must match the
        # ``num_methods`` argument passed to ``remove_dataset`` below.
        yield 'CC', CC(base_classifier())
        yield 'ACC', ACC(base_classifier())
        yield 'HDy', DMy(base_classifier(), val_split=10, nbins=10, n_jobs=-1)
        yield 'HDx', DMx(nbins=10, n_jobs=-1)

    # train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10).train_test
    method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []

    for dataset_name in DATASETS:
        train, test = qp.datasets.fetch_UCIDataset(dataset_name).train_test
        print(dataset_name, train.X.shape)

        outs = qp.util.parallel(
            fit_eval_task,
            ((method_name, model, train, test) for method_name, model in models()),
            seed=0,
            n_jobs=-1
        )

        # the training prevalence is the same for every method of this
        # dataset, so compute it once instead of once per result
        train_prev = train.prevalence()
        for method_name, true_prev, estim_prev in outs:
            method_names.append(method_name)
            true_prevs.append(true_prev)
            estim_prevs.append(estim_prev)
            tr_prevs.append(train_prev)

    return method_names, true_prevs, estim_prevs, tr_prevs


# NOTE(review): presumably pickled_resource loads the pickle when it already
# exists and otherwise calls gen_data and stores its result -- confirm
# against the QuaPy documentation.
method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('../quick_experiment/pickled_plot_data.pkl', gen_data)


def remove_dataset(dataset_order, num_methods=4):
    """Drop the results belonging to the datasets listed in ``dataset_order``.

    Reads the module-level result lists (``method_names``, ``true_prevs``,
    ``estim_prevs``, ``tr_prevs``), which hold ``num_methods`` consecutive
    entries per dataset.

    :param dataset_order: iterable with the positions (0-based, relative to
        the order in which datasets were processed) of the datasets to discard
    :param num_methods: number of consecutive entries stored per dataset
    :return: tuple of four numpy arrays ``(names, true, estim, tr)`` holding
        only the entries of the datasets that were kept
    """
    # a set makes the per-row membership test constant-time
    excluded = set(dataset_order)
    sel_names, sel_true, sel_estim, sel_tr = [], [], [], []
    for i, (name, true, estim, tr) in enumerate(zip(method_names, true_prevs, estim_prevs, tr_prevs)):
        dataset_pos = i // num_methods
        if dataset_pos not in excluded:
            sel_names.append(name)
            sel_true.append(true)
            sel_estim.append(estim)
            sel_tr.append(tr)
    return np.asarray(sel_names), np.asarray(sel_true), np.asarray(sel_estim), np.asarray(sel_tr)


print(DATASETS)
selected = 10
for i in [selected]:
    print(i, DATASETS[i])
    # keep only dataset i by removing every other position
    all_ = set(range(len(DATASETS)))
    remove_index = sorted(all_ - {i})
    sel_names, sel_true, sel_estim, sel_tr = remove_dataset(dataset_order=remove_index, num_methods=4)

    # second component of the first training-prevalence entry, shown as a
    # subscript in the CC label
    p = sel_tr[0][1]
    sel_names = ['CC$_{'+str(p)+'}$' if x=='CC' else x for x in sel_names]

    # qp.plot.binary_diagonal(sel_names, sel_true, sel_estim, train_prev=sel_tr[0], show_std=False, savepath=f'./plots/bin_diag_{i}.png')
    qp.plot.error_by_drift(sel_names, sel_true, sel_estim, sel_tr, n_bins=10, savepath=f'./plots/err_drift_{i}.png', show_std=True, show_density=False, title="")
    # qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
    # qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=3, savepath='./plots/bin_bias_bin.png')