import logging
from logging.handlers import QueueHandler
from multiprocessing import Manager, Queue
from threading import Thread
from time import sleep, time

import numpy as np
import scipy.sparse as sp
from joblib import Parallel, delayed
from quapy.protocol import APP
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score

from baselines.mandoline import estimate_performance
from quacc.dataset import Dataset
from quacc.logger import logger, logger_manager, setup_logger, setup_worker_logger


def test_lr():
    # Fit the base classifier on the rcv1 CCAT binary task.
    d = Dataset(name="rcv1", target="CCAT", n_prevalences=1).get_raw()

    classifier = LogisticRegression()
    classifier.fit(*d.train.Xy)

    # Use half of the validation split to train a regressor that predicts,
    # from the features concatenated with the posterior probabilities,
    # the probability the classifier assigns to the true class.
    val, _ = d.validation.split_stratified(0.5, random_state=0)
    val_X, val_y = val.X, val.y
    val_probs = classifier.predict_proba(val_X)

    reg_X = sp.hstack([val_X, val_probs])
    reg_y = val_probs[np.arange(val_probs.shape[0]), val_y]
    reg = LinearRegression()
    reg.fit(reg_X, reg_y)

    # Apply the regressor to (a subset of) the test set.
    _test_num = 10000
    test_X = d.test.X[:_test_num, :]
    test_probs = classifier.predict_proba(test_X)
    test_reg_X = sp.hstack([test_X, test_probs])
    reg_pred = reg.predict(test_reg_X)

    def threshold(pred):
        # Fraction of test points where thresholding the regressor's output
        # at `pred` agrees with the classifier's correctness.
        # return np.mean(
        #     (reg.predict(test_reg_X) >= pred)
        #     == (
        #         test_probs[np.arange(_test_num), d.test.y[:_test_num]]
        #         == np.max(test_probs, axis=1)
        #     )
        # )
        return np.mean(
            (reg.predict(test_reg_X) >= pred)
            == (np.argmax(test_probs, axis=1) == d.test.y[:_test_num])
        )

    # Search the regressor outputs for the threshold that best matches the
    # classifier's correctness, then recenter the predictions around 0.5.
    max_p, max_acc = 0, 0
    for p in reg_pred:
        acc = threshold(p)
        if acc > max_acc:
            max_acc = acc
            max_p = p

    print(f"{max_p = }, {max_acc = }")
    reg_pred = reg_pred - max_p + 0.5

    print(reg_pred)
    print(np.mean(reg_pred >= 0.5))
    print(np.mean(np.argmax(test_probs, axis=1) == d.test.y[:_test_num]))


def entropy(probas):
    # Shannon entropy of each row of posterior probabilities.
    return -np.sum(np.multiply(probas, np.log(probas + 1e-20)), axis=1)


def get_slices(probas):
    ln, ncl = probas.shape
    preds = np.argmax(probas, axis=1)
    pred_slices = np.full((ln, ncl), fill_value=-1, dtype="