from time import time import numpy as np import scipy.sparse as sp from quapy.protocol import APP from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.metrics import accuracy_score from baselines.mandoline import estimate_performance from quacc.dataset import Dataset def test_lr(): d = Dataset(name="rcv1", target="CCAT", n_prevalences=1).get_raw() classifier = LogisticRegression() classifier.fit(*d.train.Xy) val, _ = d.validation.split_stratified(0.5, random_state=0) val_X, val_y = val.X, val.y val_probs = classifier.predict_proba(val_X) reg_X = sp.hstack([val_X, val_probs]) reg_y = val_probs[np.arange(val_probs.shape[0]), val_y] reg = LinearRegression() reg.fit(reg_X, reg_y) _test_num = 10000 test_X = d.test.X[:_test_num, :] test_probs = classifier.predict_proba(test_X) test_reg_X = sp.hstack([test_X, test_probs]) reg_pred = reg.predict(test_reg_X) def threshold(pred): # return np.mean( # (reg.predict(test_reg_X) >= pred) # == ( # test_probs[np.arange(_test_num), d.test.y[:_test_num]] == np.max(test_probs, axis=1) # ) # ) return np.mean( (reg.predict(test_reg_X) >= pred) == (np.argmax(test_probs, axis=1) == d.test.y[:_test_num]) ) max_p, max_acc = 0, 0 for p in reg_pred: acc = threshold(p) if acc > max_acc: max_acc = acc max_p = p print(f"{max_p = }, {max_acc = }") reg_pred = reg_pred - max_p + 0.5 print(reg_pred) print(np.mean(reg_pred >= 0.5)) print(np.mean(np.argmax(test_probs, axis=1) == d.test.y[:_test_num])) def entropy(probas): return -np.sum(np.multiply(probas, np.log(probas + 1e-20)), axis=1) def get_slices(probas): ln, ncl = probas.shape preds = np.argmax(probas, axis=1) pred_slices = np.full((ln, ncl), fill_value=-1, dtype="