cleaning

2024-04-05 17:28:11 +02:00 · 2024-04-05 17:28:11 +02:00 · 4c6a5e69f3
parent af0f1c7085
commit 4c6a5e69f3
3 changed files with 1 additions and 359 deletions
--- a/.gitignore
+++ b/.gitignore
@ -7,6 +7,7 @@ quavenv/*
 *__pycache__*
 htmlcov/*
 accuracy_prediction*.py
 test*.py
 selected_gs.py 
--- a/accuracy_prediction_via_quantification.py
+++ b/accuracy_prediction_via_quantification.py
@ -1,90 +0,0 @@
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import f1_score
 import quapy as qp
 from method.kdey import KDEyML, KDEyCS, KDEyHD
 from quapy.protocol import APP
 from quapy.method.aggregative import PACC, ACC, EMQ, PCC, CC, DMy
 # Experiment: predict the accuracy (or F1) of a classifier on shifted test samples by
 # combining a quantifier's prevalence estimate with misclassification rates measured
 # on a held-out validation split.
 datasets = qp.datasets.UCI_DATASETS
 # Metric being estimated: 'acc' selects accuracy; any other value (e.g. 'f1') selects F1.
 target = 'acc'
 # Absolute estimation errors, one per evaluated test sample (across all datasets).
 errors = []
 for dataset_name in datasets:
    # these datasets are left out of the experiment
    if dataset_name in ['balance.2', 'acute.a', 'acute.b', 'iris.1']:
        continue
    train, test = qp.datasets.fetch_UCIDataset(dataset_name).train_test
    print(f'dataset name = {dataset_name}')
    print(f'#train = {len(train)}')
    print(f'#test = {len(test)}')
    cls = LogisticRegression()
    train, val = train.split_stratified(random_state=0)
    cls.fit(*train.Xy)
    y_val = val.labels
    y_hat_val = cls.predict(val.instances)
    # The quantifier and the validation misclassification matrix depend only on the
    # train/val split, not on the test sample, so they are built once per dataset
    # instead of once per sample.
    q = EMQ(cls)
    q.fit(train, fit_classifier=False)
    M_hat = ACC.getPteCondEstim(train.classes_, y_val, y_hat_val)
    for sample in APP(test, n_prevalences=11, repeats=1, sample_size=100, return_type='labelled_collection')():
        print('='*80)
        y_hat = cls.predict(sample.instances)
        y = sample.labels
        # true value of the target metric on this sample
        if target == 'acc':
            acc = (y_hat==y).mean()
        else:
            acc = f1_score(y, y_hat, zero_division=0)
        # same matrix computed from the true sample labels; only printed for comparison
        M_true = ACC.getPteCondEstim(train.classes_, y, y_hat)
        p_hat = q.quantify(sample.instances)
        # scale the validation rates by the estimated prevalence to obtain an estimated
        # contingency table (presumably columns index the true class -- TODO confirm
        # against ACC.getPteCondEstim)
        cont_table_hat = p_hat * M_hat
        tp = cont_table_hat[1,1]
        tn = cont_table_hat[0,0]
        fn = cont_table_hat[0,1]
        fp = cont_table_hat[1,0]
        # estimated value of the target metric, derived from the estimated table
        if target == 'acc':
            acc_hat = (tp+tn)
        else:
            den = (2*tp + fn + fp)
            if den > 0:
                acc_hat = 2*tp / den
            else:
                acc_hat = 0
        error = abs(acc - acc_hat)
        errors.append(error)
        print('true_prev: ', sample.prevalence())
        print('estim_prev: ', p_hat)
        print('M-true:\n', M_true)
        print('M-hat:\n', M_hat)
        print('cont_table:\n', cont_table_hat)
        print(f'classifier accuracy={acc:.3f}')
        print(f'estimated accuracy={acc_hat:.3f}')
        print(f'estimation error={error:.4f}')
 print('process end')
 print('='*80)
 print(f'mean error = {np.mean(errors)}')
 print(f'std error = {np.std(errors)}')
--- a/accuracy_prediction_via_quantification2.py
+++ b/accuracy_prediction_via_quantification2.py
@ -1,269 +0,0 @@
 import numpy as np
 import scipy.special
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import f1_score
 import quapy as qp
 from quapy.protocol import APP
 from quapy.method.aggregative import PACC, ACC, EMQ, PCC, CC, DMy, T50, MS2, KDEyML, KDEyCS, KDEyHD
 from sklearn import clone
 import quapy.functional as F
 # Datasets the experiment loop iterates over (alternative: all UCI datasets known to quapy).
 # datasets = qp.datasets.UCI_DATASETS
 datasets = ['imdb']
 # Metric being estimated: 'acc' selects accuracy; any other value (e.g. 'f1') selects F1.
 # target = 'f1'
 target = 'acc'
 # Absolute estimation errors, one appended per evaluated test sample.
 errors = []
 def method_1(cls, train, val, sample, y=None, y_hat=None):
    """
    Converts a misclassification matrix computed in validation (i.e., in the train distribution P) into
    the corresponding equivalent misclassification matrix in test (i.e., in the test distribution Q)
    by relying on the PPS assumptions. The test prevalence is estimated with an EMQ quantifier built
    on top of the (already trained) classifier.
    :param cls: trained classifier; it is not refitted here
    :param train: labelled collection the quantifier is fitted on
    :param val: labelled collection used to estimate the misclassification matrix
    :param sample: labelled collection drawn from the test distribution
    :param y: (optional) true labels of the sample; used only to print the true matrix
    :param y_hat: (optional) predicted labels of the sample; used only to print the true matrix
    :return: tuple (tn, fn, fp, tp,) of floats in [0,1] summing up to 1
    """
    y_val = val.labels
    y_hat_val = cls.predict(val.instances)
    q = EMQ(cls)
    q.fit(train, fit_classifier=False)
    M_hat = ACC.getPteCondEstim(train.classes_, y_val, y_hat_val)
    # The true matrix is a diagnostic only; skip it when the caller does not supply the
    # sample labels and predictions (both default to None) instead of crashing.
    if y is not None and y_hat is not None:
        M_true = ACC.getPteCondEstim(train.classes_, y, y_hat)
    else:
        M_true = None
    p_hat = q.quantify(sample.instances)
    # scale the validation rates by the estimated test prevalence
    cont_table_hat = p_hat * M_hat
    print('true_prev: ', sample.prevalence())
    print('estim_prev: ', p_hat)
    if M_true is not None:
        print('M-true:\n', M_true)
    print('M-hat:\n', M_hat)
    print('cont_table:\n', cont_table_hat)
    print('cont_table Sum :\n', cont_table_hat.sum())
    tp = cont_table_hat[1, 1]
    tn = cont_table_hat[0, 0]
    fn = cont_table_hat[0, 1]
    fp = cont_table_hat[1, 0]
    return tn, fn, fp, tp
 def method_2(cls, train, val, sample, y=None, y_hat=None):
    """
    Estimates the contingency table of the classifier in the test distribution Q (P being the
    training distribution) as the solution of a 4x4 linear system in the unknowns (tn, fn, fp, tp):
    tp + fp = CC (the classify & count estimate, observed)
    fn + tp = Q(Y=1) (not observed; estimated via quantification, here with ACC)
    tp + fp + fn + tn = 1 (trivial)
    The fourth equation comes from the PPS assumption P(X|Y)=Q(X|Y), which implies
    P(hatY|Y)=Q(hatY|Y) for any measurable hatY. Hence the tpr estimated in validation
    should equal the tpr in Q, i.e., tpr = tp / (tp + fn), which rearranges to:
    tp (tpr-1) + fn (tpr) = 0
    The tpr estimate is likely more reliable the more positives the validation set has; when
    positives are not the majority, the analogous equation on the true negative rate is used:
    tn (tnr-1) + fp (tnr) = 0
    The true contingency table (from y and y_hat) is computed and printed for comparison.
    :return: tuple (tn, fn, fp, tp,) of floats in [0,1] summing up to 1
    """
    val_labels = val.labels
    val_predictions = cls.predict(val.instances)
    # estimated positive prevalence in the sample (right-hand side of the 2nd equation)
    quantifier = ACC(cls)
    quantifier.fit(train, val_split=val, fit_classifier=False)
    p_hat = quantifier.quantify(sample.instances)
    pos_prev = p_hat[1]
    # fraction of the sample predicted positive (right-hand side of the 1st equation)
    counter = CC(cls)
    counter.fit(train, fit_classifier=False)
    cc_prev = counter.quantify(sample.instances)[1]
    M_hat = ACC.getPteCondEstim(train.classes_, val_labels, val_predictions)
    M_true = ACC.getPteCondEstim(train.classes_, y, y_hat)
    cont_table_true = sample.prevalence() * M_true
    # pick the rate-based equation according to which class dominates the validation set
    if val.prevalence()[1] > 0.5:
        tpr_hat = M_hat[1, 1]
        rate_equation = [0, tpr_hat, 0, tpr_hat - 1]
    else:
        tnr_hat = M_hat[0, 0]
        rate_equation = [tnr_hat-1, 0, tnr_hat, 0]
    # columns follow the order of the unknowns: tn, fn, fp, tp
    A = np.asarray([
        [0, 0, 1, 1],
        [0, 1, 0, 1],
        [1, 1, 1, 1],
        rate_equation
    ])
    b = np.asarray([cc_prev, pos_prev, 1, 0])
    tn, fn, fp, tp = np.linalg.solve(A, b)
    cont_table_estim = np.asarray([[tn, fn], [fp, tp]])
    print('true_prev: ', sample.prevalence())
    print('estim_prev: ', p_hat)
    print('true_cont_table:\n', cont_table_true)
    print('estim_cont_table:\n', cont_table_estim)
    return tn, fn, fp, tp
 def method_3(cls, train, val, sample, y=None, y_hat=None):
    """
    This is just method 2 but without involving any quapy's quantifier: the positive prevalence of
    the sample is obtained by adjusting the classify & count estimate with the tpr and fpr measured
    in validation, and the contingency table is then the solution of the same 4x4 linear system
    in the unknowns (tn, fn, fp, tp).
    :param cls: trained classifier; it is not refitted here
    :param train: unused; kept so that all methods share the same signature
    :param val: labelled collection used to estimate tpr, fpr and tnr
    :param sample: labelled collection drawn from the test distribution
    :param y: unused; kept so that all methods share the same signature
    :param y_hat: unused; kept so that all methods share the same signature
    :return: tuple (tn, fn, fp, tp,) of floats in [0,1] summing up to 1
    """
    classes = val.classes_
    y_val = val.labels
    y_hat_val = cls.predict(val.instances)
    M_hat = ACC.getPteCondEstim(classes, y_val, y_hat_val)
    y_hat_test = cls.predict(sample.instances)
    # fraction of the sample predicted positive (classify & count)
    pos_prev_cc = F.prevalence_from_labels(y_hat_test, classes)[1]
    tpr_hat = M_hat[1,1]
    fpr_hat = M_hat[1,0]
    tnr_hat = M_hat[0,0]
    rate_gap = tpr_hat - fpr_hat
    if rate_gap == 0:
        # tpr == fpr: the adjustment is undefined (division by zero would yield inf/nan and
        # poison the solved table), so fall back to the unadjusted classify & count estimate
        pos_prev_test_hat = pos_prev_cc
    else:
        pos_prev_test_hat = (pos_prev_cc - fpr_hat) / rate_gap
    # the adjusted estimate may fall outside [0,1]
    pos_prev_test_hat = np.clip(pos_prev_test_hat, 0, 1)
    pos_prev_val = val.prevalence()[1]
    if pos_prev_val > 0.5:
        # in this case, the tpr might be a more reliable estimate than tnr
        A = np.asarray([
            [0, 0, 1, 1],
            [0, 1, 0, 1],
            [1, 1, 1, 1],
            [0, tpr_hat, 0, tpr_hat - 1]
        ])
    else:
        # in this case, the tnr might be a more reliable estimate than tpr
        A = np.asarray([
            [0, 0, 1, 1],
            [0, 1, 0, 1],
            [1, 1, 1, 1],
            [tnr_hat-1, 0, tnr_hat, 0]
        ])
    # right-hand sides of: tp+fp=CC, fn+tp=Q(Y=1), sum=1, rate equation
    b = np.asarray(
        [pos_prev_cc, pos_prev_test_hat, 1, 0]
    )
    tn, fn, fp, tp = np.linalg.solve(A, b)
    return tn, fn, fp, tp
 def cls_eval_from_counters(tn, fn, fp, tp):
    """
    Computes the target metric from the four cells of a contingency table.
    Accuracy (tp + tn) is returned when the module-level `target` is 'acc'; otherwise F1,
    i.e., 2*tp / (2*tp + fn + fp), which is taken to be 0 when the denominator is not positive.
    :return: the value of the metric selected by `target`
    """
    if target != 'acc':
        f1_den = 2 * tp + fn + fp
        return 2 * tp / f1_den if f1_den > 0 else 0
    return tp + tn
 def cls_eval_from_labels(y, y_hat):
    """
    Computes the target metric from true and predicted labels.
    The proportion of matching labels is returned when the module-level `target` is 'acc';
    otherwise the F1 score as given by sklearn's f1_score (with zero_division=0).
    :return: the value of the metric selected by `target`
    """
    if target != 'acc':
        return f1_score(y, y_hat, zero_division=0)
    return (y_hat == y).mean()
 # Experiment loop: for every dataset, training sets are drawn at varying prevalence values; a
 # classifier is trained on half of each, and the other half is used as validation. The target
 # metric is then estimated on test samples and compared against its true value.
 for dataset_name in datasets:
    train_orig, test = qp.datasets.fetch_reviews(dataset_name, tfidf=True, min_df=10).train_test
    train_prot = APP(train_orig, n_prevalences=11, repeats=1, return_type='labelled_collection', random_state=0, sample_size=10000)
    for train in train_prot():
        # np.prod replaces np.product, a deprecated alias that was removed in NumPy 2.0
        if np.prod(train.prevalence()) == 0:
            # skip experiments with no positives or no negatives in training
            continue
        cls = LogisticRegression(class_weight='balanced')
        train, val = train.split_stratified(train_prop=0.5, random_state=0)
        print(f'dataset name = {dataset_name}')
        print(f'#train = {len(train)}, prev={F.strprev(train.prevalence())}')
        print(f'#val = {len(val)}, prev={F.strprev(val.prevalence())}')
        print(f'#test = {len(test)}, prev={F.strprev(test.prevalence())}')
        cls.fit(*train.Xy)
        for sample in APP(test, n_prevalences=21, repeats=10, sample_size=1000, return_type='labelled_collection')():
            print('='*80)
            y_hat = cls.predict(sample.instances)
            y = sample.labels
            # true value of the metric vs. the value derived from the estimated table
            acc_true = cls_eval_from_labels(y, y_hat)
            tn, fn, fp, tp = method_3(cls, train, val, sample, y, y_hat)
            acc_hat = cls_eval_from_counters(tn, fn, fp, tp)
            error = abs(acc_true - acc_hat)
            errors.append(error)
            print(f'classifier accuracy={acc_true:.3f}')
            print(f'estimated accuracy={acc_hat:.3f}')
            print(f'estimation error={error:.4f}')
 print('process end')
 print('='*80)
 print(f'mean error = {np.mean(errors)}')
 print(f'std error = {np.std(errors)}')