update

2024-01-30 13:56:17 +01:00 · 2024-01-30 13:56:17 +01:00 · 2d8d4c3c68
parent 6bf2fb9e1b
commit 2d8d4c3c68
26 changed files with 11884 additions and 622 deletions
--- a/accuracy_prediction_via_quantification.py
+++ b/accuracy_prediction_via_quantification.py
@ -0,0 +1,90 @@
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import f1_score
 import quapy as qp
 from method.kdey import KDEyML, KDEyCS, KDEyHD
 from quapy.protocol import APP
 from quapy.method.aggregative import PACC, ACC, EMQ, PCC, CC, DMy
 datasets = qp.datasets.UCI_DATASETS
 # Measure to estimate: 'acc' selects accuracy, any other value selects F1.
 # target = 'f1'
 target = 'acc'
 # Absolute estimation errors, one per evaluated test sample, pooled over all datasets.
 errors = []
 # dataset_name = datasets[-2]
 for dataset_name in datasets:
    # these datasets are excluded from the experiment
    if dataset_name in ['balance.2', 'acute.a', 'acute.b', 'iris.1']:
        continue
    train, test = qp.datasets.fetch_UCIDataset(dataset_name).train_test
    print(f'dataset name = {dataset_name}')
    print(f'#train = {len(train)}')
    print(f'#test = {len(test)}')
    cls = LogisticRegression()
    # from here on `train` is the reduced training split; `val` is held out
    train, val = train.split_stratified(random_state=0)
    cls.fit(*train.Xy)
    y_val = val.labels
    y_hat_val = cls.predict(val.instances)
    # The quantifier and the validation misclassification matrix depend only on
    # train/val, so they are built once per dataset instead of once per test sample.
    q = EMQ(cls)
    q.fit(train, fit_classifier=False)
    # alternative: q.fit(train, val_split=val, fit_classifier=False)
    M_hat = ACC.getPteCondEstim(train.classes_, y_val, y_hat_val)
    for sample in APP(test, n_prevalences=11, repeats=1, sample_size=100, return_type='labelled_collection')():
        print('='*80)
        y_hat = cls.predict(sample.instances)
        y = sample.labels
        # true score of the classifier on this sample
        if target == 'acc':
            acc = (y_hat==y).mean()
        else:
            acc = f1_score(y, y_hat, zero_division=0)
        # matrix computed with the true test labels; only printed, for comparison with M_hat
        M_true = ACC.getPteCondEstim(train.classes_, y, y_hat)
        p_hat = q.quantify(sample.instances)
        # broadcasting multiplies column j of M_hat by the estimated prevalence p_hat[j]
        cont_table_hat = p_hat * M_hat
        # the table is read with rows = predicted label and columns = true label
        tp = cont_table_hat[1,1]
        tn = cont_table_hat[0,0]
        fn = cont_table_hat[0,1]
        fp = cont_table_hat[1,0]
        # estimated score, derived from the estimated contingency table
        if target == 'acc':
            acc_hat = (tp+tn)
        else:
            den = (2*tp + fn + fp)
            if den > 0:
                acc_hat = 2*tp / den
            else:
                acc_hat = 0
        error = abs(acc - acc_hat)
        errors.append(error)
        print('true_prev: ', sample.prevalence())
        print('estim_prev: ', p_hat)
        print('M-true:\n', M_true)
        print('M-hat:\n', M_hat)
        print('cont_table:\n', cont_table_hat)
        print(f'classifier accuracy={acc:.3f}')
        print(f'estimated accuracy={acc_hat:.3f}')
        print(f'estimation error={error:.4f}')
 print('process end')
 print('='*80)
 print(f'mean error = {np.mean(errors)}')
 print(f'std error = {np.std(errors)}')
--- a/accuracy_prediction_via_quantification2.py
+++ b/accuracy_prediction_via_quantification2.py
@ -0,0 +1,269 @@
 import numpy as np
 import scipy.special
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import f1_score
 import quapy as qp
 from quapy.protocol import APP
 from quapy.method.aggregative import PACC, ACC, EMQ, PCC, CC, DMy, T50, MS2, KDEyML, KDEyCS, KDEyHD
 from sklearn import clone
 import quapy.functional as F
 # datasets = qp.datasets.UCI_DATASETS
 # Names of the datasets to evaluate; each one is loaded with qp.datasets.fetch_reviews
 # in the main loop at the bottom of this script.
 datasets = ['imdb']
 # Measure to estimate: 'acc' selects accuracy, any other value selects F1
 # (see cls_eval_from_counters and cls_eval_from_labels).
 # target = 'f1'
 target = 'acc'
 # Absolute differences between the true and the estimated score, one per test sample.
 errors = []
 def method_1(cls, train, val, sample, y=None, y_hat=None):
    """
    Converts a misclassification matrix computed in validation (i.e., in the train distribution P) into
    the corresponding equivalent misclassification matrix in test (i.e., in the test distribution Q)
    by relying on the PPS assumptions: every column of the validation matrix is weighted by the
    test prevalence estimated by an EMQ quantifier.
    :param cls: an already fitted classifier
    :param train: labelled collection used to fit the quantifier (the classifier is not refitted)
    :param val: labelled collection on which the misclassification matrix is estimated
    :param sample: the test sample whose contingency table is estimated
    :param y: true labels of the sample; optional, only used for the diagnostic printout
    :param y_hat: predictions of cls for the sample; optional, only used for the diagnostic printout
    :return: tuple (tn, fn, fp, tp,) of floats in [0,1] summing up to 1
    """
    y_val = val.labels
    y_hat_val = cls.predict(val.instances)
    # q = EMQ(LogisticRegression(class_weight='balanced'))
    # q.fit(val, fit_classifier=True)
    q = EMQ(cls)
    q.fit(train, fit_classifier=False)
    # q = KDEyML(cls)
    # q.fit(train, val_split=val, fit_classifier=False)
    M_hat = ACC.getPteCondEstim(train.classes_, y_val, y_hat_val)
    p_hat = q.quantify(sample.instances)
    # broadcasting multiplies column j of M_hat by the estimated prevalence p_hat[j]
    cont_table_hat = p_hat * M_hat
    # cont_table_hat = np.clip(cont_table_hat, 0, 1)
    # cont_table_hat = cont_table_hat / cont_table_hat.sum()
    print('true_prev: ', sample.prevalence())
    print('estim_prev: ', p_hat)
    if y is not None and y_hat is not None:
        # The true matrix is a diagnostic only and needs the test labels; y and y_hat
        # default to None, so it is skipped when they are not supplied.
        M_true = ACC.getPteCondEstim(train.classes_, y, y_hat)
        print('M-true:\n', M_true)
    print('M-hat:\n', M_hat)
    print('cont_table:\n', cont_table_hat)
    print('cont_table Sum :\n', cont_table_hat.sum())
    # the table is read with rows = predicted label and columns = true label
    tp = cont_table_hat[1, 1]
    tn = cont_table_hat[0, 0]
    fn = cont_table_hat[0, 1]
    fp = cont_table_hat[1, 0]
    return tn, fn, fp, tp
 def method_2(cls, train, val, sample, y=None, y_hat=None):
    """
    Assume P and Q are the training and test distributions
    Solves the following system of linear equations:
    tp + fp = CC (the classify & count estimate, observed)
    fn + tp = Q(Y=1) (this is not observed but is estimated via quantification)
    tp + fp + fn + tn = 1 (trivial)
    There are 4 unknowns and 3 equations. The fourth required one is established
    by assuming that the PPS conditions hold, i.e., that P(X|Y)=Q(X|Y); note that
    this implies P(hatY|Y)=Q(hatY|Y) if hatY is computed by any measurable function.
    In particular, we consider that the tpr in P (estimated via validation, hereafter tpr) and
    in Q (unknown, hereafter tpr_Q) should
    be the same. This means:
    tpr = tpr_Q = tp / (tp + fn)
    after some manipulation:
    tp (tpr-1) + fn (tpr) = 0 <-- our last equation
    Note that the last equation relies on the estimate tpr. It is likely that, the more
    positives we have, the more reliable this estimate is. This suggests that, in cases
    in which we have more negatives in the validation set than positives, it might be
    convenient to resort to the true negative rate (tnr) instead. This gives rise to
    the alternative fourth equation:
    tn (tnr-1) + fp (tnr) = 0
    :param cls: an already fitted classifier
    :param train: labelled collection used to fit the quantifiers (the classifier is not refitted)
    :param val: labelled collection used to estimate tpr/tnr and as validation split for ACC
    :param sample: the test sample whose contingency table is estimated
    :param y: true labels of the sample; optional, only used for the diagnostic printout
    :param y_hat: predictions of cls for the sample; optional, only used for the diagnostic printout
    :return: tuple (tn, fn, fp, tp,) of floats in [0,1] summing up to 1
    """
    y_val = val.labels
    y_hat_val = cls.predict(val.instances)
    q = ACC(cls)
    q.fit(train, val_split=val, fit_classifier=False)
    p_hat = q.quantify(sample.instances)
    pos_prev = p_hat[1]
    # pos_prev = sample.prevalence()[1]
    cc = CC(cls)
    cc.fit(train, fit_classifier=False)
    cc_prev = cc.quantify(sample.instances)[1]
    M_hat = ACC.getPteCondEstim(train.classes_, y_val, y_hat_val)
    # The fourth equation is the only row of the system that depends on validation.
    # Unknowns are ordered as (tn, fn, fp, tp).
    if val.prevalence()[1] > 0.5:
        # in this case, the tpr might be a more reliable estimate than tnr
        tpr_hat = M_hat[1, 1]
        last_equation = [0, tpr_hat, 0, tpr_hat - 1]
    else:
        # in this case, the tnr might be a more reliable estimate than tpr
        tnr_hat = M_hat[0, 0]
        last_equation = [tnr_hat-1, 0, tnr_hat, 0]
    A = np.asarray([
        [0, 0, 1, 1],
        [0, 1, 0, 1],
        [1, 1, 1, 1],
        last_equation
    ])
    b = np.asarray(
        [cc_prev, pos_prev, 1, 0]
    )
    tn, fn, fp, tp = np.linalg.solve(A, b)
    cont_table_estim = np.asarray([
        [tn, fn],
        [fp, tp]
    ])
    # if (cont_table_estim < 0).any() or (cont_table_estim>1).any():
    #     cont_table_estim = scipy.special.softmax(cont_table_estim)
    print('true_prev: ', sample.prevalence())
    print('estim_prev: ', p_hat)
    if y is not None and y_hat is not None:
        # The true contingency table is a diagnostic only and needs the test labels;
        # y and y_hat default to None, so it is skipped when they are not supplied.
        M_true = ACC.getPteCondEstim(train.classes_, y, y_hat)
        cont_table_true = sample.prevalence() * M_true
        print('true_cont_table:\n', cont_table_true)
    print('estim_cont_table:\n', cont_table_estim)
    # print('true_tpr', M_true[1,1])
    # print('estim_tpr', tpr_hat)
    return tn, fn, fp, tp
 def method_3(cls, train, val, sample, y=None, y_hat=None):
    """
    This is just method 2 but without involving any quapy's quantifier: the test prevalence
    is obtained by adjusting the classify & count estimate with the tpr and fpr estimated in
    validation, and the contingency table is then obtained by solving a 4x4 linear system.
    :param cls: an already fitted classifier
    :param train: unused; kept so that all methods share the same signature
    :param val: labelled collection used to estimate tpr, fpr and tnr
    :param sample: the test sample whose contingency table is estimated
    :param y: unused; kept so that all methods share the same signature
    :param y_hat: unused; kept so that all methods share the same signature
    :return: tuple (tn, fn, fp, tp,) of floats in [0,1] summing up to 1
    """
    classes = val.classes_
    y_val = val.labels
    y_hat_val = cls.predict(val.instances)
    M_hat = ACC.getPteCondEstim(classes, y_val, y_hat_val)
    y_hat_test = cls.predict(sample.instances)
    # classify & count estimate of the positive prevalence in the sample
    pos_prev_cc = F.prevalence_from_labels(y_hat_test, classes)[1]
    tpr_hat = M_hat[1,1]
    fpr_hat = M_hat[1,0]
    tnr_hat = M_hat[0,0]
    den = tpr_hat - fpr_hat
    if den == 0:
        # tpr == fpr: the adjustment is undefined. Fall back to the unadjusted
        # classify & count estimate instead of letting inf/nan flow into the system.
        pos_prev_test_hat = pos_prev_cc
    else:
        pos_prev_test_hat = (pos_prev_cc - fpr_hat) / den
    pos_prev_test_hat = np.clip(pos_prev_test_hat, 0, 1)
    pos_prev_val = val.prevalence()[1]
    # Unknowns are ordered as (tn, fn, fp, tp); only the fourth equation depends on validation.
    if pos_prev_val > 0.5:
        # in this case, the tpr might be a more reliable estimate than tnr
        last_equation = [0, tpr_hat, 0, tpr_hat - 1]
    else:
        # in this case, the tnr might be a more reliable estimate than tpr
        last_equation = [tnr_hat-1, 0, tnr_hat, 0]
    A = np.asarray([
        [0, 0, 1, 1],
        [0, 1, 0, 1],
        [1, 1, 1, 1],
        last_equation
    ])
    b = np.asarray(
        [pos_prev_cc, pos_prev_test_hat, 1, 0]
    )
    tn, fn, fp, tp = np.linalg.solve(A, b)
    return tn, fn, fp, tp
 def cls_eval_from_counters(tn, fn, fp, tp):
    """
    Computes the evaluation measure selected by the module-level `target` from the four
    entries of a contingency table.
    :param tn: true negatives
    :param fn: false negatives
    :param fp: false positives
    :param tp: true positives
    :return: tp + tn if target is 'acc'; otherwise the F1 score 2tp / (2tp + fn + fp),
        or 0 when that denominator is not positive
    """
    if target == 'acc':
        return tp + tn
    denominator = 2 * tp + fn + fp
    if denominator > 0:
        return 2 * tp / denominator
    return 0
 def cls_eval_from_labels(y, y_hat):
    """
    Computes the evaluation measure selected by the module-level `target` from true and
    predicted labels.
    :param y: true labels
    :param y_hat: predicted labels
    :return: the fraction of matching labels if target is 'acc'; otherwise the F1 score
        (with zero_division=0)
    """
    if target != 'acc':
        return f1_score(y, y_hat, zero_division=0)
    matches = (y_hat == y)
    return matches.mean()
 # Main experiment: for every dataset, training sets with different prevalences are drawn;
 # a classifier is trained on half of each and its score on many test samples is estimated.
 for dataset_name in datasets:
    train_orig, test = qp.datasets.fetch_reviews(dataset_name, tfidf=True, min_df=10).train_test
    train_prot = APP(train_orig, n_prevalences=11, repeats=1, return_type='labelled_collection', random_state=0, sample_size=10000)
    for train in train_prot():
        # np.prod is used because np.product was a deprecated alias removed in NumPy 2.0
        if np.prod(train.prevalence()) == 0:
            # skip experiments with no positives or no negatives in training
            continue
        cls = LogisticRegression(class_weight='balanced')
        # from here on `train` is the half used to fit the classifier; `val` is held out
        train, val = train.split_stratified(train_prop=0.5, random_state=0)
        print(f'dataset name = {dataset_name}')
        print(f'#train = {len(train)}, prev={F.strprev(train.prevalence())}')
        print(f'#val = {len(val)}, prev={F.strprev(val.prevalence())}')
        print(f'#test = {len(test)}, prev={F.strprev(test.prevalence())}')
        cls.fit(*train.Xy)
        for sample in APP(test, n_prevalences=21, repeats=10, sample_size=1000, return_type='labelled_collection')():
            print('='*80)
            y_hat = cls.predict(sample.instances)
            y = sample.labels
            # true score, computed with the test labels
            acc_true = cls_eval_from_labels(y, y_hat)
            # estimated score, computed from the estimated contingency table
            tn, fn, fp, tp = method_3(cls, train, val, sample, y, y_hat)
            acc_hat = cls_eval_from_counters(tn, fn, fp, tp)
            error = abs(acc_true - acc_hat)
            errors.append(error)
            print(f'classifier accuracy={acc_true:.3f}')
            print(f'estimated accuracy={acc_hat:.3f}')
            print(f'estimation error={error:.4f}')
 print('process end')
 print('='*80)
 print(f'mean error = {np.mean(errors)}')
 print(f'std error = {np.std(errors)}')
--- a/conf.yaml
+++ b/conf.yaml
@ -5,47 +5,80 @@ debug_conf: &debug_conf
    OUT_DIR_NAME: output/debug
    DATASET_N_PREVS: 9
    COMP_ESTIMATORS:
-      - bin_sld_lr
+      # - bin_sld_lr
-      - mul_sld_lr
+      # - mul_sld_lr
-      - m3w_sld_lr
+      # - m3w_sld_lr
-      - d_bin_sld_lr
+      # - d_bin_sld_lr
-      - d_mul_sld_lr
+      # - d_mul_sld_lr
-      - d_m3w_sld_lr
+      # - d_m3w_sld_lr
-      - d_bin_sld_rbf
+      # - d_bin_sld_rbf
-      - d_mul_sld_rbf
+      # - d_mul_sld_rbf
-      - d_m3w_sld_rbf
+      # - d_m3w_sld_rbf
-      - bin_kde_lr
+      # - bin_kde_lr
-      - mul_kde_lr
+      # - mul_kde_lr
-      - m3w_kde_lr
+      # - m3w_kde_lr
-      - d_bin_kde_lr
+      # - d_bin_kde_lr
-      - d_mul_kde_lr
+      # - d_mul_kde_lr
-      - d_m3w_kde_lr
+      # - d_m3w_kde_lr
-      - d_bin_kde_rbf
+      # - d_bin_kde_rbf
-      - d_mul_kde_rbf
+      # - d_mul_kde_rbf
-      - d_m3w_kde_rbf
+      # - d_m3w_kde_rbf
      # - mandoline
-      # - rca
+      - bin_sld_lr_is
      - mul_sld_lr_is
      - m3w_sld_lr_is
      - rca
      - rca_star
      - doc
      - atc_mc
    N_JOBS: -2
  confs:
    - DATASET_NAME: imdb
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
  other_confs:
    - DATASET_NAME: twitter_gasp
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
 test_conf: &test_conf
  global:
    METRICS: 
      - acc
      - f1
    OUT_DIR_NAME: output/test
    DATASET_N_PREVS: 9
    COMP_ESTIMATORS:
      - cross
      - cross2
      - bin_sld_lr
      - mul_sld_lr
      - m3w_sld_lr
      - bin_sld_lr_is
      - mul_sld_lr_is
      - m3w_sld_lr_is
      - doc 
      - atc_mc
    N_JOBS: -2
  confs:
    - DATASET_NAME: imdb
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
  other_confs:
    - DATASET_NAME: twitter_gasp
 main:
  confs: &main_confs
    - DATASET_NAME: rcv1
-      DATASET_TARGET: MCAT
+      DATASET_TARGET: CCAT
  other_confs:
    - DATASET_NAME: imdb
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
    - DATASET_NAME: rcv1
      DATASET_TARGET: GCAT
    - DATASET_NAME: rcv1
      DATASET_TARGET: MCAT
 sld_lr_conf: &sld_lr_conf
@ -72,6 +105,9 @@ sld_lr_conf: &sld_lr_conf
      - bin_sld_lr_is
      - mul_sld_lr_is
      - m3w_sld_lr_is
      - bin_sld_lr_a
      - mul_sld_lr_a
      - m3w_sld_lr_a
      - bin_sld_lr_gs
      - mul_sld_lr_gs
      - m3w_sld_lr_gs
@ -116,6 +152,9 @@ d_sld_lr_conf: &d_sld_lr_conf
      - d_bin_sld_lr_is
      - d_mul_sld_lr_is
      - d_m3w_sld_lr_is
      - d_bin_sld_lr_a
      - d_mul_sld_lr_a
      - d_m3w_sld_lr_a
      - d_bin_sld_lr_gs
      - d_mul_sld_lr_gs
      - d_m3w_sld_lr_gs
@ -160,6 +199,9 @@ d_sld_rbf_conf: &d_sld_rbf_conf
      - d_bin_sld_rbf_is
      - d_mul_sld_rbf_is
      - d_m3w_sld_rbf_is
      - d_bin_sld_rbf_a
      - d_mul_sld_rbf_a
      - d_m3w_sld_rbf_a
      - d_bin_sld_rbf_gs
      - d_mul_sld_rbf_gs
      - d_m3w_sld_rbf_gs
@ -202,6 +244,9 @@ kde_lr_conf: &kde_lr_conf
      - bin_kde_lr_is
      - mul_kde_lr_is
      - m3w_kde_lr_is
      - bin_kde_lr_a
      - mul_kde_lr_a
      - m3w_kde_lr_a
      - bin_kde_lr_gs
      - mul_kde_lr_gs
      - m3w_kde_lr_gs
@ -238,6 +283,9 @@ d_kde_lr_conf: &d_kde_lr_conf
      - d_bin_kde_lr_is
      - d_mul_kde_lr_is
      - d_m3w_kde_lr_is
      - d_bin_kde_lr_a
      - d_mul_kde_lr_a
      - d_m3w_kde_lr_a
      - d_bin_kde_lr_gs
      - d_mul_kde_lr_gs
      - d_m3w_kde_lr_gs
@ -274,6 +322,9 @@ d_kde_rbf_conf: &d_kde_rbf_conf
      - d_bin_kde_rbf_is
      - d_mul_kde_rbf_is
      - d_m3w_kde_rbf_is
      - d_bin_kde_rbf_a
      - d_mul_kde_rbf_a
      - d_m3w_kde_rbf_a
      - d_bin_kde_rbf_gs
      - d_mul_kde_rbf_gs
      - d_m3w_kde_rbf_gs
@ -287,5 +338,72 @@ d_kde_rbf_conf: &d_kde_rbf_conf
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
 baselines_conf: &baselines_conf
  global:
    METRICS: 
      - acc
      - f1
    OUT_DIR_NAME: output/baselines
    DATASET_N_PREVS: 9
    COMP_ESTIMATORS:
      - doc
      - atc_mc
      - mandoline
      - rca
      - rca_star
    N_JOBS: -2
-exec: *d_sld_rbf_conf
+  confs: *main_confs
  other_confs:
    - DATASET_NAME: imdb
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
 kde_lr_gs_conf: &kde_lr_gs_conf
  global:
    METRICS: 
      - acc
      - f1
    OUT_DIR_NAME: output/kde_lr_gs
    DATASET_N_PREVS: 9
    COMP_ESTIMATORS:
      - bin_kde_lr_gs
      - mul_kde_lr_gs
      - m3w_kde_lr_gs
    N_JOBS: -2
  confs: *main_confs
 timing_conf: &timing_conf
  global:
    METRICS:
      - acc
      - f1
    OUT_DIR_NAME: output/timing
    DATASET_N_PREVS: 1
    COMP_ESTIMATORS:
      - bin_sld_lr_a
      - mul_sld_lr_a
      - m3w_sld_lr_a
      - bin_kde_lr_a
      - mul_kde_lr_a
      - m3w_kde_lr_a
      - bin_sld_lr_gs
      - mul_sld_lr_gs
      - m3w_sld_lr_gs
      - bin_kde_lr_gs
      - mul_kde_lr_gs
      - m3w_kde_lr_gs
      - doc
      - atc_mc
      - rca
      - rca_star
      - mandoline
    N_JOBS: 1
    # No trailing commas here: YAML reads "1," as a string, not as the integer 1.
    PROTOCOL_N_PREVS: 1
    PROTOCOL_REPEATS: 1
    SAMPLE_SIZE: 1000
  confs: *main_confs
 exec: *kde_lr_gs_conf
--- a/copy_source.sh
+++ b/copy_source.sh
@ -0,0 +1,11 @@
 #!/bin/bash
 # Copies the sources, the configuration and the requirements file into ~/tesi_docker/.
 CMD="cp"
 # $HOME is used instead of "~": a tilde inside double quotes is not expanded by the
 # shell, so the original only worked because every command was re-parsed by `bash -c`.
 DEST="${HOME}/tesi_docker/"
 # directories need the recursive flag
 ${CMD} -r quacc "${DEST}"
 ${CMD} -r baselines "${DEST}"
 # single files
 ${CMD} run.py "${DEST}"
 ${CMD} remote.py "${DEST}"
 ${CMD} conf.yaml "${DEST}"
 ${CMD} requirements.txt "${DEST}"
--- a/8
+++ b/8
@ -0,0 +1,8 @@
 #!/bin/bash
 # Follows the quacc log, highlighted through `bat`.
 # Usage: pass "r" as the first argument to follow the log on the remote host;
 # with any other (or no) argument the local log file is followed.
 if [[ "${1}" == "r" ]]; then
 	# take a local snapshot of the remote log (all scp output is discarded) ...
 	scp volpi@ilona.isti.cnr.it:~/tesi/quacc.log ~/tesi/remote.log &>/dev/null
 	# ... then stream the last 500 lines and everything appended afterwards
 	ssh volpi@ilona.isti.cnr.it tail -n 500 -f /home/volpi/tesi/quacc.log | bat -P --language=log
 else
 	# local log: last 500 lines, then follow
 	tail -n 500 -f /home/lorev/tesi/quacc.log | bat --paging=never --language log
 fi
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -13,6 +13,7 @@ jinja2 = "^3.1.2"
 pyyaml = "^6.0.1"
 logging = "^0.4.9.6"
 abstention = "^0.1.3.1"
 pytest = "^8.0.0"
 [tool.poetry.scripts]
 main = "quacc.main:main"
@ -34,21 +35,20 @@ dash = "gunicorn qcdash.app:server -b ilona.isti.cnr.it:33421"
 shell = """
    scp {$HOST}:~/tesi/quacc.log ~/tesi/remote.log &> /dev/null
    ssh {$HOST} tail -n 0 -f /home/volpi/tesi/quacc.log >> ~/tesi/remote.log
 """
 [tool.poe.tasks.logrf]
 shell = """
-    scp {$HOST}:~/tesi/quacc.log ~/tesi/remote.log &> /dev/null
+    ssh {$HOST} tail -n 500 -f /home/volpi/tesi/quacc.log | bat -P --language=log
-    ssh {$HOST} tail -n 500 -f /home/volpi/tesi/quacc.log | bat --paging=never --language log
+"""
-
+[tool.poe.tasks.logf]
 shell = """
    tail -n 500 -f /home/lorev/tesi/quacc.log | bat --paging=never --language log
 """
 interpreter = "fish"
 env = { HOST = "volpi@ilona.isti.cnr.it" }
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.0"
 pylance = "^0.5.9"
 pytest-mock = "^3.11.1"
 pytest-cov = "^4.1.0"
--- a/qcdash/app.py
+++ b/qcdash/app.py
@ -85,6 +85,8 @@ def get_table(dr: DatasetReport, metric, estimators, view, mode):
        case ("avg", "train_table"):
            # return dr.data(metric=metric, estimators=estimators).groupby(level=1).mean()
            return dr.train_table(metric=metric, estimators=estimators)
        case ("avg", "train_std_table"):
            return dr.train_std_table(metric=metric, estimators=estimators)
        case ("avg", "test_table"):
            # return dr.data(metric=metric, estimators=estimators).groupby(level=0).mean()
            return dr.test_table(metric=metric, estimators=estimators)
@ -121,24 +123,44 @@ def get_DataTable(df, mode):
    _index_name = dict(
        train_table="test prev.",
        train_std_table="train prev.",
        test_table="train prev.",
        shift_table="shift",
        stats_table="method",
    )
    df = df.reset_index()
    if mode == "train_std_table":
        columns_format = Format()
        df_columns = np.concatenate([["index"], df.columns.unique(1)[1:]])
        data = [
            dict(
                index="(" + ", ".join([f"{v:.2f}" for v in idx]) + ")"
                if isinstance(idx, tuple | list | np.ndarray)
                else str(idx)
            )
            | {
                k: f"{df.loc[i,('avg',k)]:.4f}~{df.loc[i,('std',k)]:.3f}"
                for k in df.columns.unique(1)[1:]
            }
            for i, idx in zip(df.index, df.loc[:, ("index", "")])
        ]
    else:
        columns_format = Format(precision=6, scheme=Scheme.exponent, nully="nan")
        df_columns = df.columns
        data = df.to_dict("records")
    columns = {
        c: dict(
            id=c,
            name=_index_name[mode] if c == "index" else c,
            type="numeric",
-            format=Format(precision=6, scheme=Scheme.exponent, nully="nan"),
+            format=columns_format,
        )
-        for c in df.columns
+        for c in df_columns
    }
    # columns["index"]["format"] = Format(precision=2, scheme=Scheme.fixed)
    columns["index"]["format"] = Format()
    columns = list(columns.values())
    data = df.to_dict("records")
    for d in data:
        if isinstance(d["index"], tuple | list | np.ndarray):
            d["index"] = "(" + ", ".join([f"{v:.2f}" for v in d["index"]]) + ")"
--- a/quacc.log
+++ b/quacc.log
--- a/quacc/dataset.py
+++ b/quacc/dataset.py
@ -320,25 +320,59 @@ def rcv1_info():
    n_train = 23149
    targets = []
-    for target in range(103):
+    for target in ["CCAT", "MCAT", "GCAT"]:
-        train_t_prev = np.average(dataset.target[:n_train, target].toarray().flatten())
+        target_index = np.where(dataset.target_names == target)[0]
-        test_t_prev = np.average(dataset.target[n_train:, target].toarray().flatten())
+        train_t_prev = np.average(
            dataset.target[:n_train, target_index].toarray().flatten()
        )
        test_t_prev = np.average(
            dataset.target[n_train:, target_index].toarray().flatten()
        )
        d = Dataset(name="rcv1", target=target)()[0]
        targets.append(
            (
-                dataset.target_names[target],
+                target,
                {
                    "train": (1.0 - train_t_prev, train_t_prev),
                    "test": (1.0 - test_t_prev, test_t_prev),
                    "train_size": len(d.train),
                    "val_size": len(d.validation),
                    "test_size": len(d.test),
                },
            )
        )
    targets.sort(key=lambda t: t[1]["train"][1])
    for n, d in targets:
        print(f"{n}:")
-        for k, (fp, tp) in d.items():
+        for k, v in d.items():
-            print(f"\t{k}: {fp:.4f}, {tp:.4f}")
+            if isinstance(v, tuple):
                print(f"\t{k}: {v[0]:.4f}, {v[1]:.4f}")
            else:
                print(f"\t{k}: {v}")
 def imdb_info():
    """Print the class prevalences and the split sizes of the imdb dataset.

    Prevalences are read from the train/test collections returned by
    ``qp.datasets.fetch_reviews``; sizes are read from the first element
    produced by ``Dataset(name="imdb")()``.
    """
    reviews = qp.datasets.fetch_reviews("imdb", tfidf=True, min_df=3)
    raw_train, raw_test = reviews.train_test
    prev_train = raw_train.prevalence()
    prev_test = raw_test.prevalence()
    split = Dataset(name="imdb")()[0]
    summary = dict(
        train=(prev_train[0], prev_train[1]),
        test=(prev_test[0], prev_test[1]),
        train_size=len(split.train),
        val_size=len(split.validation),
        test_size=len(split.test),
    )
    print("imdb:")
    for key, value in summary.items():
        # sizes are printed as they are; prevalence pairs get four decimals
        if not isinstance(value, tuple):
            print(f"\t{key}: {value}")
            continue
        first, second = value
        print(f"\t{key}: {first:.4f}, {second:.4f}")
 if __name__ == "__main__":
-    fetch_cifar100()
+    rcv1_info()
    imdb_info()
--- a/quacc/evaluation/alt.py
+++ b/quacc/evaluation/alt.py
@ -0,0 +1,115 @@
 from functools import wraps
 import numpy as np
 import quapy.functional as F
 import sklearn.metrics as metrics
 from quapy.method.aggregative import ACC, EMQ
 from sklearn import clone
 from sklearn.linear_model import LogisticRegression
 import quacc as qc
 from quacc.evaluation.report import EvaluationReport
 # Registry of the alternative estimation methods, keyed by function name.
 _alts = {}
 def alt(func):
    """Decorator that registers ``func`` in ``_alts`` under its own name.

    The returned wrapper forwards ``(c_model, validation, protocol)`` to
    ``func`` unchanged and exposes the function name as its ``name`` attribute.
    """
    method_name = func.__name__
    def wrapper(c_model, validation, protocol):
        return func(c_model, validation, protocol)
    wrapper = wraps(func)(wrapper)
    wrapper.name = method_name
    _alts[method_name] = wrapper
    return wrapper
@alt
 def cross(c_model, validation, protocol):
    """Estimate accuracy and F1 of ``c_model`` on every sample of ``protocol``.

    The misclassification matrix of ``c_model`` is estimated on ``validation``
    and rescaled by the prevalence that an EMQ quantifier estimates for each
    sample; the quantifier wraps a clone of ``c_model`` fitted on ``validation``.

    :param c_model: an already fitted classifier
    :param validation: labelled collection used for the matrix and the quantifier
    :param protocol: callable yielding the labelled test samples
    :return: an EvaluationReport named "cross" with one row per sample
    """
    y_val = validation.labels
    y_hat_val = c_model.predict(validation.instances)
    qcls = clone(c_model)
    qcls.fit(*validation.Xy)
    # The quantifier and the misclassification matrix depend only on the validation
    # set, so they are built once instead of once per test sample.
    q = EMQ(qcls)
    q.fit(validation, fit_classifier=False)
    M_hat = ACC.getPteCondEstim(validation.classes_, y_val, y_hat_val)
    er = EvaluationReport(name="cross")
    for sample in protocol():
        y_hat = c_model.predict(sample.instances)
        y = sample.labels
        # true scores, computed with the sample labels
        ground_acc = (y_hat == y).mean()
        ground_f1 = metrics.f1_score(y, y_hat, zero_division=0)
        p_hat = q.quantify(sample.instances)
        # broadcasting multiplies column j of M_hat by the estimated prevalence p_hat[j]
        cont_table_hat = p_hat * M_hat
        acc_score = qc.error.acc(cont_table_hat)
        f1_score = qc.error.f1(cont_table_hat)
        # the reported errors are the absolute differences from the true scores
        meta_acc = abs(acc_score - ground_acc)
        meta_f1 = abs(f1_score - ground_f1)
        er.append_row(
            sample.prevalence(),
            acc=meta_acc,
            f1=meta_f1,
            acc_score=acc_score,
            f1_score=f1_score,
        )
    return er
@alt
 def cross2(c_model, validation, protocol):
    classes = validation.classes_
    y_val = validation.labels
    y_hat_val = c_model.predict(validation.instances)
    M_hat = ACC.getPteCondEstim(classes, y_val, y_hat_val)
    pos_prev_val = validation.prevalence()[1]
    er = EvaluationReport(name="cross2")
    for sample in protocol():
        y_test = sample.labels
        y_hat_test = c_model.predict(sample.instances)
        ground_acc = (y_hat_test == y_test).mean()
        ground_f1 = metrics.f1_score(y_test, y_hat_test, zero_division=0)
        pos_prev_cc = F.prevalence_from_labels(y_hat_test, classes)[1]
        tpr_hat = M_hat[1, 1]
        fpr_hat = M_hat[1, 0]
        tnr_hat = M_hat[0, 0]
        pos_prev_test_hat = (pos_prev_cc - fpr_hat) / (tpr_hat - fpr_hat)
        pos_prev_test_hat = np.clip(pos_prev_test_hat, 0, 1)
        if pos_prev_val > 0.5:
            # in this case, the tpr might be a more reliable estimate than tnr
            A = np.asarray(
                [[0, 0, 1, 1], [0, 1, 0, 1], [1, 1, 1, 1], [0, tpr_hat, 0, tpr_hat - 1]]
            )
        else:
            # in this case, the tnr might be a more reliable estimate than tpr
            A = np.asarray(
                [[0, 0, 1, 1], [0, 1, 0, 1], [1, 1, 1, 1], [tnr_hat - 1, 0, tnr_hat, 0]]
            )
        b = np.asarray([pos_prev_cc, pos_prev_test_hat, 1, 0])
        tn, fn, fp, tp = np.linalg.solve(A, b)
        cont_table_hat = np.array([[tn, fp], [fn, tp]])
        acc_score = qc.error.acc(cont_table_hat)
        f1_score = qc.error.f1(cont_table_hat)
        meta_acc = abs(acc_score - ground_acc)
        meta_f1 = abs(f1_score - ground_f1)
        er.append_row(
            sample.prevalence(),
            acc=meta_acc,
            f1=meta_f1,
            acc_score=acc_score,
            f1_score=f1_score,
        )
    return er
--- a/quacc/evaluation/baseline.py
+++ b/quacc/evaluation/baseline.py
@ -288,21 +288,76 @@ def rca(
 ):
    """elsahar19"""
    c_model_predict = getattr(c_model, predict_method)
-    val_pred1 = c_model_predict(validation.X)
+    f1_average = "binary" if validation.n_classes == 2 else "macro"
    val1, val2 = validation.split_stratified(train_prop=0.5, random_state=env._R_SEED)
    val1_pred1 = c_model_predict(val1.X)
    val2_protocol = APP(
        val2,
        n_prevalences=21,
        repeats=100,
        return_type="labelled_collection",
    )
    val2_prot_preds = []
    val2_rca = []
    val2_prot_preds = []
    val2_prot_y = []
    for v2 in val2_protocol():
        _preds = c_model_predict(v2.X)
        try:
            c_model2 = clone_fit(c_model, v2.X, _preds)
            c_model2_predict = getattr(c_model2, predict_method)
            val1_pred2 = c_model2_predict(val1.X)
            rca_score = 1.0 - rcalib.get_score(val1_pred1, val1_pred2, val1.y)
            val2_rca.append(rca_score)
            val2_prot_preds.append(_preds)
            val2_prot_y.append(v2.y)
        except ValueError:
            pass
    val_targets_acc = np.array(
        [
            metrics.accuracy_score(v2_y, v2_preds)
            for v2_y, v2_preds in zip(val2_prot_y, val2_prot_preds)
        ]
    )
    reg_acc = LinearRegression().fit(np.array(val2_rca)[:, np.newaxis], val_targets_acc)
    val_targets_f1 = np.array(
        [
            metrics.f1_score(v2_y, v2_preds, average=f1_average)
            for v2_y, v2_preds in zip(val2_prot_y, val2_prot_preds)
        ]
    )
    reg_f1 = LinearRegression().fit(np.array(val2_rca)[:, np.newaxis], val_targets_f1)
    report = EvaluationReport(name="rca")
    for test in protocol():
        try:
-            test_pred = c_model_predict(test.X)
+            test_preds = c_model_predict(test.X)
-            c_model2 = clone_fit(c_model, test.X, test_pred)
+            c_model2 = clone_fit(c_model, test.X, test_preds)
            c_model2_predict = getattr(c_model2, predict_method)
-            val_pred2 = c_model2_predict(validation.X)
+            val1_pred2 = c_model2_predict(val1.X)
-            rca_score = 1.0 - rcalib.get_score(val_pred1, val_pred2, validation.y)
+            rca_score = 1.0 - rcalib.get_score(val1_pred1, val1_pred2, val1.y)
-            meta_score = abs(rca_score - metrics.accuracy_score(test.y, test_pred))
+            acc_score = reg_acc.predict(np.array([[rca_score]]))[0]
-            report.append_row(test.prevalence(), acc=meta_score, acc_score=rca_score)
+            f1_score = reg_f1.predict(np.array([[rca_score]]))[0]
            meta_acc = abs(acc_score - metrics.accuracy_score(test.y, test_preds))
            meta_f1 = abs(
                f1_score - metrics.f1_score(test.y, test_preds, average=f1_average)
            )
            report.append_row(
                test.prevalence(),
                acc=meta_acc,
                acc_score=acc_score,
                f1=meta_f1,
                f1_score=f1_score,
            )
        except ValueError:
            report.append_row(
-                test.prevalence(), acc=float("nan"), acc_score=float("nan")
+                test.prevalence(),
                acc=np.nan,
                acc_score=np.nan,
                f1=np.nan,
                f1_score=np.nan,
            )
    return report
@ -317,13 +372,56 @@ def rca_star(
 ):
    """elsahar19"""
    c_model_predict = getattr(c_model, predict_method)
-    validation1, validation2 = validation.split_stratified(
+    f1_average = "binary" if validation.n_classes == 2 else "macro"
    validation1, val2 = validation.split_stratified(
        train_prop=0.5, random_state=env._R_SEED
    )
-    val1_pred = c_model_predict(validation1.X)
+    val11, val12 = validation1.split_stratified(
-    c_model1 = clone_fit(c_model, validation1.X, val1_pred)
+        train_prop=0.5, random_state=env._R_SEED
    )
    val11_pred = c_model_predict(val11.X)
    c_model1 = clone_fit(c_model, val11.X, val11_pred)
    c_model1_predict = getattr(c_model1, predict_method)
-    val2_pred1 = c_model1_predict(validation2.X)
+    val12_pred1 = c_model1_predict(val12.X)
    val2_protocol = APP(
        val2,
        n_prevalences=21,
        repeats=100,
        return_type="labelled_collection",
    )
    val2_prot_preds = []
    val2_rca = []
    val2_prot_preds = []
    val2_prot_y = []
    for v2 in val2_protocol():
        _preds = c_model_predict(v2.X)
        try:
            c_model2 = clone_fit(c_model, v2.X, _preds)
            c_model2_predict = getattr(c_model2, predict_method)
            val12_pred2 = c_model2_predict(val12.X)
            rca_score = 1.0 - rcalib.get_score(val12_pred1, val12_pred2, val12.y)
            val2_rca.append(rca_score)
            val2_prot_preds.append(_preds)
            val2_prot_y.append(v2.y)
        except ValueError:
            pass
    val_targets_acc = np.array(
        [
            metrics.accuracy_score(v2_y, v2_preds)
            for v2_y, v2_preds in zip(val2_prot_y, val2_prot_preds)
        ]
    )
    reg_acc = LinearRegression().fit(np.array(val2_rca)[:, np.newaxis], val_targets_acc)
    val_targets_f1 = np.array(
        [
            metrics.f1_score(v2_y, v2_preds, average=f1_average)
            for v2_y, v2_preds in zip(val2_prot_y, val2_prot_preds)
        ]
    )
    reg_f1 = LinearRegression().fit(np.array(val2_rca)[:, np.newaxis], val_targets_f1)
    report = EvaluationReport(name="rca_star")
    for test in protocol():
@ -331,17 +429,28 @@ def rca_star(
            test_pred = c_model_predict(test.X)
            c_model2 = clone_fit(c_model, test.X, test_pred)
            c_model2_predict = getattr(c_model2, predict_method)
-            val2_pred2 = c_model2_predict(validation2.X)
+            val12_pred2 = c_model2_predict(val12.X)
-            rca_star_score = 1.0 - rcalib.get_score(
+            rca_star_score = 1.0 - rcalib.get_score(val12_pred1, val12_pred2, val12.y)
-                val2_pred1, val2_pred2, validation2.y
+            acc_score = reg_acc.predict(np.array([[rca_star_score]]))[0]
            f1_score = reg_f1.predict(np.array([[rca_score]]))[0]
            meta_acc = abs(acc_score - metrics.accuracy_score(test.y, test_pred))
            meta_f1 = abs(
                f1_score - metrics.f1_score(test.y, test_pred, average=f1_average)
            )
            meta_score = abs(rca_star_score - metrics.accuracy_score(test.y, test_pred))
            report.append_row(
-                test.prevalence(), acc=meta_score, acc_score=rca_star_score
+                test.prevalence(),
                acc=meta_acc,
                acc_score=acc_score,
                f1=meta_f1,
                f1_score=f1_score,
            )
        except ValueError:
            report.append_row(
-                test.prevalence(), acc=float("nan"), acc_score=float("nan")
+                test.prevalence(),
                acc=np.nan,
                acc_score=np.nan,
                f1=np.nan,
                f1_score=np.nan,
            )
    return report
@ -447,3 +556,4 @@ def kdex2(
        report.append_row(test.prevalence(), acc=meta_score, acc_score=estim_acc)
    return report
--- a/quacc/evaluation/comp.py
+++ b/quacc/evaluation/comp.py
@ -57,6 +57,8 @@ def estimate_worker(_estimate, train, validation, test, q=None):
 def split_tasks(estimators, train, validation, test, q):
    _par, _seq = [], []
    for estim in estimators:
        if hasattr(estim, "nocall"):
            continue
        _task = [estim, train, validation, test]
        match estim.name:
            case n if n.endswith("_gs"):
--- a/quacc/evaluation/estimators.py
+++ b/quacc/evaluation/estimators.py
@ -2,7 +2,7 @@ from typing import List
 import numpy as np
-from quacc.evaluation import baseline, method
+from quacc.evaluation import baseline, method, alt
 class CompEstimatorFunc_:
@ -40,7 +40,7 @@ class CompEstimatorName_:
 class CompEstimator:
    def __get(cls, e: str | List[str], get_ref=True):
-        _dict = method._methods | baseline._baselines
+        _dict = alt._alts | method._methods | baseline._baselines
        match e:
            case "__all":
--- a/quacc/evaluation/method.py
+++ b/quacc/evaluation/method.py
@ -26,7 +26,12 @@ def _param_grid(method, X_fit: np.ndarray):
                "q__classifier__C": np.logspace(-3, 3, 7),
                "q__classifier__class_weight": [None, "balanced"],
                "q__recalib": [None, "bcts"],
-                "confidence": [None, ["isoft"], ["max_conf", "entropy"]],
+                "confidence": [
                    None,
                    ["isoft"],
                    ["max_conf", "entropy"],
                    ["max_conf", "entropy", "isoft"],
                ],
            }
        case "sld_rbf":
            _scale = 1.0 / (X_fit.shape[1] * X_fit.var())
@ -35,7 +40,12 @@ def _param_grid(method, X_fit: np.ndarray):
                "q__classifier__class_weight": [None, "balanced"],
                "q__classifier__gamma": _scale * np.logspace(-2, 2, 5),
                "q__recalib": [None, "bcts"],
-                "confidence": [None, ["isoft"], ["max_conf", "entropy"]],
+                "confidence": [
                    None,
                    ["isoft"],
                    ["max_conf", "entropy"],
                    ["max_conf", "entropy", "isoft"],
                ],
            }
        case "pacc":
            return {
@ -48,7 +58,7 @@ def _param_grid(method, X_fit: np.ndarray):
                "q__classifier__C": np.logspace(-3, 3, 7),
                "q__classifier__class_weight": [None, "balanced"],
                "q__bandwidth": np.linspace(0.01, 0.2, 20),
-                "confidence": [None, ["isoft"]],
+                "confidence": [None, ["isoft"], ["max_conf", "entropy", "isoft"]],
            }
        case "kde_rbf":
            _scale = 1.0 / (X_fit.shape[1] * X_fit.var())
@ -57,7 +67,7 @@ def _param_grid(method, X_fit: np.ndarray):
                "q__classifier__class_weight": [None, "balanced"],
                "q__classifier__gamma": _scale * np.logspace(-2, 2, 5),
                "q__bandwidth": np.linspace(0.01, 0.2, 20),
-                "confidence": [None, ["isoft"]],
+                "confidence": [None, ["isoft"], ["max_conf", "entropy", "isoft"]],
            }
@ -96,6 +106,15 @@ def evaluation_report(
    return report
@dataclass(frozen=True)
class EmptyMethod:
    """Named placeholder estimator that is never executed.

    The task splitter in ``quacc/evaluation/comp.py`` skips every estimator
    that exposes a ``nocall`` attribute, so an instance only registers a name
    (e.g. ``"sld_lr_gs"``) whose report column is filled in later from other
    estimators' results.
    """

    # Identifier under which the placeholder is registered.
    name: str
    # Marker attribute; its mere presence makes the task splitter skip this entry.
    nocall: bool = True

    def __call__(self, c_model, validation, protocol) -> EvaluationReport:
        """Do nothing and return ``None``; present only for interface parity."""
        return None
@dataclass(frozen=True)
 class EvaluationMethod:
    name: str
@ -162,13 +181,16 @@ class EvaluationMethodGridSearch(EvaluationMethod):
            verbose=False,
            **_search_params,
        ).fit(v_train)
-        return evaluation_report(
+        er = evaluation_report(
            estimator=est,
            protocol=protocol,
            method_name=self.name,
        )
        er.fit_score = est.best_score()
        return er
 E = EmptyMethod
 M = EvaluationMethod
 G = EvaluationMethodGridSearch
@ -229,12 +251,19 @@ __sld_lr_set = [
    M("mul_sld_lr_is",   __sld_lr(),  "mul", conf="isoft",                        ),
    M("m3w_sld_lr_is",   __sld_lr(),  "mul", conf="isoft",                 cf=True),
    M("mgf_sld_lr_is",   __sld_lr(),  "mul", conf="isoft",                 gf=True),
    # sld all
    M("bin_sld_lr_a",   __sld_lr(),  "bin", conf=["max_conf", "entropy", "isoft"],         ),
    M("bgf_sld_lr_a",   __sld_lr(),  "bin", conf=["max_conf", "entropy", "isoft"],  gf=True),
    M("mul_sld_lr_a",   __sld_lr(),  "mul", conf=["max_conf", "entropy", "isoft"],         ),
    M("m3w_sld_lr_a",   __sld_lr(),  "mul", conf=["max_conf", "entropy", "isoft"],  cf=True),
    M("mgf_sld_lr_a",   __sld_lr(),  "mul", conf=["max_conf", "entropy", "isoft"],  gf=True),
    # gs sld
    G("bin_sld_lr_gs",   __sld_lr(),  "bin", pg="sld_lr"                          ),
    G("bgf_sld_lr_gs",   __sld_lr(),  "bin", pg="sld_lr",                  gf=True),
    G("mul_sld_lr_gs",   __sld_lr(),  "mul", pg="sld_lr"                          ),
    G("m3w_sld_lr_gs",   __sld_lr(),  "mul", pg="sld_lr",                  cf=True),
    G("mgf_sld_lr_gs",   __sld_lr(),  "mul", pg="sld_lr",                  gf=True),
    E("sld_lr_gs"),
 ]
 __dense_sld_lr_set = [
@ -267,12 +296,18 @@ __dense_sld_lr_set = [
    M("d_mul_sld_lr_is",   __sld_lr(),  "mul", d=True, conf="isoft",                        ),
    M("d_m3w_sld_lr_is",   __sld_lr(),  "mul", d=True, conf="isoft",                 cf=True),
    M("d_mgf_sld_lr_is",   __sld_lr(),  "mul", d=True, conf="isoft",                 gf=True),
    # sld all
    M("d_bin_sld_lr_a",    __sld_lr(),  "bin", d=True, conf=["max_conf", "entropy", "isoft"],         ),
    M("d_bgf_sld_lr_a",    __sld_lr(),  "bin", d=True, conf=["max_conf", "entropy", "isoft"],  gf=True),
    M("d_mul_sld_lr_a",    __sld_lr(),  "mul", d=True, conf=["max_conf", "entropy", "isoft"],         ),
    M("d_m3w_sld_lr_a",    __sld_lr(),  "mul", d=True, conf=["max_conf", "entropy", "isoft"],  cf=True),
    M("d_mgf_sld_lr_a",    __sld_lr(),  "mul", d=True, conf=["max_conf", "entropy", "isoft"],  gf=True),
    # gs sld
-    G("d_bin_sld_lr_gs",   __sld_lr(),  "bin", d=True, pg="sld_lr"                             ),
+    G("d_bin_sld_lr_gs",   __sld_lr(),  "bin", d=True, pg="sld_lr"                          ),
-    G("d_bgf_sld_lr_gs",   __sld_lr(),  "bin", d=True, pg="sld_lr",                     gf=True),
+    G("d_bgf_sld_lr_gs",   __sld_lr(),  "bin", d=True, pg="sld_lr",                  gf=True),
-    G("d_mul_sld_lr_gs",   __sld_lr(),  "mul", d=True, pg="sld_lr"                             ),
+    G("d_mul_sld_lr_gs",   __sld_lr(),  "mul", d=True, pg="sld_lr"                          ),
-    G("d_m3w_sld_lr_gs",   __sld_lr(),  "mul", d=True, pg="sld_lr",                     cf=True),
+    G("d_m3w_sld_lr_gs",   __sld_lr(),  "mul", d=True, pg="sld_lr",                  cf=True),
-    G("d_mgf_sld_lr_gs",   __sld_lr(),  "mul", d=True, pg="sld_lr",                     gf=True),
+    G("d_mgf_sld_lr_gs",   __sld_lr(),  "mul", d=True, pg="sld_lr",                  gf=True),
 ]
 __dense_sld_rbf_set = [
@ -305,6 +340,12 @@ __dense_sld_rbf_set = [
    M("d_mul_sld_rbf_is", __sld_rbf(), "mul", d=True, conf="isoft",                          ),
    M("d_m3w_sld_rbf_is", __sld_rbf(), "mul", d=True, conf="isoft",                   cf=True),
    M("d_mgf_sld_rbf_is", __sld_rbf(), "mul", d=True, conf="isoft",                   gf=True),
    # sld all
    M("d_bin_sld_rbf_a",  __sld_rbf(), "bin", d=True, conf=["max_conf", "entropy", "isoft"],         ),
    M("d_bgf_sld_rbf_a",  __sld_rbf(), "bin", d=True, conf=["max_conf", "entropy", "isoft"],  gf=True),
    M("d_mul_sld_rbf_a",  __sld_rbf(), "mul", d=True, conf=["max_conf", "entropy", "isoft"],         ),
    M("d_m3w_sld_rbf_a",  __sld_rbf(), "mul", d=True, conf=["max_conf", "entropy", "isoft"],  cf=True),
    M("d_mgf_sld_rbf_a",  __sld_rbf(), "mul", d=True, conf=["max_conf", "entropy", "isoft"],  gf=True),
    # gs sld
    G("d_bin_sld_rbf_gs", __sld_rbf(), "bin", d=True, pg="sld_rbf", search="spider",        ),
    G("d_bgf_sld_rbf_gs", __sld_rbf(), "bin", d=True, pg="sld_rbf", search="spider", gf=True),
@ -334,10 +375,15 @@ __kde_lr_set = [
    M("bin_kde_lr_is", __kde_lr(), "bin", conf="isoft",                        ),
    M("mul_kde_lr_is", __kde_lr(), "mul", conf="isoft",                        ),
    M("m3w_kde_lr_is", __kde_lr(), "mul", conf="isoft",                 cf=True),
    # kde all
    M("bin_kde_lr_a",  __kde_lr(), "bin", conf=["max_conf", "entropy", "isoft"],         ),
    M("mul_kde_lr_a",  __kde_lr(), "mul", conf=["max_conf", "entropy", "isoft"],         ),
    M("m3w_kde_lr_a",  __kde_lr(), "mul", conf=["max_conf", "entropy", "isoft"],  cf=True),
    # gs kde
-    G("bin_kde_lr_gs", __kde_lr(), "bin", pg="kde_lr", search="spider"         ),
+    G("bin_kde_lr_gs", __kde_lr(), "bin", pg="kde_lr", search="grid"         ),
-    G("mul_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="spider"         ),
+    G("mul_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="grid"         ),
-    G("m3w_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="spider", cf=True),
+    G("m3w_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="grid", cf=True),
    E("kde_lr_gs"),
 ]
 __dense_kde_lr_set = [
@ -361,6 +407,10 @@ __dense_kde_lr_set = [
    M("d_bin_kde_lr_is", __kde_lr(), "bin", d=True, conf="isoft",                        ),
    M("d_mul_kde_lr_is", __kde_lr(), "mul", d=True, conf="isoft",                        ),
    M("d_m3w_kde_lr_is", __kde_lr(), "mul", d=True, conf="isoft",                 cf=True),
    # kde all
    M("d_bin_kde_lr_a",  __kde_lr(), "bin", d=True, conf=["max_conf", "entropy", "isoft"],         ),
    M("d_mul_kde_lr_a",  __kde_lr(), "mul", d=True, conf=["max_conf", "entropy", "isoft"],         ),
    M("d_m3w_kde_lr_a",  __kde_lr(), "mul", d=True, conf=["max_conf", "entropy", "isoft"],  cf=True),
    # gs kde                             
    G("d_bin_kde_lr_gs", __kde_lr(), "bin", d=True, pg="kde_lr", search="spider"            ),
    G("d_mul_kde_lr_gs", __kde_lr(), "mul", d=True, pg="kde_lr", search="spider"            ),
@ -388,6 +438,10 @@ __dense_kde_rbf_set = [
    M("d_bin_kde_rbf_is", __kde_rbf(), "bin", d=True, conf="isoft",                         ),
    M("d_mul_kde_rbf_is", __kde_rbf(), "mul", d=True, conf="isoft",                         ),
    M("d_m3w_kde_rbf_is", __kde_rbf(), "mul", d=True, conf="isoft",                  cf=True),
    # kde all
    M("d_bin_kde_rbf_a",  __kde_rbf(), "bin", d=True, conf=["max_conf", "entropy", "isoft"],         ),
    M("d_mul_kde_rbf_a",  __kde_rbf(), "mul", d=True, conf=["max_conf", "entropy", "isoft"],         ),
    M("d_m3w_kde_rbf_a",  __kde_rbf(), "mul", d=True, conf=["max_conf", "entropy", "isoft"],  cf=True),
    # gs kde
    G("d_bin_kde_rbf_gs", __kde_rbf(), "bin", d=True, pg="kde_rbf", search="spider"          ),
    G("d_mul_kde_rbf_gs", __kde_rbf(), "mul", d=True, pg="kde_rbf", search="spider"          ),
--- a/quacc/evaluation/report.py
+++ b/quacc/evaluation/report.py
@ -1,7 +1,6 @@
 import json
 import pickle
 from collections import defaultdict
 from itertools import chain
 from pathlib import Path
 from typing import List, Tuple
@ -39,6 +38,7 @@ class EvaluationReport:
        self.data: pd.DataFrame | None = None
        self.name = name if name is not None else "default"
        self.time = 0.0
        self.fit_score = None
    def append_row(self, basep: np.ndarray | Tuple, **row):
        # bp = basep[1]
@ -89,6 +89,7 @@ class CompReport:
        train_prev: np.ndarray = None,
        valid_prev: np.ndarray = None,
        times=None,
        fit_scores=None,
        g_time=None,
    ):
        if isinstance(datas, pd.DataFrame):
@ -105,6 +106,13 @@ class CompReport:
                .sort_index(axis=0, level=0, ascending=False, sort_remaining=False)
            )
        if fit_scores is None:
            self.fit_scores = {
                er.name: er.fit_score for er in datas if er.fit_score is not None
            }
        else:
            self.fit_scores = fit_scores
        if times is None:
            self.times = {er.name: er.time for er in datas}
        else:
@ -114,6 +122,51 @@ class CompReport:
        self.train_prev = train_prev
        self.valid_prev = valid_prev
    def postprocess(
        self,
        f_data: pd.DataFrame,
        _data: pd.DataFrame,
        metric=None,
        estimators=None,
    ) -> pd.DataFrame:
        """Add aggregated ``*_gs`` columns that mirror the best-fitting variant.

        For each aggregated name in the internal mapping (``"sld_lr_gs"``,
        ``"kde_lr_gs"``) the member estimator with the lowest recorded fit
        score is selected and its column(s) are copied into ``f_data`` under
        the aggregated name.

        A group is skipped when:
          * ``estimators`` is given and does not contain the aggregated name;
          * not every member estimator is a column of ``self._data``;
          * no fit score is recorded for any member estimator.

        Args:
            f_data: frame being returned to the caller; modified in place.
            _data: full (unfiltered) frame the member columns are read from.
            metric: metric selector, resolved through ``_get_metric``.
            estimators: optional collection of requested estimator names.

        Returns:
            ``f_data``, with the aggregated columns added where applicable.
        """
        _mapping = {
            "sld_lr_gs": [
                "bin_sld_lr_gs",
                "mul_sld_lr_gs",
                "m3w_sld_lr_gs",
            ],
            "kde_lr_gs": [
                "bin_kde_lr_gs",
                "mul_kde_lr_gs",
                "m3w_kde_lr_gs",
            ],
        }

        # DatasetReport.fit_scores guards this attribute with hasattr, so its
        # absence is tolerated here as well instead of raising AttributeError.
        known_scores = getattr(self, "fit_scores", None) or {}
        available = set(self._data.columns.unique(1))

        for name, methods in _mapping.items():
            if estimators is not None and name not in estimators:
                continue

            # every member estimator must be present in the report
            if not set(methods) <= available:
                continue

            _fit_scores = [(k, v) for k, v in known_scores.items() if k in methods]
            if not _fit_scores:
                # nothing to rank: np.argmin would raise on an empty sequence
                continue
            _best_method = _fit_scores[int(np.argmin([v for _, v in _fit_scores]))][0]

            _metric = _get_metric(metric)
            m_data = _data.loc[:, (_metric, methods)]
            # A single metric name is wrapped in a list; any other selector is
            # expanded to the metric names actually present in the selection.
            _metrics = (
                [_metric] if isinstance(_metric, str) else m_data.columns.unique(0)
            )

            for _m in _metrics:
                f_data.loc[:, (_m, name)] = m_data.loc[:, (_m, _best_method)]

        return f_data
    @property
    def prevs(self) -> np.ndarray:
        """Unique values of the first index level of ``self.data()``."""
        frame = self.data()
        return frame.index.unique(0)
@ -149,6 +202,7 @@ class CompReport:
            train_prev=self.train_prev,
            valid_prev=self.valid_prev,
            times=self.times | other.times,
            fit_scores=self.fit_scores | other.fit_scores,
            g_time=self.times["tot"] + other.times["tot"],
        )
@ -159,7 +213,10 @@ class CompReport:
        _estimators = _get_estimators(
            estimators, self._data.loc[:, (_metric, slice(None))].columns.unique(1)
        )
-        f_data: pd.DataFrame = self._data.copy().loc[:, (_metric, _estimators)]
+        _data: pd.DataFrame = self._data.copy()
        f_data: pd.DataFrame = _data.loc[:, (_metric, _estimators)]
        f_data = self.postprocess(f_data, _data, metric=metric, estimators=estimators)
        if len(f_data.columns.unique(0)) == 1:
            f_data = f_data.droplevel(level=0, axis=1)
@ -187,7 +244,11 @@ class CompReport:
        _estimators = _get_estimators(
            estimators, shift_data.loc[:, (_metric, slice(None))].columns.unique(1)
        )
        s_data: pd.DataFrame = shift_data
        shift_data: pd.DataFrame = shift_data.loc[:, (_metric, _estimators)]
        shift_data = self.postprocess(
            shift_data, s_data, metric=metric, estimators=estimators
        )
        if len(shift_data.columns.unique(0)) == 1:
            shift_data = shift_data.droplevel(level=0, axis=1)
@ -354,17 +415,27 @@ class CompReport:
        return res
 def _cr_train_prev(cr: CompReport):
    """Return ``cr.train_prev`` rounded to two decimals, as a tuple."""
    rounded = np.around(cr.train_prev, decimals=2)
    return tuple(rounded)
 def _cr_data(cr: CompReport, metric=None, estimators=None):
    """Return ``cr.data(metric, estimators)``; both arguments are passed positionally."""
    frame = cr.data(metric, estimators)
    return frame
 class DatasetReport:
    _default_dr_modes = [
        "delta_train",
        "stdev_train",
        "train_table",
        "train_std_table",
        "shift",
        "shift_table",
        "delta_test",
        "stdev_test",
        "test_table",
        "stats_table",
        "fit_scores",
    ]
    _default_cr_modes = CompReport._default_modes
@ -380,15 +451,62 @@ class DatasetReport:
        return DatasetReport(self.name, _crs)
    def fit_scores(self, metric: str = None, estimators: List[str] = None):
        """Score how well fit-score-based selection does per training prevalence.

        For every training prevalence the estimator with the lowest recorded
        fit score is selected. The returned value for that prevalence is the
        mean, over the rows of ``self.data(...)`` belonging to it, of the
        absolute gap between the selected estimator's value and the row-wise
        minimum across all considered estimators.

        Args:
            metric: metric selector forwarded to ``self.data``.
            estimators: estimator names, filtered through ``_get_estimators``
                against the estimators that have a fit score.

        Returns:
            A 1-D ``np.ndarray`` with one entry per training prevalence, in
            descending order of the prevalence index, or ``None`` when some
            report has no ``fit_scores`` attribute or no estimator remains
            after filtering.
        """

        def _fdata_idx(idx) -> np.ndarray:
            # rows of _fdata whose first index level equals idx
            return _fdata.loc[(idx, slice(None), slice(None)), :].to_numpy()

        _crs_train = [_cr_train_prev(cr) for cr in self.crs]

        # some reports may not carry fit scores at all
        if not all(hasattr(cr, "fit_scores") for cr in self.crs):
            return None

        _fit_scores = pd.DataFrame(
            [cr.fit_scores for cr in self.crs], index=_crs_train
        ).sort_index(axis=0, ascending=False)

        _estimators = _get_estimators(estimators, _fit_scores.columns)
        if _estimators.shape[0] == 0:
            return None

        _fdata = self.data(metric=metric, estimators=_estimators)

        # ensure that columns in _fit_scores have the same ordering of _fdata
        _fit_scores = _fit_scores.loc[:, _fdata.columns]

        # column position of the lowest fit score for each training prevalence
        _best_fit_estimators = np.argmin(_fit_scores.to_numpy(), axis=1)

        scores = []
        for idx, cl in zip(_fit_scores.index, _best_fit_estimators):
            values = _fdata_idx(idx)  # slice once, reuse for both terms
            scores.append(np.abs(values[:, cl] - values.min(axis=1)).mean())

        return np.array(scores)
    def data(self, metric: str = None, estimators: List[str] = None) -> pd.DataFrame:
        def _cr_train_prev(cr: CompReport):
            return tuple(np.around(cr.train_prev, decimals=2))
        def _cr_data(cr: CompReport):
            return cr.data(metric, estimators)
        _crs_sorted = sorted(
-            [(_cr_train_prev(cr), _cr_data(cr)) for cr in self.crs],
+            [(_cr_train_prev(cr), _cr_data(cr, metric, estimators)) for cr in self.crs],
            key=lambda cr: len(cr[1].columns),
            reverse=True,
        )
@ -460,6 +578,15 @@ class DatasetReport:
        avg_p.loc["mean", :] = f_data.mean()
        return avg_p
    def train_std_table(self, metric: str = None, estimators: List[str] = None):
        """Build a table of per-group means and standard deviations.

        ``self.data(...)`` is grouped by its second index level (original
        order kept). The result has two column groups, ``"avg"`` and
        ``"std"``. Each group carries an extra row labelled ``"mean"`` holding
        the statistic over the whole frame (the overall std in the ``"std"``
        group).
        """
        frame = self.data(metric=metric, estimators=estimators)

        tables = []
        for stat in ("mean", "std"):
            per_group = getattr(frame.groupby(level=1, sort=False), stat)()
            per_group.loc["mean", :] = getattr(frame, stat)()
            tables.append(per_group)

        return pd.concat(tables, axis=1, keys=["avg", "std"])
    def test_table(
        self, metric: str = None, estimators: List[str] = None
    ) -> pd.DataFrame:
@ -591,6 +718,20 @@ class DatasetReport:
                base_path=base_path,
                backend=backend,
            )
        elif mode == "fit_scores":
            _fit_scores = self.fit_scores(metric, estimators) if data is None else data
            if _fit_scores is None:
                return None
            train_prevs = self.data(metric, estimators).index.unique(0)
            return plot.plot_fit_scores(
                train_prevs=train_prevs,
                scores=_fit_scores,
                metric=metric,
                name=conf,
                save_fig=save_fig,
                base_path=base_path,
                backend=backend,
            )
    def to_md(
        self,
--- a/quacc/method/base.py
+++ b/quacc/method/base.py
@ -42,7 +42,7 @@ class BaseAccuracyEstimator(BaseQuantifier):
            pred_proba = self.classifier.predict_proba(coll.X)
        return ExtendedCollection.from_lc(
-            coll, pred_proba=pred_proba, extpol=self.extpol
+            coll, pred_proba=pred_proba, ext=pred_proba, extpol=self.extpol
        )
    def _extend_instances(self, instances: np.ndarray | sp.csr_matrix):
--- a/quacc/method/confidence.py
+++ b/quacc/method/confidence.py
@ -63,6 +63,13 @@ class Threshold(ConfidenceMetric):
        _exp = scores - self.threshold
        return _exp
    # def conf(self, X, probas):
    #     scores = self.get_scores(probas)
    #     _exp = np.where(
    #         scores >= self.threshold, np.ones(scores.shape), np.zeros(scores.shape)
    #     )
    #     return _exp[:, np.newaxis]
@metric("linreg")
 class LinReg(ConfidenceMetric):
--- a/quacc/method/model_selection.py
+++ b/quacc/method/model_selection.py
@ -242,6 +242,11 @@ class GridSearchAE(BaseAccuracyEstimator):
            return self.best_model_
        raise ValueError("best_model called before fit")
    def best_score(self):
        """Return the value stored in ``best_score_``.

        Raises:
            ValueError: if the instance has no ``best_score_`` attribute.
        """
        if not hasattr(self, "best_score_"):
            raise ValueError("best_score called before fit")
        return self.best_score_
 class RandomizedSearchAE(GridSearchAE):
    ERR_THRESHOLD = 1e-4
@ -473,3 +478,4 @@ class SpiderSearchAE(GridSearchAE):
                score += 1
        return score
--- a/quacc/plot/init.py
+++ b/quacc/plot/init.py
@ -1 +1,7 @@
-from quacc.plot.plot import get_backend, plot_delta, plot_diagonal, plot_shift
+from quacc.plot.plot import (
    get_backend,
    plot_delta,
    plot_diagonal,
    plot_shift,
    plot_fit_scores,
 )
--- a/quacc/plot/base.py
+++ b/quacc/plot/base.py
@ -52,3 +52,16 @@ class BasePlot:
        legend=True,
    ):
        ...
    # NOTE(review): decorated as a classmethod but declares no ``cls`` parameter,
    # so ``train_prevs`` would receive the class if this stub were called directly.
    # Confirm against the sibling stubs before changing.
    @classmethod
    def plot_fit_scores(
        train_prevs,
        scores,
        *,
        pos_class=1,
        title="default",
        x_label="prev.",
        y_label="position",
        legend=True,
    ):
        """Interface stub for the fit-scores plot; the body is intentionally empty.

        Args:
            train_prevs: training prevalences used for the x axis.
            scores: one value per training prevalence, used for the y axis.
            pos_class: keyword-only; defaults to 1.
            title: figure title.
            x_label: x-axis label.
            y_label: y-axis label.
            legend: keyword-only flag; defaults to True.
        """
        ...
--- a/quacc/plot/plot.py
+++ b/quacc/plot/plot.py
@ -142,3 +142,37 @@ def plot_shift(
        return fig, output_path
    return fig
 def plot_fit_scores(
    train_prevs,
    scores,
    *,
    pos_class=1,
    metric="acc",
    name="default",
    legend=True,
    save_fig=False,
    base_path=None,
    backend=None,
 ):
    """Draw the fit-scores plot through a backend and optionally save it.

    Args:
        train_prevs: training prevalences, forwarded to the backend.
        scores: values to plot, forwarded to the backend.
        pos_class: forwarded to the backend unchanged.
        metric: metric name, used only to build the figure title.
        name: configuration name, used only to build the figure title.
        legend: forwarded to the backend unchanged.
        save_fig: when true, the figure is also saved via ``backend.save_fig``.
        base_path: destination passed to ``backend.save_fig``.
        backend: plotting backend; the module-level default is used when None.

    Returns:
        The figure, or ``(figure, output_path)`` when ``save_fig`` is true.
    """
    if backend is None:
        backend = __backend

    plot_title = f"fit_scores_{name}_avg_{metric}"
    figure = backend.plot_fit_scores(
        train_prevs,
        scores,
        pos_class=pos_class,
        title=plot_title,
        x_label="train prev.",
        y_label="position",
        legend=legend,
    )

    if not save_fig:
        return figure
    return figure, backend.save_fig(figure, base_path, plot_title)
--- a/quacc/plot/plotly.py
+++ b/quacc/plot/plotly.py
@ -8,10 +8,38 @@ import plotly.graph_objects as go
 from quacc.plot.base import BasePlot
 class PlotCfg:
    """Rendering options shared by the plotly plots.

    Holds the trace mode, the line width, the layout ``font`` and ``legend``
    dictionaries and the template name.
    """

    def __init__(self, mode, lwidth, font=None, legend=None, template="seaborn"):
        # None stands for "no overrides"; each instance then gets its own dict.
        if legend is None:
            legend = {}
        if font is None:
            font = {}

        self.mode, self.lwidth = mode, lwidth
        self.legend, self.font = legend, font
        self.template = template
 # Preset with lines plus markers and a line width of 2
 # (presumably meant for interactive/web display — confirm).
 web_cfg = PlotCfg("lines+markers", 2)
 # Preset with lines only, a line width of 5, size-24 fonts and a horizontal
 # legend (presumably meant for static PNG export — confirm).
 png_cfg = PlotCfg(
    "lines",
    5,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        xanchor="right",
        y=1.02,
        x=1,
        font=dict(size=24),
    ),
    font=dict(size=24),
    # template="ggplot2",
 )
 # Active preset: PlotlyPlot reads its template, font, legend, trace mode and
 # line width from this object.
 _cfg = png_cfg
 class PlotlyPlot(BasePlot):
    __themes = defaultdict(
        lambda: {
-            "template": "seaborn",
+            "template": _cfg.template,
        }
    )
    __themes = __themes | {
@ -35,7 +63,7 @@ class PlotlyPlot(BasePlot):
            case v if v > 10:
                __colors = plotly.colors.qualitative.Light24
            case _:
-                __colors = plotly.colors.qualitative.Plotly
+                __colors = plotly.colors.qualitative.G10
        def __generator(cs):
            while True:
@ -50,9 +78,8 @@ class PlotlyPlot(BasePlot):
            xaxis_title=x_label,
            yaxis_title=y_label,
            template=self.theme["template"],
-            font=dict(
+            font=_cfg.font,
-                size=18,
+            legend=_cfg.legend,
            ),
        )
    def save_fig(self, fig, base_path, title) -> Path:
@ -82,9 +109,9 @@ class PlotlyPlot(BasePlot):
                go.Scatter(
                    x=x,
                    y=delta,
-                    mode="lines+markers",
+                    mode=_cfg.mode,
                    name=name,
-                    line=dict(color=self.hex_to_rgb(color)),
+                    line=dict(color=self.hex_to_rgb(color), width=_cfg.lwidth),
                    hovertemplate="prev.: %{x}<br>error: %{y:,.4f}",
                )
            ]
@ -193,9 +220,9 @@ class PlotlyPlot(BasePlot):
                    x=x,
                    y=delta,
                    customdata=np.stack((counts[col_idx],), axis=-1),
-                    mode="lines+markers",
+                    mode=_cfg.mode,
                    name=name,
-                    line=dict(color=self.hex_to_rgb(color)),
+                    line=dict(color=self.hex_to_rgb(color), width=_cfg.lwidth),
                    hovertemplate="shift: %{x}<br>error: %{y}"
                    + "<br>count: %{customdata[0]}"
                    if counts is not None
@ -205,3 +232,29 @@ class PlotlyPlot(BasePlot):
        self.update_layout(fig, title, x_label, y_label)
        return fig
    def plot_fit_scores(
        self,
        train_prevs,
        scores,
        *,
        pos_class=1,
        title="default",
        x_label="prev.",
        y_label="position",
        legend=True,
    ) -> go.Figure:
        """Plot one line of ``scores`` against the training prevalences.

        Each prevalence is rendered as the string form of its tuple, so the
        x axis is categorical. ``pos_class`` and ``legend`` are accepted but
        not used here; the trace is always drawn as lines+markers without a
        legend entry.
        """
        prev_labels = [str(tuple(prev)) for prev in train_prevs]
        trace = go.Scatter(
            x=prev_labels,
            y=scores,
            mode="lines+markers",
            showlegend=False,
        )

        figure = go.Figure()
        figure.add_trace(trace)
        self.update_layout(figure, title, x_label, y_label)
        return figure
--- a/remote.log
+++ b/remote.log
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,116 @@
 abstention==0.1.3.1 ; python_version >= "3.10" and python_version < "4.0"
 ansi2html==1.9.1 ; python_version >= "3.10" and python_version < "4.0"
 appnope==0.1.3 ; python_version >= "3.10" and python_version < "4.0" and platform_system == "Darwin"
 asttokens==2.4.1 ; python_version >= "3.10" and python_version < "4.0"
 bcrypt==4.1.2 ; python_version >= "3.10" and python_version < "4.0"
 bleach==6.1.0 ; python_version >= "3.10" and python_version < "4.0"
 blinker==1.7.0 ; python_version >= "3.10" and python_version < "4.0"
 bokeh==3.3.4 ; python_version >= "3.10" and python_version < "4.0"
 certifi==2023.11.17 ; python_version >= "3.10" and python_version < "4.0"
 cffi==1.16.0 ; python_version >= "3.10" and python_version < "4.0"
 charset-normalizer==3.3.2 ; python_version >= "3.10" and python_version < "4.0"
 click==8.1.7 ; python_version >= "3.10" and python_version < "4.0"
 colorama==0.4.6 ; python_version >= "3.10" and python_version < "4" and sys_platform == "win32" or python_version >= "3.10" and python_version < "4" and platform_system == "Windows"
 comm==0.2.1 ; python_version >= "3.10" and python_version < "4.0"
 contourpy==1.2.0 ; python_version >= "3.10" and python_version < "4"
 coverage[toml]==7.4.1 ; python_version >= "3.10" and python_version < "4.0"
 cryptography==42.0.1 ; python_version >= "3.10" and python_version < "4.0"
 cycler==0.12.1 ; python_version >= "3.10" and python_version < "4"
 dash-bootstrap-components==1.5.0 ; python_version >= "3.10" and python_version < "4"
 dash-core-components==2.0.0 ; python_version >= "3.10" and python_version < "4.0"
 dash-html-components==2.0.0 ; python_version >= "3.10" and python_version < "4.0"
 dash-table==5.0.0 ; python_version >= "3.10" and python_version < "4.0"
 dash==2.14.2 ; python_version >= "3.10" and python_version < "4.0"
 debugpy==1.8.0 ; python_version >= "3.10" and python_version < "4.0"
 decorator==5.1.1 ; python_version >= "3.10" and python_version < "4.0"
 exceptiongroup==1.2.0 ; python_version >= "3.10" and python_version < "3.11"
 executing==2.0.1 ; python_version >= "3.10" and python_version < "4.0"
 flask==3.0.1 ; python_version >= "3.10" and python_version < "4.0"
 fonttools==4.47.2 ; python_version >= "3.10" and python_version < "4"
 gunicorn==21.2.0 ; python_version >= "3.10" and python_version < "4.0"
 idna==3.6 ; python_version >= "3.10" and python_version < "4.0"
 importlib-metadata==7.0.1 ; python_version >= "3.10" and python_version < "4.0"
 iniconfig==2.0.0 ; python_version >= "3.10" and python_version < "4.0"
 ipykernel==6.29.0 ; python_version >= "3.10" and python_version < "4.0"
 ipympl==0.9.3 ; python_version >= "3.10" and python_version < "4.0"
 ipython-genutils==0.2.0 ; python_version >= "3.10" and python_version < "4.0"
 ipython==8.20.0 ; python_version >= "3.10" and python_version < "4.0"
 ipywidgets-bokeh==1.5.0 ; python_version >= "3.10" and python_version < "4.0"
 ipywidgets==8.1.1 ; python_version >= "3.10" and python_version < "4.0"
 itsdangerous==2.1.2 ; python_version >= "3.10" and python_version < "4.0"
 jedi==0.19.1 ; python_version >= "3.10" and python_version < "4.0"
 jinja2==3.1.3 ; python_version >= "3.10" and python_version < "4.0"
 joblib==1.3.2 ; python_version >= "3.10" and python_version < "4"
 jupyter-client==8.6.0 ; python_version >= "3.10" and python_version < "4.0"
 jupyter-core==5.7.1 ; python_version >= "3.10" and python_version < "4.0"
 jupyterlab-widgets==3.0.9 ; python_version >= "3.10" and python_version < "4.0"
 kiwisolver==1.4.5 ; python_version >= "3.10" and python_version < "4"
 linkify-it-py==2.0.2 ; python_version >= "3.10" and python_version < "4.0"
 logging==0.4.9.6 ; python_version >= "3.10" and python_version < "4.0"
 markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "4.0"
 markdown==3.5.2 ; python_version >= "3.10" and python_version < "4.0"
 markupsafe==2.1.4 ; python_version >= "3.10" and python_version < "4.0"
 matplotlib-inline==0.1.6 ; python_version >= "3.10" and python_version < "4.0"
 matplotlib==3.8.2 ; python_version >= "3.10" and python_version < "4"
 mdit-py-plugins==0.4.0 ; python_version >= "3.10" and python_version < "4.0"
 mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
 nest-asyncio==1.6.0 ; python_version >= "3.10" and python_version < "4.0"
 numpy==1.26.3 ; python_version >= "3.10" and python_version < "4.0"
 packaging==23.2 ; python_version >= "3.10" and python_version < "4.0"
 pandas-stubs==2.1.4.231227 ; python_version >= "3.10" and python_version < "4.0"
 pandas==2.2.0 ; python_version >= "3.10" and python_version < "4.0"
 panel==1.3.8 ; python_version >= "3.10" and python_version < "4.0"
 param==2.0.2 ; python_version >= "3.10" and python_version < "4.0"
 paramiko==3.4.0 ; python_version >= "3.10" and python_version < "4.0"
 parso==0.8.3 ; python_version >= "3.10" and python_version < "4.0"
 pexpect==4.9.0 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "win32"
 pillow==10.2.0 ; python_version >= "3.10" and python_version < "4.0"
 platformdirs==4.1.0 ; python_version >= "3.10" and python_version < "4.0"
 plotly==5.18.0 ; python_version >= "3.10" and python_version < "4.0"
 pluggy==1.4.0 ; python_version >= "3.10" and python_version < "4.0"
 prompt-toolkit==3.0.43 ; python_version >= "3.10" and python_version < "4.0"
 psutil==5.9.8 ; python_version >= "3.10" and python_version < "4.0"
 ptyprocess==0.7.0 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "win32"
 pure-eval==0.2.2 ; python_version >= "3.10" and python_version < "4.0"
 pyarrow==15.0.0 ; python_version >= "3.10" and python_version < "4.0"
 pycparser==2.21 ; python_version >= "3.10" and python_version < "4.0"
 pygments==2.17.2 ; python_version >= "3.10" and python_version < "4.0"
 pylance==0.5.10 ; python_version >= "3.10" and python_version < "4.0"
 pynacl==1.5.0 ; python_version >= "3.10" and python_version < "4.0"
 pyparsing==3.1.1 ; python_version >= "3.10" and python_version < "4"
 pytest-cov==4.1.0 ; python_version >= "3.10" and python_version < "4.0"
 pytest-mock==3.12.0 ; python_version >= "3.10" and python_version < "4.0"
 pytest==8.0.0 ; python_version >= "3.10" and python_version < "4.0"
 python-dateutil==2.8.2 ; python_version >= "3.10" and python_version < "4.0"
 pytz==2023.4 ; python_version >= "3.10" and python_version < "4.0"
 pyviz-comms==3.0.1 ; python_version >= "3.10" and python_version < "4.0"
 pywin32==306 ; sys_platform == "win32" and platform_python_implementation != "PyPy" and python_version >= "3.10" and python_version < "4.0"
 pyyaml==6.0.1 ; python_version >= "3.10" and python_version < "4.0"
 pyzmq==25.1.2 ; python_version >= "3.10" and python_version < "4.0"
 quapy==0.1.7 ; python_version >= "3.10" and python_version < "4"
 requests==2.31.0 ; python_version >= "3.10" and python_version < "4.0"
 retrying==1.3.4 ; python_version >= "3.10" and python_version < "4.0"
 scikit-learn==1.4.0 ; python_version >= "3.10" and python_version < "4"
 scipy==1.12.0 ; python_version >= "3.10" and python_version < "4.0"
 setuptools==69.0.3 ; python_version >= "3.10" and python_version < "4.0"
 six==1.16.0 ; python_version >= "3.10" and python_version < "4.0"
 stack-data==0.6.3 ; python_version >= "3.10" and python_version < "4.0"
 tabulate==0.9.0 ; python_version >= "3.10" and python_version < "4.0"
 tenacity==8.2.3 ; python_version >= "3.10" and python_version < "4.0"
 threadpoolctl==3.2.0 ; python_version >= "3.10" and python_version < "4"
 tomli==2.0.1 ; python_version >= "3.10" and python_full_version <= "3.11.0a6"
 tornado==6.4 ; python_version >= "3.10" and python_version < "4.0"
 tqdm==4.66.1 ; python_version >= "3.10" and python_version < "4"
 traitlets==5.14.1 ; python_version >= "3.10" and python_version < "4.0"
 types-pytz==2023.4.0.20240130 ; python_version >= "3.10" and python_version < "4.0"
 typing-extensions==4.9.0 ; python_version >= "3.10" and python_version < "4.0"
 tzdata==2023.4 ; python_version >= "3.10" and python_version < "4.0"
 uc-micro-py==1.0.2 ; python_version >= "3.10" and python_version < "4.0"
 urllib3==2.1.0 ; python_version >= "3.10" and python_version < "4.0"
 wcwidth==0.2.13 ; python_version >= "3.10" and python_version < "4.0"
 webencodings==0.5.1 ; python_version >= "3.10" and python_version < "4.0"
 werkzeug==3.0.1 ; python_version >= "3.10" and python_version < "4.0"
 widgetsnbextension==4.0.9 ; python_version >= "3.10" and python_version < "4.0"
 xlrd==2.0.1 ; python_version >= "3.10" and python_version < "4"
 xyzservices==2023.10.1 ; python_version >= "3.10" and python_version < "4.0"
 zipp==3.17.0 ; python_version >= "3.10" and python_version < "4.0"
--- a/test_postprocess.py
+++ b/test_postprocess.py
@ -0,0 +1,9 @@
 from quacc.evaluation.report import DatasetReport
 # Ad-hoc inspection: load a pickled dataset report and print, per report,
 # its training prevalence, the fit scores of the selected estimators and
 # the matching rows of the accuracy data.
 dr = DatasetReport.unpickle("output/main/imdb/imdb.pickle")
 _estimators = ["sld_lr_gs", "bin_sld_lr_gs", "mul_sld_lr_gs", "m3w_sld_lr_gs"]
 _data = dr.data(metric="acc", estimators=_estimators)
 # Walk the values of `_data.index.unique(0)` in lockstep with `dr.crs`
 # taken in reverse order.
 for level0_key, report in zip(_data.index.unique(0), dr.crs[::-1]):
    print(report.train_prev)
    # Keep only the fit scores whose key is one of the selected estimators.
    selected_scores = {
        name: score for name, score in report.fit_scores.items() if name in _estimators
    }
    print(selected_scores)
    # Select every row whose first index level equals `level0_key`.
    row_selector = (level0_key, slice(None), slice(None))
    print(_data.loc[row_selector, :])