docker merged

2024-02-03 12:36:41 +01:00 · 2024-02-03 12:36:41 +01:00 · 5d82419ce8
parent 921caaf426 feb9e0a59b
commit 5d82419ce8
15 changed files with 318 additions and 31 deletions
--- a/conf.yaml
+++ b/conf.yaml
@ -72,6 +72,10 @@ test_conf: &test_conf
 main:
  confs: &main_confs
    - DATASET_NAME: imdb
+<<<<<<< HEAD
+=======
+  other_confs:
+>>>>>>> docker
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
    - DATASET_NAME: rcv1
@ -338,6 +342,43 @@ d_kde_rbf_conf: &d_kde_rbf_conf
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT

+cc_lr_conf: &cc_lr_conf
+  global:
+    METRICS: 
+      - acc
+      - f1
+    OUT_DIR_NAME: output/cc_lr
+    DATASET_N_PREVS: 9
+    COMP_ESTIMATORS:
+      # - bin_cc_lr
+      # - mul_cc_lr
+      # - m3w_cc_lr
+      # - bin_cc_lr_c
+      # - mul_cc_lr_c
+      # - m3w_cc_lr_c
+      # - bin_cc_lr_mc
+      # - mul_cc_lr_mc
+      # - m3w_cc_lr_mc
+      # - bin_cc_lr_ne
+      # - mul_cc_lr_ne
+      # - m3w_cc_lr_ne
+      # - bin_cc_lr_is
+      # - mul_cc_lr_is
+      # - m3w_cc_lr_is
+      # - bin_cc_lr_a
+      # - mul_cc_lr_a
+      # - m3w_cc_lr_a
+      - bin_cc_lr_gs
+      - mul_cc_lr_gs
+      - m3w_cc_lr_gs
+    N_JOBS: -2
+
+  confs: *main_confs
+  other_confs:
+    - DATASET_NAME: imdb
+    - DATASET_NAME: rcv1
+      DATASET_TARGET: CCAT
+
 baselines_conf: &baselines_conf
  global:
    METRICS: 
@ -349,9 +390,12 @@ baselines_conf: &baselines_conf
      - doc
      - atc_mc
      - naive
+<<<<<<< HEAD
      # - mandoline
      # - rca
      # - rca_star
+=======
+>>>>>>> docker
    N_JOBS: -2

  confs: *main_confs
@ -389,22 +433,34 @@ timing_conf: &timing_conf
      - bin_kde_lr_a
      - mul_kde_lr_a
      - m3w_kde_lr_a
+      - doc 
+      - atc_mc
+      - rca
+      - rca_star
+      - mandoline
+      - naive
+    N_JOBS: 1
+    PROTOCOL_REPEATS: 1
+
+  confs: *main_confs
+
+timing_gs_conf: &timing_gs_conf
+  global:
+    METRICS:
+      - acc
+      - f1
+    OUT_DIR_NAME: output/timing_gs
+    DATASET_N_PREVS: 1
+    COMP_ESTIMATORS:
      - bin_sld_lr_gs
      - mul_sld_lr_gs
      - m3w_sld_lr_gs
      - bin_kde_lr_gs
      - mul_kde_lr_gs
      - m3w_kde_lr_gs
-      - doc 
-      - atc_mc
-      - rca
-      - rca_star
-      - mandoline
-    N_JOBS: 1
-    PROTOCOL_N_PREVS: 1,
-    PROTOCOL_REPEATS: 1,
-    SAMPLE_SIZE: 1000,
+    N_JOBS: -1
+    PROTOCOL_REPEATS: 1

  confs: *main_confs

-exec: *baselines_conf
+exec: *timing_gs_conf
--- a/copy_res.sh
+++ b/copy_res.sh
@ -0,0 +1,9 @@
+#!/bin/bash
+
+# scp -r andreaesuli@edge-nd1.isti.cnr.it:/home/andreaesuli/raid/lorenzo/output/kde_lr_gs ./output/
+# scp -r andreaesuli@edge-nd1.isti.cnr.it:/home/andreaesuli/raid/lorenzo/output/cc_lr ./output/
+scp -r andreaesuli@edge-nd1.isti.cnr.it:/home/andreaesuli/raid/lorenzo/output/baselines ./output/
+
+# scp -r ./output/kde_lr_gs volpi@ilona.isti.cnr.it:/home/volpi/tesi/output/
+# scp -r ./output/cc_lr volpi@ilona.isti.cnr.it:/home/volpi/tesi/output/
+scp -r ./output/baselines volpi@ilona.isti.cnr.it:/home/volpi/tesi/output/
--- a/2
+++ b/2
@ -3,6 +3,8 @@
 if [[ "${1}" == "r" ]]; then
 	scp volpi@ilona.isti.cnr.it:~/tesi/quacc.log ~/tesi/remote.log &>/dev/null
 	ssh volpi@ilona.isti.cnr.it tail -n 500 -f /home/volpi/tesi/quacc.log | bat -P --language=log
+elif [[ "${1}" == "d" ]]; then
+	ssh andreaesuli@edge-nd1.isti.cnr.it tail -n 500 -f /home/andreaesuli/raid/lorenzo/quacc.log | bat -P --language=log
 else
 	tail -n 500 -f /home/lorev/tesi/quacc.log | bat --paging=never --language log
 fi
--- a/qcdash/app.py
+++ b/qcdash/app.py
@ -13,7 +13,7 @@ from dash import Dash, Input, Output, State, callback, ctx, dash_table, dcc, htm
 from dash.dash_table.Format import Align, Format, Scheme

 from quacc import plot
-from quacc.evaluation.estimators import CE
+from quacc.evaluation.estimators import CE, _renames
 from quacc.evaluation.report import CompReport, DatasetReport
 from quacc.evaluation.stats import wilcoxon

@ -26,6 +26,23 @@ def _get_prev_str(prev: np.ndarray):
    return str(tuple(np.around(prev, decimals=2)))


+def rename_estimators(estimators, rev=False):
+    _rnm = _renames
+    if rev:
+        _rnm = {v: k for k, v in _renames.items()}
+
+    new_estimators = []
+    for c in estimators:
+        nc = c
+        for old, new in _rnm.items():
+            if c.startswith(old):
+                nc = new + c[len(old) :]
+
+        new_estimators.append(nc)
+
+    return new_estimators
+
+
 def get_datasets(root: str | Path) -> List[DatasetReport]:
    def load_dataset(dataset):
        dataset = Path(dataset)
@ -153,7 +170,7 @@ def get_DataTable(df, mode):
    columns = {
        c: dict(
            id=c,
-            name=_index_name[mode] if c == "index" else c,
+            name=_index_name[mode] if c == "index" else rename_estimators([c])[0],
            type="numeric",
            format=columns_format,
        )
@ -412,12 +429,13 @@ def update_estimators(href, dataset, metric, curr_estimators, root):
            old_estimators = json.loads(old_estimators)
        except JSONDecodeError:
            old_estimators = []
+    old_estimators = rename_estimators(old_estimators, rev=True)
    valid_estimators: np.ndarray = dr.data(metric=metric).columns.unique(0).to_numpy()
    new_estimators = valid_estimators[
        np.isin(valid_estimators, old_estimators)
    ].tolist()
    valid_estimators = CE.name.sort(valid_estimators.tolist())
-    return valid_estimators, new_estimators
+    return rename_estimators(valid_estimators), rename_estimators(new_estimators)


@callback(
@ -473,6 +491,7 @@ def update_content(dataset, metric, estimators, view, mode, root):
        quote_via=quote,
    )
    dr = get_dr(root, dataset)
+    estimators = rename_estimators(estimators, rev=True)
    match mode:
        case m if m.endswith("table"):
            df = get_table(
--- a/quacc/dataset.py
+++ b/quacc/dataset.py
@ -126,7 +126,9 @@ class DatasetProvider:

    # provare min_df=5
    def __imdb(self, **kwargs):
-        return qp.datasets.fetch_reviews("imdb", tfidf=True, min_df=3).train_test
+        return qp.datasets.fetch_reviews(
+            "imdb", data_home="./quapy_data", tfidf=True, min_df=3
+        ).train_test

    def __rcv1(self, target, **kwargs):
        n_train = 23149
@ -135,7 +137,7 @@ class DatasetProvider:
        if target is None or target not in available_targets:
            raise ValueError(f"Invalid target {target}")

-        dataset = fetch_rcv1()
+        dataset = fetch_rcv1(data_home="./scikit_learn_data")
        target_index = np.where(dataset.target_names == target)[0]
        all_train_d = dataset.data[:n_train, :]
        test_d = dataset.data[n_train:, :]
--- a/quacc/evaluation/baseline.py
+++ b/quacc/evaluation/baseline.py
@ -85,14 +85,14 @@ def naive(
    report = EvaluationReport(name="naive")
    for test in protocol():
        test_preds = c_model_predict(test.X)
-        acc_score = metrics.accuracy_score(test.y, test_preds)
-        f1_score = metrics.f1_score(test.y, test_preds, average=f1_average)
-        meta_acc = abs(val_acc - acc_score)
-        meta_f1 = abs(val_f1 - f1_score)
+        test_acc = metrics.accuracy_score(test.y, test_preds)
+        test_f1 = metrics.f1_score(test.y, test_preds, average=f1_average)
+        meta_acc = abs(val_acc - test_acc)
+        meta_f1 = abs(val_f1 - test_f1)
        report.append_row(
            test.prevalence(),
-            acc_score=acc_score,
-            f1_score=f1_score,
+            acc_score=val_acc,
+            f1_score=val_f1,
            acc=meta_acc,
            f1=meta_f1,
        )
--- a/quacc/evaluation/estimators.py
+++ b/quacc/evaluation/estimators.py
@ -78,3 +78,33 @@ class CompEstimator:


 CE = CompEstimator()
+
+# Display labels for estimator names, keyed by internal name *prefix*.
+# The dashboard (qcdash/app.py) and the plotly backend (quacc/plot/plotly.py)
+# match these keys with ``str.startswith`` and keep whatever follows the
+# prefix (e.g. "_gs") as a suffix of the display label.
+# NOTE(review): the "d_*_sld_rbf" entries map to labels without the "d_"
+# marker, unlike the "d_*_sld_lr" / "d_*_kde_lr" entries - confirm intended.
+_renames = {
+    "bin_sld_lr": "(2x2)_SLD_LR",
+    "mul_sld_lr": "(1x4)_SLD_LR",
+    "m3w_sld_lr": "(1x3)_SLD_LR",
+    "d_bin_sld_lr": "d_(2x2)_SLD_LR",
+    "d_mul_sld_lr": "d_(1x4)_SLD_LR",
+    "d_m3w_sld_lr": "d_(1x3)_SLD_LR",
+    "d_bin_sld_rbf": "(2x2)_SLD_RBF",
+    "d_mul_sld_rbf": "(1x4)_SLD_RBF",
+    "d_m3w_sld_rbf": "(1x3)_SLD_RBF",
+    "sld_lr": "SLD_LR",
+    "bin_kde_lr": "(2x2)_KDEy_LR",
+    "mul_kde_lr": "(1x4)_KDEy_LR",
+    "m3w_kde_lr": "(1x3)_KDEy_LR",
+    "d_bin_kde_lr": "d_(2x2)_KDEy_LR",
+    "d_mul_kde_lr": "d_(1x4)_KDEy_LR",
+    "d_m3w_kde_lr": "d_(1x3)_KDEy_LR",
+    "bin_cc_lr": "(2x2)_CC_LR",
+    "mul_cc_lr": "(1x4)_CC_LR",
+    "m3w_cc_lr": "(1x3)_CC_LR",
+    "kde_lr": "KDEy_LR",
+    "cc_lr": "CC_LR",
+    "atc_mc": "ATC",
+    "doc": "DoC",
+    "mandoline": "Mandoline",
+    "rca": "RCA",
+    "rca_star": "RCA*",
+    "naive": "Naive",
+}
--- a/quacc/evaluation/method.py
+++ b/quacc/evaluation/method.py
@ -3,7 +3,7 @@ from typing import Callable, List, Union

 import numpy as np
 from matplotlib.pylab import rand
-from quapy.method.aggregative import PACC, SLD, BaseQuantifier
+from quapy.method.aggregative import CC, PACC, SLD, BaseQuantifier
 from quapy.protocol import UPP, AbstractProtocol, OnLabelledCollectionProtocol
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import SVC, LinearSVC
@ -53,6 +53,17 @@ def _param_grid(method, X_fit: np.ndarray):
                "q__classifier__class_weight": [None, "balanced"],
                "confidence": [None, ["isoft"], ["max_conf", "entropy"]],
            }
+        case "cc_lr":
+            return {
+                "q__classifier__C": np.logspace(-3, 3, 7),
+                "q__classifier__class_weight": [None, "balanced"],
+                "confidence": [
+                    None,
+                    ["isoft"],
+                    ["max_conf", "entropy"],
+                    ["max_conf", "entropy", "isoft"],
+                ],
+            }
        case "kde_lr":
            return {
                "q__classifier__C": np.logspace(-3, 3, 7),
@ -219,6 +230,10 @@ def __pacc_lr():
    return PACC(LogisticRegression())


+def __cc_lr():
+    """Return a new ``CC`` quantifier wrapping a default ``LogisticRegression``."""
+    return CC(LogisticRegression())
+
+
 # fmt: off

 __sld_lr_set = [
@ -380,9 +395,9 @@ __kde_lr_set = [
    M("mul_kde_lr_a",  __kde_lr(), "mul", conf=["max_conf", "entropy", "isoft"],         ),
    M("m3w_kde_lr_a",  __kde_lr(), "mul", conf=["max_conf", "entropy", "isoft"],  cf=True),
    # gs kde
-    G("bin_kde_lr_gs", __kde_lr(), "bin", pg="kde_lr", search="spider"         ),
-    G("mul_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="spider"         ),
-    G("m3w_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="spider", cf=True),
+    G("bin_kde_lr_gs", __kde_lr(), "bin", pg="kde_lr", search="grid"         ),
+    G("mul_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="grid"         ),
+    G("m3w_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="grid", cf=True),
    E("kde_lr_gs"),
 ]

@ -448,6 +463,37 @@ __dense_kde_rbf_set = [
    G("d_m3w_kde_rbf_gs", __kde_rbf(), "mul", d=True, pg="kde_rbf", search="spider", cf=True),
 ]

+__cc_lr_set = [
+    # base cc
+    M("bin_cc_lr",    __cc_lr(), "bin"                                       ),
+    M("mul_cc_lr",    __cc_lr(), "mul"                                       ),
+    M("m3w_cc_lr",    __cc_lr(), "mul",                               cf=True),
+    # max_conf + entropy cc
+    M("bin_cc_lr_c",  __cc_lr(), "bin", conf=["max_conf", "entropy"]         ),
+    M("mul_cc_lr_c",  __cc_lr(), "mul", conf=["max_conf", "entropy"]         ),
+    M("m3w_cc_lr_c",  __cc_lr(), "mul", conf=["max_conf", "entropy"], cf=True),
+    # max_conf cc
+    M("bin_cc_lr_mc", __cc_lr(), "bin", conf="max_conf",                     ),
+    M("mul_cc_lr_mc", __cc_lr(), "mul", conf="max_conf",                     ),
+    M("m3w_cc_lr_mc", __cc_lr(), "mul", conf="max_conf",              cf=True),
+    # entropy cc
+    M("bin_cc_lr_ne", __cc_lr(), "bin", conf="entropy",                      ),
+    M("mul_cc_lr_ne", __cc_lr(), "mul", conf="entropy",                      ),
+    M("m3w_cc_lr_ne", __cc_lr(), "mul", conf="entropy",               cf=True),
+    # inverse softmax cc
+    M("bin_cc_lr_is", __cc_lr(), "bin", conf="isoft",                        ),
+    M("mul_cc_lr_is", __cc_lr(), "mul", conf="isoft",                        ),
+    M("m3w_cc_lr_is", __cc_lr(), "mul", conf="isoft",                 cf=True),
+    # cc all
+    M("bin_cc_lr_a",  __cc_lr(), "bin", conf=["max_conf", "entropy", "isoft"],         ),
+    M("mul_cc_lr_a",  __cc_lr(), "mul", conf=["max_conf", "entropy", "isoft"],         ),
+    M("m3w_cc_lr_a",  __cc_lr(), "mul", conf=["max_conf", "entropy", "isoft"],  cf=True),
+    # gs cc
+    G("bin_cc_lr_gs", __cc_lr(), "bin", pg="cc_lr", search="grid"         ),
+    G("mul_cc_lr_gs", __cc_lr(), "mul", pg="cc_lr", search="grid"         ),
+    G("m3w_cc_lr_gs", __cc_lr(), "mul", pg="cc_lr", search="grid", cf=True),
+    E("cc_lr_gs"),
+]

 # fmt: on

@ -458,6 +504,8 @@ __methods_set = (
    + __kde_lr_set
    + __dense_kde_lr_set
    + __dense_kde_rbf_set
+    + __cc_lr_set
+    + [E("QuAcc")]
 )

 _methods = {m.name: m for m in __methods_set}
--- a/quacc/evaluation/report.py
+++ b/quacc/evaluation/report.py
@ -140,6 +140,19 @@ class CompReport:
                "mul_kde_lr_gs",
                "m3w_kde_lr_gs",
            ],
+            "cc_lr_gs": [
+                "bin_cc_lr_gs",
+                "mul_cc_lr_gs",
+                "m3w_cc_lr_gs",
+            ],
+            "QuAcc": [
+                "bin_sld_lr_gs",
+                "mul_sld_lr_gs",
+                "m3w_sld_lr_gs",
+                "bin_kde_lr_gs",
+                "mul_kde_lr_gs",
+                "m3w_kde_lr_gs",
+            ],
        }

        for name, methods in _mapping.items():
--- a/quacc/evaluation/stats.py
+++ b/quacc/evaluation/stats.py
@ -25,6 +25,7 @@ def wilcoxon(
 ) -> pd.DataFrame:
    _data = r.data(metric, estimators)

+    _data = _data.dropna(axis=0, how="any")
    _wilcoxon = {}
    for est in _data.columns.unique(0):
        _wilcoxon[est] = [
--- a/quacc/plot/plot.py
+++ b/quacc/plot/plot.py
@ -39,8 +39,16 @@ def plot_delta(
    else:
        title = f"{_base_title}_{name}_avg_{avg}_{metric}"

-    x_label = f"{'test' if avg is None or avg == 'train' else 'train'} prevalence"
-    y_label = f"{metric} error"
+    if avg is None or avg == "train":
+        x_label = "Test Prevalence"
+    else:
+        x_label = "Train Prevalence"
+    if metric == "acc":
+        y_label = "Prediction Error for Vanilla Accuracy"
+    elif metric == "f1":
+        y_label = "Prediction Error for F1"
+    else:
+        y_label = f"{metric} error"
    fig = backend.plot_delta(
        base_prevs,
        columns,
@ -81,8 +89,12 @@ def plot_diagonal(
    else:
        title = f"diagonal_{name}_{metric}"

-    x_label = f"true {metric}"
-    y_label = f"estim. {metric}"
+    if metric == "acc":
+        x_label = "True Vanilla Accuracy"
+        y_label = "Estimated Vanilla Accuracy"
+    else:
+        x_label = f"true {metric}"
+        y_label = f"estim. {metric}"
    fig = backend.plot_diagonal(
        reference,
        columns,
@ -123,8 +135,13 @@ def plot_shift(
    else:
        title = f"shift_{name}_avg_{metric}"

-    x_label = "dataset shift"
-    y_label = f"{metric} error"
+    x_label = "Amount of Prior Probability Shift"
+    if metric == "acc":
+        y_label = "Prediction Error for Vanilla Accuracy"
+    elif metric == "f1":
+        y_label = "Prediction Error for F1"
+    else:
+        y_label = f"{metric} error"
    fig = backend.plot_shift(
        shift_prevs,
        columns,
--- a/quacc/plot/plotly.py
+++ b/quacc/plot/plotly.py
@ -5,6 +5,7 @@ import numpy as np
 import plotly
 import plotly.graph_objects as go

+from quacc.evaluation.estimators import _renames
 from quacc.plot.base import BasePlot


@ -50,6 +51,7 @@ class PlotlyPlot(BasePlot):

    def __init__(self, theme=None):
        self.theme = PlotlyPlot.__themes[theme]
+        self.rename = True

    def hex_to_rgb(self, hex: str, t: float | None = None):
        hex = hex.lstrip("#")
@ -85,6 +87,24 @@ class PlotlyPlot(BasePlot):
    def save_fig(self, fig, base_path, title) -> Path:
        return None

+    def rename_plots(
+        self,
+        columns,
+    ):
+        if not self.rename:
+            return columns
+
+        new_columns = []
+        for c in columns:
+            nc = c
+            for old, new in _renames.items():
+                if c.startswith(old):
+                    nc = new + c[len(old) :]
+
+            new_columns.append(nc)
+
+        return np.array(new_columns)
+
    def plot_delta(
        self,
        base_prevs,
@ -102,6 +122,7 @@ class PlotlyPlot(BasePlot):
        if isinstance(base_prevs[0], float):
            base_prevs = np.around([(1 - bp, bp) for bp in base_prevs], decimals=4)
        x = [str(tuple(bp)) for bp in base_prevs]
+        columns = self.rename_plots(columns)
        line_colors = self.get_colors(len(columns))
        for name, delta in zip(columns, data):
            color = next(line_colors)
@ -150,6 +171,7 @@ class PlotlyPlot(BasePlot):
    ) -> go.Figure:
        fig = go.Figure()
        x = reference
+        columns = self.rename_plots(columns)
        line_colors = self.get_colors(len(columns))

        _edges = (np.min([np.min(x), np.min(data)]), np.max([np.max(x), np.max(data)]))
@ -211,6 +233,7 @@ class PlotlyPlot(BasePlot):
        fig = go.Figure()
        # x = shift_prevs[:, pos_class]
        x = shift_prevs
+        columns = self.rename_plots(columns)
        line_colors = self.get_colors(len(columns))
        for name, delta in zip(columns, data):
            col_idx = (columns == name).nonzero()[0][0]
--- a/rates.md
+++ b/rates.md
@ -0,0 +1,15 @@
+# Additional covariates percentage
+
+Rate of usage of additional covariates, recalibration and "balanced" class_weight
+during grid search:
+
+| method          | av %   | recalib % | rebalance % |
+| --------------: | :----: | :-------: | :---------: |
+| imdb_sld_lr     | 81.49% | 77.78%    | 59.26%      |
+| imdb_kde_lr     | 71.43% | NA        | 88.18%      |
+| rcv1_CCAT_sld_lr| 62.97% | 70.38%    | 77.78%      |
+| rcv1_CCAT_kde_lr| 78.06% | NA        | 84.82%      |
+| rcv1_GCAT_sld_lr| 76.93% | 61.54%    | 65.39%      |
+| rcv1_GCAT_kde_lr| 71.36% | NA        | 78.65%      |
+| rcv1_MCAT_sld_lr| 62.97% | 48.15%    | 74.08%      |
+| rcv1_MCAT_kde_lr| 71.03% | NA        | 68.70%      |
--- a/run.py
+++ b/run.py
@ -15,3 +15,7 @@ def run():
        run_local()
    elif args.remote:
        run_remote(detatch=args.detatch)
+
+
+if __name__ == "__main__":
+    run()
--- a/selected_gs.py
+++ b/selected_gs.py
@ -0,0 +1,48 @@
+import numpy as np
+
+from quacc.evaluation.report import DatasetReport
+
+datasets = [
+    "imdb/imdb.pickle",
+    "rcv1_CCAT/rcv1_CCAT.pickle",
+    "rcv1_GCAT/rcv1_GCAT.pickle",
+    "rcv1_MCAT/rcv1_MCAT.pickle",
+]
+
+gs = {
+    "sld_lr_gs": [
+        "bin_sld_lr_gs",
+        "mul_sld_lr_gs",
+        "m3w_sld_lr_gs",
+    ],
+    "kde_lr_gs": [
+        "bin_kde_lr_gs",
+        "mul_kde_lr_gs",
+        "m3w_kde_lr_gs",
+    ],
+}
+
+for dst in datasets:
+    dr = DatasetReport.unpickle("output/main/" + dst)
+    print(f"{dst}\n")
+    for name, methods in gs.items():
+        print(f"{name}")
+        sel_methods = [
+            {k: v for k, v in cr.fit_scores.items() if k in methods} for cr in dr.crs
+        ]
+
+        best_methods = [
+            list(ms.keys())[np.argmin(list(ms.values()))] for ms in sel_methods
+        ]
+        m_cnt = []
+        for m in methods:
+            m_cnt.append((np.array(best_methods) == m).nonzero()[0].shape[0])
+        m_cnt = np.array(m_cnt)
+        m_freq = m_cnt / len(best_methods)
+
+        for n in methods:
+            print(n, end="\t")
+        print()
+        for v in m_freq:
+            print(f"{v*100:.2f}", end="\t")
+        print("\n\n")