docker merged

This commit is contained in:
Lorenzo Volpi 2024-02-03 12:36:41 +01:00
commit 5d82419ce8
15 changed files with 318 additions and 31 deletions

View File

@ -72,6 +72,10 @@ test_conf: &test_conf
main:
confs: &main_confs
- DATASET_NAME: imdb
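# FIXME(review): unresolved merge conflict committed below (HEAD vs. docker).
# Decide whether the rcv1 entries that follow stay in `confs` (HEAD) or move
# under `other_confs` (docker), then delete the conflict markers.
<<<<<<< HEAD
=======
other_confs:
>>>>>>> docker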
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
- DATASET_NAME: rcv1
@ -338,6 +342,43 @@ d_kde_rbf_conf: &d_kde_rbf_conf
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
# Experiment configuration for the cc_lr estimator family.
# Results are written under output/cc_lr and the datasets are taken from the
# shared `main_confs` anchor. Only the grid-search (`*_gs`) estimators are
# enabled; the remaining variants are kept commented out as toggles.
# NOTE(review): `other_confs` is not referenced by any alias in this view —
# presumably a parking place for dataset entries that are not run; confirm.
cc_lr_conf: &cc_lr_conf
global:
METRICS:
- acc
- f1
OUT_DIR_NAME: output/cc_lr
DATASET_N_PREVS: 9
COMP_ESTIMATORS:
# - bin_cc_lr
# - mul_cc_lr
# - m3w_cc_lr
# - bin_cc_lr_c
# - mul_cc_lr_c
# - m3w_cc_lr_c
# - bin_cc_lr_mc
# - mul_cc_lr_mc
# - m3w_cc_lr_mc
# - bin_cc_lr_ne
# - mul_cc_lr_ne
# - m3w_cc_lr_ne
# - bin_cc_lr_is
# - mul_cc_lr_is
# - m3w_cc_lr_is
# - bin_cc_lr_a
# - mul_cc_lr_a
# - m3w_cc_lr_a
- bin_cc_lr_gs
- mul_cc_lr_gs
- m3w_cc_lr_gs
N_JOBS: -2
confs: *main_confs
other_confs:
- DATASET_NAME: imdb
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
baselines_conf: &baselines_conf
global:
METRICS:
@ -349,9 +390,12 @@ baselines_conf: &baselines_conf
- doc
- atc_mc
- naive
# - mandoline
# - rca
# - rca_star
N_JOBS: -2
confs: *main_confs
@ -389,22 +433,34 @@ timing_conf: &timing_conf
- bin_kde_lr_a
- mul_kde_lr_a
- m3w_kde_lr_a
- doc
- atc_mc
- rca
- rca_star
- mandoline
- naive
N_JOBS: 1
PROTOCOL_REPEATS: 1
confs: *main_confs
timing_gs_conf: &timing_gs_conf
global:
METRICS:
- acc
- f1
OUT_DIR_NAME: output/timing_gs
DATASET_N_PREVS: 1
COMP_ESTIMATORS:
- bin_sld_lr_gs
- mul_sld_lr_gs
- m3w_sld_lr_gs
- bin_kde_lr_gs
- mul_kde_lr_gs
- m3w_kde_lr_gs
- doc
- atc_mc
- rca
- rca_star
- mandoline
N_JOBS: 1
PROTOCOL_N_PREVS: 1,
PROTOCOL_REPEATS: 1,
SAMPLE_SIZE: 1000,
N_JOBS: -1
PROTOCOL_REPEATS: 1
confs: *main_confs
exec: *baselines_conf
exec: *timing_gs_conf

9
copy_res.sh Executable file
View File

@ -0,0 +1,9 @@
#!/bin/bash
# Copy experiment results from edge-nd1 into ./output/ and then push the
# local copy to ilona.
#
# Usage: copy_res.sh [EXPERIMENT...]
#   EXPERIMENT  name of a directory under the remote output/ folder
#               (e.g. kde_lr_gs, cc_lr). Defaults to "baselines".

# Stop at the first failed transfer so a missing or partial download is never
# pushed to the destination host.
set -euo pipefail

readonly SRC="andreaesuli@edge-nd1.isti.cnr.it:/home/andreaesuli/raid/lorenzo/output"
readonly DST="volpi@ilona.isti.cnr.it:/home/volpi/tesi/output/"

if [[ $# -eq 0 ]]; then
    set -- baselines
fi

# Without an existing ./output directory scp would store the first experiment
# *as* ./output instead of inside it.
mkdir -p ./output

for exp in "$@"; do
    scp -r "${SRC}/${exp}" ./output/
    scp -r "./output/${exp}" "${DST}"
done

2
log
View File

@ -3,6 +3,8 @@
# Follow the last 500 lines of quacc.log; the first argument selects the host.
case "${1}" in
    r)
        # ilona: refresh the local copy of the log (scp output is discarded),
        # then stream the remote file.
        scp volpi@ilona.isti.cnr.it:~/tesi/quacc.log ~/tesi/remote.log &>/dev/null
        ssh volpi@ilona.isti.cnr.it tail -n 500 -f /home/volpi/tesi/quacc.log | bat -P --language=log
        ;;
    d)
        # edge-nd1: stream the remote file.
        ssh andreaesuli@edge-nd1.isti.cnr.it tail -n 500 -f /home/andreaesuli/raid/lorenzo/quacc.log | bat -P --language=log
        ;;
    *)
        # Any other (or no) argument: follow the local log.
        tail -n 500 -f /home/lorev/tesi/quacc.log | bat --paging=never --language log
        ;;
esac

View File

@ -13,7 +13,7 @@ from dash import Dash, Input, Output, State, callback, ctx, dash_table, dcc, htm
from dash.dash_table.Format import Align, Format, Scheme
from quacc import plot
from quacc.evaluation.estimators import CE
from quacc.evaluation.estimators import CE, _renames
from quacc.evaluation.report import CompReport, DatasetReport
from quacc.evaluation.stats import wilcoxon
@ -26,6 +26,23 @@ def _get_prev_str(prev: np.ndarray):
return str(tuple(np.around(prev, decimals=2)))
def rename_estimators(estimators, rev=False):
    """Translate estimator names between internal ids and display names.

    A name that starts with a key of the rename table has that prefix
    replaced by the mapped value; whatever follows the prefix (e.g. a
    ``_gs`` suffix) is kept as is. Names without a matching prefix are
    returned unchanged.

    Args:
        estimators: iterable of estimator names.
        rev: when True, use the inverted table (display name -> internal id).

    Returns:
        A new list holding the translated names, in input order.
    """
    table = {v: k for k, v in _renames.items()} if rev else _renames
    # Try the longest prefixes first so that a specific entry ("rca_star")
    # is never shadowed by a shorter one ("rca"), regardless of the order in
    # which the table happens to be written.
    prefixes = sorted(table, key=len, reverse=True)

    renamed = []
    for name in estimators:
        match = next((p for p in prefixes if name.startswith(p)), None)
        if match is None:
            renamed.append(name)
        else:
            renamed.append(table[match] + name[len(match):])
    return renamed
def get_datasets(root: str | Path) -> List[DatasetReport]:
def load_dataset(dataset):
dataset = Path(dataset)
@ -153,7 +170,7 @@ def get_DataTable(df, mode):
columns = {
c: dict(
id=c,
name=_index_name[mode] if c == "index" else c,
name=_index_name[mode] if c == "index" else rename_estimators([c])[0],
type="numeric",
format=columns_format,
)
@ -412,12 +429,13 @@ def update_estimators(href, dataset, metric, curr_estimators, root):
old_estimators = json.loads(old_estimators)
except JSONDecodeError:
old_estimators = []
old_estimators = rename_estimators(old_estimators, rev=True)
valid_estimators: np.ndarray = dr.data(metric=metric).columns.unique(0).to_numpy()
new_estimators = valid_estimators[
np.isin(valid_estimators, old_estimators)
].tolist()
valid_estimators = CE.name.sort(valid_estimators.tolist())
return valid_estimators, new_estimators
return rename_estimators(valid_estimators), rename_estimators(new_estimators)
@callback(
@ -473,6 +491,7 @@ def update_content(dataset, metric, estimators, view, mode, root):
quote_via=quote,
)
dr = get_dr(root, dataset)
estimators = rename_estimators(estimators, rev=True)
match mode:
case m if m.endswith("table"):
df = get_table(

View File

@ -126,7 +126,9 @@ class DatasetProvider:
# provare min_df=5
def __imdb(self, **kwargs):
return qp.datasets.fetch_reviews("imdb", tfidf=True, min_df=3).train_test
return qp.datasets.fetch_reviews(
"imdb", data_home="./quapy_data", tfidf=True, min_df=3
).train_test
def __rcv1(self, target, **kwargs):
n_train = 23149
@ -135,7 +137,7 @@ class DatasetProvider:
if target is None or target not in available_targets:
raise ValueError(f"Invalid target {target}")
dataset = fetch_rcv1()
dataset = fetch_rcv1(data_home="./scikit_learn_data")
target_index = np.where(dataset.target_names == target)[0]
all_train_d = dataset.data[:n_train, :]
test_d = dataset.data[n_train:, :]

View File

@ -85,14 +85,14 @@ def naive(
report = EvaluationReport(name="naive")
for test in protocol():
test_preds = c_model_predict(test.X)
acc_score = metrics.accuracy_score(test.y, test_preds)
f1_score = metrics.f1_score(test.y, test_preds, average=f1_average)
meta_acc = abs(val_acc - acc_score)
meta_f1 = abs(val_f1 - f1_score)
test_acc = metrics.accuracy_score(test.y, test_preds)
test_f1 = metrics.f1_score(test.y, test_preds, average=f1_average)
meta_acc = abs(val_acc - test_acc)
meta_f1 = abs(val_f1 - test_f1)
report.append_row(
test.prevalence(),
acc_score=acc_score,
f1_score=f1_score,
acc_score=val_acc,
f1_score=val_f1,
acc=meta_acc,
f1=meta_f1,
)

View File

@ -78,3 +78,33 @@ class CompEstimator:
CE = CompEstimator()
# Maps internal estimator-id prefixes to the names shown in tables and plots.
# Consumers match keys with str.startswith and keep the remainder of the id,
# so an entry also covers its suffixed variants (e.g. "bin_sld_lr_gs").
# Keep a more specific key after any shorter key it starts with ("rca" before
# "rca_star"): lookups that apply matches in insertion order rely on it.
_renames = {
"bin_sld_lr": "(2x2)_SLD_LR",
"mul_sld_lr": "(1x4)_SLD_LR",
"m3w_sld_lr": "(1x3)_SLD_LR",
"d_bin_sld_lr": "d_(2x2)_SLD_LR",
"d_mul_sld_lr": "d_(1x4)_SLD_LR",
"d_m3w_sld_lr": "d_(1x3)_SLD_LR",
# NOTE(review): unlike the other "d_" entries, these display names drop the
# "d_" marker — confirm this is intended.
"d_bin_sld_rbf": "(2x2)_SLD_RBF",
"d_mul_sld_rbf": "(1x4)_SLD_RBF",
"d_m3w_sld_rbf": "(1x3)_SLD_RBF",
"sld_lr": "SLD_LR",
"bin_kde_lr": "(2x2)_KDEy_LR",
"mul_kde_lr": "(1x4)_KDEy_LR",
"m3w_kde_lr": "(1x3)_KDEy_LR",
"d_bin_kde_lr": "d_(2x2)_KDEy_LR",
"d_mul_kde_lr": "d_(1x4)_KDEy_LR",
"d_m3w_kde_lr": "d_(1x3)_KDEy_LR",
"bin_cc_lr": "(2x2)_CC_LR",
"mul_cc_lr": "(1x4)_CC_LR",
"m3w_cc_lr": "(1x3)_CC_LR",
"kde_lr": "KDEy_LR",
"cc_lr": "CC_LR",
"atc_mc": "ATC",
"doc": "DoC",
"mandoline": "Mandoline",
"rca": "RCA",
"rca_star": "RCA*",
"naive": "Naive",
}

View File

@ -3,7 +3,7 @@ from typing import Callable, List, Union
import numpy as np
from matplotlib.pylab import rand
from quapy.method.aggregative import PACC, SLD, BaseQuantifier
from quapy.method.aggregative import CC, PACC, SLD, BaseQuantifier
from quapy.protocol import UPP, AbstractProtocol, OnLabelledCollectionProtocol
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
@ -53,6 +53,17 @@ def _param_grid(method, X_fit: np.ndarray):
"q__classifier__class_weight": [None, "balanced"],
"confidence": [None, ["isoft"], ["max_conf", "entropy"]],
}
case "cc_lr":
return {
"q__classifier__C": np.logspace(-3, 3, 7),
"q__classifier__class_weight": [None, "balanced"],
"confidence": [
None,
["isoft"],
["max_conf", "entropy"],
["max_conf", "entropy", "isoft"],
],
}
case "kde_lr":
return {
"q__classifier__C": np.logspace(-3, 3, 7),
@ -219,6 +230,10 @@ def __pacc_lr():
return PACC(LogisticRegression())
def __cc_lr():
    """Build a new CC quantifier wrapping a default LogisticRegression."""
    classifier = LogisticRegression()
    return CC(classifier)
# fmt: off
__sld_lr_set = [
@ -380,9 +395,9 @@ __kde_lr_set = [
M("mul_kde_lr_a", __kde_lr(), "mul", conf=["max_conf", "entropy", "isoft"], ),
M("m3w_kde_lr_a", __kde_lr(), "mul", conf=["max_conf", "entropy", "isoft"], cf=True),
# gs kde
G("bin_kde_lr_gs", __kde_lr(), "bin", pg="kde_lr", search="spider" ),
G("mul_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="spider" ),
G("m3w_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="spider", cf=True),
G("bin_kde_lr_gs", __kde_lr(), "bin", pg="kde_lr", search="grid" ),
G("mul_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="grid" ),
G("m3w_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="grid", cf=True),
E("kde_lr_gs"),
]
@ -448,6 +463,37 @@ __dense_kde_rbf_set = [
G("d_m3w_kde_rbf_gs", __kde_rbf(), "mul", d=True, pg="kde_rbf", search="spider", cf=True),
]
# Registry entries for the cc_lr family; every entry gets its own __cc_lr()
# instance. The "m3w_*" entries use the same "mul" mode as the "mul_*" ones
# and differ only by cf=True.
# NOTE(review): M, G and E are defined outside this view — presumably a
# fixed-configuration method, a grid-searched method and a name-only entry;
# confirm against their definitions.
__cc_lr_set = [
# base cc
M("bin_cc_lr", __cc_lr(), "bin" ),
M("mul_cc_lr", __cc_lr(), "mul" ),
M("m3w_cc_lr", __cc_lr(), "mul", cf=True),
# max_conf + entropy cc
M("bin_cc_lr_c", __cc_lr(), "bin", conf=["max_conf", "entropy"] ),
M("mul_cc_lr_c", __cc_lr(), "mul", conf=["max_conf", "entropy"] ),
M("m3w_cc_lr_c", __cc_lr(), "mul", conf=["max_conf", "entropy"], cf=True),
# max_conf cc
M("bin_cc_lr_mc", __cc_lr(), "bin", conf="max_conf", ),
M("mul_cc_lr_mc", __cc_lr(), "mul", conf="max_conf", ),
M("m3w_cc_lr_mc", __cc_lr(), "mul", conf="max_conf", cf=True),
# entropy cc
M("bin_cc_lr_ne", __cc_lr(), "bin", conf="entropy", ),
M("mul_cc_lr_ne", __cc_lr(), "mul", conf="entropy", ),
M("m3w_cc_lr_ne", __cc_lr(), "mul", conf="entropy", cf=True),
# inverse softmax cc
M("bin_cc_lr_is", __cc_lr(), "bin", conf="isoft", ),
M("mul_cc_lr_is", __cc_lr(), "mul", conf="isoft", ),
M("m3w_cc_lr_is", __cc_lr(), "mul", conf="isoft", cf=True),
# cc all
M("bin_cc_lr_a", __cc_lr(), "bin", conf=["max_conf", "entropy", "isoft"], ),
M("mul_cc_lr_a", __cc_lr(), "mul", conf=["max_conf", "entropy", "isoft"], ),
M("m3w_cc_lr_a", __cc_lr(), "mul", conf=["max_conf", "entropy", "isoft"], cf=True),
# gs cc
G("bin_cc_lr_gs", __cc_lr(), "bin", pg="cc_lr", search="grid" ),
G("mul_cc_lr_gs", __cc_lr(), "mul", pg="cc_lr", search="grid" ),
G("m3w_cc_lr_gs", __cc_lr(), "mul", pg="cc_lr", search="grid", cf=True),
# family-level name for the three grid-searched entries above
E("cc_lr_gs"),
]
# fmt: on
@ -458,6 +504,8 @@ __methods_set = (
+ __kde_lr_set
+ __dense_kde_lr_set
+ __dense_kde_rbf_set
+ __cc_lr_set
+ [E("QuAcc")]
)
_methods = {m.name: m for m in __methods_set}

View File

@ -140,6 +140,19 @@ class CompReport:
"mul_kde_lr_gs",
"m3w_kde_lr_gs",
],
"cc_lr_gs": [
"bin_cc_lr_gs",
"mul_cc_lr_gs",
"m3w_cc_lr_gs",
],
"QuAcc": [
"bin_sld_lr_gs",
"mul_sld_lr_gs",
"m3w_sld_lr_gs",
"bin_kde_lr_gs",
"mul_kde_lr_gs",
"m3w_kde_lr_gs",
],
}
for name, methods in _mapping.items():

View File

@ -25,6 +25,7 @@ def wilcoxon(
) -> pd.DataFrame:
_data = r.data(metric, estimators)
_data = _data.dropna(axis=0, how="any")
_wilcoxon = {}
for est in _data.columns.unique(0):
_wilcoxon[est] = [

View File

@ -39,8 +39,16 @@ def plot_delta(
else:
title = f"{_base_title}_{name}_avg_{avg}_{metric}"
x_label = f"{'test' if avg is None or avg == 'train' else 'train'} prevalence"
y_label = f"{metric} error"
if avg is None or avg == "train":
x_label = "Test Prevalence"
else:
x_label = "Train Prevalence"
if metric == "acc":
y_label = "Prediction Error for Vanilla Accuracy"
elif metric == "f1":
y_label = "Prediction Error for F1"
else:
y_label = f"{metric} error"
fig = backend.plot_delta(
base_prevs,
columns,
@ -81,8 +89,12 @@ def plot_diagonal(
else:
title = f"diagonal_{name}_{metric}"
x_label = f"true {metric}"
y_label = f"estim. {metric}"
if metric == "acc":
x_label = "True Vanilla Accuracy"
y_label = "Estimated Vanilla Accuracy"
else:
x_label = f"true {metric}"
y_label = f"estim. {metric}"
fig = backend.plot_diagonal(
reference,
columns,
@ -123,8 +135,13 @@ def plot_shift(
else:
title = f"shift_{name}_avg_{metric}"
x_label = "dataset shift"
y_label = f"{metric} error"
x_label = "Amount of Prior Probability Shift"
if metric == "acc":
y_label = "Prediction Error for Vanilla Accuracy"
elif metric == "f1":
y_label = "Prediction Error for F1"
else:
y_label = f"{metric} error"
fig = backend.plot_shift(
shift_prevs,
columns,

View File

@ -5,6 +5,7 @@ import numpy as np
import plotly
import plotly.graph_objects as go
from quacc.evaluation.estimators import _renames
from quacc.plot.base import BasePlot
@ -50,6 +51,7 @@ class PlotlyPlot(BasePlot):
def __init__(self, theme=None):
# Look up the entry for `theme` in the class-level __themes table; an unknown
# key raises here rather than at plot time.
self.theme = PlotlyPlot.__themes[theme]
# When True, rename_plots() replaces estimator ids with their display names;
# set to False to plot the raw ids.
self.rename = True
def hex_to_rgb(self, hex: str, t: float | None = None):
hex = hex.lstrip("#")
@ -85,6 +87,24 @@ class PlotlyPlot(BasePlot):
def save_fig(self, fig, base_path, title) -> Path:
return None
def rename_plots(
    self,
    columns,
):
    """Replace estimator ids in *columns* with their display names.

    A column that starts with a key of ``_renames`` has that prefix
    replaced by the mapped value and keeps the rest of its name; columns
    without a matching prefix are left untouched.

    Args:
        columns: iterable of column (estimator) names.

    Returns:
        *columns* itself when renaming is disabled (``self.rename`` is
        falsy), otherwise a numpy array with the translated names.
    """
    if not self.rename:
        return columns

    # Try the longest prefixes first so that a specific entry ("rca_star")
    # is never shadowed by a shorter one ("rca"), regardless of the order in
    # which the table happens to be written.
    prefixes = sorted(_renames, key=len, reverse=True)

    renamed = []
    for name in columns:
        match = next((p for p in prefixes if name.startswith(p)), None)
        if match is None:
            renamed.append(name)
        else:
            renamed.append(_renames[match] + name[len(match):])
    return np.array(renamed)
def plot_delta(
self,
base_prevs,
@ -102,6 +122,7 @@ class PlotlyPlot(BasePlot):
if isinstance(base_prevs[0], float):
base_prevs = np.around([(1 - bp, bp) for bp in base_prevs], decimals=4)
x = [str(tuple(bp)) for bp in base_prevs]
columns = self.rename_plots(columns)
line_colors = self.get_colors(len(columns))
for name, delta in zip(columns, data):
color = next(line_colors)
@ -150,6 +171,7 @@ class PlotlyPlot(BasePlot):
) -> go.Figure:
fig = go.Figure()
x = reference
columns = self.rename_plots(columns)
line_colors = self.get_colors(len(columns))
_edges = (np.min([np.min(x), np.min(data)]), np.max([np.max(x), np.max(data)]))
@ -211,6 +233,7 @@ class PlotlyPlot(BasePlot):
fig = go.Figure()
# x = shift_prevs[:, pos_class]
x = shift_prevs
columns = self.rename_plots(columns)
line_colors = self.get_colors(len(columns))
for name, delta in zip(columns, data):
col_idx = (columns == name).nonzero()[0][0]

15
rates.md Normal file
View File

@ -0,0 +1,15 @@
# Additional covariates percentage
How often additional covariates (av), recalibration (recalib) and the
"balanced" `class_weight` (rebalance) were used during grid search:
| method | av % | recalib % | rebalance % |
| --------------: | :----: | :-------: | :---------: |
| imdb_sld_lr | 81.49% | 77.78% | 59.26% |
| imdb_kde_lr | 71.43% | NA | 88.18% |
| rcv1_CCAT_sld_lr| 62.97% | 70.38% | 77.78% |
| rcv1_CCAT_kde_lr| 78.06% | NA | 84.82% |
| rcv1_GCAT_sld_lr| 76.93% | 61.54% | 65.39% |
| rcv1_GCAT_kde_lr| 71.36% | NA | 78.65% |
| rcv1_MCAT_sld_lr| 62.97% | 48.15% | 74.08% |
| rcv1_MCAT_kde_lr| 71.03% | NA | 68.70% |

4
run.py
View File

@ -15,3 +15,7 @@ def run():
run_local()
elif args.remote:
run_remote(detatch=args.detatch)
# Call run() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
run()

48
selected_gs.py Normal file
View File

@ -0,0 +1,48 @@
import numpy as np
from quacc.evaluation.report import DatasetReport
# Pickled DatasetReport files, relative to output/main/.
datasets = [
    "imdb/imdb.pickle",
    "rcv1_CCAT/rcv1_CCAT.pickle",
    "rcv1_GCAT/rcv1_GCAT.pickle",
    "rcv1_MCAT/rcv1_MCAT.pickle",
]

# Grid-search families and the estimators competing inside each of them.
gs = {
    "sld_lr_gs": [
        "bin_sld_lr_gs",
        "mul_sld_lr_gs",
        "m3w_sld_lr_gs",
    ],
    "kde_lr_gs": [
        "bin_kde_lr_gs",
        "mul_kde_lr_gs",
        "m3w_kde_lr_gs",
    ],
}


def selection_frequencies(fit_scores, methods):
    """Return how often each method has the lowest fit score.

    Args:
        fit_scores: iterable of ``{estimator name: score}`` mappings, one
            per report.
        methods: names of the estimators to compare.

    Returns:
        A numpy array aligned with *methods*: the fraction of reports in
        which that method had the lowest score (the first one wins ties).
        Reports with no score for any of *methods* are ignored; the result
        is all zeros when no report contributes.
    """
    wins = dict.fromkeys(methods, 0)
    n_reports = 0
    for scores in fit_scores:
        candidates = {k: v for k, v in scores.items() if k in methods}
        if not candidates:
            # np.argmin raises on an empty sequence; a report without these
            # estimators simply has no winner to count.
            continue
        names = list(candidates)
        wins[names[int(np.argmin(list(candidates.values())))]] += 1
        n_reports += 1

    if n_reports == 0:
        return np.zeros(len(methods))
    return np.array([wins[m] for m in methods]) / n_reports


def main():
    """Print, per dataset and family, the selection rate (%) of each estimator."""
    for dst in datasets:
        dr = DatasetReport.unpickle("output/main/" + dst)
        print(f"{dst}\n")
        for name, methods in gs.items():
            print(f"{name}")
            m_freq = selection_frequencies((cr.fit_scores for cr in dr.crs), methods)
            # Tab-terminated cells, a header row followed by a percentage row.
            print("".join(f"{n}\t" for n in methods))
            print("".join(f"{v*100:.2f}\t" for v in m_freq) + "\n\n")


if __name__ == "__main__":
    main()