baseline performance test updated

parent 575c1dd6a1
commit db129c4093
5 binary files changed (contents not shown).
@@ -201,13 +201,13 @@ def rca_score(
     ]
     results = []
     for test in protocol():
-        [f_prev, t_prev] = test.prevalence()
-        try:
+        try:
+            [f_prev, t_prev] = test.prevalence()
             test_pred = c_model_predict(test.X)
             c_model2 = rca.clone_fit(c_model, test.X, test_pred)
             c_model2_predict = getattr(c_model2, predict_method)
             val_pred2 = c_model2_predict(validation.X)
-            rca_score = 1.0 - rca.get_score(val_pred1, val_pred2, validation.y)
+            rca_score = rca.get_score(val_pred1, val_pred2, validation.y)
             results.append({k: v for k, v in zip(cols, [f_prev, t_prev, rca_score])})
         except ValueError:
             results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
@@ -248,7 +248,7 @@ def rca_star_score(
             c_model2 = rca.clone_fit(c_model, test.X, test_pred)
             c_model2_predict = getattr(c_model2, predict_method)
             val2_pred2 = c_model2_predict(validation2.X)
-            rca_star_score = 1.0 - rca.get_score(val2_pred1, val2_pred2, validation2.y)
+            rca_star_score = rca.get_score(val2_pred1, val2_pred2, validation2.y)
             results.append(
                 {k: v for k, v in zip(cols, [f_prev, t_prev, rca_star_score])}
             )
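Both RCA hunks drop the "1.0 -" inversion, so the baselines now report rca.get_score(...) directly. As orientation only, here is a minimal sketch of the clone-and-refit pattern these functions follow, written with plain scikit-learn; rca.clone_fit and rca.get_score themselves are not shown in this diff, so the accuracy-drop stand-in below is an assumption, not quacc's implementation.

# Sketch only: the clone-and-refit idea behind the RCA-style baselines above.
# rca.clone_fit / rca.get_score are not part of this diff, so they are
# approximated with plain scikit-learn; the accuracy-drop stand-in for
# get_score is an assumption.
import numpy as np
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
X_val, y_val = rng.normal(size=(200, 5)), rng.integers(0, 2, size=200)
X_test = rng.normal(size=(100, 5))

c_model = LogisticRegression().fit(X_val, y_val)    # the already-fitted classifier
test_pred = c_model.predict(X_test)                 # pseudo-labels on the unlabelled test sample
c_model2 = clone(c_model).fit(X_test, test_pred)    # "clone_fit": refit on pseudo-labelled test data

val_pred1 = c_model.predict(X_val)                  # original model on validation
val_pred2 = c_model2.predict(X_val)                 # refitted model on validation

# Stand-in for rca.get_score(val_pred1, val_pred2, y_val): accuracy drop of the
# refitted model relative to the original one on labelled validation data.
rca_like_score = accuracy_score(y_val, val_pred1) - accuracy_score(y_val, val_pred2)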
@@ -1,3 +1,4 @@
+from operator import index
 from typing import Tuple
 import numpy as np
 from quapy.data.base import LabelledCollection
@@ -18,11 +19,29 @@ def get_spambase() -> Tuple[LabelledCollection]:
     train, validation = train.split_stratified(train_prop=TRAIN_VAL_PROP)
     return train, validation, test
 
-def get_rcv1(sample_size=100):
+# >>> fetch_rcv1().target_names
+# array(['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16',
+#        'C17', 'C171', 'C172', 'C173', 'C174', 'C18', 'C181', 'C182',
+#        'C183', 'C21', 'C22', 'C23', 'C24', 'C31', 'C311', 'C312', 'C313',
+#        'C32', 'C33', 'C331', 'C34', 'C41', 'C411', 'C42', 'CCAT', 'E11',
+#        'E12', 'E121', 'E13', 'E131', 'E132', 'E14', 'E141', 'E142',
+#        'E143', 'E21', 'E211', 'E212', 'E31', 'E311', 'E312', 'E313',
+#        'E41', 'E411', 'E51', 'E511', 'E512', 'E513', 'E61', 'E71', 'ECAT',
+#        'G15', 'G151', 'G152', 'G153', 'G154', 'G155', 'G156', 'G157',
+#        'G158', 'G159', 'GCAT', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT',
+#        'GENV', 'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL',
+#        'GPRO', 'GREL', 'GSCI', 'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA',
+#        'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
+#        'M142', 'M143', 'MCAT'], dtype=object)
+
+
+def get_rcv1(target:str):
+    sample_size = qp.environ["SAMPLE_SIZE"]
     n_train = 23149
     dataset = fetch_rcv1()
 
+    if target not in dataset.target_names:
+        raise ValueError("Invalid target")
+
     def dataset_split(data, labels, classes=[0, 1]) -> Tuple[LabelledCollection]:
         all_train_d, test_d = data[:n_train, :], data[n_train:, :]
         all_train_l, test_l = labels[:n_train], labels[n_train:]
@@ -31,14 +50,13 @@ def get_rcv1(sample_size=100):
     train, validation = all_train.split_stratified(train_prop=TRAIN_VAL_PROP)
     return train, validation, test
 
-    target_labels = [
-        (target, dataset.target[:, ind].toarray().flatten())
-        for (ind, target) in enumerate(dataset.target_names)
-    ]
-    filtered_target_labels = filter(
-        lambda _, labels: np.sum(labels[n_train:]) >= sample_size, target_labels
-    )
-    return {
-        target: dataset_split(dataset.data, labels, classes=[0, 1])
-        for (target, labels) in filtered_target_labels
-    }
+    target_index = np.where(dataset.target_names == target)[0]
+    target_labels = dataset.target[:, target_index].toarray().flatten()
+
+    if np.sum(target_labels[n_train:]) < sample_size:
+        raise ValueError("Target has too few positive samples")
+
+    d = dataset_split(dataset.data, target_labels, classes=[0, 1])
+
+    return d
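After this refactor get_rcv1 handles a single RCV1 target instead of building a dict over all targets, and the minimum sample size comes from quapy's environment rather than a keyword argument. A usage sketch under those assumptions (the "CCAT" target is just an example taken from the list above):

# Usage sketch for the refactored get_rcv1 (illustrative; the caller is
# expected to configure quapy's SAMPLE_SIZE beforehand).
import quapy as qp
from quacc.dataset import get_rcv1

qp.environ["SAMPLE_SIZE"] = 100                 # read inside get_rcv1
train, validation, test = get_rcv1("CCAT")      # any name from fetch_rcv1().target_names

# An unknown target raises ValueError("Invalid target"); a target with fewer
# than SAMPLE_SIZE positive test documents also raises ValueError, instead of
# being silently filtered out as in the previous dict-based version.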
@@ -8,18 +8,28 @@ def from_name(err_name):
     else:
         return qp.error.from_name(err_name)
 
+# def f1(prev):
+#     # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
+#     if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
+#         return 1.0
+#     elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
+#         return 0.0
+#     elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
+#         return float('NaN')
+#     else:
+#         recall = prev[0] / (prev[0] + prev[1])
+#         precision = prev[0] / (prev[0] + prev[2])
+#         return 2 * (precision * recall) / (precision + recall)
+
 def f1(prev):
-    # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
-    if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
+    den = (2*prev[3]) + prev[1] + prev[2]
+    if den == 0:
         return 1.0
-    elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
-        return 0.0
-    elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
-        return float('NaN')
-    else:
-        recall = prev[0] / (prev[0] + prev[1])
-        precision = prev[0] / (prev[0] + prev[2])
-        return 2 * (precision * recall) / (precision + recall)
+    return (2*prev[3])/den
 
 def f1e(prev):
     return 1 - f1(prev)
 
 def mae(prev):
     return (prev[1] + prev[2]) / sum(prev)
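The rewritten f1 reads prev as a confusion-matrix prevalence in the (TN, FP, FN, TP) order used by the report columns in evaluation.py, so it computes the usual F1 = 2*TP / (2*TP + FP + FN) and returns 1.0 when that denominator is zero; the commented-out version indexed the vector TP-first instead. A quick worked check (the prevalence values are made up):

# Worked check of the new formula, assuming prev = (TN, FP, FN, TP) as in the
# ["TN", "FP", "FN", "TP"] column order used by evaluation.py.
def f1(prev):
    den = (2 * prev[3]) + prev[1] + prev[2]
    if den == 0:
        return 1.0
    return (2 * prev[3]) / den

print(f1((0.5, 0.1, 0.1, 0.3)))   # 2*0.3 / (2*0.3 + 0.1 + 0.1) = 0.6 / 0.8 = 0.75
print(f1((1.0, 0.0, 0.0, 0.0)))   # only true negatives -> denominator 0 -> 1.0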
@@ -80,55 +80,63 @@ def evaluation_report(
     protocol: AbstractStochasticSeededProtocol,
     error_metrics: Iterable[Union[str, Callable]] = "all",
     aggregate: bool = True,
+    prevalence: bool = True,
 ):
     def _report_columns(err_names):
         base_cols = list(itertools.product(["base"], ["F", "T"]))
         prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"]))
         err_cols = list(itertools.product(["errors"], err_names))
-        return base_cols + prev_cols, err_cols
+        return base_cols, prev_cols, err_cols
 
     base_prevs, true_prevs, estim_prevs = estimate(estimator, protocol)
 
     if error_metrics == "all":
-        error_metrics = ["ae", "f1"]
+        error_metrics = ["mae", "f1"]
 
     error_funcs = [
         error.from_name(e) if isinstance(e, str) else e for e in error_metrics
     ]
     assert all(hasattr(e, "__call__") for e in error_funcs), "invalid error function"
     error_names = [e.__name__ for e in error_funcs]
-    error_cols = error_names.copy()
-    if "f1" in error_cols:
-        error_cols.remove("f1")
-        error_cols.extend(["f1_true", "f1_estim"])
-    if "f1e" in error_cols:
-        error_cols.remove("f1e")
-        error_cols.extend(["f1e_true", "f1e_estim"])
+    error_cols = []
+    for err in error_names:
+        if err == "mae":
+            error_cols.extend(["mae_estim", "mae_true"])
+        elif err == "f1":
+            error_cols.extend(["f1_estim", "f1_true"])
+        elif err == "f1e":
+            error_cols.extend(["f1e_estim", "f1e_true"])
+        else:
+            error_cols.append(err)
 
     # df_cols = ["base_prev", "true_prev", "estim_prev"] + error_names
-    prev_cols, err_cols = _report_columns(error_cols)
+    base_cols, prev_cols, err_cols = _report_columns(error_cols)
 
     lst = []
     for base_prev, true_prev, estim_prev in zip(base_prevs, true_prevs, estim_prevs):
-        series = {
-            k: v
-            for (k, v) in zip(
-                prev_cols, np.concatenate((base_prev, true_prev, estim_prev), axis=0)
-            )
-        }
-        for error_name, error_metric in zip(error_names, error_funcs):
-            if error_name == "f1e":
-                series[("errors", "f1e_true")] = error_metric(true_prev)
-                series[("errors", "f1e_estim")] = error_metric(estim_prev)
-                continue
-            if error_name == "f1":
-                f1_true, f1_estim = error_metric(true_prev), error_metric(estim_prev)
-                series[("errors", "f1_true")] = f1_true
-                series[("errors", "f1_estim")] = f1_estim
-                continue
+        if prevalence:
+            series = {
+                k: v
+                for (k, v) in zip(
+                    base_cols + prev_cols,
+                    np.concatenate((base_prev, true_prev, estim_prev), axis=0),
+                )
+            }
+            df_cols = base_cols + prev_cols + err_cols
+        else:
+            series = {k: v for (k, v) in zip(base_cols, base_prev)}
+            df_cols = base_cols + err_cols
 
-            score = error_metric(true_prev, estim_prev)
-            series[("errors", error_name)] = score
+        for err in error_cols:
+            error_funcs = {
+                "mae_true": lambda: error.mae(true_prev),
+                "mae_estim": lambda: error.mae(estim_prev),
+                "f1_true": lambda: error.f1(true_prev),
+                "f1_estim": lambda: error.f1(estim_prev),
+                "f1e_true": lambda: error.f1e(true_prev),
+                "f1e_estim": lambda: error.f1e(estim_prev),
+            }
+            series[("errors", err)] = error_funcs[err]()
 
         lst.append(series)
@@ -136,6 +144,6 @@ def evaluation_report(
 
     df = pd.DataFrame(
         lst,
-        columns=pd.MultiIndex.from_tuples(prev_cols + err_cols),
+        columns=pd.MultiIndex.from_tuples(df_cols),
     )
     return df
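evaluation_report now assembles the final column layout per call (df_cols), so the frame carries the true/estim prevalence columns only when prevalence=True. A small sketch of the resulting MultiIndex layout (column names are taken from the hunks above; the row values are placeholders):

# Sketch of the column structure the report now produces; names follow
# _report_columns above, values are made up.
import itertools
import pandas as pd

base_cols = list(itertools.product(["base"], ["F", "T"]))
prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"]))
err_cols = list(itertools.product(["errors"], ["mae_estim", "mae_true", "f1_estim", "f1_true"]))

df_cols = base_cols + prev_cols + err_cols          # prevalence=True
# df_cols = base_cols + err_cols                    # prevalence=False
df = pd.DataFrame([range(len(df_cols))], columns=pd.MultiIndex.from_tuples(df_cols))
print(df.columns.get_level_values(0).unique().tolist())   # ['base', 'true', 'estim', 'errors']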
@@ -2,6 +2,7 @@ import pandas as pd
 import quapy as qp
 from quapy.protocol import APP
 from sklearn.linear_model import LogisticRegression
+from quacc import utils
 
 import quacc.evaluation as eval
 import quacc.baseline as baseline
@@ -10,7 +11,7 @@ from quacc.estimator import (
     MulticlassAccuracyEstimator,
 )
 
-from quacc.dataset import get_imdb, get_spambase
+from quacc.dataset import get_imdb, get_rcv1, get_spambase
 
 qp.environ["SAMPLE_SIZE"] = 100
 
@@ -109,25 +110,21 @@ def estimate_comparison():
 
     estimator = BinaryQuantifierAccuracyEstimator(model)
     estimator.fit(validation)
-    df = eval.evaluation_report(estimator, protocol)
+    df = eval.evaluation_report(estimator, protocol, prevalence=False)
 
-    df_index = [("base", "F"), ("base", "T")]
-    atc_mc_df = baseline.atc_mc(model, validation, protocol)
-    atc_ne_df = baseline.atc_ne(model, validation, protocol)
-    doc_feat_df = baseline.doc_feat(model, validation, protocol)
-    rca_df = baseline.rca_score(model, validation, protocol)
-    rca_star_df = baseline.rca_star_score(model, validation, protocol)
-    bbse_df = baseline.bbse_score(model, validation, protocol)
-
-    df = df.join(atc_mc_df.set_index(df_index), on=df_index)
-    df = df.join(atc_ne_df.set_index(df_index), on=df_index)
-    df = df.join(doc_feat_df.set_index(df_index), on=df_index)
-    df = df.join(rca_df.set_index(df_index), on=df_index)
-    df = df.join(rca_star_df.set_index(df_index), on=df_index)
-    df = df.join(bbse_df.set_index(df_index), on=df_index)
+    df = utils.combine_dataframes(
+        baseline.atc_mc(model, validation, protocol),
+        baseline.atc_ne(model, validation, protocol),
+        baseline.doc_feat(model, validation, protocol),
+        baseline.rca_score(model, validation, protocol),
+        baseline.rca_star_score(model, validation, protocol),
+        baseline.bbse_score(model, validation, protocol),
+        df,
+        df_index=[("base", "F"), ("base", "T")]
+    )
 
     print(df.to_string())
     print(df.to_latex(float_format="{:.4f}".format))
+    print(utils.avg_group_report(df).to_latex(float_format="{:.4f}".format))
 
 
 def main():
     estimate_comparison()
@@ -0,0 +1,31 @@
+import functools
+import pandas as pd
+
+
+def combine_dataframes(*dfs, df_index=[]) -> pd.DataFrame:
+    if len(dfs) < 1:
+        raise ValueError
+    if len(dfs) == 1:
+        return dfs[0]
+    df = dfs[0]
+    for ndf in dfs[1:]:
+        df = df.join(ndf.set_index(df_index), on=df_index)
+
+    return df
+
+
+def avg_group_report(df: pd.DataFrame) -> pd.DataFrame:
+    def _reduce_func(s1, s2):
+        return {
+            (n1, n2): v + s2[(n1, n2)] for ((n1, n2), v) in s1.items()
+        }
+
+    lst = df.to_dict(orient="records")[1:-1]
+    summed_series = functools.reduce(_reduce_func, lst)
+    idx = df.columns.drop([("base", "T"), ("base", "F")])
+    avg_report = {
+        (n1, n2): (v / len(lst))
+        for ((n1, n2), v) in summed_series.items()
+        if n1 != "base"
+    }
+    return pd.DataFrame([avg_report], columns=idx)
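A minimal usage sketch for combine_dataframes (the data values and the second error column are made up; the column layout mirrors the ("base", ...) MultiIndex used elsewhere in this commit). avg_group_report then averages every non-base column over the interior records, dropping the first and last row before summing.

# Usage sketch for the new helper; values are illustrative.
import pandas as pd
from quacc.utils import combine_dataframes

cols_a = pd.MultiIndex.from_tuples([("base", "F"), ("base", "T"), ("errors", "mae_estim")])
cols_b = pd.MultiIndex.from_tuples([("base", "F"), ("base", "T"), ("errors", "atc_mc")])

df_a = pd.DataFrame([[0.9, 0.1, 0.02], [0.5, 0.5, 0.05]], columns=cols_a)
df_b = pd.DataFrame([[0.9, 0.1, 0.03], [0.5, 0.5, 0.06]], columns=cols_b)

df_index = [("base", "F"), ("base", "T")]
combined = combine_dataframes(df_a, df_b, df_index=df_index)
# -> one row per (F, T) base prevalence, with both error columns side by side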