update trailing char

Lorenzo Volpi 2023-11-08 17:26:44 +01:00
parent dd581f7937
commit f346005515
47 changed files with 31354 additions and 31354 deletions

.gitignore

@@ -1,20 +1,20 @@
*.code-workspace
quavenv/*
*.pdf
__pycache__/*
baselines/__pycache__/*
baselines/densratio/__pycache__/*
quacc/__pycache__/*
quacc/evaluation/__pycache__/*
quacc/method/__pycache__/*
tests/__pycache__/*
*.coverage
.coverage
scp_sync.py
out/*
output/*
!output/main/

.vscode/launch.json

@@ -1,25 +1,25 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "main",
"type": "python",
"request": "launch",
"program": "C:\\Users\\Lorenzo Volpi\\source\\tesi\\quacc\\main.py",
"console": "integratedTerminal",
"justMyCode": true
},
{
"name": "main_test",
"type": "python",
"request": "launch",
"program": "C:\\Users\\Lorenzo Volpi\\source\\tesi\\quacc\\main_test.py",
"console": "integratedTerminal",
"justMyCode": false
},
]
}


@@ -1,54 +1,54 @@
{
"todo": [
{
"assignedTo": {
"name": "Lorenzo Volpi"
},
"creation_time": "2023-10-28T14:33:36.069Z",
"id": "2",
"references": [],
"title": "Creare plot avg con training prevalence sull'asse x e media rispetto a test prevalence"
},
{
"assignedTo": {
"name": "Lorenzo Volpi"
},
"creation_time": "2023-10-28T14:32:37.610Z",
"id": "1",
"references": [],
"title": "Testare su imdb"
}
],
"in-progress": [
{
"assignedTo": {
"name": "Lorenzo Volpi"
},
"creation_time": "2023-10-28T14:34:23.217Z",
"id": "3",
"references": [],
"title": "Relaizzare grid search per task specifico partedno da GridSearchQ"
},
{
"assignedTo": {
"name": "Lorenzo Volpi"
},
"creation_time": "2023-10-28T14:34:46.226Z",
"id": "4",
"references": [],
"title": "Aggingere estimator basati su PACC (quantificatore)"
}
],
"testing": [],
"done": [
{
"assignedTo": {
"name": "Lorenzo Volpi"
},
"creation_time": "2023-10-28T14:35:12.683Z",
"id": "5",
"references": [],
"title": "Rework rappresentazione dati di report"
}
]
}

TODO.html

@@ -1,143 +1,143 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
/* From extension vscode.github */
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
.vscode-dark img[src$=\#gh-light-mode-only],
.vscode-light img[src$=\#gh-dark-mode-only] {
display: none;
}
</style>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/Microsoft/vscode/extensions/markdown-language-features/media/markdown.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/Microsoft/vscode/extensions/markdown-language-features/media/highlight.css">
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe WPC', 'Segoe UI', system-ui, 'Ubuntu', 'Droid Sans', sans-serif;
font-size: 14px;
line-height: 1.6;
}
</style>
<style>
.task-list-item {
list-style-type: none;
}
.task-list-item-checkbox {
margin-left: -20px;
vertical-align: middle;
pointer-events: none;
}
</style>
</head>
<body class="vscode-body vscode-light">
<ul class="contains-task-list">
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> add table averages</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> plots; 3 types (notes + email + garg)</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> fix kfcv baseline</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> add a method with CC besides SLD</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> take the most populated class of rcv1, remove negatives until reaching 50/50; then subsample with 9 training prevalences (from 0.1-0.9 to 0.9-0.1)</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> vary the recalibration parameter in SLD</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> fix the diagonal plot</p>
<ul>
<li>seaborn example gallery</li>
</ul>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> recalib variants: bcts, SLD (try exact_train_prev=False)</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> check what validation size garg uses</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> for model selection, test the classifier's C parameter, explored over np.logspace(-3,3,7) or np.logspace(-4,4,9), and the class_weight parameter, explored over None or &quot;balanced&quot;; use qp.model_selection.GridSearchQ with mae as the error and UPP as the protocol</p>
<ul>
<li>qp.train_test_split to obtain v_train and v_val</li>
<li>GridSearchQ(
model: BaseQuantifier,
param_grid: {
'classifier__C': np.logspace(-3,3,7),
'classifier__class_weight': [None, 'balanced'],
'recalib': [None, 'bcts']
},
protocol: UPP(V_val, repeats=1000),
error = qp.error.mae,
refit=True,
timeout=-1,
n_jobs=-2,
verbose=True).fit(V_tr)</li>
</ul>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> collective plot, with shift on the x axis, taking all training sets into account and averaging over the 9 cases (each line is a method), both non-optimized and optimized results</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> save the best score obtained by each application of GridSearchQ</p>
<ul>
<li>in the binary case, average the two best scores</li>
</ul>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> import baselines</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" type="checkbox"> import mandoline</p>
<ul>
<li>mandoline can be imported, but it requires an a priori slicing of the features that must be implemented ad hoc</li>
</ul>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" type="checkbox"> fix the old iw baselines</p>
<ul>
<li>they cannot be fixed because they depend on numpy</li>
</ul>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> avg plot with train prevalence on the x axis, averaged over test prevalence</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> implement grid search for the specific task starting from GridSearchQ</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> try PACC as a quantifier</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> add labels in the shift plot</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> fix exact_train in quapy</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" checked="" type="checkbox"> also test on imdb</p>
</li>
<li class="task-list-item enabled">
<p><input class="task-list-item-checkbox" type="checkbox"> review the new baselines</p>
</li>
</ul>
</body>
</html>

TODO.md

@@ -1,44 +1,44 @@
- [x] add table averages
- [x] plots; 3 types (notes + email + garg)
- [x] fix kfcv baseline
- [x] add a method with CC besides SLD
- [x] take the most populated class of rcv1, remove negatives until reaching 50/50; then subsample with 9 training prevalences (from 0.1-0.9 to 0.9-0.1)
- [x] vary the recalibration parameter in SLD
- [x] fix the diagonal plot
  - seaborn example gallery
- [x] recalib variants: bcts, SLD (try exact_train_prev=False)
- [x] check what validation size garg uses
- [x] for model selection, test the classifier's C parameter, explored over np.logspace(-3,3,7) or np.logspace(-4,4,9), and the class_weight parameter, explored over None or "balanced"; use qp.model_selection.GridSearchQ with mae as the error and UPP as the protocol (see the runnable sketch after this list)
  - qp.train_test_split to obtain v_train and v_val
  - GridSearchQ(
      model: BaseQuantifier,
      param_grid: {
      'classifier__C': np.logspace(-3,3,7),
      'classifier__class_weight': [None, 'balanced'],
      'recalib': [None, 'bcts']
      },
      protocol: UPP(V_val, repeats=1000),
      error = qp.error.mae,
      refit=True,
      timeout=-1,
      n_jobs=-2,
      verbose=True).fit(V_tr)
- [x] collective plot, with shift on the x axis, taking all training sets into account and averaging over the 9 cases (each line is a method), both non-optimized and optimized results
- [x] save the best score obtained by each application of GridSearchQ
  - in the binary case, average the two best scores
- [x] import baselines
- [ ] import mandoline
  - mandoline can be imported, but it requires an a priori slicing of the features that must be implemented ad hoc
- [ ] fix the old iw baselines
  - they cannot be fixed because they depend on numpy
- [x] avg plot with train prevalence on the x axis, averaged over test prevalence
- [x] implement grid search for the specific task starting from GridSearchQ
- [x] try PACC as a quantifier
- [x] add labels in the shift plot
- [x] fix exact_train in quapy
- [x] also test on imdb
- [ ] review the new baselines
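
A minimal, runnable sketch of the model-selection recipe above, using quapy. The dataset loader, the split proportion, and EMQ (quapy's implementation of SLD, assumed to expose the `recalib` parameter) are assumptions about the quapy API, not code from this repository:

```python
import numpy as np
import quapy as qp
from quapy.method.aggregative import EMQ
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from sklearn.linear_model import LogisticRegression

# Load a sentiment dataset and carve a validation split out of the training set.
train, test = qp.datasets.fetch_reviews("imdb", tfidf=True, min_df=5).train_test
V_tr, V_val = train.split_stratified(train_prop=0.6)

# Explore C, class_weight and recalibration, scoring MAE under the UPP protocol.
model = GridSearchQ(
    model=EMQ(LogisticRegression()),
    param_grid={
        "classifier__C": np.logspace(-3, 3, 7),
        "classifier__class_weight": [None, "balanced"],
        "recalib": [None, "bcts"],
    },
    protocol=UPP(V_val, repeats=1000),
    error=qp.error.mae,
    refit=True,
    timeout=-1,
    n_jobs=-2,
    verbose=True,
).fit(V_tr)

print(model.best_params_, model.best_score_)
```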


@@ -1,44 +1,44 @@
import numpy as np
from sklearn.metrics import f1_score
def get_entropy(probs):
return np.sum(np.multiply(probs, np.log(probs + 1e-20)), axis=1)
def get_max_conf(probs):
return np.max(probs, axis=-1)
def find_ATC_threshold(scores, labels):
sorted_idx = np.argsort(scores)
sorted_scores = scores[sorted_idx]
sorted_labels = labels[sorted_idx]
fp = np.sum(labels == 0)
fn = 0.0
min_fp_fn = np.abs(fp - fn)
thres = 0.0
for i in range(len(labels)):
if sorted_labels[i] == 0:
fp -= 1
else:
fn += 1
if np.abs(fp - fn) < min_fp_fn:
min_fp_fn = np.abs(fp - fn)
thres = sorted_scores[i]
return min_fp_fn, thres
def get_ATC_acc(thres, scores):
return np.mean(scores >= thres)
def get_ATC_f1(thres, scores, probs):
preds = np.argmax(probs, axis=-1)
estim_y = np.abs(1 - (scores >= thres) ^ preds)
return f1_score(estim_y, preds)
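
A minimal usage sketch for the ATC helpers above, on random stand-in data; the variable names and the choice of max-confidence as the score function are illustrative assumptions:

```python
import numpy as np

# Stand-in data: softmax outputs on a labeled validation set and an unlabeled test set.
rng = np.random.default_rng(0)
val_probs = rng.dirichlet(np.ones(2), size=500)
val_labels = rng.integers(0, 2, size=500)
test_probs = rng.dirichlet(np.ones(2), size=500)

# Fit the threshold so that "score >= thres" balances false positives and false
# negatives against prediction correctness on the validation set.
val_scores = get_max_conf(val_probs)
correct = (np.argmax(val_probs, axis=-1) == val_labels).astype(int)
_, thres = find_ATC_threshold(val_scores, correct)

# Estimated test accuracy: fraction of test points whose score clears the threshold.
estim_acc = get_ATC_acc(thres, get_max_conf(test_probs))
print(estim_acc)
```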


@@ -1,277 +1,277 @@
"""
Relative Unconstrained Least-Squares Fitting (RuLSIF): A Python Implementation
References:
'Change-point detection in time-series data by relative density-ratio estimation'
Song Liu, Makoto Yamada, Nigel Collier and Masashi Sugiyama,
Neural Networks 43 (2013) 72-83.
'A Least-squares Approach to Direct Importance Estimation'
Takafumi Kanamori, Shohei Hido, and Masashi Sugiyama,
Journal of Machine Learning Research 10 (2009) 1391-1445.
"""
from warnings import warn
from numpy import (
array,
asarray,
asmatrix,
diag,
diagflat,
empty,
exp,
inf,
log,
matrix,
multiply,
ones,
power,
sum,
)
from numpy.linalg import solve
from numpy.random import randint
from .density_ratio import DensityRatio, KernelInfo
from .helpers import guvectorize_compute, np_float, to_ndarray
def RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num=100, verbose=True):
"""
Estimation of the alpha-Relative Density Ratio p(x)/p_alpha(x) by RuLSIF
(Relative Unconstrained Least-Square Importance Fitting)
p_alpha(x) = alpha * p(x) + (1 - alpha) * q(x)
Arguments:
x (numpy.matrix): Sample from p(x).
y (numpy.matrix): Sample from q(x).
alpha (float): Mixture parameter.
sigma_range (list<float>): Search range of Gaussian kernel bandwidth.
lambda_range (list<float>): Search range of regularization parameter.
kernel_num (int): Number of kernels. (Default 100)
verbose (bool): Indicator to print messages (Default True)
Returns:
densratio.DensityRatio object which has `compute_density_ratio()`.
"""
# Number of samples.
nx = x.shape[0]
ny = y.shape[0]
# Number of kernel functions.
kernel_num = min(kernel_num, nx)
# Randomly take a subset of x, to identify centers for the kernels.
centers = x[randint(nx, size=kernel_num)]
if verbose:
print("RuLSIF starting...")
if len(sigma_range) == 1 and len(lambda_range) == 1:
sigma = sigma_range[0]
lambda_ = lambda_range[0]
else:
if verbose:
print("Searching for the optimal sigma and lambda...")
# Grid-search cross-validation for optimal kernel and regularization parameters.
opt_params = search_sigma_and_lambda(
x, y, alpha, centers, sigma_range, lambda_range, verbose
)
sigma = opt_params["sigma"]
lambda_ = opt_params["lambda"]
if verbose:
print(
"Found optimal sigma = {:.3f}, lambda = {:.3f}.".format(sigma, lambda_)
)
if verbose:
print("Optimizing theta...")
phi_x = compute_kernel_Gaussian(x, centers, sigma)
phi_y = compute_kernel_Gaussian(y, centers, sigma)
H = alpha * (phi_x.T.dot(phi_x) / nx) + (1 - alpha) * (phi_y.T.dot(phi_y) / ny)
h = phi_x.mean(axis=0).T
theta = asarray(solve(H + diag(array(lambda_).repeat(kernel_num)), h)).ravel()
# No negative coefficients.
theta[theta < 0] = 0
# Compute the alpha-relative density ratio, at the given coordinates.
def alpha_density_ratio(coordinates):
# Evaluate the kernel at these coordinates, and take the dot-product with the weights.
coordinates = to_ndarray(coordinates)
phi_x = compute_kernel_Gaussian(coordinates, centers, sigma)
alpha_density_ratio = phi_x @ theta
return alpha_density_ratio
# Compute the approximate alpha-relative PE-divergence, given samples x and y from the respective distributions.
def alpha_PE_divergence(x, y):
# This is Y, in Reference 1.
x = to_ndarray(x)
# Obtain alpha-relative density ratio at these points.
g_x = alpha_density_ratio(x)
# This is Y', in Reference 1.
y = to_ndarray(y)
# Obtain alpha-relative density ratio at these points.
g_y = alpha_density_ratio(y)
# Compute the alpha-relative PE-divergence as given in Reference 1.
n = x.shape[0]
divergence = (
-alpha * (g_x @ g_x) / 2 - (1 - alpha) * (g_y @ g_y) / 2 + g_x.sum(axis=0)
) / n - 1.0 / 2
return divergence
# Compute the approximate alpha-relative KL-divergence, given samples x and y from the respective distributions.
def alpha_KL_divergence(x, y):
# This is Y, in Reference 1.
x = to_ndarray(x)
# Obtain alpha-relative density ratio at these points.
g_x = alpha_density_ratio(x)
# Compute the alpha-relative KL-divergence.
n = x.shape[0]
divergence = log(g_x).sum(axis=0) / n
return divergence
alpha_PE = alpha_PE_divergence(x, y)
alpha_KL = alpha_KL_divergence(x, y)
if verbose:
print("Approximate alpha-relative PE-divergence = {:03.2f}".format(alpha_PE))
print("Approximate alpha-relative KL-divergence = {:03.2f}".format(alpha_KL))
kernel_info = KernelInfo(
kernel_type="Gaussian", kernel_num=kernel_num, sigma=sigma, centers=centers
)
result = DensityRatio(
method="RuLSIF",
alpha=alpha,
theta=theta,
lambda_=lambda_,
alpha_PE=alpha_PE,
alpha_KL=alpha_KL,
kernel_info=kernel_info,
compute_density_ratio=alpha_density_ratio,
)
if verbose:
print("RuLSIF completed.")
return result
# Grid-search cross-validation for the optimal parameters sigma and lambda by leave-one-out cross-validation. See Reference 2.
def search_sigma_and_lambda(x, y, alpha, centers, sigma_range, lambda_range, verbose):
nx = x.shape[0]
ny = y.shape[0]
n_min = min(nx, ny)
kernel_num = centers.shape[0]
score_new = inf
sigma_new = 0
lambda_new = 0
for sigma in sigma_range:
phi_x = compute_kernel_Gaussian(x, centers, sigma) # (nx, kernel_num)
phi_y = compute_kernel_Gaussian(y, centers, sigma) # (ny, kernel_num)
H = alpha * (phi_x.T @ phi_x / nx) + (1 - alpha) * (
phi_y.T @ phi_y / ny
) # (kernel_num, kernel_num)
h = phi_x.mean(axis=0).reshape(-1, 1) # (kernel_num, 1)
phi_x = phi_x[:n_min].T # (kernel_num, n_min)
phi_y = phi_y[:n_min].T # (kernel_num, n_min)
for lambda_ in lambda_range:
B = H + diag(
array(lambda_ * (ny - 1) / ny).repeat(kernel_num)
) # (kernel_num, kernel_num)
B_inv_X = solve(B, phi_y) # (kernel_num, n_min)
X_B_inv_X = multiply(phi_y, B_inv_X) # (kernel_num, n_min)
denom = ny * ones(n_min) - ones(kernel_num) @ X_B_inv_X # (n_min, )
B0 = solve(B, h @ ones((1, n_min))) + B_inv_X @ diagflat(
h.T @ B_inv_X / denom
) # (kernel_num, n_min)
B1 = solve(B, phi_x) + B_inv_X @ diagflat(
ones(kernel_num) @ multiply(phi_x, B_inv_X)
) # (kernel_num, n_min)
B2 = (ny - 1) * (nx * B0 - B1) / (ny * (nx - 1)) # (kernel_num, n_min)
B2[B2 < 0] = 0
r_y = multiply(phi_y, B2).sum(axis=0).T # (n_min, )
r_x = multiply(phi_x, B2).sum(axis=0).T # (n_min, )
# Squared loss of RuLSIF, without regularization term.
# Directly related to the negative of the PE-divergence.
score = (r_y @ r_y / 2 - r_x.sum(axis=0)) / n_min
if verbose:
print(
"sigma = %.5f, lambda = %.5f, score = %.5f"
% (sigma, lambda_, score)
)
if score < score_new:
score_new = score
sigma_new = sigma
lambda_new = lambda_
return {"sigma": sigma_new, "lambda": lambda_new}
def _compute_kernel_Gaussian(x_list, y_row, neg_gamma, res) -> None:
sq_norm = sum(power(x_list - y_row, 2), 1)
multiply(neg_gamma, sq_norm, res)
exp(res, res)
def _target_numpy_wrapper(x_list, y_list, neg_gamma):
res = empty((y_list.shape[0], x_list.shape[0]), np_float)
if isinstance(x_list, matrix) or isinstance(y_list, matrix):
res = asmatrix(res)
for j, y_row in enumerate(y_list):
# `.T` aligns shapes for matrices, does nothing for 1D ndarray.
_compute_kernel_Gaussian(x_list, y_row, neg_gamma, res[j].T)
return res
_compute_functions = {"numpy": _target_numpy_wrapper}
if guvectorize_compute:
_compute_functions.update(
{
key: guvectorize_compute(key)(_compute_kernel_Gaussian)
for key in ("cpu", "parallel")
}
)
_compute_function = _compute_functions[
"cpu" if "cpu" in _compute_functions else "numpy"
]
# Returns a 2D numpy matrix of kernel evaluated at the gridpoints with coordinates from x_list and y_list.
def compute_kernel_Gaussian(x_list, y_list, sigma):
return _compute_function(x_list, y_list, -0.5 * sigma**-2).T
def set_compute_kernel_target(target: str) -> None:
global _compute_function
if target not in ("numpy", "cpu", "parallel"):
raise ValueError(
"'target' must be one of the following: 'numpy', 'cpu', or 'parallel'."
)
if target not in _compute_functions:
warn("'numba' not available; defaulting to 'numpy'.", ImportWarning)
target = "numpy"
_compute_function = _compute_functions[target]
"""
Relative Unconstrained Least-Squares Fitting (RuLSIF): A Python Implementation
References:
'Change-point detection in time-series data by relative density-ratio estimation'
Song Liu, Makoto Yamada, Nigel Collier and Masashi Sugiyama,
Neural Networks 43 (2013) 72-83.
'A Least-squares Approach to Direct Importance Estimation'
Takafumi Kanamori, Shohei Hido, and Masashi Sugiyama,
Journal of Machine Learning Research 10 (2009) 1391-1445.
"""
from warnings import warn
from numpy import (
array,
asarray,
asmatrix,
diag,
diagflat,
empty,
exp,
inf,
log,
matrix,
multiply,
ones,
power,
sum,
)
from numpy.linalg import solve
from numpy.random import randint
from .density_ratio import DensityRatio, KernelInfo
from .helpers import guvectorize_compute, np_float, to_ndarray
def RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num=100, verbose=True):
"""
Estimation of the alpha-Relative Density Ratio p(x)/p_alpha(x) by RuLSIF
(Relative Unconstrained Least-Square Importance Fitting)
p_alpha(x) = alpha * p(x) + (1 - alpha) * q(x)
Arguments:
x (numpy.matrix): Sample from p(x).
y (numpy.matrix): Sample from q(x).
alpha (float): Mixture parameter.
sigma_range (list<float>): Search range of Gaussian kernel bandwidth.
lambda_range (list<float>): Search range of regularization parameter.
kernel_num (int): Number of kernels. (Default 100)
verbose (bool): Indicator to print messages (Default True)
Returns:
densratio.DensityRatio object which has `compute_density_ratio()`.
"""
# Number of samples.
nx = x.shape[0]
ny = y.shape[0]
# Number of kernel functions.
kernel_num = min(kernel_num, nx)
# Randomly take a subset of x, to identify centers for the kernels.
centers = x[randint(nx, size=kernel_num)]
if verbose:
print("RuLSIF starting...")
if len(sigma_range) == 1 and len(lambda_range) == 1:
sigma = sigma_range[0]
lambda_ = lambda_range[0]
else:
if verbose:
print("Searching for the optimal sigma and lambda...")
# Grid-search cross-validation for optimal kernel and regularization parameters.
opt_params = search_sigma_and_lambda(
x, y, alpha, centers, sigma_range, lambda_range, verbose
)
sigma = opt_params["sigma"]
lambda_ = opt_params["lambda"]
if verbose:
print(
"Found optimal sigma = {:.3f}, lambda = {:.3f}.".format(sigma, lambda_)
)
if verbose:
print("Optimizing theta...")
phi_x = compute_kernel_Gaussian(x, centers, sigma)
phi_y = compute_kernel_Gaussian(y, centers, sigma)
H = alpha * (phi_x.T.dot(phi_x) / nx) + (1 - alpha) * (phi_y.T.dot(phi_y) / ny)
h = phi_x.mean(axis=0).T
theta = asarray(solve(H + diag(array(lambda_).repeat(kernel_num)), h)).ravel()
# No negative coefficients.
theta[theta < 0] = 0
# Compute the alpha-relative density ratio, at the given coordinates.
def alpha_density_ratio(coordinates):
# Evaluate the kernel at these coordinates, and take the dot-product with the weights.
coordinates = to_ndarray(coordinates)
phi_x = compute_kernel_Gaussian(coordinates, centers, sigma)
alpha_density_ratio = phi_x @ theta
return alpha_density_ratio
# Compute the approximate alpha-relative PE-divergence, given samples x and y from the respective distributions.
def alpha_PE_divergence(x, y):
# This is Y, in Reference 1.
x = to_ndarray(x)
# Obtain alpha-relative density ratio at these points.
g_x = alpha_density_ratio(x)
# This is Y', in Reference 1.
y = to_ndarray(y)
# Obtain alpha-relative density ratio at these points.
g_y = alpha_density_ratio(y)
# Compute the alpha-relative PE-divergence as given in Reference 1.
n = x.shape[0]
divergence = (
-alpha * (g_x @ g_x) / 2 - (1 - alpha) * (g_y @ g_y) / 2 + g_x.sum(axis=0)
) / n - 1.0 / 2
return divergence
# Compute the approximate alpha-relative KL-divergence, given samples x and y from the respective distributions.
def alpha_KL_divergence(x, y):
# This is Y, in Reference 1.
x = to_ndarray(x)
# Obtain alpha-relative density ratio at these points.
g_x = alpha_density_ratio(x)
# Compute the alpha-relative KL-divergence.
n = x.shape[0]
divergence = log(g_x).sum(axis=0) / n
return divergence
alpha_PE = alpha_PE_divergence(x, y)
alpha_KL = alpha_KL_divergence(x, y)
if verbose:
print("Approximate alpha-relative PE-divergence = {:03.2f}".format(alpha_PE))
print("Approximate alpha-relative KL-divergence = {:03.2f}".format(alpha_KL))
kernel_info = KernelInfo(
kernel_type="Gaussian", kernel_num=kernel_num, sigma=sigma, centers=centers
)
result = DensityRatio(
method="RuLSIF",
alpha=alpha,
theta=theta,
lambda_=lambda_,
alpha_PE=alpha_PE,
alpha_KL=alpha_KL,
kernel_info=kernel_info,
compute_density_ratio=alpha_density_ratio,
)
if verbose:
print("RuLSIF completed.")
return result
# Grid-search cross-validation for the optimal parameters sigma and lambda by leave-one-out cross-validation. See Reference 2.
def search_sigma_and_lambda(x, y, alpha, centers, sigma_range, lambda_range, verbose):
nx = x.shape[0]
ny = y.shape[0]
n_min = min(nx, ny)
kernel_num = centers.shape[0]
score_new = inf
sigma_new = 0
lambda_new = 0
for sigma in sigma_range:
phi_x = compute_kernel_Gaussian(x, centers, sigma) # (nx, kernel_num)
phi_y = compute_kernel_Gaussian(y, centers, sigma) # (ny, kernel_num)
H = alpha * (phi_x.T @ phi_x / nx) + (1 - alpha) * (
phi_y.T @ phi_y / ny
) # (kernel_num, kernel_num)
h = phi_x.mean(axis=0).reshape(-1, 1) # (kernel_num, 1)
phi_x = phi_x[:n_min].T # (kernel_num, n_min)
phi_y = phi_y[:n_min].T # (kernel_num, n_min)
for lambda_ in lambda_range:
B = H + diag(
array(lambda_ * (ny - 1) / ny).repeat(kernel_num)
) # (kernel_num, kernel_num)
B_inv_X = solve(B, phi_y) # (kernel_num, n_min)
X_B_inv_X = multiply(phi_y, B_inv_X) # (kernel_num, n_min)
denom = ny * ones(n_min) - ones(kernel_num) @ X_B_inv_X # (n_min, )
B0 = solve(B, h @ ones((1, n_min))) + B_inv_X @ diagflat(
h.T @ B_inv_X / denom
) # (kernel_num, n_min)
B1 = solve(B, phi_x) + B_inv_X @ diagflat(
ones(kernel_num) @ multiply(phi_x, B_inv_X)
) # (kernel_num, n_min)
B2 = (ny - 1) * (nx * B0 - B1) / (ny * (nx - 1)) # (kernel_num, n_min)
B2[B2 < 0] = 0
r_y = multiply(phi_y, B2).sum(axis=0).T # (n_min, )
r_x = multiply(phi_x, B2).sum(axis=0).T # (n_min, )
# Squared loss of RuLSIF, without regularization term.
# Directly related to the negative of the PE-divergence.
score = (r_y @ r_y / 2 - r_x.sum(axis=0)) / n_min
if verbose:
print(
"sigma = %.5f, lambda = %.5f, score = %.5f"
% (sigma, lambda_, score)
)
if score < score_new:
score_new = score
sigma_new = sigma
lambda_new = lambda_
return {"sigma": sigma_new, "lambda": lambda_new}
def _compute_kernel_Gaussian(x_list, y_row, neg_gamma, res) -> None:
sq_norm = sum(power(x_list - y_row, 2), 1)
multiply(neg_gamma, sq_norm, res)
exp(res, res)
def _target_numpy_wrapper(x_list, y_list, neg_gamma):
res = empty((y_list.shape[0], x_list.shape[0]), np_float)
if isinstance(x_list, matrix) or isinstance(y_list, matrix):
res = asmatrix(res)
for j, y_row in enumerate(y_list):
# `.T` aligns shapes for matrices, does nothing for 1D ndarray.
_compute_kernel_Gaussian(x_list, y_row, neg_gamma, res[j].T)
return res
_compute_functions = {"numpy": _target_numpy_wrapper}
if guvectorize_compute:
_compute_functions.update(
{
key: guvectorize_compute(key)(_compute_kernel_Gaussian)
for key in ("cpu", "parallel")
}
)
_compute_function = _compute_functions[
"cpu" if "cpu" in _compute_functions else "numpy"
]
# Returns a 2D numpy matrix of kernel evaluated at the gridpoints with coordinates from x_list and y_list.
def compute_kernel_Gaussian(x_list, y_list, sigma):
return _compute_function(x_list, y_list, -0.5 * sigma**-2).T
def set_compute_kernel_target(target: str) -> None:
global _compute_function
if target not in ("numpy", "cpu", "parallel"):
raise ValueError(
"'target' must be one of the following: 'numpy', 'cpu', or 'parallel'."
)
if target not in _compute_functions:
warn("'numba' not available; defaulting to 'numpy'.", ImportWarning)
target = "numpy"
_compute_function = _compute_functions[target]
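
For reference, a compact statement of what `RuLSIF(...)` above computes; the notation ($\Phi$, $b$, $c_l$) is introduced here for exposition and is not part of the diff:

$$
\hat r_\alpha(x) = \sum_{l=1}^{b} \theta_l \exp\!\left(-\frac{\lVert x - c_l \rVert^2}{2\sigma^2}\right) \approx \frac{p(x)}{\alpha\, p(x) + (1-\alpha)\, q(x)},
\qquad
\theta = \max\!\big(0,\ (H + \lambda I)^{-1} h\big),
$$

$$
H = \frac{\alpha}{n_x}\,\Phi_x^\top \Phi_x + \frac{1-\alpha}{n_y}\,\Phi_y^\top \Phi_y,
\qquad
h = \frac{1}{n_x}\,\Phi_x^\top \mathbf{1},
$$

where $\Phi_x$ and $\Phi_y$ are the Gaussian-kernel design matrices of the two samples over the $b$ centers $c_l$; this matches the `solve(...)` call and the clipping `theta[theta < 0] = 0` in the code.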


@@ -1,7 +1,7 @@
from warnings import filterwarnings
from .core import densratio
from .RuLSIF import set_compute_kernel_target
filterwarnings("default", message="'numba'", category=ImportWarning, module="densratio")
__all__ = ["densratio", "set_compute_kernel_target"]


@@ -1,70 +1,70 @@
"""
densratio.core
~~~~~~~~~~~~~~
Estimate Density Ratio p(x)/q(x)
"""
from numpy import linspace
from .helpers import to_ndarray
from .RuLSIF import RuLSIF
def densratio(
x, y, alpha=0, sigma_range="auto", lambda_range="auto", kernel_num=100, verbose=True
):
"""Estimate alpha-mixture Density Ratio p(x)/(alpha*p(x) + (1 - alpha)*q(x))
Arguments:
x: sample from p(x).
y: sample from q(x).
alpha: Default 0 - corresponds to ordinary density ratio.
sigma_range: search range of Gaussian kernel bandwidth.
Default "auto" means 10^-3, 10^-2, ..., 10^9.
lambda_range: search range of regularization parameter for uLSIF.
Default "auto" means 10^-3, 10^-2, ..., 10^9.
kernel_num: number of kernels. Default 100.
verbose: indicator to print messages. Default True.
Returns:
densratio.DensityRatio object which has `compute_density_ratio()`.
Raises:
ValueError: if dimension of x != dimension of y
Usage::
>>> from scipy.stats import norm
>>> from densratio import densratio
>>> x = norm.rvs(size=200, loc=1, scale=1./8)
>>> y = norm.rvs(size=200, loc=1, scale=1./2)
>>> result = densratio(x, y, alpha=0.7)
>>> print(result)
>>> density_ratio = result.compute_density_ratio(y)
>>> print(density_ratio)
"""
x = to_ndarray(x)
y = to_ndarray(y)
if x.shape[1] != y.shape[1]:
raise ValueError("x and y must be same dimensions.")
if isinstance(sigma_range, str) and sigma_range != "auto":
raise TypeError("Invalid value for sigma_range.")
if isinstance(lambda_range, str) and lambda_range != "auto":
raise TypeError("Invalid value for lambda_range.")
if sigma_range is None or (isinstance(sigma_range, str) and sigma_range == "auto"):
sigma_range = 10 ** linspace(-3, 9, 13)
if lambda_range is None or (
isinstance(lambda_range, str) and lambda_range == "auto"
):
lambda_range = 10 ** linspace(-3, 9, 13)
result = RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num, verbose)
return result
"""
densratio.core
~~~~~~~~~~~~~~
Estimate Density Ratio p(x)/q(y)
"""
from numpy import linspace
from .helpers import to_ndarray
from .RuLSIF import RuLSIF
def densratio(
x, y, alpha=0, sigma_range="auto", lambda_range="auto", kernel_num=100, verbose=True
):
"""Estimate alpha-mixture Density Ratio p(x)/(alpha*p(x) + (1 - alpha)*q(x))
Arguments:
x: sample from p(x).
y: sample from q(x).
alpha: Default 0 - corresponds to ordinary density ratio.
sigma_range: search range of Gaussian kernel bandwidth.
Default "auto" means 10^-3, 10^-2, ..., 10^9.
lambda_range: search range of regularization parameter for uLSIF.
Default "auto" means 10^-3, 10^-2, ..., 10^9.
kernel_num: number of kernels. Default 100.
verbose: indicator to print messages. Default True.
Returns:
densratio.DensityRatio object which has `compute_density_ratio()`.
Raises:
ValueError: if dimension of x != dimension of y
Usage::
>>> from scipy.stats import norm
>>> from densratio import densratio
>>> x = norm.rvs(size=200, loc=1, scale=1./8)
>>> y = norm.rvs(size=200, loc=1, scale=1./2)
>>> result = densratio(x, y, alpha=0.7)
>>> print(result)
>>> density_ratio = result.compute_density_ratio(y)
>>> print(density_ratio)
"""
x = to_ndarray(x)
y = to_ndarray(y)
if x.shape[1] != y.shape[1]:
raise ValueError("x and y must be same dimensions.")
if isinstance(sigma_range, str) and sigma_range != "auto":
raise TypeError("Invalid value for sigma_range.")
if isinstance(lambda_range, str) and lambda_range != "auto":
raise TypeError("Invalid value for lambda_range.")
if sigma_range is None or (isinstance(sigma_range, str) and sigma_range == "auto"):
sigma_range = 10 ** linspace(-3, 9, 13)
if lambda_range is None or (
isinstance(lambda_range, str) and lambda_range == "auto"
):
lambda_range = 10 ** linspace(-3, 9, 13)
result = RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num, verbose)
return result


@@ -1,88 +1,88 @@
from pprint import pformat
from re import sub
class DensityRatio:
"""Density Ratio."""
def __init__(
self,
method,
alpha,
theta,
lambda_,
alpha_PE,
alpha_KL,
kernel_info,
compute_density_ratio,
):
self.method = method
self.alpha = alpha
self.theta = theta
self.lambda_ = lambda_
self.alpha_PE = alpha_PE
self.alpha_KL = alpha_KL
self.kernel_info = kernel_info
self.compute_density_ratio = compute_density_ratio
def __str__(self):
return """
Method: %(method)s
Alpha: %(alpha)s
Kernel Information:
%(kernel_info)s
Kernel Weights (theta):
%(theta)s
Regularization Parameter (lambda): %(lambda_)s
Alpha-Relative PE-Divergence: %(alpha_PE)s
Alpha-Relative KL-Divergence: %(alpha_KL)s
Function to Estimate Density Ratio:
compute_density_ratio(x)
"""[
1:-1
] % dict(
method=self.method,
kernel_info=self.kernel_info,
alpha=self.alpha,
theta=my_format(self.theta),
lambda_=self.lambda_,
alpha_PE=self.alpha_PE,
alpha_KL=self.alpha_KL,
)
class KernelInfo:
"""Kernel Information."""
def __init__(self, kernel_type, kernel_num, sigma, centers):
self.kernel_type = kernel_type
self.kernel_num = kernel_num
self.sigma = sigma
self.centers = centers
def __str__(self):
return """
Kernel type: %(kernel_type)s
Number of kernels: %(kernel_num)s
Bandwidth(sigma): %(sigma)s
Centers: %(centers)s
"""[
1:-1
] % dict(
kernel_type=self.kernel_type,
kernel_num=self.kernel_num,
sigma=self.sigma,
centers=my_format(self.centers),
)
def my_format(str):
return sub(r"\s+", " ", (pformat(str).split("\n")[0] + ".."))


@@ -1,36 +1,36 @@
from numpy import array, ndarray, result_type
np_float = result_type(float)
try:
import numba as nb
except ModuleNotFoundError:
guvectorize_compute = None
else:
_nb_float = nb.from_dtype(np_float)
def guvectorize_compute(target: str, *, cache: bool = True):
return nb.guvectorize(
[nb.void(_nb_float[:, :], _nb_float[:], _nb_float, _nb_float[:])],
"(m, p),(p),()->(m)",
nopython=True,
target=target,
cache=cache,
)
def is_numeric(x):
return isinstance(x, int) or isinstance(x, float)
def to_ndarray(x):
if isinstance(x, ndarray):
if len(x.shape) == 1:
return x.reshape(-1, 1)
else:
return x
elif str(type(x)) == "<class 'pandas.core.frame.DataFrame'>":
return x.values
elif not x:
raise ValueError("Cannot transform to numpy.matrix.")
else:
return to_ndarray(array(x))


@@ -1,4 +1,4 @@
import numpy as np
def get_doc(probs1, probs2):
return np.mean(probs2) - np.mean(probs1)
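
`get_doc` computes what appears to be the Difference of Confidences (DoC) statistic: the gap between the mean confidences of two samples. A hypothetical usage with max-softmax confidences (all variable names are illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)
val_probs = rng.dirichlet(np.ones(2), size=1000)   # softmax outputs on validation
test_probs = rng.dirichlet(np.ones(2), size=1000)  # softmax outputs on test

# DoC: mean max-confidence on test minus mean max-confidence on validation.
doc = get_doc(np.max(val_probs, axis=-1), np.max(test_probs, axis=-1))
print(doc)
```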


@@ -1,66 +1,66 @@
import numpy as np
from scipy.sparse import issparse, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from baselines import densratio
from baselines.pykliep import DensityRatioEstimator
def kliep(Xtr, ytr, Xte):
kliep = DensityRatioEstimator()
kliep.fit(Xtr, Xte)
return kliep.predict(Xtr)
def usilf(Xtr, ytr, Xte, alpha=0.0):
dense_ratio_obj = densratio(Xtr, Xte, alpha=alpha, verbose=False)
return dense_ratio_obj.compute_density_ratio(Xtr)
def logreg(Xtr, ytr, Xte):
# check "Direct Density Ratio Estimation for
# Large-scale Covariate Shift Adaptation", Eq.28
if issparse(Xtr):
X = vstack([Xtr, Xte])
else:
X = np.concatenate([Xtr, Xte])
y = [0] * Xtr.shape[0] + [1] * Xte.shape[0]
logreg = GridSearchCV(
LogisticRegression(),
param_grid={"C": np.logspace(-3, 3, 7), "class_weight": ["balanced", None]},
n_jobs=-1,
)
logreg.fit(X, y)
probs = logreg.predict_proba(Xtr)
prob_train, prob_test = probs[:, 0], probs[:, 1]
prior_train = Xtr.shape[0]
prior_test = Xte.shape[0]
w = (prior_train / prior_test) * (prob_test / prob_train)
return w
kdex2_params = {"bandwidth": np.logspace(-1, 1, 20)}
def kdex2_lltr(Xtr):
if issparse(Xtr):
Xtr = Xtr.toarray()
return GridSearchCV(KernelDensity(), kdex2_params).fit(Xtr).score_samples(Xtr)
def kdex2_weights(Xtr, Xte, log_likelihood_tr):
log_likelihood_te = (
GridSearchCV(KernelDensity(), kdex2_params).fit(Xte).score_samples(Xtr)
)
likelihood_tr = np.exp(log_likelihood_tr)
likelihood_te = np.exp(log_likelihood_te)
return likelihood_te / likelihood_tr
def get_acc(tr_preds, ytr, w):
return np.sum((1.0 * (tr_preds == ytr)) * w) / np.sum(w)
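
A minimal sketch of how these helpers compose into an importance-weighted accuracy estimate, on synthetic covariate-shifted data; the data generation and the choice of `logreg` as the weighting method are illustrative assumptions:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# Synthetic covariate shift: the test sample is drawn from a shifted distribution.
rng = np.random.default_rng(0)
Xtr = rng.normal(0.0, 1.0, size=(500, 5))
ytr = (Xtr[:, 0] > 0).astype(int)
Xte = rng.normal(0.5, 1.0, size=(300, 5))

clf = LogisticRegression().fit(Xtr, ytr)

# Importance weights on the training points, then weighted training accuracy
# as an estimate of accuracy under the test distribution.
w = logreg(Xtr, ytr, Xte)
estim_acc = get_acc(clf.predict(Xtr), ytr, w)
print(estim_acc)
```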


@@ -1,140 +1,140 @@
# import itertools
# from typing import Iterable
# import quapy as qp
# import quapy.functional as F
# from densratio import densratio
# from quapy.method.aggregative import *
# from quapy.protocol import (
# AbstractStochasticSeededProtocol,
# OnLabelledCollectionProtocol,
# )
# from scipy.sparse import issparse, vstack
# from scipy.spatial.distance import cdist
# from scipy.stats import multivariate_normal
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.neighbors import KernelDensity
import time
import numpy as np
import sklearn.metrics as metrics
from pykliep import DensityRatioEstimator
from quapy.protocol import APP
from scipy.sparse import issparse, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import baselines.impweight as iw
from baselines.densratio import densratio
from quacc.dataset import Dataset
# ---------------------------------------------------------------------------------------
# Methods of "importance weight", e.g., by density-ratio estimation (KLIEP, USILF, LogReg)
# ---------------------------------------------------------------------------------------
class ImportanceWeight:
def weights(self, Xtr, ytr, Xte):
...
class KLIEP(ImportanceWeight):
def __init__(self):
pass
def weights(self, Xtr, ytr, Xte):
kliep = DensityRatioEstimator()
kliep.fit(Xtr, Xte)
return kliep.predict(Xtr)
class USILF(ImportanceWeight):
def __init__(self, alpha=0.0):
self.alpha = alpha
def weights(self, Xtr, ytr, Xte):
dense_ratio_obj = densratio(Xtr, Xte, alpha=self.alpha, verbose=False)
return dense_ratio_obj.compute_density_ratio(Xtr)
class LogReg(ImportanceWeight):
def __init__(self):
pass
def weights(self, Xtr, ytr, Xte):
# check "Direct Density Ratio Estimation for
# Large-scale Covariate Shift Adaptation", Eq.28
if issparse(Xtr):
X = vstack([Xtr, Xte])
else:
X = np.concatenate([Xtr, Xte])
y = [0] * Xtr.shape[0] + [1] * Xte.shape[0]
logreg = GridSearchCV(
LogisticRegression(),
param_grid={"C": np.logspace(-3, 3, 7), "class_weight": ["balanced", None]},
n_jobs=-1,
)
logreg.fit(X, y)
probs = logreg.predict_proba(Xtr)
prob_train, prob_test = probs[:, 0], probs[:, 1]
prior_train = Xtr.shape[0]
prior_test = Xte.shape[0]
w = (prior_train / prior_test) * (prob_test / prob_train)
return w
class KDEx2(ImportanceWeight):
def __init__(self):
pass
def weights(self, Xtr, ytr, Xte):
params = {"bandwidth": np.logspace(-1, 1, 20)}
log_likelihood_tr = (
GridSearchCV(KernelDensity(), params).fit(Xtr).score_samples(Xtr)
)
log_likelihood_te = (
GridSearchCV(KernelDensity(), params).fit(Xte).score_samples(Xtr)
)
likelihood_tr = np.exp(log_likelihood_tr)
likelihood_te = np.exp(log_likelihood_te)
return likelihood_te / likelihood_tr
if __name__ == "__main__":
# d = Dataset("rcv1", target="CCAT").get_raw()
d = Dataset("imdb", n_prevalences=1).get()[0]
tstart = time.time()
lr = LogisticRegression()
lr.fit(*d.train.Xy)
val_preds = lr.predict(d.validation.X)
protocol = APP(
d.test,
n_prevalences=21,
repeats=1,
sample_size=100,
return_type="labelled_collection",
)
results = []
for sample in protocol():
wx = iw.kliep(d.validation.X, d.validation.y, sample.X)
test_preds = lr.predict(sample.X)
estim_acc = np.sum((1.0 * (val_preds == d.validation.y)) * wx) / np.sum(wx)
true_acc = metrics.accuracy_score(sample.y, test_preds)
results.append((sample.prevalence(), estim_acc, true_acc))
tend = time.time()
for r in results:
print(*r)
print(f"logreg finished [took {tend-tstart:.3f}s]")
import win11toast
win11toast.notify("models.py", "Completed")


@@ -1,221 +1,221 @@
import warnings
import numpy as np
from scipy.sparse import csr_matrix
class DensityRatioEstimator:
"""
    Class implementing direct density-ratio estimation via the original KLIEP
    algorithm from "Direct Importance Estimation with Model Selection
    and Its Application to Covariate Shift Adaptation" by Sugiyama et al.
The training set is distributed via
train ~ p(x)
and the test set is distributed via
test ~ q(x).
The KLIEP algorithm and its variants approximate w(x) = q(x) / p(x) directly. The predict function returns the
estimate of w(x). The function w(x) can serve as sample weights for the training set during
training to modify the expectation function that the model's loss function is optimized via,
i.e.
E_{x ~ w(x)p(x)} loss(x) = E_{x ~ q(x)} loss(x).
    Usage:
The fit method is used to run the KLIEP algorithm using LCV and returns value of J
trained on the entire training/test set with the best sigma found.
Use the predict method on the training set to determine the sample weights from the KLIEP algorithm.
"""
def __init__(
self,
max_iter=5000,
num_params=[0.1, 0.2],
epsilon=1e-4,
cv=3,
sigmas=[0.01, 0.1, 0.25, 0.5, 0.75, 1],
random_state=None,
verbose=0,
):
"""
        Direct density-ratio estimation using an inner LCV loop to select the model. Can be used with sklearn
        cross-validation methods, with or without storing the inner CV, or with a standard grid search.
        max_iter : Number of iterations to perform.
        num_params : List of fractions of the test set used as basis vectors in the inner LCV
            approximation. Each entry must be a float; the original paper used 10%, i.e. 0.1.
        sigmas : List of sigmas to be used in the inner LCV loop.
        epsilon : Additive factor in the iterative algorithm, for numerical stability.
"""
self.max_iter = max_iter
self.num_params = num_params
self.epsilon = epsilon
self.verbose = verbose
self.sigmas = sigmas
self.cv = cv
        self.random_state = random_state
def fit(self, X_train, X_test, alpha_0=None):
"""Uses cross validation to select sigma as in the original paper (LCV).
In a break from sklearn convention, y=X_test.
The parameter cv corresponds to R in the original paper.
Once found, the best sigma is used to train on the full set."""
# LCV loop, shuffle a copy in place for performance.
cv = self.cv
chunk = int(X_test.shape[0] / float(cv))
if self.random_state is not None:
np.random.seed(self.random_state)
# if isinstance(X_test, csr_matrix):
# X_test_shuffled = X_test.toarray()
# else:
# X_test_shuffled = X_test.copy()
X_test_shuffled = X_test.copy()
X_test_index = np.arange(X_test_shuffled.shape[0])
np.random.shuffle(X_test_index)
X_test_shuffled = X_test_shuffled[X_test_index, :]
j_scores = {}
        if not isinstance(self.sigmas, list):
            self.sigmas = [self.sigmas]
        if not isinstance(self.num_params, list):
            self.num_params = [self.num_params]
if len(self.sigmas) * len(self.num_params) > 1:
# Inner LCV loop
for num_param in self.num_params:
for sigma in self.sigmas:
j_scores[(num_param, sigma)] = np.zeros(cv)
for k in range(1, cv + 1):
if self.verbose > 0:
print("Training: sigma: %s R: %s" % (sigma, k))
X_test_fold = X_test_shuffled[(k - 1) * chunk : k * chunk, :]
j_scores[(num_param, sigma)][k - 1] = self._fit(
X_train=X_train,
X_test=X_test_fold,
num_parameters=num_param,
sigma=sigma,
)
j_scores[(num_param, sigma)] = np.mean(j_scores[(num_param, sigma)])
sorted_scores = sorted(
[x for x in j_scores.items() if np.isfinite(x[1])],
key=lambda x: x[1],
reverse=True,
)
if len(sorted_scores) == 0:
warnings.warn("LCV failed to converge for all values of sigma.")
return self
self._sigma = sorted_scores[0][0][1]
self._num_parameters = sorted_scores[0][0][0]
self._j_scores = sorted_scores
else:
self._sigma = self.sigmas[0]
self._num_parameters = self.num_params[0]
# best sigma
self._j = self._fit(
X_train=X_train,
X_test=X_test_shuffled,
num_parameters=self._num_parameters,
sigma=self._sigma,
)
return self # Compatibility with sklearn
    def _fit(self, X_train, X_test, num_parameters, sigma, alpha_0=None):
        """Fits the estimator with the given parameters w-hat and returns J"""
        if isinstance(num_parameters, float):
            num_parameters = int(X_test.shape[0] * num_parameters)
self._select_param_vectors(
X_test=X_test, sigma=sigma, num_parameters=num_parameters
)
# if isinstance(X_train, csr_matrix):
# X_train = X_train.toarray()
X_train = self._reshape_X(X_train)
X_test = self._reshape_X(X_test)
if alpha_0 is None:
alpha_0 = np.ones(shape=(num_parameters, 1)) / float(num_parameters)
self._find_alpha(
X_train=X_train,
X_test=X_test,
num_parameters=num_parameters,
epsilon=self.epsilon,
alpha_0=alpha_0,
sigma=sigma,
)
return self._calculate_j(X_test, sigma=sigma)
    def _calculate_j(self, X_test, sigma):
        # small additive constant guards against log(0)
        pred = self.predict(X_test, sigma=sigma) + 1e-7
        log = np.log(pred).sum()
        return log / (X_test.shape[0])
def score(self, X_test):
"""Return the J score, similar to sklearn's API"""
return self._calculate_j(X_test=X_test, sigma=self._sigma)
@staticmethod
def _reshape_X(X):
"""Reshape input from mxn to mx1xn to take advantage of numpy broadcasting."""
if len(X.shape) != 3:
return X.reshape((X.shape[0], 1, X.shape[1]))
return X
def _select_param_vectors(self, X_test, sigma, num_parameters):
"""X_test is the test set. b is the number of parameters."""
indices = np.random.choice(X_test.shape[0], size=num_parameters, replace=False)
self._test_vectors = X_test[indices, :].copy()
self._phi_fitted = True
def _phi(self, X, sigma=None):
if sigma is None:
sigma = self._sigma
if self._phi_fitted:
return np.exp(
-np.sum((X - self._test_vectors) ** 2, axis=-1) / (2 * sigma**2)
)
raise Exception("Phi not fitted.")
    def _find_alpha(self, alpha_0, X_train, X_test, num_parameters, sigma, epsilon):
        A = self._phi(X_test, sigma)
        b = self._phi(X_train, sigma).sum(axis=0) / X_train.shape[0]
        b = b.reshape((num_parameters, 1))
        out = alpha_0.copy()
        for k in range(self.max_iter):
            mat = np.dot(A, out) + 1e-9  # numerical stability
            out += epsilon * np.dot(np.transpose(A), 1.0 / mat)
            out += b * ((1 - np.dot(np.transpose(b), out)) / np.dot(np.transpose(b), b))
            out = np.maximum(0, out)
            out /= np.dot(np.transpose(b), out)
        self._alpha = out
        self._fitted = True
def predict(self, X, sigma=None):
"""Equivalent of w(X) from the original paper."""
X = self._reshape_X(X)
if not self._fitted:
raise Exception("Not fitted!")
return np.dot(self._phi(X, sigma=sigma), self._alpha).reshape((X.shape[0],))
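# A minimal usage sketch (synthetic data; not part of the original module):
# fit() selects sigma by likelihood cross-validation (LCV), predict() returns
# the estimated importance weights w_hat(x) at the training points.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    X_train = rng.normal(0.0, 1.0, size=(300, 2))  # ~ p(x)
    X_test = rng.normal(0.3, 1.0, size=(300, 2))  # ~ q(x), shifted
    kliep = DensityRatioEstimator(cv=3, sigmas=[0.1, 0.5, 1.0])
    kliep.fit(X_train, X_test)
    print("mean weight:", kliep.predict(X_train).mean())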
@ -1,14 +1,14 @@
import numpy as np
from sklearn import clone
from sklearn.base import BaseEstimator
def clone_fit(c_model: BaseEstimator, data, labels):
    # fit a fresh copy of c_model, leaving the original estimator untouched
    c_model2 = clone(c_model)
    c_model2.fit(data, labels)
    return c_model2
def get_score(pred1, pred2, labels):
    # signed accuracy difference between two prediction vectors
    return np.mean((pred1 == labels).astype(int) - (pred2 == labels).astype(int))
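# A quick sanity check of get_score (hypothetical values): it is the signed
# accuracy gap, positive when pred1 is more accurate than pred2.
if __name__ == "__main__":
    labels = np.array([0, 1, 1, 0])
    pred1 = np.array([0, 1, 1, 1])  # 3/4 correct
    pred2 = np.array([1, 1, 0, 1])  # 1/4 correct
    assert get_score(pred1, pred2, labels) == 0.75 - 0.25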
conf.yaml
@ -1,233 +1,233 @@
debug_conf: &debug_conf
global:
METRICS:
- acc
DATASET_N_PREVS: 5
DATASET_PREVS:
# - 0.2
- 0.5
# - 0.8
confs:
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
plot_confs:
debug:
PLOT_ESTIMATORS:
- mulmc_sld
- atc_mc
PLOT_STDEV: true
mc_conf: &mc_conf
global:
METRICS:
- acc
DATASET_N_PREVS: 9
DATASET_DIR_UPDATE: true
confs:
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
# - DATASET_NAME: imdb
plot_confs:
debug3:
PLOT_ESTIMATORS:
- binmc_sld
- mulmc_sld
- binne_sld
- mulne_sld
- bin_sld_gs
- mul_sld_gs
- atc_mc
PLOT_STDEV: true
test_conf: &test_conf
global:
METRICS:
- acc
- f1
DATASET_N_PREVS: 9
confs:
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
# - DATASET_NAME: imdb
plot_confs:
gs_vs_gsq:
PLOT_ESTIMATORS:
- bin_sld
- bin_sld_gs
- bin_sld_gsq
- mul_sld
- mul_sld_gs
- mul_sld_gsq
gs_vs_atc:
PLOT_ESTIMATORS:
- bin_sld
- bin_sld_gs
- mul_sld
- mul_sld_gs
- atc_mc
- atc_ne
sld_vs_pacc:
PLOT_ESTIMATORS:
- bin_sld
- bin_sld_gs
- mul_sld
- mul_sld_gs
- bin_pacc
- bin_pacc_gs
- mul_pacc
- mul_pacc_gs
- atc_mc
- atc_ne
pacc_vs_atc:
PLOT_ESTIMATORS:
- bin_pacc
- bin_pacc_gs
- mul_pacc
- mul_pacc_gs
- atc_mc
- atc_ne
main_conf: &main_conf
global:
METRICS:
- acc
- f1
DATASET_N_PREVS: 9
DATASET_DIR_UPDATE: true
confs:
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
- DATASET_NAME: imdb
confs_next:
- DATASET_NAME: rcv1
DATASET_TARGET: GCAT
- DATASET_NAME: rcv1
DATASET_TARGET: MCAT
plot_confs:
gs_vs_qgs:
PLOT_ESTIMATORS:
- mul_sld_gs
- bin_sld_gs
- mul_sld_gsq
- bin_sld_gsq
- atc_mc
- atc_ne
PLOT_STDEV: true
plot_confs_completed:
max_conf_vs_atc_pacc:
PLOT_ESTIMATORS:
- bin_pacc
- binmc_pacc
- mul_pacc
- mulmc_pacc
- atc_mc
PLOT_STDEV: true
max_conf_vs_entropy_pacc:
PLOT_ESTIMATORS:
- binmc_pacc
- binne_pacc
- mulmc_pacc
- mulne_pacc
- atc_mc
PLOT_STDEV: true
gs_vs_atc:
PLOT_ESTIMATORS:
- mul_sld_gs
- bin_sld_gs
- mul_pacc_gs
- bin_pacc_gs
- atc_mc
- atc_ne
PLOT_STDEV: true
gs_vs_all:
PLOT_ESTIMATORS:
- mul_sld_gs
- bin_sld_gs
- mul_pacc_gs
- bin_pacc_gs
- atc_mc
- doc_feat
- kfcv
PLOT_STDEV: true
gs_vs_qgs:
PLOT_ESTIMATORS:
- mul_sld_gs
- bin_sld_gs
- mul_sld_gsq
- bin_sld_gsq
- atc_mc
- atc_ne
PLOT_STDEV: true
cc_vs_other:
PLOT_ESTIMATORS:
- mul_cc
- bin_cc
- mul_sld
- bin_sld
- mul_pacc
- bin_pacc
PLOT_STDEV: true
max_conf_vs_atc:
PLOT_ESTIMATORS:
- bin_sld
- binmc_sld
- mul_sld
- mulmc_sld
- atc_mc
PLOT_STDEV: true
max_conf_vs_entropy:
PLOT_ESTIMATORS:
- binmc_sld
- binne_sld
- mulmc_sld
- mulne_sld
- atc_mc
PLOT_STDEV: true
sld_vs_pacc:
PLOT_ESTIMATORS:
- bin_sld
- mul_sld
- bin_pacc
- mul_pacc
- atc_mc
PLOT_STDEV: true
plot_confs_other:
best_vs_atc:
PLOT_ESTIMATORS:
- mul_sld_bcts
- mul_sld_gs
- bin_sld_bcts
- bin_sld_gs
- atc_mc
- atc_ne
all_vs_atc:
PLOT_ESTIMATORS:
- bin_sld
- bin_sld_bcts
- bin_sld_gs
- mul_sld
- mul_sld_bcts
- mul_sld_gs
- atc_mc
- atc_ne
best_vs_all:
PLOT_ESTIMATORS:
- bin_sld_bcts
- bin_sld_gs
- mul_sld_bcts
- mul_sld_gs
- kfcv
- atc_mc
- atc_ne
- doc_feat
debug_conf: &debug_conf
global:
METRICS:
- acc
DATASET_N_PREVS: 5
DATASET_PREVS:
# - 0.2
- 0.5
# - 0.8
confs:
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
plot_confs:
debug:
PLOT_ESTIMATORS:
- mulmc_sld
- atc_mc
PLOT_STDEV: true
mc_conf: &mc_conf
global:
METRICS:
- acc
DATASET_N_PREVS: 9
DATASET_DIR_UPDATE: true
confs:
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
# - DATASET_NAME: imdb
plot_confs:
debug3:
PLOT_ESTIMATORS:
- binmc_sld
- mulmc_sld
- binne_sld
- mulne_sld
- bin_sld_gs
- mul_sld_gs
- atc_mc
PLOT_STDEV: true
test_conf: &test_conf
global:
METRICS:
- acc
- f1
DATASET_N_PREVS: 9
confs:
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
# - DATASET_NAME: imdb
plot_confs:
gs_vs_gsq:
PLOT_ESTIMATORS:
- bin_sld
- bin_sld_gs
- bin_sld_gsq
- mul_sld
- mul_sld_gs
- mul_sld_gsq
gs_vs_atc:
PLOT_ESTIMATORS:
- bin_sld
- bin_sld_gs
- mul_sld
- mul_sld_gs
- atc_mc
- atc_ne
sld_vs_pacc:
PLOT_ESTIMATORS:
- bin_sld
- bin_sld_gs
- mul_sld
- mul_sld_gs
- bin_pacc
- bin_pacc_gs
- mul_pacc
- mul_pacc_gs
- atc_mc
- atc_ne
pacc_vs_atc:
PLOT_ESTIMATORS:
- bin_pacc
- bin_pacc_gs
- mul_pacc
- mul_pacc_gs
- atc_mc
- atc_ne
main_conf: &main_conf
global:
METRICS:
- acc
- f1
DATASET_N_PREVS: 9
DATASET_DIR_UPDATE: true
confs:
- DATASET_NAME: rcv1
DATASET_TARGET: CCAT
- DATASET_NAME: imdb
confs_next:
- DATASET_NAME: rcv1
DATASET_TARGET: GCAT
- DATASET_NAME: rcv1
DATASET_TARGET: MCAT
plot_confs:
gs_vs_qgs:
PLOT_ESTIMATORS:
- mul_sld_gs
- bin_sld_gs
- mul_sld_gsq
- bin_sld_gsq
- atc_mc
- atc_ne
PLOT_STDEV: true
plot_confs_completed:
max_conf_vs_atc_pacc:
PLOT_ESTIMATORS:
- bin_pacc
- binmc_pacc
- mul_pacc
- mulmc_pacc
- atc_mc
PLOT_STDEV: true
max_conf_vs_entropy_pacc:
PLOT_ESTIMATORS:
- binmc_pacc
- binne_pacc
- mulmc_pacc
- mulne_pacc
- atc_mc
PLOT_STDEV: true
gs_vs_atc:
PLOT_ESTIMATORS:
- mul_sld_gs
- bin_sld_gs
- mul_pacc_gs
- bin_pacc_gs
- atc_mc
- atc_ne
PLOT_STDEV: true
gs_vs_all:
PLOT_ESTIMATORS:
- mul_sld_gs
- bin_sld_gs
- mul_pacc_gs
- bin_pacc_gs
- atc_mc
- doc_feat
- kfcv
PLOT_STDEV: true
gs_vs_qgs:
PLOT_ESTIMATORS:
- mul_sld_gs
- bin_sld_gs
- mul_sld_gsq
- bin_sld_gsq
- atc_mc
- atc_ne
PLOT_STDEV: true
cc_vs_other:
PLOT_ESTIMATORS:
- mul_cc
- bin_cc
- mul_sld
- bin_sld
- mul_pacc
- bin_pacc
PLOT_STDEV: true
max_conf_vs_atc:
PLOT_ESTIMATORS:
- bin_sld
- binmc_sld
- mul_sld
- mulmc_sld
- atc_mc
PLOT_STDEV: true
max_conf_vs_entropy:
PLOT_ESTIMATORS:
- binmc_sld
- binne_sld
- mulmc_sld
- mulne_sld
- atc_mc
PLOT_STDEV: true
sld_vs_pacc:
PLOT_ESTIMATORS:
- bin_sld
- mul_sld
- bin_pacc
- mul_pacc
- atc_mc
PLOT_STDEV: true
plot_confs_other:
best_vs_atc:
PLOT_ESTIMATORS:
- mul_sld_bcts
- mul_sld_gs
- bin_sld_bcts
- bin_sld_gs
- atc_mc
- atc_ne
all_vs_atc:
PLOT_ESTIMATORS:
- bin_sld
- bin_sld_bcts
- bin_sld_gs
- mul_sld
- mul_sld_bcts
- mul_sld_gs
- atc_mc
- atc_ne
best_vs_all:
PLOT_ESTIMATORS:
- bin_sld_bcts
- bin_sld_gs
- mul_sld_bcts
- mul_sld_gs
- kfcv
- atc_mc
- atc_ne
- doc_feat
exec: *main_conf
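# The `exec` alias selects which anchored configuration above is actually run:
# YAML aliases (*main_conf, *debug_conf, ...) expand at parse time to the
# mapping defined at the matching anchor. A minimal loading sketch (assumes
# PyYAML; the file name is illustrative):
#
#   import yaml
#   with open("conf.yaml") as f:
#       conf = yaml.safe_load(f)
#   print(conf["exec"]["global"]["METRICS"])  # ['acc', 'f1'] for main_conf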
@ -1,445 +1,445 @@
<div>target: default</div>
<div>train: [0.5 0.5]</div>
<div>validation: [0.5 0.5]</div>
<div>evaluate_binary: 277.300s</div>
<div>evaluate_multiclass: 139.986s</div>
<div>kfcv: 98.625s</div>
<div>atc_mc: 93.304s</div>
<div>atc_ne: 91.201s</div>
<div>doc_feat: 29.930s</div>
<div>rca_score: 1018.341s</div>
<div>rca_star_score: 1013.733s</div>
<div>tot: 1054.413s</div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th>prevalence</th>
<th>bin</th>
<th>mul</th>
<th>kfcv</th>
<th>atc_mc</th>
<th>atc_ne</th>
<th>doc_feat</th>
<th>rca</th>
<th>rca_star</th>
</tr>
</thead>
<tbody>
<tr>
<th>(0.0, 1.0)</th>
<td>0.0154</td>
<td>0.0177</td>
<td>0.0249</td>
<td>0.0291</td>
<td>0.0291</td>
<td>0.0248</td>
<td>0.2705</td>
<td>0.2413</td>
</tr>
<tr>
<th>(0.05, 0.95)</th>
<td>0.0309</td>
<td>0.0284</td>
<td>0.0252</td>
<td>0.0300</td>
<td>0.0300</td>
<td>0.0247</td>
<td>0.2796</td>
<td>0.2504</td>
</tr>
<tr>
<th>(0.1, 0.9)</th>
<td>0.0309</td>
<td>0.0302</td>
<td>0.0251</td>
<td>0.0279</td>
<td>0.0279</td>
<td>0.0250</td>
<td>0.2722</td>
<td>0.2430</td>
</tr>
<tr>
<th>(0.15, 0.85)</th>
<td>0.0310</td>
<td>0.0339</td>
<td>0.0245</td>
<td>0.0269</td>
<td>0.0269</td>
<td>0.0244</td>
<td>0.2684</td>
<td>0.2392</td>
</tr>
<tr>
<th>(0.2, 0.8)</th>
<td>0.0411</td>
<td>0.0407</td>
<td>0.0259</td>
<td>0.0292</td>
<td>0.0292</td>
<td>0.0257</td>
<td>0.2724</td>
<td>0.2432</td>
</tr>
<tr>
<th>(0.25, 0.75)</th>
<td>0.0381</td>
<td>0.0376</td>
<td>0.0262</td>
<td>0.0319</td>
<td>0.0319</td>
<td>0.0259</td>
<td>0.2701</td>
<td>0.2409</td>
</tr>
<tr>
<th>(0.3, 0.7)</th>
<td>0.0442</td>
<td>0.0452</td>
<td>0.0254</td>
<td>0.0273</td>
<td>0.0273</td>
<td>0.0256</td>
<td>0.2650</td>
<td>0.2358</td>
</tr>
<tr>
<th>(0.35, 0.65)</th>
<td>0.0480</td>
<td>0.0498</td>
<td>0.0236</td>
<td>0.0257</td>
<td>0.0257</td>
<td>0.0235</td>
<td>0.2640</td>
<td>0.2347</td>
</tr>
<tr>
<th>(0.4, 0.6)</th>
<td>0.0401</td>
<td>0.0431</td>
<td>0.0222</td>
<td>0.0296</td>
<td>0.0296</td>
<td>0.0220</td>
<td>0.2654</td>
<td>0.2361</td>
</tr>
<tr>
<th>(0.45, 0.55)</th>
<td>0.0551</td>
<td>0.0558</td>
<td>0.0243</td>
<td>0.0295</td>
<td>0.0295</td>
<td>0.0246</td>
<td>0.1838</td>
<td>0.1551</td>
</tr>
<tr>
<th>(0.5, 0.5)</th>
<td>0.0499</td>
<td>0.0513</td>
<td>0.0308</td>
<td>0.0319</td>
<td>0.0319</td>
<td>0.0309</td>
<td>0.1472</td>
<td>0.1202</td>
</tr>
<tr>
<th>(0.55, 0.45)</th>
<td>0.0538</td>
<td>0.0542</td>
<td>0.0278</td>
<td>0.0329</td>
<td>0.0329</td>
<td>0.0280</td>
<td>0.1717</td>
<td>0.1459</td>
</tr>
<tr>
<th>(0.6, 0.4)</th>
<td>0.0476</td>
<td>0.0484</td>
<td>0.0258</td>
<td>0.0298</td>
<td>0.0298</td>
<td>0.0259</td>
<td>0.2434</td>
<td>0.2147</td>
</tr>
<tr>
<th>(0.65, 0.35)</th>
<td>0.0447</td>
<td>0.0474</td>
<td>0.0287</td>
<td>0.0332</td>
<td>0.0332</td>
<td>0.0288</td>
<td>0.2632</td>
<td>0.2340</td>
</tr>
<tr>
<th>(0.7, 0.3)</th>
<td>0.0388</td>
<td>0.0397</td>
<td>0.0295</td>
<td>0.0328</td>
<td>0.0328</td>
<td>0.0296</td>
<td>0.2659</td>
<td>0.2367</td>
</tr>
<tr>
<th>(0.75, 0.25)</th>
<td>0.0336</td>
<td>0.0399</td>
<td>0.0241</td>
<td>0.0293</td>
<td>0.0293</td>
<td>0.0244</td>
<td>0.2612</td>
<td>0.2320</td>
</tr>
<tr>
<th>(0.8, 0.2)</th>
<td>0.0407</td>
<td>0.0447</td>
<td>0.0266</td>
<td>0.0303</td>
<td>0.0303</td>
<td>0.0271</td>
<td>0.2601</td>
<td>0.2309</td>
</tr>
<tr>
<th>(0.85, 0.15)</th>
<td>0.0383</td>
<td>0.0423</td>
<td>0.0219</td>
<td>0.0278</td>
<td>0.0278</td>
<td>0.0220</td>
<td>0.2670</td>
<td>0.2378</td>
</tr>
<tr>
<th>(0.9, 0.1)</th>
<td>0.0351</td>
<td>0.0387</td>
<td>0.0244</td>
<td>0.0275</td>
<td>0.0275</td>
<td>0.0245</td>
<td>0.2618</td>
<td>0.2326</td>
</tr>
<tr>
<th>(0.95, 0.05)</th>
<td>0.0238</td>
<td>0.0263</td>
<td>0.0269</td>
<td>0.0296</td>
<td>0.0296</td>
<td>0.0272</td>
<td>0.2602</td>
<td>0.2310</td>
</tr>
<tr>
<th>(1.0, 0.0)</th>
<td>0.0118</td>
<td>0.0202</td>
<td>0.0241</td>
<td>0.0279</td>
<td>0.0279</td>
<td>0.0244</td>
<td>0.2571</td>
<td>0.2279</td>
</tr>
</tbody>
</table>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th>prevalence</th>
<th>bin</th>
<th>mul</th>
<th>kfcv</th>
<th>atc_mc</th>
<th>atc_ne</th>
</tr>
</thead>
<tbody>
<tr>
<th>(0.0, 1.0)</th>
<td>0.0088</td>
<td>0.0100</td>
<td>0.0580</td>
<td>0.0183</td>
<td>0.0183</td>
</tr>
<tr>
<th>(0.05, 0.95)</th>
<td>0.0175</td>
<td>0.0159</td>
<td>0.0605</td>
<td>0.0193</td>
<td>0.0193</td>
</tr>
<tr>
<th>(0.1, 0.9)</th>
<td>0.0184</td>
<td>0.0176</td>
<td>0.0532</td>
<td>0.0189</td>
<td>0.0189</td>
</tr>
<tr>
<th>(0.15, 0.85)</th>
<td>0.0188</td>
<td>0.0204</td>
<td>0.0475</td>
<td>0.0180</td>
<td>0.0180</td>
</tr>
<tr>
<th>(0.2, 0.8)</th>
<td>0.0269</td>
<td>0.0266</td>
<td>0.0455</td>
<td>0.0206</td>
<td>0.0206</td>
</tr>
<tr>
<th>(0.25, 0.75)</th>
<td>0.0265</td>
<td>0.0261</td>
<td>0.0401</td>
<td>0.0242</td>
<td>0.0242</td>
</tr>
<tr>
<th>(0.3, 0.7)</th>
<td>0.0328</td>
<td>0.0336</td>
<td>0.0331</td>
<td>0.0208</td>
<td>0.0208</td>
</tr>
<tr>
<th>(0.35, 0.65)</th>
<td>0.0386</td>
<td>0.0394</td>
<td>0.0307</td>
<td>0.0211</td>
<td>0.0211</td>
</tr>
<tr>
<th>(0.4, 0.6)</th>
<td>0.0343</td>
<td>0.0371</td>
<td>0.0273</td>
<td>0.0265</td>
<td>0.0265</td>
</tr>
<tr>
<th>(0.45, 0.55)</th>
<td>0.0511</td>
<td>0.0512</td>
<td>0.0231</td>
<td>0.0275</td>
<td>0.0275</td>
</tr>
<tr>
<th>(0.5, 0.5)</th>
<td>0.0517</td>
<td>0.0529</td>
<td>0.0306</td>
<td>0.0319</td>
<td>0.0319</td>
</tr>
<tr>
<th>(0.55, 0.45)</th>
<td>0.0584</td>
<td>0.0583</td>
<td>0.0308</td>
<td>0.0354</td>
<td>0.0354</td>
</tr>
<tr>
<th>(0.6, 0.4)</th>
<td>0.0590</td>
<td>0.0599</td>
<td>0.0363</td>
<td>0.0357</td>
<td>0.0357</td>
</tr>
<tr>
<th>(0.65, 0.35)</th>
<td>0.0635</td>
<td>0.0662</td>
<td>0.0506</td>
<td>0.0440</td>
<td>0.0440</td>
</tr>
<tr>
<th>(0.7, 0.3)</th>
<td>0.0596</td>
<td>0.0638</td>
<td>0.0654</td>
<td>0.0457</td>
<td>0.0457</td>
</tr>
<tr>
<th>(0.75, 0.25)</th>
<td>0.0627</td>
<td>0.0744</td>
<td>0.0964</td>
<td>0.0461</td>
<td>0.0461</td>
</tr>
<tr>
<th>(0.8, 0.2)</th>
<td>0.0909</td>
<td>0.0999</td>
<td>0.1400</td>
<td>0.0629</td>
<td>0.0629</td>
</tr>
<tr>
<th>(0.85, 0.15)</th>
<td>0.1052</td>
<td>0.1126</td>
<td>0.1829</td>
<td>0.0727</td>
<td>0.0727</td>
</tr>
<tr>
<th>(0.9, 0.1)</th>
<td>0.1377</td>
<td>0.1481</td>
<td>0.2839</td>
<td>0.1215</td>
<td>0.1215</td>
</tr>
<tr>
<th>(0.95, 0.05)</th>
<td>0.1305</td>
<td>0.1450</td>
<td>0.4592</td>
<td>0.2037</td>
<td>0.2037</td>
</tr>
<tr>
<th>(1.0, 0.0)</th>
<td>0.1092</td>
<td>0.1387</td>
<td>0.8818</td>
<td>0.5267</td>
<td>0.5267</td>
</tr>
</tbody>
</table>
out_rcv1.md
(File diff suppressed because it is too large)
@ -1,445 +1,445 @@
<div>target: default</div>
<div>train: [0.60621118 0.39378882]</div>
<div>validation: [0.60559006 0.39440994]</div>
<div>evaluate_binary: 31.883s</div>
<div>evaluate_multiclass: 24.748s</div>
<div>kfcv: 23.957s</div>
<div>atc_mc: 36.062s</div>
<div>atc_ne: 37.123s</div>
<div>doc_feat: 7.063s</div>
<div>rca_score: 148.420s</div>
<div>rca_star_score: 145.690s</div>
<div>tot: 149.118s</div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th>prevalence</th>
<th>bin</th>
<th>mul</th>
<th>kfcv</th>
<th>atc_mc</th>
<th>atc_ne</th>
<th>doc_feat</th>
<th>rca</th>
<th>rca_star</th>
</tr>
</thead>
<tbody>
<tr>
<th>(0.0, 1.0)</th>
<td>0.0411</td>
<td>0.0907</td>
<td>0.0208</td>
<td>0.0267</td>
<td>0.0267</td>
<td>0.0204</td>
<td>0.1106</td>
<td>0.1059</td>
</tr>
<tr>
<th>(0.05, 0.95)</th>
<td>0.0392</td>
<td>0.0897</td>
<td>0.0216</td>
<td>0.0266</td>
<td>0.0266</td>
<td>0.0211</td>
<td>0.0523</td>
<td>0.0510</td>
</tr>
<tr>
<th>(0.1, 0.9)</th>
<td>0.0371</td>
<td>0.0891</td>
<td>0.0232</td>
<td>0.0267</td>
<td>0.0267</td>
<td>0.0227</td>
<td>0.0347</td>
<td>0.0354</td>
</tr>
<tr>
<th>(0.15, 0.85)</th>
<td>0.0464</td>
<td>0.0853</td>
<td>0.0226</td>
<td>0.0257</td>
<td>0.0257</td>
<td>0.0222</td>
<td>0.0315</td>
<td>0.0341</td>
</tr>
<tr>
<th>(0.2, 0.8)</th>
<td>0.0414</td>
<td>0.0757</td>
<td>0.0202</td>
<td>0.0249</td>
<td>0.0249</td>
<td>0.0200</td>
<td>0.0280</td>
<td>0.0302</td>
</tr>
<tr>
<th>(0.25, 0.75)</th>
<td>0.0468</td>
<td>0.0768</td>
<td>0.0204</td>
<td>0.0250</td>
<td>0.0250</td>
<td>0.0201</td>
<td>0.0335</td>
<td>0.0376</td>
</tr>
<tr>
<th>(0.3, 0.7)</th>
<td>0.0384</td>
<td>0.0739</td>
<td>0.0201</td>
<td>0.0252</td>
<td>0.0252</td>
<td>0.0200</td>
<td>0.0349</td>
<td>0.0410</td>
</tr>
<tr>
<th>(0.35, 0.65)</th>
<td>0.0386</td>
<td>0.0715</td>
<td>0.0198</td>
<td>0.0239</td>
<td>0.0239</td>
<td>0.0196</td>
<td>0.0376</td>
<td>0.0448</td>
</tr>
<tr>
<th>(0.4, 0.6)</th>
<td>0.0392</td>
<td>0.0657</td>
<td>0.0199</td>
<td>0.0249</td>
<td>0.0249</td>
<td>0.0197</td>
<td>0.0315</td>
<td>0.0391</td>
</tr>
<tr>
<th>(0.45, 0.55)</th>
<td>0.0380</td>
<td>0.0679</td>
<td>0.0213</td>
<td>0.0258</td>
<td>0.0258</td>
<td>0.0212</td>
<td>0.0358</td>
<td>0.0450</td>
</tr>
<tr>
<th>(0.5, 0.5)</th>
<td>0.0400</td>
<td>0.0670</td>
<td>0.0218</td>
<td>0.0228</td>
<td>0.0228</td>
<td>0.0217</td>
<td>0.0441</td>
<td>0.0550</td>
</tr>
<tr>
<th>(0.55, 0.45)</th>
<td>0.0403</td>
<td>0.0686</td>
<td>0.0203</td>
<td>0.0237</td>
<td>0.0237</td>
<td>0.0200</td>
<td>0.0398</td>
<td>0.0507</td>
</tr>
<tr>
<th>(0.6, 0.4)</th>
<td>0.0432</td>
<td>0.0625</td>
<td>0.0201</td>
<td>0.0245</td>
<td>0.0245</td>
<td>0.0200</td>
<td>0.0370</td>
<td>0.0487</td>
</tr>
<tr>
<th>(0.65, 0.35)</th>
<td>0.0384</td>
<td>0.0620</td>
<td>0.0195</td>
<td>0.0236</td>
<td>0.0236</td>
<td>0.0195</td>
<td>0.0356</td>
<td>0.0460</td>
</tr>
<tr>
<th>(0.7, 0.3)</th>
<td>0.0304</td>
<td>0.0570</td>
<td>0.0236</td>
<td>0.0227</td>
<td>0.0227</td>
<td>0.0236</td>
<td>0.0302</td>
<td>0.0396</td>
</tr>
<tr>
<th>(0.75, 0.25)</th>
<td>0.0321</td>
<td>0.0614</td>
<td>0.0187</td>
<td>0.0273</td>
<td>0.0273</td>
<td>0.0187</td>
<td>0.0332</td>
<td>0.0439</td>
</tr>
<tr>
<th>(0.8, 0.2)</th>
<td>0.0300</td>
<td>0.0555</td>
<td>0.0221</td>
<td>0.0230</td>
<td>0.0230</td>
<td>0.0222</td>
<td>0.0287</td>
<td>0.0340</td>
</tr>
<tr>
<th>(0.85, 0.15)</th>
<td>0.0325</td>
<td>0.0540</td>
<td>0.0224</td>
<td>0.0229</td>
<td>0.0229</td>
<td>0.0225</td>
<td>0.0342</td>
<td>0.0360</td>
</tr>
<tr>
<th>(0.9, 0.1)</th>
<td>0.0262</td>
<td>0.0518</td>
<td>0.0211</td>
<td>0.0238</td>
<td>0.0238</td>
<td>0.0211</td>
<td>0.0483</td>
<td>0.0469</td>
</tr>
<tr>
<th>(0.95, 0.05)</th>
<td>0.0243</td>
<td>0.0576</td>
<td>0.0197</td>
<td>0.0240</td>
<td>0.0240</td>
<td>0.0196</td>
<td>0.0806</td>
<td>0.0746</td>
</tr>
<tr>
<th>(1.0, 0.0)</th>
<td>0.0146</td>
<td>0.0597</td>
<td>0.0231</td>
<td>0.0244</td>
<td>0.0244</td>
<td>0.0232</td>
<td>0.1600</td>
<td>0.1515</td>
</tr>
</tbody>
</table>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th>prevalence</th>
<th>bin</th>
<th>mul</th>
<th>kfcv</th>
<th>atc_mc</th>
<th>atc_ne</th>
</tr>
</thead>
<tbody>
<tr>
<th>(0.0, 1.0)</th>
<td>0.0239</td>
<td>0.0477</td>
<td>0.0345</td>
<td>0.0162</td>
<td>0.0162</td>
</tr>
<tr>
<th>(0.05, 0.95)</th>
<td>0.0235</td>
<td>0.0496</td>
<td>0.0320</td>
<td>0.0169</td>
<td>0.0169</td>
</tr>
<tr>
<th>(0.1, 0.9)</th>
<td>0.0230</td>
<td>0.0520</td>
<td>0.0289</td>
<td>0.0171</td>
<td>0.0171</td>
</tr>
<tr>
<th>(0.15, 0.85)</th>
<td>0.0308</td>
<td>0.0528</td>
<td>0.0274</td>
<td>0.0171</td>
<td>0.0171</td>
</tr>
<tr>
<th>(0.2, 0.8)</th>
<td>0.0286</td>
<td>0.0490</td>
<td>0.0291</td>
<td>0.0186</td>
<td>0.0186</td>
</tr>
<tr>
<th>(0.25, 0.75)</th>
<td>0.0346</td>
<td>0.0534</td>
<td>0.0255</td>
<td>0.0186</td>
<td>0.0186</td>
</tr>
<tr>
<th>(0.3, 0.7)</th>
<td>0.0299</td>
<td>0.0545</td>
<td>0.0232</td>
<td>0.0205</td>
<td>0.0205</td>
</tr>
<tr>
<th>(0.35, 0.65)</th>
<td>0.0335</td>
<td>0.0566</td>
<td>0.0217</td>
<td>0.0211</td>
<td>0.0211</td>
</tr>
<tr>
<th>(0.4, 0.6)</th>
<td>0.0360</td>
<td>0.0562</td>
<td>0.0217</td>
<td>0.0226</td>
<td>0.0226</td>
</tr>
<tr>
<th>(0.45, 0.55)</th>
<td>0.0372</td>
<td>0.0626</td>
<td>0.0213</td>
<td>0.0246</td>
<td>0.0246</td>
</tr>
<tr>
<th>(0.5, 0.5)</th>
<td>0.0437</td>
<td>0.0677</td>
<td>0.0223</td>
<td>0.0241</td>
<td>0.0241</td>
</tr>
<tr>
<th>(0.55, 0.45)</th>
<td>0.0486</td>
<td>0.0762</td>
<td>0.0241</td>
<td>0.0269</td>
<td>0.0269</td>
</tr>
<tr>
<th>(0.6, 0.4)</th>
<td>0.0572</td>
<td>0.0779</td>
<td>0.0290</td>
<td>0.0312</td>
<td>0.0312</td>
</tr>
<tr>
<th>(0.65, 0.35)</th>
<td>0.0580</td>
<td>0.0866</td>
<td>0.0340</td>
<td>0.0341</td>
<td>0.0341</td>
</tr>
<tr>
<th>(0.7, 0.3)</th>
<td>0.0546</td>
<td>0.0919</td>
<td>0.0420</td>
<td>0.0374</td>
<td>0.0374</td>
</tr>
<tr>
<th>(0.75, 0.25)</th>
<td>0.0636</td>
<td>0.1161</td>
<td>0.0689</td>
<td>0.0533</td>
<td>0.0533</td>
</tr>
<tr>
<th>(0.8, 0.2)</th>
<td>0.0750</td>
<td>0.1192</td>
<td>0.0768</td>
<td>0.0560</td>
<td>0.0560</td>
</tr>
<tr>
<th>(0.85, 0.15)</th>
<td>0.1031</td>
<td>0.1580</td>
<td>0.1244</td>
<td>0.0728</td>
<td>0.0728</td>
</tr>
<tr>
<th>(0.9, 0.1)</th>
<td>0.1175</td>
<td>0.2412</td>
<td>0.1885</td>
<td>0.1100</td>
<td>0.1100</td>
</tr>
<tr>
<th>(0.95, 0.05)</th>
<td>0.1877</td>
<td>0.3434</td>
<td>0.3579</td>
<td>0.2053</td>
<td>0.2053</td>
</tr>
<tr>
<th>(1.0, 0.0)</th>
<td>0.2717</td>
<td>0.3136</td>
<td>0.9178</td>
<td>0.6264</td>
<td>0.6264</td>
</tr>
</tbody>
</table>
poetry.lock (generated)
(File diff suppressed because it is too large)
@ -1,40 +1,40 @@
[tool.poetry]
name = "quacc"
version = "0.1.0"
description = ""
authors = ["Lorenzo Volpi <lorenzo.volpi@outlook.com>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
quapy = "^0.1.7"
pandas = "^2.0.3"
jinja2 = "^3.1.2"
pyyaml = "^6.0.1"
logging = "^0.4.9.6"
[tool.poetry.scripts]
main = "quacc.main:main"
comp = "quacc.main:estimate_comparison"
tohost = "scp_sync:scp_sync_to_host"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
pylance = "^0.5.9"
pytest-mock = "^3.11.1"
pytest-cov = "^4.1.0"
win11toast = "^0.32"
tabulate = "^0.9.0"
paramiko = "^3.3.1"
[tool.pytest.ini_options]
addopts = "--cov=quacc --capture=tee-sys"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[virtualenvs]
in-project = true
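# Note: the [tool.poetry.scripts] entries above become console commands after
# `poetry install`, e.g. `poetry run main` or `poetry run comp`.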
quacc.log
(File diff suppressed because it is too large)
@ -1,150 +1,150 @@
import math
from typing import List, Optional
import numpy as np
import scipy.sparse as sp
from quapy.data import LabelledCollection
# Extended classes
#
# 0 ~ True 0
# 1 ~ False 1
# 2 ~ False 0
# 3 ~ True 1
# _____________________
# | | |
# | True 0 | False 1 |
# |__________|__________|
# | | |
# | False 0 | True 1 |
# |__________|__________|
#
class ExClassManager:
@staticmethod
def get_ex(n_classes: int, true_class: int, pred_class: int) -> int:
return true_class * n_classes + pred_class
@staticmethod
def get_pred(n_classes: int, ex_class: int) -> int:
return ex_class % n_classes
@staticmethod
def get_true(n_classes: int, ex_class: int) -> int:
return ex_class // n_classes
class ExtendedCollection(LabelledCollection):
def __init__(
self,
instances: np.ndarray | sp.csr_matrix,
labels: np.ndarray,
classes: Optional[List] = None,
):
super().__init__(instances, labels, classes=classes)
    def split_by_pred(self):
        # the extended collection has n_classes**2 labels, so the number of
        # base classes is the square root
        _ncl = int(math.sqrt(self.n_classes))
        _indexes = ExtendedCollection._split_index_by_pred(_ncl, self.instances)
if isinstance(self.instances, np.ndarray):
_instances = [
self.instances[ind] if ind.shape[0] > 0 else np.asarray([], dtype=int)
for ind in _indexes
]
elif isinstance(self.instances, sp.csr_matrix):
_instances = [
self.instances[ind]
if ind.shape[0] > 0
else sp.csr_matrix(np.empty((0, 0), dtype=int))
for ind in _indexes
]
_labels = [
np.asarray(
[
ExClassManager.get_true(_ncl, lbl)
for lbl in (self.labels[ind] if len(ind) > 0 else [])
],
dtype=int,
)
for ind in _indexes
]
return [
ExtendedCollection(inst, lbl, classes=range(0, _ncl))
for (inst, lbl) in zip(_instances, _labels)
]
@classmethod
def split_inst_by_pred(
cls, n_classes: int, instances: np.ndarray | sp.csr_matrix
) -> (List[np.ndarray | sp.csr_matrix], List[float]):
_indexes = cls._split_index_by_pred(n_classes, instances)
if isinstance(instances, np.ndarray):
_instances = [
instances[ind] if ind.shape[0] > 0 else np.asarray([], dtype=int)
for ind in _indexes
]
elif isinstance(instances, sp.csr_matrix):
_instances = [
instances[ind]
if ind.shape[0] > 0
else sp.csr_matrix(np.empty((0, 0), dtype=int))
for ind in _indexes
]
norms = [inst.shape[0] / instances.shape[0] for inst in _instances]
return _instances, norms
@classmethod
def _split_index_by_pred(
cls, n_classes: int, instances: np.ndarray | sp.csr_matrix
) -> List[np.ndarray]:
if isinstance(instances, np.ndarray):
_pred_label = [np.argmax(inst[-n_classes:], axis=0) for inst in instances]
elif isinstance(instances, sp.csr_matrix):
_pred_label = [
np.argmax(inst[:, -n_classes:].toarray().flatten(), axis=0)
for inst in instances
]
else:
raise ValueError("Unsupported matrix format")
return [
np.asarray([j for (j, x) in enumerate(_pred_label) if x == i], dtype=int)
for i in range(0, n_classes)
]
@classmethod
def extend_instances(
cls, instances: np.ndarray | sp.csr_matrix, pred_proba: np.ndarray
) -> np.ndarray | sp.csr_matrix:
if isinstance(instances, sp.csr_matrix):
_pred_proba = sp.csr_matrix(pred_proba)
n_x = sp.hstack([instances, _pred_proba])
elif isinstance(instances, np.ndarray):
n_x = np.concatenate((instances, pred_proba), axis=1)
else:
raise ValueError("Unsupported matrix format")
return n_x
@classmethod
def extend_collection(
cls,
base: LabelledCollection,
pred_proba: np.ndarray,
):
n_classes = base.n_classes
# n_X = [ X | predicted probs. ]
n_x = cls.extend_instances(base.X, pred_proba)
        # n_y = (expected y, predicted y)
pred_proba = pred_proba[:, -n_classes:]
preds = np.argmax(pred_proba, axis=-1)
n_y = np.asarray(
[
ExClassManager.get_ex(n_classes, true_class, pred_class)
for (true_class, pred_class) in zip(base.y, preds)
]
)
return ExtendedCollection(n_x, n_y, classes=[*range(0, n_classes * n_classes)])
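# A worked example of the extended-class encoding (matches the 2x2 diagram at
# the top of the file); the __main__ guard is illustrative, not original code.
if __name__ == "__main__":
    assert ExClassManager.get_ex(2, true_class=0, pred_class=0) == 0  # True 0
    assert ExClassManager.get_ex(2, true_class=0, pred_class=1) == 1  # False 1
    assert ExClassManager.get_ex(2, true_class=1, pred_class=0) == 2  # False 0
    assert ExClassManager.get_ex(2, true_class=1, pred_class=1) == 3  # True 1
    assert ExClassManager.get_true(2, ex_class=2) == 1
    assert ExClassManager.get_pred(2, ex_class=2) == 0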
@ -1,171 +1,171 @@
import math
from typing import List
import numpy as np
import quapy as qp
from quapy.data.base import LabelledCollection
from sklearn.conftest import fetch_rcv1
TRAIN_VAL_PROP = 0.5
class DatasetSample:
def __init__(
self,
train: LabelledCollection,
validation: LabelledCollection,
test: LabelledCollection,
):
self.train = train
self.validation = validation
self.test = test
@property
def train_prev(self):
return self.train.prevalence()
@property
def validation_prev(self):
return self.validation.prevalence()
@property
def prevs(self):
return {"train": self.train_prev, "validation": self.validation_prev}
class Dataset:
def __init__(self, name, n_prevalences=9, prevs=None, target=None):
self._name = name
self._target = target
self.prevs = None
self.n_prevs = n_prevalences
if prevs is not None:
prevs = np.unique([p for p in prevs if p > 0.0 and p < 1.0])
if prevs.shape[0] > 0:
self.prevs = np.sort(prevs)
self.n_prevs = self.prevs.shape[0]
def __spambase(self):
return qp.datasets.fetch_UCIDataset("spambase", verbose=False).train_test
    # try min_df=5
def __imdb(self):
return qp.datasets.fetch_reviews("imdb", tfidf=True, min_df=3).train_test
def __rcv1(self):
n_train = 23149
available_targets = ["CCAT", "GCAT", "MCAT"]
if self._target is None or self._target not in available_targets:
raise ValueError(f"Invalid target {self._target}")
dataset = fetch_rcv1()
target_index = np.where(dataset.target_names == self._target)[0]
all_train_d = dataset.data[:n_train, :]
test_d = dataset.data[n_train:, :]
labels = dataset.target[:, target_index].toarray().flatten()
all_train_l, test_l = labels[:n_train], labels[n_train:]
all_train = LabelledCollection(all_train_d, all_train_l, classes=[0, 1])
test = LabelledCollection(test_d, test_l, classes=[0, 1])
return all_train, test
def get_raw(self) -> DatasetSample:
all_train, test = {
"spambase": self.__spambase,
"imdb": self.__imdb,
"rcv1": self.__rcv1,
}[self._name]()
train, val = all_train.split_stratified(
train_prop=TRAIN_VAL_PROP, random_state=0
)
return DatasetSample(train, val, test)
def get(self) -> List[DatasetSample]:
(all_train, test) = {
"spambase": self.__spambase,
"imdb": self.__imdb,
"rcv1": self.__rcv1,
}[self._name]()
# resample all_train set to have (0.5, 0.5) prevalence
at_positives = np.sum(all_train.y)
all_train = all_train.sampling(
min(at_positives, len(all_train) - at_positives) * 2, 0.5, random_state=0
)
# sample prevalences
if self.prevs is not None:
prevs = self.prevs
else:
prevs = np.linspace(0.0, 1.0, num=self.n_prevs + 1, endpoint=False)[1:]
at_size = min(math.floor(len(all_train) * 0.5 / p) for p in prevs)
datasets = []
for p in 1.0 - prevs:
all_train_sampled = all_train.sampling(at_size, p, random_state=0)
train, validation = all_train_sampled.split_stratified(
train_prop=TRAIN_VAL_PROP, random_state=0
)
datasets.append(DatasetSample(train, validation, test))
return datasets
def __call__(self):
return self.get()
@property
def name(self):
return (
f"{self._name}_{self._target}_{self.n_prevs}prevs"
if self._name == "rcv1"
else f"{self._name}_{self.n_prevs}prevs"
)
# >>> fetch_rcv1().target_names
# array(['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16',
# 'C17', 'C171', 'C172', 'C173', 'C174', 'C18', 'C181', 'C182',
# 'C183', 'C21', 'C22', 'C23', 'C24', 'C31', 'C311', 'C312', 'C313',
# 'C32', 'C33', 'C331', 'C34', 'C41', 'C411', 'C42', 'CCAT', 'E11',
# 'E12', 'E121', 'E13', 'E131', 'E132', 'E14', 'E141', 'E142',
# 'E143', 'E21', 'E211', 'E212', 'E31', 'E311', 'E312', 'E313',
# 'E41', 'E411', 'E51', 'E511', 'E512', 'E513', 'E61', 'E71', 'ECAT',
# 'G15', 'G151', 'G152', 'G153', 'G154', 'G155', 'G156', 'G157',
# 'G158', 'G159', 'GCAT', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT',
# 'GENV', 'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL',
# 'GPRO', 'GREL', 'GSCI', 'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA',
# 'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
# 'M142', 'M143', 'MCAT'], dtype=object)
def rcv1_info():
dataset = fetch_rcv1()
n_train = 23149
targets = []
for target in range(103):
train_t_prev = np.average(dataset.target[:n_train, target].toarray().flatten())
test_t_prev = np.average(dataset.target[n_train:, target].toarray().flatten())
targets.append(
(
dataset.target_names[target],
{
"train": (1.0 - train_t_prev, train_t_prev),
"test": (1.0 - test_t_prev, test_t_prev),
},
)
)
targets.sort(key=lambda t: t[1]["train"][1])
for n, d in targets:
print(f"{n}:")
for k, (fp, tp) in d.items():
print(f"\t{k}: {fp:.4f}, {tp:.4f}")
if __name__ == "__main__":
rcv1_info()
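A brief usage sketch of the Dataset class above (the dataset name and prevalence values are illustrative):

# Hypothetical usage: three training prevalences for spambase; each
# DatasetSample bundles stratified train/validation splits plus the test set.
dataset = Dataset("spambase", prevs=[0.2, 0.5, 0.8])
for sample in dataset():
    print(sample.train_prev, sample.validation_prev, len(sample.test))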

View File

@ -1,118 +1,118 @@
import collections as C
import copy
from typing import Any
import yaml
class environ:
_instance = None
_default_env = {
"DATASET_NAME": None,
"DATASET_TARGET": None,
"METRICS": [],
"COMP_ESTIMATORS": [],
"DATASET_N_PREVS": 9,
"DATASET_PREVS": None,
"OUT_DIR_NAME": "output",
"OUT_DIR": None,
"PLOT_DIR_NAME": "plot",
"PLOT_OUT_DIR": None,
"DATASET_DIR_UPDATE": False,
"PROTOCOL_N_PREVS": 21,
"PROTOCOL_REPEATS": 100,
"SAMPLE_SIZE": 1000,
"PLOT_ESTIMATORS": [],
"PLOT_STDEV": False,
}
_keys = list(_default_env.keys())
def __init__(self):
self.exec = []
self.confs = []
self.load_conf()
self._stack = C.deque([self.__getdict()])
def __setdict(self, d):
for k, v in d.items():
super().__setattr__(k, v)
def __getdict(self):
return {k: self.__getattribute__(k) for k in environ._keys}
def __setattr__(self, __name: str, __value: Any) -> None:
if __name in environ._keys:
self._stack[-1][__name] = __value
super().__setattr__(__name, __value)
def load_conf(self):
self.__setdict(environ._default_env)
with open("conf.yaml", "r") as f:
confs = yaml.safe_load(f)["exec"]
_global = confs["global"]
_estimators = set()
for pc in confs["plot_confs"].values():
_estimators = _estimators.union(set(pc["PLOT_ESTIMATORS"]))
_global["COMP_ESTIMATORS"] = list(_estimators)
self.__setdict(_global)
self.confs = confs["confs"]
self.plot_confs = confs["plot_confs"]
def get_confs(self):
self._stack.append(None)
for _conf in self.confs:
self._stack.pop()
self.__setdict(self._stack[-1])
self.__setdict(_conf)
self._stack.append(self.__getdict())
yield copy.deepcopy(self._stack[-1])
self._stack.pop()
def get_plot_confs(self):
self._stack.append(None)
for k, pc in self.plot_confs.items():
self._stack.pop()
self.__setdict(self._stack[-1])
self.__setdict(pc)
self._stack.append(self.__getdict())
name = self.DATASET_NAME
if self.DATASET_TARGET is not None:
name += f"_{self.DATASET_TARGET}"
name += f"_{k}"
yield name
self._stack.pop()
@property
def current(self):
return copy.deepcopy(self.__getdict())
env = environ()
if __name__ == "__main__":
stack = C.deque()
stack.append(-1)
def __gen(stack: C.deque):
stack.append(None)
for i in range(5):
stack.pop()
stack.append(i)
yield stack[-1]
stack.pop()
print(stack)
for i in __gen(stack):
print(stack, i)
print(stack)
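For illustration, a hedged sketch of how the configuration stack is consumed (the keys shown come from _default_env; the conf.yaml contents are assumed):

# Each yielded dict is a deep copy of the environment snapshot for one run,
# so mutating it does not leak into later configurations.
for conf in env.get_confs():
    print(conf["DATASET_NAME"], conf["DATASET_N_PREVS"])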

View File

@ -1,55 +1,55 @@
import numpy as np
def from_name(err_name):
assert err_name in ERROR_NAMES, f"unknown error {err_name}"
callable_error = globals()[err_name]
return callable_error
# def f1(prev):
# # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
# if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
# return 1.0
# elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
# return 0.0
# elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
# return float('NaN')
# else:
# recall = prev[0] / (prev[0] + prev[1])
# precision = prev[0] / (prev[0] + prev[2])
# return 2 * (precision * recall) / (precision + recall)
def f1(prev):
den = (2 * prev[3]) + prev[1] + prev[2]
if den == 0:
return 0.0
else:
return (2 * prev[3]) / den
def f1e(prev):
return 1 - f1(prev)
def acc(prev: np.ndarray) -> float:
return (prev[0] + prev[3]) / np.sum(prev)
def accd(true_prevs: np.ndarray, estim_prevs: np.ndarray) -> np.ndarray:
vacc = np.vectorize(acc, signature="(m)->()")
a_tp = vacc(true_prevs)
a_ep = vacc(estim_prevs)
return np.abs(a_tp - a_ep)
def maccd(true_prevs: np.ndarray, estim_prevs: np.ndarray) -> float:
return accd(true_prevs, estim_prevs).mean()
ACCURACY_ERROR = {maccd}
ACCURACY_ERROR_SINGLE = {accd}
ACCURACY_ERROR_NAMES = {func.__name__ for func in ACCURACY_ERROR}
ACCURACY_ERROR_SINGLE_NAMES = {func.__name__ for func in ACCURACY_ERROR_SINGLE}
ERROR_NAMES = ACCURACY_ERROR_NAMES | ACCURACY_ERROR_SINGLE_NAMES
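A worked example of the two metrics, assuming the extended-class order (TN, FP, FN, TP) implied by the definitions of acc and f1 above:

import numpy as np

# 50% TN, 10% FP, 5% FN, 35% TP:
prev = np.array([0.50, 0.10, 0.05, 0.35])
print(acc(prev))  # (0.50 + 0.35) / 1.0 = 0.85
print(f1(prev))   # 2*0.35 / (2*0.35 + 0.10 + 0.05) = 0.8235...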

View File

@ -1,34 +1,34 @@
from typing import Callable
import numpy as np
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
import quacc as qc
from ..method.base import BaseAccuracyEstimator
def evaluate(
estimator: BaseAccuracyEstimator,
protocol: AbstractProtocol,
    error_metric: Callable | str,
) -> float:
if isinstance(error_metric, str):
error_metric = qc.error.from_name(error_metric)
collator_bck_ = protocol.collator
protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
estim_prevs, true_prevs = [], []
for sample in protocol():
e_sample = estimator.extend(sample)
estim_prev = estimator.estimate(e_sample.X, ext=True)
estim_prevs.append(estim_prev)
true_prevs.append(e_sample.prevalence())
protocol.collator = collator_bck_
true_prevs = np.array(true_prevs)
estim_prevs = np.array(estim_prevs)
return error_metric(true_prevs, estim_prevs)
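A short usage sketch (hypothetical variable names; estimators and protocols are built as in the modules below):

# Assuming `est` is a fitted BaseAccuracyEstimator and `prot` yields
# labelled test samples:
#     err = evaluate(est, prot, error_metric="maccd")
# Any name listed in qc.error.ERROR_NAMES is accepted for error_metric.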

View File

@ -1,299 +1,299 @@
from functools import wraps
from statistics import mean
import numpy as np
import sklearn.metrics as metrics
from quapy.data import LabelledCollection
from quapy.protocol import AbstractStochasticSeededProtocol
from scipy.sparse import issparse
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_validate
import baselines.atc as atc
import baselines.doc as doc
import baselines.impweight as iw
import baselines.rca as rcalib
from .report import EvaluationReport
_baselines = {}
def baseline(func):
@wraps(func)
def wrapper(c_model, validation, protocol):
return func(c_model, validation, protocol)
_baselines[func.__name__] = wrapper
return wrapper
@baseline
def kfcv(
c_model: BaseEstimator,
validation: LabelledCollection,
protocol: AbstractStochasticSeededProtocol,
predict_method="predict",
):
c_model_predict = getattr(c_model, predict_method)
scoring = ["accuracy", "f1_macro"]
scores = cross_validate(c_model, validation.X, validation.y, scoring=scoring)
acc_score = mean(scores["test_accuracy"])
f1_score = mean(scores["test_f1_macro"])
report = EvaluationReport(name="kfcv")
for test in protocol():
test_preds = c_model_predict(test.X)
meta_acc = abs(acc_score - metrics.accuracy_score(test.y, test_preds))
meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
report.append_row(
test.prevalence(),
acc_score=acc_score,
f1_score=f1_score,
acc=meta_acc,
f1=meta_f1,
)
return report
@baseline
def ref(
c_model: BaseEstimator,
validation: LabelledCollection,
protocol: AbstractStochasticSeededProtocol,
):
c_model_predict = getattr(c_model, "predict")
report = EvaluationReport(name="ref")
for test in protocol():
test_preds = c_model_predict(test.X)
report.append_row(
test.prevalence(),
acc_score=metrics.accuracy_score(test.y, test_preds),
f1_score=metrics.f1_score(test.y, test_preds),
)
return report
@baseline
def atc_mc(
c_model: BaseEstimator,
validation: LabelledCollection,
protocol: AbstractStochasticSeededProtocol,
predict_method="predict_proba",
):
"""garg"""
c_model_predict = getattr(c_model, predict_method)
## Load ID validation data probs and labels
val_probs, val_labels = c_model_predict(validation.X), validation.y
## score function, e.g., negative entropy or argmax confidence
val_scores = atc.get_max_conf(val_probs)
val_preds = np.argmax(val_probs, axis=-1)
_, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)
report = EvaluationReport(name="atc_mc")
for test in protocol():
## Load OOD test data probs
test_probs = c_model_predict(test.X)
test_preds = np.argmax(test_probs, axis=-1)
test_scores = atc.get_max_conf(test_probs)
atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
meta_acc = abs(atc_accuracy - metrics.accuracy_score(test.y, test_preds))
f1_score = atc.get_ATC_f1(atc_thres, test_scores, test_probs)
meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
report.append_row(
test.prevalence(),
acc=meta_acc,
acc_score=atc_accuracy,
f1_score=f1_score,
f1=meta_f1,
)
return report
@baseline
def atc_ne(
c_model: BaseEstimator,
validation: LabelledCollection,
protocol: AbstractStochasticSeededProtocol,
predict_method="predict_proba",
):
"""garg"""
c_model_predict = getattr(c_model, predict_method)
## Load ID validation data probs and labels
val_probs, val_labels = c_model_predict(validation.X), validation.y
## score function, e.g., negative entropy or argmax confidence
val_scores = atc.get_entropy(val_probs)
val_preds = np.argmax(val_probs, axis=-1)
_, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)
report = EvaluationReport(name="atc_ne")
for test in protocol():
## Load OOD test data probs
test_probs = c_model_predict(test.X)
test_preds = np.argmax(test_probs, axis=-1)
test_scores = atc.get_entropy(test_probs)
atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
meta_acc = abs(atc_accuracy - metrics.accuracy_score(test.y, test_preds))
f1_score = atc.get_ATC_f1(atc_thres, test_scores, test_probs)
meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
report.append_row(
test.prevalence(),
acc=meta_acc,
acc_score=atc_accuracy,
f1_score=f1_score,
f1=meta_f1,
)
return report
@baseline
def doc_feat(
c_model: BaseEstimator,
validation: LabelledCollection,
protocol: AbstractStochasticSeededProtocol,
predict_method="predict_proba",
):
c_model_predict = getattr(c_model, predict_method)
val_probs, val_labels = c_model_predict(validation.X), validation.y
val_scores = np.max(val_probs, axis=-1)
val_preds = np.argmax(val_probs, axis=-1)
v1acc = np.mean(val_preds == val_labels) * 100
report = EvaluationReport(name="doc_feat")
for test in protocol():
test_probs = c_model_predict(test.X)
test_preds = np.argmax(test_probs, axis=-1)
test_scores = np.max(test_probs, axis=-1)
score = (v1acc + doc.get_doc(val_scores, test_scores)) / 100.0
meta_acc = abs(score - metrics.accuracy_score(test.y, test_preds))
report.append_row(test.prevalence(), acc=meta_acc, acc_score=score)
return report
@baseline
def rca(
c_model: BaseEstimator,
validation: LabelledCollection,
protocol: AbstractStochasticSeededProtocol,
predict_method="predict",
):
"""elsahar19"""
c_model_predict = getattr(c_model, predict_method)
val_pred1 = c_model_predict(validation.X)
report = EvaluationReport(name="rca")
for test in protocol():
try:
test_pred = c_model_predict(test.X)
c_model2 = rcalib.clone_fit(c_model, test.X, test_pred)
c_model2_predict = getattr(c_model2, predict_method)
val_pred2 = c_model2_predict(validation.X)
rca_score = 1.0 - rcalib.get_score(val_pred1, val_pred2, validation.y)
meta_score = abs(rca_score - metrics.accuracy_score(test.y, test_pred))
report.append_row(test.prevalence(), acc=meta_score, acc_score=rca_score)
except ValueError:
report.append_row(
test.prevalence(), acc=float("nan"), acc_score=float("nan")
)
return report
@baseline
def rca_star(
c_model: BaseEstimator,
validation: LabelledCollection,
protocol: AbstractStochasticSeededProtocol,
predict_method="predict",
):
"""elsahar19"""
c_model_predict = getattr(c_model, predict_method)
validation1, validation2 = validation.split_stratified(
train_prop=0.5, random_state=0
)
val1_pred = c_model_predict(validation1.X)
c_model1 = rcalib.clone_fit(c_model, validation1.X, val1_pred)
c_model1_predict = getattr(c_model1, predict_method)
val2_pred1 = c_model1_predict(validation2.X)
report = EvaluationReport(name="rca_star")
for test in protocol():
try:
test_pred = c_model_predict(test.X)
c_model2 = rcalib.clone_fit(c_model, test.X, test_pred)
c_model2_predict = getattr(c_model2, predict_method)
val2_pred2 = c_model2_predict(validation2.X)
rca_star_score = 1.0 - rcalib.get_score(
val2_pred1, val2_pred2, validation2.y
)
meta_score = abs(rca_star_score - metrics.accuracy_score(test.y, test_pred))
report.append_row(
test.prevalence(), acc=meta_score, acc_score=rca_star_score
)
except ValueError:
report.append_row(
test.prevalence(), acc=float("nan"), acc_score=float("nan")
)
return report
@baseline
def logreg(
c_model: BaseEstimator,
validation: LabelledCollection,
protocol: AbstractStochasticSeededProtocol,
predict_method="predict",
):
c_model_predict = getattr(c_model, predict_method)
val_preds = c_model_predict(validation.X)
report = EvaluationReport(name="logreg")
for test in protocol():
wx = iw.logreg(validation.X, validation.y, test.X)
test_preds = c_model_predict(test.X)
estim_acc = iw.get_acc(val_preds, validation.y, wx)
true_acc = metrics.accuracy_score(test.y, test_preds)
meta_score = abs(estim_acc - true_acc)
report.append_row(test.prevalence(), acc=meta_score, acc_score=estim_acc)
return report
@baseline
def kdex2(
c_model: BaseEstimator,
validation: LabelledCollection,
protocol: AbstractStochasticSeededProtocol,
predict_method="predict",
):
c_model_predict = getattr(c_model, predict_method)
val_preds = c_model_predict(validation.X)
log_likelihood_val = iw.kdex2_lltr(validation.X)
Xval = validation.X.toarray() if issparse(validation.X) else validation.X
report = EvaluationReport(name="kdex2")
for test in protocol():
Xte = test.X.toarray() if issparse(test.X) else test.X
wx = iw.kdex2_weights(Xval, Xte, log_likelihood_val)
test_preds = c_model_predict(Xte)
estim_acc = iw.get_acc(val_preds, validation.y, wx)
true_acc = metrics.accuracy_score(test.y, test_preds)
meta_score = abs(estim_acc - true_acc)
report.append_row(test.prevalence(), acc=meta_score, acc_score=estim_acc)
return report
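A minimal sketch (hypothetical helper, not part of the diff) of how the _baselines registry populated by the decorator can be consumed:

def run_baseline(name, c_model, validation, protocol):
    # Dispatch by name through the registry; `name` must match a decorated
    # function, e.g. "kfcv", "atc_mc" or "rca".
    return _baselines[name](c_model, validation, protocol)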

View File

@ -1,128 +1,128 @@
import multiprocessing
import time
from traceback import print_exception as traceback
from typing import List
import numpy as np
import pandas as pd
import quapy as qp
from quacc.dataset import Dataset
from quacc.environment import env
from quacc.evaluation import baseline, method
from quacc.evaluation.report import CompReport, DatasetReport, EvaluationReport
from quacc.evaluation.worker import estimate_worker
from quacc.logger import Logger
pd.set_option("display.float_format", "{:.4f}".format)
qp.environ["SAMPLE_SIZE"] = env.SAMPLE_SIZE
class CompEstimatorName_:
def __init__(self, ce):
self.ce = ce
def __getitem__(self, e: str | List[str]):
if isinstance(e, str):
return self.ce._CompEstimator__get(e)[0]
elif isinstance(e, list):
return list(self.ce._CompEstimator__get(e).keys())
class CompEstimatorFunc_:
def __init__(self, ce):
self.ce = ce
def __getitem__(self, e: str | List[str]):
if isinstance(e, str):
return self.ce._CompEstimator__get(e)[1]
elif isinstance(e, list):
return list(self.ce._CompEstimator__get(e).values())
class CompEstimator:
__dict = method._methods | baseline._baselines
    def __get(self, e: str | List[str]):
        if isinstance(e, str):
            try:
                return (e, self.__dict[e])
            except KeyError:
                raise KeyError(f"Invalid estimator: estimator {e} does not exist")
        elif isinstance(e, list):
            _subtr = np.setdiff1d(e, list(self.__dict.keys()))
            if len(_subtr) > 0:
                raise KeyError(
                    f"Invalid estimator: estimator {_subtr[0]} does not exist"
                )
            e_fun = {k: fun for k, fun in self.__dict.items() if k in e}
            if "ref" not in e:
                e_fun["ref"] = self.__dict["ref"]
            return e_fun
@property
def name(self):
return CompEstimatorName_(self)
@property
def func(self):
return CompEstimatorFunc_(self)
CE = CompEstimator()
def evaluate_comparison(dataset: Dataset, estimators=None) -> DatasetReport:
log = Logger.logger()
# with multiprocessing.Pool(1) as pool:
with multiprocessing.Pool(len(estimators)) as pool:
dr = DatasetReport(dataset.name)
log.info(f"dataset {dataset.name}")
for d in dataset():
log.info(
f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} started"
)
tstart = time.time()
tasks = [
(estim, d.train, d.validation, d.test) for estim in CE.func[estimators]
]
results = [
pool.apply_async(estimate_worker, t, {"_env": env, "q": Logger.queue()})
for t in tasks
]
results_got = []
for _r in results:
try:
r = _r.get()
if r["result"] is not None:
results_got.append(r)
except Exception as e:
log.warning(
f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} failed. Exception: {e}"
)
tend = time.time()
times = {r["name"]: r["time"] for r in results_got}
times["tot"] = tend - tstart
log.info(
f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} finished [took {times['tot']:.4f}s]"
)
try:
cr = CompReport(
[r["result"] for r in results_got],
name=dataset.name,
train_prev=d.train_prev,
valid_prev=d.validation_prev,
times=times,
)
except Exception as e:
log.warning(
f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} failed. Exception: {e}"
)
traceback(e)
cr = None
dr += cr
return dr
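A hedged usage sketch of the CE accessor defined above:

# `name` validates estimator names, `func` resolves them to callables; when
# a list is resolved, the "ref" baseline is appended automatically.
names = CE.name[["bin_sld", "mul_sld"]]
funcs = CE.func[["bin_sld", "mul_sld"]]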

View File

@ -1,305 +1,305 @@
import inspect
from functools import wraps
import numpy as np
from quapy.method.aggregative import PACC, SLD, CC
from quapy.protocol import UPP, AbstractProtocol
from sklearn.linear_model import LogisticRegression
import quacc as qc
from quacc.evaluation.report import EvaluationReport
from quacc.method.model_selection import BQAEgsq, GridSearchAE, MCAEgsq
from ..method.base import BQAE, MCAE, BaseAccuracyEstimator
_methods = {}
_sld_param_grid = {
"q__classifier__C": np.logspace(-3, 3, 7),
"q__classifier__class_weight": [None, "balanced"],
"q__recalib": [None, "bcts"],
"q__exact_train_prev": [True],
"confidence": [None, "max_conf", "entropy"],
}
_pacc_param_grid = {
"q__classifier__C": np.logspace(-3, 3, 7),
"q__classifier__class_weight": [None, "balanced"],
"confidence": [None, "max_conf", "entropy"],
}
def method(func):
@wraps(func)
def wrapper(c_model, validation, protocol):
return func(c_model, validation, protocol)
_methods[func.__name__] = wrapper
return wrapper
def evaluation_report(
estimator: BaseAccuracyEstimator,
protocol: AbstractProtocol,
) -> EvaluationReport:
method_name = inspect.stack()[1].function
report = EvaluationReport(name=method_name)
for sample in protocol():
e_sample = estimator.extend(sample)
estim_prev = estimator.estimate(e_sample.X, ext=True)
acc_score = qc.error.acc(estim_prev)
f1_score = qc.error.f1(estim_prev)
report.append_row(
sample.prevalence(),
acc_score=acc_score,
acc=abs(qc.error.acc(e_sample.prevalence()) - acc_score),
f1_score=f1_score,
f1=abs(qc.error.f1(e_sample.prevalence()) - f1_score),
)
return report
@method
def bin_sld(c_model, validation, protocol) -> EvaluationReport:
est = BQAE(c_model, SLD(LogisticRegression())).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mul_sld(c_model, validation, protocol) -> EvaluationReport:
est = MCAE(c_model, SLD(LogisticRegression())).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def binmc_sld(c_model, validation, protocol) -> EvaluationReport:
est = BQAE(
c_model,
SLD(LogisticRegression()),
confidence="max_conf",
).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mulmc_sld(c_model, validation, protocol) -> EvaluationReport:
est = MCAE(
c_model,
SLD(LogisticRegression()),
confidence="max_conf",
).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def binne_sld(c_model, validation, protocol) -> EvaluationReport:
est = BQAE(
c_model,
SLD(LogisticRegression()),
confidence="entropy",
).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mulne_sld(c_model, validation, protocol) -> EvaluationReport:
est = MCAE(
c_model,
SLD(LogisticRegression()),
confidence="entropy",
).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def bin_sld_gs(c_model, validation, protocol) -> EvaluationReport:
v_train, v_val = validation.split_stratified(0.6, random_state=0)
model = BQAE(c_model, SLD(LogisticRegression()))
est = GridSearchAE(
model=model,
param_grid=_sld_param_grid,
refit=False,
protocol=UPP(v_val, repeats=100),
verbose=True,
).fit(v_train)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mul_sld_gs(c_model, validation, protocol) -> EvaluationReport:
v_train, v_val = validation.split_stratified(0.6, random_state=0)
model = MCAE(c_model, SLD(LogisticRegression()))
est = GridSearchAE(
model=model,
param_grid=_sld_param_grid,
refit=False,
protocol=UPP(v_val, repeats=100),
verbose=True,
).fit(v_train)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def bin_sld_gsq(c_model, validation, protocol) -> EvaluationReport:
est = BQAEgsq(
c_model,
SLD(LogisticRegression()),
param_grid={
"classifier__C": np.logspace(-3, 3, 7),
"classifier__class_weight": [None, "balanced"],
"recalib": [None, "bcts", "vs"],
},
refit=False,
verbose=False,
).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mul_sld_gsq(c_model, validation, protocol) -> EvaluationReport:
est = MCAEgsq(
c_model,
SLD(LogisticRegression()),
param_grid={
"classifier__C": np.logspace(-3, 3, 7),
"classifier__class_weight": [None, "balanced"],
"recalib": [None, "bcts", "vs"],
},
refit=False,
verbose=False,
).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def bin_pacc(c_model, validation, protocol) -> EvaluationReport:
est = BQAE(c_model, PACC(LogisticRegression())).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mul_pacc(c_model, validation, protocol) -> EvaluationReport:
est = MCAE(c_model, PACC(LogisticRegression())).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def binmc_pacc(c_model, validation, protocol) -> EvaluationReport:
est = BQAE(c_model, PACC(LogisticRegression()), confidence="max_conf").fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mulmc_pacc(c_model, validation, protocol) -> EvaluationReport:
est = MCAE(c_model, PACC(LogisticRegression()), confidence="max_conf").fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def binne_pacc(c_model, validation, protocol) -> EvaluationReport:
est = BQAE(c_model, PACC(LogisticRegression()), confidence="entropy").fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mulne_pacc(c_model, validation, protocol) -> EvaluationReport:
est = MCAE(c_model, PACC(LogisticRegression()), confidence="entropy").fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def bin_pacc_gs(c_model, validation, protocol) -> EvaluationReport:
v_train, v_val = validation.split_stratified(0.6, random_state=0)
model = BQAE(c_model, PACC(LogisticRegression()))
est = GridSearchAE(
model=model,
param_grid=_pacc_param_grid,
refit=False,
protocol=UPP(v_val, repeats=100),
verbose=False,
).fit(v_train)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mul_pacc_gs(c_model, validation, protocol) -> EvaluationReport:
v_train, v_val = validation.split_stratified(0.6, random_state=0)
model = MCAE(c_model, PACC(LogisticRegression()))
est = GridSearchAE(
model=model,
param_grid=_pacc_param_grid,
refit=False,
protocol=UPP(v_val, repeats=100),
verbose=False,
).fit(v_train)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def bin_cc(c_model, validation, protocol) -> EvaluationReport:
est = BQAE(c_model, CC(LogisticRegression())).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
@method
def mul_cc(c_model, validation, protocol) -> EvaluationReport:
est = MCAE(c_model, CC(LogisticRegression())).fit(validation)
return evaluation_report(
estimator=est,
protocol=protocol,
)
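The @method registry mirrors _baselines; a hypothetical dispatch (c_model, validation and protocol are assumed to be built as in the worker module):

#     report = _methods["bin_sld"](c_model, validation, protocol)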

File diff suppressed because it is too large

View File

@ -1,44 +1,44 @@
import time
from traceback import print_exception as traceback
import quapy as qp
from quapy.protocol import APP
from sklearn.linear_model import LogisticRegression
from quacc.logger import SubLogger
def estimate_worker(_estimate, train, validation, test, _env=None, q=None):
qp.environ["SAMPLE_SIZE"] = _env.SAMPLE_SIZE
SubLogger.setup(q)
log = SubLogger.logger()
model = LogisticRegression()
model.fit(*train.Xy)
protocol = APP(
test,
n_prevalences=_env.PROTOCOL_N_PREVS,
repeats=_env.PROTOCOL_REPEATS,
return_type="labelled_collection",
)
start = time.time()
try:
result = _estimate(model, validation, protocol)
except Exception as e:
log.warning(f"Method {_estimate.__name__} failed. Exception: {e}")
traceback(e)
return {
"name": _estimate.__name__,
"result": None,
"time": 0,
}
end = time.time()
log.info(f"{_estimate.__name__} finished [took {end-start:.4f}s]")
return {
"name": _estimate.__name__,
"result": result,
"time": end - start,
}
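For context, a hedged sketch of how estimate_worker is dispatched from evaluate_comparison (names as used there):

# res = pool.apply_async(
#     estimate_worker,
#     (estim_fun, d.train, d.validation, d.test),
#     {"_env": env, "q": Logger.queue()},
# )
# Each worker re-seeds qp.environ and routes its log records through the
# queue consumed by the Logger listener thread.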

View File

@ -1,136 +1,136 @@
import logging
import logging.handlers
import multiprocessing
import threading
from pathlib import Path
class Logger:
__logger_file = "quacc.log"
__logger_name = "queue_logger"
__manager = None
__queue = None
__thread = None
__setup = False
__handlers = []
@classmethod
def __logger_listener(cls, q):
while True:
record = q.get()
if record is None:
break
root = logging.getLogger("listener")
root.handle(record)
@classmethod
def setup(cls):
if cls.__setup:
return
# setup root
root = logging.getLogger("listener")
root.setLevel(logging.DEBUG)
rh = logging.FileHandler(cls.__logger_file, mode="a")
rh.setLevel(logging.DEBUG)
root.addHandler(rh)
# setup logger
if cls.__manager is None:
cls.__manager = multiprocessing.Manager()
if cls.__queue is None:
cls.__queue = cls.__manager.Queue()
logger = logging.getLogger(cls.__logger_name)
logger.setLevel(logging.DEBUG)
qh = logging.handlers.QueueHandler(cls.__queue)
qh.setLevel(logging.DEBUG)
qh.setFormatter(
logging.Formatter(
fmt="%(asctime)s| %(levelname)-8s %(message)s",
datefmt="%d/%m/%y %H:%M:%S",
)
)
logger.addHandler(qh)
# start listener
cls.__thread = threading.Thread(
target=cls.__logger_listener,
args=(cls.__queue,),
)
cls.__thread.start()
cls.__setup = True
@classmethod
def add_handler(cls, path: Path):
root = logging.getLogger("listener")
rh = logging.FileHandler(path, mode="a")
rh.setLevel(logging.DEBUG)
cls.__handlers.append(rh)
root.addHandler(rh)
@classmethod
def clear_handlers(cls):
root = logging.getLogger("listener")
for h in cls.__handlers:
root.removeHandler(h)
cls.__handlers.clear()
@classmethod
def queue(cls):
if not cls.__setup:
cls.setup()
return cls.__queue
@classmethod
def logger(cls):
if not cls.__setup:
cls.setup()
return logging.getLogger(cls.__logger_name)
@classmethod
def close(cls):
if cls.__setup and cls.__thread is not None:
root = logging.getLogger("listener")
root.info("-" * 100)
cls.__queue.put(None)
cls.__thread.join()
# cls.__manager.close()
class SubLogger:
__queue = None
__setup = False
@classmethod
def setup(cls, q):
if cls.__setup:
return
cls.__queue = q
# setup root
root = logging.getLogger()
root.setLevel(logging.DEBUG)
rh = logging.handlers.QueueHandler(q)
rh.setLevel(logging.DEBUG)
rh.setFormatter(
logging.Formatter(
fmt="%(asctime)s| %(levelname)-12s%(message)s",
datefmt="%d/%m/%y %H:%M:%S",
)
)
root.addHandler(rh)
cls.__setup = True
@classmethod
def logger(cls):
if not cls.__setup:
return None
return logging.getLogger()
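A minimal usage sketch (illustrative): records flow through the shared queue to the listener thread, which drains it until it sees the None sentinel, so Logger.close() must be called before exiting. The "out/run.log" path is hypothetical and assumes the out/ directory exists.

from pathlib import Path

from quacc.logger import Logger

log = Logger.logger()                     # lazily starts queue and listener
Logger.add_handler(Path("out/run.log"))   # mirror records to a per-run file
log.info("run started")
Logger.clear_handlers()
Logger.close()                            # sends the sentinel, joins the thread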


@ -1,75 +1,75 @@
from sys import platform
from traceback import print_exception as traceback
import quacc.evaluation.comp as comp
from quacc.dataset import Dataset
from quacc.environment import env
from quacc.logger import Logger
from quacc.utils import create_dataset_dir
CE = comp.CompEstimator()
def toast():
if platform == "win32":
import win11toast
win11toast.notify("Comp", "Completed Execution")
def estimate_comparison():
log = Logger.logger()
for conf in env.get_confs():
dataset = Dataset(
env.DATASET_NAME,
target=env.DATASET_TARGET,
n_prevalences=env.DATASET_N_PREVS,
prevs=env.DATASET_PREVS,
)
        create_dataset_dir(dataset.name, update=env.DATASET_DIR_UPDATE)
Logger.add_handler(env.OUT_DIR / f"{dataset.name}.log")
try:
dr = comp.evaluate_comparison(
dataset,
estimators=CE.name[env.COMP_ESTIMATORS],
)
        except Exception as e:
            log.error(f"Evaluation over {dataset.name} failed. Exception: {e}")
            traceback(e)
            # dr is left undefined on failure: skip to the next configuration
            continue
for plot_conf in env.get_plot_confs():
for m in env.METRICS:
output_path = env.OUT_DIR / f"{plot_conf}_{m}.md"
try:
_repr = dr.to_md(
conf=plot_conf,
metric=m,
estimators=CE.name[env.PLOT_ESTIMATORS],
stdev=env.PLOT_STDEV,
)
with open(output_path, "w") as f:
f.write(_repr)
except Exception as e:
log.error(
f"Failed while saving configuration {plot_conf} of {dataset.name}. Exception: {e}"
)
traceback(e)
Logger.clear_handlers()
# print(df.to_latex(float_format="{:.4f}".format))
# print(utils.avg_group_report(df).to_latex(float_format="{:.4f}".format))
def main():
log = Logger.logger()
try:
estimate_comparison()
except Exception as e:
log.error(f"estimate comparison failed. Exceprion: {e}")
traceback(e)
toast()
Logger.close()
if __name__ == "__main__":
main()


@ -1,120 +1,120 @@
from copy import deepcopy
from time import time
import numpy as np
import win11toast
from quapy.method.aggregative import SLD
from quapy.protocol import APP, UPP
from sklearn.linear_model import LogisticRegression
import quacc as qc
from quacc.dataset import Dataset
from quacc.error import acc
from quacc.evaluation.baseline import ref
from quacc.evaluation.method import mulmc_sld
from quacc.evaluation.report import CompReport, EvaluationReport
from quacc.method.base import MCAE, BinaryQuantifierAccuracyEstimator
from quacc.method.model_selection import GridSearchAE
def test_gs():
d = Dataset(name="rcv1", target="CCAT", n_prevalences=1).get_raw()
classifier = LogisticRegression()
classifier.fit(*d.train.Xy)
quantifier = SLD(LogisticRegression())
# estimator = MultiClassAccuracyEstimator(classifier, quantifier)
estimator = BinaryQuantifierAccuracyEstimator(classifier, quantifier)
v_train, v_val = d.validation.split_stratified(0.6, random_state=0)
gs_protocol = UPP(v_val, sample_size=1000, repeats=100)
gs_estimator = GridSearchAE(
model=deepcopy(estimator),
param_grid={
"q__classifier__C": np.logspace(-3, 3, 7),
"q__classifier__class_weight": [None, "balanced"],
"q__recalib": [None, "bcts", "ts"],
},
refit=False,
protocol=gs_protocol,
verbose=True,
).fit(v_train)
estimator.fit(d.validation)
tstart = time()
erb, ergs = EvaluationReport("base"), EvaluationReport("gs")
protocol = APP(
d.test,
sample_size=1000,
n_prevalences=21,
repeats=100,
return_type="labelled_collection",
)
for sample in protocol():
e_sample = gs_estimator.extend(sample)
estim_prev_b = estimator.estimate(e_sample.X, ext=True)
estim_prev_gs = gs_estimator.estimate(e_sample.X, ext=True)
erb.append_row(
sample.prevalence(),
acc=abs(acc(e_sample.prevalence()) - acc(estim_prev_b)),
)
ergs.append_row(
sample.prevalence(),
acc=abs(acc(e_sample.prevalence()) - acc(estim_prev_gs)),
)
cr = CompReport(
[erb, ergs],
"test",
train_prev=d.train_prev,
valid_prev=d.validation_prev,
)
print(cr.table())
print(f"[took {time() - tstart:.3f}s]")
win11toast.notify("Test", "completed")
def test_mc():
d = Dataset(name="rcv1", target="CCAT", prevs=[0.9]).get()[0]
classifier = LogisticRegression().fit(*d.train.Xy)
protocol = APP(
d.test,
sample_size=1000,
repeats=100,
n_prevalences=21,
return_type="labelled_collection",
)
ref_er = ref(classifier, d.validation, protocol)
mulmc_er = mulmc_sld(classifier, d.validation, protocol)
cr = CompReport(
[mulmc_er, ref_er],
name="test_mc",
train_prev=d.train_prev,
valid_prev=d.validation_prev,
)
with open("test_mc.md", "w") as f:
f.write(cr.data().to_markdown())
def test_et():
d = Dataset(name="imdb", prevs=[0.5]).get()[0]
classifier = LogisticRegression().fit(*d.train.Xy)
estimator = MCAE(
classifier,
SLD(LogisticRegression(), exact_train_prev=False),
confidence="max_conf",
).fit(d.validation)
e_test = estimator.extend(d.test)
ep = estimator.estimate(e_test.X, ext=True)
print(f"{qc.error.acc(ep) = }")
print(f"{qc.error.acc(e_test.prevalence()) = }")
if __name__ == "__main__":
test_et()


@ -1,177 +1,177 @@
import math
from abc import abstractmethod
from copy import deepcopy
from typing import List
import numpy as np
from quapy.data import LabelledCollection
from quapy.method.aggregative import BaseQuantifier
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator
from quacc.data import ExtendedCollection
class BaseAccuracyEstimator(BaseQuantifier):
def __init__(
self,
classifier: BaseEstimator,
quantifier: BaseQuantifier,
confidence=None,
):
self.__check_classifier(classifier)
self.quantifier = quantifier
self.confidence = confidence
def __check_classifier(self, classifier):
if not hasattr(classifier, "predict_proba"):
raise ValueError(
f"Passed classifier {classifier.__class__.__name__} cannot predict probabilities."
)
self.classifier = classifier
def __get_confidence(self):
def max_conf(probas):
_mc = np.max(probas, axis=-1)
_min = 1.0 / probas.shape[1]
_norm_mc = (_mc - _min) / (1.0 - _min)
return _norm_mc
def entropy(probas):
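            # NB: np.sum(p * log(p)) is the negative entropy, so larger
            # values still mean more confident (lower-entropy) predictions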
_ent = np.sum(np.multiply(probas, np.log(probas + 1e-20)), axis=1)
return _ent
if self.confidence is None:
return None
__confs = {
"max_conf": max_conf,
"entropy": entropy,
}
return __confs.get(self.confidence, None)
def __get_ext(self, pred_proba):
_ext = pred_proba
_f_conf = self.__get_confidence()
if _f_conf is not None:
_confs = _f_conf(pred_proba).reshape((len(pred_proba), 1))
_ext = np.concatenate((_confs, pred_proba), axis=1)
return _ext
def extend(self, coll: LabelledCollection, pred_proba=None) -> ExtendedCollection:
if pred_proba is None:
pred_proba = self.classifier.predict_proba(coll.X)
_ext = self.__get_ext(pred_proba)
return ExtendedCollection.extend_collection(coll, pred_proba=_ext)
def _extend_instances(self, instances: np.ndarray | csr_matrix, pred_proba=None):
if pred_proba is None:
pred_proba = self.classifier.predict_proba(instances)
_ext = self.__get_ext(pred_proba)
return ExtendedCollection.extend_instances(instances, _ext)
@abstractmethod
def fit(self, train: LabelledCollection | ExtendedCollection):
...
@abstractmethod
def estimate(self, instances, ext=False) -> np.ndarray:
...
class MultiClassAccuracyEstimator(BaseAccuracyEstimator):
def __init__(
self,
classifier: BaseEstimator,
quantifier: BaseQuantifier,
confidence: str = None,
):
super().__init__(
classifier=classifier,
quantifier=quantifier,
confidence=confidence,
)
self.e_train = None
def fit(self, train: LabelledCollection):
self.e_train = self.extend(train)
self.quantifier.fit(self.e_train)
return self
def estimate(self, instances, ext=False) -> np.ndarray:
e_inst = instances if ext else self._extend_instances(instances)
estim_prev = self.quantifier.quantify(e_inst)
return self._check_prevalence_classes(estim_prev, self.quantifier.classes_)
def _check_prevalence_classes(self, estim_prev, estim_classes) -> np.ndarray:
true_classes = self.e_train.classes_
for _cls in true_classes:
if _cls not in estim_classes:
estim_prev = np.insert(estim_prev, _cls, [0.0], axis=0)
return estim_prev
class BinaryQuantifierAccuracyEstimator(BaseAccuracyEstimator):
def __init__(
self,
classifier: BaseEstimator,
quantifier: BaseAccuracyEstimator,
confidence: str = None,
):
super().__init__(
classifier=classifier,
quantifier=quantifier,
confidence=confidence,
)
self.quantifiers = []
self.e_trains = []
def fit(self, train: LabelledCollection | ExtendedCollection):
self.e_train = self.extend(train)
self.n_classes = self.e_train.n_classes
self.e_trains = self.e_train.split_by_pred()
self.quantifiers = []
for train in self.e_trains:
quant = deepcopy(self.quantifier)
quant.fit(train)
self.quantifiers.append(quant)
return self
def estimate(self, instances, ext=False):
# TODO: test
e_inst = instances if ext else self._extend_instances(instances)
_ncl = int(math.sqrt(self.n_classes))
s_inst, norms = ExtendedCollection.split_inst_by_pred(_ncl, e_inst)
estim_prevs = self._quantify_helper(s_inst, norms)
estim_prev = np.array([prev_row for prev_row in zip(*estim_prevs)]).flatten()
return estim_prev
def _quantify_helper(
self,
s_inst: List[np.ndarray | csr_matrix],
norms: List[float],
):
estim_prevs = []
for quant, inst, norm in zip(self.quantifiers, s_inst, norms):
if inst.shape[0] > 0:
estim_prevs.append(quant.quantify(inst) * norm)
else:
estim_prevs.append(np.asarray([0.0, 0.0]))
return estim_prevs
BAE = BaseAccuracyEstimator
MCAE = MultiClassAccuracyEstimator
BQAE = BinaryQuantifierAccuracyEstimator
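A worked numeric example (illustrative, made-up values, two base classes) of how BinaryQuantifierAccuracyEstimator.estimate recombines the per-prediction quantifiers: each binary estimate is scaled by the share of instances that received that prediction, then interleaved into the four extended classes.

import numpy as np

n0, n1 = 0.6, 0.4                    # norms: share of instances with pred 0 / 1
q0 = np.asarray([0.9, 0.1]) * n0     # (true=0, true=1) prevalence among pred=0
q1 = np.asarray([0.2, 0.8]) * n1     # (true=0, true=1) prevalence among pred=1
# zip-and-flatten interleaves the estimates into the extended classes
# [(t=0,p=0), (t=0,p=1), (t=1,p=0), (t=1,p=1)]
estim_prev = np.array(list(zip(q0, q1))).flatten()
print(estim_prev)                    # [0.54 0.08 0.06 0.32] -> sums to 1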


@ -1,307 +1,307 @@
import itertools
from copy import deepcopy
from time import time
from typing import Callable, Union
import numpy as np
import quapy as qp
from quapy.data import LabelledCollection
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP, AbstractProtocol, OnLabelledCollectionProtocol
from sklearn.base import BaseEstimator
import quacc as qc
import quacc.error
from quacc.data import ExtendedCollection
from quacc.evaluation import evaluate
from quacc.logger import SubLogger
from quacc.method.base import (
BaseAccuracyEstimator,
BinaryQuantifierAccuracyEstimator,
MultiClassAccuracyEstimator,
)
class GridSearchAE(BaseAccuracyEstimator):
def __init__(
self,
model: BaseAccuracyEstimator,
param_grid: dict,
protocol: AbstractProtocol,
error: Union[Callable, str] = qc.error.maccd,
refit=True,
# timeout=-1,
# n_jobs=None,
verbose=False,
):
self.model = model
self.param_grid = self.__normalize_params(param_grid)
self.protocol = protocol
self.refit = refit
# self.timeout = timeout
# self.n_jobs = qp._get_njobs(n_jobs)
self.verbose = verbose
self.__check_error(error)
assert isinstance(protocol, AbstractProtocol), "unknown protocol"
def _sout(self, msg):
if self.verbose:
print(f"[{self.__class__.__name__}]: {msg}")
def __normalize_params(self, params):
__remap = {}
for key in params.keys():
k, delim, sub_key = key.partition("__")
if delim and k == "q":
__remap[key] = f"quantifier__{sub_key}"
return {(__remap[k] if k in __remap else k): v for k, v in params.items()}
def __check_error(self, error):
if error in qc.error.ACCURACY_ERROR:
self.error = error
elif isinstance(error, str):
self.error = qc.error.from_name(error)
elif hasattr(error, "__call__"):
self.error = error
else:
raise ValueError(
f"unexpected error type; must either be a callable function or a str representing\n"
f"the name of an error function in {qc.error.ACCURACY_ERROR_NAMES}"
)
def fit(self, training: LabelledCollection):
"""Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
the error metric.
:param training: the training set on which to optimize the hyperparameters
:return: self
"""
params_keys = list(self.param_grid.keys())
params_values = list(self.param_grid.values())
protocol = self.protocol
self.param_scores_ = {}
self.best_score_ = None
tinit = time()
hyper = [
dict(zip(params_keys, val)) for val in itertools.product(*params_values)
]
# self._sout(f"starting model selection with {self.n_jobs =}")
self._sout("starting model selection")
scores = [self.__params_eval(params, training) for params in hyper]
for params, score, model in scores:
if score is not None:
if self.best_score_ is None or score < self.best_score_:
self.best_score_ = score
self.best_params_ = params
self.best_model_ = model
self.param_scores_[str(params)] = score
else:
self.param_scores_[str(params)] = "timeout"
tend = time() - tinit
if self.best_score_ is None:
raise TimeoutError("no combination of hyperparameters seem to work")
self._sout(
f"optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) "
f"[took {tend:.4f}s]"
)
log = SubLogger.logger()
log.debug(
f"[{self.model.__class__.__name__}] "
f"optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) "
f"[took {tend:.4f}s]"
)
if self.refit:
if isinstance(protocol, OnLabelledCollectionProtocol):
self._sout("refitting on the whole development set")
self.best_model_.fit(training + protocol.get_labelled_collection())
else:
raise RuntimeWarning(
f'"refit" was requested, but the protocol does not '
f"implement the {OnLabelledCollectionProtocol.__name__} interface"
)
return self
def __params_eval(self, params, training):
protocol = self.protocol
error = self.error
# if self.timeout > 0:
# def handler(signum, frame):
# raise TimeoutError()
# signal.signal(signal.SIGALRM, handler)
tinit = time()
# if self.timeout > 0:
# signal.alarm(self.timeout)
try:
model = deepcopy(self.model)
# overrides default parameters with the parameters being explored at this iteration
model.set_params(**params)
# print({k: v for k, v in model.get_params().items() if k in params})
model.fit(training)
score = evaluate(model, protocol=protocol, error_metric=error)
ttime = time() - tinit
self._sout(
f"hyperparams={params}\t got score {score:.5f} [took {ttime:.4f}s]"
)
# if self.timeout > 0:
# signal.alarm(0)
# except TimeoutError:
# self._sout(f"timeout ({self.timeout}s) reached for config {params}")
# score = None
except ValueError as e:
self._sout(f"the combination of hyperparameters {params} is invalid")
raise e
except Exception as e:
self._sout(f"something went wrong for config {params}; skipping:")
self._sout(f"\tException: {e}")
score = None
return params, score, model
def extend(self, coll: LabelledCollection, pred_proba=None) -> ExtendedCollection:
assert hasattr(self, "best_model_"), "quantify called before fit"
return self.best_model().extend(coll, pred_proba=pred_proba)
def estimate(self, instances, ext=False):
"""Estimate class prevalence values using the best model found after calling the :meth:`fit` method.
        :param instances: sample containing the instances
:return: a ndarray of shape `(n_classes)` with class prevalence estimates as according to the best model found
by the model selection process.
"""
assert hasattr(self, "best_model_"), "estimate called before fit"
return self.best_model().estimate(instances, ext=ext)
def set_params(self, **parameters):
"""Sets the hyper-parameters to explore.
:param parameters: a dictionary with keys the parameter names and values the list of values to explore
"""
self.param_grid = parameters
def get_params(self, deep=True):
"""Returns the dictionary of hyper-parameters to explore (`param_grid`)
:param deep: Unused
:return: the dictionary `param_grid`
"""
return self.param_grid
def best_model(self):
"""
Returns the best model found after calling the :meth:`fit` method, i.e., the one trained on the combination
of hyper-parameters that minimized the error function.
:return: a trained quantifier
"""
if hasattr(self, "best_model_"):
return self.best_model_
raise ValueError("best_model called before fit")
class MCAEgsq(MultiClassAccuracyEstimator):
def __init__(
self,
classifier: BaseEstimator,
quantifier: BaseAccuracyEstimator,
param_grid: dict,
error: Union[Callable, str] = qp.error.mae,
refit=True,
timeout=-1,
n_jobs=None,
verbose=False,
):
self.param_grid = param_grid
self.refit = refit
self.timeout = timeout
self.n_jobs = n_jobs
self.verbose = verbose
self.error = error
super().__init__(classifier, quantifier)
def fit(self, train: LabelledCollection):
self.e_train = self.extend(train)
t_train, t_val = self.e_train.split_stratified(0.6, random_state=0)
self.quantifier = GridSearchQ(
deepcopy(self.quantifier),
param_grid=self.param_grid,
protocol=UPP(t_val, repeats=100),
error=self.error,
refit=self.refit,
timeout=self.timeout,
n_jobs=self.n_jobs,
verbose=self.verbose,
).fit(self.e_train)
return self
def estimate(self, instances, ext=False) -> np.ndarray:
e_inst = instances if ext else self._extend_instances(instances)
estim_prev = self.quantifier.quantify(e_inst)
return self._check_prevalence_classes(estim_prev, self.quantifier.best_model().classes_)
class BQAEgsq(BinaryQuantifierAccuracyEstimator):
def __init__(
self,
classifier: BaseEstimator,
quantifier: BaseAccuracyEstimator,
param_grid: dict,
error: Union[Callable, str] = qp.error.mae,
refit=True,
timeout=-1,
n_jobs=None,
verbose=False,
):
self.param_grid = param_grid
self.refit = refit
self.timeout = timeout
self.n_jobs = n_jobs
self.verbose = verbose
self.error = error
super().__init__(classifier=classifier, quantifier=quantifier)
def fit(self, train: LabelledCollection):
self.e_train = self.extend(train)
self.n_classes = self.e_train.n_classes
self.e_trains = self.e_train.split_by_pred()
self.quantifiers = []
for e_train in self.e_trains:
t_train, t_val = e_train.split_stratified(0.6, random_state=0)
quantifier = GridSearchQ(
model=deepcopy(self.quantifier),
param_grid=self.param_grid,
protocol=UPP(t_val, repeats=100),
error=self.error,
refit=self.refit,
timeout=self.timeout,
n_jobs=self.n_jobs,
verbose=self.verbose,
).fit(t_train)
self.quantifiers.append(quantifier)
return self
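A small illustration of __normalize_params: a grid may address the wrapped quantifier with the shorthand "q__" prefix, which is rewritten to the "quantifier__" attribute path that set_params expects; any other key is left untouched.

param_grid = {
    "q__classifier__C": [0.1, 1.0, 10.0],   # -> "quantifier__classifier__C"
    "confidence": ["max_conf", "entropy"],  # left as-is
}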


@ -1,239 +1,239 @@
from pathlib import Path
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from cycler import cycler
from quacc.environment import env
matplotlib.use("agg")
def _get_markers(n: int):
ls = "ovx+sDph*^1234X><.Pd"
    if n > len(ls):
        # integer division: str * float raises a TypeError
        ls = ls * (n // len(ls) + 1)
return list(ls)[:n]
def plot_delta(
base_prevs,
columns,
data,
*,
stdevs=None,
pos_class=1,
metric="acc",
name="default",
train_prev=None,
legend=True,
avg=None,
) -> Path:
_base_title = "delta_stdev" if stdevs is not None else "delta"
if train_prev is not None:
t_prev_pos = int(round(train_prev[pos_class] * 100))
title = f"{_base_title}_{name}_{t_prev_pos}_{metric}"
else:
title = f"{_base_title}_{name}_avg_{avg}_{metric}"
fig, ax = plt.subplots()
ax.set_aspect("auto")
ax.grid()
NUM_COLORS = len(data)
cm = plt.get_cmap("tab10")
if NUM_COLORS > 10:
cm = plt.get_cmap("tab20")
cy = cycler(color=[cm(i) for i in range(NUM_COLORS)])
base_prevs = base_prevs[:, pos_class]
for method, deltas, _cy in zip(columns, data, cy):
ax.plot(
base_prevs,
deltas,
label=method,
color=_cy["color"],
linestyle="-",
marker="o",
markersize=3,
zorder=2,
)
if stdevs is not None:
_col_idx = np.where(columns == method)[0]
stdev = stdevs[_col_idx].flatten()
            # `x != np.nan` is elementwise True, so it never filtered NaNs;
            # use np.isnan to keep only the valid points
            nn_idx = np.intersect1d(
                np.where(~np.isnan(deltas))[0],
                np.where(~np.isnan(stdev))[0],
            )
_bps, _ds, _st = base_prevs[nn_idx], deltas[nn_idx], stdev[nn_idx]
ax.fill_between(
_bps,
_ds - _st,
_ds + _st,
color=_cy["color"],
alpha=0.25,
)
x_label = "test" if avg is None or avg == "train" else "train"
ax.set(
xlabel=f"{x_label} prevalence",
ylabel=metric,
title=title,
)
if legend:
ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
output_path = env.PLOT_OUT_DIR / f"{title}.png"
fig.savefig(output_path, bbox_inches="tight")
return output_path
def plot_diagonal(
reference,
columns,
data,
*,
pos_class=1,
metric="acc",
name="default",
train_prev=None,
legend=True,
):
if train_prev is not None:
t_prev_pos = int(round(train_prev[pos_class] * 100))
title = f"diagonal_{name}_{t_prev_pos}_{metric}"
else:
title = f"diagonal_{name}_{metric}"
fig, ax = plt.subplots()
ax.set_aspect("auto")
ax.grid()
ax.set_aspect("equal")
NUM_COLORS = len(data)
cm = plt.get_cmap("tab10")
if NUM_COLORS > 10:
cm = plt.get_cmap("tab20")
cy = cycler(
color=[cm(i) for i in range(NUM_COLORS)],
marker=_get_markers(NUM_COLORS),
)
reference = np.array(reference)
x_ticks = np.unique(reference)
x_ticks.sort()
for deltas, _cy in zip(data, cy):
ax.plot(
reference,
deltas,
color=_cy["color"],
linestyle="None",
marker=_cy["marker"],
markersize=3,
zorder=2,
alpha=0.25,
)
# ensure limits are equal for both axes
_alims = np.stack(((ax.get_xlim(), ax.get_ylim())), axis=-1)
_lims = np.array([f(ls) for f, ls in zip([np.min, np.max], _alims)])
ax.set(xlim=tuple(_lims), ylim=tuple(_lims))
for method, deltas, _cy in zip(columns, data, cy):
slope, interc = np.polyfit(reference, deltas, 1)
y_lr = np.array([slope * x + interc for x in _lims])
ax.plot(
_lims,
y_lr,
label=method,
color=_cy["color"],
linestyle="-",
markersize="0",
zorder=1,
)
# plot reference line
ax.plot(
_lims,
_lims,
color="black",
linestyle="--",
markersize=0,
zorder=1,
)
ax.set(xlabel=f"true {metric}", ylabel=f"estim. {metric}", title=title)
if legend:
ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
output_path = env.PLOT_OUT_DIR / f"{title}.png"
fig.savefig(output_path, bbox_inches="tight")
return output_path
def plot_shift(
shift_prevs,
columns,
data,
*,
counts=None,
pos_class=1,
metric="acc",
name="default",
train_prev=None,
legend=True,
) -> Path:
if train_prev is not None:
t_prev_pos = int(round(train_prev[pos_class] * 100))
title = f"shift_{name}_{t_prev_pos}_{metric}"
else:
title = f"shift_{name}_avg_{metric}"
fig, ax = plt.subplots()
ax.set_aspect("auto")
ax.grid()
NUM_COLORS = len(data)
cm = plt.get_cmap("tab10")
if NUM_COLORS > 10:
cm = plt.get_cmap("tab20")
cy = cycler(color=[cm(i) for i in range(NUM_COLORS)])
shift_prevs = shift_prevs[:, pos_class]
for method, shifts, _cy in zip(columns, data, cy):
ax.plot(
shift_prevs,
shifts,
label=method,
color=_cy["color"],
linestyle="-",
marker="o",
markersize=3,
zorder=2,
)
if counts is not None:
_col_idx = np.where(columns == method)[0]
count = counts[_col_idx].flatten()
for prev, shift, cnt in zip(shift_prevs, shifts, count):
label = f"{cnt}"
plt.annotate(
label,
(prev, shift),
textcoords="offset points",
xytext=(0, 10),
ha="center",
color=_cy["color"],
fontsize=12.0,
)
ax.set(xlabel="dataset shift", ylabel=metric, title=title)
if legend:
ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
output_path = env.PLOT_OUT_DIR / f"{title}.png"
fig.savefig(output_path, bbox_inches="tight")
return output_path
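A minimal in-module call sketch (illustrative): the error values are made up, and it assumes env.PLOT_OUT_DIR has already been set (e.g. by create_dataset_dir) before plotting.

import numpy as np

base_prevs = np.array([[0.9, 0.1], [0.5, 0.5], [0.1, 0.9]])  # (n_prevs, n_classes)
columns = np.array(["ref", "mulmc_sld"])
data = np.array([
    [0.02, 0.03, 0.05],   # one row of acc errors per method
    [0.01, 0.02, 0.02],
])
plot_delta(base_prevs, columns, data, metric="acc", name="demo",
           train_prev=np.array([0.3, 0.7]))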


@ -1,59 +1,59 @@
import functools
import os
import shutil
from pathlib import Path
import pandas as pd
from quacc.environment import env
def combine_dataframes(dfs, df_index=[]) -> pd.DataFrame:
if len(dfs) < 1:
raise ValueError
if len(dfs) == 1:
return dfs[0]
df = dfs[0]
for ndf in dfs[1:]:
df = df.join(ndf.set_index(df_index), on=df_index)
return df
def avg_group_report(df: pd.DataFrame) -> pd.DataFrame:
def _reduce_func(s1, s2):
return {(n1, n2): v + s2[(n1, n2)] for ((n1, n2), v) in s1.items()}
lst = df.to_dict(orient="records")[1:-1]
summed_series = functools.reduce(_reduce_func, lst)
idx = df.columns.drop([("base", "T"), ("base", "F")])
avg_report = {
(n1, n2): (v / len(lst))
for ((n1, n2), v) in summed_series.items()
if n1 != "base"
}
return pd.DataFrame([avg_report], columns=idx)
def fmt_line_md(s):
return f"> {s} \n"
def create_dataset_dir(dir_name, update=False):
base_out_dir = Path(env.OUT_DIR_NAME)
if not base_out_dir.exists():
os.mkdir(base_out_dir)
dataset_dir = base_out_dir / dir_name
env.OUT_DIR = dataset_dir
if update:
if not dataset_dir.exists():
os.mkdir(dataset_dir)
else:
shutil.rmtree(dataset_dir, ignore_errors=True)
os.mkdir(dataset_dir)
plot_dir_path = dataset_dir / "plot"
env.PLOT_OUT_DIR = plot_dir_path
if not plot_dir_path.exists():
os.mkdir(plot_dir_path)
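An illustrative call to combine_dataframes with made-up frames, joining two per-method reports on a shared prevalence column:

import pandas as pd

from quacc.utils import combine_dataframes

a = pd.DataFrame({"prev": [0.1, 0.5], "ref": [0.02, 0.03]})
b = pd.DataFrame({"prev": [0.1, 0.5], "sld": [0.01, 0.02]})
merged = combine_dataframes([a, b], df_index=["prev"])
#    prev   ref   sld
# 0   0.1  0.02  0.01
# 1   0.5  0.03  0.02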


@ -1,40 +1,40 @@
## Roadmap

#### quantifier domain

- single multilabel quantifier
- vector of binary quantifiers

| quantifier       |                |                |
|:----------------:|:--------------:|:--------------:|
| true quantifier  | true positive  | false positive |
| false quantifier | false negative | true negative  |

#### dataset split

- train | test
  - classifier C is fit on train
  - quantifier Q is fit on cross validation of C over train (see the sketch below)
- train | validation | test
  - classifier C is fit on train
  - quantifier Q is fit on validation

#### classifier origin

- black box
- crystal box

#### test metrics

- f1_score
- K

#### models

- classifier
- quantifier
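A minimal sketch (illustrative) of the first split strategy above: the classifier C is fit on the training set, while out-of-fold posteriors stand in for its behaviour on unseen data. It assumes scikit-learn's cross_val_predict; `fit_split_crossval` is a hypothetical helper name.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

def fit_split_crossval(X_train, y_train):
    C = LogisticRegression().fit(X_train, y_train)
    # out-of-fold posteriors approximate C's behaviour on unseen data;
    # the quantifier Q would then be fit on (probas, y_train)
    probas = cross_val_predict(C, X_train, y_train, cv=5, method="predict_proba")
    return C, probas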

4202
test_mc.md

File diff suppressed because it is too large


@ -1,225 +1,225 @@
import pytest
from quacc.data import ExClassManager as ECM, ExtendedCollection
import numpy as np
import scipy.sparse as sp
class TestExClassManager:
@pytest.mark.parametrize(
"true_class,pred_class,result",
[
(0, 0, 0),
(0, 1, 1),
(1, 0, 2),
(1, 1, 3),
],
)
def test_get_ex(self, true_class, pred_class, result):
ncl = 2
assert ECM.get_ex(ncl, true_class, pred_class) == result
@pytest.mark.parametrize(
"ex_class,result",
[
(0, 0),
(1, 1),
(2, 0),
(3, 1),
],
)
def test_get_pred(self, ex_class, result):
ncl = 2
assert ECM.get_pred(ncl, ex_class) == result
@pytest.mark.parametrize(
"ex_class,result",
[
(0, 0),
(1, 0),
(2, 1),
(3, 1),
],
)
def test_get_true(self, ex_class, result):
ncl = 2
assert ECM.get_true(ncl, ex_class) == result
class TestExtendedCollection:
@pytest.mark.parametrize(
"instances,result",
[
(
np.asarray(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
),
[np.asarray([1, 3]), np.asarray([0, 2])],
),
(
sp.csr_matrix(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
),
[np.asarray([1, 3]), np.asarray([0, 2])],
),
(
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
[np.asarray([], dtype=int), np.asarray([0, 1])],
),
(
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
[np.asarray([], dtype=int), np.asarray([0, 1])],
),
(
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
[np.asarray([0, 1]), np.asarray([], dtype=int)],
),
(
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
[np.asarray([0, 1]), np.asarray([], dtype=int)],
),
],
)
def test__split_index_by_pred(self, instances, result):
ncl = 2
assert all(
np.array_equal(a, b)
for (a, b) in zip(
ExtendedCollection._split_index_by_pred(ncl, instances),
result,
)
)
@pytest.mark.parametrize(
"instances,s_inst,norms",
[
(
np.asarray(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
),
[
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
],
[0.5, 0.5],
),
(
sp.csr_matrix(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
),
[
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
],
[0.5, 0.5],
),
(
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
[
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([], dtype=int),
],
[1.0, 0.0],
),
(
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
[
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
sp.csr_matrix([], dtype=int),
],
[1.0, 0.0],
),
(
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
[
np.asarray([], dtype=int),
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
],
[0.0, 1.0],
),
(
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
[
sp.csr_matrix([], dtype=int),
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
],
[0.0, 1.0],
),
],
)
def test_split_inst_by_pred(self, instances, s_inst, norms):
ncl = 2
_s_inst, _norms = ExtendedCollection.split_inst_by_pred(ncl, instances)
if isinstance(s_inst, np.ndarray):
assert all(np.array_equal(a, b) for (a, b) in zip(_s_inst, s_inst))
if isinstance(s_inst, sp.csr_matrix):
assert all((a != b).nnz == 0 for (a, b) in zip(_s_inst, s_inst))
assert all(a == b for (a, b) in zip(_norms, norms))
@pytest.mark.parametrize(
"instances,labels,inst0,lbl0,inst1,lbl1",
[
(
np.asarray(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
),
np.asarray([3, 0, 1, 2]),
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([0, 1]),
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
np.asarray([1, 0]),
),
(
sp.csr_matrix(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
),
np.asarray([3, 0, 1, 2]),
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([0, 1]),
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
np.asarray([1, 0]),
),
(
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
np.asarray([3, 1]),
np.asarray([], dtype=int),
np.asarray([], dtype=int),
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
np.asarray([1, 0]),
),
(
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
np.asarray([3, 1]),
sp.csr_matrix(np.empty((0, 0), dtype=int)),
np.asarray([], dtype=int),
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
np.asarray([1, 0]),
),
(
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([0, 2]),
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([0, 1]),
np.asarray([], dtype=int),
np.asarray([], dtype=int),
),
(
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([0, 2]),
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([0, 1]),
sp.csr_matrix(np.empty((0, 0), dtype=int)),
np.asarray([], dtype=int),
),
],
)
def test_split_by_pred(self, instances, labels, inst0, lbl0, inst1, lbl1):
ec = ExtendedCollection(instances, labels, classes=range(0, 4))
[ec0, ec1] = ec.split_by_pred()
if isinstance(instances, np.ndarray):
assert np.array_equal(ec0.X, inst0)
assert np.array_equal(ec1.X, inst1)
if isinstance(instances, sp.csr_matrix):
assert (ec0.X != inst0).nnz == 0
assert (ec1.X != inst1).nnz == 0
assert np.array_equal(ec0.y, lbl0)
assert np.array_equal(ec1.y, lbl1)
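Illustrative: consistent with the ncl = 2 cases above, ExClassManager's index arithmetic appears to be get_ex(ncl, true, pred) == true * ncl + pred, with get_pred and get_true as the inverse modulus and integer division; a sketch checking that reading:

from quacc.data import ExClassManager as ECM

ncl = 2
assert all(
    ECM.get_ex(ncl, t, p) == t * ncl + p
    and ECM.get_pred(ncl, t * ncl + p) == p
    and ECM.get_true(ncl, t * ncl + p) == t
    for t in range(ncl)
    for p in range(ncl)
)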

View File

@ -1,3 +1,3 @@
class TestDataset:
pass

View File

@ -1,12 +1,12 @@
from sklearn.linear_model import LogisticRegression
from quacc.dataset import Dataset
from quacc.evaluation.baseline import kfcv


class TestBaseline:
    def test_kfcv(self):
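        # kfcv is expected to return a dict of k-fold cross-validation metrics
        # (including f1_score) for the fitted classifier.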
spambase = Dataset("spambase", n_prevalences=1).get_raw()
c_model = LogisticRegression()
c_model.fit(spambase.train.X, spambase.train.y)
assert "f1_score" in kfcv(c_model, spambase.validation)

View File

@ -1,66 +1,66 @@
import numpy as np
import pytest
import scipy.sparse as sp
from sklearn.linear_model import LogisticRegression
from quacc.method.base import BinaryQuantifierAccuracyEstimator


class TestBQAE:
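    # estimate(..., ext=True) is expected to weight each per-class
    # quantifier's prevalence estimate by the share of instances assigned to
    # that predicted class, yielding a prevalence vector over the 4 extended
    # classes.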
@pytest.mark.parametrize(
"instances,preds0,preds1,result",
[
(
np.asarray(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
),
np.asarray([0.3, 0.7]),
np.asarray([0.4, 0.6]),
np.asarray([0.15, 0.2, 0.35, 0.3]),
),
(
sp.csr_matrix(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
),
np.asarray([0.3, 0.7]),
np.asarray([0.4, 0.6]),
np.asarray([0.15, 0.2, 0.35, 0.3]),
),
(
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
np.asarray([0.3, 0.7]),
np.asarray([0.4, 0.6]),
np.asarray([0.0, 0.4, 0.0, 0.6]),
),
(
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
np.asarray([0.3, 0.7]),
np.asarray([0.4, 0.6]),
np.asarray([0.0, 0.4, 0.0, 0.6]),
),
(
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([0.3, 0.7]),
np.asarray([0.4, 0.6]),
np.asarray([0.3, 0.0, 0.7, 0.0]),
),
(
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([0.3, 0.7]),
np.asarray([0.4, 0.6]),
np.asarray([0.3, 0.0, 0.7, 0.0]),
),
],
)
def test_estimate_ndarray(self, mocker, instances, preds0, preds1, result):
estimator = BinaryQuantifierAccuracyEstimator(LogisticRegression())
estimator.n_classes = 4
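        # Stub the two underlying quantifiers; pytest-mock patches take effect
        # immediately and are undone automatically at test teardown.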
        mocker.patch.object(estimator.q_model_0, "quantify", return_value=preds0)
        mocker.patch.object(estimator.q_model_1, "quantify", return_value=preds1)
        assert np.array_equal(
            estimator.estimate(instances, ext=True),
            result,
        )

View File

@ -1,2 +1,2 @@
class TestMCAE:
pass