Compare commits
21 Commits
|
@ -1,4 +1,4 @@
|
|||
Change Log 0.1.8
|
||||
Change Log 0.1.8g
|
||||
----------------
|
||||
|
||||
- Added Kernel Density Estimation methods (KDEyML, KDEyCS, KDEyHD) as proposed in the paper:
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,88 @@
|
|||
import zipfile
|
||||
|
||||
import pandas as pd
|
||||
import os
|
||||
from os.path import join
|
||||
|
||||
import quapy as qp
|
||||
from scripts.data import load_vector_documents
|
||||
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.protocol import AbstractProtocol
|
||||
from quapy.util import download_file_if_not_exists
|
||||
|
||||
LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4']
|
||||
|
||||
LEQUA2024_ZENODO = 'https://zenodo.org/record/11091067' # v2, no ground truth for test yet
|
||||
|
||||
|
||||
class LabelledCollectionsFromDir(AbstractProtocol):
|
||||
|
||||
def __init__(self, path_dir:str, ground_truth_path:str, load_fn):
|
||||
self.path_dir = path_dir
|
||||
self.load_fn = load_fn
|
||||
self.true_prevs = pd.read_csv(ground_truth_path, index_col=0)
|
||||
|
||||
def __call__(self):
|
||||
for id, prevalence in self.true_prevs.iterrows():
|
||||
collection_path = os.path.join(self.path_dir, f'{id}.txt')
|
||||
lc = LabelledCollection.load(path=collection_path, loader_func=self.load_fn)
|
||||
yield lc
|
||||
|
||||
|
||||
def fetch_lequa2024(task, data_home=None, merge_T3=False):
|
||||
|
||||
from quapy.data._lequa2022 import SamplesFromDir
|
||||
|
||||
assert task in LEQUA2024_TASKS, \
|
||||
f'Unknown task {task}. Valid ones are {LEQUA2024_TASKS}'
|
||||
|
||||
if data_home is None:
|
||||
data_home = qp.util.get_quapy_home()
|
||||
|
||||
lequa_dir = data_home
|
||||
|
||||
URL_TRAINDEV=f'{LEQUA2024_ZENODO}/files/{task}.train_dev.zip'
|
||||
URL_TEST=f'{LEQUA2024_ZENODO}/files/{task}.test.zip'
|
||||
# URL_TEST_PREV=f'{LEQUA2024_ZENODO}/files/{task}.test_prevalences.zip'
|
||||
|
||||
lequa_dir = join(data_home, 'lequa2024')
|
||||
os.makedirs(lequa_dir, exist_ok=True)
|
||||
|
||||
def download_unzip_and_remove(unzipped_path, url):
|
||||
tmp_path = join(lequa_dir, task + '_tmp.zip')
|
||||
download_file_if_not_exists(url, tmp_path)
|
||||
with zipfile.ZipFile(tmp_path) as file:
|
||||
file.extractall(unzipped_path)
|
||||
os.remove(tmp_path)
|
||||
|
||||
if not os.path.exists(join(lequa_dir, task)):
|
||||
download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
|
||||
download_unzip_and_remove(lequa_dir, URL_TEST)
|
||||
# download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
|
||||
|
||||
load_fn = load_vector_documents
|
||||
|
||||
val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
|
||||
val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
|
||||
val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
|
||||
|
||||
# test_samples_path = join(lequa_dir, task, 'public', 'test_samples')
|
||||
# test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
|
||||
# test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)
|
||||
test_gen = None
|
||||
|
||||
if task != 'T3':
|
||||
tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
|
||||
train = LabelledCollection.load(tr_path, loader_func=load_fn)
|
||||
return train, val_gen, test_gen
|
||||
else:
|
||||
training_samples_path = join(lequa_dir, task, 'public', 'training_samples')
|
||||
training_true_prev_path = join(lequa_dir, task, 'public', 'training_prevalences.txt')
|
||||
train_gen = LabelledCollectionsFromDir(training_samples_path, training_true_prev_path, load_fn=load_fn)
|
||||
if merge_T3:
|
||||
train = LabelledCollection.join(*list(train_gen()))
|
||||
return train, val_gen, test_gen
|
||||
else:
|
||||
return train_gen, val_gen, test_gen
|
||||
|
|
@ -0,0 +1,115 @@
|
|||
import argparse
|
||||
import pickle
|
||||
import os
|
||||
import sys
|
||||
from os.path import join
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression as LR
|
||||
|
||||
from scripts.constants import SAMPLE_SIZE
|
||||
from scripts.evaluate import normalized_match_distance
|
||||
from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024, LEQUA2024_ZENODO
|
||||
from quapy.method.aggregative import *
|
||||
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
|
||||
import quapy.functional as F
|
||||
|
||||
|
||||
# LeQua official baselines (under development!)
|
||||
# =================================================================================
|
||||
|
||||
BINARY_TASKS = ['T1', 'T4']
|
||||
|
||||
|
||||
def new_cls():
|
||||
return LR(n_jobs=-1, max_iter=3000)
|
||||
|
||||
|
||||
lr_params = {
|
||||
'C': np.logspace(-4, 4, 9),
|
||||
'class_weight': [None, 'balanced']
|
||||
}
|
||||
|
||||
def wrap_params(cls_params:dict, prefix:str):
|
||||
return {'__'.join([prefix, key]): val for key, val in cls_params.items()}
|
||||
|
||||
|
||||
|
||||
def baselines():
|
||||
|
||||
q_params = wrap_params(lr_params, 'classifier')
|
||||
kde_params = {**q_params, 'bandwidth': np.linspace(0.01, 0.20, 20)}
|
||||
dm_params = {**q_params, 'nbins': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]}
|
||||
|
||||
yield CC(new_cls()), "CC", q_params
|
||||
yield ACC(new_cls()), "ACC", q_params
|
||||
yield PCC(new_cls()), "PCC", q_params
|
||||
yield PACC(new_cls()), "PACC", q_params
|
||||
yield SLD(new_cls()), "SLD", q_params
|
||||
#yield KDEyML(new_cls()), "KDEy-ML", kde_params
|
||||
#yield KDEyHD(new_cls()), "KDEy-HD", kde_params
|
||||
# yield KDEyCS(new_cls()), "KDEy-CS", kde_params
|
||||
#yield DMy(new_cls()), "DMy", dm_params
|
||||
|
||||
|
||||
def main(args):
|
||||
|
||||
models_path = qp.util.create_if_not_exist(join('./models', args.task))
|
||||
hyperparams_path = qp.util.create_if_not_exist(join('./hyperparams', args.task))
|
||||
|
||||
os.makedirs(models_path, exist_ok=True)
|
||||
os.makedirs(hyperparams_path, exist_ok=True)
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE[args.task]
|
||||
|
||||
train, gen_val, gen_test = fetch_lequa2024(task=args.task, data_home=args.datadir, merge_T3=True)
|
||||
|
||||
# gen_test is None, since the true prevalence vectors for the test samples will be released
|
||||
# only after the competition ends
|
||||
|
||||
print(f'number of classes: {len(train.classes_)}')
|
||||
print(f'number of training documents: {len(train)}')
|
||||
print(f'training prevalence: {F.strprev(train.prevalence())}')
|
||||
print(f'training matrix shape: {train.instances.shape}')
|
||||
|
||||
for quantifier, q_name, param_grid in baselines():
|
||||
|
||||
model_path = os.path.join(models_path, q_name + '.pkl')
|
||||
modelparams_path = os.path.join(hyperparams_path, q_name + '.pkl')
|
||||
if os.path.exists(model_path):
|
||||
print(f'a pickle for {q_name} exists already in {model_path}; skipping!')
|
||||
continue
|
||||
|
||||
print(f'starting model fitting for {q_name}')
|
||||
|
||||
if param_grid is not None:
|
||||
optimizer = qp.model_selection.GridSearchQ(
|
||||
quantifier,
|
||||
param_grid,
|
||||
protocol=gen_val,
|
||||
error=normalized_match_distance if args.task=='T3' else qp.error.mrae,
|
||||
refit=False,
|
||||
verbose=True,
|
||||
n_jobs=-1
|
||||
).fit(train)
|
||||
print(f'{q_name} got MRAE={optimizer.best_score_:.5f} (hyper-params: {optimizer.best_params_})')
|
||||
quantifier = optimizer.best_model()
|
||||
else:
|
||||
quantifier.fit(train)
|
||||
|
||||
print(f'saving model in {model_path}')
|
||||
pickle.dump(quantifier, open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
|
||||
pickle.dump(quantifier.get_params(), open(modelparams_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
parser = argparse.ArgumentParser(description='LeQua2024 baselines')
|
||||
parser.add_argument('task', metavar='TASK', type=str, choices=LEQUA2024_TASKS,
|
||||
help=f'Code of the task; available ones are {LEQUA2024_TASKS}')
|
||||
parser.add_argument('datadir', metavar='DATA-PATH', type=str,
|
||||
help='Path of the directory containing LeQua 2024 data (default is ./data)',
|
||||
default='./data')
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
|
@ -0,0 +1,60 @@
|
|||
import argparse
|
||||
import quapy as qp
|
||||
from scripts.data import ResultSubmission
|
||||
import os
|
||||
import pickle
|
||||
from tqdm import tqdm
|
||||
from scripts.data import gen_load_samples
|
||||
from glob import glob
|
||||
from scripts import constants
|
||||
from regressor import KDEyRegressor, RegressionToSimplex
|
||||
|
||||
"""
|
||||
LeQua2024 prediction script
|
||||
"""
|
||||
|
||||
def main(args):
|
||||
|
||||
if not args.force and os.path.exists(args.output):
|
||||
print(f'prediction file {args.output} already exists! set --force to override')
|
||||
return
|
||||
|
||||
# check the number of samples
|
||||
nsamples = len(glob(os.path.join(args.samples, f'*.txt')))
|
||||
if nsamples not in {constants.DEV_SAMPLES, constants.TEST_SAMPLES}:
|
||||
print(f'Warning: The number of samples (.txt) in {args.samples} '
|
||||
f'does neither coincide with the expected number of '
|
||||
f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
|
||||
f'test samples ({constants.TEST_SAMPLES}).')
|
||||
|
||||
# load pickled model
|
||||
model = pickle.load(open(args.model, 'rb'))
|
||||
|
||||
# predictions
|
||||
predictions = ResultSubmission()
|
||||
for sampleid, sample in tqdm(gen_load_samples(args.samples, return_id=True), desc='predicting', total=nsamples):
|
||||
predictions.add(sampleid, model.quantify(sample))
|
||||
|
||||
# saving
|
||||
qp.util.create_parent_dir(args.output)
|
||||
predictions.dump(args.output)
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
parser = argparse.ArgumentParser(description='LeQua2024 prediction script')
|
||||
parser.add_argument('model', metavar='MODEL-PATH', type=str,
|
||||
help='Path of saved model')
|
||||
parser.add_argument('samples', metavar='SAMPLES-PATH', type=str,
|
||||
help='Path to the directory containing the samples')
|
||||
parser.add_argument('output', metavar='PREDICTIONS-PATH', type=str,
|
||||
help='Path where to store the predictions file')
|
||||
parser.add_argument('--force', action='store_true',
|
||||
help='Overrides prediction file if exists')
|
||||
args = parser.parse_args()
|
||||
|
||||
if not os.path.exists(args.samples):
|
||||
raise FileNotFoundError(f'path {args.samples} does not exist')
|
||||
if not os.path.isdir(args.samples):
|
||||
raise ValueError(f'path {args.samples} is not a valid directory')
|
||||
|
||||
main(args)
|
|
@ -0,0 +1,133 @@
|
|||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.multioutput import MultiOutputRegressor
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.svm import SVR
|
||||
|
||||
from LeQua2024._lequa2024 import fetch_lequa2024
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.protocol import AbstractProtocol
|
||||
from quapy.method.base import BaseQuantifier
|
||||
import quapy.functional as F
|
||||
from tqdm import tqdm
|
||||
from scripts.evaluate import normalized_match_distance, match_distance
|
||||
|
||||
|
||||
def projection_simplex_sort(unnormalized_arr) -> np.ndarray:
|
||||
"""Projects a point onto the probability simplex.
|
||||
[This code is taken from the devel branch, that will correspond to the future QuaPy 0.1.9]
|
||||
|
||||
The code is adapted from Mathieu Blondel's BSD-licensed
|
||||
`implementation <https://gist.github.com/mblondel/6f3b7aaad90606b98f71>`_
|
||||
(see function `projection_simplex_sort` in their repo) which is accompanying the paper
|
||||
|
||||
Mathieu Blondel, Akinori Fujino, and Naonori Ueda.
|
||||
Large-scale Multiclass Support Vector Machine Training via Euclidean Projection onto the Simplex,
|
||||
ICPR 2014, `URL <http://www.mblondel.org/publications/mblondel-icpr2014.pdf>`_
|
||||
|
||||
:param `unnormalized_arr`: point in n-dimensional space, shape `(n,)`
|
||||
:return: projection of `unnormalized_arr` onto the (n-1)-dimensional probability simplex, shape `(n,)`
|
||||
"""
|
||||
unnormalized_arr = np.asarray(unnormalized_arr)
|
||||
n = len(unnormalized_arr)
|
||||
u = np.sort(unnormalized_arr)[::-1]
|
||||
cssv = np.cumsum(u) - 1.0
|
||||
ind = np.arange(1, n + 1)
|
||||
cond = u - cssv / ind > 0
|
||||
rho = ind[cond][-1]
|
||||
theta = cssv[cond][-1] / float(rho)
|
||||
return np.maximum(unnormalized_arr - theta, 0)
|
||||
|
||||
|
||||
class RegressionToSimplex(BaseEstimator):
|
||||
"""
|
||||
A very simple regressor of probability distributions.
|
||||
Internally, this class works by invoking an SVR regressor multioutput
|
||||
followed by a mapping onto the probability simplex.
|
||||
|
||||
:param C: regularziation parameter for SVR
|
||||
"""
|
||||
|
||||
def __init__(self, C=1):
|
||||
self.C = C
|
||||
|
||||
def fit(self, X, y):
|
||||
"""
|
||||
Learns the correction
|
||||
|
||||
:param X: array-like of shape `(n_instances, n_classes)` with uncorrected prevalence vectors
|
||||
:param y: array-like of shape `(n_instances, n_classes)` with true prevalence vectors
|
||||
:return: self
|
||||
"""
|
||||
self.reg = MultiOutputRegressor(SVR(C=self.C), n_jobs=-1)
|
||||
self.reg.fit(X, y)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
"""
|
||||
Corrects the a vector of prevalence values
|
||||
|
||||
:param X: array-like of shape `(n_classes,)` with one vector of uncorrected prevalence values
|
||||
:return: array-like of shape `(n_classes,)` with one vector of corrected prevalence values
|
||||
"""
|
||||
y_ = self.reg.predict(X)
|
||||
y_ = np.asarray([projection_simplex_sort(y_i) for y_i in y_])
|
||||
return y_
|
||||
|
||||
|
||||
class KDEyRegressor(BaseQuantifier):
|
||||
"""
|
||||
This class implements a regressor-based correction on top of a quantifier.
|
||||
The quantifier is taken to be KDEy-ML, which is considered to be already trained (this
|
||||
method simply loads a pickled object).
|
||||
The method then optimizes a regressor that corrects prevalence vectors using the
|
||||
validation samples as training data.
|
||||
The regressor is based on a multioutput SVR and relies on a post-processing to guarantee
|
||||
that the output lies on the probability simplex (see also RegressionToSimplex)
|
||||
"""
|
||||
|
||||
def __init__(self, kde_path, Cs=np.logspace(-3,3,7)):
|
||||
self.kde_path = kde_path
|
||||
self.Cs = Cs
|
||||
|
||||
def fit(self, val_data: AbstractProtocol):
|
||||
print(f'loading kde from {self.kde_path}')
|
||||
self.kdey = pickle.load(open(self.kde_path, 'rb'))
|
||||
|
||||
print('representing val data with kde')
|
||||
pbar = tqdm(val_data(), total=val_data.total())
|
||||
Xs, Ys = [], []
|
||||
for sample, prev in pbar:
|
||||
prev_hat = self.kdey.quantify(sample)
|
||||
Xs.append(prev_hat)
|
||||
Ys.append(prev)
|
||||
|
||||
Xs = np.asarray(Xs)
|
||||
Ys = np.asarray(Ys)
|
||||
|
||||
def scorer(estimator, X, y):
|
||||
y_hat = estimator.predict(X)
|
||||
md = normalized_match_distance(y, y_hat)
|
||||
return (-md)
|
||||
|
||||
grid = {'C': self.Cs}
|
||||
optim = GridSearchCV(
|
||||
RegressionToSimplex(), param_grid=grid, scoring=scorer, verbose=0, cv=10, n_jobs=64
|
||||
).fit(Xs, Ys)
|
||||
self.regressor = optim.best_estimator_
|
||||
return self
|
||||
|
||||
def quantify(self, instances):
|
||||
prev_hat = self.kdey.quantify(instances)
|
||||
return self.regressor.predict([prev_hat])[0]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
train, gen_val, _ = fetch_lequa2024(task='T3', data_home='./data', merge_T3=True)
|
||||
kdey_r = KDEyRegressor('./models/T3/KDEy-ML.pkl')
|
||||
kdey_r.fit(gen_val)
|
||||
pickle.dump(kdey_r, open('./models/T3/KDEyRegressor.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
#!/bin/bash
|
||||
set -x
|
||||
|
||||
# T1: binary (n=2)
|
||||
# T2: multiclass (n=28)
|
||||
# T3: ordinal (n=5)
|
||||
# T4: covariante shift (n=2)
|
||||
|
||||
# preparing the environment: downloads the official LeQua 2024 scripts (only once and for all)
|
||||
SCRIPTS_URL=https://github.com/HLT-ISTI/LeQua2024_scripts/archive/refs/heads/main.zip
|
||||
|
||||
# download and unzip the LeQua 2024 scripts
|
||||
|
||||
if [ ! -d "./scripts" ]; then
|
||||
echo "Downloading $SCRIPTS_URL into ./scripts"
|
||||
wget -qO scripts.zip "$SCRIPTS_URL"
|
||||
unzip -q scripts.zip
|
||||
mv "LeQua2024_scripts-main" "scripts"
|
||||
rm scripts.zip
|
||||
echo "[Done]"
|
||||
else
|
||||
echo "LeQua 2024 scripts already exists"
|
||||
fi
|
||||
|
||||
|
||||
for task in T1 T2 T3 T4 ; do
|
||||
|
||||
PYTHONPATH=.:scripts/:.. python3 baselines.py $task data/
|
||||
|
||||
TEST_SAMPLES=data/lequa2024/$task/public/test_samples
|
||||
|
||||
for pickledmodel in models/$task/*.pkl ; do
|
||||
model=$(basename "$pickledmodel" .pkl)
|
||||
PREDICTIONS=predictions/$model/task_"${task: -1}".csv
|
||||
PYTHONPATH=.:scripts/:.. python3 predict.py models/$task/$model.pkl $TEST_SAMPLES $PREDICTIONS
|
||||
done
|
||||
|
||||
done
|
||||
|
||||
echo "generating submission files for codalab in folder ./submission_files"
|
||||
|
||||
mkdir -p submission_files
|
||||
|
||||
for modelname in predictions/* ; do
|
||||
modelname=$(basename "$modelname")
|
||||
echo "modelname is $modelname"
|
||||
for taskname in predictions/$modelname/*.csv ; do
|
||||
taskname=$(basename "$taskname")
|
||||
submission_name=submission_files/"$modelname"_"$taskname".zip
|
||||
rm -f $submission_name
|
||||
echo "zipping results for $modelname and task $taskname"
|
||||
zip -j $submission_name predictions/$modelname/$taskname
|
||||
done
|
||||
done
|
||||
|
||||
echo "[Done]"
|
|
@ -0,0 +1,120 @@
|
|||
import os
|
||||
from os.path import join
|
||||
import pandas as pd
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
os.chdir('/home/moreo/QuaPy/LeQua2024/util_scripts')
|
||||
print(os.getcwd())
|
||||
|
||||
qp.environ['SAMPLE_SIZE']=250
|
||||
|
||||
true_prevs_path = '../TruePrevalences/T4.test_prevalences/T4/public/test_prevalences.txt'
|
||||
domain_prevs_path = '../T4_domain_prevalence/test_domain_prevalences.txt'
|
||||
folder = '../Results_CODALAB_2024/extracted/TASK_4'
|
||||
|
||||
def load_result_file(path):
|
||||
df = pd.read_csv(path, index_col=0)
|
||||
id = df.index.to_numpy()
|
||||
prevs = df.values
|
||||
return id, prevs
|
||||
|
||||
method_files = [
|
||||
#'ACC.csv',
|
||||
#'CC.csv',
|
||||
#'DistMatching-y.csv',
|
||||
#'KDEy.csv',
|
||||
#'PACC.csv',
|
||||
'PCC.csv',
|
||||
#'SLD.csv',
|
||||
#'TeamCUFE.csv',
|
||||
#'TeamGMNet.csv',
|
||||
'tobiaslotz.csv'
|
||||
]
|
||||
|
||||
method_names_nice={
|
||||
'DistMatching-y': 'DM',
|
||||
'TeamGMNet': 'UniOviedo(Team1)',
|
||||
'tobiaslotz': 'Lamarr'
|
||||
}
|
||||
|
||||
desired_order=[
|
||||
'Lamarr',
|
||||
'SLD',
|
||||
'DM',
|
||||
'KDEy',
|
||||
'UniOviedo(Team1)'
|
||||
]
|
||||
desired_order=[
|
||||
'PCC', 'Lamarr'
|
||||
]
|
||||
|
||||
# load the true values (sentiment prevalence, domain prevalence)
|
||||
true_id, true_prevs = load_result_file(true_prevs_path)
|
||||
dom_id, dom_prevs = load_result_file(domain_prevs_path)
|
||||
assert (true_id == dom_id).all(), 'unmatched files'
|
||||
|
||||
# define the loss for evaluation
|
||||
error_name = 'RAE'
|
||||
error_log = False
|
||||
|
||||
if error_name == 'RAE':
|
||||
err_function_ = qp.error.rae
|
||||
elif error_name == 'AE':
|
||||
err_function_ = qp.error.ae
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
if error_log:
|
||||
error_name = f'log({error_name})'
|
||||
err_function = lambda x,y: np.log(err_function_(x,y))
|
||||
else:
|
||||
err_function = err_function_
|
||||
|
||||
# load the participant and baseline results
|
||||
errors = {}
|
||||
for method_file in method_files:
|
||||
method_name = method_file.replace('.csv', '')
|
||||
id, method_prevs = load_result_file(join(folder, method_file))
|
||||
print(method_file)
|
||||
assert (true_id == id).all(), f'unmatched files for {method_file}'
|
||||
method_error = err_function(true_prevs, method_prevs)
|
||||
method_name = method_names_nice.get(method_name, method_name)
|
||||
errors[method_name] = method_error
|
||||
|
||||
dom_A_prevs = dom_prevs[:,0]
|
||||
|
||||
n_bins = 5
|
||||
bins = np.linspace(dom_A_prevs.min(), dom_A_prevs.max(), n_bins + 1)
|
||||
|
||||
# Crear un DataFrame para los datos
|
||||
df = pd.DataFrame({'dom_A_prevs': dom_A_prevs})
|
||||
for method, err in errors.items():
|
||||
df[method] = err
|
||||
|
||||
# Asignar cada valor de dom_A_prevs a un bin
|
||||
df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)
|
||||
|
||||
# Convertir el DataFrame a formato largo
|
||||
df_long = df.melt(id_vars=['dom_A_prevs', 'bin'], value_vars=errors.keys(), var_name='Método', value_name='Error')
|
||||
|
||||
# Crear etiquetas de los bins para el eje X
|
||||
bin_labels = [f"[{bins[i]:.3f}-{bins[i + 1]:.3f}" + (']' if i == n_bins-1 else ')') for i in range(n_bins)]
|
||||
df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))
|
||||
|
||||
# Crear el gráfico de boxplot en Seaborn
|
||||
plt.figure(figsize=(14, 8))
|
||||
sns.boxplot(x='bin', y='Error', hue='Método', data=df_long, palette='Set2', showfliers=False, hue_order=desired_order)
|
||||
|
||||
# Configurar etiquetas del eje X con los rangos de los bins
|
||||
plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
|
||||
plt.xlabel("Prevalence of Books")
|
||||
plt.ylabel(error_name)
|
||||
#plt.title("Boxplots de Errores por Método dentro de Bins de dom_A_prevs")
|
||||
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
|
||||
plt.tight_layout()
|
||||
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
|
||||
#plt.show()
|
||||
plt.savefig(f'./t4_{error_name}_pcc.png')
|
|
@ -0,0 +1,113 @@
|
|||
import os
|
||||
from os.path import join
|
||||
import pandas as pd
|
||||
|
||||
from LeQua2024.scripts.data import load_vector_documents
|
||||
from LeQua2024.scripts.constants import SAMPLE_SIZE
|
||||
from quapy.data.base import LabelledCollection
|
||||
import sys
|
||||
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
||||
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
|
||||
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), './')))
|
||||
#from LeQua2024.scripts import constants
|
||||
#from LeQua2024._lequa2024 import fetch_lequa2024
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
# import seaborn as sns
|
||||
from pathlib import Path
|
||||
import glob
|
||||
from commons import *
|
||||
|
||||
|
||||
for TASK in [1,2,4]:
|
||||
qp.environ['SAMPLE_SIZE']=SAMPLE_SIZE[f'T{TASK}']
|
||||
|
||||
true_prevs_path = f'../TruePrevalences/T{TASK}.test_prevalences/T{TASK}/public/test_prevalences.txt'
|
||||
folder = F'../Results_CODALAB_2024/extracted/TASK_{TASK}'
|
||||
|
||||
method_files = glob.glob(f"{folder}/*.csv")
|
||||
|
||||
desired_order = desired_order_dict[TASK]
|
||||
|
||||
# load the true values (sentiment prevalence, domain prevalence)
|
||||
true_id, true_prevs = load_result_file(true_prevs_path)
|
||||
|
||||
# define the loss for evaluation
|
||||
error_name = 'RAE'
|
||||
error_log = False
|
||||
|
||||
if error_name == 'RAE':
|
||||
err_function_ = qp.error.rae
|
||||
elif error_name == 'AE':
|
||||
err_function_ = qp.error.ae
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
if error_log:
|
||||
error_name = f'log({error_name})'
|
||||
err_function = lambda x,y: np.log(err_function_(x,y))
|
||||
else:
|
||||
err_function = err_function_
|
||||
|
||||
|
||||
#train_prevalence = fetch_lequa2024(task=f'T{TASK}', data_home='./data')
|
||||
train = LabelledCollection.load(f'../data/lequa2024/T{TASK}/public/training_data.txt', loader_func=load_vector_documents)
|
||||
train_prev = train.prevalence()
|
||||
#train_prev = np.tile(train_prev, (len(true_id),1))
|
||||
|
||||
from quapy.plot import error_by_drift, binary_diagonal
|
||||
|
||||
# load the participant and baseline results
|
||||
method_names, estim_prevs = [], []
|
||||
for method_file in method_files:
|
||||
method_name = Path(method_file).name.replace('.csv', '')
|
||||
# if method_name in exclude_methods:
|
||||
# continue
|
||||
id, method_prevs = load_result_file(join(folder, method_name+'.csv'))
|
||||
assert (true_id == id).all(), f'unmatched files for {method_file}'
|
||||
method_name = method_names_nice.get(method_name, method_name)
|
||||
if method_name not in desired_order:
|
||||
print(f'method {method_name} unknown')
|
||||
raise ValueError()
|
||||
method_names.append(method_name)
|
||||
estim_prevs.append(method_prevs)
|
||||
|
||||
plt.rcParams['figure.figsize'] = [14, 6]
|
||||
plt.rcParams['figure.dpi'] = 200
|
||||
plt.rcParams['font.size'] = 15
|
||||
|
||||
true_prevs = [true_prevs]*len(method_names)
|
||||
savepath = f'./t{TASK}_diagonal.png'
|
||||
if TASK in [1,4]:
|
||||
binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True,
|
||||
train_prev=train.prevalence(), savepath=savepath, method_order=desired_order)
|
||||
|
||||
|
||||
box_to_ancor={
|
||||
1: (0.88,0.1),
|
||||
2: (0.9,0.15),
|
||||
4: (0.9, 0.15),
|
||||
}
|
||||
|
||||
tr_prevs =[train.prevalence()]*len(method_names)
|
||||
savepath = f'./t{TASK}_{error_name}_pps.png'
|
||||
binary=TASK in [1,4]
|
||||
if binary:
|
||||
print(f'{TASK=} has positive prevalence = {train.prevalence()[1]}')
|
||||
error_by_drift(method_names,
|
||||
true_prevs,
|
||||
estim_prevs,
|
||||
tr_prevs,
|
||||
title=None,
|
||||
y_error_name='rae',
|
||||
x_error_name='bias_binary' if binary else 'ae',
|
||||
x_axis_title=f'PPS between training set and test sample (in terms of bias)' if binary else None,
|
||||
show_std=False,
|
||||
n_bins=25,
|
||||
logscale=True if binary else False,
|
||||
show_density=True,
|
||||
method_order=desired_order,
|
||||
vlines=list(train.prevalence()) if binary else None,
|
||||
bbox_to_anchor=box_to_ancor[TASK],
|
||||
savepath=savepath)
|
|
@ -44,6 +44,28 @@ def acce(y_true, y_pred):
|
|||
"""
|
||||
return 1. - (y_true == y_pred).mean()
|
||||
|
||||
def bias_binary(prevs, prevs_hat):
|
||||
"""
|
||||
Computes the (positive) bias in a binary problem. The bias is simply the difference between the
|
||||
predicted positive value and the true positive value, so that a positive such value indicates the
|
||||
prediction has positive bias (i.e., it tends to overestimate) the true value, and negative otherwise.
|
||||
:math:`bias(p,\\hat{p})=\\hat{p}_1-p_1`,
|
||||
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
|
||||
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
|
||||
prevalence values
|
||||
:return: binary bias
|
||||
"""
|
||||
assert prevs.shape[-1] == 2 and prevs.shape[-1] == 2, f'bias_binary can only be applied to binary problems'
|
||||
return prevs_hat[...,1]-prevs[...,1]
|
||||
|
||||
def mean_bias_binary(prevs, prevs_hat):
|
||||
"""
|
||||
Computes the mean of the (positive) bias in a binary problem.
|
||||
:param prevs: array-like of shape `(n_classes,)` with the true prevalence values
|
||||
:param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
|
||||
:return: mean binary bias
|
||||
"""
|
||||
return np.mean(bias_binary(prevs, prevs_hat))
|
||||
|
||||
def mae(prevs, prevs_hat):
|
||||
"""Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs.
|
||||
|
@ -308,8 +330,8 @@ def __check_eps(eps=None):
|
|||
|
||||
|
||||
CLASSIFICATION_ERROR = {f1e, acce}
|
||||
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld}
|
||||
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld}
|
||||
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, mean_bias_binary}
|
||||
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, bias_binary}
|
||||
QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae}
|
||||
CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
|
||||
QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
|
||||
|
|
168
quapy/plot.py
168
quapy/plot.py
|
@ -1,6 +1,6 @@
|
|||
from collections import defaultdict
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.cm import get_cmap
|
||||
from matplotlib.pyplot import get_cmap
|
||||
import numpy as np
|
||||
from matplotlib import cm
|
||||
from scipy.stats import ttest_ind_from_stats
|
||||
|
@ -9,6 +9,7 @@ import math
|
|||
|
||||
import quapy as qp
|
||||
|
||||
|
||||
plt.rcParams['figure.figsize'] = [10, 6]
|
||||
plt.rcParams['figure.dpi'] = 200
|
||||
plt.rcParams['font.size'] = 18
|
||||
|
@ -212,13 +213,19 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
|||
|
||||
|
||||
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||
n_bins=20, error_name='ae', show_std=False,
|
||||
n_bins=20,
|
||||
y_error_name='ae',
|
||||
x_error_name='ae',
|
||||
show_std=False,
|
||||
show_density=True,
|
||||
show_legend=True,
|
||||
y_axis_title=None,
|
||||
x_axis_title=None,
|
||||
logscale=False,
|
||||
title=f'Quantification error as a function of distribution shift',
|
||||
title=None,
|
||||
vlines=None,
|
||||
method_order=None,
|
||||
bbox_to_anchor=(1,1),
|
||||
savepath=None):
|
||||
"""
|
||||
Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
|
||||
|
@ -234,7 +241,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
for each experiment
|
||||
:param tr_prevs: training prevalence of each experiment
|
||||
:param n_bins: number of bins in which the y-axis is to be divided (default is 20)
|
||||
:param error_name: a string representing the name of an error function (as defined in `quapy.error`, default is "ae")
|
||||
:param y_error_name: a string representing the name of an error function (as defined in `quapy.error`,
|
||||
default is "ae") to be used along the y-axis
|
||||
:param x_error_name: a string representing the name of an error function (as defined in `quapy.error`,
|
||||
default is "ae") to be used along the x-axis
|
||||
:param show_std: whether or not to show standard deviations as color bands (default is False)
|
||||
:param show_density: whether or not to display the distribution of experiments for each bin (default is True)
|
||||
:param show_density: whether or not to display the legend of the chart (default is True)
|
||||
|
@ -250,31 +260,40 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
fig, ax = plt.subplots()
|
||||
ax.grid()
|
||||
|
||||
x_error = qp.error.ae
|
||||
y_error = getattr(qp.error, error_name)
|
||||
if isinstance(x_error_name, str):
|
||||
x_error = qp.error.from_name(x_error_name)
|
||||
if isinstance(y_error_name, str):
|
||||
y_error = qp.error.from_name(y_error_name)
|
||||
|
||||
# get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
|
||||
# order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
|
||||
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
|
||||
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
||||
|
||||
min_x = np.min([np.min(m_data['x']) for m_data in data.values()])
|
||||
max_x = np.max([np.max(m_data['x']) for m_data in data.values()])
|
||||
min_y = np.min([np.min(m_data['y']) for m_data in data.values()])
|
||||
max_y = np.max([np.max(m_data['y']) for m_data in data.values()])
|
||||
print(f'[{min_x}, {max_x}]<-')
|
||||
|
||||
if method_order is None:
|
||||
method_order = method_names
|
||||
|
||||
_set_colors(ax, n_methods=len(method_order))
|
||||
|
||||
bins = np.linspace(0, 1, n_bins+1)
|
||||
binwidth = 1 / n_bins
|
||||
min_x, max_x, min_y, max_y = None, None, None, None
|
||||
bins = np.linspace(min_x-1E-8, max_x+1E-8, n_bins+1)
|
||||
print('bins', bins)
|
||||
binwidth = (max_x-min_x) / n_bins
|
||||
|
||||
npoints = np.zeros(len(bins), dtype=float)
|
||||
for method in method_order:
|
||||
tr_test_drifts = data[method]['x']
|
||||
method_drifts = data[method]['y']
|
||||
if logscale:
|
||||
ax.set_yscale("log")
|
||||
ax.yaxis.set_major_formatter(ScalarFormatter())
|
||||
ax.yaxis.get_major_formatter().set_scientific(False)
|
||||
ax.minorticks_off()
|
||||
# ax.yaxis.set_major_formatter(ScalarFormatter())
|
||||
# ax.yaxis.get_major_formatter().set_scientific(False)
|
||||
# ax.minorticks_off()
|
||||
|
||||
inds = np.digitize(tr_test_drifts, bins, right=True)
|
||||
|
||||
|
@ -282,7 +301,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
for p,ind in enumerate(range(len(bins))):
|
||||
selected = inds==ind
|
||||
if selected.sum() > 0:
|
||||
xs.append(ind*binwidth-binwidth/2)
|
||||
xs.append(min_x + (ind*binwidth-binwidth/2))
|
||||
ys.append(np.mean(method_drifts[selected]))
|
||||
ystds.append(np.std(method_drifts[selected]))
|
||||
npoints[p] += len(method_drifts[selected])
|
||||
|
@ -291,13 +310,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
ys = np.asarray(ys)
|
||||
ystds = np.asarray(ystds)
|
||||
|
||||
min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max()
|
||||
min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
|
||||
max_x = max_x_method if max_x is None or max_x_method > max_x else max_x
|
||||
max_y = max_y_method if max_y is None or max_y_method > max_y else max_y
|
||||
min_y = min_y_method if min_y is None or min_y_method < min_y else min_y
|
||||
max_y = max_y_method if max_y is None or max_y_method > max_y else max_y
|
||||
|
||||
ax.errorbar(xs, ys, fmt='-', marker='o', color='w', markersize=8, linewidth=4, zorder=1)
|
||||
ax.errorbar(xs, ys, fmt='-', marker='o', label=method, markersize=6, linewidth=2, zorder=2)
|
||||
|
||||
|
@ -307,32 +319,41 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
if show_density:
|
||||
ax2 = ax.twinx()
|
||||
densities = npoints/np.sum(npoints)
|
||||
ax2.bar([ind * binwidth-binwidth/2 for ind in range(len(bins))],
|
||||
densities, alpha=0.15, color='g', width=binwidth, label='density')
|
||||
ax2.bar([min_x + (ind * binwidth-binwidth/2) for ind in range(len(bins))],
|
||||
densities, alpha=0.15, color='g', width=binwidth, label='density')
|
||||
ax2.set_ylim(0,max(densities))
|
||||
ax2.spines['right'].set_color('g')
|
||||
ax2.tick_params(axis='y', colors='g')
|
||||
|
||||
ax.set(xlabel=f'Distribution shift between training set and test sample',
|
||||
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
|
||||
title=title)
|
||||
|
||||
y_axis_err_name = y_error_name.upper()
|
||||
if logscale:
|
||||
y_axis_err_name = f'log({y_axis_err_name})'
|
||||
if y_axis_title is None:
|
||||
y_axis_title=f'{y_axis_err_name} (true distribution, predicted distribution)'
|
||||
if x_axis_title is None:
|
||||
x_axis_title = f'PPS between training set and test sample (in terms of {x_error_name.upper()})'
|
||||
ax.set(xlabel=x_axis_title, ylabel=y_axis_title, title=title)
|
||||
box = ax.get_position()
|
||||
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
|
||||
if vlines:
|
||||
for vline in vlines:
|
||||
ax.axvline(vline, 0, 1, linestyle='--', color='k')
|
||||
|
||||
ax.set_xlim(min_x, max_x)
|
||||
margin = (max_x-min_x)*0.02
|
||||
ax.set_xlim(min_x-margin, max_x+margin)
|
||||
if logscale:
|
||||
#nice scale for the logaritmic axis
|
||||
ax.set_ylim(0,10 ** math.ceil(math.log10(max_y)))
|
||||
|
||||
|
||||
|
||||
|
||||
if show_legend:
|
||||
# fig.legend(loc='lower center',
|
||||
# bbox_to_anchor=(1, 0.5),
|
||||
# ncol=(len(method_names)+1)//2)
|
||||
fig.legend(loc='lower center',
|
||||
bbox_to_anchor=(1, 0.5),
|
||||
ncol=(len(method_names)+1)//2)
|
||||
|
||||
bbox_to_anchor=bbox_to_anchor,
|
||||
ncol=1)
|
||||
|
||||
_save_or_show(savepath)
|
||||
|
||||
|
||||
|
@ -547,6 +568,7 @@ def _save_or_show(savepath):
|
|||
plt.savefig(savepath, bbox_inches='tight')
|
||||
else:
|
||||
plt.show()
|
||||
plt.close()
|
||||
|
||||
|
||||
def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order):
|
||||
|
@ -567,4 +589,84 @@ def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error
|
|||
if method not in method_order:
|
||||
method_order.append(method)
|
||||
|
||||
return data
|
||||
return data
|
||||
|
||||
|
||||
def bin_signal(x, y, n_bins, binning_type='isometric'):
|
||||
"""
|
||||
Bins the input data `x` and computes statistical summaries of `y` within each bin.
|
||||
|
||||
:param x: The independent variable to be binned. Must be a 1D array of numerical values.
|
||||
:type x: array-like
|
||||
:param y: The dependent variable corresponding to `x`. Must be the same length as `x`.
|
||||
:type y: array-like
|
||||
:param n_bins: The number of bins to create.
|
||||
:type n_bins: int
|
||||
:param binning_type: The method to use for binning:
|
||||
- `'isometric'`: Creates bins with equal width (isometric binning).
|
||||
- `'isotonic'`: Creates bins containing an equal number of instances (isotonic binning).
|
||||
Defaults to `'isometric'`.
|
||||
:type binning_type: str
|
||||
|
||||
:return: A tuple containing:
|
||||
- `bin_means` (numpy.ndarray): The mean of `y` values in each bin (`np.nan` for empty bins).
|
||||
- `bin_stds` (numpy.ndarray): The standard deviation (sample std) of `y` values in each bin (`np.nan` for empty bins).
|
||||
- `bin_centers` (numpy.ndarray): The center points of each bin.
|
||||
- `bin_lengths` (numpy.ndarray): The length (width) of each bin.
|
||||
- `bin_counts` (numpy.ndarray): The number of elements in each bin.
|
||||
:rtype: tuple (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)
|
||||
|
||||
:raises ValueError: If `binning_type` is not one of `'isometric'` or `'isotonic'`.
|
||||
|
||||
.. note::
|
||||
- For isometric binning, bins are equally spaced along the range of `x`.
|
||||
- For isotonic binning, bins are constructed to contain approximately equal numbers of elements, based on sorted `x`.
|
||||
- If a bin is empty (no elements fall within its range), its mean and standard deviation will be `np.nan`.
|
||||
|
||||
:example:
|
||||
|
||||
>>> import numpy as np
|
||||
>>> x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||
>>> y = np.array([10, 20, 15, 25, 30, 35, 45, 50, 40])
|
||||
>>> n_bins = 3
|
||||
>>> bin_signal(x, y, n_bins, binning_type='isometric')
|
||||
(array([15., 30., 45.]),
|
||||
array([5., 5., 5.]),
|
||||
array([2.33333333, 5. , 7.66666667]),
|
||||
array([2.66666667, 2.66666667, 2.66666667]),
|
||||
array([3, 3, 3]))
|
||||
"""
|
||||
x = np.asarray(x)
|
||||
y = np.asarray(y)
|
||||
|
||||
if binning_type == 'isometric':
|
||||
# all bins are equally-sized
|
||||
bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
|
||||
elif binning_type == 'isotonic':
|
||||
# all bins contain the same number of instances
|
||||
sorted_x = np.sort(x)
|
||||
bin_edges = np.interp(np.linspace(0, len(x), n_bins + 1), np.arange(len(x)), sorted_x)
|
||||
else:
|
||||
raise ValueError("valid binning types include 'isometric' and 'isotonic'")
|
||||
|
||||
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
|
||||
bin_lengths = bin_edges[1:] - bin_edges[:-1]
|
||||
|
||||
bin_means = []
|
||||
bin_stds = []
|
||||
bin_counts = []
|
||||
|
||||
for start, end in zip(bin_edges[:-1], bin_edges[1:]):
|
||||
mask = (x >= start) & (x < end) if end != bin_edges[-1] else (x >= start) & (x <= end)
|
||||
count = mask.sum()
|
||||
if count > 0:
|
||||
y_in_bin = y[mask]
|
||||
bin_means.append(np.mean(y_in_bin))
|
||||
bin_stds.append(np.std(y_in_bin, ddof=1))
|
||||
else:
|
||||
bin_means.append(np.nan)
|
||||
bin_stds.append(np.nan)
|
||||
bin_counts.append(count)
|
||||
|
||||
return np.array(bin_means), np.array(bin_stds), np.array(bin_centers), np.array(bin_lengths), np.array(bin_counts)
|
||||
|
||||
|
|
Loading…
Reference in New Issue