evaluation script and format checker added

Alejandro Moreo Fernandez 2021-10-25 13:37:22 +02:00
parent 5f15b365fe
commit 9a08125e7e
7 changed files with 166 additions and 85 deletions


@@ -6,3 +6,4 @@
6. I am reading the samples in order, and there is no need to. It would be better to have a generic function that reads all the examples
   and, in any case, generates an output with the same name as the input file
7. Make ResultSubmission class abstract, and create 4 instances, thus forcing the field task_name to be set correctly (see the sketch after this list)
8. I am not convinced that the samples (in the case where there is no ground truth) are read back in random order
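As a rough illustration of item 7, this is a minimal sketch of what an abstract ResultSubmission with task-specific subclasses could look like; the class names below are assumptions made for illustration, not code that exists in this commit:

from abc import ABC

class AbstractResultSubmission(ABC):
    task_name = None  # subclasses must fix the task they refer to

    def __init__(self, categories):
        if self.task_name is None:
            raise NotImplementedError('instantiate one of the task-specific subclasses')
        self.categories = categories

class T1AResultSubmission(AbstractResultSubmission):
    task_name = 'T1A'

class T1BResultSubmission(AbstractResultSubmission):
    task_name = 'T1B'

class T2AResultSubmission(AbstractResultSubmission):
    task_name = 'T2A'

class T2BResultSubmission(AbstractResultSubmission):
    task_name = 'T2B'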

LeQua2022/constants.py Normal file

@@ -0,0 +1,6 @@
DEV_SAMPLES = 1000
TEST_SAMPLES = 5000
T1A_SAMPLE_SIZE = 250
ERROR_TOL=1E-3


@@ -7,6 +7,9 @@ import quapy as qp
 import numpy as np
 import sklearn
 import re
+from glob import glob
+import constants

 # def load_binary_raw_document(path):
@@ -20,14 +23,38 @@ import re
 # def load_multiclass_raw_document(path):
 #     return qp.data.from_text(path, verbose=0, class2int=False)


+def load_category_map(path):
+    cat2code = {}
+    with open(path, 'rt') as fin:
+        for line in fin:  # one mapping per line: '<category> <code>'
+            category, code = line.split()
+            cat2code[category] = int(code)
+    return cat2code


 def load_binary_vectors(path, nF=None):
     return sklearn.datasets.load_svmlight_file(path, n_features=nF)
-def gen_load_samples_T1A(path_dir:str, ground_truth_path:str = None):
-    # for ... : yield
-    pass
+def __gen_load_samples_with_groudtruth(path_dir:str, ground_truth_path:str, load_fn, **load_kwargs):
+    true_prevs = ResultSubmission.load(ground_truth_path)
+    for filename, prevalence in true_prevs.iterrows():
+        sample, _ = load_fn(os.path.join(path_dir, filename), **load_kwargs)
+        yield filename, sample, prevalence
+
+
+def __gen_load_samples_without_groudtruth(path_dir:str, load_fn, **load_kwargs):
+    for filepath in glob(os.path.join(path_dir, '*_sample_*.txt')):
+        sample, _ = load_fn(filepath, **load_kwargs)
+        yield os.path.basename(filepath), sample
+
+
+def gen_load_samples_T1A(path_dir:str, nF:int, ground_truth_path:str = None):
+    if ground_truth_path is None:
+        for filename, sample in __gen_load_samples_without_groudtruth(path_dir, load_binary_vectors, nF=nF):
+            yield filename, sample
+    else:
+        for filename, sample, prevalence in __gen_load_samples_with_groudtruth(path_dir, ground_truth_path, load_binary_vectors, nF=nF):
+            yield filename, sample, prevalence
 def gen_load_samples_T1B(path_dir:str, ground_truth_path:str = None):
@@ -46,9 +73,6 @@ def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None):

 class ResultSubmission:
-    DEV_LEN = 1000
-    TEST_LEN = 5000
-    ERROR_TOL = 1E-3

     def __init__(self, categories: List[str]):
         if not isinstance(categories, list) or len(categories) < 2:
@@ -80,9 +104,9 @@ class ResultSubmission:
             raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')
         if (prevalence_values<0).any() or (prevalence_values>1).any():
             raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_name}"')
-        if np.abs(prevalence_values.sum()-1) > ResultSubmission.ERROR_TOL:
+        if np.abs(prevalence_values.sum()-1) > constants.ERROR_TOL:
             raise ValueError(f'error: prevalence values do not sum up to one for "{sample_name}"'
-                             f'(error tolerance {ResultSubmission.ERROR_TOL})')
+                             f'(error tolerance {constants.ERROR_TOL})')
         new_entry = dict([('filename',sample_name)]+[(col_i,prev_i) for col_i, prev_i in zip(self.categories, prevalence_values)])
         self.df = self.df.append(new_entry, ignore_index=True)
@@ -93,7 +117,7 @@ class ResultSubmission:
     @classmethod
     def load(cls, path: str) -> 'ResultSubmission':
         df, inferred_type = ResultSubmission.check_file_format(path, return_inferred_type=True)
-        r = ResultSubmission(categories=df.columns.values.tolist())
+        r = ResultSubmission(categories=df.columns.values[1:].tolist())
         r.inferred_type = inferred_type
         r.df = df
         return r
@@ -102,13 +126,19 @@ class ResultSubmission:
         ResultSubmission.check_dataframe_format(self.df)
         self.df.to_csv(path)

-    def get(self, sample_name:str):
+    def prevalence(self, sample_name:str):
         sel = self.df.loc[self.df['filename'] == sample_name]
         if sel.empty:
             return None
         else:
             return sel.loc[:,self.df.columns[1]:].values.flatten()
+
+    def iterrows(self):
+        for index, row in self.df.iterrows():
+            filename = row.filename
+            prevalence = row[self.df.columns[1]:].values.flatten()
+            yield filename, prevalence

     @classmethod
     def check_file_format(cls, path, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
         df = pd.read_csv(path, index_col=0)
@@ -116,7 +146,7 @@ class ResultSubmission:
     @classmethod
     def check_dataframe_format(cls, df, path=None, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
-        hint_path = ''  # if given, show the data path in the error messages
+        hint_path = ''  # if given, show the data path in the error message
         if path is not None:
             hint_path = f' in {path}'
@@ -125,33 +155,33 @@ class ResultSubmission:
         if df.empty:
             raise ValueError(f'error{hint_path}: results file is empty')
-        elif len(df) == ResultSubmission.DEV_LEN:
+        elif len(df) == constants.DEV_SAMPLES:
             inferred_type = 'dev'
-            expected_len = ResultSubmission.DEV_LEN
-        elif len(df) == ResultSubmission.TEST_LEN:
+            expected_len = constants.DEV_SAMPLES
+        elif len(df) == constants.TEST_SAMPLES:
             inferred_type = 'test'
-            expected_len = ResultSubmission.TEST_LEN
+            expected_len = constants.TEST_SAMPLES
         else:
             raise ValueError(f'wrong number of prevalence values found{hint_path}; '
-                             f'expected {ResultSubmission.DEV_LEN} for development sets and '
-                             f'{ResultSubmission.TEST_LEN} for test sets; found {len(df)}')
+                             f'expected {constants.DEV_SAMPLES} for development sets and '
+                             f'{constants.TEST_SAMPLES} for test sets; found {len(df)}')

         set_names = frozenset(df.filename)
         for i in range(expected_len):
             if f'{inferred_type}_sample_{i}.txt' not in set_names:
-                raise ValueError(f'{hint_path} a file with {len(df)} entries is assumed to be of type '
+                raise ValueError(f'error{hint_path} a file with {len(df)} entries is assumed to be of type '
                                  f'"{inferred_type}" but entry {inferred_type}_sample_{i}.txt is missing '
                                  f'(among perhaps many others)')

         for category_name in df.columns[1:]:
             if (df[category_name] < 0).any() or (df[category_name] > 1).any():
-                raise ValueError(f'{hint_path} column "{category_name}" contains values out of range [0,1]')
+                raise ValueError(f'error{hint_path} column "{category_name}" contains values out of range [0,1]')

         prevs = df.loc[:, df.columns[1]:].values
-        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ResultSubmission.ERROR_TOL
+        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL
         if round_errors.any():
             raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '
-                             f'do not sum up to 1 (error tolerance {ResultSubmission.ERROR_TOL}), '
+                             f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), '
                              f'probably due to some rounding errors.')

         if return_inferred_type:
@@ -163,20 +193,31 @@ class ResultSubmission:
         self.df = self.df.reindex([self.df.columns[0]] + sorted(self.df.columns[1:]), axis=1)
         self.categories = sorted(self.categories)

+    def filenames(self):
+        return self.df.filename.values
+

-def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=1000, average=True):
+def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True):
+    if sample_size is None:
+        if qp.environ['SAMPLE_SIZE'] is None:
+            raise ValueError('Relative Absolute Error cannot be computed: '
+                             'neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
+        else:
+            sample_size = qp.environ['SAMPLE_SIZE']
+
     if len(true_prevs) != len(predicted_prevs):
-        raise ValueError(f'size mismatch, groun truth has {len(true_prevs)} entries '
-                         f'while predictions contain {len(predicted_prevs)} entries')
+        raise ValueError(f'size mismatch, the ground truth file has {len(true_prevs)} entries '
+                         f'while the file of predictions contains {len(predicted_prevs)} entries')

     true_prevs.sort_categories()
     predicted_prevs.sort_categories()

     if true_prevs.categories != predicted_prevs.categories:
-        raise ValueError(f'these result files are not comparable since the categories are different')
+        raise ValueError(f'these result files are not comparable since the categories are different: '
+                         f'true={true_prevs.categories} vs. predictions={predicted_prevs.categories}')

     ae, rae = [], []
-    for sample_name in true_prevs.df.filename.values:
-        ae.append(qp.error.mae(true_prevs.get(sample_name), predicted_prevs.get(sample_name)))
-        rae.append(qp.error.mrae(true_prevs.get(sample_name), predicted_prevs.get(sample_name), eps=sample_size))
+    for sample_name, true_prevalence in true_prevs.iterrows():
+        pred_prevalence = predicted_prevs.prevalence(sample_name)
+        ae.append(qp.error.ae(true_prevalence, pred_prevalence))
+        rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size)))

     ae = np.asarray(ae)
     rae = np.asarray(rae)

     if average:
@@ -187,21 +228,6 @@ def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSub
-    # r = ResultSubmission(['negative', 'positive'])
-    # from tqdm import tqdm
-    # for i in tqdm(range(1000), total=1000):
-    #     r.add(f'dev_sample_{i}.txt', np.asarray([0.5, 0.5]))
-    # r.dump('./path.csv')
-    r = ResultSubmission.load('./data/T1A/public/dummy_submission.csv')
-    t = ResultSubmission.load('./data/T1A/public/dummy_submission (copy).csv')
-    # print(r.df)
-    # print(r.get('dev_sample_10.txt'))
-    print(evaluate_submission(r, t))
-    # s = ResultSubmission.load('./data/T1A/public/dummy_submission.csv')
-    #
-    # print(s)
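Two notes on the new data.py helpers above. First, evaluate_submission now smooths the relative absolute error with eps = 1/(2*sample_size), the usual additive-smoothing convention for RAE, taking the sample size either from its argument or from qp.environ['SAMPLE_SIZE']. Second, a minimal usage sketch of the sample generator together with ResultSubmission; the paths and the number of features are placeholders, and the uniform prediction is only for illustration:

import numpy as np
from data import ResultSubmission, gen_load_samples_T1A

nF = 10000  # placeholder: number of features of the training matrix
submission = ResultSubmission(categories=['negative', 'positive'])
# with a ground-truth file, the generator also yields the true prevalence of each sample
for filename, sample, true_prev in gen_load_samples_T1A('path/to/dev_vectors', nF,
                                                        ground_truth_path='path/to/dev_prevalences.csv'):
    submission.add(filename, np.asarray([0.5, 0.5]))  # dummy uniform prediction
submission.dump('my_predictions.csv')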

LeQua2022/evaluation.py Normal file

@@ -0,0 +1,41 @@
import argparse
import quapy as qp
from data import ResultSubmission, evaluate_submission
import constants
import os

"""
LeQua2022 Official evaluation script
"""

def main(args):

    if args.task in {'T1A'}:
        qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE

    true_prev = ResultSubmission.load(args.true_prevalences)
    pred_prev = ResultSubmission.load(args.pred_prevalences)

    mae, mrae = evaluate_submission(true_prev, pred_prev)
    print(f'MAE: {mae:.4f}')
    print(f'MRAE: {mrae:.4f}')

    if args.output is not None:
        outdir = os.path.dirname(args.output)
        if outdir:
            os.makedirs(outdir, exist_ok=True)
        with open(args.output, 'wt') as foo:
            foo.write(f'MAE: {mae:.4f}\n')
            foo.write(f'MRAE: {mrae:.4f}\n')


if __name__=='__main__':
    parser = argparse.ArgumentParser(description='LeQua2022 official evaluation script')
    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
                        help='Task name (T1A, T1B, T2A, T2B)')
    parser.add_argument('true_prevalences', metavar='TRUE-PREV-PATH', type=str,
                        help='Path of ground truth prevalence values file (.csv)')
    parser.add_argument('pred_prevalences', metavar='PRED-PREV-PATH', type=str,
                        help='Path of predicted prevalence values file (.csv)')
    parser.add_argument('--output', metavar='SCORES-PATH', type=str, default=None,
                        help='Path where to store the evaluation scores')
    args = parser.parse_args()

    main(args)
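Based on the argparse definition above, an invocation of the evaluation script could look as follows (run from the LeQua2022 folder; the csv paths are illustrative, not files shipped with this commit):

python3 evaluation.py T1A data/T1A/public/dev_prevalences.csv predictions/T1A/CC.csv --output scores/T1A/CC.txt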


@@ -0,0 +1,27 @@
import argparse
import quapy as qp
from data import ResultSubmission, evaluate_submission
import constants
import os

"""
LeQua2022 Official format-checker script
"""

def main(args):

    try:
        ResultSubmission.check_file_format(args.prevalence_file)
    except Exception as e:
        print(e)
        print('Format check: not passed')
    else:
        print('Format check: passed')


if __name__=='__main__':
    parser = argparse.ArgumentParser(description='LeQua2022 official format-checker script')
    parser.add_argument('prevalence_file', metavar='PREV-PATH', type=str,
                        help='Path of the file containing prevalence values to check')
    args = parser.parse_args()

    main(args)
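Analogously, and assuming this new file is saved as format_checker.py (the file name is not shown in this view), the format check could be run on a predictions file as:

python3 format_checker.py predictions/T1A/CC.csv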


@@ -9,64 +9,44 @@ import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import *
 import quapy.functional as F
-from data import load_binary_vectors
+from data import *
 import os
+import constants

-path_binary_vector = './data/T1A'
-result_path = os.path.join('results', 'T1A')  # binary - vector
-os.makedirs(result_path, exist_ok=True)
+predictions_path = os.path.join('predictions', 'T1A')  # binary - vector
+os.makedirs(predictions_path, exist_ok=True)

-train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt')
-
-train = LabelledCollection.load(train_file, load_binary_vectors)
+pathT1A = './data/T1A/public'
+T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
+T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
+T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')
+
+train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
 nF = train.instances.shape[1]

+qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
+
 print(f'number of classes: {len(train.classes_)}')
 print(f'number of training documents: {len(train)}')
 print(f'training prevalence: {F.strprev(train.prevalence())}')
 print(f'training matrix shape: {train.instances.shape}')

-dev_prev = pd.read_csv(os.path.join(path_binary_vector, 'public', 'dev_prevalences.csv'), index_col=0)
-print(dev_prev)
+true_prevalence = ResultSubmission.load(T1A_devprevalence_path)

-scores = {}
-for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
+for quantifier in [CC]:  # , ACC, PCC, PACC, EMQ, HDy]:

     classifier = CalibratedClassifierCV(LogisticRegression())
     model = quantifier(classifier).fit(train)
     quantifier_name = model.__class__.__name__

-    scores[quantifier_name]={}
-    for sample_set, sample_size in [('dev', 1000)]:
-        ae_errors, rae_errors = [], []
-        for i, row in tqdm(dev_prev.iterrows(), total=len(dev_prev), desc=f'testing {quantifier_name} in {sample_set}'):
-            filename = row['filename']
-            prev_true = row[1:].values
-            sample_path = os.path.join(path_binary_vector, 'public', f'{sample_set}_vectors', filename)
-            sample, _ = load_binary_vectors(sample_path, nF)
-            qp.environ['SAMPLE_SIZE'] = sample.shape[0]
-            prev_estim = model.quantify(sample)
-            # prev_true = sample.prevalence()
-            ae_errors.append(qp.error.mae(prev_true, prev_estim))
-            rae_errors.append(qp.error.mrae(prev_true, prev_estim))
-        ae_errors = np.asarray(ae_errors)
-        rae_errors = np.asarray(rae_errors)
-        mae = ae_errors.mean()
-        mrae = rae_errors.mean()
-        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}
-        pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
-        pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
-        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
-        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')
-
-for model in scores:
-    for sample_set in ['validation']:  # , 'test']:
-        print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')
+    predictions = ResultSubmission(categories=['negative', 'positive'])
+    for samplename, sample in tqdm(gen_load_samples_T1A(T1A_devvectors_path, nF),
+                                   desc=quantifier_name, total=len(true_prevalence)):
+        predictions.add(samplename, model.quantify(sample))
+
+    predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv'))
+    mae, mrae = evaluate_submission(true_prevalence, predictions)
+    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')

 """
 test:
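For clarity, the predictions file dumped by ResultSubmission.dump (and read back by check_file_format with index_col=0) is a plain CSV with a pandas index column, the sample filename, and one column per category; check_file_format additionally requires 1000 rows (development) or 5000 rows (test) covering every {dev|test}_sample_{i}.txt, with each row summing to 1 within the error tolerance. The values below are made-up examples:

,filename,negative,positive
0,dev_sample_0.txt,0.53,0.47
1,dev_sample_1.txt,0.21,0.79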


@@ -149,7 +149,7 @@ class IndexTransformer:
     def index(self, documents):
         vocab = self.vocabulary_.copy()
-        return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
+        return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]

     def fit_transform(self, X, n_jobs=-1):
         return self.fit(X).transform(X, n_jobs=n_jobs)