From 5f15b365fe2e559c932ab57f2d45ed7502cbf887 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Fri, 22 Oct 2021 19:03:15 +0200
Subject: [PATCH] result file format check, read, load, and evaluation with pandas

---
 LeQua2022/data.py               | 170 ++++++++++++++++++++++----
 LeQua2022/main_binary_vector.py |   2 -
 quapy/data/base.py              | 204 +++++++++++++++++---------------
 3 files changed, 256 insertions(+), 120 deletions(-)

diff --git a/LeQua2022/data.py b/LeQua2022/data.py
index 9a133c4..2d99120 100644
--- a/LeQua2022/data.py
+++ b/LeQua2022/data.py
@@ -1,6 +1,12 @@
+import os.path
+from typing import List, Tuple, Union
+
+import pandas as pd
+
 import quapy as qp
 import numpy as np
 import sklearn
+import re
 
 
 # def load_binary_raw_document(path):
@@ -40,40 +46,162 @@ def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None):
 
 
 class ResultSubmission:
 
-    def __init__(self, team_name, run_name, task_name):
-        assert isinstance(team_name, str) and team_name, \
-            f'invalid value encountered for team_name'
-        assert isinstance(run_name, str) and run_name, \
-            f'invalid value encountered for run_name'
-        assert isinstance(task_name, str) and task_name in {'T1A', 'T1B', 'T2A', 'T2B'}, \
-            f'invalid value encountered for task_name; valid values are T1A, T1B, T2A, and T2B'
-        self.team_name = team_name
-        self.run_name = run_name
-        self.task_name = task_name
-        self.data = {}
+    DEV_LEN = 1000
+    TEST_LEN = 5000
+    ERROR_TOL = 1E-3
+
+    def __init__(self, categories: List[str]):
+        if not isinstance(categories, list) or len(categories) < 2:
+            raise TypeError('wrong format for categories; a list with at least two category names (str) was expected')
+        self.categories = categories
+        self.df = pd.DataFrame(columns=['filename'] + list(categories))
+        self.inferred_type = None
 
     def add(self, sample_name:str, prevalence_values:np.ndarray):
-        # assert the result is a valid sample_name (not repeated)
-        pass
+        if not isinstance(sample_name, str):
+            raise TypeError(f'error: expected str for sample_name, found {type(sample_name)}')
+        if not isinstance(prevalence_values, np.ndarray):
+            raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}')
+
+        if self.inferred_type is None:
+            if sample_name.startswith('test'):
+                self.inferred_type = 'test'
+            elif sample_name.startswith('dev'):
+                self.inferred_type = 'dev'
+        else:
+            if not sample_name.startswith(self.inferred_type):
+                raise ValueError(f'error: sample "{sample_name}" is not a valid entry for type "{self.inferred_type}"')
+
+        if not re.match(r"(test|dev)_sample_\d+\.txt", sample_name):
+            raise ValueError(f'error: wrong format "{sample_name}"; right format is (test|dev)_sample_<number>.txt')
+        if sample_name in self.df.filename.values:
+            raise ValueError(f'error: prevalence values for "{sample_name}" already added')
+        if prevalence_values.ndim != 1 or prevalence_values.size != len(self.categories):
+            raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')
+        if (prevalence_values < 0).any() or (prevalence_values > 1).any():
+            raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_name}"')
+        if np.abs(prevalence_values.sum() - 1) > ResultSubmission.ERROR_TOL:
+            raise ValueError(f'error: prevalence values do not sum up to one for "{sample_name}" '
+                             f'(error tolerance {ResultSubmission.ERROR_TOL})')
+
+        new_entry = dict([('filename', sample_name)] + [(col_i, prev_i) for col_i, prev_i in zip(self.categories, prevalence_values)])
+        self.df = self.df.append(new_entry, ignore_index=True)
 
     def __len__(self):
-        return len(self.data)
+        return len(self.df)
 
     @classmethod
-    def load(cls, path:str)-> 'ResultSubmission':
-        pass
+    def load(cls, path: str) -> 'ResultSubmission':
+        df, inferred_type = ResultSubmission.check_file_format(path, return_inferred_type=True)
+        r = ResultSubmission(categories=df.columns.values.tolist()[1:])
+        r.inferred_type = inferred_type
+        r.df = df
+        return r
 
     def dump(self, path:str):
-        # assert all samples are covered (check for test and dev accordingly)
-        pass
+        ResultSubmission.check_dataframe_format(self.df)
+        self.df.to_csv(path)
 
     def get(self, sample_name:str):
-        pass
+        sel = self.df.loc[self.df['filename'] == sample_name]
+        if sel.empty:
+            return None
+        else:
+            return sel.loc[:, self.df.columns[1]:].values.flatten()
+
+    @classmethod
+    def check_file_format(cls, path, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
+        df = pd.read_csv(path, index_col=0)
+        return ResultSubmission.check_dataframe_format(df, path=path, return_inferred_type=return_inferred_type)
+
+    @classmethod
+    def check_dataframe_format(cls, df, path=None, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
+        hint_path = ''  # if given, show the data path in the error messages
+        if path is not None:
+            hint_path = f' in {path}'
+
+        if 'filename' not in df.columns or len(df.columns) < 3:
+            raise ValueError(f'wrong header{hint_path}; the format of the header should be ",filename,<cat_1>,...,<cat_n>"')
+
+        if df.empty:
+            raise ValueError(f'error{hint_path}: results file is empty')
+        elif len(df) == ResultSubmission.DEV_LEN:
+            inferred_type = 'dev'
+            expected_len = ResultSubmission.DEV_LEN
+        elif len(df) == ResultSubmission.TEST_LEN:
+            inferred_type = 'test'
+            expected_len = ResultSubmission.TEST_LEN
+        else:
+            raise ValueError(f'wrong number of prevalence values found{hint_path}; '
+                             f'expected {ResultSubmission.DEV_LEN} for development sets and '
+                             f'{ResultSubmission.TEST_LEN} for test sets; found {len(df)}')
+
+        set_names = frozenset(df.filename)
+        for i in range(expected_len):
+            if f'{inferred_type}_sample_{i}.txt' not in set_names:
+                raise ValueError(f'error{hint_path}: a file with {len(df)} entries is assumed to be of type '
+                                 f'"{inferred_type}" but entry {inferred_type}_sample_{i}.txt is missing '
+                                 f'(among perhaps many others)')
+
+        for category_name in df.columns[1:]:
+            if (df[category_name] < 0).any() or (df[category_name] > 1).any():
+                raise ValueError(f'error{hint_path}: column "{category_name}" contains values out of range [0,1]')
+
+        prevs = df.loc[:, df.columns[1]:].values
+        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ResultSubmission.ERROR_TOL
+        if round_errors.any():
+            raise ValueError(f'error: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '
+                             f'do not sum up to 1 (error tolerance {ResultSubmission.ERROR_TOL}), '
+                             f'probably due to some rounding errors.')
+
+        if return_inferred_type:
+            return df, inferred_type
+        else:
+            return df
+
+    def sort_categories(self):
+        self.df = self.df.reindex([self.df.columns[0]] + sorted(self.df.columns[1:]), axis=1)
+        self.categories = sorted(self.categories)
 
 
-def evaluate_submission(ground_truth_prevs: ResultSubmission, submission_prevs: ResultSubmission):
-    pass
+def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=1000, average=True):
+    if len(true_prevs) != len(predicted_prevs):
+        raise ValueError(f'size mismatch, ground truth has {len(true_prevs)} entries '
+                         f'while predictions contain {len(predicted_prevs)} entries')
+    true_prevs.sort_categories()
+    predicted_prevs.sort_categories()
+    if true_prevs.categories != predicted_prevs.categories:
+        raise ValueError('these result files are not comparable since the categories are different')
+    ae, rae = [], []
+    for sample_name in true_prevs.df.filename.values:
+        ae.append(qp.error.mae(true_prevs.get(sample_name), predicted_prevs.get(sample_name)))
+        rae.append(qp.error.mrae(true_prevs.get(sample_name), predicted_prevs.get(sample_name), eps=1./(2*sample_size)))
+    ae = np.asarray(ae)
+    rae = np.asarray(rae)
+    if average:
+        return ae.mean(), rae.mean()
+    else:
+        return ae, rae
+
+
+
+
+# r = ResultSubmission(['negative', 'positive'])
+# from tqdm import tqdm
+# for i in tqdm(range(1000), total=1000):
+#     r.add(f'dev_sample_{i}.txt', np.asarray([0.5, 0.5]))
+# r.dump('./path.csv')
+
+r = ResultSubmission.load('./data/T1A/public/dummy_submission.csv')
+t = ResultSubmission.load('./data/T1A/public/dummy_submission (copy).csv')
+# print(r.df)
+# print(r.get('dev_sample_10.txt'))
+print(evaluate_submission(r, t))
+
+# s = ResultSubmission.load('./data/T1A/public/dummy_submission.csv')
+#
+# print(s)
diff --git a/LeQua2022/main_binary_vector.py b/LeQua2022/main_binary_vector.py
index fab1bc2..2930091 100644
--- a/LeQua2022/main_binary_vector.py
+++ b/LeQua2022/main_binary_vector.py
@@ -31,8 +31,6 @@ dev_prev = pd.read_csv(os.path.join(path_binary_vector, 'public', 'dev_prevalenc
 
 print(dev_prev)
 
-
-
 scores = {}
 
 for quantifier in [CC]: #, ACC, PCC, PACC, EMQ, HDy]:
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 1a631d7..7a8df5c 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -1,3 +1,4 @@
+from abc import abstractmethod
 from typing import List, Union
 
 import numpy as np
@@ -8,6 +9,112 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 from quapy.functional import artificial_prevalence_sampling, strprev
 
 
+# class Sampling:
+#
+#     @abstractmethod
+#     def load(cls, path: str, loader_func: callable, classes=None): ...
+#
+#     @abstractmethod
+#     @property
+#     def __len__(self): ...
+#
+#     @abstractmethod
+#     @property
+#     def prevalence(self): ...
+#
+#     @abstractmethod
+#     @property
+#     def n_classes(self):
+#
+#     @property
+#     def binary(self):
+#         return self.n_classes == 2
+#
+#     def uniform_sampling_index(self, size):
+#         return np.random.choice(len(self), size, replace=False)
+#
+#     def uniform_sampling(self, size):
+#         unif_index = self.uniform_sampling_index(size)
+#         return self.sampling_from_index(unif_index)
+#
+#     def sampling(self, size, *prevs, shuffle=True):
+#         prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
+#         return self.sampling_from_index(prev_index)
+#
+#     def sampling_from_index(self, index):
+#         documents = self.instances[index]
+#         labels = self.labels[index]
+#         return LabelledCollection(documents, labels, classes_=self.classes_)
+#
+#     def split_stratified(self, train_prop=0.6, random_state=None):
+#         # with temp_seed(42):
+#         tr_docs, te_docs, tr_labels, te_labels = \
+#             train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
+#                              random_state=random_state)
+#         return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
+#
+#     def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
+#         dimensions = self.n_classes
+#         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
+#             yield self.sampling(sample_size, *prevs)
+#
+#     def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
+#         dimensions = self.n_classes
+#         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
+#             yield self.sampling_index(sample_size, *prevs)
+#
+#     def natural_sampling_generator(self, sample_size, repeats=100):
+#         for _ in range(repeats):
+#             yield self.uniform_sampling(sample_size)
+#
+#     def natural_sampling_index_generator(self, sample_size, repeats=100):
+#         for _ in range(repeats):
+#             yield self.uniform_sampling_index(sample_size)
+#
+#     def __add__(self, other):
+#         if other is None:
+#             return self
+#         elif issparse(self.instances) and issparse(other.instances):
+#             join_instances = vstack([self.instances, other.instances])
+#         elif isinstance(self.instances, list) and isinstance(other.instances, list):
+#             join_instances = self.instances + other.instances
+#         elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray):
+#             join_instances = np.concatenate([self.instances, other.instances])
+#         else:
+#             raise NotImplementedError('unsupported operation for collection types')
+#         labels = np.concatenate([self.labels, other.labels])
+#         return LabelledCollection(join_instances, labels)
+#
+#     @property
+#     def Xy(self):
+#         return self.instances, self.labels
+#
+#     def stats(self, show=True):
+#         ninstances = len(self)
+#         instance_type = type(self.instances[0])
+#         if instance_type == list:
+#             nfeats = len(self.instances[0])
+#         elif instance_type == np.ndarray or issparse(self.instances):
+#             nfeats = self.instances.shape[1]
+#         else:
+#             nfeats = '?'
+#         stats_ = {'instances': ninstances,
+#                   'type': instance_type,
+#                   'features': nfeats,
+#                   'classes': self.classes_,
+#                   'prevs': strprev(self.prevalence())}
+#         if show:
+#             print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
+#                   f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
+#         return stats_
+#
+#     def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
+#         kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
+#         for train_index, test_index in kf.split(*self.Xy):
+#             train = self.sampling_from_index(train_index)
+#             test = self.sampling_from_index(test_index)
+#             yield train, test
+
+
 class LabelledCollection:
     '''
     A LabelledCollection is a set of objects each with a label associated to it.
@@ -176,104 +283,7 @@ class LabelledCollection:
         yield train, test
 
 
-class MultilingualLabelledCollection:
-    def __init__(self, langs:List[str], labelledCollections:List[LabelledCollection]):
-        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
-        assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), 'unexpected type for labelledCollections'
-        assert all(labelledCollections[0].classes_ == lc_i.classes_ for lc_i in labelledCollections[1:]), \
-            'inconsistent classes found for some labelled collections'
-        self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
-        self.classes_=labelledCollections[0].classes_
-
-    @classmethod
-    def fromLangDict(cls, lang_labelledCollection:dict):
-        return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))
-
-    def langs(self):
-        return list(sorted(self.llc.keys()))
-
-    def __getitem__(self, lang)->LabelledCollection:
-        return self.llc[lang]
-
-    @classmethod
-    def load(cls, path: str, loader_func: callable):
-        return MultilingualLabelledCollection(*loader_func(path))
-
-    def __len__(self):
-        return sum(map(len, self.llc.values()))
-
-    def prevalence(self):
-        prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
-        return prev / prev.sum()
-
-    def language_prevalence(self):
-        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
-        return lang_count / lang_count.sum()
-
-    def counts(self):
-        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)
-
-    @property
-    def n_classes(self):
-        return len(self.classes_)
-
-    @property
-    def binary(self):
-        return self.n_classes == 2
-
-    def __check_langs(self, l_dict:dict):
-        assert len(l_dict)==len(self.langs()), 'wrong number of languages'
-        assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'
-
-    def __check_sizes(self, l_sizes: Union[int,dict]):
-        assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes'
-        if isinstance(l_sizes, int):
-            return {l:l_sizes for l in self.langs()}
-        self.__check_langs(l_sizes)
-        return l_sizes
-
-    def sampling_index(self, l_sizes: Union[int,dict], *prevs, shuffle=True):
-        l_sizes = self.__check_sizes(l_sizes)
-        return {l:lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
-
-    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
-        l_sizes = self.__check_sizes(l_sizes)
-        return {l: lc.uniform_sampling_index(l_sizes[l]) for l,lc in self.llc.items()}
-
-    def uniform_sampling(self, l_sizes: Union[int, dict]):
-        l_sizes = self.__check_sizes(l_sizes)
-        return MultilingualLabelledCollection.fromLangDict(
-            {l: lc.uniform_sampling(l_sizes[l]) for l,lc in self.llc.items()}
-        )
-
-    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
-        l_sizes = self.__check_sizes(l_sizes)
-        return MultilingualLabelledCollection.fromLangDict(
-            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
-        )
-
-    def sampling_from_index(self, l_index:dict):
-        self.__check_langs(l_index)
-        return MultilingualLabelledCollection.fromLangDict(
-            {l: lc.sampling_from_index(l_index[l]) for l,lc in self.llc.items()}
-        )
-
-    def split_stratified(self, train_prop=0.6, random_state=None):
-        train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
-        return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)
-
-    def asLabelledCollection(self, return_langs=False):
-        lXy_list = [([l]*len(lc),*lc.Xy) for l, lc in self.llc.items()]  # a list with (lang_i, Xi, yi)
-        ls,Xs,ys = list(zip(*lXy_list))
-        ls = np.concatenate(ls)
-        vertstack = vstack if issparse(Xs[0]) else np.vstack
-        Xs = vertstack(Xs)
-        ys = np.concatenate(ys)
-        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
-        # return lc, ls if return_langs else lc
-#
-#
-#
 class Dataset:
 
     def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
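
Reviewer note (not part of the diff): a minimal usage sketch of the API this patch introduces. File paths and category names below are hypothetical, and the sketch assumes the pandas version current at the time of the commit (DataFrame.append was later removed in pandas 2.0). Since check_file_format() reads the CSV with index_col=0, a well-formed binary-task submission file would look like

    ,filename,negative,positive
    0,dev_sample_0.txt,0.7,0.3
    1,dev_sample_1.txt,0.45,0.55
    ...

and a round trip through ResultSubmission would be

    import numpy as np

    # build a dummy submission covering all DEV_LEN development samples
    r = ResultSubmission(categories=['negative', 'positive'])
    for i in range(ResultSubmission.DEV_LEN):
        r.add(f'dev_sample_{i}.txt', np.asarray([0.7, 0.3]))
    r.dump('my_dev_submission.csv')  # dump() re-checks the format before writing

    # compare against a (hypothetical) ground-truth file in the same format;
    # evaluate_submission returns the averaged MAE and MRAE over all samples
    truth = ResultSubmission.load('dev_prevalences.csv')
    pred = ResultSubmission.load('my_dev_submission.csv')
    mae, mrae = evaluate_submission(truth, pred, sample_size=1000)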