forked from moreo/QuaPy
adding features for cross-lingual
This commit is contained in:
parent
986e61620c
commit
ce908573e7
TODO.txt
@@ -35,6 +35,8 @@ GridSearchQ is not truly parallelized. It only parallelizes on the predictions.
 In the context of a quantifier (e.g., QuaNet or CC), the parameters of the learner should be prefixed with "estimator__";
 in QuaNet this is resolved with a __check_params_colision, but this should be improved. It might be cumbersome to
 impose the "estimator__" prefix for, e.g., quantifiers like CC though... This should be changed everywhere...
+QuaNet needs refactoring. The base quantifiers ACC and PACC receive val_data with instances already transformed. This
+issue is due to a bad design.
 
 Improvements:
 ==========================================
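
For reference, the "estimator__" prefix mirrors scikit-learn's nested-parameter convention. A minimal sketch of what a parameter grid would look like under this convention (the parameter names and the wrapped learner are illustrative assumptions, not part of this commit):

    # hypothetical illustration of the "estimator__" routing convention
    param_grid = {
        'estimator__C': [0.1, 1.0, 10.0],  # forwarded to the wrapped learner (e.g., LogisticRegression)
        'nbins': [3, 5],                   # consumed by the quantifier itself (illustrative name)
    }
    # a set_params implementation would strip the 'estimator__' prefix and
    # forward the remaining key to the wrapped learner's set_params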

@@ -49,6 +51,10 @@ We might want to think of (improving and) adding the class Tabular (it is define
 experiment looks like. (Do we want to abstract experimental results? this could be useful not only for tables but
 also for plots).
 Add proper logging system. Currently we use print.
+It might be good to simplify the number of methods that have to be implemented for any new Quantifier. At the moment,
+there are many functions like get_params, set_params, and, especially, @property classes_, which are cumbersome to
+implement for quick experiments. A possible solution is to impose get_params and set_params only in cases in which
+the model extends some "ModelSelectable" interface. The classes_ property should have a default implementation.
 
 Checks:
 ==========================================
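
A minimal sketch of that proposal (the class names and the default implementation are assumptions made for illustration, not part of this commit):

    # hypothetical sketch: only models that opt into model selection implement the boilerplate
    class BaseQuantifier:
        @property
        def classes_(self):
            # assumed default: delegate to the underlying learner
            return self.learner.classes_

    class ModelSelectable:
        # required only for quantifiers meant to be used with GridSearchQ
        def get_params(self, deep=True):
            return {}

        def set_params(self, **params):
            pass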

@@ -1,3 +1,5 @@
+from typing import List, Union
+
 import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
@@ -174,6 +176,104 @@ class LabelledCollection:
             yield train, test
 
 
+class MultilingualLabelledCollection:
+    def __init__(self, langs: List[str], labelledCollections: List[LabelledCollection]):
+        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
+        assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), 'unexpected type for labelledCollections'
+        # all collections must share the same classes (element-wise comparison via np.array_equal)
+        assert all(np.array_equal(labelledCollections[0].classes_, lc_i.classes_) for lc_i in labelledCollections[1:]), \
+            'inconsistent classes found for some labelled collections'
+        self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
+        self.classes_ = labelledCollections[0].classes_
+
+    @classmethod
+    def fromLangDict(cls, lang_labelledCollection: dict):
+        return MultilingualLabelledCollection(list(lang_labelledCollection.keys()), list(lang_labelledCollection.values()))
+
+    def langs(self):
+        return sorted(self.llc.keys())
+
+    def __getitem__(self, lang) -> LabelledCollection:
+        return self.llc[lang]
+
+    @classmethod
+    def load(cls, path: str, loader_func: callable):
+        return MultilingualLabelledCollection(*loader_func(path))
+
+    def __len__(self):
+        return sum(map(len, self.llc.values()))
+
+    def prevalence(self):
+        # class prevalence of the pooled data: per-language prevalences weighted by collection size
+        prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
+        return prev / prev.sum()
+
+    def language_prevalence(self):
+        # fraction of documents contributed by each language
+        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
+        return lang_count / lang_count.sum()
+
+    def counts(self):
+        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)
+
+    @property
+    def n_classes(self):
+        return len(self.classes_)
+
+    @property
+    def binary(self):
+        return self.n_classes == 2
+
+    def __check_langs(self, l_dict: dict):
+        assert len(l_dict) == len(self.langs()), 'wrong number of languages'
+        assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'
+
+    def __check_sizes(self, l_sizes: Union[int, dict]):
+        # normalizes l_sizes to a {lang: size} dict; an int means the same size for every language
+        assert isinstance(l_sizes, (int, dict)), 'unexpected type for l_sizes'
+        if isinstance(l_sizes, int):
+            return {l: l_sizes for l in self.langs()}
+        self.__check_langs(l_sizes)
+        return l_sizes
+
+    def sampling_index(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
+        l_sizes = self.__check_sizes(l_sizes)
+        return {l: lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}
+
+    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
+        l_sizes = self.__check_sizes(l_sizes)
+        return {l: lc.uniform_sampling_index(l_sizes[l]) for l, lc in self.llc.items()}
+
+    def uniform_sampling(self, l_sizes: Union[int, dict]):
+        l_sizes = self.__check_sizes(l_sizes)
+        return MultilingualLabelledCollection.fromLangDict(
+            {l: lc.uniform_sampling(l_sizes[l]) for l, lc in self.llc.items()}
+        )
+
+    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
+        l_sizes = self.__check_sizes(l_sizes)
+        return MultilingualLabelledCollection.fromLangDict(
+            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}
+        )
+
+    def sampling_from_index(self, l_index: dict):
+        self.__check_langs(l_index)
+        return MultilingualLabelledCollection.fromLangDict(
+            {l: lc.sampling_from_index(l_index[l]) for l, lc in self.llc.items()}
+        )
+
+    def split_stratified(self, train_prop=0.6, random_state=None):
+        train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
+        return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)
+
+    def asLabelledCollection(self, return_langs=False):
+        # pools all languages into a single LabelledCollection, optionally returning each instance's language
+        lXy_list = [([l] * len(lc), *lc.Xy) for l, lc in self.llc.items()]  # a list with (lang_i, Xi, yi)
+        ls, Xs, ys = list(zip(*lXy_list))
+        ls = np.concatenate(ls)
+        vertstack = vstack if issparse(Xs[0]) else np.vstack
+        Xs = vertstack(Xs)
+        ys = np.concatenate(ys)
+        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
+        return (lc, ls) if return_langs else lc
+
+
 class Dataset:
 
     def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
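
A usage sketch for the new class (the data is synthetic, and the exact sampling call is an assumption based on LabelledCollection's interface):

    import numpy as np

    # two synthetic monolingual collections sharing the same binary label set
    en = LabelledCollection(np.random.rand(100, 5), np.random.randint(0, 2, 100))
    it = LabelledCollection(np.random.rand(80, 5), np.random.randint(0, 2, 80))
    data = MultilingualLabelledCollection(['en', 'it'], [en, it])

    data.langs()                # ['en', 'it']
    data.language_prevalence()  # [100/180, 80/180]
    data.prevalence()           # class prevalence of the pooled 180 documents

    # a 50-document sample per language at ~30% prevalence for the first class
    sample = data.sampling(50, 0.3)
    len(sample)                 # 100 (50 per language)

    pooled = data.asLabelledCollection()  # one LabelledCollection over all languages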