forked from moreo/QuaPy
last updates
This commit is contained in:
parent
60b6fa3c12
commit
ab746eed8d
|
@ -1,3 +1,5 @@
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
@ -94,3 +96,101 @@ class MultilabelledCollection:
|
||||||
@property
|
@property
|
||||||
def Xy(self):
|
def Xy(self):
|
||||||
return self.instances, self.labels
|
return self.instances, self.labels
|
||||||
|
|
||||||
|
|
||||||
|
class MultilingualLabelledCollection:
    """Language-indexed bundle of labelled collections that share one set of classes.

    Each language code maps to one collection (``self.llc``); most methods simply
    delegate to the per-language collections and gather the results per language.
    """

    def __init__(self, langs: List[str],
                 labelledCollections: List[Union["LabelledCollection", "MultilabelledCollection"]]):
        """Pair every language in `langs` with the collection at the same position.

        :param langs: language identifiers, one per collection
        :param labelledCollections: collections, aligned with `langs`
        :raises AssertionError: on length mismatch, unexpected element type, or
            when the collections do not all expose the same `classes_`
        """
        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
        # isinstance() with a tuple of types; the previous form applied all() to a
        # single bool, which raises TypeError for any non-LabelledCollection element.
        assert all(isinstance(lc, (LabelledCollection, MultilabelledCollection)) for lc in labelledCollections), \
            'unexpected type for labelledCollections'
        # np.array_equal yields a single bool for lists and arrays alike, whereas
        # `==` on multi-element arrays cannot be used as a truth value.
        assert all(np.array_equal(labelledCollections[0].classes_, lc_i.classes_)
                   for lc_i in labelledCollections[1:]), \
            'inconsistent classes found for some labelled collections'
        self.llc = dict(zip(langs, labelledCollections))
        self.classes_ = labelledCollections[0].classes_

    @classmethod
    def fromLangDict(cls, lang_labelledCollection: dict):
        """Build an instance from a ``{lang: collection}`` dictionary."""
        return cls(list(lang_labelledCollection.keys()), list(lang_labelledCollection.values()))

    def langs(self):
        """Return the language identifiers in sorted order."""
        return sorted(self.llc.keys())

    def __getitem__(self, lang) -> "LabelledCollection":
        """Return the collection stored for `lang` (KeyError if absent)."""
        return self.llc[lang]

    @classmethod
    def load(cls, path: str, loader_func: callable):
        """Build an instance from ``loader_func(path)``, whose result is unpacked
        as the positional constructor arguments (langs, collections)."""
        return cls(*loader_func(path))

    def __len__(self):
        """Total number of instances, summed over all languages."""
        return sum(map(len, self.llc.values()))

    def prevalence(self):
        """Size-weighted sum of the per-language prevalences, normalised to sum to 1."""
        prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
        return prev / prev.sum()

    def language_prevalence(self):
        """Fraction of instances per language, ordered as in `langs()`."""
        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
        return lang_count / lang_count.sum()

    def counts(self):
        """Element-wise sum of the per-language `counts()`."""
        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)

    @property
    def n_classes(self):
        """Number of entries in `classes_`."""
        return len(self.classes_)

    @property
    def binary(self):
        """True when there are exactly two classes."""
        return self.n_classes == 2

    def __check_langs(self, l_dict: dict):
        """Assert that `l_dict` has exactly one entry for each language held here."""
        assert len(l_dict) == len(self.langs()), 'wrong number of languages'
        assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'

    def __check_sizes(self, l_sizes: Union[int, dict]):
        """Normalise `l_sizes` to a ``{lang: size}`` dict.

        An int is broadcast to every language; a dict is validated against the
        languages held here and returned unchanged.
        """
        assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes'
        if isinstance(l_sizes, int):
            return {l: l_sizes for l in self.langs()}
        self.__check_langs(l_sizes)
        return l_sizes

    def sampling_index(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
        """Return ``{lang: index}``, delegating to each collection's `sampling_index`."""
        l_sizes = self.__check_sizes(l_sizes)
        return {l: lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}

    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
        """Return ``{lang: index}``, delegating to each collection's `uniform_sampling_index`."""
        l_sizes = self.__check_sizes(l_sizes)
        return {l: lc.uniform_sampling_index(l_sizes[l]) for l, lc in self.llc.items()}

    def uniform_sampling(self, l_sizes: Union[int, dict]):
        """Return a new multilingual collection built from each collection's `uniform_sampling`."""
        l_sizes = self.__check_sizes(l_sizes)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.uniform_sampling(l_sizes[l]) for l, lc in self.llc.items()}
        )

    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
        """Return a new multilingual collection built from each collection's `sampling`."""
        l_sizes = self.__check_sizes(l_sizes)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}
        )

    def sampling_from_index(self, l_index: dict):
        """Return a new multilingual collection selecting, per language, the given index."""
        self.__check_langs(l_index)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.sampling_from_index(l_index[l]) for l, lc in self.llc.items()}
        )

    def split_stratified(self, train_prop=0.6, random_state=None):
        """Split every language with its own `split_stratified` and regroup the parts.

        :return: a (train, test) pair of multilingual collections
        """
        train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
        return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)

    def asLabelledCollection(self, return_langs=False):
        """Stack all languages into a single `LabelledCollection`.

        :param return_langs: when True, also return an array holding the language
            of every stacked instance (aligned with the stacked rows)
        :return: the merged collection, or ``(collection, langs)`` if `return_langs`
        """
        # Local import: issparse/vstack are not among the imports visible for this module.
        from scipy.sparse import issparse, vstack

        ls, Xs, ys = [], [], []
        for l, lc in self.llc.items():
            X, y = lc.Xy
            ls.append([l] * len(lc))  # one language tag per instance
            Xs.append(X)
            ys.append(y)
        ls = np.concatenate(ls)
        vertstack = vstack if issparse(Xs[0]) else np.vstack
        Xs = vertstack(Xs)
        ys = np.concatenate(ys)
        # NOTE(review): the result is always a LabelledCollection, even when the
        # members are MultilabelledCollection -- confirm that this is intended.
        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
        # Previously the return was commented out, so the method yielded None.
        return (lc, ls) if return_langs else lc
|
||||||
|
|
|
@ -31,28 +31,28 @@ n_samples = 5000
|
||||||
|
|
||||||
|
|
||||||
def models():
|
def models():
|
||||||
# yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls()))
|
yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls()))
|
||||||
# yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
|
yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
|
||||||
# yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
|
yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
|
||||||
# yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
|
yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
|
||||||
# yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls()))
|
# yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls()))
|
||||||
# yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
|
yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
|
||||||
# yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
|
yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
|
||||||
# yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
|
yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
|
||||||
# yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
|
yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
|
||||||
# yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
|
# yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
|
||||||
# yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
|
# yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
|
||||||
# yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
|
# yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
|
||||||
# yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
|
# yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
|
||||||
common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
|
common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
|
||||||
# yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
|
yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
|
||||||
# yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
|
yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
|
||||||
# yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
|
yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
|
||||||
# yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
|
yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
|
||||||
# yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
|
yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
|
||||||
# yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
|
yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
|
||||||
# yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
|
yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
|
||||||
# yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
|
yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
|
||||||
yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
||||||
yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
||||||
yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
|
||||||
|
@ -63,8 +63,11 @@ def models():
|
||||||
# yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
|
# yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
|
||||||
|
|
||||||
|
|
||||||
dataset = 'reuters21578'
|
# dataset = 'reuters21578'
|
||||||
picklepath = '/home/moreo/word-class-embeddings/pickles'
|
# dataset = 'ohsumed'
|
||||||
|
dataset = 'jrcall'
|
||||||
|
# picklepath = '/home/moreo/word-class-embeddings/pickles'
|
||||||
|
picklepath = './pickles'
|
||||||
data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
|
data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
|
||||||
|
|
||||||
Xtr, Xte = data.vectorize()
|
Xtr, Xte = data.vectorize()
|
||||||
|
@ -72,7 +75,8 @@ ytr = data.devel_labelmatrix.todense().getA()
|
||||||
yte = data.test_labelmatrix.todense().getA()
|
yte = data.test_labelmatrix.todense().getA()
|
||||||
|
|
||||||
# remove categories with < 10 training documents
|
# keep only the 10 categories with the most training documents
|
||||||
to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
|
# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
|
||||||
|
to_keep = np.argsort(ytr.sum(axis=0))[-10:]
|
||||||
ytr = ytr[:, to_keep]
|
ytr = ytr[:, to_keep]
|
||||||
yte = yte[:, to_keep]
|
yte = yte[:, to_keep]
|
||||||
print(f'num categories = {ytr.shape[1]}')
|
print(f'num categories = {ytr.shape[1]}')
|
||||||
|
|
|
@ -176,104 +176,6 @@ class LabelledCollection:
|
||||||
yield train, test
|
yield train, test
|
||||||
|
|
||||||
|
|
||||||
class MultilingualLabelledCollection:
|
|
||||||
def __init__(self, langs:List[str], labelledCollections:List[LabelledCollection]):
|
|
||||||
assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
|
|
||||||
assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), 'unexpected type for labelledCollections'
|
|
||||||
assert all(labelledCollections[0].classes_ == lc_i.classes_ for lc_i in labelledCollections[1:]), \
|
|
||||||
'inconsistent classes found for some labelled collections'
|
|
||||||
self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
|
|
||||||
self.classes_=labelledCollections[0].classes_
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def fromLangDict(cls, lang_labelledCollection:dict):
|
|
||||||
return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))
|
|
||||||
|
|
||||||
def langs(self):
|
|
||||||
return list(sorted(self.llc.keys()))
|
|
||||||
|
|
||||||
def __getitem__(self, lang)->LabelledCollection:
|
|
||||||
return self.llc[lang]
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def load(cls, path: str, loader_func: callable):
|
|
||||||
return MultilingualLabelledCollection(*loader_func(path))
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return sum(map(len, self.llc.values()))
|
|
||||||
|
|
||||||
def prevalence(self):
|
|
||||||
prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
|
|
||||||
return prev / prev.sum()
|
|
||||||
|
|
||||||
def language_prevalence(self):
|
|
||||||
lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
|
|
||||||
return lang_count / lang_count.sum()
|
|
||||||
|
|
||||||
def counts(self):
|
|
||||||
return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def n_classes(self):
|
|
||||||
return len(self.classes_)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def binary(self):
|
|
||||||
return self.n_classes == 2
|
|
||||||
|
|
||||||
def __check_langs(self, l_dict:dict):
|
|
||||||
assert len(l_dict)==len(self.langs()), 'wrong number of languages'
|
|
||||||
assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'
|
|
||||||
|
|
||||||
def __check_sizes(self, l_sizes: Union[int,dict]):
|
|
||||||
assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes'
|
|
||||||
if isinstance(l_sizes, int):
|
|
||||||
return {l:l_sizes for l in self.langs()}
|
|
||||||
self.__check_langs(l_sizes)
|
|
||||||
return l_sizes
|
|
||||||
|
|
||||||
def sampling_index(self, l_sizes: Union[int,dict], *prevs, shuffle=True):
|
|
||||||
l_sizes = self.__check_sizes(l_sizes)
|
|
||||||
return {l:lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
|
|
||||||
|
|
||||||
def uniform_sampling_index(self, l_sizes: Union[int, dict]):
|
|
||||||
l_sizes = self.__check_sizes(l_sizes)
|
|
||||||
return {l: lc.uniform_sampling_index(l_sizes[l]) for l,lc in self.llc.items()}
|
|
||||||
|
|
||||||
def uniform_sampling(self, l_sizes: Union[int, dict]):
|
|
||||||
l_sizes = self.__check_sizes(l_sizes)
|
|
||||||
return MultilingualLabelledCollection.fromLangDict(
|
|
||||||
{l: lc.uniform_sampling(l_sizes[l]) for l,lc in self.llc.items()}
|
|
||||||
)
|
|
||||||
|
|
||||||
def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
|
|
||||||
l_sizes = self.__check_sizes(l_sizes)
|
|
||||||
return MultilingualLabelledCollection.fromLangDict(
|
|
||||||
{l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
|
|
||||||
)
|
|
||||||
|
|
||||||
def sampling_from_index(self, l_index:dict):
|
|
||||||
self.__check_langs(l_index)
|
|
||||||
return MultilingualLabelledCollection.fromLangDict(
|
|
||||||
{l: lc.sampling_from_index(l_index[l]) for l,lc in self.llc.items()}
|
|
||||||
)
|
|
||||||
|
|
||||||
def split_stratified(self, train_prop=0.6, random_state=None):
|
|
||||||
train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
|
|
||||||
return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)
|
|
||||||
|
|
||||||
def asLabelledCollection(self, return_langs=False):
|
|
||||||
lXy_list = [([l]*len(lc),*lc.Xy) for l, lc in self.llc.items()] # a list with (lang_i, Xi, yi)
|
|
||||||
ls,Xs,ys = list(zip(*lXy_list))
|
|
||||||
ls = np.concatenate(ls)
|
|
||||||
vertstack = vstack if issparse(Xs[0]) else np.vstack
|
|
||||||
Xs = vertstack(Xs)
|
|
||||||
ys = np.concatenate(ys)
|
|
||||||
lc = LabelledCollection(Xs, ys, classes_=self.classes_)
|
|
||||||
# return lc, ls if return_langs else lc
|
|
||||||
#
|
|
||||||
#
|
|
||||||
#
|
|
||||||
class Dataset:
|
class Dataset:
|
||||||
|
|
||||||
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
|
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
|
||||||
|
|
Loading…
Reference in New Issue