diff --git a/MultiLabel/mldata.py b/MultiLabel/mldata.py index daa0ff2..562d4f4 100644 --- a/MultiLabel/mldata.py +++ b/MultiLabel/mldata.py @@ -1,3 +1,5 @@ +from typing import List, Union + import numpy as np from sklearn.model_selection import train_test_split @@ -93,4 +95,102 @@ class MultilabelledCollection: @property def Xy(self): - return self.instances, self.labels \ No newline at end of file + return self.instances, self.labels + + +class MultilingualLabelledCollection: + def __init__(self, langs:List[str], labelledCollections:List[Union[LabelledCollection, MultilabelledCollection]]): + assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists' + assert all(isinstance(lc, LabelledCollection) or all(isinstance(lc, MultilabelledCollection)) for lc in labelledCollections), \ + 'unexpected type for labelledCollections' + assert all(labelledCollections[0].classes_ == lc_i.classes_ for lc_i in labelledCollections[1:]), \ + 'inconsistent classes found for some labelled collections' + self.llc = {l: lc for l, lc in zip(langs, labelledCollections)} + self.classes_=labelledCollections[0].classes_ + + @classmethod + def fromLangDict(cls, lang_labelledCollection:dict): + return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items())))) + + def langs(self): + return list(sorted(self.llc.keys())) + + def __getitem__(self, lang)->LabelledCollection: + return self.llc[lang] + + @classmethod + def load(cls, path: str, loader_func: callable): + return MultilingualLabelledCollection(*loader_func(path)) + + def __len__(self): + return sum(map(len, self.llc.values())) + + def prevalence(self): + prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0) + return prev / prev.sum() + + def language_prevalence(self): + lang_count = np.asarray([len(self.llc[l]) for l in self.langs()]) + return lang_count / lang_count.sum() + + def counts(self): + return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0) + + @property + def n_classes(self): + return len(self.classes_) + + @property + def binary(self): + return self.n_classes == 2 + + def __check_langs(self, l_dict:dict): + assert len(l_dict)==len(self.langs()), 'wrong number of languages' + assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes' + + def __check_sizes(self, l_sizes: Union[int,dict]): + assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes' + if isinstance(l_sizes, int): + return {l:l_sizes for l in self.langs()} + self.__check_langs(l_sizes) + return l_sizes + + def sampling_index(self, l_sizes: Union[int,dict], *prevs, shuffle=True): + l_sizes = self.__check_sizes(l_sizes) + return {l:lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()} + + def uniform_sampling_index(self, l_sizes: Union[int, dict]): + l_sizes = self.__check_sizes(l_sizes) + return {l: lc.uniform_sampling_index(l_sizes[l]) for l,lc in self.llc.items()} + + def uniform_sampling(self, l_sizes: Union[int, dict]): + l_sizes = self.__check_sizes(l_sizes) + return MultilingualLabelledCollection.fromLangDict( + {l: lc.uniform_sampling(l_sizes[l]) for l,lc in self.llc.items()} + ) + + def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True): + l_sizes = self.__check_sizes(l_sizes) + return MultilingualLabelledCollection.fromLangDict( + {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()} + ) + + def sampling_from_index(self, l_index:dict): + self.__check_langs(l_index) + return MultilingualLabelledCollection.fromLangDict( + {l: lc.sampling_from_index(l_index[l]) for l,lc in self.llc.items()} + ) + + def split_stratified(self, train_prop=0.6, random_state=None): + train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()])) + return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test) + + def asLabelledCollection(self, return_langs=False): + lXy_list = [([l]*len(lc),*lc.Xy) for l, lc in self.llc.items()] # a list with (lang_i, Xi, yi) + ls,Xs,ys = list(zip(*lXy_list)) + ls = np.concatenate(ls) + vertstack = vstack if issparse(Xs[0]) else np.vstack + Xs = vertstack(Xs) + ys = np.concatenate(ys) + lc = LabelledCollection(Xs, ys, classes_=self.classes_) + # return lc, ls if return_langs else lc diff --git a/MultiLabel/multi_label.py b/MultiLabel/multi_label.py index bf413cd..9e9d85e 100644 --- a/MultiLabel/multi_label.py +++ b/MultiLabel/multi_label.py @@ -31,28 +31,28 @@ n_samples = 5000 def models(): - # yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls())) - # yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls())) - # yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls())) - # yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls())) + yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls())) + yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls())) + yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls())) + yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls())) # yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls())) - # yield 'StackCC', MLCC(MultilabelStackedClassifier(cls())) - # yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls())) - # yield 'StackACC', MLACC(MultilabelStackedClassifier(cls())) - # yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls())) + yield 'StackCC', MLCC(MultilabelStackedClassifier(cls())) + yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls())) + yield 'StackACC', MLACC(MultilabelStackedClassifier(cls())) + yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls())) # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random')) # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random')) # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random')) # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random')) common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'} - # yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common) - # yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common) - # yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common) - # yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common) - # yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common) - # yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common) - # yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common) - # yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common) + yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common) + yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common) + yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common) + yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common) + yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common) + yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common) + yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common) + yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common) yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common) yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common) yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common) @@ -63,8 +63,11 @@ def models(): # yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common) -dataset = 'reuters21578' -picklepath = '/home/moreo/word-class-embeddings/pickles' +# dataset = 'reuters21578' +# dataset = 'ohsumed' +dataset = 'jrcall' +# picklepath = '/home/moreo/word-class-embeddings/pickles' +picklepath = './pickles' data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle') Xtr, Xte = data.vectorize() @@ -72,7 +75,8 @@ ytr = data.devel_labelmatrix.todense().getA() yte = data.test_labelmatrix.todense().getA() # remove categories with < 10 training documents -to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50) +# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50) +to_keep = np.argsort(ytr.sum(axis=0))[-10:] ytr = ytr[:, to_keep] yte = yte[:, to_keep] print(f'num categories = {ytr.shape[1]}') diff --git a/MultiLabel/results.txt b/MultiLabel/results_reuters21578.txt similarity index 100% rename from MultiLabel/results.txt rename to MultiLabel/results_reuters21578.txt diff --git a/quapy/data/base.py b/quapy/data/base.py index b482548..39e7c93 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -176,104 +176,6 @@ class LabelledCollection: yield train, test -class MultilingualLabelledCollection: - def __init__(self, langs:List[str], labelledCollections:List[LabelledCollection]): - assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists' - assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), 'unexpected type for labelledCollections' - assert all(labelledCollections[0].classes_ == lc_i.classes_ for lc_i in labelledCollections[1:]), \ - 'inconsistent classes found for some labelled collections' - self.llc = {l: lc for l, lc in zip(langs, labelledCollections)} - self.classes_=labelledCollections[0].classes_ - - @classmethod - def fromLangDict(cls, lang_labelledCollection:dict): - return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items())))) - - def langs(self): - return list(sorted(self.llc.keys())) - - def __getitem__(self, lang)->LabelledCollection: - return self.llc[lang] - - @classmethod - def load(cls, path: str, loader_func: callable): - return MultilingualLabelledCollection(*loader_func(path)) - - def __len__(self): - return sum(map(len, self.llc.values())) - - def prevalence(self): - prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0) - return prev / prev.sum() - - def language_prevalence(self): - lang_count = np.asarray([len(self.llc[l]) for l in self.langs()]) - return lang_count / lang_count.sum() - - def counts(self): - return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0) - - @property - def n_classes(self): - return len(self.classes_) - - @property - def binary(self): - return self.n_classes == 2 - - def __check_langs(self, l_dict:dict): - assert len(l_dict)==len(self.langs()), 'wrong number of languages' - assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes' - - def __check_sizes(self, l_sizes: Union[int,dict]): - assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes' - if isinstance(l_sizes, int): - return {l:l_sizes for l in self.langs()} - self.__check_langs(l_sizes) - return l_sizes - - def sampling_index(self, l_sizes: Union[int,dict], *prevs, shuffle=True): - l_sizes = self.__check_sizes(l_sizes) - return {l:lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()} - - def uniform_sampling_index(self, l_sizes: Union[int, dict]): - l_sizes = self.__check_sizes(l_sizes) - return {l: lc.uniform_sampling_index(l_sizes[l]) for l,lc in self.llc.items()} - - def uniform_sampling(self, l_sizes: Union[int, dict]): - l_sizes = self.__check_sizes(l_sizes) - return MultilingualLabelledCollection.fromLangDict( - {l: lc.uniform_sampling(l_sizes[l]) for l,lc in self.llc.items()} - ) - - def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True): - l_sizes = self.__check_sizes(l_sizes) - return MultilingualLabelledCollection.fromLangDict( - {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()} - ) - - def sampling_from_index(self, l_index:dict): - self.__check_langs(l_index) - return MultilingualLabelledCollection.fromLangDict( - {l: lc.sampling_from_index(l_index[l]) for l,lc in self.llc.items()} - ) - - def split_stratified(self, train_prop=0.6, random_state=None): - train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()])) - return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test) - - def asLabelledCollection(self, return_langs=False): - lXy_list = [([l]*len(lc),*lc.Xy) for l, lc in self.llc.items()] # a list with (lang_i, Xi, yi) - ls,Xs,ys = list(zip(*lXy_list)) - ls = np.concatenate(ls) - vertstack = vstack if issparse(Xs[0]) else np.vstack - Xs = vertstack(Xs) - ys = np.concatenate(ys) - lc = LabelledCollection(Xs, ys, classes_=self.classes_) - # return lc, ls if return_langs else lc -# -# -# class Dataset: def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):