merged!

2021-08-25 17:10:24 +02:00 · 2021-08-25 17:10:24 +02:00 · d040b2acb6
parent c6de5a043d ab746eed8d
commit d040b2acb6
4 changed files with 105 additions and 104 deletions
--- a/MultiLabel/main.py
+++ b/MultiLabel/main.py
@@ -50,10 +50,10 @@ def models():
    yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())),  **common)
    yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())),  **common)
    yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
-    # yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
-    # yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
-    # yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
-    # yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())),  **common)
+    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
+    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())),  **common)
    yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
    yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
    yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
@@ -94,7 +94,6 @@ yte = yte.todense().getA()
 # print((np.abs(np.corrcoef(ytr, rowvar=False))>0.1).sum())
 # sys.exit(0)

-
 train = MultilabelledCollection(Xtr, ytr)
 test = MultilabelledCollection(Xte, yte)

--- a/MultiLabel/mldata.py
+++ b/MultiLabel/mldata.py
@@ -1,3 +1,5 @@
+from typing import List, Union
+
 import numpy as np
 from sklearn.model_selection import train_test_split

@@ -93,4 +95,142 @@ class MultilabelledCollection:

    @property
    def Xy(self):
-        return self.instances, self.labels
+        return self.instances, self.labels
+
+
+class MultilingualLabelledCollection:
+    """A group of labelled collections indexed by language.
+
+    Each language maps to one collection (``self.llc``) and all collections
+    must expose the same ``classes_``.
+    """
+
+    def __init__(self, langs:List[str], labelledCollections:List[Union[LabelledCollection, MultilabelledCollection]]):
+        """Pair each entry of `langs` with the collection at the same position.
+
+        Raises AssertionError if the two lists differ in length, if an element
+        has an unexpected type, or if the collections disagree on ``classes_``.
+        """
+        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
+        # isinstance accepts a tuple of types; the former nested all(isinstance(...)) raised
+        # TypeError (a bool is not iterable) for anything that was not a LabelledCollection
+        assert all(isinstance(lc, (LabelledCollection, MultilabelledCollection)) for lc in labelledCollections), \
+            'unexpected type for labelledCollections'
+        # np.array_equal yields a single bool even when classes_ is an array (== would be element-wise)
+        assert all(np.array_equal(labelledCollections[0].classes_, lc_i.classes_) for lc_i in labelledCollections[1:]), \
+            'inconsistent classes found for some labelled collections'
+        self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
+        self.classes_=labelledCollections[0].classes_
+
+    @classmethod
+    def fromLangDict(cls, lang_labelledCollection:dict):
+        """Build an instance from a {language: collection} dictionary."""
+        return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))
+
+    def langs(self):
+        """Return the languages in sorted order."""
+        return list(sorted(self.llc.keys()))
+
+    def __getitem__(self, lang)->LabelledCollection:
+        """Return the collection stored for `lang`."""
+        return self.llc[lang]
+
+    @classmethod
+    def load(cls, path: str, loader_func: callable):
+        """Build an instance from the (langs, collections) pair returned by `loader_func(path)`."""
+        return MultilingualLabelledCollection(*loader_func(path))
+
+    def __len__(self):
+        """Return the sum of the lengths of the per-language collections."""
+        return sum(map(len, self.llc.values()))
+
+    def prevalence(self):
+        """Return the per-language prevalences, weighted by collection length, summed and normalised to 1."""
+        prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
+        return prev / prev.sum()
+
+    def language_prevalence(self):
+        """Return the share of each language (in `langs()` order) in the total length."""
+        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
+        return lang_count / lang_count.sum()
+
+    def counts(self):
+        """Return the per-language `counts()` summed over languages."""
+        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)
+
+    @property
+    def n_classes(self):
+        """Number of entries in `classes_`."""
+        return len(self.classes_)
+
+    @property
+    def binary(self):
+        """True if there are exactly two classes."""
+        return self.n_classes == 2
+
+    def __check_langs(self, l_dict:dict):
+        """Assert that `l_dict` has exactly one entry per language of this collection."""
+        assert len(l_dict)==len(self.langs()), 'wrong number of languages'
+        assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'
+
+    def __check_sizes(self, l_sizes: Union[int,dict]):
+        """Return `l_sizes` as a {language: size} dict, expanding an int to every language."""
+        assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes'
+        if isinstance(l_sizes, int):
+            return {l:l_sizes for l in self.langs()}
+        self.__check_langs(l_sizes)
+        return l_sizes
+
+    def sampling_index(self, l_sizes: Union[int,dict], *prevs, shuffle=True):
+        """Return {language: index} obtained by calling `sampling_index` on every collection."""
+        l_sizes = self.__check_sizes(l_sizes)
+        return {l:lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
+
+    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
+        """Return {language: index} obtained by calling `uniform_sampling_index` on every collection."""
+        l_sizes = self.__check_sizes(l_sizes)
+        return {l: lc.uniform_sampling_index(l_sizes[l]) for l,lc in self.llc.items()}
+
+    def uniform_sampling(self, l_sizes: Union[int, dict]):
+        """Return a new multilingual collection built from `uniform_sampling` of every language."""
+        l_sizes = self.__check_sizes(l_sizes)
+        return MultilingualLabelledCollection.fromLangDict(
+            {l: lc.uniform_sampling(l_sizes[l]) for l,lc in self.llc.items()}
+        )
+
+    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
+        """Return a new multilingual collection built from `sampling` of every language (forwards `prevs`, `shuffle`)."""
+        l_sizes = self.__check_sizes(l_sizes)
+        return MultilingualLabelledCollection.fromLangDict(
+            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
+        )
+
+    def sampling_from_index(self, l_index:dict):
+        """Return a new multilingual collection built by passing `l_index[lang]` to each collection's `sampling_from_index`."""
+        self.__check_langs(l_index)
+        return MultilingualLabelledCollection.fromLangDict(
+            {l: lc.sampling_from_index(l_index[l]) for l,lc in self.llc.items()}
+        )
+
+    def split_stratified(self, train_prop=0.6, random_state=None):
+        """Split every language with `split_stratified` and return (train, test) multilingual collections."""
+        train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
+        return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)
+
+    def asLabelledCollection(self, return_langs=False):
+        """Stack all languages into one LabelledCollection.
+
+        Returns the collection alone, or the tuple (collection, langs) when
+        `return_langs` is True; langs holds one language entry per stacked row.
+        """
+        # imported here so the method does not rely on module-level names outside this hunk
+        from scipy.sparse import issparse, vstack
+        lXy_list = [([l]*len(lc),*lc.Xy) for l, lc in self.llc.items()]  # a list with (lang_i, Xi, yi)
+        ls,Xs,ys = list(zip(*lXy_list))
+        ls = np.concatenate(ls)
+        vertstack = vstack if issparse(Xs[0]) else np.vstack
+        Xs = vertstack(Xs)
+        ys = np.concatenate(ys)
+        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
+        # parenthesised: `lc, ls if return_langs else lc` would always build a tuple
+        return (lc, ls) if return_langs else lc
--- a/MultiLabel/results_reuters21578.txt
+++ b/MultiLabel/results_reuters21578.txt
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -176,104 +176,6 @@ class LabelledCollection:
            yield train, test


-class MultilingualLabelledCollection:
-    def __init__(self, langs:List[str], labelledCollections:List[LabelledCollection]):
-        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
-        assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), 'unexpected type for labelledCollections'
-        assert all(labelledCollections[0].classes_ == lc_i.classes_ for lc_i in labelledCollections[1:]), \
-            'inconsistent classes found for some labelled collections'
-        self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
-        self.classes_=labelledCollections[0].classes_
-
-    @classmethod
-    def fromLangDict(cls, lang_labelledCollection:dict):
-        return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))
-
-    def langs(self):
-        return list(sorted(self.llc.keys()))
-
-    def __getitem__(self, lang)->LabelledCollection:
-        return self.llc[lang]
-
-    @classmethod
-    def load(cls, path: str, loader_func: callable):
-        return MultilingualLabelledCollection(*loader_func(path))
-
-    def __len__(self):
-        return sum(map(len, self.llc.values()))
-
-    def prevalence(self):
-        prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
-        return prev / prev.sum()
-
-    def language_prevalence(self):
-        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
-        return lang_count / lang_count.sum()
-
-    def counts(self):
-        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)
-
-    @property
-    def n_classes(self):
-        return len(self.classes_)
-
-    @property
-    def binary(self):
-        return self.n_classes == 2
-
-    def __check_langs(self, l_dict:dict):
-        assert len(l_dict)==len(self.langs()), 'wrong number of languages'
-        assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'
-
-    def __check_sizes(self, l_sizes: Union[int,dict]):
-        assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes'
-        if isinstance(l_sizes, int):
-            return {l:l_sizes for l in self.langs()}
-        self.__check_langs(l_sizes)
-        return l_sizes
-
-    def sampling_index(self, l_sizes: Union[int,dict], *prevs, shuffle=True):
-        l_sizes = self.__check_sizes(l_sizes)
-        return {l:lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
-
-    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
-        l_sizes = self.__check_sizes(l_sizes)
-        return {l: lc.uniform_sampling_index(l_sizes[l]) for l,lc in self.llc.items()}
-
-    def uniform_sampling(self, l_sizes: Union[int, dict]):
-        l_sizes = self.__check_sizes(l_sizes)
-        return MultilingualLabelledCollection.fromLangDict(
-            {l: lc.uniform_sampling(l_sizes[l]) for l,lc in self.llc.items()}
-        )
-
-    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
-        l_sizes = self.__check_sizes(l_sizes)
-        return MultilingualLabelledCollection.fromLangDict(
-            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
-        )
-
-    def sampling_from_index(self, l_index:dict):
-        self.__check_langs(l_index)
-        return MultilingualLabelledCollection.fromLangDict(
-            {l: lc.sampling_from_index(l_index[l]) for l,lc in self.llc.items()}
-        )
-
-    def split_stratified(self, train_prop=0.6, random_state=None):
-        train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
-        return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)
-
-    def asLabelledCollection(self, return_langs=False):
-        lXy_list = [([l]*len(lc),*lc.Xy) for l, lc in self.llc.items()]  # a list with (lang_i, Xi, yi)
-        ls,Xs,ys = list(zip(*lXy_list))
-        ls = np.concatenate(ls)
-        vertstack = vstack if issparse(Xs[0]) else np.vstack
-        Xs = vertstack(Xs)
-        ys = np.concatenate(ys)
-        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
-        # return lc, ls if return_langs else lc
-#
-#
-#
 class Dataset:

    def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):