new methods, some experiments added

2021-07-06 18:26:05 +02:00 · 2021-07-06 18:26:05 +02:00 · 60b6fa3c12
parent 7b8e6462ff
commit 60b6fa3c12
4 changed files with 171 additions and 29 deletions
--- a/MultiLabel/NOTES.txt
+++ b/MultiLabel/NOTES.txt
@ -0,0 +1,14 @@
 Things to test:
 - MultiChain for classification, MultiChain for regression?
 - Independent classifiers + independent quantifiers
 - Stacking + independent quantifiers
 - ClassifierChain + independent quantifiers
 - Independent quantifiers + cross-class regression (independent?)
 - Stacking + cross-class regression
 - ClassifierChain + cross-class regression
 - Covariates (Means, CovMatrix from samples) + multioutput regression?
 - Covariates concatenated with quantifiers' predictions + cross-class regression?
 - Model Selection for specific protocols?
--- a/MultiLabel/mlquantification.py
+++ b/MultiLabel/mlquantification.py
@ -154,15 +154,24 @@ class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregat
        return self.aggregate(predictions)
-class MultilabelRegressionQuantification:
+class MLRegressionQuantification:
-    def __init__(self, base_quantifier=CC(LinearSVC()), regression='ridge', n_samples=500, sample_size=500, norm=True,
+    def __init__(self,
-                 means=True, stds=True):
+                 mlquantifier=MultilabelNaiveQuantifier(CC(LinearSVC())),
                 regression='ridge',
                 protocol='npp',
                 n_samples=500,
                 sample_size=500,
                 norm=True,
                 means=True,
                 stds=True):
        assert regression in ['ridge', 'svr'], 'unknown regression model'
-        self.estimator = MultilabelNaiveQuantifier(base_quantifier)
+        assert protocol in ['npp', 'app'], 'unknown protocol'
        self.estimator = mlquantifier
        if regression == 'ridge':
            self.reg = Ridge(normalize=norm)
        elif regression == 'svr':
            self.reg = MultiOutputRegressor(LinearSVR())
        self.protocol = protocol
        # self.reg = MultiTaskLassoCV(normalize=norm)
        # self.reg = KernelRidge(kernel='rbf')
        # self.reg = LassoLarsCV(normalize=norm)
@ -174,25 +183,11 @@ class MultilabelRegressionQuantification:
        self.regression = regression
        self.n_samples = n_samples
        self.sample_size = sample_size
-        self.norm = StandardScaler()
+        # self.norm = StandardScaler()
        self.means = means
        self.stds = stds
-    def fit(self, data:MultilabelledCollection):
+    def _prepare_arrays(self, Xs, ys, samples_mean, samples_std):
        self.classes_ = data.classes_
        tr, te = data.train_test_split()
        self.estimator.fit(tr)
        samples_mean = []
        samples_std = []
        Xs = []
        ys = []
        for sample in te.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
            ys.append(sample.prevalence()[:,1])
            Xs.append(self.estimator.quantify(sample.instances)[:,1])
            if self.means:
                samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
            if self.stds:
                samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        if self.means:
@ -201,7 +196,49 @@ class MultilabelRegressionQuantification:
        if self.stds:
            samples_std = np.asarray(samples_std)
            Xs = np.hstack([Xs, samples_std])
-        Xs = self.norm.fit_transform(Xs)
+        return Xs, ys
    def generate_samples_npp(self, val):
        """Collect regression training data from natural-prevalence samples.

        Draws ``self.n_samples`` samples of size ``self.sample_size`` from
        ``val.natural_sampling_generator``. For every sample it records
        column 1 of the true prevalence matrix (target) and column 1 of the
        estimator's quantification (input); presumably column 1 holds the
        positive-class prevalence of each label -- confirm against the
        collection class. When ``self.means`` / ``self.stds`` are set, the
        per-column mean / standard deviation of the sample's instances are
        gathered as well.

        :param val: held-out collection exposing ``natural_sampling_generator``
        :return: the result of ``self._prepare_arrays`` applied to the
            gathered lists (``fit`` unpacks it as ``Xs, ys``)
        """
        estimated_prevs, true_prevs = [], []
        column_means, column_stds = [], []
        sampler = val.natural_sampling_generator(
            sample_size=self.sample_size,
            repeats=self.n_samples,
        )
        for drawn in sampler:
            true_prevs.append(drawn.prevalence()[:, 1])
            docs = drawn.instances
            estimated_prevs.append(self.estimator.quantify(docs)[:, 1])
            if self.means:
                column_means.append(docs.mean(axis=0).getA().flatten())
            if self.stds:
                # densify first: the std is taken on the dense matrix
                column_stds.append(docs.todense().std(axis=0).getA().flatten())
        return self._prepare_arrays(estimated_prevs, true_prevs, column_means, column_stds)
    def generate_samples_app(self, val, n_prevalences=21):
        """Collect regression training data from artificial-prevalence samples.

        For every category in ``self.classes_`` (set by ``fit`` before this is
        called), samples are drawn from ``val.artificial_sampling_generator``
        with ``n_prevalences`` prevalence points. The number of repeats per
        category is chosen so that the total stays at or below
        ``self.n_samples`` where possible, but never drops under 1:
        ``max(n_samples // (n_categories * n_prevalences), 1)``.

        For each sample, column 1 of the true prevalence matrix is the target
        and column 1 of the estimator's quantification is the input. When
        ``self.means`` / ``self.stds`` are set, the per-column mean / standard
        deviation of the sample's instances are gathered as well.

        :param val: held-out collection exposing ``artificial_sampling_generator``
        :param n_prevalences: number of prevalence points forwarded to the
            sampling generator (default 21, the previously hard-coded value)
        :return: the result of ``self._prepare_arrays`` applied to the
            gathered lists (``fit`` unpacks it as ``Xs, ys``)
        """
        samples_mean = []
        samples_std = []
        Xs = []
        ys = []
        ncats = len(self.classes_)
        # split the overall sample budget across categories and prevalence points
        repeats = max(self.n_samples // (ncats * n_prevalences), 1)
        for cat in self.classes_:
            sampler = val.artificial_sampling_generator(
                sample_size=self.sample_size,
                category=cat,
                n_prevalences=n_prevalences,
                repeats=repeats,
            )
            for sample in sampler:
                instances = sample.instances
                ys.append(sample.prevalence()[:, 1])
                Xs.append(self.estimator.quantify(instances)[:, 1])
                if self.means:
                    samples_mean.append(instances.mean(axis=0).getA().flatten())
                if self.stds:
                    # densify first: the std is taken on the dense matrix
                    samples_std.append(instances.todense().std(axis=0).getA().flatten())
        return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
    def fit(self, data: MultilabelledCollection):
        """Fit the underlying quantifier and the correcting regressor.

        The collection is split in two: the first part trains
        ``self.estimator``; the second is used to generate samples (according
        to ``self.protocol``) on which ``self.reg`` is trained.

        :param data: the labelled collection to learn from
        :return: ``self``
        :raises ValueError: if ``self.protocol`` is neither ``'npp'`` nor
            ``'app'``. This is checked before any fitting takes place, because
            the ``assert`` in ``__init__`` is skipped under ``python -O`` and
            does not cover later reassignment of the attribute.
        """
        if self.protocol == 'npp':
            generate_samples = self.generate_samples_npp
        elif self.protocol == 'app':
            generate_samples = self.generate_samples_app
        else:
            raise ValueError(f"unknown protocol {self.protocol!r}; expected 'npp' or 'app'")

        self.classes_ = data.classes_
        tr, val = data.train_test_split()
        self.estimator.fit(tr)
        Xs, ys = generate_samples(val)
        # Xs = self.norm.fit_transform(Xs)
        self.reg.fit(Xs, ys)
        return self
@ -213,9 +250,9 @@ class MultilabelRegressionQuantification:
        if self.stds:
            sample_std = instances.todense().std(axis=0).getA()
            Xs = np.hstack([Xs, sample_std])
-        Xs = self.norm.transform(Xs)
+        # Xs = self.norm.transform(Xs)
        Xs = self.reg.predict(Xs)
-        Xs = self.norm.inverse_transform(Xs)
+        # Xs = self.norm.inverse_transform(Xs)
        adjusted = np.clip(Xs, 0, 1)
        adjusted = adjusted.flatten()
        neg_prevs = 1-adjusted
--- a/MultiLabel/multi_label.py
+++ b/MultiLabel/multi_label.py
@ -6,7 +6,7 @@ from tqdm import tqdm
 import quapy as qp
 from MultiLabel.mlclassification import MultilabelStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
-from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MultilabelRegressionQuantification, \
+from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
    MLACC, \
    MLPACC, MultilabelNaiveAggregativeQuantifier
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
@ -44,11 +44,23 @@ def models():
    # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
    # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
    # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
-    common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False}
+    common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
-    yield 'MRQ-CC', MultilabelRegressionQuantification(base_quantifier=CC(cls()), regression='svr', **common)
+    # yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
-    yield 'MRQ-PCC', MultilabelRegressionQuantification(base_quantifier=PCC(cls()), regression='svr', **common)
+    # yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())),  **common)
-    yield 'MRQ-ACC', MultilabelRegressionQuantification(base_quantifier=ACC(cls()), regression='svr', **common)
+    # yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())),  **common)
-    yield 'MRQ-PACC', MultilabelRegressionQuantification(base_quantifier=PACC(cls()), regression='svr', **common)
+    # yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
    # yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
    # yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
    # yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
    # yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())),  **common)
    yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
    yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
    yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
    yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app',  **common)
    # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
    # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
    # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
    # yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
 dataset = 'reuters21578'
--- a/MultiLabel/results.txt
+++ b/MultiLabel/results.txt
@ -0,0 +1,79 @@
 num categories = 10
 Train-counts: [1650  181  389 2877  433  347  538  197  369  212]
 Test-counts: [ 719   56  189 1087  149  131  179   89  117   71]
 MLPE: 0.01101
 NPP:
 NaiveCC   	mae=0.01718
 NaivePCC  	mae=0.00898
 NaiveACC  	mae=0.01560
 NaivePACC 	mae=0.01062
 StackCC   	mae=0.00790
 StackPCC  	mae=0.00659 **
 StackACC  	mae=0.00913
 StackPACC 	mae=0.00771
 ChainCC   	mae=0.01644
 ChainPCC  	mae=0.00924
 ChainACC  	mae=0.01767
 ChainPACC 	mae=0.01140
 MRQ-CC    	mae=0.01130
 MRQ-PCC   	mae=0.00941
 MRQ-ACC   	mae=0.01153
 MRQ-PACC  	mae=0.01000
 MRQ-StackCC	mae=0.00757
 MRQ-StackPCC	mae=0.00652 **
 MRQ-StackACC	mae=0.00799
 MRQ-StackPACC	mae=0.00763
 MRQ-StackCC-app	mae=0.00791
 MRQ-StackPCC-app	mae=0.00840
 MRQ-StackACC-app	mae=0.00910
 MRQ-StackPACC-app	mae=0.00941
 MRQ-ChainCC	mae=0.00989
 MRQ-ChainPCC	mae=0.00916
 MRQ-ChainACC	mae=0.01251
 MRQ-ChainPACC	mae=0.00954
 APP:
 NaiveCC   	mae=0.04120
 NaivePCC  	mae=0.03741
 NaiveACC  	mae=0.03202
 NaivePACC 	mae=0.02293
 StackCC   	mae=0.01969
 StackPCC  	mae=0.01871
 StackACC  	mae=0.01386 **
 StackPACC 	mae=0.01267 **
 ChainCC   	mae=0.04136
 ChainPCC  	mae=0.03571
 ChainACC  	mae=0.03622
 ChainPACC 	mae=0.02659
 MRQ-CC    	mae=0.04356
 MRQ-PCC   	mae=0.02532
 MRQ-ACC   	mae=0.05716
 MRQ-PACC  	mae=0.02936
 MRQ-StackCC	mae=0.02448
 MRQ-StackPCC	mae=0.02090
 MRQ-StackACC	mae=0.02579
 MRQ-StackPACC	mae=0.02388
 MRQ-StackCC-app	mae=0.01535
 MRQ-StackPCC-app	mae=0.01457
 MRQ-StackACC-app	mae=0.01441
 MRQ-StackPACC-app	mae=0.01633
 MRQ-ChainCC	mae=0.04874
 MRQ-ChainPCC	mae=0.02537
 MRQ-ChainACC	mae=0.06262
 MRQ-ChainPACC	mae=0.02906