diff --git a/MultiLabel/NOTES.txt b/MultiLabel/NOTES.txt new file mode 100644 index 0000000..b809537 --- /dev/null +++ b/MultiLabel/NOTES.txt @@ -0,0 +1,14 @@ +Things to test: +- MultiChain for classification, MultiChain for regression? + +- Independent classifiers + independent quantifiers +- Stacking + independent quantifiers +- ClassifierChain + independent quantifiers +- Independent quantifiers + cross-class regression (independent?) +- Stacking + cross-class regression +- ClassifierChain + cross-class regression +- Covariates (Means, CovMatrix from samples) + multioutput regression? +- Covariates concatenated with quantifiers predictions + cross-class regression? + +- Model Selection for specific protocols? + diff --git a/MultiLabel/mlquantification.py b/MultiLabel/mlquantification.py index b8ea3cc..775bd94 100644 --- a/MultiLabel/mlquantification.py +++ b/MultiLabel/mlquantification.py @@ -154,15 +154,24 @@ class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregat return self.aggregate(predictions) -class MultilabelRegressionQuantification: - def __init__(self, base_quantifier=CC(LinearSVC()), regression='ridge', n_samples=500, sample_size=500, norm=True, - means=True, stds=True): +class MLRegressionQuantification: + def __init__(self, + mlquantifier=MultilabelNaiveQuantifier(CC(LinearSVC())), + regression='ridge', + protocol='npp', + n_samples=500, + sample_size=500, + norm=True, + means=True, + stds=True): assert regression in ['ridge', 'svr'], 'unknown regression model' - self.estimator = MultilabelNaiveQuantifier(base_quantifier) + assert protocol in ['npp', 'app'], 'unknown protocol' + self.estimator = mlquantifier if regression == 'ridge': self.reg = Ridge(normalize=norm) elif regression == 'svr': self.reg = MultiOutputRegressor(LinearSVR()) + self.protocol = protocol # self.reg = MultiTaskLassoCV(normalize=norm) # self.reg = KernelRidge(kernel='rbf') # self.reg = LassoLarsCV(normalize=norm) @@ -174,25 +183,11 @@ class 
MultilabelRegressionQuantification: self.regression = regression self.n_samples = n_samples self.sample_size = sample_size - self.norm = StandardScaler() + # self.norm = StandardScaler() self.means = means self.stds = stds - def fit(self, data:MultilabelledCollection): - self.classes_ = data.classes_ - tr, te = data.train_test_split() - self.estimator.fit(tr) - samples_mean = [] - samples_std = [] - Xs = [] - ys = [] - for sample in te.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples): - ys.append(sample.prevalence()[:,1]) - Xs.append(self.estimator.quantify(sample.instances)[:,1]) - if self.means: - samples_mean.append(sample.instances.mean(axis=0).getA().flatten()) - if self.stds: - samples_std.append(sample.instances.todense().std(axis=0).getA().flatten()) + def _prepare_arrays(self, Xs, ys, samples_mean, samples_std): Xs = np.asarray(Xs) ys = np.asarray(ys) if self.means: @@ -201,7 +196,49 @@ class MultilabelRegressionQuantification: if self.stds: samples_std = np.asarray(samples_std) Xs = np.hstack([Xs, samples_std]) - Xs = self.norm.fit_transform(Xs) + return Xs, ys + + def generate_samples_npp(self, val): + samples_mean = [] + samples_std = [] + Xs = [] + ys = [] + for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples): + ys.append(sample.prevalence()[:, 1]) + Xs.append(self.estimator.quantify(sample.instances)[:, 1]) + if self.means: + samples_mean.append(sample.instances.mean(axis=0).getA().flatten()) + if self.stds: + samples_std.append(sample.instances.todense().std(axis=0).getA().flatten()) + return self._prepare_arrays(Xs, ys, samples_mean, samples_std) + + def generate_samples_app(self, val): + samples_mean = [] + samples_std = [] + Xs = [] + ys = [] + ncats = len(self.classes_) + nprevs = 21 + repeats = max(self.n_samples // (ncats * nprevs), 1) + for cat in self.classes_: + for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, 
n_prevalences=nprevs, repeats=repeats): + ys.append(sample.prevalence()[:, 1]) + Xs.append(self.estimator.quantify(sample.instances)[:, 1]) + if self.means: + samples_mean.append(sample.instances.mean(axis=0).getA().flatten()) + if self.stds: + samples_std.append(sample.instances.todense().std(axis=0).getA().flatten()) + return self._prepare_arrays(Xs, ys, samples_mean, samples_std) + + def fit(self, data:MultilabelledCollection): + self.classes_ = data.classes_ + tr, val = data.train_test_split() + self.estimator.fit(tr) + if self.protocol == 'npp': + Xs, ys = self.generate_samples_npp(val) + elif self.protocol == 'app': + Xs, ys = self.generate_samples_app(val) + # Xs = self.norm.fit_transform(Xs) self.reg.fit(Xs, ys) return self @@ -213,9 +250,9 @@ class MultilabelRegressionQuantification: if self.stds: sample_std = instances.todense().std(axis=0).getA() Xs = np.hstack([Xs, sample_std]) - Xs = self.norm.transform(Xs) + # Xs = self.norm.transform(Xs) Xs = self.reg.predict(Xs) - Xs = self.norm.inverse_transform(Xs) + # Xs = self.norm.inverse_transform(Xs) adjusted = np.clip(Xs, 0, 1) adjusted = adjusted.flatten() neg_prevs = 1-adjusted diff --git a/MultiLabel/multi_label.py b/MultiLabel/multi_label.py index 5fe8ae9..bf413cd 100644 --- a/MultiLabel/multi_label.py +++ b/MultiLabel/multi_label.py @@ -6,7 +6,7 @@ from tqdm import tqdm import quapy as qp from MultiLabel.mlclassification import MultilabelStackedClassifier from MultiLabel.mldata import MultilabelledCollection -from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MultilabelRegressionQuantification, \ +from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \ MLACC, \ MLPACC, MultilabelNaiveAggregativeQuantifier from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy @@ -44,11 +44,23 @@ def models(): # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random')) # yield 'ChainACC', MLACC(ClassifierChain(cls(), 
cv=None, order='random')) # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random')) - common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False} - yield 'MRQ-CC', MultilabelRegressionQuantification(base_quantifier=CC(cls()), regression='svr', **common) - yield 'MRQ-PCC', MultilabelRegressionQuantification(base_quantifier=PCC(cls()), regression='svr', **common) - yield 'MRQ-ACC', MultilabelRegressionQuantification(base_quantifier=ACC(cls()), regression='svr', **common) - yield 'MRQ-PACC', MultilabelRegressionQuantification(base_quantifier=PACC(cls()), regression='svr', **common) + common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'} + # yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common) + # yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common) + # yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common) + # yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common) + # yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common) + # yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common) + # yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common) + # yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common) + yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common) + yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common) + yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common) + yield 'MRQ-StackPACC-app', 
MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common) + # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common) + # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common) + # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common) + # yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common) dataset = 'reuters21578' diff --git a/MultiLabel/results.txt b/MultiLabel/results.txt new file mode 100644 index 0000000..9243caa --- /dev/null +++ b/MultiLabel/results.txt @@ -0,0 +1,79 @@ +num categories = 10 +Train-counts: [1650 181 389 2877 433 347 538 197 369 212] +Test-counts: [ 719 56 189 1087 149 131 179 89 117 71] +MLPE: 0.01101 + +NPP: +NaiveCC mae=0.01718 +NaivePCC mae=0.00898 +NaiveACC mae=0.01560 +NaivePACC mae=0.01062 + +StackCC mae=0.00790 +StackPCC mae=0.00659 ** +StackACC mae=0.00913 +StackPACC mae=0.00771 + +ChainCC mae=0.01644 +ChainPCC mae=0.00924 +ChainACC mae=0.01767 +ChainPACC mae=0.01140 + +MRQ-CC mae=0.01130 +MRQ-PCC mae=0.00941 +MRQ-ACC mae=0.01153 +MRQ-PACC mae=0.01000 + +MRQ-StackCC mae=0.00757 +MRQ-StackPCC mae=0.00652 ** +MRQ-StackACC mae=0.00799 +MRQ-StackPACC mae=0.00763 + +MRQ-StackCC-app mae=0.00791 +MRQ-StackPCC-app mae=0.00840 +MRQ-StackACC-app mae=0.00910 +MRQ-StackPACC-app mae=0.00941 + +MRQ-ChainCC mae=0.00989 +MRQ-ChainPCC mae=0.00916 +MRQ-ChainACC mae=0.01251 +MRQ-ChainPACC mae=0.00954 + +APP: +NaiveCC mae=0.04120 +NaivePCC mae=0.03741 +NaiveACC mae=0.03202 +NaivePACC mae=0.02293 + +StackCC mae=0.01969 +StackPCC mae=0.01871 +StackACC mae=0.01386 ** +StackPACC mae=0.01267 ** + +ChainCC mae=0.04136 +ChainPCC mae=0.03571 +ChainACC mae=0.03622 +ChainPACC mae=0.02659 + +MRQ-CC mae=0.04356 +MRQ-PCC mae=0.02532 +MRQ-ACC mae=0.05716 +MRQ-PACC mae=0.02936 + +MRQ-StackCC mae=0.02448 +MRQ-StackPCC mae=0.02090 +MRQ-StackACC mae=0.02579 +MRQ-StackPACC mae=0.02388 + 
+MRQ-StackCC-app mae=0.01535 +MRQ-StackPCC-app mae=0.01457 +MRQ-StackACC-app mae=0.01441 +MRQ-StackPACC-app mae=0.01633 + +MRQ-ChainCC mae=0.04874 +MRQ-ChainPCC mae=0.02537 +MRQ-ChainACC mae=0.06262 +MRQ-ChainPACC mae=0.02906 + + +