# Names of the quantification methods reported in the generated tables.
# save_table() emits one tabular column per entry (it sizes the LaTeX column
# spec with len(models)), so adding or removing a name here changes the table
# width. Commented-out names are methods currently left out of the tables.
models = [
    # 'MLPE',
    'NaiveCC', 'NaivePCC', 'NaivePCCcal', 'NaiveACC', 'NaivePACC', 'NaivePACCcal', 'NaiveACCit', 'NaivePACCit',
    # 'NaiveHDy', 'NaiveSLD',
    'ChainCC', 'ChainPCC', 'ChainACC', 'ChainPACC',
    # 'StackPACCit' used to be written as 'StackP' / 'ACCit' on two physical
    # lines and only worked through implicit string-literal concatenation.
    'StackCC', 'StackPCC', 'StackPCCcal', 'StackACC', 'StackPACC', 'StackPACCcal', 'StackACCit', 'StackPACCit',
    'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC', 'MRQ-ACCit', 'MRQ-PACCit',
    'StackMRQ-CC', 'StackMRQ-PCC', 'StackMRQ-ACC', 'StackMRQ-PACC',
    'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
    'StackMRQ-StackCC', 'StackMRQ-StackPCC', 'StackMRQ-StackACC', 'StackMRQ-StackPACC',
    'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
    'StackMRQ-StackCC-app', 'StackMRQ-StackPCC-app', 'StackMRQ-StackACC-app', 'StackMRQ-StackPACC-app',
    'LSP-CC', 'LSP-ACC', 'MLKNN-CC', 'MLKNN-ACC',
    'MLAdjustedC', 'MLStackAdjustedC', 'MLprobAdjustedC', 'MLStackProbAdjustedC',
]
foo.write(tabular) - if __name__ == '__main__': parser = argparse.ArgumentParser(description='Experiments for multi-label quantification') parser.add_argument('--results', type=str, default='./results', metavar='str', diff --git a/MultiLabel/main.py b/MultiLabel/main.py index 8941340..65b2e76 100644 --- a/MultiLabel/main.py +++ b/MultiLabel/main.py @@ -2,6 +2,8 @@ import argparse from sklearn.calibration import CalibratedClassifierCV from sklearn.linear_model import LogisticRegression import itertools + +from sklearn.multiclass import OneVsRestClassifier from sklearn.multioutput import ClassifierChain from tqdm import tqdm from skmultilearn.dataset import load_dataset, available_data_sets @@ -11,7 +13,7 @@ from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion, from MultiLabel.mldata import MultilabelledCollection from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \ MLACC, \ - MLPACC, MLNaiveAggregativeQuantifier, MLMLPE + MLPACC, MLNaiveAggregativeQuantifier, MLMLPE, StackMLRQuantifier, MLadjustedCount, MLprobAdjustedCount from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy import numpy as np from data.dataset import Dataset @@ -49,23 +51,33 @@ def models(): yield 'MLPE', MLMLPE() yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls())) yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls())) + yield 'NaivePCCcal', MLNaiveAggregativeQuantifier(PCC(calibratedCls())) yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls())) yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls())) + yield 'NaivePACCcal', MLNaiveAggregativeQuantifier(PACC(calibratedCls())) + yield 'NaiveACCit', MLNaiveAggregativeQuantifier(ACC(cls())) + yield 'NaivePACCit', MLNaiveAggregativeQuantifier(PACC(cls())) # yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls())) # yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls())) yield 'StackCC', MLCC(MLStackedClassifier(cls())) yield 'StackPCC', 
MLPCC(MLStackedClassifier(cls())) + yield 'StackPCCcal', MLPCC(MLStackedClassifier(calibratedCls())) yield 'StackACC', MLACC(MLStackedClassifier(cls())) yield 'StackPACC', MLPACC(MLStackedClassifier(cls())) - # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random')) - # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random')) - # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random')) - # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random')) + yield 'StackPACCcal', MLPACC(MLStackedClassifier(calibratedCls())) + yield 'StackACCit', MLACC(MLStackedClassifier(cls())) + yield 'StackPACCit', MLPACC(MLStackedClassifier(cls())) + # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None)) + # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None)) + # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None)) + # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None)) common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'} yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common) yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common) yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common) yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common) + yield 'MRQ-ACCit', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common) + yield 'MRQ-PACCit', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common) yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common) yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common) yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common) @@ -74,6 +86,23 @@ def models(): yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), 
protocol='app', **common) yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common) yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common) + yield 'StackMRQ-CC', StackMLRQuantifier(MLNaiveQuantifier(CC(cls())), **common) + yield 'StackMRQ-PCC', StackMLRQuantifier(MLNaiveQuantifier(PCC(cls())), **common) + yield 'StackMRQ-ACC', StackMLRQuantifier(MLNaiveQuantifier(ACC(cls())), **common) + yield 'StackMRQ-PACC', StackMLRQuantifier(MLNaiveQuantifier(PACC(cls())), **common) + yield 'StackMRQ-StackCC', StackMLRQuantifier(MLCC(MLStackedClassifier(cls())), **common) + yield 'StackMRQ-StackPCC', StackMLRQuantifier(MLPCC(MLStackedClassifier(cls())), **common) + yield 'StackMRQ-StackACC', StackMLRQuantifier(MLACC(MLStackedClassifier(cls())), **common) + yield 'StackMRQ-StackPACC', StackMLRQuantifier(MLPACC(MLStackedClassifier(cls())), **common) + yield 'StackMRQ-StackCC-app', StackMLRQuantifier(MLCC(MLStackedClassifier(cls())), protocol='app', **common) + yield 'StackMRQ-StackPCC-app', StackMLRQuantifier(MLPCC(MLStackedClassifier(cls())), protocol='app', **common) + yield 'StackMRQ-StackACC-app', StackMLRQuantifier(MLACC(MLStackedClassifier(cls())), protocol='app', **common) + yield 'StackMRQ-StackPACC-app', StackMLRQuantifier(MLPACC(MLStackedClassifier(cls())), protocol='app', **common) + yield 'MLAdjustedC', MLadjustedCount(OneVsRestClassifier(cls())) + yield 'MLStackAdjustedC', MLadjustedCount(MLStackedClassifier(cls())) + # yield 'MLprobAdjustedC', MLprobAdjustedCount(OneVsRestClassifier(calibratedCls())) + # yield 'MLStackProbAdjustedC', MLprobAdjustedCount(MLStackedClassifier(calibratedCls())) + # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common) # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common) # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), 
class MLStackedRegressor:
    """Two-level (stacked) regressor for multi-label targets.

    A first-level regressor is trained on the inputs; a second-level
    regressor is then trained on the first level's predictions for those same
    training inputs. At prediction time the two are applied in sequence.

    Parameters
    ----------
    base_regressor : object exposing ``fit(X, y)`` and ``predict(X)``, optional
        Prototype used for BOTH levels. It is deep-copied twice, so the object
        handed in is never fitted or modified. ``None`` (the default) stands
        for ``Ridge(normalize=True)``; it is built when the instance is
        created rather than in the signature, so importing this module does
        not instantiate an estimator.
        NOTE(review): newer scikit-learn releases dropped ``Ridge``'s
        ``normalize`` argument -- confirm against the pinned version.

    Attributes
    ----------
    base : first-level regressor (independent copy of the prototype).
    meta : second-level regressor (independent copy of the prototype).
    """

    def __init__(self, base_regressor=None):
        if base_regressor is None:
            base_regressor = Ridge(normalize=True)
        self.base = deepcopy(base_regressor)
        self.meta = deepcopy(base_regressor)

    def fit(self, X, y):
        """Fit both levels and return ``self``.

        ``y`` must be two-dimensional (one column per label); anything else
        trips the assertion below.
        """
        assert y.ndim==2, 'the dataset does not seem to be multi-label'
        self.base.fit(X, y)
        # The meta level is trained on in-sample predictions of the base level
        # (same X that the base level was just fitted on).
        R = self.base.predict(X)
        # NOTE: normalising R before the meta level is currently disabled
        # (there is no `self.norm` attribute on this class).
        self.meta.fit(R, y)
        return self

    def predict(self, X):
        """Return the meta-level predictions for ``X`` (base, then meta)."""
        R = self.base.predict(X)
        return self.meta.predict(R)
def train_test_split(self, train_prop=0.6, random_state=None, iterative=False):
    """Split this collection into a training part and a test part.

    Method of ``MultilabelledCollection``.

    :param train_prop: fraction of the instances assigned to the training part
    :param random_state: forwarded to scikit-learn's ``train_test_split``;
        only used when ``iterative`` is False
    :param iterative: if True, delegate to scikit-multilearn's
        ``iterative_train_test_split`` with ``test_size=1-train_prop``
    :return: tuple ``(training, test)`` of ``MultilabelledCollection``
    """
    if not iterative:
        # scikit-learn returns X_train, X_test, y_train, y_test
        X_tr, X_te, y_tr, y_te = train_test_split(
            self.instances, self.labels, train_size=train_prop, random_state=random_state
        )
    else:
        # scikit-multilearn returns X_train, y_train, X_test, y_test (different order)
        # NOTE(review): random_state is not passed on this path -- confirm
        # whether the iterative split needs separate seeding.
        X_tr, y_tr, X_te, y_te = iterative_train_test_split(
            self.instances, self.labels, test_size=1-train_prop
        )
    training = MultilabelledCollection(X_tr, y_tr)
    test = MultilabelledCollection(X_te, y_te)
    return training, test
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, Mult ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor import quapy as qp -from MultiLabel.mlclassification import MLStackedClassifier +from MultiLabel.mlclassification import MLStackedClassifier, MLStackedRegressor from MultiLabel.mldata import MultilabelledCollection from method.aggregative import CC, ACC, PACC, AggregativeQuantifier from method.base import BaseQuantifier @@ -166,13 +168,17 @@ class MLRegressionQuantification: norm=True, means=True, stds=True): - assert regression in ['ridge', 'svr'], 'unknown regression model' + assert protocol in ['npp', 'app'], 'unknown protocol' self.estimator = mlquantifier - if regression == 'ridge': - self.reg = Ridge(normalize=norm) - elif regression == 'svr': - self.reg = MultiOutputRegressor(LinearSVR()) + if isinstance(regression, str): + assert regression in ['ridge', 'svr'], 'unknown regression model' + if regression == 'ridge': + self.reg = Ridge(normalize=norm) + elif regression == 'svr': + self.reg = MultiOutputRegressor(LinearSVR()) + else: + self.reg = regression self.protocol = protocol # self.reg = MultiTaskLassoCV(normalize=norm) # self.reg = KernelRidge(kernel='rbf') @@ -215,7 +221,7 @@ class MLRegressionQuantification: Xs, ys = [], [] samples_mean, samples_std = [], [] for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples): - self._extract_features(self, sample, Xs, ys, samples_mean, samples_std) + self._extract_features(sample, Xs, ys, samples_mean, samples_std) return self._prepare_arrays(Xs, ys, samples_mean, samples_std) @@ -227,7 +233,7 @@ class MLRegressionQuantification: repeats = max(self.n_samples // (ncats * nprevs), 1) for cat in self.classes_: for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats): - self._extract_features(self, sample, 
class StackMLRQuantifier:
    """Regression-based multi-label quantifier with a stacked regressor.

    Thin wrapper: it builds an ``MLRegressionQuantification`` whose regressor
    is an ``MLStackedRegressor`` and delegates ``fit`` / ``quantify`` to it.

    Parameters
    ----------
    mlquantifier : multi-label quantifier, optional
        Base quantifier handed to ``MLRegressionQuantification``. ``None``
        (default) stands for ``MLNaiveQuantifier(CC(LinearSVC()))``; it is
        created per instance, because the wrapped object stores the quantifier
        it is given and a signature-level default would be shared by every
        default-constructed ``StackMLRQuantifier``.
    regression : {'ridge', 'svr'}
        Which regressor is wrapped in the ``MLStackedRegressor``.
    protocol, n_samples, sample_size, norm, means, stds
        Forwarded unchanged to ``MLRegressionQuantification``.
        NOTE(review): the 'ridge' branch always uses ``normalize=True`` and
        does not look at ``norm`` -- confirm this is intended.

    Raises
    ------
    ValueError
        If ``regression`` is neither 'ridge' nor 'svr'.
    """

    def __init__(self,
                 mlquantifier=None,
                 regression='ridge',
                 protocol='npp',
                 n_samples=500,
                 sample_size=500,
                 norm=True,
                 means=True,
                 stds=True):
        if mlquantifier is None:
            mlquantifier = MLNaiveQuantifier(CC(LinearSVC()))

        if regression == 'ridge':
            reg = MLStackedRegressor(Ridge(normalize=True))
        elif regression == 'svr':
            reg = MLStackedRegressor(MultiOutputRegressor(LinearSVR()))
        else:
            # The exception used to be built but not raised, so an unknown
            # name only failed further down on the unbound `reg`.
            raise ValueError(f'unknown regressor {regression}')

        self.base = MLRegressionQuantification(
            mlquantifier=mlquantifier,
            regression=reg,
            protocol=protocol,
            n_samples=n_samples,
            sample_size=sample_size,
            norm=norm,
            means=means,
            stds=stds)

    def fit(self, data:MultilabelledCollection):
        """Record the classes of ``data``, fit the wrapped quantifier, return ``self``."""
        self.classes_ = data.classes_
        self.base.fit(data)
        return self

    def quantify(self, instances):
        """Return whatever the wrapped ``MLRegressionQuantification`` estimates for ``instances``."""
        return self.base.quantify(instances)


class MLadjustedCount(MLAggregativeQuantifier):
    """Adjusted-count style quantifier driven by hard multi-label predictions.

    ``fit`` holds out part of the data, trains ``learner`` on the rest and
    estimates, on the held-out part, a matrix relating predicted labels to
    true labels. ``aggregate`` uses that matrix to correct the predictions.

    Parameters
    ----------
    learner : multi-label classifier exposing ``fit(X, y)`` and ``predict(X)``.

    Attributes
    ----------
    Pte_cond_estim_ : set by ``fit``; entry (i, j) is the joint validation
        frequency of "label i predicted and label j true" divided by the
        frequency with which label i is predicted. Rows of labels that are
        never predicted on the validation split are all zeros.
    """

    def __init__(self, learner):
        self.learner = learner

    def preclassify(self, instances):
        """Return the learner's hard predictions for ``instances``."""
        return self.learner.predict(instances)

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        """Train the learner and estimate the correction matrix; return ``self``.

        :param data: labelled multi-label collection
        :param train_prop: fraction of ``data`` used to train the learner; the
            remainder is the validation split used for the estimates
        """
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_predictions = self.preclassify(val.instances)
        val_true = val.labels

        # assumes predictions and labels are (n_instances, n_labels) -- TODO confirm
        N = len(val)
        C = val_predictions.T.dot(val_true) / N  # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
        priorP = val_predictions.mean(axis=0).reshape(-1,1)  # priors [P(hat{y}1), P(hat{y}2), ...]
        # cond probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]
        # `where=` without `out=` leaves the masked-out entries uninitialised,
        # so the rows with a zero prior are given an explicit value (0).
        self.Pte_cond_estim_ = np.true_divide(
            C, priorP, out=np.zeros_like(C, dtype=float), where=priorP>0)

        return self

    def aggregate(self, predictions):
        """Turn per-instance predictions into per-label prevalence estimates.

        Each row of ``predictions`` is L1-normalised, multiplied by
        ``Pte_cond_estim_`` and the result is averaged over rows.

        :return: array with one row per label and two columns,
            ``[1 - adjusted, adjusted]``
        """
        P = sklearn.preprocessing.normalize(predictions, norm='l1')
        correction = P.dot(self.Pte_cond_estim_)
        adjusted = correction.mean(axis=0)
        return np.asarray([1-adjusted, adjusted]).T


class MLprobAdjustedCount(MLAggregativeQuantifier):
    """Variant of ``MLadjustedCount`` fed with posterior probabilities.

    Parameters
    ----------
    learner : multi-label classifier exposing ``fit(X, y)`` and
        ``predict_proba(X)``.

    Attributes
    ----------
    Pte_cond_estim_ : set by ``fit``; see the note in ``fit`` on how it is
        computed. Rows whose prior is zero are all zeros.
    """

    def __init__(self, learner):
        self.learner = learner

    def preclassify(self, instances):
        """Return the learner's posterior probabilities for ``instances``."""
        return self.learner.predict_proba(instances)

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        """Train the learner and estimate the correction matrix; return ``self``.

        :param data: labelled multi-label collection
        :param train_prop: fraction of ``data`` used to train the learner; the
            remainder is the validation split used for the estimates
        """
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_predictions = self.preclassify(val.instances)
        val_true = val.labels

        N = len(val)

        # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
        # NOTE(review): the joint counts use decisions thresholded at 0.5 while
        # the priors below use the mean posterior; the original author marked
        # this as "not sure..." -- left unchanged, to be confirmed.
        C = (val_predictions>0.5).T.dot(val_true) / N

        priorP = val_predictions.mean(axis=0).reshape(-1,1)  # priors [P(hat{y}1), P(hat{y}2), ...]
        # cond probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]
        # `where=` without `out=` leaves the masked-out entries uninitialised,
        # so the rows with a zero prior are given an explicit value (0).
        self.Pte_cond_estim_ = np.true_divide(
            C, priorP, out=np.zeros_like(C, dtype=float), where=priorP>0)

        return self

    def aggregate(self, predictions):
        """Turn per-instance posteriors into per-label prevalence estimates.

        Each row of ``predictions`` is L1-normalised, multiplied by
        ``Pte_cond_estim_`` and the result is averaged over rows.

        :return: array with one row per label and two columns,
            ``[1 - adjusted, adjusted]``
        """
        P = sklearn.preprocessing.normalize(predictions, norm='l1')
        correction = P.dot(self.Pte_cond_estim_)
        adjusted = correction.mean(axis=0)
        return np.asarray([1-adjusted, adjusted]).T