forked from moreo/QuaPy

commit 4572ec266d
parent dc2fa05cf8

adding multi-label classification methods
@@ -7,12 +7,12 @@ from tqdm import tqdm
 from skmultilearn.dataset import load_dataset, available_data_sets
 from scipy.sparse import csr_matrix
 import quapy as qp
-from MultiLabel.main import load_results
+from MultiLabel.main import load_results, SKMULTILEARN_RED_DATASETS, TC_DATASETS, sample_size
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
-from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
+from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
     MLACC, \
-    MLPACC, MultilabelNaiveAggregativeQuantifier
+    MLPACC, MLNaiveAggregativeQuantifier
 from MultiLabel.tabular import Table
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
 import numpy as np

@@ -22,29 +22,56 @@ import sys
 import os
 import pickle

-models = ['NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', 'NaiveHDy', 'NaiveSLD']
-datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
+models = [#'MLPE',
+    'NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', #'NaiveHDy', 'NaiveSLD',
+    'StackCC', 'StackPCC', 'StackACC', 'StackPACC',
+    'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC',
+    'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
+    'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
+    'LSP-CC', 'LSP-ACC'
+]
+
+# datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
+datasets = TC_DATASETS


 def generate_table(path, protocol, error):
-    print(f'generating {path}')
-    table = Table(datasets, models)
-    for dataset, model in itertools.product(datasets, models):
+
+    def compute_score_job(args):
+        dataset, model = args
         result_path = f'{opt.results}/{dataset}_{model}.pkl'
         if os.path.exists(result_path):
+            print('+', end='')
+            sys.stdout.flush()
             result = load_results(result_path)
             true_prevs, estim_prevs = result[protocol]
             scores = np.asarray([error(trues, estims) for trues, estims in zip(true_prevs, estim_prevs)]).flatten()
+            return dataset, model, scores
+        print('-', end='')
+        sys.stdout.flush()
+        return None
+
+    print(f'\ngenerating {path}')
+    table = Table(datasets, models, prec_mean=4, significance_test='wilcoxon')
+    results = qp.util.parallel(compute_score_job, list(itertools.product(datasets, models)), n_jobs=-1)
+    print()
+
+    for r in results:
+        if r is not None:
+            dataset, model, scores = r
             table.add(dataset, model, scores)

     tabular = """
     \\resizebox{\\textwidth}{!}{%
     \\begin{tabular}{|c||""" + ('c|' * len(models)) + """} \hline
     """
-    dataset_replace = {'tmc2007_500': 'tmc2007\_500'}
+    dataset_replace = {'tmc2007_500': 'tmc2007\_500', 'tmc2007_500-red': 'tmc2007\_500-red'}
     method_replace = {}

-    tabular += table.latexTabular(benchmark_replace=dataset_replace, method_replace=method_replace)
+    tabular += table.latexTabularT(benchmark_replace=dataset_replace, method_replace=method_replace, side=True)
     tabular += """
     \end{tabular}%
     }

@@ -61,13 +88,17 @@ if __name__ == '__main__':
                         help=f'path where to store the tables')
     opt = parser.parse_args()

-    os.makedirs(opt.results, exist_ok=True)
+    assert os.path.exists(opt.results), f'result directory {opt.results} does not exist'
     os.makedirs(opt.tablepath, exist_ok=True)

-    eval_error = qp.error.ae
-    generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=eval_error)
-    generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=eval_error)
+    qp.environ["SAMPLE_SIZE"] = sample_size
+    absolute_error = qp.error.ae
+    relative_absolute_error = qp.error.rae
+
+    generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=absolute_error)
+    generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=absolute_error)
+    generate_table(f'{opt.tablepath}/npp.rae.tex', protocol='npp', error=relative_absolute_error)
+    generate_table(f'{opt.tablepath}/app.rae.tex', protocol='app', error=relative_absolute_error)
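The refactor above turns each (dataset, model) cell of the table into an independent job: compute_score_job loads the pickled results for the pair, computes per-sample errors under the requested protocol, and returns (dataset, model, scores) or None; qp.util.parallel then fans the jobs out before the scores are funnelled into the Table. A minimal sketch of the same fan-out/collect pattern with plain multiprocessing (score_one, the dataset and model names, and the placeholder scores are illustrative, not from the commit):

import itertools
from multiprocessing import Pool

def score_one(pair):
    dataset, model = pair
    # ... load f'{results}/{dataset}_{model}.pkl' and compute per-sample errors ...
    return dataset, model, [0.0]  # placeholder scores

if __name__ == '__main__':
    pairs = list(itertools.product(['rcv1', 'ohsumed'], ['NaiveCC', 'StackCC']))
    with Pool() as pool:
        results = pool.map(score_one, pairs)
    for r in results:
        if r is not None:
            dataset, model, scores = r  # ready for Table.add(dataset, model, scores)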
MultiLabel/main.py

@@ -7,11 +7,11 @@ from tqdm import tqdm
 from skmultilearn.dataset import load_dataset, available_data_sets
 from scipy.sparse import csr_matrix
 import quapy as qp
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion, MLTwinSVM, MLknn
 from MultiLabel.mldata import MultilabelledCollection
-from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
+from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
     MLACC, \
-    MLPACC, MultilabelNaiveAggregativeQuantifier
+    MLPACC, MLNaiveAggregativeQuantifier, MLMLPE
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
 import numpy as np
 from data.dataset import Dataset

@@ -35,80 +35,136 @@ def calibratedCls():
 sample_size = 100
 n_samples = 5000

+SKMULTILEARN_ALL_DATASETS = sorted(set([x[0] for x in available_data_sets().keys()]))
+SKMULTILEARN_RED_DATASETS = [x+'-red' for x in SKMULTILEARN_ALL_DATASETS]
+TC_DATASETS = ['reuters21578', 'jrcall', 'ohsumed', 'rcv1']
+
+DATASETS = TC_DATASETS


 def models():
-    yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls()))
-    yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
-    yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
-    yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
-    # yield 'NaiveHDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
-    # yield 'NaiveSLD', MultilabelNaiveAggregativeQuantifier(EMQ(calibratedCls()))
-    yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
-    yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
-    yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
-    yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
+    yield 'MLPE', MLMLPE()
+    yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls()))
+    yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls()))
+    yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls()))
+    yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls()))
+    # yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls()))
+    # yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls()))
+    yield 'StackCC', MLCC(MLStackedClassifier(cls()))
+    yield 'StackPCC', MLPCC(MLStackedClassifier(cls()))
+    yield 'StackACC', MLACC(MLStackedClassifier(cls()))
+    yield 'StackPACC', MLPACC(MLStackedClassifier(cls()))
     # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
     common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
-    yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
-    yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
-    yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
-    yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
-    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
-    # yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common)
+    yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common)
+    yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
+    yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
+    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
     # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
+    # yield 'LSP-CC', MLCC(LabelSpacePartion(cls()))
+    # yield 'LSP-ACC', MLACC(LabelSpacePartion(cls()))
+    # yield 'TwinSVM-CC', MLCC(MLTwinSVM())
+    # yield 'TwinSVM-ACC', MLACC(MLTwinSVM())
+    yield 'MLKNN-CC', MLCC(MLknn())
+    yield 'MLKNN-PCC', MLPCC(MLknn())
+    yield 'MLKNN-ACC', MLACC(MLknn())
+    yield 'MLKNN-PACC', MLPACC(MLknn())


-# dataset = 'reuters21578'
-# picklepath = '/home/moreo/word-class-embeddings/pickles'
-# data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
-# Xtr, Xte = data.vectorize()
-# ytr = data.devel_labelmatrix.todense().getA()
-# yte = data.test_labelmatrix.todense().getA()
-
-# remove categories with < 10 training documents
-# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
-# ytr = ytr[:, to_keep]
-# yte = yte[:, to_keep]
-# print(f'num categories = {ytr.shape[1]}')
-
-
-def datasets():
-    dataset_list = sorted(set([x[0] for x in available_data_sets().keys()]))
-    for dataset_name in dataset_list:
-        yield dataset_name
-
-
-def get_dataset(dataset_name):
-    Xtr, ytr, feature_names, label_names = load_dataset(dataset_name, 'train')
-    Xte, yte, _, _ = load_dataset(dataset_name, 'test')
-    print(f'n-labels = {len(label_names)}')
-
-    Xtr = csr_matrix(Xtr)
-    Xte = csr_matrix(Xte)
-
-    ytr = ytr.todense().getA()
-    yte = yte.todense().getA()
-
-    # remove categories without positives in the training or test splits
-    valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
-    ytr = ytr[:, valid_categories]
-    yte = yte[:, valid_categories]
+def get_dataset(dataset_name, dopickle=True):
+    datadir = f'{qp.util.get_quapy_home()}/pickles'
+    datapath = f'{datadir}/{dataset_name}.pkl'
+    if dopickle:
+        if os.path.exists(datapath):
+            print(f'returning pickled object in {datapath}')
+            return pickle.load(open(datapath, 'rb'))
+
+    if dataset_name in SKMULTILEARN_ALL_DATASETS + SKMULTILEARN_RED_DATASETS:
+        clean_name = dataset_name.replace('-red','')
+        Xtr, ytr, feature_names, label_names = load_dataset(clean_name, 'train')
+        Xte, yte, _, _ = load_dataset(clean_name, 'test')
+        print(f'n-labels = {len(label_names)}')
+
+        Xtr = csr_matrix(Xtr)
+        Xte = csr_matrix(Xte)
+
+        ytr = ytr.todense().getA()
+        yte = yte.todense().getA()
+
+        if dataset_name.endswith('-red'):
+            TO_SELECT = 10
+            nC = ytr.shape[1]
+            tr_counts = ytr.sum(axis=0)
+            te_counts = yte.sum(axis=0)
+            if nC > TO_SELECT:
+                Y = ytr.T.dot(ytr)  # class-class coincidence matrix
+                Y[np.triu_indices(nC)] = 0  # zeroing all duplicates entries and the diagonal
+                order_ij = np.argsort(-Y, axis=None)
+                selected = set()
+                p=0
+                while len(selected) < TO_SELECT:
+                    highest_index = order_ij[p]
+                    class_i = highest_index // nC
+                    class_j = highest_index % nC
+                    # if there is only one class to go, then add the most populated one
+                    most_populated, least_populated = (class_i, class_j) if tr_counts[class_i] > tr_counts[class_j] else (class_j, class_i)
+                    if te_counts[most_populated]>0:
+                        selected.add(most_populated)
+                    if len(selected) < TO_SELECT:
+                        if te_counts[least_populated]>0:
+                            selected.add(least_populated)
+                    p+=1
+                selected = np.asarray(sorted(selected))
+                ytr = ytr[:,selected]
+                yte = yte[:, selected]
+        # else:
+            # remove categories without positives in the training or test splits
+            # valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
+            # ytr = ytr[:, valid_categories]
+            # yte = yte[:, valid_categories]
+
+    elif dataset_name in TC_DATASETS:
+        picklepath = '/home/moreo/word-class-embeddings/pickles'
+        data = Dataset.load(dataset_name, pickle_path=f'{picklepath}/{dataset_name}.pickle')
+        Xtr, Xte = data.vectorize()
+        ytr = data.devel_labelmatrix.todense().getA()
+        yte = data.test_labelmatrix.todense().getA()
+
+        # remove categories with < 50 training or test documents
+        # to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
+        # keep the 10 most populated categories
+        to_keep = np.argsort(ytr.sum(axis=0))[-10:]
+        ytr = ytr[:, to_keep]
+        yte = yte[:, to_keep]
+        print(f'num categories = {ytr.shape[1]}')
+
+    else:
+        raise ValueError(f'unknown dataset {dataset_name}')

     train = MultilabelledCollection(Xtr, ytr)
     test = MultilabelledCollection(Xte, yte)
+
+    if dopickle:
+        os.makedirs(datadir, exist_ok=True)
+        pickle.dump((train, test), open(datapath, 'wb'), pickle.HIGHEST_PROTOCOL)
+
     return train, test
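In the '-red' (reduced) branch above, the commit keeps the TO_SELECT=10 labels that co-occur most strongly: the coincidence matrix Y = ytr.T.dot(ytr) counts, for every label pair, how many training documents carry both labels; its lower triangle is ranked and pairs are added greedily, most populated label first, and only if the label has positives in the test set. A simplified, self-contained sketch of that selection on a toy label matrix (the te_counts guard is dropped here for brevity):

import numpy as np

# Toy multi-label matrix: 6 documents x 4 labels (1 = document has the label)
ytr = np.array([[1, 1, 0, 0],
                [1, 1, 0, 0],
                [1, 0, 1, 0],
                [0, 1, 0, 0],
                [1, 1, 1, 0],
                [0, 0, 0, 1]])
TO_SELECT = 2
nC = ytr.shape[1]

Y = ytr.T.dot(ytr)           # class-class coincidence (co-occurrence) counts
Y[np.triu_indices(nC)] = 0   # zero the diagonal and the duplicate upper triangle
order_ij = np.argsort(-Y, axis=None)  # flat pair indices, most co-occurring first

selected, p = set(), 0
while len(selected) < TO_SELECT:
    class_i, class_j = divmod(order_ij[p], nC)  # unflatten row/column of the pair
    selected.update([class_i, class_j])
    p += 1
print(sorted(selected))  # the labels of the most co-occurring pair, here [0, 1]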
@@ -176,8 +232,8 @@ def run_experiment(dataset_name, model_name, model):

     print(f'runing experiment {dataset_name} x {model_name}')
     train, test = get_dataset(dataset_name)
-    if train.n_classes>100:
-        return
+    # if train.n_classes>100:
+    #     return

     print_info(train, test)

@@ -186,8 +242,6 @@ def run_experiment(dataset_name, model_name, model):
     results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
     results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5)
     save_results(results_npp, results_app, result_path)
-    results_npp2, results_app2 = load_results(result_path)
-    print('pass')


 if __name__ == '__main__':

@@ -198,7 +252,7 @@ if __name__ == '__main__':

     os.makedirs(opt.results, exist_ok=True)

-    for datasetname, (modelname,model) in itertools.product(datasets(), models()):
+    for datasetname, (modelname,model) in itertools.product(DATASETS, models()):
         run_experiment(datasetname, modelname, model)
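run_experiment evaluates every model under two protocols: npp (natural prevalence prediction) draws 100 test samples as they come, while app (artificial prevalence prediction) varies each category's prevalence over an 11-point grid with 5 samples per point, probing behaviour under prior probability shift. A sketch of the prevalence grid that app sweeps per category (the helper name is illustrative; the real logic lives in ml_artificial_prevalence_prediction):

import numpy as np

def app_grid(n_prevalences=11, repeats=5):
    # evenly spaced target prevalences in [0, 1], each requested `repeats` times
    for prev in np.linspace(0., 1., n_prevalences):
        for _ in range(repeats):
            yield prev

grid = list(app_grid())
print(len(grid), grid[:6])  # 55 samples per category: [0.0 x 5, 0.1, ...]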
MultiLabel/mlclassification.py

@@ -4,9 +4,19 @@ from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.preprocessing import StandardScaler
+from skmultilearn.adapt import MLTSVM
+
+from skmultilearn.ensemble import LabelSpacePartitioningClassifier
+from skmultilearn.problem_transform import LabelPowerset
+from skmultilearn.cluster import NetworkXLabelGraphClusterer, LabelCooccurrenceGraphBuilder
+
+from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
+from sklearn.manifold import SpectralEmbedding
+from sklearn.ensemble import RandomForestRegressor
+from skmultilearn.adapt import MLkNN


-class MultilabelStackedClassifier:  # aka Funnelling Monolingual
+class MLStackedClassifier:  # aka Funnelling Monolingual
     def __init__(self, base_estimator=LogisticRegression()):
         if not hasattr(base_estimator, 'predict_proba'):
             print('the estimator does not seem to be probabilistic: calibrating')

@@ -32,3 +42,50 @@ class MultilabelStackedClassifier:  # aka Funnelling Monolingual
         P = self.base.predict_proba(X)
         P = self.norm.transform(P)
         return self.meta.predict_proba(P)
+
+
+class LabelSpacePartion:
+    def __init__(self, base_estimator=LogisticRegression()):
+        graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
+        self.classifier = LabelSpacePartitioningClassifier(
+            classifier=LabelPowerset(classifier=base_estimator),
+            clusterer=NetworkXLabelGraphClusterer(graph_builder, method='louvain')
+        )
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+
+class MLTwinSVM:
+    def __init__(self):
+        self.classifier = MLTSVM()
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+
+class MLknn:
+    # http://scikit.ml/api/skmultilearn.embedding.classifier.html#skmultilearn.embedding.EmbeddingClassifier
+    # notes: need to install package openne
+    def __init__(self):
+        self.classifier = EmbeddingClassifier(
+            SKLearnEmbedder(SpectralEmbedding(n_components=10)),
+            RandomForestRegressor(n_estimators=10),
+            MLkNN(k=5)
+        )
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+    def predict_proba(self, X):
+        return self.classifier.predict_proba(X)
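MLStackedClassifier ("funnelling") trains one-vs-rest base classifiers, then feeds their posterior matrix, standardized, to a second one-vs-rest meta classifier, so the meta level can exploit correlations between labels. A minimal self-contained sketch of this two-level scheme (class name, shapes, and toy data are illustrative, not the commit's exact implementation):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler

class StackedSketch:
    """First level predicts per-label posteriors; second level re-predicts from them."""
    def __init__(self):
        self.base = OneVsRestClassifier(LogisticRegression())
        self.meta = OneVsRestClassifier(LogisticRegression())
        self.norm = StandardScaler()

    def fit(self, X, y):
        self.base.fit(X, y)
        P = self.base.predict_proba(X)  # (n_docs, n_labels) posterior matrix
        self.meta.fit(self.norm.fit_transform(P), y)
        return self

    def predict(self, X):
        P = self.norm.transform(self.base.predict_proba(X))
        return self.meta.predict(P)

# toy usage: 20 docs, 5 features, 3 labels
rng = np.random.RandomState(0)
X = rng.rand(20, 5)
y = (rng.rand(20, 3) > 0.5).astype(int)
print(StackedSketch().fit(X, y).predict(X).shape)  # (20, 3)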
MultiLabel/mldata.py

@@ -34,6 +34,10 @@ class MultilabelledCollection:
     def n_classes(self):
         return len(self.classes_)

+    @property
+    def n_features(self):
+        return self.instances.shape[1]
+
     @property
     def binary(self):
         return False

@@ -43,8 +47,8 @@ class MultilabelledCollection:

     def sampling_multi_index(self, size, cat, prev=None):
         if prev is None:  # no prevalence was indicated; returns an index for uniform sampling
-            return np.random.choice(len(self), size, replace=size>len(self))
+            return np.random.choice(len(self), size, replace=size > len(self))
-        aux = LabelledCollection(self.__gen_index(), self.labels[:,cat])
+        aux = LabelledCollection(self.__gen_index(), self.labels[:, cat])
         return aux.sampling_index(size, *[1-prev, prev])

     def uniform_sampling_multi_index(self, size):
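sampling_multi_index reduces multi-label sampling to QuaPy's binary case: it builds a temporary LabelledCollection over document positions, labelled by the target category, and asks it for a sample with prevalence [1-prev, prev]. A numpy-only sketch of drawing an index at a target prevalence for one category (function and variable names are illustrative):

import numpy as np

def sample_at_prevalence(labels, cat, size, prev, rng=np.random):
    """Pick `size` positions so a fraction `prev` of them are positive for column `cat`."""
    pos = np.flatnonzero(labels[:, cat] == 1)
    neg = np.flatnonzero(labels[:, cat] == 0)
    n_pos = int(round(size * prev))
    idx = np.concatenate([
        rng.choice(pos, n_pos, replace=n_pos > len(pos)),
        rng.choice(neg, size - n_pos, replace=(size - n_pos) > len(neg)),
    ])
    rng.shuffle(idx)
    return idx

labels = (np.random.rand(1000, 4) > 0.9).astype(int)  # sparse toy label matrix
idx = sample_at_prevalence(labels, cat=2, size=100, prev=0.3)
print(labels[idx, 2].mean())  # ~0.3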
MultiLabel/mlquantification.py

@@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, Mult
     ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor

 import quapy as qp
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
 from method.aggregative import CC, ACC, PACC, AggregativeQuantifier
 from method.base import BaseQuantifier

@@ -25,7 +25,19 @@ class MLQuantifier:
     def quantify(self, instances): ...


+class MLMLPE(MLQuantifier):
+    def fit(self, data: MultilabelledCollection):
+        self.tr_prev = data.prevalence()
+        return self
+
+    def quantify(self, instances):
+        return self.tr_prev
+
+
 class MLAggregativeQuantifier(MLQuantifier):
+    def __init__(self, mlcls):
+        self.learner = mlcls
+
     def fit(self, data:MultilabelledCollection):
         self.learner.fit(*data.Xy)
         return self
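MLMLPE is the multi-label maximum-likelihood prevalence estimate (MLPE) baseline: it ignores the test sample entirely and always reports the training prevalence. It is the natural lower bound for any quantifier, and it degrades quickly under the prior probability shift induced by the app protocol. A toy sketch of the values it returns (numbers illustrative; the [neg, pos] row format matches the prevalence matrices used elsewhere in this commit):

import numpy as np

ytr = np.array([[1, 0], [1, 0], [0, 1], [1, 1]])  # 4 docs x 2 labels
pos = ytr.mean(axis=0)                  # per-label positive prevalence
tr_prev = np.vstack([1 - pos, pos]).T   # rows of [p(neg), p(pos)] per label
print(tr_prev)  # [[0.25 0.75], [0.5 0.5]] -- returned for every test sample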
@@ -42,9 +54,6 @@ class MLAggregativeQuantifier(MLQuantifier):


 class MLCC(MLAggregativeQuantifier):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def preclassify(self, instances):
         return self.learner.predict(instances)

@@ -55,16 +64,11 @@ class MLCC(MLAggregativeQuantifier):


 class MLPCC(MLCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def preclassify(self, instances):
         return self.learner.predict_proba(instances)


 class MLACC(MLCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls

     def fit(self, data:MultilabelledCollection, train_prop=0.6):
         self.classes_ = data.classes_

@@ -88,8 +92,6 @@ class MLACC(MLCC):


 class MLPACC(MLPCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls

     def fit(self, data:MultilabelledCollection, train_prop=0.6):
         self.classes_ = data.classes_

@@ -109,7 +111,7 @@ class MLPACC(MLPCC):
         return pacc_prevs


-class MultilabelNaiveQuantifier(MLQuantifier):
+class MLNaiveQuantifier(MLQuantifier):
     def __init__(self, q:BaseQuantifier, n_jobs=-1):
         self.q = q
         self.estimators = None

@@ -132,7 +134,7 @@ class MultilabelNaiveQuantifier(MLQuantifier):
         return np.asarray([neg_prevs, pos_prevs]).T


-class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregativeQuantifier):
+class MLNaiveAggregativeQuantifier(MLNaiveQuantifier, MLAggregativeQuantifier):
     def __init__(self, q:AggregativeQuantifier, n_jobs=-1):
         assert isinstance(q, AggregativeQuantifier), 'the quantifier is not of type aggregative!'
         self.q = q

@@ -156,7 +158,7 @@ class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregativeQuantifier):

 class MLRegressionQuantification:
     def __init__(self,
-                 mlquantifier=MultilabelNaiveQuantifier(CC(LinearSVC())),
+                 mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
                  regression='ridge',
                  protocol='npp',
                  n_samples=500,
@@ -201,36 +203,31 @@ class MLRegressionQuantification:

         return Xs, ys

+    def _extract_features(self, sample, Xs, ys, samples_mean, samples_std):
+        ys.append(sample.prevalence()[:, 1])
+        Xs.append(self.estimator.quantify(sample.instances)[:, 1])
+        if self.means:
+            samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
+        if self.stds:
+            samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+
     def generate_samples_npp(self, val):
-        samples_mean = []
-        samples_std = []
-        Xs = []
-        ys = []
+        Xs, ys = [], []
+        samples_mean, samples_std = [], []
         for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
-            ys.append(sample.prevalence()[:, 1])
-            Xs.append(self.estimator.quantify(sample.instances)[:, 1])
-            if self.means:
-                samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
-            if self.stds:
-                samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+            self._extract_features(sample, Xs, ys, samples_mean, samples_std)
         return self._prepare_arrays(Xs, ys, samples_mean, samples_std)

     def generate_samples_app(self, val):
-        samples_mean = []
-        samples_std = []
-        Xs = []
-        ys = []
+        Xs, ys = [], []
+        samples_mean, samples_std = [], []
         ncats = len(self.classes_)
         nprevs = 21
         repeats = max(self.n_samples // (ncats * nprevs), 1)
         for cat in self.classes_:
             for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats):
-                ys.append(sample.prevalence()[:, 1])
-                Xs.append(self.estimator.quantify(sample.instances)[:, 1])
-                if self.means:
-                    samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
-                if self.stds:
-                    samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+                self._extract_features(sample, Xs, ys, samples_mean, samples_std)
         return self._prepare_arrays(Xs, ys, samples_mean, samples_std)

     def fit(self, data:MultilabelledCollection):
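MLRegressionQuantification ("MRQ") treats quantification itself as a regression problem: it draws many validation samples, uses the inner quantifier's per-label prevalence estimates (optionally plus sample means/stds) as features, and fits a multi-output regressor mapping estimated prevalences to true ones. A toy sketch of that correction step on synthetic data (Ridge stands in for the configurable regressor; the bias model is invented for illustration):

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
n_labels = 4

# Synthetic "validation samples": true prevalences and systematically biased estimates
true_prevs = rng.rand(500, n_labels)
estim_prevs = 0.5 * true_prevs + 0.2 + rng.normal(0, 0.02, true_prevs.shape)

# Learn to undo the quantifier's bias: features = estimates, targets = truth
reg = Ridge().fit(estim_prevs, true_prevs)

test_estimate = 0.5 * np.array([[0.1, 0.4, 0.7, 0.9]]) + 0.2
print(reg.predict(test_estimate).round(2))  # ~[0.1, 0.4, 0.7, 0.9]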
MultiLabel/tabular.py

@@ -6,10 +6,10 @@ from scipy.stats import ttest_ind_from_stats, wilcoxon
 class Table:
     VALID_TESTS = [None, "wilcoxon", "ttest"]

-    def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
+    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='ttest', prec_mean=3,
                  clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                  color=True):
-        assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
+        assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'

         self.benchmarks = np.asarray(benchmarks)
         self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}

@@ -21,7 +21,7 @@ class Table:
         # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
         self._addmap('values', dtype=object)
         self.lower_is_better = lower_is_better
-        self.ttest = ttest
+        self.ttest = significance_test
         self.prec_mean = prec_mean
         self.clean_zero = clean_zero
         self.show_std = show_std

@@ -156,8 +156,9 @@
         return all(self.map['fill'][:, self.method_index[col]])

     def _addave(self):
-        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, ttest=self.ttest, average=False,
-                    missing=self.missing, missing_str=self.missing_str)
+        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
+                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
+                    show_std=self.show_std)
         for col in self.methods:
             values = None
             if self._is_column_full(col):

@@ -267,12 +268,37 @@
         tab += self.latexAverage()
         return tab

+    def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
+        def withside(label):
+            return '\side{'+label+'}' if side else label
+
+        tab = ' & '
+        tab += ' & '.join([withside(benchmark_replace.get(col, col)) for col in self.benchmarks])
+        if average:
+            tab += ' & ' + withside('Ave')
+        tab += ' \\\\\hline\n'
+        for row in self.methods:
+            rowname = method_replace.get(row, row)
+            tab += rowname + ' & '
+            tab += self.latexRowT(row, endl='')
+            if average:
+                tab += ' & '
+                tab += self.average.latexCell('ave', row)
+            tab += '\\\\\hline\n'
+        return tab
+
     def latexRow(self, benchmark, endl='\\\\\hline\n'):
         s = [self.latexCell(benchmark, col) for col in self.methods]
         s = ' & '.join(s)
         s += ' ' + endl
         return s

+    def latexRowT(self, method, endl='\\\\\hline\n'):
+        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
+        s = ' & '.join(s)
+        s += ' ' + endl
+        return s
+
     def latexAverage(self, endl='\\\\\hline\n'):
         if self.add_average:
             return self.average.latexRow('ave', endl=endl)
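latexTabularT is the transposed counterpart of latexTabular: methods become rows and benchmarks become columns, with side=True rotating the column headers via a \side macro that the surrounding LaTeX must define. A hedged usage sketch (dataset and method names, and the score arrays, are illustrative):

# Assumes MultiLabel/tabular.py is importable; a sketch, not the commit's exact usage.
import numpy as np
from MultiLabel.tabular import Table

table = Table(benchmarks=['rcv1', 'ohsumed'], methods=['NaiveCC', 'StackCC'],
              prec_mean=4, significance_test='wilcoxon')
for dataset in table.benchmarks:
    for method in table.methods:
        table.add(dataset, method, np.random.rand(100))  # per-sample error scores

latex = table.latexTabularT(side=True)  # transposed: one row per method
# The emitted header uses \side{...}; define it in the LaTeX preamble, e.g.
# \newcommand{\side}[1]{\rotatebox{90}{#1}}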