
adding multi-label classification methods

Alejandro Moreo Fernandez 2021-09-02 11:07:33 +02:00
parent dc2fa05cf8
commit 4572ec266d
6 changed files with 286 additions and 117 deletions

View File

@@ -7,12 +7,12 @@ from tqdm import tqdm
 from skmultilearn.dataset import load_dataset, available_data_sets
 from scipy.sparse import csr_matrix
 import quapy as qp
-from MultiLabel.main import load_results
+from MultiLabel.main import load_results, SKMULTILEARN_RED_DATASETS, TC_DATASETS, sample_size
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
-from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
+from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
     MLACC, \
-    MLPACC, MultilabelNaiveAggregativeQuantifier
+    MLPACC, MLNaiveAggregativeQuantifier
 from MultiLabel.tabular import Table
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
 import numpy as np
@@ -22,29 +22,56 @@ import sys
 import os
 import pickle
 
-models = ['NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', 'NaiveHDy', 'NaiveSLD']
-datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
+models = [#'MLPE',
+    'NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', #'NaiveHDy', 'NaiveSLD',
+    'StackCC', 'StackPCC', 'StackACC', 'StackPACC',
+    'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC',
+    'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
+    'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
+    'LSP-CC', 'LSP-ACC'
+]
+# datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
+datasets = TC_DATASETS
 
 
 def generate_table(path, protocol, error):
-    print(f'generating {path}')
-    table = Table(datasets, models)
-    for dataset, model in itertools.product(datasets, models):
-        result_path = f'{opt.results}/{dataset}_{model}.pkl'
-        if os.path.exists(result_path):
-            result = load_results(result_path)
-            true_prevs, estim_prevs = result[protocol]
-            scores = np.asarray([error(trues, estims) for trues, estims in zip(true_prevs, estim_prevs)]).flatten()
-            table.add(dataset, model, scores)
+
+    def compute_score_job(args):
+        dataset, model = args
+        result_path = f'{opt.results}/{dataset}_{model}.pkl'
+        if os.path.exists(result_path):
+            print('+', end='')
+            sys.stdout.flush()
+            result = load_results(result_path)
+            true_prevs, estim_prevs = result[protocol]
+            scores = np.asarray([error(trues, estims) for trues, estims in zip(true_prevs, estim_prevs)]).flatten()
+            return dataset, model, scores
+        print('-', end='')
+        sys.stdout.flush()
+        return None
+
+    print(f'\ngenerating {path}')
+    table = Table(datasets, models, prec_mean=4, significance_test='wilcoxon')
+    results = qp.util.parallel(compute_score_job, list(itertools.product(datasets, models)), n_jobs=-1)
+    print()
+    for r in results:
+        if r is not None:
+            dataset, model, scores = r
+            table.add(dataset, model, scores)
 
     tabular = """
     \\resizebox{\\textwidth}{!}{%
     \\begin{tabular}{|c||""" + ('c|' * len(models)) + """} \hline
     """
-    dataset_replace = {'tmc2007_500': 'tmc2007\_500'}
+    dataset_replace = {'tmc2007_500': 'tmc2007\_500', 'tmc2007_500-red': 'tmc2007\_500-red'}
     method_replace = {}
 
-    tabular += table.latexTabular(benchmark_replace=dataset_replace, method_replace=method_replace)
+    tabular += table.latexTabularT(benchmark_replace=dataset_replace, method_replace=method_replace, side=True)
     tabular += """
     \end{tabular}%
     }
@@ -61,13 +88,17 @@ if __name__ == '__main__':
                         help=f'path where to store the tables')
     opt = parser.parse_args()
 
-    os.makedirs(opt.results, exist_ok=True)
+    assert os.path.exists(opt.results), f'result directory {opt.results} does not exist'
     os.makedirs(opt.tablepath, exist_ok=True)
 
-    eval_error = qp.error.ae
-    generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=eval_error)
-    generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=eval_error)
+    qp.environ["SAMPLE_SIZE"] = sample_size
+    absolute_error = qp.error.ae
+    relative_absolute_error = qp.error.rae
+    generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=absolute_error)
+    generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=absolute_error)
+    generate_table(f'{opt.tablepath}/npp.rae.tex', protocol='npp', error=relative_absolute_error)
+    generate_table(f'{opt.tablepath}/app.rae.tex', protocol='app', error=relative_absolute_error)
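The refactored generate_table fans the per-(dataset, model) scoring out to worker processes and keeps only the jobs whose result pickle exists. A minimal sketch of that map-and-filter pattern, assuming qp.util.parallel dispatches like joblib's Parallel/delayed over a list of argument tuples (the toy job and data below are illustrative, not taken from the repository):

import itertools
from joblib import Parallel, delayed

def compute_score_job(args):
    dataset, model = args
    # the real job loads f'{results}/{dataset}_{model}.pkl' and computes error scores;
    # jobs whose result file is missing return None and are filtered out afterwards
    return None if model == 'missing' else (dataset, model, [0.1, 0.2])

jobs = list(itertools.product(['rcv1', 'ohsumed'], ['NaiveCC', 'missing']))
results = Parallel(n_jobs=-1)(delayed(compute_score_job)(args) for args in jobs)
for r in results:
    if r is not None:
        dataset, model, scores = r  # in the script, fed into table.add(dataset, model, scores)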

View File

@@ -7,11 +7,11 @@ from tqdm import tqdm
 from skmultilearn.dataset import load_dataset, available_data_sets
 from scipy.sparse import csr_matrix
 import quapy as qp
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion, MLTwinSVM, MLknn
 from MultiLabel.mldata import MultilabelledCollection
-from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
+from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
     MLACC, \
-    MLPACC, MultilabelNaiveAggregativeQuantifier
+    MLPACC, MLNaiveAggregativeQuantifier, MLMLPE
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
 import numpy as np
 from data.dataset import Dataset
@@ -35,80 +35,136 @@ def calibratedCls():
 sample_size = 100
 n_samples = 5000
 
+SKMULTILEARN_ALL_DATASETS = sorted(set([x[0] for x in available_data_sets().keys()]))
+SKMULTILEARN_RED_DATASETS = [x+'-red' for x in SKMULTILEARN_ALL_DATASETS]
+TC_DATASETS = ['reuters21578', 'jrcall', 'ohsumed', 'rcv1']
+DATASETS = TC_DATASETS
+
 
 def models():
-    yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls()))
-    yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
-    yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
-    yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
-    # yield 'NaiveHDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
-    # yield 'NaiveSLD', MultilabelNaiveAggregativeQuantifier(EMQ(calibratedCls()))
-    yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
-    yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
-    yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
-    yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
+    yield 'MLPE', MLMLPE()
+    yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls()))
+    yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls()))
+    yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls()))
+    yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls()))
+    # yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls()))
+    # yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls()))
+    yield 'StackCC', MLCC(MLStackedClassifier(cls()))
+    yield 'StackPCC', MLPCC(MLStackedClassifier(cls()))
+    yield 'StackACC', MLACC(MLStackedClassifier(cls()))
+    yield 'StackPACC', MLPACC(MLStackedClassifier(cls()))
     # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
     common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
-    yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
-    yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
-    yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
-    yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
-    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
-    # yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common)
+    yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common)
+    yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
+    yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
+    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
     # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
+    # yield 'LSP-CC', MLCC(LabelSpacePartion(cls()))
+    # yield 'LSP-ACC', MLACC(LabelSpacePartion(cls()))
+    # yield 'TwinSVM-CC', MLCC(MLTwinSVM())
+    # yield 'TwinSVM-ACC', MLACC(MLTwinSVM())
+    yield 'MLKNN-CC', MLCC(MLknn())
+    yield 'MLKNN-PCC', MLPCC(MLknn())
+    yield 'MLKNN-ACC', MLACC(MLknn())
+    yield 'MLKNN-PACC', MLPACC(MLknn())
 
 
-# dataset = 'reuters21578'
-# picklepath = '/home/moreo/word-class-embeddings/pickles'
-# data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
-# Xtr, Xte = data.vectorize()
-# ytr = data.devel_labelmatrix.todense().getA()
-# yte = data.test_labelmatrix.todense().getA()
-
-# remove categories with < 10 training documents
-# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
-# ytr = ytr[:, to_keep]
-# yte = yte[:, to_keep]
-# print(f'num categories = {ytr.shape[1]}')
-
-
-def datasets():
-    dataset_list = sorted(set([x[0] for x in available_data_sets().keys()]))
-    for dataset_name in dataset_list:
-        yield dataset_name
-
-
-def get_dataset(dataset_name):
-    Xtr, ytr, feature_names, label_names = load_dataset(dataset_name, 'train')
-    Xte, yte, _, _ = load_dataset(dataset_name, 'test')
-    print(f'n-labels = {len(label_names)}')
-
-    Xtr = csr_matrix(Xtr)
-    Xte = csr_matrix(Xte)
-
-    ytr = ytr.todense().getA()
-    yte = yte.todense().getA()
-
-    # remove categories without positives in the training or test splits
-    valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
-    ytr = ytr[:, valid_categories]
-    yte = yte[:, valid_categories]
+def get_dataset(dataset_name, dopickle=True):
+    datadir = f'{qp.util.get_quapy_home()}/pickles'
+    datapath = f'{datadir}/{dataset_name}.pkl'
+    if dopickle:
+        if os.path.exists(datapath):
+            print(f'returning pickled object in {datapath}')
+            return pickle.load(open(datapath, 'rb'))
+
+    if dataset_name in SKMULTILEARN_ALL_DATASETS + SKMULTILEARN_RED_DATASETS:
+        clean_name = dataset_name.replace('-red', '')
+        Xtr, ytr, feature_names, label_names = load_dataset(clean_name, 'train')
+        Xte, yte, _, _ = load_dataset(clean_name, 'test')
+        print(f'n-labels = {len(label_names)}')
+
+        Xtr = csr_matrix(Xtr)
+        Xte = csr_matrix(Xte)
+
+        ytr = ytr.todense().getA()
+        yte = yte.todense().getA()
+
+        if dataset_name.endswith('-red'):
+            TO_SELECT = 10
+            nC = ytr.shape[1]
+            tr_counts = ytr.sum(axis=0)
+            te_counts = yte.sum(axis=0)
+            if nC > TO_SELECT:
+                Y = ytr.T.dot(ytr)  # class-class coincidence matrix
+                Y[np.triu_indices(nC)] = 0  # zeroing all duplicate entries and the diagonal
+                order_ij = np.argsort(-Y, axis=None)
+                selected = set()
+                p = 0
+                while len(selected) < TO_SELECT:
+                    highest_index = order_ij[p]
+                    class_i = highest_index // nC
+                    class_j = highest_index % nC
+                    # if there is only one class left to add, pick the most populated one first
+                    most_populated, least_populated = (class_i, class_j) if tr_counts[class_i] > tr_counts[class_j] else (class_j, class_i)
+                    if te_counts[most_populated] > 0:
+                        selected.add(most_populated)
+                    if len(selected) < TO_SELECT:
+                        if te_counts[least_populated] > 0:
+                            selected.add(least_populated)
+                    p += 1
+                selected = np.asarray(sorted(selected))
+                ytr = ytr[:, selected]
+                yte = yte[:, selected]
+        # else:
+            # remove categories without positives in the training or test splits
+            # valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
+            # ytr = ytr[:, valid_categories]
+            # yte = yte[:, valid_categories]
+
+    elif dataset_name in TC_DATASETS:
+        picklepath = '/home/moreo/word-class-embeddings/pickles'
+        data = Dataset.load(dataset_name, pickle_path=f'{picklepath}/{dataset_name}.pickle')
+        Xtr, Xte = data.vectorize()
+        ytr = data.devel_labelmatrix.todense().getA()
+        yte = data.test_labelmatrix.todense().getA()
+
+        # remove categories with < 50 training or test documents
+        # to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
+        # keep the 10 most populated categories
+        to_keep = np.argsort(ytr.sum(axis=0))[-10:]
+        ytr = ytr[:, to_keep]
+        yte = yte[:, to_keep]
+        print(f'num categories = {ytr.shape[1]}')
+
+    else:
+        raise ValueError(f'unknown dataset {dataset_name}')
 
     train = MultilabelledCollection(Xtr, ytr)
     test = MultilabelledCollection(Xte, yte)
 
+    if dopickle:
+        os.makedirs(datadir, exist_ok=True)
+        pickle.dump((train, test), open(datapath, 'wb'), pickle.HIGHEST_PROTOCOL)
+
     return train, test
@@ -176,8 +232,8 @@ def run_experiment(dataset_name, model_name, model):
     print(f'running experiment {dataset_name} x {model_name}')
 
     train, test = get_dataset(dataset_name)
-    if train.n_classes>100:
-        return
+    # if train.n_classes>100:
+    #     return
 
     print_info(train, test)
@@ -186,8 +242,6 @@ def run_experiment(dataset_name, model_name, model):
     results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
     results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5)
 
     save_results(results_npp, results_app, result_path)
-    results_npp2, results_app2 = load_results(result_path)
-    print('pass')
 
 
 if __name__ == '__main__':
@@ -198,7 +252,7 @@ if __name__ == '__main__':
     os.makedirs(opt.results, exist_ok=True)
 
-    for datasetname, (modelname,model) in itertools.product(datasets(), models()):
+    for datasetname, (modelname,model) in itertools.product(DATASETS, models()):
         run_experiment(datasetname, modelname, model)
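For the '-red' dataset variants, get_dataset keeps the TO_SELECT labels that participate in the most frequent label pairs. A toy numpy illustration of the pair-ranking step (the 4x3 label matrix is made up; the logic mirrors the coincidence-matrix code above):

import numpy as np

ytr = np.array([[1, 1, 0],
                [1, 1, 1],
                [0, 1, 1],
                [1, 1, 0]])           # 4 documents x 3 labels
nC = ytr.shape[1]
Y = ytr.T.dot(ytr)                    # Y[i, j] counts documents labeled with both i and j
Y[np.triu_indices(nC)] = 0            # keep each unordered pair once (strict lower triangle)
order_ij = np.argsort(-Y, axis=None)  # flattened pair indices, most co-occurring first
class_i, class_j = order_ij[0] // nC, order_ij[0] % nC
print(class_i, class_j)               # -> 1 0: labels 1 and 0 co-occur in three documents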

View File

@@ -4,9 +4,19 @@ from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.preprocessing import StandardScaler
+from skmultilearn.adapt import MLTSVM
+from skmultilearn.ensemble import LabelSpacePartitioningClassifier
+from skmultilearn.problem_transform import LabelPowerset
+from skmultilearn.cluster import NetworkXLabelGraphClusterer, LabelCooccurrenceGraphBuilder
+from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
+from sklearn.manifold import SpectralEmbedding
+from sklearn.ensemble import RandomForestRegressor
+from skmultilearn.adapt import MLkNN
 
 
-class MultilabelStackedClassifier:  # aka Funnelling Monolingual
+class MLStackedClassifier:  # aka Funnelling Monolingual
     def __init__(self, base_estimator=LogisticRegression()):
         if not hasattr(base_estimator, 'predict_proba'):
             print('the estimator does not seem to be probabilistic: calibrating')
@@ -32,3 +42,50 @@ class MultilabelStackedClassifier:  # aka Funnelling Monolingual
         P = self.base.predict_proba(X)
         P = self.norm.transform(P)
         return self.meta.predict_proba(P)
+
+
+class LabelSpacePartion:
+    def __init__(self, base_estimator=LogisticRegression()):
+        graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
+        self.classifier = LabelSpacePartitioningClassifier(
+            classifier=LabelPowerset(classifier=base_estimator),
+            clusterer=NetworkXLabelGraphClusterer(graph_builder, method='louvain')
+        )
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+
+class MLTwinSVM:
+    def __init__(self):
+        self.classifier = MLTSVM()
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+
+class MLknn:
+    # http://scikit.ml/api/skmultilearn.embedding.classifier.html#skmultilearn.embedding.EmbeddingClassifier
+    # notes: need to install package openne
+    def __init__(self):
+        self.classifier = EmbeddingClassifier(
+            SKLearnEmbedder(SpectralEmbedding(n_components=10)),
+            RandomForestRegressor(n_estimators=10),
+            MLkNN(k=5)
+        )
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+    def predict_proba(self, X):
+        return self.classifier.predict_proba(X)
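Only fragments of MLStackedClassifier (the "funnelling" scheme) are visible in this diff. A hedged sketch of the two-tier idea, reconstructed from the visible pieces (self.base, self.norm, self.meta); treat it as illustrative rather than the class's exact code:

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler

class StackedSketch:
    def __init__(self):
        self.base = OneVsRestClassifier(LogisticRegression())  # tier 1: one binary classifier per label
        self.norm = StandardScaler()                           # standardizes the tier-1 posteriors
        self.meta = OneVsRestClassifier(LogisticRegression())  # tier 2: trained on those posteriors

    def fit(self, X, y):
        self.base.fit(X, y)
        P = self.norm.fit_transform(self.base.predict_proba(X))  # label posteriors become meta-features
        self.meta.fit(P, y)
        return self

    def predict_proba(self, X):
        P = self.norm.transform(self.base.predict_proba(X))
        return self.meta.predict_proba(P)

The appeal of this design is that the meta classifier sees every label's posterior at once, so correlations between labels can inform each per-label decision.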

View File

@@ -34,6 +34,10 @@ class MultilabelledCollection:
     def n_classes(self):
         return len(self.classes_)
 
+    @property
+    def n_features(self):
+        return self.instances.shape[1]
+
     @property
     def binary(self):
         return False
@@ -43,8 +47,8 @@ class MultilabelledCollection:
     def sampling_multi_index(self, size, cat, prev=None):
         if prev is None:  # no prevalence was indicated; returns an index for uniform sampling
-            return np.random.choice(len(self), size, replace=size>len(self))
+            return np.random.choice(len(self), size, replace=size > len(self))
-        aux = LabelledCollection(self.__gen_index(), self.labels[:,cat])
+        aux = LabelledCollection(self.__gen_index(), self.labels[:, cat])
         return aux.sampling_index(size, *[1-prev, prev])
 
     def uniform_sampling_multi_index(self, size):
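sampling_multi_index reduces category-conditional sampling to the binary case: the row indices are wrapped in a quapy LabelledCollection labeled by column cat alone, which is then sampled at prevalence [1-prev, prev]. A usage sketch (train stands for an existing MultilabelledCollection; the method that materializes a sample from the returned index is assumed to mirror quapy's sampling_from_index):

# draw a 100-document sample in which category 3 has prevalence 0.25
index = train.sampling_multi_index(size=100, cat=3, prev=0.25)
sample = train.sampling_from_index(index)  # assumed companion method, as in quapy's LabelledCollection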

View File

@@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, Mult
     ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor
 import quapy as qp
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
 from method.aggregative import CC, ACC, PACC, AggregativeQuantifier
 from method.base import BaseQuantifier
@@ -25,7 +25,19 @@ class MLQuantifier:
     def quantify(self, instances): ...
 
 
+class MLMLPE(MLQuantifier):
+    def fit(self, data: MultilabelledCollection):
+        self.tr_prev = data.prevalence()
+        return self
+
+    def quantify(self, instances):
+        return self.tr_prev
+
+
 class MLAggregativeQuantifier(MLQuantifier):
+    def __init__(self, mlcls):
+        self.learner = mlcls
+
     def fit(self, data:MultilabelledCollection):
         self.learner.fit(*data.Xy)
         return self
@@ -42,9 +54,6 @@ class MLAggregativeQuantifier(MLQuantifier):
 
 class MLCC(MLAggregativeQuantifier):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def preclassify(self, instances):
         return self.learner.predict(instances)
@@ -55,16 +64,11 @@ class MLCC(MLAggregativeQuantifier):
 
 class MLPCC(MLCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def preclassify(self, instances):
         return self.learner.predict_proba(instances)
 
 
 class MLACC(MLCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def fit(self, data:MultilabelledCollection, train_prop=0.6):
         self.classes_ = data.classes_
@@ -88,8 +92,6 @@ class MLACC(MLCC):
 
 class MLPACC(MLPCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def fit(self, data:MultilabelledCollection, train_prop=0.6):
         self.classes_ = data.classes_
@@ -109,7 +111,7 @@ class MLPACC(MLPCC):
         return pacc_prevs
 
 
-class MultilabelNaiveQuantifier(MLQuantifier):
+class MLNaiveQuantifier(MLQuantifier):
     def __init__(self, q:BaseQuantifier, n_jobs=-1):
         self.q = q
         self.estimators = None
@@ -132,7 +134,7 @@ class MultilabelNaiveQuantifier(MLQuantifier):
         return np.asarray([neg_prevs, pos_prevs]).T
 
 
-class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregativeQuantifier):
+class MLNaiveAggregativeQuantifier(MLNaiveQuantifier, MLAggregativeQuantifier):
     def __init__(self, q:AggregativeQuantifier, n_jobs=-1):
         assert isinstance(q, AggregativeQuantifier), 'the quantifier is not of type aggregative!'
         self.q = q
@@ -156,7 +158,7 @@ class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregat
 class MLRegressionQuantification:
     def __init__(self,
-                 mlquantifier=MultilabelNaiveQuantifier(CC(LinearSVC())),
+                 mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
                  regression='ridge',
                  protocol='npp',
                  n_samples=500,
@@ -201,36 +203,31 @@ class MLRegressionQuantification:
         return Xs, ys
 
+    def _extract_features(self, sample, Xs, ys, samples_mean, samples_std):
+        ys.append(sample.prevalence()[:, 1])
+        Xs.append(self.estimator.quantify(sample.instances)[:, 1])
+        if self.means:
+            samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
+        if self.stds:
+            samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+
     def generate_samples_npp(self, val):
-        samples_mean = []
-        samples_std = []
-        Xs = []
-        ys = []
+        Xs, ys = [], []
+        samples_mean, samples_std = [], []
         for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
-            ys.append(sample.prevalence()[:, 1])
-            Xs.append(self.estimator.quantify(sample.instances)[:, 1])
-            if self.means:
-                samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
-            if self.stds:
-                samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+            self._extract_features(sample, Xs, ys, samples_mean, samples_std)
         return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
 
     def generate_samples_app(self, val):
-        samples_mean = []
-        samples_std = []
-        Xs = []
-        ys = []
+        Xs, ys = [], []
+        samples_mean, samples_std = [], []
         ncats = len(self.classes_)
         nprevs = 21
         repeats = max(self.n_samples // (ncats * nprevs), 1)
         for cat in self.classes_:
             for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats):
-                ys.append(sample.prevalence()[:, 1])
-                Xs.append(self.estimator.quantify(sample.instances)[:, 1])
-                if self.means:
-                    samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
-                if self.stds:
-                    samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+                self._extract_features(sample, Xs, ys, samples_mean, samples_std)
         return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
 
     def fit(self, data:MultilabelledCollection):
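MLRegressionQuantification treats quantification itself as a regression problem: over many validation samples it pairs the naive per-label prevalence estimates (features) with the true prevalences (targets), fits a regressor, and later applies it to correct the estimates on unseen samples. A self-contained sketch with stand-in random data (the real class draws its training pairs from the npp/app sampling generators above and supports several regressors):

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
Xs = rng.random((500, 10))  # stand-in: naive positive-prevalence estimates, 500 samples x 10 labels
ys = rng.random((500, 10))  # stand-in: true positive prevalences of those same samples

reg = Ridge().fit(Xs, ys)   # learns a mapping from naive estimates to true prevalences

naive = rng.random((1, 10))        # stand-in for the base quantifier's estimate on a test sample
corrected = reg.predict(naive)[0]  # corrected positive prevalence, one value per label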

View File

@@ -6,10 +6,10 @@ from scipy.stats import ttest_ind_from_stats, wilcoxon
 
 class Table:
     VALID_TESTS = [None, "wilcoxon", "ttest"]
 
-    def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
+    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='ttest', prec_mean=3,
                  clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                  color=True):
-        assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
+        assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
 
         self.benchmarks = np.asarray(benchmarks)
         self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}
@@ -21,7 +21,7 @@ class Table:
         # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
         self._addmap('values', dtype=object)
         self.lower_is_better = lower_is_better
-        self.ttest = ttest
+        self.ttest = significance_test
         self.prec_mean = prec_mean
         self.clean_zero = clean_zero
         self.show_std = show_std
@@ -156,8 +156,9 @@ class Table:
         return all(self.map['fill'][:, self.method_index[col]])
 
     def _addave(self):
-        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, ttest=self.ttest, average=False,
-                    missing=self.missing, missing_str=self.missing_str)
+        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
+                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
+                    show_std=self.show_std)
         for col in self.methods:
             values = None
             if self._is_column_full(col):
@@ -267,12 +268,37 @@ class Table:
             tab += self.latexAverage()
         return tab
 
+    def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
+        def withside(label):
+            return '\side{'+label+'}' if side else label
+
+        tab = ' & '
+        tab += ' & '.join([withside(benchmark_replace.get(col, col)) for col in self.benchmarks])
+        if average:
+            tab += ' & ' + withside('Ave')
+        tab += ' \\\\\hline\n'
+        for row in self.methods:
+            rowname = method_replace.get(row, row)
+            tab += rowname + ' & '
+            tab += self.latexRowT(row, endl='')
+            if average:
+                tab += ' & '
+                tab += self.average.latexCell('ave', row)
+            tab += '\\\\\hline\n'
+        return tab
+
     def latexRow(self, benchmark, endl='\\\\\hline\n'):
         s = [self.latexCell(benchmark, col) for col in self.methods]
         s = ' & '.join(s)
         s += ' ' + endl
         return s
 
+    def latexRowT(self, method, endl='\\\\\hline\n'):
+        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
+        s = ' & '.join(s)
+        s += ' ' + endl
+        return s
+
     def latexAverage(self, endl='\\\\\hline\n'):
         if self.add_average:
             return self.average.latexRow('ave', endl=endl)
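A hypothetical end-to-end use of the new Table options, matching how the table-generation script calls them (the score arrays are made up; \side is a LaTeX macro the enclosing document is expected to define, e.g. via \rotatebox):

import numpy as np

table = Table(benchmarks=['rcv1', 'ohsumed'], methods=['NaiveCC', 'StackCC'],
              significance_test='wilcoxon', prec_mean=4)
for dataset in ['rcv1', 'ohsumed']:
    for method in ['NaiveCC', 'StackCC']:
        table.add(dataset, method, np.random.rand(100))  # per-sample error scores for one cell
latex = table.latexTabularT(side=True)  # transposed layout: methods as rows, datasets as columns

The transposed layout suits this commit's experiments, where the method list (rows) has grown much longer than the dataset list (columns).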