forked from moreo/QuaPy
adding tweet sent quant experiments
This commit is contained in:
parent
41347b50f9
commit
2ec3400d15
3
TODO.txt
3
TODO.txt
|
@ -23,4 +23,5 @@ Explore the hyperparameter "number of bins" in HDy
|
||||||
Implement HDy for single-label?
|
Implement HDy for single-label?
|
||||||
Rename EMQ to SLD ?
|
Rename EMQ to SLD ?
|
||||||
How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
|
How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
|
||||||
to one always?
|
to one always?
|
||||||
|
Parallelize the kFCV in ACC and PACC
|
|
@ -6,6 +6,7 @@ from . import data
|
||||||
from . import evaluation
|
from . import evaluation
|
||||||
from . import plot
|
from . import plot
|
||||||
from . import util
|
from . import util
|
||||||
|
from . import model_selection
|
||||||
from method.aggregative import isaggregative, isprobabilistic
|
from method.aggregative import isaggregative, isprobabilistic
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -132,26 +132,34 @@ class LabelledCollection:
|
||||||
def Xy(self):
|
def Xy(self):
|
||||||
return self.instances, self.labels
|
return self.instances, self.labels
|
||||||
|
|
||||||
def stats(self):
|
def stats(self, show=True):
|
||||||
ninstances = len(self)
|
ninstances = len(self)
|
||||||
instance_type = type(self.instances[0])
|
instance_type = type(self.instances[0])
|
||||||
if instance_type == list:
|
if instance_type == list:
|
||||||
nfeats = len(self.instances[0])
|
nfeats = len(self.instances[0])
|
||||||
elif instance_type == np.ndarray:
|
elif instance_type == np.ndarray or issparse(self.instances):
|
||||||
nfeats = self.instances.shape[1]
|
nfeats = self.instances.shape[1]
|
||||||
else:
|
else:
|
||||||
nfeats = '?'
|
nfeats = '?'
|
||||||
print(f'#instances={ninstances}, type={instance_type}, features={nfeats}, n_classes={self.n_classes}, '
|
stats_ = {'instances': ninstances,
|
||||||
f'prevs={strprev(self.prevalence())}')
|
'type': instance_type,
|
||||||
|
'features': nfeats,
|
||||||
|
'classes': self.n_classes,
|
||||||
|
'prevs': strprev(self.prevalence())}
|
||||||
|
if show:
|
||||||
|
print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
|
||||||
|
f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
|
||||||
|
return stats_
|
||||||
|
|
||||||
|
|
||||||
class Dataset:
|
class Dataset:
|
||||||
|
|
||||||
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None):
|
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
|
||||||
assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
|
assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
|
||||||
self.training = training
|
self.training = training
|
||||||
self.test = test
|
self.test = test
|
||||||
self.vocabulary = vocabulary
|
self.vocabulary = vocabulary
|
||||||
|
self.name = name
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
|
def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
|
||||||
|
@ -175,6 +183,13 @@ class Dataset:
|
||||||
def vocabulary_size(self):
|
def vocabulary_size(self):
|
||||||
return len(self.vocabulary)
|
return len(self.vocabulary)
|
||||||
|
|
||||||
|
def stats(self):
    """
    Prints a one-line summary of this dataset.

    The summary is assembled from the dictionaries returned by the `stats(show=False)` method of the
    training and test collections: instance counts and prevalences are reported for both parts, while
    the instance type, number of features, and number of classes are taken from the training part only.
    """
    tr_stats = self.training.stats(show=False)
    te_stats = self.test.stats(show=False)
    print(f'Name={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
          f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
          f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
|
||||||
|
|
||||||
|
|
||||||
def isbinary(data):
|
def isbinary(data):
|
||||||
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
|
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
|
||||||
|
|
|
@ -53,6 +53,8 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
|
||||||
if min_df is not None:
|
if min_df is not None:
|
||||||
reduce_columns(data, min_df=min_df, inplace=True)
|
reduce_columns(data, min_df=min_df, inplace=True)
|
||||||
|
|
||||||
|
data.name = dataset_name
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
@ -116,6 +118,8 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
|
||||||
if min_df is not None:
|
if min_df is not None:
|
||||||
reduce_columns(data, min_df=min_df, inplace=True)
|
reduce_columns(data, min_df=min_df, inplace=True)
|
||||||
|
|
||||||
|
data.name = dataset_name
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -161,7 +161,7 @@ class ACC(AggregativeQuantifier):
|
||||||
def __init__(self, learner:BaseEstimator):
|
def __init__(self, learner:BaseEstimator):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
|
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
|
||||||
"""
|
"""
|
||||||
Trains a ACC quantifier
|
Trains a ACC quantifier
|
||||||
:param data: the training set
|
:param data: the training set
|
||||||
|
@ -244,7 +244,7 @@ class PACC(AggregativeProbabilisticQuantifier):
|
||||||
def __init__(self, learner:BaseEstimator):
|
def __init__(self, learner:BaseEstimator):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
|
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
|
||||||
"""
|
"""
|
||||||
Trains a PACC quantifier
|
Trains a PACC quantifier
|
||||||
:param data: the training set
|
:param data: the training set
|
||||||
|
@ -358,7 +358,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
||||||
def __init__(self, learner: BaseEstimator):
|
def __init__(self, learner: BaseEstimator):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.3):
|
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.4):
|
||||||
"""
|
"""
|
||||||
Trains a HDy quantifier
|
Trains a HDy quantifier
|
||||||
:param data: the training set
|
:param data: the training set
|
||||||
|
|
|
@ -90,7 +90,7 @@ class GridSearchQ(BaseQuantifier):
|
||||||
elif eval_budget is None:
|
elif eval_budget is None:
|
||||||
self.n_prevpoints = n_prevpoints
|
self.n_prevpoints = n_prevpoints
|
||||||
eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)
|
eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)
|
||||||
self.sout(f'{eval_computations} evaluations will be performed for each\n'
|
self.sout(f'{eval_computations} evaluations will be performed for each '
|
||||||
f'combination of hyper-parameters')
|
f'combination of hyper-parameters')
|
||||||
else:
|
else:
|
||||||
eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, n_repetitions)
|
eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, n_repetitions)
|
||||||
|
@ -169,3 +169,8 @@ class GridSearchQ(BaseQuantifier):
|
||||||
def get_params(self, deep=True):
|
def get_params(self, deep=True):
|
||||||
return self.param_grid
|
return self.param_grid
|
||||||
|
|
||||||
|
def best_model(self):
    """
    Returns the model stored in the attribute `best_model_`.

    :return: the value of `self.best_model_`
    :raises ValueError: if the attribute `best_model_` does not exist on this instance
    """
    if not hasattr(self, 'best_model_'):
        raise ValueError('best_model called before fit')
    return self.best_model_
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,137 @@
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import quapy as qp
|
||||||
|
import quapy.functional as F
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 100
|
||||||
|
sample_size = qp.environ['SAMPLE_SIZE']
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_experiment(true_prevalences, estim_prevalences, n_repetitions=25):
    """
    Prints the MAE and MRAE between true and estimated prevalences.

    :param true_prevalences: true prevalences, passed as first argument to each error measure
    :param estim_prevalences: estimated prevalences, passed as second argument to each error measure
    :param n_repetitions: only referenced by the disabled per-prevalence report below; the active code
        does not use it (kept so that existing callers keep working)
    """
    # Disabled report of mean(+-std) of the estimates for each true prevalence; kept for reference.
    #n_classes = true_prevalences.shape[1]
    #true_ave = true_prevalences.reshape(-1, n_repetitions, n_classes).mean(axis=1)
    #estim_ave = estim_prevalences.reshape(-1, n_repetitions, n_classes).mean(axis=1)
    #estim_std = estim_prevalences.reshape(-1, n_repetitions, n_classes).std(axis=1)
    #print('\nTrueP->mean(Phat)(std(Phat))\n'+'='*22)
    #for true, estim, std in zip(true_ave, estim_ave, estim_std):
    #    str_estim = ', '.join([f'{mean:.3f}+-{std:.4f}' for mean, std in zip(estim, std)])
    #    print(f'{F.strprev(true)}->[{str_estim}]')

    print('\nEvaluation Metrics:\n'+'='*22)
    for eval_measure in [qp.error.mae, qp.error.mrae]:
        err = eval_measure(true_prevalences, estim_prevalences)
        print(f'\t{eval_measure.__name__}={err:.4f}')
    print()
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_method_point_test(method, test):
    """
    Quantifies the whole test collection at once and prints the true prevalence, the estimated
    prevalence, and the MAE and MRAE between them.

    :param method: object exposing `quantify(instances)`
    :param test: collection exposing `instances`, `labels` and `n_classes`
    """
    estim_prev = method.quantify(test.instances)
    true_prev = F.prevalence_from_labels(test.labels, test.n_classes)
    print('\nPoint-Test evaluation:\n' + '=' * 22)
    print(f'true-prev={F.strprev(true_prev)}, estim-prev={F.strprev(estim_prev)}')
    for eval_measure in [qp.error.mae, qp.error.mrae]:
        err = eval_measure(true_prev, estim_prev)
        print(f'\t{eval_measure.__name__}={err:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
def quantification_models():
    """
    Generates the quantification methods to be evaluated.

    :return: a generator of tuples (name, quantifier, hyperparameter grid); only 'acc' is currently
        enabled, the remaining methods are commented out
    """
    def newLR():
        # a fresh (unfitted) classifier for each quantifier
        return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)

    # 10 log-spaced values from 1e-4 to 1e5 for the regularization parameter C
    c_range = np.logspace(-4, 5, 10)
    lr_params = {'C': c_range, 'class_weight': [None, 'balanced']}
    #yield 'cc', qp.method.aggregative.CC(newLR()), lr_params
    yield 'acc', qp.method.aggregative.ACC(newLR()), lr_params
    #yield 'pcc', qp.method.aggregative.PCC(newLR()), lr_params
    #yield 'pacc', qp.method.aggregative.PACC(newLR()), lr_params
|
||||||
|
|
||||||
|
|
||||||
|
def result_path(dataset_name, model_name, optim_metric):
    """
    Builds the name of the pickle file holding the results of one experiment.

    :param dataset_name: name of the dataset
    :param model_name: name of the quantification method
    :param optim_metric: name of the metric optimized during model selection
    :return: the string '<dataset_name>-<model_name>-<optim_metric>.pkl' (no directory component)
    """
    return f'{dataset_name}-{model_name}-{optim_metric}.pkl'
|
||||||
|
|
||||||
|
|
||||||
|
def check_already_computed(dataset_name, model_name, optim_metric):
    """
    Checks whether the result file of an experiment already exists.

    :param dataset_name: name of the dataset
    :param model_name: name of the quantification method
    :param optim_metric: name of the metric optimized during model selection
    :return: True if the path returned by `result_path` for these arguments exists, False otherwise
    """
    path = result_path(dataset_name, model_name, optim_metric)
    return os.path.exists(path)
|
||||||
|
|
||||||
|
|
||||||
|
def save_results(dataset_name, model_name, optim_metric, *results):
    """
    Pickles the results of an experiment into the file returned by `result_path`.

    :param dataset_name: name of the dataset
    :param model_name: name of the quantification method
    :param optim_metric: name of the metric optimized during model selection
    :param results: the objects to store; they are dumped together as a single tuple
    """
    path = result_path(dataset_name, model_name, optim_metric)
    qp.util.create_parent_dir(path)
    with open(path, 'wb') as fout:
        pickle.dump(tuple(results), fout, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: for each dataset and each quantifier, run model selection on the development
    # split, retrain the selected model on the evaluation split, and print the evaluation metrics.
    # NOTE(review): the nesting below was reconstructed from syntax; confirm against the original file.

    np.random.seed(0)

    for dataset_name in ['hcr']:  # qp.datasets.TWITTER_SENTIMENT_DATASETS:

        benchmark_devel = qp.datasets.fetch_twitter(dataset_name, for_model_selection=True, min_df=5, pickle=True)
        benchmark_devel.stats()

        for model_name, model, hyperparams in quantification_models():

            model_selection = qp.model_selection.GridSearchQ(
                model,
                param_grid=hyperparams,
                sample_size=sample_size,
                n_prevpoints=21,
                n_repetitions=5,
                error='mae',
                refit=False,
                verbose=True
            )

            model_selection.fit(benchmark_devel.training, benchmark_devel.test)
            model = model_selection.best_model()

            # the selected model is retrained on the training part of the evaluation split
            benchmark_eval = qp.datasets.fetch_twitter(dataset_name, for_model_selection=False, min_df=5, pickle=True)
            model.fit(benchmark_eval.training)
            true_prevalences, estim_prevalences = qp.evaluation.artificial_sampling_prediction(
                model,
                test=benchmark_eval.test,
                sample_size=sample_size,
                n_prevpoints=21,
                n_repetitions=25
            )

            evaluate_experiment(true_prevalences, estim_prevalences, n_repetitions=25)
            evaluate_method_point_test(model, benchmark_eval.test)

            #save_arrays(FLAGS.results, true_prevalences, estim_prevalences, test_name)

    sys.exit(0)

    # NOTE(review): everything below is unreachable (it follows sys.exit) and refers to names that are
    # not defined in the visible code (FLAGS, method, test_set); evaluate_method_point_test as defined
    # in this file also takes no `test_name` argument. Looks like a leftover from another script.

    # decide the test to be performed (in the case of 'semeval', tests are 'semeval13', 'semeval14', 'semeval15')
    if FLAGS.dataset == 'semeval':
        test_sets = ['semeval13', 'semeval14', 'semeval15']
    else:
        test_sets = [FLAGS.dataset]

    evaluate_method_point_test(method, benchmark_eval.test, test_name=test_set)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# quantifiers:
# ----------------------------------------
# alias for quantifiers and default configurations
# Each entry maps an alias to a factory taking a learner; the 'svm*' and 'mlpe' factories ignore it.
# NOTE(review): the classes and `settings` referenced here are not imported in the visible code, so
# calling any factory would raise NameError unless they are defined elsewhere; placement at module
# level is also a reconstruction. Confirm against the original file.
QUANTIFIER_ALIASES = {
    'cc': lambda learner: ClassifyAndCount(learner),
    'acc': lambda learner: AdjustedClassifyAndCount(learner),
    'pcc': lambda learner: ProbabilisticClassifyAndCount(learner),
    'pacc': lambda learner: ProbabilisticAdjustedClassifyAndCount(learner),
    'emq': lambda learner: ExpectationMaximizationQuantifier(learner),
    'svmq': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='q'),
    'svmkld': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='kld'),
    'svmnkld': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='nkld'),
    'svmmae': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='mae'),
    'svmmrae': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='mrae'),
    'mlpe': lambda learner: MaximumLikelihoodPrevalenceEstimation(),
}
|
||||||
|
|
Loading…
Reference in New Issue