
adding tweet sent quant experiments

Alejandro Moreo Fernandez 2021-01-11 18:31:12 +01:00
parent 41347b50f9
commit 2ec3400d15
7 changed files with 173 additions and 10 deletions

View File

@@ -23,4 +23,5 @@ Explore the hyperparameter "number of bins" in HDy
 Implement HDy for single-label?
 Rename EMQ to SLD ?
 How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
 to one always?
+Parallelize the kFCV in ACC and PACC

View File

@@ -6,6 +6,7 @@ from . import data
 from . import evaluation
 from . import plot
 from . import util
+from . import model_selection
 from method.aggregative import isaggregative, isprobabilistic

View File

@@ -132,26 +132,34 @@ class LabelledCollection:
     def Xy(self):
         return self.instances, self.labels

-    def stats(self):
+    def stats(self, show=True):
         ninstances = len(self)
         instance_type = type(self.instances[0])
         if instance_type == list:
             nfeats = len(self.instances[0])
-        elif instance_type == np.ndarray:
+        elif instance_type == np.ndarray or issparse(self.instances):
             nfeats = self.instances.shape[1]
         else:
             nfeats = '?'
-        print(f'#instances={ninstances}, type={instance_type}, features={nfeats}, n_classes={self.n_classes}, '
-              f'prevs={strprev(self.prevalence())}')
+        stats_ = {'instances': ninstances,
+                  'type': instance_type,
+                  'features': nfeats,
+                  'classes': self.n_classes,
+                  'prevs': strprev(self.prevalence())}
+        if show:
+            print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
+                  f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
+        return stats_


 class Dataset:

-    def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None):
+    def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
         assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
         self.training = training
         self.test = test
         self.vocabulary = vocabulary
+        self.name = name

     @classmethod
     def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):

@@ -175,6 +183,13 @@ class Dataset:
     def vocabulary_size(self):
         return len(self.vocabulary)

+    def stats(self):
+        tr_stats = self.training.stats(show=False)
+        te_stats = self.test.stats(show=False)
+        print(f'Name={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
+              f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
+              f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')


 def isbinary(data):
     if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
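With this change, stats() doubles as a programmatic accessor: with show=False it returns the dictionary without printing. A minimal usage sketch, assuming a twitter dataset fetched as in the script added below:

    import quapy as qp

    data = qp.datasets.fetch_twitter('hcr', for_model_selection=True, min_df=5, pickle=True)
    tr_stats = data.training.stats(show=False)   # dict with 'instances', 'type', 'features', 'classes', 'prevs'
    print(tr_stats['instances'], tr_stats['prevs'])
    data.stats()                                 # prints one line combining training and test statistics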

View File

@@ -53,6 +53,8 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
     if min_df is not None:
         reduce_columns(data, min_df=min_df, inplace=True)

+    data.name = dataset_name
+
     return data

@@ -116,6 +118,8 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
     if min_df is not None:
         reduce_columns(data, min_df=min_df, inplace=True)

+    data.name = dataset_name
+
     return data
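Because the fetch functions now stamp data.name, the summary printed by Dataset.stats() identifies which dataset it describes. A small sketch, assuming 'kindle' is one of the available review datasets:

    import quapy as qp

    data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
    data.stats()   # the summary line now starts with "Name=kindle ..."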

View File

@@ -161,7 +161,7 @@ class ACC(AggregativeQuantifier):
     def __init__(self, learner:BaseEstimator):
         self.learner = learner

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
         """
         Trains a ACC quantifier
         :param data: the training set

@@ -244,7 +244,7 @@ class PACC(AggregativeProbabilisticQuantifier):
     def __init__(self, learner:BaseEstimator):
         self.learner = learner

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
         """
         Trains a PACC quantifier
         :param data: the training set

@@ -358,7 +358,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     def __init__(self, learner: BaseEstimator):
         self.learner = learner

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.4):
         """
         Trains a HDy quantifier
         :param data: the training set
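The default validation split used by ACC, PACC and HDy grows from 30% to 40% of the training data; callers that relied on the old behaviour can still request it explicitly. A minimal sketch, assuming a LabelledCollection named train:

    from sklearn.linear_model import LogisticRegression
    import quapy as qp

    acc = qp.method.aggregative.ACC(LogisticRegression())
    acc.fit(train)                 # now holds out 40% of train for the adjustment
    acc.fit(train, val_split=0.3)  # previous behaviour, made explicit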

View File

@@ -90,7 +90,7 @@ class GridSearchQ(BaseQuantifier):
         elif eval_budget is None:
             self.n_prevpoints = n_prevpoints
             eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)
-            self.sout(f'{eval_computations} evaluations will be performed for each\n'
+            self.sout(f'{eval_computations} evaluations will be performed for each '
                       f'combination of hyper-parameters')
         else:
             eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, n_repetitions)

@@ -169,3 +169,8 @@ class GridSearchQ(BaseQuantifier):
     def get_params(self, deep=True):
         return self.param_grid
+
+    def best_model(self):
+        if hasattr(self, 'best_model_'):
+            return self.best_model_
+        raise ValueError('best_model called before fit')
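The new best_model() accessor returns the quantifier selected by the grid search and raises if called too early. A sketch of the intended call order, mirroring the experiment script added below:

    from sklearn.linear_model import LogisticRegression
    import numpy as np
    import quapy as qp

    devel = qp.datasets.fetch_twitter('hcr', for_model_selection=True, min_df=5, pickle=True)
    grid = qp.model_selection.GridSearchQ(
        qp.method.aggregative.ACC(LogisticRegression()),
        param_grid={'C': np.logspace(-4, 5, 10), 'class_weight': [None, 'balanced']},
        sample_size=100, n_prevpoints=21, n_repetitions=5, error='mae', refit=False
    )
    grid.fit(devel.training, devel.test)   # model selection against the held-out devel test
    model = grid.best_model()              # would raise ValueError if called before fit()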

tweet_sent_quant.py (new file, 137 lines)
View File

@@ -0,0 +1,137 @@
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
import numpy as np
import os
import sys
import pickle


qp.environ['SAMPLE_SIZE'] = 100
sample_size = qp.environ['SAMPLE_SIZE']


def evaluate_experiment(true_prevalences, estim_prevalences, n_repetitions=25):
    #n_classes = true_prevalences.shape[1]
    #true_ave = true_prevalences.reshape(-1, n_repetitions, n_classes).mean(axis=1)
    #estim_ave = estim_prevalences.reshape(-1, n_repetitions, n_classes).mean(axis=1)
    #estim_std = estim_prevalences.reshape(-1, n_repetitions, n_classes).std(axis=1)
    #print('\nTrueP->mean(Phat)(std(Phat))\n'+'='*22)
    #for true, estim, std in zip(true_ave, estim_ave, estim_std):
    #    str_estim = ', '.join([f'{mean:.3f}+-{std:.4f}' for mean, std in zip(estim, std)])
    #    print(f'{F.strprev(true)}->[{str_estim}]')
    print('\nEvaluation Metrics:\n'+'='*22)
    for eval_measure in [qp.error.mae, qp.error.mrae]:
        err = eval_measure(true_prevalences, estim_prevalences)
        print(f'\t{eval_measure.__name__}={err:.4f}')
    print()


def evaluate_method_point_test(method, test):
    # single estimate on the natural (unaltered) prevalence of the test set
    estim_prev = method.quantify(test.instances)
    true_prev = F.prevalence_from_labels(test.labels, test.n_classes)
    print('\nPoint-Test evaluation:\n' + '=' * 22)
    print(f'true-prev={F.strprev(true_prev)}, estim-prev={F.strprev(estim_prev)}')
    for eval_measure in [qp.error.mae, qp.error.mrae]:
        err = eval_measure(true_prev, estim_prev)
        print(f'\t{eval_measure.__name__}={err:.4f}')


def quantification_models():
    # yields (name, quantifier, hyper-parameter grid) triples to be evaluated
    def newLR():
        return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
    __C_range = np.logspace(-4, 5, 10)
    lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
    #yield 'cc', qp.method.aggregative.CC(newLR()), lr_params
    yield 'acc', qp.method.aggregative.ACC(newLR()), lr_params
    #yield 'pcc', qp.method.aggregative.PCC(newLR()), lr_params
    #yield 'pacc', qp.method.aggregative.PACC(newLR()), lr_params


def result_path(dataset_name, model_name, optim_metric):
    return f'{dataset_name}-{model_name}-{optim_metric}.pkl'


def check_already_computed(dataset_name, model_name, optim_metric):
    path = result_path(dataset_name, model_name, optim_metric)
    return os.path.exists(path)


def save_results(dataset_name, model_name, optim_metric, *results):
    path = result_path(dataset_name, model_name, optim_metric)
    qp.util.create_parent_dir(path)
    with open(path, 'wb') as foo:
        pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':

    np.random.seed(0)

    for dataset_name in ['hcr']:  # qp.datasets.TWITTER_SENTIMENT_DATASETS:

        # model selection is carried out on the devel version of the dataset (for_model_selection=True)
        benchmark_devel = qp.datasets.fetch_twitter(dataset_name, for_model_selection=True, min_df=5, pickle=True)
        benchmark_devel.stats()

        for model_name, model, hyperparams in quantification_models():

            model_selection = qp.model_selection.GridSearchQ(
                model,
                param_grid=hyperparams,
                sample_size=sample_size,
                n_prevpoints=21,
                n_repetitions=5,
                error='mae',
                refit=False,
                verbose=True
            )

            model_selection.fit(benchmark_devel.training, benchmark_devel.test)
            model = model_selection.best_model()

            # final evaluation is carried out on the full training/test version of the dataset
            benchmark_eval = qp.datasets.fetch_twitter(dataset_name, for_model_selection=False, min_df=5, pickle=True)
            model.fit(benchmark_eval.training)

            true_prevalences, estim_prevalences = qp.evaluation.artificial_sampling_prediction(
                model,
                test=benchmark_eval.test,
                sample_size=sample_size,
                n_prevpoints=21,
                n_repetitions=25
            )

            evaluate_experiment(true_prevalences, estim_prevalences, n_repetitions=25)
            evaluate_method_point_test(model, benchmark_eval.test)
            #save_arrays(FLAGS.results, true_prevalences, estim_prevalences, test_name)

    sys.exit(0)
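    # everything below this point comes after sys.exit(0) and is never executed; it references
    # names (FLAGS, method, test_set, and the spelled-out quantifier classes) not defined in this script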
    # decide the test to be performed (in the case of 'semeval', tests are 'semeval13', 'semeval14', 'semeval15')
    if FLAGS.dataset == 'semeval':
        test_sets = ['semeval13', 'semeval14', 'semeval15']
    else:
        test_sets = [FLAGS.dataset]

    evaluate_method_point_test(method, benchmark_eval.test, test_name=test_set)

    # quantifiers:
    # ----------------------------------------
    # alias for quantifiers and default configurations
    QUANTIFIER_ALIASES = {
        'cc': lambda learner: ClassifyAndCount(learner),
        'acc': lambda learner: AdjustedClassifyAndCount(learner),
        'pcc': lambda learner: ProbabilisticClassifyAndCount(learner),
        'pacc': lambda learner: ProbabilisticAdjustedClassifyAndCount(learner),
        'emq': lambda learner: ExpectationMaximizationQuantifier(learner),
        'svmq': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='q'),
        'svmkld': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='kld'),
        'svmnkld': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='nkld'),
        'svmmae': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='mae'),
        'svmmrae': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='mrae'),
        'mlpe': lambda learner: MaximumLikelihoodPrevalenceEstimation(),
    }
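save_results pickles whatever tuple it receives under the path built by result_path; the script defines it but does not call it yet. A sketch of reading such a file back, assuming save_results was invoked with (true_prevalences, estim_prevalences) for dataset 'hcr', model 'acc' and metric 'mae':

    import pickle

    # result_path is the helper defined in the script above
    with open(result_path('hcr', 'acc', 'mae'), 'rb') as fin:
        true_prevalences, estim_prevalences = pickle.load(fin)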