diff --git a/TODO.txt b/TODO.txt
index 1c58be2..16de883 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -23,4 +23,5 @@ Explore the hyperparameter "number of bins" in HDy
 Implement HDy for single-label?
 Rename EMQ to SLD ?
 How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
-    to one always?
\ No newline at end of file
+    to one always?
+Parallelize the kFCV in ACC and PACC
\ No newline at end of file
diff --git a/quapy/__init__.py b/quapy/__init__.py
index a2f98fd..9d368bf 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -6,6 +6,7 @@
 from . import data
 from . import evaluation
 from . import plot
 from . import util
+from . import model_selection
 
 from method.aggregative import isaggregative, isprobabilistic
diff --git a/quapy/data/base.py b/quapy/data/base.py
index c75804e..bca3e85 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -132,26 +132,34 @@ class LabelledCollection:
     def Xy(self):
         return self.instances, self.labels
 
-    def stats(self):
+    def stats(self, show=True):
         ninstances = len(self)
         instance_type = type(self.instances[0])
         if instance_type == list:
             nfeats = len(self.instances[0])
-        elif instance_type == np.ndarray:
+        elif instance_type == np.ndarray or issparse(self.instances):
             nfeats = self.instances.shape[1]
         else:
             nfeats = '?'
-        print(f'#instances={ninstances}, type={instance_type}, features={nfeats}, n_classes={self.n_classes}, '
-              f'prevs={strprev(self.prevalence())}')
+        stats_ = {'instances': ninstances,
+                  'type': instance_type,
+                  'features': nfeats,
+                  'classes': self.n_classes,
+                  'prevs': strprev(self.prevalence())}
+        if show:
+            print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
+                  f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
+        return stats_
 
 
 class Dataset:
 
-    def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None):
+    def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
         assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
         self.training = training
         self.test = test
         self.vocabulary = vocabulary
+        self.name = name
 
     @classmethod
     def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
@@ -175,6 +183,13 @@ class Dataset:
     def vocabulary_size(self):
         return len(self.vocabulary)
 
+    def stats(self):
+        tr_stats = self.training.stats(show=False)
+        te_stats = self.test.stats(show=False)
+        print(f'Name={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
+              f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
+              f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
+
 
 def isbinary(data):
     if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index b919959..54bfbfb 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -53,6 +53,8 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
     if min_df is not None:
         reduce_columns(data, min_df=min_df, inplace=True)
 
+    data.name = dataset_name
+
     return data
 
 
@@ -116,6 +118,8 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
     if min_df is not None:
         reduce_columns(data, min_df=min_df, inplace=True)
 
+    data.name = dataset_name
+
     return data
 
 
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 5a04123..76952a4 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -161,7 +161,7 @@ class ACC(AggregativeQuantifier):
     def __init__(self, learner:BaseEstimator):
         self.learner = learner
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
         """
         Trains a ACC quantifier
         :param data: the training set
@@ -244,7 +244,7 @@ class PACC(AggregativeProbabilisticQuantifier):
     def __init__(self, learner:BaseEstimator):
         self.learner = learner
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
         """
         Trains a PACC quantifier
         :param data: the training set
@@ -358,7 +358,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     def __init__(self, learner: BaseEstimator):
         self.learner = learner
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.4):
         """
         Trains a HDy quantifier
         :param data: the training set
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 7731d2a..039dbb9 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -90,7 +90,7 @@ class GridSearchQ(BaseQuantifier):
         elif eval_budget is None:
             self.n_prevpoints = n_prevpoints
             eval_computations = F.num_prevalence_combinations(self.n_prevpoints, n_classes, n_repetitions)
-            self.sout(f'{eval_computations} evaluations will be performed for each\n'
+            self.sout(f'{eval_computations} evaluations will be performed for each '
                       f'combination of hyper-parameters')
         else:
             eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, n_repetitions)
@@ -169,3 +169,8 @@ class GridSearchQ(BaseQuantifier):
 
     def get_params(self, deep=True):
         return self.param_grid
+    def best_model(self):
+        if hasattr(self, 'best_model_'):
+            return self.best_model_
+        raise ValueError('best_model called before fit')
+
diff --git a/tweet_sent_quant.py b/tweet_sent_quant.py
new file mode 100644
index 0000000..3965c0f
--- /dev/null
+++ b/tweet_sent_quant.py
@@ -0,0 +1,137 @@
+from sklearn.linear_model import LogisticRegression
+import quapy as qp
+import quapy.functional as F
+import numpy as np
+import os
+import sys
+import pickle
+
+qp.environ['SAMPLE_SIZE'] = 100
+sample_size = qp.environ['SAMPLE_SIZE']
+
+
+
+def evaluate_experiment(true_prevalences, estim_prevalences, n_repetitions=25):
+    #n_classes = true_prevalences.shape[1]
+    #true_ave = true_prevalences.reshape(-1, n_repetitions, n_classes).mean(axis=1)
+    #estim_ave = estim_prevalences.reshape(-1, n_repetitions, n_classes).mean(axis=1)
+    #estim_std = estim_prevalences.reshape(-1, n_repetitions, n_classes).std(axis=1)
+    #print('\nTrueP->mean(Phat)(std(Phat))\n'+'='*22)
+    #for true, estim, std in zip(true_ave, estim_ave, estim_std):
+    #    str_estim = ', '.join([f'{mean:.3f}+-{std:.4f}' for mean, std in zip(estim, std)])
+    #    print(f'{F.strprev(true)}->[{str_estim}]')
+
+    print('\nEvaluation Metrics:\n'+'='*22)
+    for eval_measure in [qp.error.mae, qp.error.mrae]:
+        err = eval_measure(true_prevalences, estim_prevalences)
+        print(f'\t{eval_measure.__name__}={err:.4f}')
+    print()
+
+
+def evaluate_method_point_test(method, test):
+    estim_prev = method.quantify(test.instances)
+    true_prev = F.prevalence_from_labels(test.labels, test.n_classes)
+    print('\nPoint-Test evaluation:\n' + '=' * 22)
+    print(f'true-prev={F.strprev(true_prev)}, estim-prev={F.strprev(estim_prev)}')
+    for eval_measure in [qp.error.mae, qp.error.mrae]:
+        err = eval_measure(true_prev, estim_prev)
+        print(f'\t{eval_measure.__name__}={err:.4f}')
+
+
+def quantification_models():
+    def newLR():
+        return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
+    __C_range = np.logspace(-4, 5, 10)
+    lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
+    #yield 'cc', qp.method.aggregative.CC(newLR()), lr_params
+    yield 'acc', qp.method.aggregative.ACC(newLR()), lr_params
+    #yield 'pcc', qp.method.aggregative.PCC(newLR()), lr_params
+    #yield 'pacc', qp.method.aggregative.PACC(newLR()), lr_params
+
+
+def result_path(dataset_name, model_name, optim_metric):
+    return f'{dataset_name}-{model_name}-{optim_metric}.pkl'
+
+
+def check_already_computed(dataset_name, model_name, optim_metric):
+    path = result_path(dataset_name, model_name, optim_metric)
+    return os.path.exists(path)
+
+
+def save_results(dataset_name, model_name, optim_metric, *results):
+    path = result_path(dataset_name, model_name, optim_metric)
+    qp.util.create_parent_dir(path)
+    with open(path, 'wb') as foo:
+        pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)
+
+
+if __name__ == '__main__':
+
+    np.random.seed(0)
+
+    for dataset_name in ['hcr']:  # qp.datasets.TWITTER_SENTIMENT_DATASETS:
+
+        benchmark_devel = qp.datasets.fetch_twitter(dataset_name, for_model_selection=True, min_df=5, pickle=True)
+        benchmark_devel.stats()
+
+        for model_name, model, hyperparams in quantification_models():
+
+            model_selection = qp.model_selection.GridSearchQ(
+                model,
+                param_grid=hyperparams,
+                sample_size=sample_size,
+                n_prevpoints=21,
+                n_repetitions=5,
+                error='mae',
+                refit=False,
+                verbose=True
+            )
+
+            model_selection.fit(benchmark_devel.training, benchmark_devel.test)
+            model = model_selection.best_model()
+
+            benchmark_eval = qp.datasets.fetch_twitter(dataset_name, for_model_selection=False, min_df=5, pickle=True)
+            model.fit(benchmark_eval.training)
+            true_prevalences, estim_prevalences = qp.evaluation.artificial_sampling_prediction(
+                model,
+                test=benchmark_eval.test,
+                sample_size=sample_size,
+                n_prevpoints=21,
+                n_repetitions=25
+            )
+
+            evaluate_experiment(true_prevalences, estim_prevalences, n_repetitions=25)
+            evaluate_method_point_test(model, benchmark_eval.test)
+
+            #save_arrays(FLAGS.results, true_prevalences, estim_prevalences, test_name)
+
+    sys.exit(0)
+
+    # decide the test to be performed (in the case of 'semeval', tests are 'semeval13', 'semeval14', 'semeval15')
+    if FLAGS.dataset == 'semeval':
+        test_sets = ['semeval13', 'semeval14', 'semeval15']
+    else:
+        test_sets = [FLAGS.dataset]
+
+    evaluate_method_point_test(method, benchmark_eval.test, test_name=test_set)
+
+
+
+
+# quantifiers:
+# ----------------------------------------
+# alias for quantifiers and default configurations
+QUANTIFIER_ALIASES = {
+    'cc': lambda learner: ClassifyAndCount(learner),
+    'acc': lambda learner: AdjustedClassifyAndCount(learner),
+    'pcc': lambda learner: ProbabilisticClassifyAndCount(learner),
+    'pacc': lambda learner: ProbabilisticAdjustedClassifyAndCount(learner),
+    'emq': lambda learner: ExpectationMaximizationQuantifier(learner),
+    'svmq': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='q'),
+    'svmkld': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='kld'),
+    'svmnkld': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='nkld'),
+    'svmmae': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='mae'),
+    'svmmrae': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='mrae'),
+    'mlpe': lambda learner: MaximumLikelihoodPrevalenceEstimation(),
+}
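Usage sketch (not part of the patch). A minimal example of the pieces this changeset introduces, assuming the QuaPy API exactly as it appears in the diff above: the dict-returning LabelledCollection.stats(show=False), the new Dataset.stats() and Dataset.name, and GridSearchQ.best_model(). The 'hcr' dataset and the ACC + LogisticRegression setup mirror the new tweet_sent_quant.py script; the hyper-parameter grid values are illustrative only.

    from sklearn.linear_model import LogisticRegression
    import quapy as qp

    qp.environ['SAMPLE_SIZE'] = 100

    # fetch_twitter now tags the Dataset with its name, so Dataset.stats()
    # prints a one-line summary of the training and test collections, while
    # LabelledCollection.stats(show=False) returns the same figures as a dict
    data = qp.datasets.fetch_twitter('hcr', for_model_selection=True, min_df=5, pickle=True)
    data.stats()
    tr_stats = data.training.stats(show=False)  # keys: instances, type, features, classes, prevs

    # model_selection is now exposed as qp.model_selection; best_model() returns the
    # best quantifier found during fit() and raises ValueError if called before fit()
    grid = qp.model_selection.GridSearchQ(
        qp.method.aggregative.ACC(LogisticRegression(max_iter=1000)),
        param_grid={'C': [0.1, 1, 10], 'class_weight': [None, 'balanced']},  # illustrative grid
        sample_size=qp.environ['SAMPLE_SIZE'],
        n_prevpoints=21,
        n_repetitions=5,
        error='mae',
        refit=False,
        verbose=True
    )
    grid.fit(data.training, data.test)
    best = grid.best_model()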