diff --git a/laboratory/main_lequa.py b/laboratory/main_lequa.py
index 2834d23..7eabd50 100644
--- a/laboratory/main_lequa.py
+++ b/laboratory/main_lequa.py
@@ -27,7 +27,7 @@ if __name__ == '__main__':
         'classifier__class_weight': ['balanced', None]
     }

-    for method in ['PACC', 'SLD', 'DM', 'KDE', 'HDy', 'DIR']:
+    for method in ['KDE', 'PACC', 'SLD', 'DM', 'HDy-OvA', 'DIR']:

         #if os.path.exists(result_path):
         #    print('Result already exists. Nothing to do')
@@ -43,7 +43,7 @@ if __name__ == '__main__':
         dataset = 'T1B'
         train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)

-        print('init', dataset)
+        print(f'init {dataset} #instances: {len(train)}')

         if method == 'KDE':
             param_grid = {
@@ -51,6 +51,11 @@ if __name__ == '__main__':
                 'bandwidth': np.linspace(0.001, 0.2, 21),
                 'classifier__C': np.logspace(-4,4,9),
                 'classifier__class_weight': ['balanced', None]
             }
             quantifier = KDEy(LogisticRegression(), target='max_likelihood')
+        elif method == 'KDE-debug':
+            param_grid = None
+            qp.environ['N_JOBS'] = 1
+            quantifier = KDEy(LogisticRegression(), target='max_likelihood', bandwidth=0.02)
+            #train = train.sampling(280, *[1./train.n_classes]*(train.n_classes-1))
         elif method == 'DIR':
             param_grid = hyper_LR
             quantifier = DIRy(LogisticRegression())
@@ -62,7 +67,7 @@ if __name__ == '__main__':
             quantifier = PACC(LogisticRegression())
         elif method == 'HDy-OvA':
             param_grid = {
-                'binary_quantifier__classifier__C': np.logspace(-4,4,9),
+                'binary_quantifier__classifier__C': np.logspace(-3,3,9),
                 'binary_quantifier__classifier__class_weight': ['balanced', None]
             }
             quantifier = OneVsAllAggregative(HDy(LogisticRegression()))
@@ -76,13 +81,17 @@ if __name__ == '__main__':
         else:
             raise NotImplementedError('unknown method', method)

-        modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)
+        if param_grid is not None:
+            modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)

-        modsel.fit(train)
-        print(f'best params {modsel.best_params_}')
-        pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+            modsel.fit(train)
+            print(f'best params {modsel.best_params_}')
+            pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

-        quantifier = modsel.best_model()
+            quantifier = modsel.best_model()
+        else:
+            print('debug mode... skipping model selection')
+            quantifier.fit(train)

         report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
         means = report.mean()
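For reference, the new KDE-debug branch amounts to the following minimal run: fit KDEy with a fixed bandwidth on LeQua T1B and evaluate it directly, skipping model selection. This is only a sketch assembled from the calls that appear in the patch above; bandwidth=0.02 is the hard-coded debugging value, not a tuned setting.

    from sklearn.linear_model import LogisticRegression
    import quapy as qp
    from method_kdey import KDEy   # the local module patched below

    qp.environ['N_JOBS'] = 1  # single-threaded, easier to step through

    # LeQua2022 T1B: returns the training collection plus validation/test protocols
    train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1B')

    quantifier = KDEy(LogisticRegression(), target='max_likelihood', bandwidth=0.02)
    quantifier.fit(train)

    report = qp.evaluation.evaluation_report(
        quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
    print(report.mean())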
diff --git a/laboratory/main_tweets.py b/laboratory/main_tweets.py
index c07efbb..0bbed1f 100644
--- a/laboratory/main_tweets.py
+++ b/laboratory/main_tweets.py
@@ -6,12 +6,13 @@ import sys
 import pandas as pd

 import quapy as qp
-from quapy.method.aggregative import EMQ, DistributionMatching, PACC, HDy, OneVsAllAggregative
+from quapy.method.aggregative import EMQ, DistributionMatching, PACC, ACC, CC, PCC, HDy, OneVsAllAggregative
 from method_kdey import KDEy
 from method_dirichlety import DIRy
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP

+SEED = 1

 if __name__ == '__main__':
@@ -29,7 +30,7 @@ if __name__ == '__main__':
         'classifier__class_weight': ['balanced', None]
     }

-    for method in ['PACC', 'SLD', 'DM', 'KDE', 'HDy', 'DIR']:
+    for method in ['KDE-nomonte', 'KDE-monte2', 'SLD', 'KDE-kfcv']:  # , 'DIR', 'DM', 'HDy-OvA', 'CC', 'ACC', 'PCC']:

         #if os.path.exists(result_path):
         #    print('Result already exists. Nothing to do')
@@ -49,69 +50,100 @@ if __name__ == '__main__':
     for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:

         print('init', dataset)
-
-        is_semeval = dataset.startswith('semeval')
-
-        if not is_semeval or not semeval_trained:
-
-            if method == 'KDE':
-                param_grid = {
-                    'bandwidth': np.linspace(0.001, 0.2, 21),
-                    'classifier__C': np.logspace(-4,4,9),
-                    'classifier__class_weight': ['balanced', None]
-                }
-                quantifier = KDEy(LogisticRegression(), target='max_likelihood')
-            elif method == 'DIR':
-                param_grid = hyper_LR
-                quantifier = DIRy(LogisticRegression())
-            elif method == 'SLD':
-                param_grid = hyper_LR
-                quantifier = EMQ(LogisticRegression())
-            elif method == 'PACC':
-                param_grid = hyper_LR
-                quantifier = PACC(LogisticRegression())
-            elif method == 'HDy-OvA':
-                param_grid = {
-                    'binary_quantifier__classifier__C': np.logspace(-4,4,9),
-                    'binary_quantifier__classifier__class_weight': ['balanced', None]
-                }
-                quantifier = OneVsAllAggregative(HDy(LogisticRegression()))
-            elif method == 'DM':
-                param_grid = {
-                    'nbins': [5,10,15],
-                    'classifier__C': np.logspace(-4,4,9),
-                    'classifier__class_weight': ['balanced', None]
-                }
-                quantifier = DistributionMatching(LogisticRegression())
-            else:
-                raise NotImplementedError('unknown method', method)
-
-            # model selection
-            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
-
-            protocol = UPP(data.test, repeats=n_bags_val)
-            modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
-
-            modsel.fit(data.training)
-            print(f'best params {modsel.best_params_}')
-            pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
-
-            quantifier = modsel.best_model()
-
-            if is_semeval:
-                semeval_trained = True
-
-        else:
-            print(f'model selection for {dataset} already done; skipping')
+        with qp.util.temp_seed(SEED):

-        data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
-        quantifier.fit(data.training)
-        protocol = UPP(data.test, repeats=n_bags_test)
-        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
-        report.to_csv(result_path+'.dataframe')
-        means = report.mean()
-        csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
-        csv.flush()
+            is_semeval = dataset.startswith('semeval')
+
+            if not is_semeval or not semeval_trained:
+
+                if method == 'KDE':
+                    param_grid = {
+                        'bandwidth': np.linspace(0.001, 0.2, 21),
+                        'classifier__C': np.logspace(-4,4,9),
+                        'classifier__class_weight': ['balanced', None]
+                    }
+                    quantifier = KDEy(LogisticRegression(), target='max_likelihood')
+                elif method == 'KDE-kfcv':
+                    param_grid = {
+                        'bandwidth': np.linspace(0.001, 0.2, 21),
+                        'classifier__C': np.logspace(-4,4,9),
+                        'classifier__class_weight': ['balanced', None]
+                    }
+                    quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10)
+                elif method in ['KDE-monte2']:
+                    param_grid = {
+                        'bandwidth': np.linspace(0.001, 0.2, 21),
+                    }
+                    quantifier = KDEy(LogisticRegression(), target='min_divergence')
+                elif method in ['KDE-nomonte']:
+                    param_grid = {
+                        'bandwidth': np.linspace(0.001, 0.2, 21),
+                    }
+                    quantifier = KDEy(LogisticRegression(), target='max_likelihood')
+                elif method == 'DIR':
+                    param_grid = hyper_LR
+                    quantifier = DIRy(LogisticRegression())
+                elif method == 'SLD':
+                    param_grid = hyper_LR
+                    quantifier = EMQ(LogisticRegression())
+                elif method == 'PACC':
+                    param_grid = hyper_LR
+                    quantifier = PACC(LogisticRegression())
+                elif method == 'PACC-kfcv':
+                    param_grid = hyper_LR
+                    quantifier = PACC(LogisticRegression(), val_split=10)
+                elif method == 'PCC':
+                    param_grid = hyper_LR
+                    quantifier = PCC(LogisticRegression())
+                elif method == 'ACC':
+                    param_grid = hyper_LR
+                    quantifier = ACC(LogisticRegression())
+                elif method == 'CC':
+                    param_grid = hyper_LR
+                    quantifier = CC(LogisticRegression())
+                elif method == 'HDy-OvA':
+                    param_grid = {
+                        'binary_quantifier__classifier__C': np.logspace(-4,4,9),
+                        'binary_quantifier__classifier__class_weight': ['balanced', None]
+                    }
+                    quantifier = OneVsAllAggregative(HDy(LogisticRegression()))
+                elif method == 'DM':
+                    param_grid = {
+                        'nbins': [5,10,15],
+                        'classifier__C': np.logspace(-4,4,9),
+                        'classifier__class_weight': ['balanced', None]
+                    }
+                    quantifier = DistributionMatching(LogisticRegression())
+                else:
+                    raise NotImplementedError('unknown method', method)
+
+                # model selection
+                data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
+
+                protocol = UPP(data.test, repeats=n_bags_val)
+                modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
+
+                modsel.fit(data.training)
+                print(f'best params {modsel.best_params_}')
+                pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+
+                quantifier = modsel.best_model()
+
+                if is_semeval:
+                    semeval_trained = True
+
+            else:
+                print(f'model selection for {dataset} already done; skipping')
+
+            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
+            quantifier.fit(data.training)
+            protocol = UPP(data.test, repeats=n_bags_test)
+            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
+            report.to_csv(result_path+'.dataframe')
+            means = report.mean()
+            csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+            csv.flush()


 df = pd.read_csv(result_path+'.csv', sep='\t')
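The substantive change above is that all per-dataset work now runs inside qp.util.temp_seed(SEED). Since UPP draws random bags, results are only comparable across runs and methods if the global numpy RNG is in a known state; temp_seed seeds it on entry and restores the previous state on exit. A minimal sketch of the semantics being relied on (values are hypothetical):

    import numpy as np
    import quapy as qp

    with qp.util.temp_seed(1):
        a = np.random.rand(3)
    with qp.util.temp_seed(1):
        b = np.random.rand(3)
    assert np.allclose(a, b)  # identically seeded blocks yield identical draws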
diff --git a/laboratory/method_kdey.py b/laboratory/method_kdey.py
index 6aa89be..30edc3c 100644
--- a/laboratory/method_kdey.py
+++ b/laboratory/method_kdey.py
@@ -30,7 +30,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
     TARGET = ['min_divergence', 'max_likelihood']

     def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='HD',
-                 bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None):
+                 bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None, random_state=0):
         assert bandwidth in KDEy.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
             f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
         assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
@@ -42,6 +42,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
         self.engine = engine
         self.target = target
         self.n_jobs = n_jobs
+        self.random_state = random_state

     def search_bandwidth_maxlikelihood(self, posteriors, labels):
         grid = {'bandwidth': np.linspace(0.001, 0.2, 100)}
@@ -84,14 +85,20 @@ class KDEy(AggregativeProbabilisticQuantifier):
             kde = scipy.stats.gaussian_kde(posteriors)
             kde.set_bandwidth(self.bandwidth)
         elif self.engine == 'sklearn':
+            #print('fitting kde')
             kde = KernelDensity(bandwidth=self.bandwidth).fit(posteriors)
+            #print('[fitting done]')
         return kde

     def pdf(self, kde, posteriors):
         if self.engine == 'scipy':
             return kde(posteriors[:, :-1].T)
         elif self.engine == 'sklearn':
-            return np.exp(kde.score_samples(posteriors))
+            #print('pdf...')
+            densities = np.exp(kde.score_samples(posteriors))
+            #print('[pdf done]')
+            return densities
+            #return np.exp(kde.score_samples(posteriors))

     def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
         """
@@ -118,13 +125,13 @@ class KDEy(AggregativeProbabilisticQuantifier):

         return self

-    def val_pdf(self, prev):
+    #def val_pdf(self, prev):
         """
         Returns a function that computes the mixture model with the given prev as mixture factor

         :param prev: a prevalence vector, ndarray
         :return: a function implementing the validation distribution with fixed mixture factor
         """
-        return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
+    #    return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))

     def aggregate(self, posteriors: np.ndarray):
         if self.target == 'min_divergence':
@@ -134,14 +141,9 @@ class KDEy(AggregativeProbabilisticQuantifier):
         else:
             raise ValueError('unknown target')

-    def _target_divergence(self, posteriors):
-        """
-        Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
-        (the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
-
-        :param instances: instances in the sample
-        :return: a vector of class prevalence estimates
-        """
+    def _target_divergence_depr(self, posteriors):
+        # this variant is, I think, ill-formed, since it evaluates the likelihood on the test points, which are
+        # overconfident in the KDE-test.
         test_density = self.get_kde(posteriors)
         # val_test_posteriors = np.concatenate([self.val_posteriors, posteriors])
         test_likelihood = self.pdf(test_density, posteriors)
@@ -164,6 +166,31 @@ class KDEy(AggregativeProbabilisticQuantifier):
         r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
         return r.x

+    def _target_divergence(self, posteriors, montecarlo_samples=5000):
+        # in this variant we evaluate the divergence using a Monte Carlo approach
+        n_classes = len(self.val_densities)
+        samples = qp.functional.uniform_prevalence_sampling(n_classes, size=montecarlo_samples)
+
+        test_kde = self.get_kde(posteriors)
+        test_likelihood = self.pdf(test_kde, samples)
+
+        divergence = _get_divergence(self.divergence)
+
+        sample_densities = [self.pdf(kde_i, samples) for kde_i in self.val_densities]
+
+        def match(prev):
+            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, sample_densities))
+            return divergence(val_likelihood, test_likelihood)
+
+        # the initial point is set as the uniform distribution
+        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+        # solutions are bounded to those contained in the unit-simplex
+        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+        return r.x
+
     def _target_likelihood(self, posteriors, eps=0.000001):
         """
         Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
@@ -172,13 +199,20 @@ class KDEy(AggregativeProbabilisticQuantifier):
         :param instances: instances in the sample
         :return: a vector of class prevalence estimates
         """
+        np.random.RandomState(self.random_state)  # note: a no-op as written (the created RandomState is never used)
         n_classes = len(self.val_densities)
+        test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.val_densities]
+        #return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))

         def neg_loglikelihood(prev):
-            val_pdf = self.val_pdf(prev)
-            test_likelihood = val_pdf(posteriors)
-            test_loglikelihood = np.log(test_likelihood + eps)
-            return -np.sum(test_loglikelihood)
+            #print('-neg_likelihood')
+            #val_pdf = self.val_pdf(prev)
+            #test_likelihood = val_pdf(posteriors)
+            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+            test_loglikelihood = np.log(test_mixture_likelihood + eps)
+            neg_log_likelihood = -np.sum(test_loglikelihood)
+            #print('-neg_likelihood [done!]')
+            return neg_log_likelihood
             #return -np.prod(test_likelihood)

         # the initial point is set as the uniform distribution
@@ -187,5 +221,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
         # solutions are bounded to those contained in the unit-simplex
         bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
         constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+        #print('searching for alpha')
         r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+        #print('[optimization ended]')
         return r.x
\ No newline at end of file
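To make the max_likelihood target concrete: KDEy fits one KDE per class on validation posteriors, and the aggregation step searches for the mixture weights (the prevalence estimate) that maximize the likelihood of the test posteriors under that mixture. The following self-contained toy reproduces the same objective on hypothetical one-dimensional scores (made-up data and bandwidth; the real method operates on classifier posteriors):

    import numpy as np
    from scipy import optimize
    from sklearn.neighbors import KernelDensity

    rng = np.random.default_rng(0)
    # two classes with distinct score distributions; the test bag is 30%/70%
    train_scores = [rng.normal(0.2, 0.1, (200, 1)), rng.normal(0.7, 0.1, (200, 1))]
    test_scores = np.vstack([rng.normal(0.2, 0.1, (30, 1)), rng.normal(0.7, 0.1, (70, 1))])

    # one KDE per class, as in KDEy.fit
    kdes = [KernelDensity(bandwidth=0.05).fit(x) for x in train_scores]
    # class-conditional densities evaluated once on the test points, as in _target_likelihood
    test_densities = [np.exp(kde.score_samples(test_scores)) for kde in kdes]

    def neg_loglikelihood(prev, eps=1e-6):
        mixture = sum(p_i * d_i for p_i, d_i in zip(prev, test_densities))
        return -np.sum(np.log(mixture + eps))

    n = len(kdes)
    r = optimize.minimize(neg_loglikelihood, x0=np.full(n, 1/n), method='SLSQP',
                          bounds=[(0, 1)] * n,
                          constraints={'type': 'eq', 'fun': lambda x: 1 - x.sum()})
    print(r.x)  # close to the true prevalence (0.3, 0.7)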
diff --git a/laboratory/show_results.py b/laboratory/show_results.py
index 2810234..6bdcdfc 100644
--- a/laboratory/show_results.py
+++ b/laboratory/show_results.py
@@ -2,18 +2,18 @@ import sys
 from pathlib import Path
 import pandas as pd

-#result_dir = 'results_tweet_1000'
-result_dir = 'results_lequa'
+result_dir = 'results_tweet_1000'
+#result_dir = 'results_lequa'

 dfs = []

 pathlist = Path(result_dir).rglob('*.csv')
 for path in pathlist:
     path_in_str = str(path)
-    print(path_in_str)

     try:
         df = pd.read_csv(path_in_str, sep='\t')
+        df = df[df.iloc[:, 0] != df.columns[0]]  # drop rows that merely repeat the header
         if not df.empty:
             dfs.append(df)
     except Exception:
@@ -21,7 +21,7 @@ for path in pathlist:

 df = pd.concat(dfs)

-for err in ['MAE', 'MRAE']:
+for err in ['MAE', 'MRAE', 'KLD']:
     print('-'*100)
     print(err)
     print('-'*100)
diff --git a/laboratory/todo.txt b/laboratory/todo.txt
index 4042ab2..0dc9d09 100644
--- a/laboratory/todo.txt
+++ b/laboratory/todo.txt
@@ -4,23 +4,28 @@
 and the other a KDE on test), from which the divergence (the objective) is then computed; or else
 generate a single distribution (the mixture model of train) and take the likelihood of the test
 points as the objective to maximize.

-- keep the best hyperparameters at hand so they can be inspected
-- export the result dataframes so statistical tests can be run
+- take a look at the hyperparameters
 - make plots
+- study the case in which the target is to minimize a divergence. Possibilities:
+    - evaluate on the test points only
+    - evaluate an APP over the simplex?
+    - evaluate a UPP over the simplex? (= Monte Carlo)
+    - which divergences? HD, Topsoe, L1?

-
-1) clarify: only test?
 2) implement the auto
     - internal optimization for the likelihood [none seems to work well]
     - over everything (e.g., the whole training set)?
     - independently for each labelled set? (e.g., positives, negatives, neutrals, and test)
     - optimization as a GridSearchQ parameter
-3) clarify: Topsoe?
-4) another kind of model selection?
-5) increase the number of bags
-6) optimize the C parameter? optimize the kernel? optimize the distance?
+6) optimize the kernel? optimize the distance?
 7) sklearn's KDE or statsmodels' multivariate KDE?
    also check what this is (it seems to give P(Y|X), so perhaps the classifier could be removed?):
    https://www.statsmodels.org/dev/_modules/statsmodels/nonparametric/kernel_density.html#KDEMultivariateConditional
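Item 10 of the todo list is terse, so here is a minimal sketch of the KDE-based posterior it describes: one KDE per class, with each class likelihood normalized by the total likelihood over all classes. Everything below is a hypothetical illustration (uniform class priors assumed for simplicity), not code from this repository:

    import numpy as np
    from sklearn.neighbors import KernelDensity

    def kde_posteriors(class_data, X, bandwidth=0.1):
        # one KDE per class, fit on that class's instances
        kdes = [KernelDensity(bandwidth=bandwidth).fit(Xi) for Xi in class_data]
        # p(x|y) for every instance in X and every class y
        likelihoods = np.vstack([np.exp(kde.score_samples(X)) for kde in kdes]).T
        # P(y|x) with uniform priors: each class likelihood over the total
        return likelihoods / likelihoods.sum(axis=1, keepdims=True)

    rng = np.random.default_rng(0)
    class_data = [rng.normal(0, 1, (100, 2)), rng.normal(3, 1, (100, 2))]
    posteriors = kde_posteriors(class_data, rng.normal(0, 1, (5, 2)))
    print(posteriors)  # rows sum to 1 and mostly favour class 0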
-8) remove the last dimension in sklearn too?
-9) optimize for RAE instead of AE?
+8) remove the last dimension in sklearn too? I don't see why
+9) optimize for RAE instead of AE? It does not work well...
+10) Define a classifier that returns, for each class, a posterior computed as the likelihood under the class-conditional
+    KDE divided by the likelihood over all the classes (as Juanjo proposes) and plug it into EMQ (sketched right after
+    this diff). Also try the other way around: re-calibrate with EMQ and plug that into KDEy
+11) KDEx?
+12) Dirichlet (the DIR method) should be fixed and its results reported...
+13) Statistical tests.
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 9cc6441..8c80fba 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -123,7 +123,7 @@ class LabelledCollection:
             return self.uniform_sampling_index(size, random_state=random_state)
         if len(prevs) == self.n_classes - 1:
             prevs = prevs + (1 - sum(prevs),)
-        assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
+        assert len(prevs) == self.n_classes, f'unexpected number of prevalences (found {len(prevs)}, expected {self.n_classes})'
         assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'

         # Decide how many instances should be taken for each class in order to satisfy the requested prevalence
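For reference, the convention this assert guards: LabelledCollection.sampling accepts either n_classes prevalence values or n_classes-1, in which case the last one is inferred as the complement (the commented-out train.sampling(...) call in main_lequa.py uses exactly that form). A usage sketch on a hypothetical 3-class collection:

    import numpy as np
    from quapy.data.base import LabelledCollection

    X = np.random.rand(1000, 2)          # hypothetical instances
    y = np.random.choice(3, size=1000)   # three classes: 0, 1, 2
    data = LabelledCollection(X, y)

    sample = data.sampling(280, 0.5, 0.3)        # last prevalence inferred as 0.2
    sample = data.sampling(280, 0.5, 0.3, 0.2)   # equivalent, all values explicit
    # data.sampling(280, 0.5, 0.3, 0.1, 0.1) now fails with a message reporting
    # that it found 4 prevalence values but expected 3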