From 6dfa1d3536a32fd4b595effabaea96a22e2912a4 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Wed, 17 Apr 2024 11:50:37 +0200 Subject: [PATCH] testing first gp in binary data, with pdf table --- ClassifierAccuracy/gaussian_process.py | 175 ++++++++++++++++++++---- ClassifierAccuracy/models_multiclass.py | 10 ++ quapy/data/datasets.py | 7 +- 3 files changed, 165 insertions(+), 27 deletions(-) diff --git a/ClassifierAccuracy/gaussian_process.py b/ClassifierAccuracy/gaussian_process.py index fea988a..296ba79 100644 --- a/ClassifierAccuracy/gaussian_process.py +++ b/ClassifierAccuracy/gaussian_process.py @@ -1,26 +1,78 @@ +import os.path +import pickle +from pathlib import Path + +from sklearn.linear_model import LogisticRegression + +from method.aggregative import PACC, EMQ, KDEyML + +""" +Ideas: +Try kernel based on feature covariance matrix, with dot product and with another kernel +Try Cauchy-Schwarz kernel + +""" + import sklearn.metrics from sklearn.gaussian_process import GaussianProcessRegressor import numpy as np from sklearn.gaussian_process.kernels import RBF, GenericKernelMixin, Kernel from sklearn.metrics.pairwise import pairwise_distances, pairwise_kernels +from data import LabelledCollection +from protocol import UPP +from quapy.method.base import BaseQuantifier, BinaryQuantifier +import quapy.functional as F +from result_table.src.table import Table + np.random.seed(0) -class MinL2Kernel(GenericKernelMixin, Kernel): +class FeatCovKernel(GenericKernelMixin, Kernel): + def __init__(self, dimensions): + self.dimensions = dimensions + + def _f(self, sample1, sample2): + """ + kernel value between a pair of samples + """ + sample1 = sample1.reshape(-1, self.dimensions) + sample2 = sample2.reshape(-1, self.dimensions) + featCov1 = pairwise_distances(sample1.T, metric='correlation') + featCov2 = pairwise_distances(sample2.T, metric='correlation') + featDiffNorm = np.linalg.norm(featCov1-featCov2) + simil = np.exp(-featDiffNorm) + return simil + + def __call__(self, X, Y=None, eval_gradient=False): + if Y is None: + Y = X + + if eval_gradient: + raise NotImplementedError() + else: + return np.array([[self._f(x, y) for y in Y] for x in X]) + + def diag(self, X): + return np.array([self._f(x, x) for x in X]) + + def is_stationary(self): + return True + +class AveL2Kernel(GenericKernelMixin, Kernel): """ A minimal (but valid) convolutional kernel for sequences of variable lengths.""" - def __init__(self): - pass + def __init__(self, dimensions): + self.dimensions=dimensions def _f(self, sample1, sample2): """ kernel value between a pair of sequences """ - sample1 = sample1.reshape(-1, 3) - sample2 = sample2.reshape(-1, 3) + sample1 = sample1.reshape(-1, self.dimensions) + sample2 = sample2.reshape(-1, self.dimensions) dist = pairwise_distances(sample1, sample2) mean_dist = dist.mean() closenest = np.exp(-mean_dist) @@ -83,14 +135,15 @@ def RJSDk(sample_1, sample_2): pi1 = n1 / (n1 + n2) pi2 = n2 / (n1 + n2) Z = np.concatenate([sample_1, sample_2]) - # Kz = pairwise_kernels(Z, metric='rbf', n_jobs=-1) - Kz = pairwise_kernels(Z, metric='cosine', n_jobs=-1) + Kz = pairwise_kernels(Z, metric='rbf', n_jobs=-1) + # Kz = pairwise_kernels(Z, metric='cosine', n_jobs=-1) Kx = Kz[:n1, :n1] Ky = Kz[n1:, n1:] SKz = S(Kz) SKx = S(Kx) SKy = S(Ky) + return SKz - (pi1 * SKx + pi2 * SKy) def S(K): @@ -110,7 +163,7 @@ def target_function(X): return X[:,0]**3 + 2.1*X[:,1]**2 + X[:,0] + 0.1 -# X = np.random.rand(10,3) +# X = np.random.rand(14,3) # X /= X.sum(axis=1, keepdims=True) # Y = np.random.rand(10,3) # Y /= Y.sum(axis=1, keepdims=True) @@ -122,27 +175,99 @@ def target_function(X): # # print(d) # +# d = RJSDk(X, X) +# +# print(d) +# # import sys ; sys.exit(0) -X_train = [np.random.rand(10*3) for _ in range(15)] -y_train = [target_function(X).mean() for X in X_train] - -X_test = [np.random.rand(10*3) for _ in range(11)] -y_test = [target_function(X).mean() for X in X_test] - - -print('fit') -#kernel = 1 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) +# X_train = [np.random.rand(10*3) for _ in range(50)] +# y_train = [target_function(X).mean() for X in X_train] +# +# X_test = [np.random.rand(10*3) for _ in range(20)] +# y_test = [target_function(X).mean() for X in X_test] +# +# +# print('fit') +# # kernel = 1 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) # kernel = MinL2Kernel() -kernel = RJSDkernel() -gaussian_process = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9) -gaussian_process.fit(X_train, y_train) -print('[done]') +# # kernel = RJSDkernel() +# gaussian_process = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9) +# gaussian_process.fit(X_train, y_train) +# print('[done]') +# +# print(gaussian_process.kernel_) +# +# y_pred = gaussian_process.predict(X_test) +# +# mse = np.mean((y_test - y_pred)**2) +# +# print(mse) -print(gaussian_process.kernel_) +class GPQuantifier(BaseQuantifier): -y_pred = gaussian_process.predict(X_test) + def __init__(self, dimensions, kernel, num_tr_samples=20, size_tr_samples=50): + self.dimensions = dimensions + self.num_tr_samples = num_tr_samples + self.size_tr_samples = size_tr_samples + self.gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9) -mse = np.mean((y_test - y_pred)**2) + def fit(self, data: LabelledCollection): + sampler = UPP(data, sample_size=self.size_tr_samples, repeats=self.num_tr_samples) + Xs, ps = list(zip(*[(X,p) for X,p in sampler()])) + ps = [p[1] for p in ps] + Xs = [X.flatten() for X in Xs] + self.gp.fit(Xs, ps) + return self -print(mse) \ No newline at end of file + def quantify(self, instances): + X = [instances.flatten()] + p = self.gp.predict(X)[0] + return F.as_binary_prevalence(p, clip_if_necessary=True) + +import quapy as qp + +from quapy.data.datasets import fetch_UCIBinaryDataset, UCI_BINARY_DATASETS + +table = Table('avel2') +methodnames = ['AveL2','PACC', 'SLD', 'KDEyML'] + +for methodname in methodnames: + errors = [] + for dataset_name in UCI_BINARY_DATASETS: + if dataset_name in ['balance.2']: + continue + + result_path = f'./results_gp/{dataset_name}_{methodname}.pkl' + os.makedirs(Path(result_path).parent, exist_ok=True) + if os.path.exists(result_path): + aes = pickle.load(open(result_path, 'rb')) + else: + dataset = fetch_UCIBinaryDataset(dataset_name) + qp.data.preprocessing.standardize(dataset, inplace=True) + train, test = dataset.train_test + d = train.X.shape[1] + if methodname=='AveL2': + q = GPQuantifier(dimensions=d, kernel=AveL2Kernel(dimensions=d), num_tr_samples=150, size_tr_samples=100) + elif methodname=='PACC': + q = PACC(LogisticRegression()) + elif methodname=='SLD': + q = EMQ(LogisticRegression()) + elif methodname=='KDEyML': + q = KDEyML(LogisticRegression(), bandwidth=0.05) + else: + raise ValueError('unknown method' + methodname) + q.fit(train) + aes = qp.evaluation.evaluate(q, UPP(test, sample_size=100), error_metric='ae', verbose=False) + pickle.dump(aes, open(result_path, 'wb'), pickle.HIGHEST_PROTOCOL) + + mae = np.mean(aes) + print(f'{dataset_name}\t{np.mean(mae):.4f}') + + errors.append(mae) + table.add(dataset_name, methodname, aes) + +print(f'\nmean={np.mean(errors):.5f}') +table.format.show_std=False +table.format.mean_prec=4 +table.LatexPDF('./table_gp/gp.pdf', tables=[table], resizebox=False) diff --git a/ClassifierAccuracy/models_multiclass.py b/ClassifierAccuracy/models_multiclass.py index 09e4da5..7b5aa4f 100644 --- a/ClassifierAccuracy/models_multiclass.py +++ b/ClassifierAccuracy/models_multiclass.py @@ -549,6 +549,16 @@ class EmptySafeQuantifier(BaseQuantifier): def num_non_empty_classes(self): return len(self.old_class_idx) + def get_params(self, deep=True): + return self.surrogate.get_params(deep=deep) + + def set_params(self, **params): + return self.surrogate.set_params(**params) + +class EmptySafeAggregativeQuantifier(AggregativeQuantifier, EmptySafeQuantifier): + + + # Baselines: class ATC(ClassifierAccuracyPrediction): diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 5b9806f..66f318f 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -20,8 +20,11 @@ TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders', TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders', 'semeval', 'semeval16', 'sst', 'wa', 'wb'] -UCI_BINARY_DATASETS = ['acute.a', 'acute.b', - 'balance.1', 'balance.2', 'balance.3', +UCI_BINARY_DATASETS = [ + #'acute.a', 'acute.b', + 'balance.1', + #'balance.2', + 'balance.3', 'breast-cancer', 'cmc.1', 'cmc.2', 'cmc.3', 'ctg.1', 'ctg.2', 'ctg.3',