From e6e8ed87fd99d88429e8338b11050cfae9f1d0b5 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 28 Feb 2023 10:05:57 +0100
Subject: [PATCH] more stuff that does not work

---
 Transduction_office/prueba.py   | 195 ++++++++++++++++++++++++++------
 quapy/method/non_aggregative.py |   2 +
 2 files changed, 160 insertions(+), 37 deletions(-)

diff --git a/Transduction_office/prueba.py b/Transduction_office/prueba.py
index dfb2ce5..e24d713 100644
--- a/Transduction_office/prueba.py
+++ b/Transduction_office/prueba.py
@@ -9,15 +9,22 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 import quapy as qp
+from Transduction_office.grid_naive_quantif import GridQuantifier, binned_indexer, Indexer, GridQuantifier2, \
+    classifier_indexer
 from Transduction_office.pykliep import DensityRatioEstimator
-from quapy.protocol import AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol
+from quapy.method.non_aggregative import MLPE
+from quapy.protocol import AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol, UPP
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import *
 import quapy.functional as F
 from time import time
+from scipy.spatial.distance import cdist
 
 
-def gaussian(mean, cov=1., label=0, size=100, random_state=0):
+plotting = False
+
+
+def gaussian(mean, cov=0.1, label=0, size=100, random_state=0):
     """
     Creates a label collection in which the instances are distributed according to a Gaussian with specified
     parameters and labels all data points with a specific label.
@@ -38,6 +45,36 @@ def gaussian(mean, cov=1., label=0, size=100, random_state=0):
     return LabelledCollection(instances, labels=[label]*size)
 
 
+def _internal_plot(train, val, test):
+    if plotting:
+        xmin = min(train.X[:, 0].min(), val.X[:, 0].min(), test[:, 0].min())
+        xmax = max(train.X[:, 0].max(), val.X[:, 0].max(), test[:, 0].max())
+        ymin = min(train.X[:, 1].min(), val.X[:, 1].min(), test[:, 1].min())
+        ymax = max(train.X[:, 1].max(), val.X[:, 1].max(), test[:, 1].max())
+        plot(train, 'sel_train.png', xlim=(xmin, xmax), ylim=(ymin, ymax))
+        plot(val, 'sel_val.png', xlim=(xmin, xmax), ylim=(ymin, ymax))
+        plot(test, 'test.png', xlim=(xmin, xmax), ylim=(ymin, ymax))
+
+def plot(data: LabelledCollection, path, xlim=None, ylim=None):
+    import matplotlib.pyplot as plt
+    plt.clf()
+    if isinstance(data, LabelledCollection):
+        if data.instances.shape[1] != 2:
+            return
+
+        negative, positive = data.separate()
+        plt.scatter(negative.X[:, 0], negative.X[:, 1], label='neg', alpha=0.5)
+        plt.scatter(positive.X[:, 0], positive.X[:, 1], label='pos', alpha=0.5)
+    else:
+        if data.shape[1] != 2:
+            return
+        plt.scatter(data[:, 0], data[:, 1], label='test', alpha=0.5)
+    if xlim is not None:
+        plt.xlim(*xlim)
+        plt.ylim(*ylim)
+    plt.legend()
+    plt.savefig(path)
+
 # ------------------------------------------------------------------------------------
 # Protocol for generating prior probability shift + covariate shift by mixing "domains"
 # ------------------------------------------------------------------------------------
@@ -62,7 +99,6 @@ class CovPriorShift(AbstractStochasticSeededProtocol):
         tentatives = 0
         while len(indexes) < self.repeats:
             alpha = F.uniform_simplex_sampling(n_classes=len(self.domains))
-            # sizes = np.asarray([round(len(lc_i) * alpha_i) for lc_i, alpha_i in zip(self.domains, alpha)])
             sizes = (alpha * self.sample_size).astype(int)
             if all(sizes > self.min_support):
                 indexes_i = [lc.sampling_index(size) for lc, size in zip(self.domains, sizes)]
@@ -185,6 +221,37 @@ class Random(ImportanceWeight):
     def weights(self, Xtr, ytr, Xte):
         return np.random.rand(len(Xtr))
 
+
+class MostSimilarK(ImportanceWeight):
+    # retains the training documents that are most similar on average to the k closest test points
+
+    def __init__(self, k):
+        self.k = k
+
+    def weights(self, Xtr, ytr, Xte):
+        distances = cdist(Xtr, Xte)
+        min_dist = np.min(distances)
+        max_dist = np.max(distances)
+        distances = (distances-min_dist)/(max_dist-min_dist)
+        similarities = 1 / (1+distances)
+        top_k_sim = np.sort(similarities, axis=1)[:, -self.k:]
+        ave_sim = np.mean(top_k_sim, axis=1)
+        return ave_sim
+
+class MostSimilarTest(ImportanceWeight):
+    # retains the training documents that are the most similar to at least one test document,
+    # i.e., for each test point, selects the k most similar training instances
+
+    def __init__(self, k=1):
+        self.k = k
+
+    def weights(self, Xtr, ytr, Xte):
+        distances = cdist(Xtr, Xte)
+        most_similar_idx = np.argsort(distances, axis=0)[:self.k, :].flatten()
+        weights = np.zeros(shape=Xtr.shape[0])
+        weights[most_similar_idx] = 1
+        return weights
+
 # --------------------------------------------------------------------------------------------
 # Quantification Methods that rely on Importance Weight for reweighting the training instances
 # --------------------------------------------------------------------------------------------
@@ -218,37 +285,71 @@ class ReweightingAggregative(TransductiveQuantifier):
 # Quantification Methods that rely on Importance Weight for selecting a validation partition
 # --------------------------------------------------------------------------------------------
 
-def select_from_weights(w, data: LabelledCollection, val_prop=0.4):
-    # w[w<1]=0
-    order = np.argsort(w)
-    split_point = int(len(w)*val_prop)
-    train_idx, val_idx = order[:-split_point], order[-split_point:]
-    return data.sampling_from_index(train_idx), data.sampling_from_index(val_idx)
 
 
-class SelectorQuantifiers(TransductiveQuantifier):
-    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=ACC, val_split=0.4):
+class SelectorQuantifiersTrainVal(TransductiveQuantifier):
+
+    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=ACC, val_split=0.4, only_positives=False):
         self.classifier = classifier
         self.weighter = weighter
         self.quantif_method = quantif_method
         self.val_split = val_split
+        self.only_positives = only_positives
 
     def quantify(self, instances):
        w = self.weighter.weights(*self.training.Xy, instances)
-        train, val = select_from_weights(w, self.training, self.val_split)
+        train, val = self.select_from_weights(w, self.training, self.val_split, self.only_positives)
+        _internal_plot(train, val, instances)
+        # print('\ttraining size', len(train), '\tval size', len(val))
         quantifier = self.quantif_method(self.classifier).fit(train, val_split=val)
         return quantifier.quantify(instances)
 
+    def select_from_weights(self, w, data: LabelledCollection, val_prop=0.4, only_positives=False):
+        order = np.argsort(w)
+        if only_positives:
+            val_prop = np.mean(w > 0)
+        split_point = int(len(w) * val_prop)
+        different_idx, similar_idx = order[:-split_point], order[-split_point:]
+        different, similar = data.sampling_from_index(different_idx), data.sampling_from_index(similar_idx)
+        # return different, similar
+        train, val = similar.split_stratified(0.6)
+        return train, val
+
+
+class SelectorQuantifiersTrain(TransductiveQuantifier):
+
+    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=ACC, only_positives=False):
+        self.classifier = classifier
+        self.weighter = weighter
+        self.quantif_method = quantif_method
+        self.only_positives = only_positives
+
+    def quantify(self, instances):
+        w = self.weighter.weights(*self.training.Xy, instances)
+        train = self.select_from_weights(w, self.training, select_prop=None, only_positives=self.only_positives)
+        # _internal_plot(train, None, instances)
+        # print('\ttraining size', len(train))
+        quantifier = self.quantif_method(self.classifier).fit(train)
+        return quantifier.quantify(instances)
+
+    def select_from_weights(self, w, data: LabelledCollection, select_prop=0.5, only_positives=False):
+        order = np.argsort(w)
+        if only_positives:
+            select_prop = np.mean(w > 0)
+        split_point = int(len(w) * select_prop)
+        different_idx, similar_idx = order[:-split_point], order[-split_point:]
+        different, similar = data.sampling_from_index(different_idx), data.sampling_from_index(similar_idx)
+        return similar
 
 
 if __name__ == '__main__':
 
     qp.environ['SAMPLE_SIZE'] = 500
 
-    dA_l0 = gaussian(mean=[0,0], label=0, size=1000)
-    dA_l1 = gaussian(mean=[1,0], label=1, size=1000)
-    dB_l0 = gaussian(mean=[0,1], label=0, size=1000)
-    dB_l1 = gaussian(mean=[1,1], label=1, size=1000)
+    dA_l0 = gaussian(mean=[0,0], label=0, size=5000)
+    dA_l1 = gaussian(mean=[1,0], label=1, size=5000)
+    dB_l0 = gaussian(mean=[0,1], label=0, size=5000)
+    dB_l1 = gaussian(mean=[1,1], label=1, size=5000)
 
     dA = LabelledCollection.join(dA_l0, dA_l1)
     dB = LabelledCollection.join(dB_l0, dB_l1)
@@ -258,42 +359,62 @@ if __name__ == '__main__':
 
     train = LabelledCollection.join(dA_train, dB_train)
 
+    plot(train, 'train.png')
+
     def lr():
         return LogisticRegression()
 
-    # def lr():
-    #     return GridSearchCV(
-    #         LogisticRegression(),
-    #         param_grid={'C':np.logspace(-3,3,7), 'class_weight': ['balanced', None]},
-    #         n_jobs=-1
-    #     )
+
+    # EMQ.MAX_ITER*=10
+    # val_split = 0.5
+    k_sim = 10
+    Q = ACC
     methods = [
+        ('MLPE', MLPE()),
         ('CC', CC(lr())),
         ('PCC', PCC(lr())),
         ('ACC', ACC(lr())),
         ('PACC', PACC(lr())),
-        ('HDy', EMQ(lr())),
+        ('HDy', HDy(lr())),
         ('EMQ', EMQ(lr())),
-        ('Sel-ACC', SelectorQuantifiers(lr(), MostTest(), ACC)),
-        ('Sel-PACC', SelectorQuantifiers(lr(), MostTest(), PACC)),
-        ('Sel-HDy', SelectorQuantifiers(lr(), MostTest(), HDy)),
-        ('LogReg-CC', ReweightingAggregative(lr(), LogReg(), CC)),
-        ('LogReg-PCC', ReweightingAggregative(lr(), LogReg(), PCC)),
-        ('LogReg-EMQ', ReweightingAggregative(lr(), LogReg(), EMQ)),
-        # ('KLIEP-CC', TransductiveAggregative(lr(), KLIEP(), CC)),
-        # ('KLIEP-PCC', TransductiveAggregative(lr(), KLIEP(), PCC)),
-        # ('KLIEP-EMQ', TransductiveAggregative(lr(), KLIEP(), EMQ)),
-        # ('SILF-CC', TransductiveAggregative(lr(), USILF(), CC)),
-        # ('SILF-PCC', TransductiveAggregative(lr(), USILF(), PCC)),
-        # ('SILF-EMQ', TransductiveAggregative(lr(), USILF(), EMQ))
+        ('GridQ', GridQuantifier2(classifier=lr())),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=2)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=4)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=6)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=8)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=10)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=20)), cell_quantifier=Q(lr()))),
+        # ('kSim-ACC', SelectorQuantifiers(lr(), MostSimilar(k_sim), ACC, val_split=val_split)),
+        # ('kSim-PACC', SelectorQuantifiers(lr(), MostSimilar(k_sim), PACC, val_split=val_split)),
+        # ('kSim-HDy', SelectorQuantifiers(lr(), MostSimilar(k_sim), HDy, val_split=val_split)),
+        # ('Sel-CC', SelectorQuantifiersTrain(lr(), MostSimilarTest(k=k_sim), CC, only_positives=True)),
+        # ('Sel-PCC', SelectorQuantifiersTrain(lr(), MostSimilarTest(k=k_sim), PCC, only_positives=True)),
+        # ('Sel-ACC', SelectorQuantifiersTrainVal(lr(), MostSimilarTest(k=k_sim), ACC, only_positives=True)),
+        # ('Sel-PACC', SelectorQuantifiersTrainVal(lr(), MostSimilarTest(k=k_sim), PACC, only_positives=True)),
+        # ('Sel-HDy', SelectorQuantifiersTrainVal(lr(), MostSimilarTest(k=k_sim), HDy, only_positives=True)),
+        # ('Sel-EMQ', SelectorQuantifiersTrain(lr(), MostSimilarTest(k=k_sim), EMQ, only_positives=True)),
+        # ('Sel-EMQ', SelectorQuantifiersTrainVal(lr(), USILF(), PACC, only_positives=False)),
+        # ('Sel-PACC', SelectorQuantifiers(lr(), MostTest(), PACC)),
+        # ('Sel-HDy', SelectorQuantifiers(lr(), MostTest(), HDy)),
+        # ('LogReg-CC', ReweightingAggregative(lr(), LogReg(), CC)),
+        # ('LogReg-PCC', ReweightingAggregative(lr(), LogReg(), PCC)),
+        # ('LogReg-EMQ', ReweightingAggregative(lr(), LogReg(), EMQ)),
+        # ('KLIEP-CC', ReweightingAggregative(lr(), KLIEP(), CC)),
+        # ('KLIEP-PCC', ReweightingAggregative(lr(), KLIEP(), PCC)),
+        # ('KLIEP-EMQ', ReweightingAggregative(lr(), KLIEP(), EMQ)),
+        # ('SILF-CC', ReweightingAggregative(lr(), USILF(), CC)),
+        # ('SILF-PCC', ReweightingAggregative(lr(), USILF(), PCC)),
+        # ('SILF-EMQ', ReweightingAggregative(lr(), USILF(), EMQ))
     ]
 
     for name, model in methods:
-        with qp.util.temp_seed(1):
+        with qp.util.temp_seed(5):
+            # print('original training size', len(train))
             model.fit(train)
 
-            prot = CovPriorShift([dA_test, dB_test], repeats=10)
+            prot = CovPriorShift([dA_test, dB_test], repeats=1 if plotting else 150)
+            # prot = UPP(dA_test+dB_test, repeats=1 if plotting else 150)
             mae = qp.evaluation.evaluate(model, protocol=prot, error_metric='mae')
             print(f'{name}: {mae = :.4f}')
             # mrae = qp.evaluation.evaluate(model, protocol=prot, error_metric='mrae')

diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py
index 0a8680d..af76d81 100644
--- a/quapy/method/non_aggregative.py
+++ b/quapy/method/non_aggregative.py
@@ -33,3 +33,5 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
         """
         return self.estimated_prevalence
 
+
+MLPE = MaximumLikelihoodPrevalenceEstimation
\ No newline at end of file
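
--
Note (illustrative sketch, not part of the patch): the snippet below reimplements, with numpy/scipy only, the two weighting schemes added above (MostSimilarK and MostSimilarTest) together with the only_positives selection rule used by SelectorQuantifiersTrain.select_from_weights. The helper function names and the toy Gaussian clouds are assumptions made for this note; training and evaluating a quantifier with QuaPy is omitted.

import numpy as np
from scipy.spatial.distance import cdist


def most_similar_k_weights(Xtr, Xte, k):
    # MostSimilarK: weight of each training point = its average similarity
    # to the k test points it is closest to
    distances = cdist(Xtr, Xte)                                   # (n_train, n_test)
    distances = (distances - distances.min()) / (distances.max() - distances.min())
    similarities = 1 / (1 + distances)                            # decreasing in distance
    top_k_sim = np.sort(similarities, axis=1)[:, -k:]             # k largest per training point
    return top_k_sim.mean(axis=1)


def most_similar_test_weights(Xtr, Xte, k=1):
    # MostSimilarTest: binary weights marking, for each test point,
    # its k nearest training points
    distances = cdist(Xtr, Xte)
    nearest_idx = np.argsort(distances, axis=0)[:k, :].flatten()  # k train rows per test column
    weights = np.zeros(Xtr.shape[0])
    weights[nearest_idx] = 1
    return weights


rng = np.random.default_rng(0)
Xtr = rng.normal(loc=[0.0, 0.5], scale=0.3, size=(1000, 2))       # toy training cloud
Xte = rng.normal(loc=[1.0, 1.0], scale=0.3, size=(500, 2))        # covariate-shifted test cloud

w = most_similar_test_weights(Xtr, Xte, k=10)
# only_positives rule: keep the top np.mean(w > 0) fraction of the ranking,
# i.e., exactly those training points with strictly positive weight
order = np.argsort(w)
split_point = int(len(w) * np.mean(w > 0))
similar_idx = order[-split_point:]
print(f'kept {len(similar_idx)} of {len(Xtr)} training points deemed most test-like')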