# QuaPy/Transduction_office/prueba.py

import itertools
from abc import ABC, abstractmethod
from functools import cache
from time import time

import numpy as np
from densratio import densratio
from scipy.sparse import issparse, vstack
from scipy.stats import multivariate_normal
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict

import quapy as qp
from Transduction_office.pykliep import DensityRatioEstimator
from quapy.protocol import AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol
from quapy.data import LabelledCollection
from quapy.method.aggregative import *
import quapy.functional as F


def gaussian(mean, cov=1., label=0, size=100, random_state=0):
    """
    Creates a labelled collection in which the instances are distributed according to a Gaussian with specified
    parameters and labels all data points with a specific label.

    :param mean: array-like of shape (n_dimensions) with the center
    :param cov: ndarray of shape (n_dimensions, n_dimensions) with the covariance matrix, or a number `c`
        (int or float) meaning `c * np.eye(n_dimensions)`
    :param label: the class label assigned to every generated instance
    :param size: number of instances (int)
    :param random_state: seed passed to the sampler; allows for replicating experiments
    :return: an instance of LabelledCollection with `size` instances of `n_dimensions` features each
    """
    mean = np.asarray(mean)
    assert mean.ndim==1, 'wrong shape for mean'
    n_features = mean.shape[0]
    if isinstance(cov, (int, float)):
        cov = np.eye(n_features) * cov
    instances = multivariate_normal.rvs(mean, cov, size, random_state=random_state)
    # scipy squeezes singleton axes out of the sample, so size=1 or a 1-dimensional mean would yield a
    # 1-D array that no longer lines up with the `size` labels; restore the (size, n_features) layout
    instances = np.reshape(instances, (size, n_features))
    return LabelledCollection(instances, labels=[label]*size)


# ------------------------------------------------------------------------------------
# Protocol for generating prior probability shift + covariate shift by mixing "domains"
# ------------------------------------------------------------------------------------
class CovPriorShift(AbstractStochasticSeededProtocol):
    """
    Sampling protocol that builds every sample by mixing instances taken from several "domains".
    The domains are the pieces obtained by calling `separate()` on each of the collections received;
    the share of each domain in a sample is drawn uniformly at random from the unit simplex.

    :param domains: list of LabelledCollection; each one is split via its `separate()` method
    :param sample_size: requested sample size, resolved through `qp._get_sample_size`
    :param repeats: number of samples to generate
    :param min_support: a draw is accepted only if every domain contributes strictly more than this
        number of instances
    :param random_state: seed forwarded to the parent protocol
    :param return_type: forwarded to `OnLabelledCollectionProtocol.get_collator`
    """

    def __init__(self, domains: list[LabelledCollection], sample_size=None, repeats=100, min_support=0, random_state=0,
                 return_type='sample_prev'):
        super().__init__(random_state)
        separated = (lc.separate() for lc in domains)
        self.domains = list(itertools.chain.from_iterable(separated))
        self.sample_size = qp._get_sample_size(sample_size)
        self.repeats = repeats
        self.min_support = min_support
        self.collator = OnLabelledCollectionProtocol.get_collator(return_type)

    def samples_parameters(self):
        """
        Return all the parameters needed to replicate the samples: for each of the `repeats` samples,
        one index array per domain.

        :return: a list (one entry per sample) of lists (one entry per domain) of sampling indexes
        :raises ValueError: if more than 100 consecutive draws are rejected by the `min_support` filter
        """
        n_domains = len(self.domains)
        accepted = []
        rejected_in_a_row = 0
        while len(accepted) < self.repeats:
            mixture = F.uniform_simplex_sampling(n_classes=n_domains)
            # the proportions are truncated (not rounded) to integer per-domain sizes
            per_domain = (mixture * self.sample_size).astype(int)
            if all(per_domain > self.min_support):
                accepted.append(
                    [domain.sampling_index(n) for domain, n in zip(self.domains, per_domain)]
                )
                rejected_in_a_row = 0
            else:
                rejected_in_a_row += 1
            if rejected_in_a_row > 100:
                raise ValueError('the support is too strict, and it is difficult '
                                 'or impossible to generate valid samples')
        return accepted

    def sample(self, params):
        """
        Materialize one sample from the per-domain indexes produced by :meth:`samples_parameters`.

        :param params: a list with one index array per domain
        :return: the LabelledCollection obtained by joining the per-domain sub-samples
        """
        parts = []
        for index, domain in zip(params, self.domains):
            parts.append(domain.sampling_from_index(index))
        return LabelledCollection.join(*parts)

    def total(self):
        """
        Number of samples this protocol generates.

        :return: int
        """
        return self.repeats


# ---------------------------------------------------------------------------------------
# Importance-weighting methods, e.g., based on density-ratio estimation (KLIEP, uLSIF, LogReg)
# ---------------------------------------------------------------------------------------
class ImportanceWeight(ABC):
    """
    Interface of the importance-weighting methods: given a training set and an unlabelled test set,
    produce one weight per training instance.

    Deriving from `ABC` is what makes `@abstractmethod` effective: without it the decorator is ignored
    and a subclass that forgets to implement `weights` would silently return None.
    """

    @abstractmethod
    def weights(self, Xtr, ytr, Xte):
        """
        Compute the weights of the training instances.

        :param Xtr: training covariates
        :param ytr: training labels
        :param Xte: test covariates
        :return: one weight per training instance
        """
        pass


class KLIEP(ImportanceWeight):
    """
    Importance weights obtained from the `DensityRatioEstimator` of `Transduction_office.pykliep`.
    """

    def __init__(self):
        """This weighter has no hyperparameters."""

    def weights(self, Xtr, ytr, Xte):
        """
        Fit a fresh estimator on (training, test) covariates and evaluate it on the training covariates.

        :param Xtr: training covariates
        :param ytr: training labels (not used)
        :param Xte: test covariates
        :return: the estimator's predictions for `Xtr`
        """
        estimator = DensityRatioEstimator()
        estimator.fit(Xtr, Xte)
        train_weights = estimator.predict(Xtr)
        return train_weights


class USILF(ImportanceWeight):
    """
    Importance weights estimated with the `densratio` package (uLSIF / RuLSIF).

    :param alpha: mixture parameter forwarded to `densratio` (0 gives the plain density ratio)
    """

    def __init__(self, alpha=0.):
        self.alpha = alpha

    def weights(self, Xtr, ytr, Xte):
        """
        Estimate the test/train density ratio and evaluate it on the training instances.

        :param Xtr: training covariates
        :param ytr: training labels (not used)
        :param Xte: test covariates
        :return: the estimated ratio evaluated at every instance of `Xtr`
        """
        # densratio(x, y) estimates p_x / p_y (first argument is the numerator); importance weights for
        # covariate shift are p_test / p_train, hence the test sample must be passed first
        dense_ratio_obj = densratio(Xte, Xtr, alpha=self.alpha, verbose=False)
        return dense_ratio_obj.compute_density_ratio(Xtr)


class LogReg(ImportanceWeight):
    """
    Importance weights obtained from a probabilistic train-vs-test discriminator (logistic regression).
    See "Direct Density Ratio Estimation for Large-scale Covariate Shift Adaptation", Eq.28.
    """

    def __init__(self):
        pass

    def weights(self, Xtr, ytr, Xte):
        """
        Train a classifier to tell training (label 0) from test (label 1) instances and turn its
        posteriors on the training instances into weights
        `w(x) = (n_train / n_test) * P(test|x) / P(train|x)`.

        :param Xtr: training covariates (dense array-like or scipy sparse matrix)
        :param ytr: training labels (not used)
        :param Xte: test covariates (same kind as `Xtr`)
        :return: ndarray with one weight per training instance
        """
        if issparse(Xtr):
            X = vstack([Xtr, Xte])
        else:
            X = np.concatenate([Xtr, Xte])

        # len() raises TypeError on scipy sparse matrices, so the number of rows is read from the shape
        n_train = Xtr.shape[0] if issparse(Xtr) else len(Xtr)
        n_test = Xte.shape[0] if issparse(Xte) else len(Xte)

        y = [0]*n_train + [1]*n_test

        logreg = GridSearchCV(
            LogisticRegression(),
            param_grid={'C':np.logspace(-3,3,7), 'class_weight': ['balanced', None]},
            n_jobs=-1
        )
        logreg.fit(X, y)

        # a single call yields both posteriors: column 0 is "train", column 1 is "test"
        posteriors = logreg.predict_proba(Xtr)
        prob_train = posteriors[:, 0]
        prob_test = posteriors[:, 1]

        w = (n_train/n_test)*(prob_test/prob_train)
        return w


class MostTest(ImportanceWeight):
    """
    Scores every training instance with the (cross-validated) posterior probability of belonging to the
    test set, as estimated by a train-vs-test logistic-regression discriminator.
    """

    def __init__(self):
        pass

    def weights(self, Xtr, ytr, Xte):
        """
        :param Xtr: training covariates (dense array-like or scipy sparse matrix)
        :param ytr: training labels (not used)
        :param Xte: test covariates (same kind as `Xtr`)
        :return: ndarray with, for every training instance, the out-of-fold probability of the "test" class
        """
        if issparse(Xtr):
            X = vstack([Xtr, Xte])
        else:
            X = np.concatenate([Xtr, Xte])

        # len() raises TypeError on scipy sparse matrices, so the number of rows is read from the shape
        n_train = Xtr.shape[0] if issparse(Xtr) else len(Xtr)
        n_test = Xte.shape[0] if issparse(Xte) else len(Xte)

        # label 0 marks training instances, label 1 marks test instances
        y = [0]*n_train + [1]*n_test

        logreg = GridSearchCV(
            LogisticRegression(),
            param_grid={'C':np.logspace(-3,3,7), 'class_weight': ['balanced', None]},
            n_jobs=-1
        )
        # out-of-fold predictions, so that no training instance is scored by a model that was fit on it;
        # the training instances occupy the first n_train rows of X
        prob_test = cross_val_predict(logreg, X, y, n_jobs=-1, method="predict_proba")[:n_train,1]
        return prob_test


class Random(ImportanceWeight):
    """
    Baseline weighter that ignores the data and returns random weights.
    """

    def __init__(self):
        pass

    def weights(self, Xtr, ytr, Xte):
        """
        :param Xtr: training covariates (dense array-like or scipy sparse matrix); only its number of rows is used
        :param ytr: training labels (not used)
        :param Xte: test covariates (not used)
        :return: ndarray with one value drawn uniformly from [0, 1) per training instance
        """
        # len() raises TypeError on scipy sparse matrices, which the other weighters accept
        n_train = Xtr.shape[0] if issparse(Xtr) else len(Xtr)
        return np.random.rand(n_train)

# --------------------------------------------------------------------------------------------
# Quantification Methods that rely on Importance Weight for reweighting the training instances
# --------------------------------------------------------------------------------------------
class TransductiveQuantifier(BaseQuantifier):
    """
    Base class of the quantifiers in this file that defer all learning to `quantify`: fitting only
    stores the training collection so that it is available once the test instances are known.
    """

    def fit(self, data: LabelledCollection):
        """
        Keep a reference to the training collection; nothing is trained here.

        :param data: the training LabelledCollection
        :return: self
        """
        self.training_ = data
        return self

    @property
    def training(self):
        """The collection stored by :meth:`fit` (accessing it before fitting raises AttributeError)."""
        return self.training_


class ReweightingAggregative(TransductiveQuantifier):
    """
    Transductive quantifier that retrains its classifier on importance-weighted training instances
    every time it is asked to quantify.

    :param classifier: classifier whose `fit` accepts a `sample_weight` keyword
    :param weighter: the ImportanceWeight used to weight the training instances
    :param quantif_method: aggregative quantifier class wrapped around the reweighted classifier
    """

    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=CC):
        self.classifier = classifier
        self.weighter = weighter
        self.quantif_method = quantif_method

    def quantify(self, instances):
        """
        Weight the training set with respect to `instances`, refit the classifier with those weights,
        and quantify `instances` with an aggregative method built on that already-trained classifier.

        :param instances: the test covariates
        :return: whatever the wrapped quantifier's `quantify` returns for `instances`
        """
        Xtr, ytr = self.training.Xy
        importance = self.weighter.weights(Xtr, ytr, instances)
        self.classifier.fit(Xtr, ytr, sample_weight=importance)
        # the classifier was just trained with the weights, so the aggregative method must not refit it
        aggregator = self.quantif_method(self.classifier)
        fitted = aggregator.fit(self.training, fit_classifier=False)
        return fitted.quantify(instances)


# --------------------------------------------------------------------------------------------
# Quantification Methods that rely on Importance Weight for selecting a validation partition
# --------------------------------------------------------------------------------------------

def select_from_weights(w, data: "LabelledCollection", val_prop=0.4):
    """
    Splits `data` in two according to the weights: the `int(len(w)*val_prop)` instances with the highest
    weight form the validation part, and the remaining ones form the training part.

    :param w: one weight per instance of `data`
    :param data: the LabelledCollection to split (only its `sampling_from_index` method is used)
    :param val_prop: proportion of instances to send to the validation part
    :return: a tuple (training part, validation part)
    """
    order = np.argsort(w)
    split_point = int(len(w)*val_prop)
    # cut from the front: slicing with a negative offset breaks when split_point is 0, because
    # order[:-0] is empty and order[-0:] is everything, which would swap the two parts
    cut = len(order) - split_point
    train_idx, val_idx = order[:cut], order[cut:]
    return data.sampling_from_index(train_idx), data.sampling_from_index(val_idx)


class SelectorQuantifiers(TransductiveQuantifier):
    """
    Transductive quantifier that uses the importance weights to choose which training instances
    act as the validation split of an aggregative quantifier.

    :param classifier: classifier handed to the aggregative quantifier
    :param weighter: the ImportanceWeight used to score the training instances
    :param quantif_method: aggregative quantifier class whose `fit` accepts a `val_split` collection
    :param val_split: proportion of training instances used as validation
    """

    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=ACC, val_split=0.4):
        self.classifier = classifier
        self.weighter = weighter
        self.quantif_method = quantif_method
        self.val_split = val_split

    def quantify(self, instances):
        """
        Score the training instances against `instances`, hold out the highest-scored ones as
        validation, fit the quantifier on the rest, and quantify `instances`.

        :param instances: the test covariates
        :return: whatever the wrapped quantifier's `quantify` returns for `instances`
        """
        Xtr, ytr = self.training.Xy
        scores = self.weighter.weights(Xtr, ytr, instances)
        fit_part, held_out = select_from_weights(scores, self.training, self.val_split)
        model = self.quantif_method(self.classifier)
        fitted = model.fit(fit_part, val_split=held_out)
        return fitted.quantify(instances)


if __name__ == '__main__':
    # Toy experiment: two domains (A and B), each a pair of unit-covariance Gaussians (one per class).
    # Quantifiers are trained on the union of both training halves and evaluated on samples that mix
    # the held-out halves of the domains in random proportions.
    qp.environ['SAMPLE_SIZE'] = 500

    dA_l0 = gaussian(mean=[0,0], label=0, size=1000)
    dA_l1 = gaussian(mean=[1,0], label=1, size=1000)
    dB_l0 = gaussian(mean=[0,1], label=0, size=1000)
    dB_l1 = gaussian(mean=[1,1], label=1, size=1000)

    dA = LabelledCollection.join(dA_l0, dA_l1)
    dB = LabelledCollection.join(dB_l0, dB_l1)

    dA_train, dA_test = dA.split_stratified(0.5, random_state=0)
    dB_train, dB_test = dB.split_stratified(0.5, random_state=0)

    train = LabelledCollection.join(dA_train, dB_train)

    def lr():
        # a fresh classifier per method, so that no fitted state is shared between them
        return LogisticRegression()

    # alternative: model-selected classifier
    # def lr():
    #     return GridSearchCV(
    #         LogisticRegression(),
    #         param_grid={'C':np.logspace(-3,3,7), 'class_weight': ['balanced', None]},
    #         n_jobs=-1
    #     )

    methods = [
        ('CC', CC(lr())),
        ('PCC', PCC(lr())),
        ('ACC', ACC(lr())),
        ('PACC', PACC(lr())),
        # this entry used to instantiate EMQ, so the row reported as "HDy" duplicated the EMQ row
        ('HDy', HDy(lr())),
        ('EMQ', EMQ(lr())),
        ('Sel-ACC', SelectorQuantifiers(lr(), MostTest(), ACC)),
        ('Sel-PACC', SelectorQuantifiers(lr(), MostTest(), PACC)),
        ('Sel-HDy', SelectorQuantifiers(lr(), MostTest(), HDy)),
        ('LogReg-CC', ReweightingAggregative(lr(), LogReg(), CC)),
        ('LogReg-PCC', ReweightingAggregative(lr(), LogReg(), PCC)),
        ('LogReg-EMQ', ReweightingAggregative(lr(), LogReg(), EMQ)),
        # disabled variants based on the other weighters
        # ('KLIEP-CC', ReweightingAggregative(lr(), KLIEP(), CC)),
        # ('KLIEP-PCC', ReweightingAggregative(lr(), KLIEP(), PCC)),
        # ('KLIEP-EMQ', ReweightingAggregative(lr(), KLIEP(), EMQ)),
        # ('SILF-CC', ReweightingAggregative(lr(), USILF(), CC)),
        # ('SILF-PCC', ReweightingAggregative(lr(), USILF(), PCC)),
        # ('SILF-EMQ', ReweightingAggregative(lr(), USILF(), EMQ))
    ]

    for name, model in methods:
        # same seed for every method, so that all of them are fitted and evaluated under equal conditions
        with qp.util.temp_seed(1):
            model.fit(train)

            prot = CovPriorShift([dA_test, dB_test], repeats=10)
            mae = qp.evaluation.evaluate(model, protocol=prot, error_metric='mae')
            print(f'{name}: {mae = :.4f}')
            # mrae = qp.evaluation.evaluate(model, protocol=prot, error_metric='mrae')
            # print(f'{name}: {mrae = :.4f}')