mandoline baseline added
This commit is contained in:
parent
23b74fad7b
commit
b566049b4f
@@ -0,0 +1,261 @@
from functools import partial
from types import SimpleNamespace
from typing import List, Optional

import numpy as np
import scipy.optimize
import scipy.special
import sklearn.metrics.pairwise as skmetrics


def Phi(
    D: np.ndarray,
    edge_list: Optional[np.ndarray] = None,
):
    """
    Given an n x d matrix of (example, slice) indicators, calculate the
    potential matrix.

    Includes correlations modeled by the edges in the `edge_list`.

    Args:
        D (np.ndarray): n x d matrix of (example, slice) indicators.
        edge_list (np.ndarray): k x 2 matrix of edge correlations to be modeled.
            edge_list[i, :] should be indices for a pair of columns of D.

    Returns:
        Potential matrix. Equals D when edge_list is None; otherwise appends
        (x_i * x_j) "cross-term" columns corresponding to the edges in the
        `edge_list`.

    Examples:
        >>> D = np.random.choice([-1, 1], size=(100, 6))
        >>> edge_list = np.array([(0, 1), (1, 4)])
        >>> Phi(D, edge_list)
    """
    if edge_list is not None:
        # Element-wise product of the two slice columns named by each edge,
        # yielding one cross-term column per edge: shape (n, k).
        pairwise_terms = (
            D[np.arange(len(D)), edge_list[:, 0][:, np.newaxis]].T
            * D[np.arange(len(D)), edge_list[:, 1][:, np.newaxis]].T
        )
        return np.concatenate([D, pairwise_terms], axis=1)
    else:
        return D


def log_partition_ratio(
    x: np.ndarray,
    Phi_D_src: np.ndarray,
    n_src: int,
):
    """
    Calculate the log-partition ratio in the KLIEP problem.

    Equals log(n_src) - logsumexp(Phi_D_src @ x), i.e. the negative log of
    the empirical mean of exp(Phi(D_src) @ x) over the source samples.
    """
    return np.log(n_src) - scipy.special.logsumexp(Phi_D_src.dot(x))


def mandoline(
    D_src: np.ndarray,
    D_tgt: np.ndarray,
    edge_list: Optional[np.ndarray],
    sigma: Optional[float] = None,
):
    """
    Mandoline solver.

    Args:
        D_src: (n_src x d) matrix of (example, slices) for the source distribution.
        D_tgt: (n_tgt x d) matrix of (example, slices) for the target distribution.
        edge_list: list of edge correlations between slices that should be modeled.
        sigma: optional parameter that activates RBF kernel-based KLIEP with scale
            `sigma`.

    Returns: SimpleNamespace that contains
        opt: result of scipy.optimize
        Phi_D_src: source potential matrix used in Mandoline
        Phi_D_tgt: target potential matrix used in Mandoline
        n_src: number of source samples
        n_tgt: number of target samples
        edge_list: the `edge_list` parameter passed as input
    """
    # Copy and binarize the input matrices to -1/1
    D_src, D_tgt = np.copy(D_src), np.copy(D_tgt)
    if np.min(D_src) == 0:
        D_src[D_src == 0] = -1
        D_tgt[D_tgt == 0] = -1

    # Edge list encoding dependencies between the slice functions g
    if edge_list is not None:
        edge_list = np.array(edge_list)

    # Create the potential matrices
    Phi_D_tgt, Phi_D_src = Phi(D_tgt, edge_list), Phi(D_src, edge_list)

    # Number of examples
    n_src, n_tgt = Phi_D_src.shape[0], Phi_D_tgt.shape[0]

    def f(x):
        # Negative KLIEP log-likelihood: reward high density under the target
        # potentials while keeping the reweighted source distribution normalized.
        obj = Phi_D_tgt.dot(x).sum() - n_tgt * scipy.special.logsumexp(Phi_D_src.dot(x))
        return -obj

    # Set the kernel
    kernel = partial(skmetrics.rbf_kernel, gamma=sigma)

    def llkliep_f(x):
        # Kernelized variant: x acts as a single center of dimension d', so it
        # is reshaped to (1, d') as sklearn's pairwise kernels expect.
        obj = kernel(
            Phi_D_tgt, x[np.newaxis, :]
        ).sum() - n_tgt * scipy.special.logsumexp(kernel(Phi_D_src, x[np.newaxis, :]))
        return -obj

    # Solve
    if sigma is None:
        opt = scipy.optimize.minimize(
            f, np.random.randn(Phi_D_tgt.shape[1]), method="BFGS"
        )
    else:
        opt = scipy.optimize.minimize(
            llkliep_f, np.random.randn(Phi_D_tgt.shape[1]), method="BFGS"
        )

    return SimpleNamespace(
        opt=opt,
        Phi_D_src=Phi_D_src,
        Phi_D_tgt=Phi_D_tgt,
        n_src=n_src,
        n_tgt=n_tgt,
        edge_list=edge_list,
    )


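# Illustrative usage sketch (not part of the original baseline); synthetic
# binarized slice matrices, no modeled edges:
#
#   >>> D_src = np.random.choice([0, 1], size=(1000, 4))
#   >>> D_tgt = np.random.choice([0, 1], size=(500, 4))
#   >>> solved = mandoline(D_src, D_tgt, edge_list=None)
#   >>> solved.opt.x.shape
#   (4,)

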
def log_density_ratio(D, solved):
    """
    Calculate the log density ratio for a solved Mandoline run.

    Note that `D` should already be a potential matrix (e.g. `solved.Phi_D_src`),
    since `Phi` is applied here with `edge_list=None` and leaves it unchanged.
    """
    Phi_D = Phi(D, None)
    return Phi_D.dot(solved.opt.x) + log_partition_ratio(
        solved.opt.x, solved.Phi_D_src, solved.n_src
    )


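# Illustrative sketch (not part of the original baseline): turning log density
# ratios on the source potentials into self-normalized importance weights, as
# `estimate_performance` does below:
#
#   >>> log_ratios = log_density_ratio(solved.Phi_D_src, solved)
#   >>> weights = np.exp(log_ratios) / np.exp(log_ratios).sum()

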
def get_k_most_unbalanced_gs(D_src, D_tgt, k):
    """
    Get the top k slices that shift most between the source and target
    distributions.

    Uses the difference in marginals for each slice.
    """
    marginal_diff = np.abs(D_src.mean(axis=0) - D_tgt.mean(axis=0))
    indices = np.argsort(marginal_diff)[-k:]
    differences = marginal_diff[indices]
    return list(indices), list(differences)


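# Illustrative sketch (not part of the original baseline): with D_src and
# D_tgt as above, pick the two slices whose marginals shift the most:
#
#   >>> indices, diffs = get_k_most_unbalanced_gs(D_src, D_tgt, k=2)

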
def weighted_estimator(weights: Optional[np.ndarray], mat: np.ndarray):
    """
    Calculate a weighted empirical mean over a matrix of samples.

    Args:
        weights (Optional[np.ndarray]):
            length n array of weights that sums to 1. Calculates an unweighted
            mean if `weights` is None.
        mat (np.ndarray):
            (n x r) matrix of empirical observations that is being averaged.

    Returns:
        Length r np.ndarray of weighted means.
    """
    if weights is None:
        return np.mean(mat, axis=0)

    # Allow for floating-point error in the normalization check.
    assert np.isclose(np.sum(weights), 1.0), "`weights` must sum to 1."
    return np.sum(weights[:, np.newaxis] * mat, axis=0)


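# Illustrative sketch (not part of the original baseline), with concrete
# numbers:
#
#   >>> mat = np.array([[0.0], [1.0]])
#   >>> weighted_estimator(np.array([0.25, 0.75]), mat)
#   array([0.75])
#   >>> weighted_estimator(None, mat)  # unweighted mean
#   array([0.5])

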
def estimate_performance(
    D_src: np.ndarray,
    D_tgt: np.ndarray,
    edge_list: Optional[np.ndarray],
    empirical_mat_list_src: List[np.ndarray],
):
    """
    Estimate performance on a target distribution using slice information from the
    source and target data.

    This function runs Mandoline to calculate the importance weights used to
    reweight the source data.

    Args:
        D_src (np.ndarray): (n_src x d) matrix of (example, slices) for the source
            distribution.
        D_tgt (np.ndarray): (n_tgt x d) matrix of (example, slices) for the target
            distribution.
        edge_list (np.ndarray): list of edge correlations between slices that
            should be modeled.
        empirical_mat_list_src (List[np.ndarray]): list of (n_src x r) matrices of
            empirical observations (e.g. per-example model errors) to average.

    Returns:
        SimpleNamespace with 3 attributes
            - `all_estimates` is a list of SimpleNamespace objects with
              2 attributes
                - `weighted` is the estimate for the target distribution
                - `source` is the estimate for the source distribution
            - `solved`: result of the scipy.optimize Mandoline solver
            - `weights`: self-normalized importance weights used to weight the
              source data
    """
    # Run the solver
    solved = mandoline(D_src, D_tgt, edge_list)

    # Compute the density ratios on the source dataset
    density_ratios = np.exp(log_density_ratio(solved.Phi_D_src, solved))

    # Self-normalized importance weights
    weights = density_ratios / np.sum(density_ratios)

    all_estimates = []
    for mat_src in empirical_mat_list_src:
        # Each estimate is a 1-D array of estimates for each mat, e.g.
        # each mat can correspond to a model's (n x 1) error matrix
        weighted_estimates = weighted_estimator(weights, mat_src)
        source_estimates = weighted_estimator(
            np.ones(solved.n_src) / solved.n_src, mat_src
        )

        all_estimates.append(
            SimpleNamespace(
                weighted=weighted_estimates,
                source=source_estimates,
            )
        )

    return SimpleNamespace(
        all_estimates=all_estimates,
        solved=solved,
        weights=weights,
    )


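# Illustrative sketch (not part of the original baseline); `model_errors_src`
# is a hypothetical (n_src x 1) matrix of per-example errors on the source
# data:
#
#   >>> out = estimate_performance(
#   ...     D_src, D_tgt, edge_list=None,
#   ...     empirical_mat_list_src=[model_errors_src],
#   ... )
#   >>> out.all_estimates[0].weighted  # estimated target performance
#   >>> out.all_estimates[0].source    # unweighted source baseline

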
###########################################################################


def get_entropy(probas):
    # Entropy of each row; the small constant guards against log(0).
    return -np.sum(np.multiply(probas, np.log(probas + 1e-20)), axis=1)


def get_slices(probas, n_ent_bins=6):
    """
    Build slice indicators from a model's predicted class probabilities:
    one slice per predicted class, plus `n_ent_bins` slices that bin the
    prediction entropy between 0 and the maximum (uniform) entropy.
    """
    ln, ncl = probas.shape

    # Prediction slices: +1 in the predicted class column, -1 elsewhere
    preds = np.argmax(probas, axis=1)
    pred_slices = np.full((ln, ncl), fill_value=-1, dtype="<i8")
    pred_slices[np.arange(ln), preds] = 1

    # Entropy slices: bin each example's entropy into n_ent_bins equal-width
    # bins between 0 and the entropy of the uniform distribution
    ent = get_entropy(probas)
    range_top = get_entropy(np.array([np.ones(ncl) / ncl]))[0]
    ent_bins = np.linspace(0, range_top, n_ent_bins + 1)
    # Clip so zero-entropy (one-hot) rows land in the first bin and floating
    # error above the maximum entropy lands in the last bin
    bins_map = np.clip(
        np.digitize(ent, bins=ent_bins, right=True) - 1, 0, n_ent_bins - 1
    )
    ent_slices = np.full((ln, n_ent_bins), fill_value=-1, dtype="<i8")
    ent_slices[np.arange(ln), bins_map] = 1

    return np.concatenate([pred_slices, ent_slices], axis=1)
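

if __name__ == "__main__":
    # Minimal end-to-end demo (illustrative only; not part of the original
    # baseline). Slices come from the softmax outputs of a hypothetical
    # 3-class model, and "errors" are random indicators.
    rng = np.random.RandomState(0)

    probas_src = scipy.special.softmax(rng.randn(1000, 3), axis=1)
    probas_tgt = scipy.special.softmax(rng.randn(500, 3) + 0.5, axis=1)
    D_src, D_tgt = get_slices(probas_src), get_slices(probas_tgt)

    # Hypothetical per-example error indicators on the source data
    errors_src = rng.binomial(1, 0.2, size=(1000, 1)).astype(float)

    estimates = estimate_performance(
        D_src, D_tgt, edge_list=None, empirical_mat_list_src=[errors_src]
    )
    print("source estimate:", estimates.all_estimates[0].source)
    print("weighted (target) estimate:", estimates.all_estimates[0].weighted)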