Source code for pypesto.ensemble.dimension_reduction

from typing import Callable, Union

import numpy as np

from .ensemble import Ensemble, EnsemblePrediction
from .util import get_prediction_dataset


def get_umap_representation_parameters(
    ens: Ensemble,
    n_components: int = 2,
    normalize_data: bool = False,
    **kwargs,
) -> tuple:
    """
    UMAP of parameter ensemble.

    Compute the representation with reduced dimensionality via umap
    (with a given number of umap components) of the parameter ensemble.
    Additional keyword arguments are passed on to the umap routine.

    Parameters
    ----------
    ens:
        Ensemble object containing a set of parameter vectors
    n_components:
        number of components for the dimension reduction
    normalize_data:
        flag indicating whether the parameter ensemble should be rescaled
        with mean and standard deviation

    Returns
    -------
    umap_components:
        first components of the umap embedding
    umap_object:
        fitted umap object returned from umap.UMAP()
    """
    # call lowlevel routine using the parameter vector ensemble
    return _get_umap_representation_lowlevel(
        dataset=ens.x_vectors.transpose(),
        n_components=n_components,
        normalize_data=normalize_data,
        **kwargs,
    )

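# Usage sketch (illustrative, kept as a comment so the module remains
# importable): assuming an ensemble has been built from optimization results,
# e.g. via Ensemble.from_optimization_endpoints (the `result` variable and the
# extra umap keyword argument below are assumptions, not part of this module),
# the parameter-space UMAP embedding could be obtained like this:
#
#     ens = Ensemble.from_optimization_endpoints(result)
#     umap_components, umap_object = get_umap_representation_parameters(
#         ens, n_components=2, normalize_data=True, n_neighbors=15
#     )
#     # umap_components has one row per ensemble member
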
def get_umap_representation_predictions(
    ens: Union[Ensemble, EnsemblePrediction],
    prediction_index: int = 0,
    n_components: int = 2,
    normalize_data: bool = False,
    **kwargs,
) -> tuple:
    """
    UMAP of ensemble prediction.

    Compute the representation with reduced dimensionality via umap
    (with a given number of umap components) of the ensemble predictions.
    Additional keyword arguments are passed on to the umap routine.

    Parameters
    ----------
    ens:
        Ensemble object containing a set of parameter vectors and a set of
        predictions, or EnsemblePrediction object containing only predictions
    prediction_index:
        index telling which prediction from the list should be analyzed
    n_components:
        number of components for the dimension reduction
    normalize_data:
        flag indicating whether the parameter ensemble should be rescaled
        with mean and standard deviation

    Returns
    -------
    umap_components:
        first components of the umap embedding
    umap_object:
        fitted umap object returned from umap.UMAP()
    """
    # extract an array of predictions from either an Ensemble object or an
    # EnsemblePrediction object
    dataset = get_prediction_dataset(ens, prediction_index)

    # call lowlevel routine using the prediction ensemble
    return _get_umap_representation_lowlevel(
        dataset=dataset,
        n_components=n_components,
        normalize_data=normalize_data,
        **kwargs,
    )

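# Usage sketch (illustrative, kept as a comment so the module remains
# importable): assuming an EnsemblePrediction has already been generated,
# e.g. via ens.predict(predictor) with a suitable pypesto predictor (the
# `predictor` variable is an assumption here), the prediction-space UMAP
# embedding could be computed like this:
#
#     ensemble_prediction = ens.predict(predictor)
#     umap_components, umap_object = get_umap_representation_predictions(
#         ensemble_prediction, prediction_index=0, n_components=2
#     )
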
def get_pca_representation_parameters(
    ens: Ensemble,
    n_components: int = 2,
    rescale_data: bool = True,
    rescaler: Union[Callable, None] = None,
) -> tuple:
    """
    PCA of parameter ensemble.

    Compute the representation with reduced dimensionality via principal
    component analysis (with a given number of principal components) of the
    parameter ensemble.

    Parameters
    ----------
    ens:
        Ensemble object containing a set of parameter vectors
    n_components:
        number of components for the dimension reduction
    rescale_data:
        flag indicating whether the principal components should be rescaled
        using a rescaler function (e.g., an arcsinh function)
    rescaler:
        callable function to rescale the output of the PCA
        (defaults to numpy.arcsinh)

    Returns
    -------
    principal_components:
        principal components of the parameter vector ensemble
    pca_object:
        fitted pca object returned from sklearn.decomposition.PCA()
    """
    return _get_pca_representation_lowlevel(
        dataset=ens.x_vectors.transpose(),
        n_components=n_components,
        rescale_data=rescale_data,
        rescaler=rescaler,
    )

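# Usage sketch (illustrative, kept as a comment so the module remains
# importable): PCA of the parameter vectors with a custom rescaler instead of
# the numpy.arcsinh default; `ens` is assumed to be an existing Ensemble.
#
#     principal_components, pca_object = get_pca_representation_parameters(
#         ens, n_components=3, rescale_data=True, rescaler=np.tanh
#     )
#     # the explained variance of the fitted components is available via
#     # pca_object.explained_variance_ratio_
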
def get_pca_representation_predictions(
    ens: Union[Ensemble, EnsemblePrediction],
    prediction_index: int = 0,
    n_components: int = 2,
    rescale_data: bool = True,
    rescaler: Union[Callable, None] = None,
) -> tuple:
    """
    PCA of ensemble prediction.

    Compute the representation with reduced dimensionality via principal
    component analysis (with a given number of principal components) of the
    ensemble prediction.

    Parameters
    ----------
    ens:
        Ensemble object containing a set of parameter vectors and a set of
        predictions, or EnsemblePrediction object containing only predictions
    prediction_index:
        index telling which prediction from the list should be analyzed
    n_components:
        number of components for the dimension reduction
    rescale_data:
        flag indicating whether the principal components should be rescaled
        using a rescaler function (e.g., an arcsinh function)
    rescaler:
        callable function to rescale the output of the PCA
        (defaults to numpy.arcsinh)

    Returns
    -------
    principal_components:
        principal components of the prediction ensemble
    pca_object:
        fitted pca object returned from sklearn.decomposition.PCA()
    """
    # extract an array of predictions from either an Ensemble object or an
    # EnsemblePrediction object
    dataset = get_prediction_dataset(ens, prediction_index)

    # call lowlevel routine using the prediction ensemble
    return _get_pca_representation_lowlevel(
        dataset=dataset,
        n_components=n_components,
        rescale_data=rescale_data,
        rescaler=rescaler,
    )

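# Usage sketch (illustrative, kept as a comment so the module remains
# importable): PCA of an ensemble prediction, here without the non-linear
# rescaling; `ensemble_prediction` is assumed to be an existing
# EnsemblePrediction (or an Ensemble with predictions attached).
#
#     principal_components, pca_object = get_pca_representation_predictions(
#         ensemble_prediction, prediction_index=0, n_components=2,
#         rescale_data=False,
#     )
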
def _get_umap_representation_lowlevel(
    dataset: np.ndarray,
    n_components: int = 2,
    normalize_data: bool = False,
    **kwargs,
) -> tuple:
    """
    Low level UMAP of parameter ensemble.

    Compute the representation with reduced dimensionality via uniform
    manifold approximation and projection (with a given number of umap
    components) of the parameter ensemble.

    Parameters
    ----------
    dataset:
        numpy array containing either the ensemble predictions or the
        parameter ensemble itself
    n_components:
        number of components for the dimension reduction
    normalize_data:
        flag indicating whether the dataset should be rescaled with mean
        and standard deviation before fitting
    **kwargs:
        additional keyword arguments passed on to umap.UMAP()

    Returns
    -------
    umap_components:
        first components of the umap embedding
    umap_object:
        fitted umap object returned from umap.UMAP()
    """
    import umap
    import umap.plot
    from sklearn.preprocessing import StandardScaler

    # create a umap object
    umap_object = umap.UMAP(n_components=n_components, **kwargs)

    # normalize data with mean and standard deviation if wanted
    if normalize_data:
        dataset = StandardScaler().fit_transform(dataset)

    # perform the manifold fitting and transform the dataset
    umap_components = umap_object.fit_transform(dataset)

    return umap_components, umap_object


def _get_pca_representation_lowlevel(
    dataset: np.ndarray,
    n_components: int = 2,
    rescale_data: bool = True,
    rescaler: Union[Callable, None] = None,
) -> tuple:
    """
    Low level PCA of parameter ensemble.

    Compute the representation with reduced dimensionality via principal
    component analysis (with a given number of principal components) of the
    parameter ensemble.

    Parameters
    ----------
    dataset:
        numpy array containing either the ensemble predictions or the
        parameter ensemble itself
    n_components:
        number of components for the dimension reduction
    rescale_data:
        flag indicating whether the principal components should be rescaled
        using a rescaler function (e.g., an arcsinh function)
    rescaler:
        callable function to rescale the output of the PCA
        (defaults to numpy.arcsinh)

    Returns
    -------
    principal_components:
        principal components of the parameter vector ensemble
    pca_object:
        fitted pca object returned from sklearn.decomposition.PCA()
    """
    import sklearn.decomposition

    # create a PCA object and decompose the dataset
    pca_object = sklearn.decomposition.PCA(n_components=n_components)
    pca_object.fit(dataset)

    # get the projection down to the first components
    principal_components = pca_object.transform(dataset)

    # rescale the principal components with a non-linear function, if wanted
    if rescale_data:
        if rescaler is None:
            # use arcsinh as default
            principal_components = np.arcsinh(principal_components)
        else:
            # use the provided function for rescaling
            principal_components = rescaler(principal_components)

    return principal_components, pca_object