Source code for ants.utils.impute



__all__ = ['impute']

import numpy as np

try:
    from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, IterativeSVD
    has_fancyimpute = True
except ImportError:
    has_fancyimpute = False
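# Note (descriptive comment, not in the original source): `has_fancyimpute` gates the
# matrix-completion methods below (KNN, BiScaler, NuclearNormMinimization, SoftImpute,
# IterativeSVD); if fancyimpute is not installed, requesting one of those methods
# makes `impute` raise a ValueError instead of failing at import time.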


def impute(data, method='mean', value=None, nan_value=np.nan):
    """
    Impute missing values on a numpy ndarray in a column-wise manner.

    ANTsR function: `antsrimpute`

    Arguments
    ---------
    data : numpy.ndarray
        data to impute

    method : string or float
        type of imputation method to use
        Options:
            mean
            median
            constant
            KNN
            BiScaler
            NuclearNormMinimization
            SoftImpute
            IterativeSVD

    value : scalar (optional)
        optional argument for different methods
        if method == 'constant'
            constant value
        if method == 'KNN'
            number of nearest neighbors to use

    nan_value : scalar
        value which is interpreted as a missing value

    Returns
    -------
    ndarray if ndarray was given
    OR
    pd.DataFrame if pd.DataFrame was given

    Example
    -------
    >>> import ants
    >>> import numpy as np
    >>> data = np.random.randn(4,10)
    >>> data[2,3] = np.nan
    >>> data[3,5] = np.nan
    >>> data_imputed = ants.impute(data, 'mean')

    Details
    -------
    KNN: Nearest neighbor imputation which weights samples using the mean squared
        difference on features for which two rows both have observed data.

    SoftImpute: Matrix completion by iterative soft thresholding of SVD
        decompositions. Inspired by the softImpute package for R, which is based on
        Spectral Regularization Algorithms for Learning Large Incomplete Matrices
        by Mazumder et al.

    IterativeSVD: Matrix completion by iterative low-rank SVD decomposition.
        Should be similar to SVDimpute from Missing value estimation methods for
        DNA microarrays by Troyanskaya et al.

    MICE: Reimplementation of Multiple Imputation by Chained Equations.

    MatrixFactorization: Direct factorization of the incomplete matrix into
        low-rank U and V, with an L1 sparsity penalty on the elements of U and an
        L2 penalty on the elements of V. Solved by gradient descent.

    NuclearNormMinimization: Simple implementation of Exact Matrix Completion
        via Convex Optimization by Emmanuel Candes and Benjamin Recht using cvxpy.
        Too slow for large matrices.

    BiScaler: Iterative estimation of row/column means and standard deviations
        to get a doubly normalized matrix. Not guaranteed to converge but works
        well in practice. Taken from Matrix Completion and Low-Rank SVD via
        Fast Alternating Least Squares.
    """
    _fancyimpute_options = {'KNN', 'BiScaler', 'NuclearNormMinimization', 'SoftImpute', 'IterativeSVD'}
    if (not has_fancyimpute) and (method in _fancyimpute_options):
        raise ValueError('You must install `fancyimpute` (pip install fancyimpute) to use this method')

    _base_options = {'mean', 'median', 'constant'}
    if (method not in _base_options) and (method not in _fancyimpute_options) and (not isinstance(method, (int, float))):
        raise ValueError('method not understood. Use `mean`, `median`, a scalar, or an option from `fancyimpute`')

    X_incomplete = data.copy()

    if method == 'KNN':
        if value is None:
            value = 3
        X_filled = KNN(k=value, verbose=False).complete(X_incomplete)

    elif method == 'BiScaler':
        X_filled = BiScaler(verbose=False).fit_transform(X_incomplete)

    elif method == 'SoftImpute':
        X_filled = SoftImpute(verbose=False).complete(X_incomplete)

    elif method == 'IterativeSVD':
        if value is None:
            rank = min(10, X_incomplete.shape[0] - 2)
        else:
            rank = value
        X_filled = IterativeSVD(rank=rank, verbose=False).complete(X_incomplete)

    elif method == 'mean':
        # fill each column's missing entries with that column's mean
        col_means = np.nanmean(X_incomplete, axis=0)
        for i in range(X_incomplete.shape[1]):
            X_incomplete[:, i][np.isnan(X_incomplete[:, i])] = col_means[i]
        X_filled = X_incomplete

    elif method == 'median':
        # fill each column's missing entries with that column's median
        col_medians = np.nanmedian(X_incomplete, axis=0)
        for i in range(X_incomplete.shape[1]):
            X_incomplete[:, i][np.isnan(X_incomplete[:, i])] = col_medians[i]
        X_filled = X_incomplete

    elif method == 'constant':
        if value is None:
            raise ValueError('Must give `value` argument if method == constant')
        X_incomplete[np.isnan(X_incomplete)] = value
        X_filled = X_incomplete

    return X_filled
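
A minimal usage sketch extending the docstring example above. It assumes the `ants` package is importable (as in the docstring) and only exercises the base branches, which need nothing beyond numpy; the fancyimpute-backed methods would additionally require `pip install fancyimpute`. The array values are illustrative only.

    import numpy as np
    import ants

    # small matrix with two missing entries
    data = np.random.randn(4, 10)
    data[2, 3] = np.nan
    data[3, 5] = np.nan

    # column-wise mean imputation: each NaN is replaced by np.nanmean of its column;
    # `data` itself is untouched because impute() works on a copy
    filled_mean = ants.impute(data, method='mean')
    assert not np.isnan(filled_mean).any()
    assert np.isclose(filled_mean[2, 3], np.nanmean(data[:, 3]))

    # constant imputation: every NaN is replaced by the supplied `value`
    filled_const = ants.impute(data, method='constant', value=0.0)
    assert np.isclose(filled_const[3, 5], 0.0)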