# Source code for ants.utils.impute
__all__ = ['impute']
import numpy as np
# `fancyimpute` is an optional dependency: when it cannot be imported the
# module still loads and only the fancyimpute-backed methods are disabled.
try:
    from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, IterativeSVD
    has_fancyimpute = True
except Exception:
    # Kept broader than ImportError on purpose so that any failure raised
    # while importing the optional package merely disables those methods.
    # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
    # propagate instead of being silently swallowed at import time.
    has_fancyimpute = False
def _is_nan_marker(marker):
    """Return True if `marker` is a scalar NaN, False for anything else."""
    try:
        return bool(np.isnan(marker))
    except (TypeError, ValueError):
        # Non-numeric markers raise TypeError; multi-element array-likes
        # raise ValueError in bool(). Neither is a NaN marker.
        return False


def _complete_with(solver, X):
    """Run a fancyimpute solver on `X` using whichever entry point it offers."""
    # Prefer the scikit-learn style `fit_transform`; fall back to `complete`
    # for solver objects that only expose that method.
    fit_transform = getattr(solver, 'fit_transform', None)
    if callable(fit_transform):
        return fit_transform(X)
    return solver.complete(X)


def _fill_with_column_stat(X, reducer):
    """Replace NaNs in `X` (in place) with `reducer(X, axis=0)` of their column."""
    col_stats = reducer(X, axis=0)
    missing = np.isnan(X)
    # Broadcasting the per-column statistic to X's shape lets a single boolean
    # mask pick the right column value for every missing cell.
    X[missing] = np.broadcast_to(col_stats, X.shape)[missing]
    return X


def impute(data, method='mean', value=None, nan_value=np.nan):
    """
    Impute missing values on a numpy ndarray in a column-wise manner.

    ANTsR function: `antsrimpute`

    Arguments
    ---------
    data : numpy.ndarray
        data to impute. The input is copied and never modified.

    method : string or scalar
        type of imputation method to use
        Options:
            mean
            median
            constant
            KNN
            BiScaler
            NuclearNormMinimization
            SoftImpute
            IterativeSVD
        A numeric `method` is used directly as the fill value, i.e. it is
        equivalent to ``method='constant', value=method``.

    value : scalar (optional)
        optional arguments for different methods
        if method == 'constant'
            constant value (required)
        if method == 'KNN'
            number of nearest neighbors to use (default 3)
        if method == 'IterativeSVD'
            rank of the decomposition (default min(10, n_rows - 2), at least 1)

    nan_value : scalar
        value which is interpreted as a missing value. With the default
        (NaN) only NaN entries are treated as missing. For any other value,
        entries equal to `nan_value` are treated as missing in addition to
        NaN entries; non-floating input is converted to float in that case
        so that the missing entries can be represented.

    Returns
    -------
    ndarray
        a new array with the missing entries filled in

    Raises
    ------
    ValueError
        if `method` is not recognised, if a fancyimpute method is requested
        but `fancyimpute` could not be imported, or if method == 'constant'
        and no `value` is given.

    Example
    -------
    >>> import ants
    >>> import numpy as np
    >>> data = np.random.randn(4,10)
    >>> data[2,3] = np.nan
    >>> data[3,5] = np.nan
    >>> data_imputed = ants.impute(data, 'mean')

    Details
    -------
    KNN: Nearest neighbor imputations which weights samples using the mean squared
        difference on features for which two rows both have observed data.

    SoftImpute: Matrix completion by iterative soft thresholding of SVD
        decompositions. Inspired by the softImpute package for R, which
        is based on Spectral Regularization Algorithms for Learning
        Large Incomplete Matrices by Mazumder et. al.

    IterativeSVD: Matrix completion by iterative low-rank SVD decomposition.
        Should be similar to SVDimpute from Missing value estimation
        methods for DNA microarrays by Troyanskaya et. al.

    NuclearNormMinimization: Simple implementation of Exact Matrix Completion
        via Convex Optimization by Emmanuel Candes and Benjamin
        Recht using cvxpy. Too slow for large matrices.

    BiScaler: Iterative estimation of row/column means and standard deviations
        to get doubly normalized matrix. Not guaranteed to converge but
        works well in practice. Taken from Matrix Completion and
        Low-Rank SVD via Fast Alternating Least Squares.
    """
    _fancyimpute_options = {'KNN', 'BiScaler', 'NuclearNormMinimization', 'SoftImpute', 'IterativeSVD'}
    _base_options = {'mean', 'median', 'constant'}

    # A bare number is accepted as shorthand for a constant fill value.
    scalar_method = isinstance(method, (int, float, np.number))

    if not scalar_method:
        if (method not in _base_options) and (method not in _fancyimpute_options):
            raise ValueError('method not understood.. Use `mean`, `median`, a scalar, or an option from `fancyimpute`')
        # Only consult the optional-dependency flag when it actually matters.
        if (method in _fancyimpute_options) and (not has_fancyimpute):
            raise ValueError('You must install `fancyimpute` (pip install fancyimpute) to use this method')
        if method == 'constant' and value is None:
            raise ValueError('Must give `value` argument if method == constant')

    X_incomplete = data.copy()

    # Honour a custom missing-value marker by rewriting it to NaN, which is
    # what every branch below (and fancyimpute) treats as "missing".
    if nan_value is not None and not _is_nan_marker(nan_value):
        dtype_kind = getattr(getattr(X_incomplete, 'dtype', None), 'kind', None)
        if dtype_kind not in ('f', 'c'):
            # Integer/bool storage cannot hold NaN, so promote first.
            X_incomplete = X_incomplete.astype(float)
        X_incomplete[X_incomplete == nan_value] = np.nan

    if scalar_method:
        X_incomplete[np.isnan(X_incomplete)] = method
        return X_incomplete

    if method == 'mean':
        return _fill_with_column_stat(X_incomplete, np.nanmean)

    if method == 'median':
        return _fill_with_column_stat(X_incomplete, np.nanmedian)

    if method == 'constant':
        X_incomplete[np.isnan(X_incomplete)] = value
        return X_incomplete

    if method == 'KNN':
        n_neighbors = 3 if value is None else value
        return _complete_with(KNN(k=n_neighbors, verbose=False), X_incomplete)

    if method == 'BiScaler':
        return BiScaler(verbose=False).fit_transform(X_incomplete)

    if method == 'NuclearNormMinimization':
        return _complete_with(NuclearNormMinimization(verbose=False), X_incomplete)

    if method == 'SoftImpute':
        return _complete_with(SoftImpute(verbose=False), X_incomplete)

    # Only 'IterativeSVD' remains after the validation above.
    if value is None:
        # Clamp to 1 so that inputs with very few rows do not request a
        # zero or negative rank.
        rank = max(1, min(10, X_incomplete.shape[0] - 2))
    else:
        rank = value
    return _complete_with(IterativeSVD(rank=rank, verbose=False), X_incomplete)