Source code for phygnn.utilities.pre_processing

# -*- coding: utf-8 -*-
"""
Data pre-processing module.
"""
import logging
from warnings import warn

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

logger = logging.getLogger(__name__)


class PreProcess:
    """Class to handle the pre-processing of feature data."""

    def __init__(self, features, feature_names=None):
        """
        Parameters
        ----------
        features : np.ndarray | pd.DataFrame
            Feature data in a 2D array or DataFrame.
        feature_names : list, optional
            Feature names, used if features is an ndarray, by default None.
            Ignored when features is a DataFrame (column labels are used).

        Raises
        ------
        AttributeError
            If features is a DataFrame whose index is not unique.
        ValueError
            If the number of unique feature names does not match the number
            of feature columns.
        """
        self._features = features
        self._pd = isinstance(features, pd.DataFrame)

        if self._pd:
            self._feature_names = features.columns.tolist()
            if not features.index.is_unique:
                msg = 'DataFrame indices must be unique'
                logger.error(msg)
                raise AttributeError(msg)
        else:
            if feature_names is not None:
                n_names = len(set(feature_names))
                if n_names != features.shape[1]:
                    msg = ('The number of feature names ({}) does not match '
                           'the number of features ({})!'
                           .format(n_names, features.shape[1]))
                    logger.error(msg)
                    raise ValueError(msg)

            self._feature_names = feature_names

    @staticmethod
    def _check_stdev(stdev):
        """
        Check stdev values for 0s or near 0 values, replace with 1s

        Parameters
        ----------
        stdev : int | float | ndarray
            Normalization stdev value(s)

        Returns
        -------
        stdev : int | float | ndarray
            Normalization stdev values(s) with 0s replaced with 1s. The input
            is returned untouched if nothing had to be replaced; otherwise a
            new object is returned and the input is left unmodified.
        """
        zeros = np.isclose(stdev, 0)
        if np.any(zeros):
            msg = ('Standard deviation is ~0 and will be set to 1')
            logger.warning(msg)
            warn(msg)
            # np.isclose returns np.bool_ (not the builtin bool) for scalar
            # input, so scalars are detected by dimensionality instead of
            # by the type of the mask.
            if np.ndim(stdev) == 0:
                stdev = 1
            else:
                # Replace on a copy so the caller's array is not overwritten
                stdev = np.array(stdev)
                stdev[zeros] = 1

        return stdev

    @staticmethod
    def normalize(arr, mean=None, stdev=None):
        """
        Normalize features with mean at 0 and stdev of 1.

        Parameters
        ----------
        arr : ndarray | pd.DataFrame
            native data, dataframes are converted to arrays. The last axis
            is the feature axis. The input is never modified.
        mean : float | array-like | None
            mean to use for normalization. A scalar is applied to every
            feature, an array-like must have one entry per feature. None
            computes the (nan-ignoring) mean of each feature.
        stdev : float | array-like | None
            stdev to use for normalization, same conventions as mean.
            Values that are ~0 are replaced with 1.

        Returns
        -------
        norm_arr : ndarray
            normalized data (floating point copy of the input)
        mean : np.ndarray
            1D array of mean values used for normalization with length equal
            to number of features
        stdev : np.ndarray
            1D array of stdev values used for normalization with length equal
            to number of features
        """
        if isinstance(arr, pd.DataFrame):
            arr = arr.values

        arr = np.asarray(arr)
        if np.issubdtype(arr.dtype, np.inexact):
            # copy so the caller's data is not normalized in place
            arr = arr.copy()
        else:
            # integer/bool/object data would truncate or reject the
            # normalized values when written back, so promote to float
            arr = arr.astype(np.float64)

        n_features = arr.shape[-1]

        if mean is None:
            mean = np.array([np.nanmean(arr[..., f])
                             for f in range(n_features)])
        elif np.ndim(mean) == 0:
            mean = np.full(n_features, mean)

        if stdev is None:
            stdev = np.array([np.nanstd(arr[..., f])
                              for f in range(n_features)])
        elif np.ndim(stdev) == 0:
            stdev = np.full(n_features, stdev)

        stdev = PreProcess._check_stdev(stdev)

        for f in range(n_features):
            arr[..., f] = (arr[..., f] - mean[f]) / stdev[f]

        return arr, mean, stdev

    @staticmethod
    def unnormalize(norm_arr, mean, stdev):
        """
        Unnormalize data with mean at 0 and stdev of 1.

        Parameters
        ----------
        norm_arr : ndarray
            normalized data
        mean : float | ndarray
            mean used for normalization
        stdev : float | ndarray
            stdev used for normalization

        Returns
        -------
        native_arr : ndarray
            native un-normalized data
        """
        # Single out-of-place expression: an in-place "+=" raises a casting
        # error when integer data is combined with a floating point mean.
        return norm_arr * stdev + mean

    @staticmethod
    def _is_one_hot(arr, convert_int=False):
        """Check if an array of data is to be transformed into a one-hot
        vector by sampling the first datum and checking the type.

        Parameters
        ----------
        arr : np.ndarray
            Array (column) of data to be checked, 1D or 2D.
        convert_int : bool
            Flag to convert integer data to one-hot vectors.

        Returns
        -------
        one_hot : bool
            True if arr is to be transformed into a one-hot vector. Empty
            columns have no datum to sample and are reported as False.

        Raises
        ------
        ValueError
            If arr is neither 1D nor 2D.
        """
        n_dims = len(arr.shape)
        if n_dims not in (1, 2):
            e = 'Cannot process 3D column into one hot'
            logger.error(e)
            raise ValueError(e)

        if arr.size == 0:
            return False

        sample = arr[0] if n_dims == 1 else arr[0, 0]

        if isinstance(sample, str):
            return True

        return bool(convert_int
                    and np.issubdtype(type(sample), np.integer))

    @staticmethod
    def check_one_hot_categories(one_hot_categories, feature_names=None):
        """
        Check one hot features and categories for duplicate names and against
        feature names if provided

        Parameters
        ----------
        one_hot_categories : dict
            Features to one-hot encode using given categories. Keys are
            feature names, values are lists of category names.
        feature_names : list, optional
            Feature names, by default None

        Raises
        ------
        RuntimeError
            If a category name is used more than once, if a key of
            one_hot_categories is not one of feature_names, or if a category
            name collides with a feature name that is not being encoded.
        """
        one_hot_features_names = [name
                                  for cats in one_hot_categories.values()
                                  for name in cats]
        names, feature_counts = np.unique(one_hot_features_names,
                                          return_counts=True)
        duplicated = feature_counts > 1
        if np.any(duplicated):
            msg = ('one-hot category names have to be unique accross all '
                   'features. The following category names were duplicated:'
                   '\n{}'.format(names[duplicated]))
            logger.error(msg)
            raise RuntimeError(msg)

        if feature_names is not None:
            one_hot_features = np.array(list(one_hot_categories))
            check = np.isin(one_hot_features, feature_names)
            if not all(check):
                bad_names = one_hot_features[~check]
                msg = ('The following one-hot features do not have valid '
                       'names!\n{}\nMust be one of the available feature '
                       'names:\n{}'.format(bad_names, feature_names))
                logger.error(msg)
                raise RuntimeError(msg)

            # Names of the features that survive encoding untouched; the new
            # one-hot column names must not clash with any of them.
            final_names = list(set(feature_names) - set(one_hot_categories))
            check = np.isin(one_hot_features_names, final_names)
            if any(check):
                msg = ('The following category names: {} conflict with '
                       'existing feature names'
                       .format(np.array(one_hot_features_names)[check]))
                logger.error(msg)
                raise RuntimeError(msg)

    def _get_one_hot_data(self, convert_int=False, categories=None):
        """Get one hot data and column indexes.

        Parameters
        ----------
        convert_int : bool
            Flag to convert integer data to one-hot vectors.
        categories : dict | None
            Categories to use for one hot encoding where a key is the original
            column name in the feature dataframe and value is a list of the
            possible unique values of the feature column. The value list must
            have as many or more entries as unique values in the feature
            column. This will name the feature column headers for the new
            one-hot-encoding if features is a dataframe. Empty dict or None
            results in category names being determined automatically. Format:
                {'col_name1' : ['cat1', 'cat2', 'cat3'],
                 'col_name2' : ['other_cat1', 'other_cat2']}

        Returns
        -------
        one_hot_ind : list
            List of numeric column indices in the native data that are
            to-be-transformed into one-hot vectors.
        one_hot_data : list
            List of arrays of one hot data columns that are transformations of
            the one_hot_ind columns.
        numerical_ind : list
            List of numeric column indices in the native data that are
            continuous numerical columns that are not to-be-transformed into
            one-hot vectors.
        """
        if categories is None:
            categories = {}

        one_hot_ind = []
        one_hot_data = []
        numerical_ind = []

        n = len(self._features)
        # Explicit None test: truthiness is ambiguous for ndarray names
        have_names = self._feature_names is not None

        for i in range(self._features.shape[1]):
            name = self._feature_names[i] if have_names else None

            if self._pd:
                col = self._features.iloc[:, i].values.reshape((n, 1))
            else:
                col = self._features[:, i].reshape((n, 1))

            if not self._is_one_hot(col, convert_int=convert_int):
                numerical_ind.append(i)
                continue

            logger.debug('One hot encoding {}'.format(name))
            one_hot_ind.append(i)

            if name in categories:
                cats = [categories[name]]
                logger.debug('Using categories {} for column {}'
                             ''.format(cats, name))
                oh_obj = OneHotEncoder(sparse_output=False,
                                       categories=cats)
            else:
                oh_obj = OneHotEncoder(sparse_output=False)

            one_hot_data.append(oh_obj.fit_transform(col))

        return one_hot_ind, one_hot_data, numerical_ind

    def _make_df_one_hot_cols_labels(self, one_hot_ind, one_hot_data,
                                     categories=None):
        """Make unique column labels for the new one-hot data. This will use
        column labels from categories if available.

        Only valid when the features are a DataFrame (column labels are read
        from it).

        Parameters
        ----------
        one_hot_ind : list
            List of numeric column indices in the native data that are
            to-be-transformed into one-hot vectors.
        one_hot_data : list
            List of arrays of one hot data columns that are transformations of
            the one_hot_ind columns.
        categories : dict | None
            Categories to use for one hot encoding where a key is the original
            column name in the feature dataframe and value is a list of the
            possible unique values of the feature column. The value list must
            have as many or more entries as unique values in the feature
            column. This will name the feature column headers for the new
            one-hot-encoding if features is a dataframe. Empty dict or None
            results in category names being determined automatically. Format:
                {'col_name1' : ['cat1', 'cat2', 'cat3'],
                 'col_name2' : ['other_cat1', 'other_cat2']}

        Returns
        -------
        col_labels : list
            List of string labels corresponding to np.hstack(one_hot_data).

        Raises
        ------
        AssertionError
            If a categories value is not a list or tuple, or has fewer
            entries than the column has unique values.
        """
        if categories is None:
            categories = {}

        col_labels = []
        for i, oh_ind in enumerate(one_hot_ind):
            orig_col_label = self._features.columns.values[oh_ind]

            if orig_col_label not in categories:
                # str.format instead of "+" so that non-string column labels
                # (e.g. integer headers) can be labelled as well
                col_labels += ['{}_{}'.format(orig_col_label, k)
                               for k in range(one_hot_data[i].shape[1])]
                continue

            cat_labels = categories[orig_col_label]

            # Raised explicitly (rather than via the assert statement) so
            # the validation still runs under "python -O".
            if not isinstance(cat_labels, (list, tuple)):
                msg = ('Values in the categories input dict must be a '
                       'list or tuple!')
                raise AssertionError(msg)

            unique_vals = pd.unique(self._features[orig_col_label])
            if len(cat_labels) < len(unique_vals):
                msg = ('Categories for "{a}" one-hot column had fewer unique '
                       'entries than one-hot encodings! You input these '
                       'categories: {b} but "{a}" has these values: {c}'
                       .format(a=orig_col_label, b=cat_labels,
                               c=unique_vals))
                raise AssertionError(msg)

            col_labels += list(cat_labels)

        return col_labels

    @staticmethod
    def update_names(names, categories):
        """Update feature names with the OHE categories.

        Every name that is a key in categories is replaced, in position, by
        the category names listed for it. All other names are kept as they
        are. The input list is not modified.

        Parameters
        ----------
        names : list | None
            Feature or label names
        categories : dict | None
            Categories to use for one hot encoding where a key is the original
            column name in the feature dataframe and value is a list of the
            possible unique values of the feature column. Empty dict or None
            leaves the names unchanged. Format:
                {'col_name1' : ['cat1', 'cat2', 'cat3'],
                 'col_name2' : ['other_cat1', 'other_cat2']}

        Returns
        -------
        names : list | None
            Names updated with categories (new list), or None if names was
            None.
        """
        if names is None:
            return None

        if not categories:
            return list(names)

        updated = []
        for name in names:
            if name in categories:
                updated.extend(categories[name])
            else:
                # append the name as one entry; iterating over it would
                # split a string into its individual characters
                updated.append(name)

        return updated

    def process_one_hot(self, convert_int=False, categories=None,
                        return_ind=False):
        """Process str and int columns in the feature data to one-hot vectors.

        Parameters
        ----------
        convert_int : bool, optional
            Flag to convert integer data to one-hot vectors, by default False
        categories : dict | None, optional
            Categories to use for one hot encoding where a key is the original
            column name in the feature dataframe and value is a list of the
            possible unique values of the feature column. The value list must
            have as many or more entries as unique values in the feature
            column. This will name the feature column headers for the new
            one-hot-encoding if features is a dataframe. Empty dict or None
            results in category names being determined automatically. Format:
                {'col_name1' : ['cat1', 'cat2', 'cat3'],
                 'col_name2' : ['other_cat1', 'other_cat2']}
            by default None
        return_ind : bool, optional
            Return one hot column indices, by default False

        Returns
        -------
        processed : np.ndarray | pd.DataFrame
            Feature data with str and int columns removed and one-hot boolean
            vectors appended as new columns. If features is a dataframe and
            categories is input, the new one-hot columns will be named
            according to categories. If no column needs encoding the original
            features are returned as they are.
        one_hot_ind : list, optional
            List of numeric column indices in the native data that are
            to-be-transformed into one-hot vectors. Only returned if
            return_ind is True.
        """
        if categories is None:
            categories = {}
        else:
            self.check_one_hot_categories(categories,
                                          feature_names=self._feature_names)

        one_hot_ind, one_hot_data, numerical_ind = self._get_one_hot_data(
            convert_int=convert_int, categories=categories)

        if not one_hot_ind:
            processed = self._features
        else:
            if self._pd:
                num_df = self._features.iloc[:, numerical_ind]
                col_labels = self._make_df_one_hot_cols_labels(one_hot_ind,
                                                               one_hot_data,
                                                               categories)
                # Reuse the feature index so the join aligns row for row
                one_hot_df = pd.DataFrame(np.hstack(one_hot_data),
                                          columns=col_labels,
                                          index=self._features.index)
                processed = num_df.join(one_hot_df)
                assert processed.shape[0] == num_df.shape[0] == \
                    one_hot_df.shape[0]
            else:
                processed = np.hstack((self._features[:, numerical_ind],
                                       np.hstack(one_hot_data)))

            assert processed.shape[0] == self._features.shape[0]

            processed = processed.astype(np.float32)

        if return_ind:
            return processed, one_hot_ind

        return processed

    @classmethod
    def one_hot(cls, features, feature_names=None, convert_int=False,
                categories=None, return_ind=False):
        """
        Process str and int columns in the feature data to one-hot vectors.

        Parameters
        ----------
        features : np.ndarray | pd.DataFrame
            Feature data in a 2D array or DataFrame.
        feature_names : list, optional
            Feature names, used if features is an ndarray, by default None
        convert_int : bool, optional
            Flag to convert integer data to one-hot vectors, by default False
        categories : dict | None, optional
            Categories to use for one hot encoding where a key is the original
            column name in the feature dataframe and value is a list of the
            possible unique values of the feature column. The value list must
            have as many or more entries as unique values in the feature
            column. This will name the feature column headers for the new
            one-hot-encoding if features is a dataframe. Empty dict or None
            results in category names being determined automatically. Format:
                {'col_name1' : ['cat1', 'cat2', 'cat3'],
                 'col_name2' : ['other_cat1', 'other_cat2']}
            by default None
        return_ind : bool, optional
            Return one hot column indices, by default False

        Returns
        -------
        processed : np.ndarray | pd.DataFrame
            Feature data with str and int columns removed and one-hot boolean
            vectors appended as new columns. If features is a dataframe and
            categories is input, the new one-hot columns will be named
            according to categories.
        one_hot_ind : list, optional
            List of numeric column indices in the native data that are
            to-be-transformed into one-hot vectors. Only returned if
            return_ind is True.
        """
        logger.debug('Checking for one-hot items and converting them '
                     'to binary values')
        pp = cls(features, feature_names=feature_names)
        out = pp.process_one_hot(convert_int=convert_int,
                                 categories=categories,
                                 return_ind=return_ind)

        return out