# Source code for phygnn.model_interfaces.random_forest_model

# -*- coding: utf-8 -*-
"""
Random Forest Model
"""
import json
import logging
import numpy as np
import pprint
import os
from sklearn.ensemble import RandomForestRegressor

from phygnn.model_interfaces.base_model import ModelBase
from phygnn.utilities.pre_processing import PreProcess

logger = logging.getLogger(__name__)


class RandomForestModel(ModelBase):
    """
    scikit learn Random Forest Regression model interface

    Wraps a sklearn.ensemble.RandomForestRegressor together with the
    feature / label bookkeeping handled by ModelBase. Exactly one label
    (output) variable is supported.
    """

    def __init__(self, model, feature_names=None, label_name=None,
                 norm_params=None, normalize=True, one_hot_categories=None):
        """
        Parameters
        ----------
        model : sklearn.ensemble.RandomForestRegressor
            Sklearn Random Forest Model
        feature_names : list
            Ordered list of feature names.
        label_name : str
            label (output) variable name.
        norm_params : dict, optional
            Dictionary mapping feature and label names (keys) to normalization
            parameters (mean, stdev), by default None
        normalize : bool | tuple, optional
            Boolean flag(s) as to whether features and labels should be
            normalized. Possible values:
            - True means normalize both
            - False means don't normalize either
            - Tuple of flags (normalize_feature, normalize_label)
            by default True
        one_hot_categories : dict, optional
            Features to one-hot encode using given categories, if None do
            not run one-hot encoding, by default None

        Raises
        ------
        ValueError
            If more than one label name is supplied.
        """
        super().__init__(model, feature_names=feature_names,
                         label_names=label_name, norm_params=norm_params,
                         normalize=normalize,
                         one_hot_categories=one_hot_categories)

        self._enforce_single_label()

    def _enforce_single_label(self):
        """
        Ensure that at most one label name is registered on the model

        Raises
        ------
        ValueError
            If more than one label name is registered.
        """
        label_names = self.label_names
        # label_name defaults to None in __init__, so the label names may
        # not be known yet; treat that as "no labels" instead of failing
        # on len(None).
        n_labels = 0 if label_names is None else len(label_names)
        if n_labels > 1:
            msg = ("Only a single label can be supplied to {}, but {} were"
                   .format(self.__class__.__name__, n_labels))
            logger.error(msg)
            raise ValueError(msg)

    @staticmethod
    def compile_model(**kwargs):
        """
        Build sklearn random forest model

        Parameters
        ----------
        kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor

        Returns
        -------
        sklearn.ensemble.RandomForestRegressor
            sklearn random forest model
        """
        return RandomForestRegressor(**kwargs)

    def unnormalize_prediction(self, prediction):
        """
        Unnormalize prediction if needed

        Parameters
        ----------
        prediction : ndarray
           Model prediction

        Returns
        -------
        prediction : ndarray
            Native prediction
        """
        means = self.label_means[0]
        # Compare against None instead of relying on truthiness: a label
        # mean of exactly 0.0 is a valid normalization parameter and must
        # still trigger the un-normalization.
        # NOTE(review): assumes ModelBase reports a missing mean as None
        # -- confirm against ModelBase.label_means.
        if means is not None:
            stdevs = self.label_stdevs[0]
            prediction = PreProcess.unnormalize(prediction, means, stdevs)

        return prediction

    def parse_labels(self, label, name=None):
        """
        Parse labels and normalize if desired

        Parameters
        ----------
        label : pandas.DataFrame | dict | ndarray
            Label data to train on
        name : list, optional
            List of label names, by default None

        Returns
        -------
        label : ndarray
            Parsed labels array, normalized if desired. When label
            normalization is disabled the input is returned unchanged.

        Raises
        ------
        ValueError
            If more than one label name is registered on the model.
        """
        # NOTE(review): the ModelBase parser is only invoked when labels
        # are normalized; otherwise the raw input passes straight through.
        # Confirm this is intended.
        if self.normalize_labels:
            label = super().parse_labels(label, names=name)

        # Checked after parsing, as the original did, so that any label
        # names registered by the parent parser are covered as well.
        self._enforce_single_label()

        return label

    def train_model(self, features, label, shuffle=True, parse_kwargs=None,
                    fit_kwargs=None):
        """
        Train the model with the provided features and label

        Parameters
        ----------
        features : dict | pandas.DataFrame
            Input features to train on
        label : dict | pandas.DataFrame
            label to train on
        shuffle : bool
            Flag to randomly permute the rows of the parsed features and
            label before fitting.
        parse_kwargs : dict
            kwargs for cls.parse_features
        fit_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor.fit
        """
        parse_kwargs = {} if parse_kwargs is None else parse_kwargs
        fit_kwargs = {} if fit_kwargs is None else fit_kwargs

        features = self.parse_features(features, **parse_kwargs)

        # parse_labels hands the input back untouched when label
        # normalization is disabled (e.g. still a DataFrame), so coerce to
        # an array before the positional indexing and ravel() below.
        label = np.asarray(self.parse_labels(label))

        if shuffle:
            n_samples = len(features)
            order = np.random.choice(n_samples, size=n_samples, replace=False)
            features = features[order]
            label = label[order]

        # pylint: disable=no-member
        self._model.fit(features, label.ravel(), **fit_kwargs)

    def save_model(self, path):
        """
        Save Random Forest Model to path.

        Parameters
        ----------
        path : str
            Path to save model to. Either a '.json' file path, or a
            directory in which '<directory name>.json' is written.

        Notes
        -----
        Only the framework parameters and the estimator hyperparameters
        (``get_params()``) are written; the fitted trees are not stored.
        """
        if path.endswith('.json'):
            dir_path = os.path.dirname(path)
        else:
            dir_path = path
            path = os.path.join(dir_path, os.path.basename(path) + '.json')

        # dirname() is '' for a bare file name such as 'model.json';
        # os.makedirs('') would raise, and the current directory already
        # exists. exist_ok avoids a race between the check and the create.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        model_params = {'feature_names': self.feature_names,
                        'label_name': self.label_names,
                        'norm_params': self.normalization_parameters,
                        'normalize': (self.normalize_features,
                                      self.normalize_labels),
                        'one_hot_categories': self.one_hot_categories,
                        'version_record': self.version_record,
                        'model_params': self.model.get_params(),
                        }

        model_params = self.dict_json_convert(model_params)
        with open(path, 'w') as f:
            json.dump(model_params, f, indent=2, sort_keys=True)

    @classmethod
    def build_trained(cls, features, label, normalize=True,
                      one_hot_categories=None, shuffle=True, save_path=None,
                      compile_kwargs=None, parse_kwargs=None, fit_kwargs=None):
        """
        Build Random Forest Model with given kwargs and then train with
        given features, labels, and kwargs

        Parameters
        ----------
        features : pandas.DataFrame
            Model features
        label : pandas.DataFrame
            label to train on
        normalize : bool | tuple, optional
            Boolean flag(s) as to whether features and labels should be
            normalized. Possible values:
            - True means normalize both
            - False means don't normalize either
            - Tuple of flags (normalize_feature, normalize_label)
            by default True
        one_hot_categories : dict, optional
            Features to one-hot encode using given categories, if None do
            not run one-hot encoding, by default None
        shuffle : bool
            Flag to randomly permute the rows of the parsed features and
            label before fitting.
        save_path : str
            Path handed to save_model() once training is done: a '.json'
            file path or a directory. If None the model is not saved,
            by default None
        compile_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor
        parse_kwargs : dict
            kwargs for cls.parse_features
        fit_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor.fit

        Returns
        -------
        model : RandomForestModel
            Initialized and trained RandomForestModel obj
        """
        if compile_kwargs is None:
            compile_kwargs = {}

        _, feature_names = cls._parse_data_names(features, fallback_prefix='F')
        _, label_name = cls._parse_data_names(label, fallback_prefix='L')

        rf_model = cls.compile_model(**compile_kwargs)
        if one_hot_categories is not None:
            # Validate the categories against every known column name
            # (features and label) before expanding the feature names.
            check_names = feature_names + label_name
            PreProcess.check_one_hot_categories(one_hot_categories,
                                                feature_names=check_names)
            feature_names = cls.make_one_hot_feature_names(feature_names,
                                                           one_hot_categories)

        model = cls(rf_model, feature_names=feature_names,
                    label_name=label_name, normalize=normalize,
                    one_hot_categories=one_hot_categories)

        model.train_model(features, label, shuffle=shuffle,
                          parse_kwargs=parse_kwargs, fit_kwargs=fit_kwargs)

        if save_path is not None:
            model.save_model(save_path)

        return model

    @classmethod
    def load(cls, path):
        """
        Load model from model path.

        Parameters
        ----------
        path : str
            Path to the '.json' file written by save_model(), or to the
            directory containing '<directory name>.json'.

        Returns
        -------
        model : RandomForestModel
            RandomForestModel rebuilt from the parameters stored on disk.

        Raises
        ------
        IOError
            If the resolved json file does not exist.

        Notes
        -----
        The json file holds framework parameters and estimator
        hyperparameters only, so the returned estimator is configured but
        not fitted.
        """
        if not path.endswith('.json'):
            path = os.path.join(path, os.path.basename(path) + '.json')

        if not os.path.exists(path):
            e = ('{} does not exist'.format(path))
            logger.error(e)
            raise IOError(e)

        with open(path, 'r') as f:
            model_params = json.load(f)

        # The version record is informational only and is not an __init__
        # argument, so it has to be removed before constructing the model.
        if 'version_record' in model_params:
            version_record = model_params.pop('version_record')
            logger.info('Loading model from disk that was created with the '
                        'following package versions: \n{}'
                        .format(pprint.pformat(version_record, indent=4)))

        rf_params = model_params.pop('model_params')
        loaded = RandomForestRegressor().set_params(**rf_params)

        # The remaining keys match the __init__ keyword arguments
        # written by save_model().
        return cls(loaded, **model_params)