Source code for phygnn.model_interfaces.random_forest_model

# -*- coding: utf-8 -*-
"""
Random Forest Model
"""
import json
import logging
import numpy as np
import pprint
import os
from sklearn.ensemble import RandomForestRegressor

from phygnn.model_interfaces.base_model import ModelBase
from phygnn.utilities.pre_processing import PreProcess

logger = logging.getLogger(__name__)


class RandomForestModel(ModelBase):
    """
    scikit learn Random Forest Regression model interface
    """

    def __init__(self, model, feature_names=None, label_name=None,
                 norm_params=None, normalize=True, one_hot_categories=None):
        """
        Parameters
        ----------
        model : sklearn.ensemble.RandomForestRegressor
            Sklearn Random Forest Model
        feature_names : list
            Ordered list of feature names.
        label_name : str
            label (output) variable name.
        norm_params : dict, optional
            Dictionary mapping feature and label names (keys) to normalization
            parameters (mean, stdev), by default None
        normalize : bool | tuple, optional
            Boolean flag(s) as to whether features and labels should be
            normalized. Possible values:
            - True means normalize both
            - False means don't normalize either
            - Tuple of flags (normalize_feature, normalize_label)
            by default True
        one_hot_categories : dict, optional
            Features to one-hot encode using given categories, if None do
            not run one-hot encoding, by default None

        Raises
        ------
        ValueError
            If more than one label name is configured.
        """
        super().__init__(model, feature_names=feature_names,
                         label_names=label_name, norm_params=norm_params,
                         normalize=normalize,
                         one_hot_categories=one_hot_categories)

        self._check_single_label()

    def _check_single_label(self):
        """
        Verify that at most one label name is configured

        Raises
        ------
        ValueError
            If more than one label name is configured.
        """
        n_labels = len(self.label_names)
        if n_labels > 1:
            msg = ("Only a single label can be supplied to {}, but {} were"
                   .format(self.__class__.__name__, n_labels))
            logger.error(msg)
            raise ValueError(msg)

    @staticmethod
    def compile_model(**kwargs):
        """
        Build sklearn random forest model

        Parameters
        ----------
        kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor

        Returns
        -------
        sklearn.ensemble.RandomForestRegressor
            sklearn random forest model
        """
        return RandomForestRegressor(**kwargs)

    def unnormalize_prediction(self, prediction):
        """
        Unnormalize prediction if needed

        Parameters
        ----------
        prediction : ndarray
            Model prediction

        Returns
        -------
        prediction : ndarray
            Native prediction
        """
        means = self.label_means[0]
        # Compare against None rather than relying on truthiness: a label
        # mean of exactly 0.0 is falsy but still requires un-normalization.
        if means is not None:
            stdevs = self.label_stdevs[0]
            prediction = PreProcess.unnormalize(prediction, means, stdevs)

        return prediction

    def parse_labels(self, label, name=None):
        """
        Parse labels and normalize if desired

        Parameters
        ----------
        label : pandas.DataFrame | dict | ndarray
            Label to train on
        name : list, optional
            List of label names, by default None

        Returns
        -------
        label : ndarray
            Parsed labels array, normalized if desired

        Raises
        ------
        ValueError
            If more than one label name is configured.
        """
        # NOTE(review): the label is only routed through the base class
        # parser when label normalization is enabled; otherwise the input is
        # returned as given (not converted). Confirm this is intended.
        if self.normalize_labels:
            label = super().parse_labels(label, names=name)

        self._check_single_label()

        return label

    def train_model(self, features, label, shuffle=True, parse_kwargs=None,
                    fit_kwargs=None):
        """
        Train the model with the provided features and label

        Parameters
        ----------
        features : dict | pandas.DataFrame
            Input features to train on
        label : dict | pandas.DataFrame
            label to train on
        shuffle : bool
            Flag to randomly reorder the features and label rows (with the
            same permutation) before fitting.
        parse_kwargs : dict
            kwargs for cls.parse_features
        fit_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor.fit
        """
        if parse_kwargs is None:
            parse_kwargs = {}

        if fit_kwargs is None:
            fit_kwargs = {}

        features = self.parse_features(features, **parse_kwargs)
        label = self.parse_labels(label)

        if shuffle:
            # Draw every row index exactly once, in random order, and apply
            # the same ordering to features and label to keep rows aligned.
            n_rows = len(features)
            order = np.random.choice(n_rows, size=n_rows, replace=False)
            features = features[order]
            label = label[order]

        # The label is flattened to 1D before being handed to fit
        # pylint: disable=no-member
        self._model.fit(features, label.ravel(), **fit_kwargs)

    def save_model(self, path):
        """
        Save Random Forest Model to path.

        The framework parameters and the Random Forest parameters returned
        by ``get_params()`` are written to a single json file.

        Parameters
        ----------
        path : str
            Path to save model to. If path ends with '.json' it is used as
            the file path, otherwise it is treated as a directory and the
            file is written to ``<path>/<basename of path>.json``.
        """
        if path.endswith('.json'):
            dir_path = os.path.dirname(path)
        else:
            dir_path = path
            path = os.path.join(dir_path, os.path.basename(path) + '.json')

        # dir_path is empty when a bare file name (e.g. 'model.json') is
        # given; os.makedirs('') raises, so only create named directories.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        # NOTE(review): only the parameters from get_params() are stored
        # here; nothing else from the sklearn model is written to disk.
        model_params = {'feature_names': self.feature_names,
                        'label_name': self.label_names,
                        'norm_params': self.normalization_parameters,
                        'normalize': (self.normalize_features,
                                      self.normalize_labels),
                        'one_hot_categories': self.one_hot_categories,
                        'version_record': self.version_record,
                        'model_params': self.model.get_params(),
                        }

        model_params = self.dict_json_convert(model_params)

        with open(path, 'w') as f:
            json.dump(model_params, f, indent=2, sort_keys=True)

    @classmethod
    def build_trained(cls, features, label, normalize=True,
                      one_hot_categories=None, shuffle=True, save_path=None,
                      compile_kwargs=None, parse_kwargs=None,
                      fit_kwargs=None):
        """
        Build Random Forest Model with given kwargs and then train with
        given features, labels, and kwargs

        Parameters
        ----------
        features : pandas.DataFrame
            Model features
        label : pandas.DataFrame
            label to train on
        normalize : bool | tuple, optional
            Boolean flag(s) as to whether features and labels should be
            normalized. Possible values:
            - True means normalize both
            - False means don't normalize either
            - Tuple of flags (normalize_feature, normalize_label)
            by default True
        one_hot_categories : dict, optional
            Features to one-hot encode using given categories, if None do
            not run one-hot encoding, by default None
        shuffle : bool
            Flag to randomly reorder the features and label rows before
            fitting.
        save_path : str
            Path to save the model to, passed to save_model which writes the
            framework and Random Forest parameters to a json file. If None
            the model is not saved, by default None
        compile_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor
        parse_kwargs : dict
            kwargs for cls.parse_features
        fit_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor.fit

        Returns
        -------
        model : RandomForestModel
            Initialized and trained RandomForestModel obj
        """
        if compile_kwargs is None:
            compile_kwargs = {}

        _, feature_names = cls._parse_data_names(features,
                                                 fallback_prefix='F')
        _, label_name = cls._parse_data_names(label, fallback_prefix='L')

        model = cls.compile_model(**compile_kwargs)

        if one_hot_categories is not None:
            # Validate the categories against feature and label names, then
            # update the feature names for the one-hot encoded features.
            check_names = feature_names + label_name
            PreProcess.check_one_hot_categories(one_hot_categories,
                                                feature_names=check_names)
            feature_names = cls.make_one_hot_feature_names(
                feature_names, one_hot_categories)

        model = cls(model, feature_names=feature_names,
                    label_name=label_name, normalize=normalize,
                    one_hot_categories=one_hot_categories)

        model.train_model(features, label, shuffle=shuffle,
                          parse_kwargs=parse_kwargs, fit_kwargs=fit_kwargs)

        if save_path is not None:
            model.save_model(save_path)

        return model

    @classmethod
    def load(cls, path):
        """
        Load model from model path.

        Parameters
        ----------
        path : str
            Path to the RandomForestModel json file, or to the directory
            containing ``<basename of path>.json``.

        Returns
        -------
        model : RandomForestModel
            RandomForestModel built from the parameters stored on disk.
            NOTE(review): the sklearn model is re-created via set_params()
            from the stored parameters only; confirm whether callers expect
            a fitted model.

        Raises
        ------
        IOError
            If the json file does not exist.
        """
        if not path.endswith('.json'):
            path = os.path.join(path, os.path.basename(path) + '.json')

        if not os.path.exists(path):
            e = ('{} does not exist'.format(path))
            logger.error(e)
            raise IOError(e)

        with open(path, 'r') as f:
            model_params = json.load(f)

        # The version record is informational only and is not an __init__
        # argument, so it is removed before the remaining params are used.
        if 'version_record' in model_params:
            version_record = model_params.pop('version_record')
            logger.info('Loading model from disk that was created with the '
                        'following package versions: \n%s',
                        pprint.pformat(version_record, indent=4))

        rf_params = model_params.pop('model_params')
        loaded = RandomForestRegressor().set_params(**rf_params)

        return cls(loaded, **model_params)