"""Cloud Properties filling using phgynn"""
import logging
import os
import shutil
import time
from concurrent.futures import as_completed
from warnings import warn
import numpy as np
import pandas as pd
import psutil
from farms import ICE_TYPES, WATER_TYPES
from mlclouds import MODEL_FPATH
from mlclouds.model.multi_step import MultiCloudsModel
from rex import MultiFileNSRDB
from rex.utilities.execution import SpawnProcessPool
from nsrdb.data_model.variable_factory import VarFactory
from nsrdb.file_handlers.outputs import Outputs
from nsrdb.file_handlers.resource import Resource
from nsrdb.gap_fill.cloud_fill import CloudGapFill
logger = logging.getLogger(__name__)
class MLCloudsFill:
"""
Use the MLClouds algorithm with a phygnn model to fill missing cloud data
"""
DEFAULT_MODEL = MODEL_FPATH
def __init__(
self, h5_source, fill_all=False, model_path=None, var_meta=None
):
"""
Parameters
----------
h5_source : str
Path to directory containing multi-file resource file sets.
Available formats:
/h5_dir/
/h5_dir/prefix*suffix
fill_all : bool
Flag to fill all cloud properties for all timesteps where
cloud_type is cloudy.
model_path : dict | None
kwargs for the ``MultiCloudsModel.load`` method. Specifies
``cloud_prop_model_path`` for the cloud property model and optionally
``cloud_type_model_path`` for a cloud type model. Each value is
typically a path to a .pkl file with an accompanying .json file in
the same directory. None will try to use the default model path
from the mlclouds project directory.
var_meta : str | pd.DataFrame | None
CSV file or dataframe containing meta data for all NSRDB variables.
Defaults to the NSRDB var meta csv in git repo.
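Examples
--------
A minimal construction sketch; the directory path below is
hypothetical and must point to a multi-file NSRDB .h5 set containing
the feature datasets required by the phygnn model:

>>> mlf = MLCloudsFill('./nsrdb_2020_h5_dir/', fill_all=False)
>>> mlf.preflight()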
"""
self._dset_map = {}
self._res_shape = None
self._h5_source = h5_source
self._fill_all = fill_all
self._var_meta = var_meta
if model_path is None:
model_path = self.DEFAULT_MODEL
logger.info(
'Initializing MLCloudsFill with h5_source: {}'.format(
self._h5_source
)
)
logger.info(
'Initializing MLCloudsFill with model: {}'.format(model_path)
)
logger.info(
'MLCloudsFill filling all cloud properties: {}'.format(
self._fill_all
)
)
self._phygnn_model = MultiCloudsModel.load(**model_path)
if self.h5_source is not None:
with MultiFileNSRDB(self.h5_source) as res:
self._dset_map = res.h5._dset_map
self._res_shape = res.shape
def preflight(self):
"""Run preflight checks and raise error if datasets are missing"""
missing = []
for dset in self._phygnn_model.feature_names:
ignore = ('clear', 'ice_cloud', 'water_cloud', 'bad_cloud')
if dset not in ignore and dset not in self._dset_map:
missing.append(dset)
if any(missing):
msg = (
'The following datasets were missing in the h5_source '
'directory: {}'.format(missing)
)
logger.error(msg)
raise FileNotFoundError(msg)
@property
def phygnn_model(self):
"""
Pre-trained MultiCloudsModel instance
Returns
-------
MultiCloudsModel
"""
return self._phygnn_model
@property
def h5_source(self):
"""
Path to directory containing multi-file resource file sets.
Available formats:
/h5_dir/
/h5_dir/prefix*suffix
Returns
-------
str
"""
return self._h5_source
@property
def dset_map(self):
"""
Mapping of datasets to .h5 files
Returns
-------
dict
"""
return self._dset_map
def parse_feature_data(self, feature_data=None, col_slice=slice(None)):
"""
Parse raw feature data from .h5 files (will have gaps!)
Parameters
----------
feature_data : dict | None
Pre-loaded feature data to add to (optional). Keys are the feature
names (nsrdb dataset names), values are 2D numpy arrays
(time x sites). Any dsets already in this input won't be re-read.
col_slice : slice
Column slice of the resource data to work on. This is a result of
chunking the columns to reduce memory load. Use slice(None) for
no chunking.
Returns
-------
feature_data : dict
Raw feature data with gaps. keys are the feature names (nsrdb
dataset names), values are 2D numpy arrays (time x sites).
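Examples
--------
A minimal sketch of loading a raw (gappy) column chunk of feature
data, assuming ``mlf`` is an initialized MLCloudsFill instance:

>>> raw = mlf.parse_feature_data(col_slice=slice(0, 1000))
>>> sorted(raw)  # nsrdb dataset names mapped to 2D (time x sites) arrays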
"""
logger.info('Loading feature data.')
if feature_data is None:
feature_data = {}
dsets = (
self._phygnn_model.feature_names + self._phygnn_model.label_names
)
with MultiFileNSRDB(self.h5_source) as res:
for dset in dsets:
if dset not in feature_data and dset in res.dsets:
logger.debug('Loading {} data'.format(dset))
feature_data[dset] = res[dset, :, col_slice]
mem = psutil.virtual_memory()
logger.info(
'Feature data loaded for column slice {}. '
'Memory utilization is {:.3f} GB out of {:.3f} GB total '
'({:.2f}% used).'.format(
col_slice,
mem.used / 1e9,
mem.total / 1e9,
100 * mem.used / mem.total,
)
)
return feature_data
def init_clean_arrays(self):
"""Initialize a dict of numpy arrays for clean data."""
arr = np.zeros(self._res_shape, dtype=np.float32)
dsets = (
'cloud_type',
'cld_press_acha',
'cld_opd_dcomp',
'cld_reff_dcomp',
)
clean_arrays = {d: arr.copy() for d in dsets}
fill_flag_array = arr.copy().astype(np.uint8)
return clean_arrays, fill_flag_array
@staticmethod
def clean_array(dset, array):
"""Clean a dataset array using temporal nearest neighbor interpolation.
Parameters
----------
dset : str
NSRDB dataset name
array : np.ndarray
2D (time x sites) float numpy array of data for dset. Missing
values should be set to NaN.
Returns
-------
array : np.ndarray
2D (time x sites) float numpy array of data for dset. Missing
values should be filled
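Examples
--------
A small illustrative array (made-up values) where scattered NaN
values are filled by temporal nearest neighbor interpolation:

>>> import numpy as np
>>> arr = np.array([[1.0, 2.0],
...                 [np.nan, 3.0],
...                 [2.0, np.nan],
...                 [3.0, 4.0]])
>>> filled = MLCloudsFill.clean_array('cld_opd_dcomp', arr)
>>> bool(np.isnan(filled).any())
False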
"""
# attempt to clean sites with mostly NaN values. This has to happen
# before the df.interpolate() call.
bad = np.isnan(array)
any_bad = bad.any()
bad_cols = (~bad).sum(axis=0) < 3
if bad.all():
msg = (
f'Feature dataset "{dset}" has all NaN data! Filling with '
'zeros.'
)
logger.warning(msg)
warn(msg)
array[:] = 0
elif bad_cols.any():
mean_impute = np.nanmean(array, axis=0)
count = bad_cols.sum()
msg = (
'Feature dataset "{}" has {} columns with nearly all NaN '
'values out of {} ({:.2f}%) ({} NaN values out of '
'{} total {:.2f}%). Filling with mean values by site.'.format(
dset,
count,
array.shape[1],
100 * count / array.shape[1],
bad.sum(),
array.size,
100 * bad.sum() / array.size,
)
)
logger.warning(msg)
warn(msg)
array[:, bad_cols] = mean_impute[bad_cols]
# attempt to fill all remaining NaN values that should be scattered
# throughout (not full sites missing data)
if any_bad:
array = (
pd.DataFrame(array)
.interpolate('nearest')
.ffill()
.bfill()
.values
)
# Fill any persistent NaN values with the global mean (these are
# usually sites that have no valid data at all)
bad = np.isnan(array)
bad_cols = (~bad).sum(axis=0) < 3
if bad.any() or bad_cols.any():
mean_impute = np.nanmean(array)
msg = (
'There were {} observations (out of {}) that have '
'persistent NaN values and {} sites (out of {}) that '
'still have all NaN values that could not be '
'cleaned for {}. Filling with global mean value of {}'.format(
bad.sum(),
bad.size,
bad_cols.sum(),
array.shape[1],
dset,
mean_impute,
)
)
logger.warning(msg)
warn(msg)
array[bad] = mean_impute
return array
def clean_feature_data(
self, feature_raw, fill_flag, sza_lim=90, max_workers=1
):
"""
Clean feature data
Parameters
----------
feature_raw : dict
Raw feature data with gaps. keys are the feature names (nsrdb
dataset names), values are 2D numpy arrays (time x sites).
fill_flag : np.ndarray
Integer array of flags showing what data was filled and why.
sza_lim : int, optional
Solar zenith angle limit below which missing cloud property data
will be gap filled. By default 90 to fill all missing daylight data
max_workers : None | int
Maximum workers to clean data in parallel. 1 is serial and None
uses all available workers.
Returns
-------
feature_data : dict
Clean feature data without gaps. keys are the feature names
(nsrdb dataset names), values are 2D numpy arrays (time x sites).
fill_flag : np.ndarray
Integer array of flags showing what data was filled and why.
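Examples
--------
A minimal sketch, assuming ``mlf`` is an initialized MLCloudsFill
instance; cloud type and pressure gaps are first filled with
fill_ctype_press() as required by this method:

>>> ctype, cpres, sza, fill_flag = mlf.fill_ctype_press(mlf.h5_source)
>>> raw = mlf.parse_feature_data(
...     feature_data={'cloud_type': ctype,
...                   'cld_press_acha': cpres,
...                   'solar_zenith_angle': sza})
>>> clean, fill_flag = mlf.clean_feature_data(raw, fill_flag)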
"""
t0 = time.time()
feature_data = feature_raw.copy()
day = feature_data['solar_zenith_angle'] < sza_lim
cloud_type = feature_data['cloud_type']
assert cloud_type.shape == fill_flag.shape
# fill flag 1 and 2 are the missing cloud type flags
day_missing_ctype = day & (
np.isin(fill_flag, (1, 2)) | (cloud_type < 0)
)
mask = cloud_type < 0
full_missing_ctype_mask = mask.all(axis=0)
if any(full_missing_ctype_mask) or mask.any():
msg = (
'There are {} missing cloud type observations '
'out of {} including {} with full missing ctype. This '
'needs to be resolved using the fill_ctype_press() '
'method before this step.'.format(
mask.sum(), mask.size, full_missing_ctype_mask.sum()
)
)
logger.error(msg)
raise RuntimeError(msg)
feature_data['cld_opd_dcomp'] = np.nan_to_num(
feature_data['cld_opd_dcomp'], nan=-1.0
)
feature_data['cld_reff_dcomp'] = np.nan_to_num(
feature_data['cld_reff_dcomp'], nan=-1.0
)
cloudy = np.isin(cloud_type, ICE_TYPES + WATER_TYPES)
day_clouds = day & cloudy
day_missing_opd = day_clouds & (feature_data['cld_opd_dcomp'] <= 0)
day_missing_reff = day_clouds & (feature_data['cld_reff_dcomp'] <= 0)
mask_fill_flag_opd = day_missing_opd & (fill_flag == 0)
mask_fill_flag_reff = day_missing_reff & (fill_flag == 0)
mask_all_bad_opd = (day_missing_opd | ~day).all(axis=0) & (
fill_flag < 2
).all(axis=0)
mask_all_bad_reff = (day_missing_reff | ~day).all(axis=0) & (
fill_flag < 2
).all(axis=0)
fill_flag[mask_fill_flag_opd] = 7 # fill_flag=7 is mlcloud fill
fill_flag[mask_fill_flag_reff] = 7 # fill_flag=7 is mlcloud fill
fill_flag[:, mask_all_bad_opd] = 4
fill_flag[:, mask_all_bad_reff] = 4
logger.info(
'{:.2f}% of timesteps are daylight'.format(
100 * day.sum() / (day.shape[0] * day.shape[1])
)
)
logger.info(
'{:.2f}% of daylight timesteps are cloudy'.format(
100 * day_clouds.sum() / day.sum()
)
)
logger.info(
'{:.2f}% of daylight timesteps are missing cloud type'.format(
100 * day_missing_ctype.sum() / day.sum()
)
)
logger.info(
'{:.2f}% of cloudy daylight timesteps are missing cloud '
'opd'.format(100 * day_missing_opd.sum() / day_clouds.sum())
)
logger.info(
'{:.2f}% of cloudy daylight timesteps are missing cloud '
'reff'.format(100 * day_missing_reff.sum() / day_clouds.sum())
)
mask = feature_data['cld_opd_dcomp'] <= 0
feature_data['cld_opd_dcomp'][mask] = np.nan
feature_data['cld_opd_dcomp'][~cloudy] = 0.0
mask = feature_data['cld_reff_dcomp'] <= 0
feature_data['cld_reff_dcomp'][mask] = np.nan
feature_data['cld_reff_dcomp'][~cloudy] = 0.0
logger.debug('Column NaN values:')
for c, d in feature_data.items():
pnan = 100 * np.isnan(d).sum() / (d.shape[0] * d.shape[1])
logger.debug('\t"{}" has {:.2f}% NaN values'.format(c, pnan))
logger.debug('Interpolating feature data using nearest neighbor.')
if max_workers == 1:
for c, d in feature_data.items():
if c not in self.phygnn_model.label_names:
feature_data[c] = self.clean_array(c, d)
else:
logger.info(
'Interpolating feature data with {} max workers'.format(
max_workers
)
)
futures = {}
loggers = ['nsrdb', 'rex', 'phygnn']
with SpawnProcessPool(
loggers=loggers, max_workers=max_workers
) as exe:
for c, d in feature_data.items():
if c not in self.phygnn_model.label_names:
future = exe.submit(self.clean_array, c, d)
futures[future] = c
for future in as_completed(futures):
c = futures[future]
feature_data[c] = future.result()
logger.debug('Feature data interpolation is complete.')
# label datasets (cld_opd/reff_dcomp) keep NaN where cloud properties
# are missing; those values are predicted by phygnn downstream.
assert not (feature_data['cloud_type'] < 0).any()
assert not any(np.isnan(d).any() for k, d in feature_data.items()
if k not in self.phygnn_model.label_names)
assert not (cloudy & (feature_data['cld_opd_dcomp'] <= 0)).any()
assert not (cloudy & (feature_data['cld_reff_dcomp'] <= 0)).any()
logger.debug('Adding feature flag')
cloud_type = feature_data['cloud_type']
day_ice_clouds = day & np.isin(cloud_type, ICE_TYPES)
day_water_clouds = day & np.isin(cloud_type, WATER_TYPES)
day_clouds_bad_ctype = (
day_ice_clouds | day_water_clouds
) & day_missing_ctype
flag = np.full(day_ice_clouds.shape, 'night', dtype=object)
flag[day] = 'clear'
flag[day_ice_clouds] = 'ice_cloud'
flag[day_water_clouds] = 'water_cloud'
flag[day_clouds_bad_ctype] = 'bad_cloud'
flag[day_missing_opd] = 'bad_cloud'
flag[day_missing_reff] = 'bad_cloud'
feature_data['flag'] = flag
logger.debug(
'Created the "flag" dataset with the following unique '
'values: {}'.format(np.unique(flag))
)
mem = psutil.virtual_memory()
logger.debug(
'Cleaned feature data dict has these keys: {}'.format(
feature_data.keys()
)
)
logger.debug(
'Cleaned feature data dict values have these shapes: {}'.format(
[d.shape for d in feature_data.values()]
)
)
logger.debug(
'Feature flag column has these values: {}'.format(
np.unique(feature_data['flag'])
)
)
logger.info('Cleaning took {:.1f} seconds'.format(time.time() - t0))
logger.info(
'Memory utilization is '
'{:.3f} GB out of {:.3f} GB total ({:.2f}% used).'.format(
mem.used / 1e9, mem.total / 1e9, 100 * mem.used / mem.total
)
)
return feature_data, fill_flag
def archive_cld_properties(self):
"""
Archive original cloud property (cld_*) .h5 files. This method creates
.tmp files in a ./raw/ sub directory. mark_complete_archived_files()
should be run at the end to remove the .tmp designation. This will
signify that the cloud fill was completed successfully.
"""
cld_dsets = [
'cld_opd_dcomp',
'cld_reff_dcomp',
'cld_press_acha',
'cloud_type',
]
for dset in [ds for ds in cld_dsets if ds in self.dset_map]:
src_fpath = self.dset_map[dset]
src_dir, f_name = os.path.split(src_fpath)
dst_dir = os.path.join(src_dir, 'raw')
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
dst_fpath = os.path.join(dst_dir, f_name)
dst_fpath_tmp = dst_fpath + '.tmp'
logger.debug('Archiving {} to {}'.format(src_fpath, dst_fpath_tmp))
if os.path.exists(dst_fpath):
msg = (
'A raw cloud file already exists, this suggests '
'MLClouds gap fill has already been run: {}'.format(
dst_fpath
)
)
logger.error(msg)
raise RuntimeError(msg)
if os.path.exists(dst_fpath_tmp):
# don't overwrite the tmp file, the original may have been
# manipulated by a failed mlclouds job.
logger.debug(
'Archive file exists, not overwriting: {}'.format(
dst_fpath_tmp
)
)
else:
shutil.copy(src_fpath, dst_fpath_tmp)
def mark_complete_archived_files(self):
"""Remove the .tmp marker from the archived files once MLCloudsFill
is complete"""
cld_dsets = [
'cld_opd_dcomp',
'cld_reff_dcomp',
'cld_press_acha',
'cloud_type',
]
for dset in cld_dsets:
fpath = self.dset_map[dset]
src_dir, f_name = os.path.split(fpath)
raw_path = os.path.join(src_dir, 'raw', f_name)
logger.debug('Renaming .tmp raw file to {}'.format(raw_path))
if os.path.exists(raw_path + '.tmp'):
os.rename(raw_path + '.tmp', raw_path)
else:
msg = (
'Something went wrong. The .tmp file created at the '
'beginning of MLCloudsFill no longer exists: {}'.format(
raw_path + '.tmp'
)
)
logger.error(msg)
raise FileNotFoundError(msg)
def predict_cld_properties(
self, feature_data, col_slice=None, low_mem=False
):
"""
Predict cloud properties with phygnn
Parameters
----------
feature_data : dict
Clean feature data without gaps. keys are the feature names
(nsrdb dataset names), values are 2D numpy arrays (time x sites).
col_slice : slice
Column slice of the resource data to work on. This is a result of
chunking the columns to reduce memory load. Just used for logging
in this method.
low_mem : bool
Option to run predictions in low memory mode. Typically the
memory bloat during prediction is:
(n_time x n_sites x n_nodes_per_layer). low_mem=True will
reduce this to (1000 x n_nodes_per_layer)
Returns
-------
predicted_data : dict
Dictionary of predicted cloud properties. Keys are nsrdb dataset
names, values are 2D arrays of phygnn-predicted values
(time x sites).
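Examples
--------
A minimal prediction sketch, assuming ``clean`` is the gap-free
feature dictionary returned by clean_feature_data():

>>> predicted = mlf.predict_cld_properties(clean, low_mem=True)
>>> sorted(predicted)  # model output names, e.g. the cld_*_dcomp dsets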
"""
L = feature_data['flag'].shape[0] * feature_data['flag'].shape[1]
cols = [
k
for k in feature_data
if k in self.phygnn_model.feature_names or k == 'flag'
]
feature_df = pd.DataFrame(index=np.arange(L), columns=cols)
for dset, arr in feature_data.items():
if dset in feature_df:
feature_df[dset] = arr.flatten(order='F')
# Predict on night timesteps as if they were clear
# the phygnn model won't be trained with the night category
# so this is an easy way to keep the full data shape and
# cooperate with the feature names that phygnn expects
night_mask = feature_df['flag'] == 'night'
feature_df.loc[night_mask, 'flag'] = 'clear'
logger.debug(
'Predicting gap filled cloud data for column slice {}'.format(
col_slice
)
)
predict_feats = set(feature_df.columns).intersection(
self.phygnn_model.input_feature_names
)
predict_feats = feature_df[list(predict_feats)]
if not low_mem:
labels = self.phygnn_model.predict(predict_feats, table=False)
else:
len_df = len(predict_feats)
chunks = np.array_split(
np.arange(len_df), int(np.ceil(len_df / 1000))
)
labels = []
for index_chunk in chunks:
sub = predict_feats.iloc[index_chunk]
labels.append(self.phygnn_model.predict(sub, table=False))
labels = np.concatenate(labels, axis=0)
mem = psutil.virtual_memory()
logger.debug(
'Prediction complete for column slice {}. Memory '
'utilization is {:.3f} GB out of {:.3f} GB total '
'({:.2f}% used).'.format(
col_slice,
mem.used / 1e9,
mem.total / 1e9,
100 * mem.used / mem.total,
)
)
logger.debug('Label data shape: {}'.format(labels.shape))
shape = feature_data['flag'].shape
predicted_data = {}
for i, dset in enumerate(self.phygnn_model.output_names):
logger.debug('Reshaping predicted {} to {}'.format(dset, shape))
predicted_data[dset] = labels[:, i].reshape(shape, order='F')
for dset, arr in predicted_data.items():
nnan = np.isnan(arr).sum()
ntot = arr.shape[0] * arr.shape[1]
logger.debug(
'Raw predicted data for {} for column slice {} '
'has mean: {:.2f}, median: {:.2f}, range: '
'({:.2f}, {:.2f}) and {} NaN values out of '
'{} ({:.2f}%)'.format(
dset,
col_slice,
np.nanmean(arr),
np.median(arr),
np.nanmin(arr),
np.nanmax(arr),
nnan,
ntot,
100 * nnan / ntot,
)
)
return predicted_data
def fill_bad_cld_properties(
self, predicted_data, feature_data, col_slice=None
):
"""
Fill bad cloud properties in the feature data from predicted data
Parameters
----------
predicted_data : dict
Dictionary of predicted cloud properties. Keys are nsrdb dataset
names, values are 2D arrays of phygnn-predicted values
(time x sites).
feature_data : dict
Clean feature data without gaps. keys are the feature names
(nsrdb dataset names), values are 2D numpy arrays (time x sites).
col_slice : slice
Column slice of the resource data to work on. This is a result of
chunking the columns to reduce memory load. Just used for logging
in this method.
Returns
-------
filled_data : dict
Dictionary of filled cloud properties. Keys are nsrdb dataset
names, values are 2D arrays of phygnn-predicted values
(time x sites). The filled data is a combination of the input
predicted_data and feature_data. The datasets in the predicted_data
input are used to fill the feature_data input where:
(feature_data['flag'] == "bad_cloud")
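Examples
--------
A minimal sketch, assuming ``predicted`` came from
predict_cld_properties() and ``clean`` from clean_feature_data():

>>> filled = mlf.fill_bad_cld_properties(predicted, clean)
>>> set(filled) == set(predicted)
True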
"""
fill_mask = feature_data['flag'] == 'bad_cloud'
night_mask = feature_data['flag'] == 'night'
clear_mask = feature_data['flag'] == 'clear'
cloud_mask = (feature_data['flag'] != 'night') & (
feature_data['flag'] != 'clear'
)
logger.debug(
'Final fill mask has {} bad clouds, {} night timesteps, '
'{} clear timesteps, {} timesteps that are either night '
'or clear, and {} cloudy timesteps out of {} '
'total observations.'.format(
fill_mask.sum(),
night_mask.sum(),
clear_mask.sum(),
(clear_mask | night_mask).sum(),
cloud_mask.sum(),
fill_mask.size,
)
)
if fill_mask.sum() == 0:
msg = (
'No "bad_cloud" flags were detected in the feature_data '
'"flag" dataset. Something went wrong! '
'The cloud data is never perfect...'
)
logger.warning(msg)
warn(msg)
if self._fill_all:
logger.debug(
'Filling {} values (all cloudy timesteps) using '
'MLClouds predictions for column slice {}'.format(
np.sum(cloud_mask), col_slice
)
)
else:
logger.debug(
'Filling {} values using MLClouds predictions for '
'column slice {}'.format(np.sum(fill_mask), col_slice)
)
filled_data = {}
for dset, arr in predicted_data.items():
varobj = VarFactory.get_base_handler(dset, var_meta=self._var_meta)
arr = np.maximum(arr, 0.01)
arr = np.minimum(arr, varobj.physical_max)
cld_data = feature_data[dset]
if self._fill_all:
cld_data[cloud_mask] = arr[cloud_mask]
else:
cld_data[fill_mask] = arr[fill_mask]
cld_data[night_mask | clear_mask] = 0
filled_data[dset] = cld_data
nnan = np.isnan(filled_data[dset]).sum()
ntot = filled_data[dset].shape[0] * filled_data[dset].shape[1]
logger.debug(
'Final cleaned data for {} for column slice {} '
'has mean: {:.2f}, median: {:.2f}, range: '
'({:.2f}, {:.2f}) and {} NaN values out of '
'{} ({:.2f}%)'.format(
dset,
col_slice,
np.nanmean(filled_data[dset]),
np.median(filled_data[dset]),
np.nanmin(filled_data[dset]),
np.nanmax(filled_data[dset]),
nnan,
ntot,
100 * nnan / ntot,
)
)
if nnan > 0:
msg = (
'Gap filled cloud property "{}" still had '
'{} NaN values!'.format(dset, nnan)
)
logger.error(msg)
raise RuntimeError(msg)
return filled_data
@staticmethod
def fill_ctype_press(h5_source, col_slice=slice(None)):
"""Fill cloud type and pressure using simple temporal nearest neighbor.
Parameters
----------
h5_source : str
Path to directory containing multi-file resource file sets.
Available formats:
/h5_dir/
/h5_dir/prefix*suffix
col_slice : slice
Column slice of the resource data to work on. This is a result of
chunking the columns to reduce memory load. Use slice(None) for
no chunking.
Returns
-------
cloud_type : np.ndarray
2D array (time x sites) of gap-filled cloud type data.
cloud_pres : np.ndarray
2D array (time x sites) of gap-filled cld_press_acha data.
sza : np.ndarray
2D array (time x sites) of solar zenith angle data.
fill_flag : np.ndarray
Integer array of flags showing what data was filled and why.
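Examples
--------
A minimal sketch with a hypothetical multi-file NSRDB directory:

>>> out = MLCloudsFill.fill_ctype_press('./nsrdb_2020_h5_dir/')
>>> cloud_type, cloud_pres, sza, fill_flag = out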
"""
fill_flag = None
with MultiFileNSRDB(h5_source) as f:
cloud_type = f['cloud_type', :, col_slice]
cloud_pres = f['cld_press_acha', :, col_slice]
sza = f['solar_zenith_angle', :, col_slice]
cloud_type, fill_flag = CloudGapFill.fill_cloud_type(
cloud_type, fill_flag=fill_flag
)
cloud_pres, fill_flag = CloudGapFill.fill_cloud_prop(
'cld_press_acha',
cloud_pres,
cloud_type,
sza,
fill_flag=fill_flag,
cloud_type_is_clean=True,
)
return cloud_type, cloud_pres, sza, fill_flag
def write_filled_data(self, filled_data, col_slice=slice(None)):
"""Write gap filled cloud data to disk
Parameters
----------
filled_data : dict
Dictionary of filled cloud properties. Keys are nsrdb dataset
names, values are 2D arrays of phygnn-predicted values
(time x sites). The filled data is a combination of the input
predicted_data and feature_data. The datasets in the predicted_data
input are used to fill the feature_data input where:
(feature_data['flag'] == "bad_cloud")
col_slice : slice
Column slice of the resource data to work on. This is a result of
chunking the columns to reduce memory load. Use slice(None) for
no chunking.
"""
for dset, arr in filled_data.items():
fpath = self.dset_map[dset]
with Outputs(fpath, mode='a') as f:
logger.info(
'Writing filled "{}" to: {}'.format(
dset, os.path.basename(fpath)
)
)
f[dset, :, col_slice] = arr
logger.debug('Finished writing "{}".'.format(dset))
def write_fill_flag(self, fill_flag, col_slice=slice(None)):
"""Write the fill flag dataset to its daily file next to the cloud
property files.
Parameters
----------
fill_flag : np.ndarray
Integer array of flags showing what data was filled and why.
col_slice : slice
Column slice of the resource data to work on. This is a result of
chunking the columns to reduce memory load. Use slice(None) for
no chunking.
"""
fpath_opd = self._dset_map['cld_opd_dcomp']
fpath = fpath_opd.replace('cld_opd_dcomp', 'cloud_fill_flag')
var_obj = VarFactory.get_base_handler(
'cloud_fill_flag', var_meta=self._var_meta
)
with Resource(fpath_opd) as res:
ti = res.time_index
meta = res.meta
logger.info(
'Writing cloud_fill_flag to: {}'.format(os.path.basename(fpath))
)
init_dset = False
if os.path.exists(fpath):
with Outputs(fpath, mode='r') as fout:
if 'cloud_fill_flag' not in fout:
init_dset = True
else:
init_dset = True
try:
if init_dset:
with Outputs(fpath, mode='w') as fout:
fout.time_index = ti
fout.meta = meta
init_data = np.zeros(
(len(ti), len(meta)), dtype=var_obj.final_dtype
)
fout._add_dset(
dset_name='cloud_fill_flag',
dtype=var_obj.final_dtype,
data=init_data,
chunks=var_obj.chunks,
attrs=var_obj.attrs,
)
with Outputs(fpath, mode='a') as fout:
fout['cloud_fill_flag', :, col_slice] = fill_flag
except Exception as e:
msg = 'Could not write col_slice {} to file: "{}"'.format(
col_slice, fpath
)
logger.exception(msg)
raise OSError(msg) from e
logger.debug('Write complete')
logger.info('Final cloud_fill_flag counts:')
ntot = fill_flag.shape[0] * fill_flag.shape[1]
for n in range(10):
count = (fill_flag == n).sum()
logger.info(
'\tFlag {} has {} counts out of {} ({:.2f}%)'.format(
n, count, ntot, 100 * count / ntot
)
)
@classmethod
def prep_chunk(
cls,
h5_source,
model_path=None,
var_meta=None,
sza_lim=90,
col_slice=slice(None),
):
"""Prepare a column chunk (slice) of data for phygnn prediction.
Parameters
----------
h5_source : str
Path to directory containing multi-file resource file sets.
Available formats:
/h5_dir/
/h5_dir/prefix*suffix
model_path : dict | None
kwargs for the ``MultiCloudsModel.load`` method. Specifies
``cloud_prop_model_path`` for the cloud property model and optionally
``cloud_type_model_path`` for a cloud type model. Each value is
typically a path to a .pkl file with an accompanying .json file in
the same directory. None will try to use the default model path
from the mlclouds project directory.
var_meta : str | pd.DataFrame | None
CSV file or dataframe containing meta data for all NSRDB variables.
Defaults to the NSRDB var meta csv in git repo.
sza_lim : int, optional
Solar zenith angle limit below which missing cloud property data
will be gap filled. By default 90 to fill all missing daylight data
col_slice : slice
Column slice of the resource data to work on. This is a result of
chunking the columns to reduce memory load. Use slice(None) for
no chunking.
Returns
-------
feature_data : dict
Clean feature data without gaps. keys are the feature names
(nsrdb dataset names), values are 2D numpy arrays (time x sites).
This is just for the col_slice being worked on.
clean_data : dict
Dictionary of filled cloud properties. Keys are nsrdb dataset
names, values are 2D arrays (time x sites) and have
nearest-neighbor cleaned values for cloud pressure and type
This is just for the col_slice being worked on.
fill_flag : np.ndarray
Integer array of flags showing what data was filled and why.
This is just for the col_slice being worked on.
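Examples
--------
A minimal sketch of preparing and processing a single column chunk
from a hypothetical multi-file NSRDB directory; this mirrors the
serial branch of the run() classmethod:

>>> mlf = MLCloudsFill('./nsrdb_2020_h5_dir/')
>>> clean_data, fill_flag = mlf.init_clean_arrays()
>>> chunk = slice(0, 1000)
>>> feats, i_clean, i_flag = mlf.prep_chunk(mlf.h5_source,
...                                         col_slice=chunk)
>>> clean_data, fill_flag = mlf.process_chunk(
...     feats, i_clean, i_flag, chunk, clean_data, fill_flag)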
"""
obj = cls(h5_source, model_path=model_path, var_meta=var_meta)
logger.debug(
'Preparing data for MLCloudsFill for column slice {}'.format(
col_slice
)
)
ctype, cpres, sza, fill_flag = obj.fill_ctype_press(
obj.h5_source, col_slice=col_slice
)
clean_data = {'cloud_type': ctype, 'cld_press_acha': cpres}
feature_data = {
'cloud_type': ctype,
'cld_press_acha': cpres,
'solar_zenith_angle': sza,
}
feature_data = obj.parse_feature_data(
feature_data=feature_data, col_slice=col_slice
)
feature_data, fill_flag = obj.clean_feature_data(
feature_data, fill_flag, sza_lim=sza_lim
)
logger.debug(
'Completed MLClouds data prep for column slice {}'.format(
col_slice
)
)
return feature_data, clean_data, fill_flag
def process_chunk(
self,
i_features,
i_clean,
i_flag,
col_slice,
clean_data,
fill_flag,
low_mem=False,
):
"""Use cleaned and prepared data to run phygnn predictions and create
final filled data for a single column chunk.
Parameters
----------
i_features : dict
Clean feature data without gaps. keys are the feature names
(nsrdb dataset names), values are 2D numpy arrays (time x sites).
This is just for a single column chunk (col_slice).
i_clean : dict
Dictionary of filled cloud properties. Keys are nsrdb dataset
names, values are 2D arrays (time x sites) of phygnn-predicted
values (cloud opd and reff) or nearest-neighbor cleaned values
(cloud pressure and type).
This is just for a single column chunk (col_slice).
i_flag : np.ndarray
Integer array of flags showing what data was filled and why.
This is just for a single column chunk (col_slice).
col_slice : slice
Column slice of the resource data to work on. This is a result of
chunking the columns to reduce memory load.
clean_data : dict
Dictionary of filled cloud properties. Keys are nsrdb dataset
names, values are 2D arrays (time x sites).
This is for ALL chunks (full resource shape).
fill_flag : np.ndarray
Integer array of flags showing what data was filled and why.
This is for ALL chunks (full resource shape).
low_mem : bool
Option to run predictions in low memory mode. Typically the
memory bloat during prediction is:
(n_time x n_sites x n_nodes_per_layer). low_mem=True will
reduce this to (1000 x n_nodes_per_layer)
Returns
-------
clean_data : dict
Dictionary of filled cloud properties. Keys are nsrdb dataset
names, values are 2D arrays (time x sites). This has been updated
with phygnn-predicted values (cloud opd and reff) or
nearest-neighbor cleaned values (cloud pressure and type)
This is for ALL chunks (full resource shape).
fill_flag : np.ndarray
Integer array of flags showing what data was filled and why.
This is for ALL chunks (full resource shape).
"""
i_predicted = self.predict_cld_properties(
i_features, col_slice=col_slice, low_mem=low_mem
)
i_filled = self.fill_bad_cld_properties(
i_predicted, i_features, col_slice=col_slice
)
i_clean.update(i_filled)
fill_flag[:, col_slice] = i_flag
for k, v in clean_data.items():
v[:, col_slice] = i_clean[k]
return clean_data, fill_flag
@classmethod
def clean_data_model(
cls,
data_model,
fill_all=False,
model_path=None,
var_meta=None,
sza_lim=90,
low_mem=False,
):
"""Run the MLCloudsFill process on data in-memory in an nsrdb
data model object.
Parameters
----------
data_model : nsrdb.data_model.DataModel
DataModel object with processed source data (cloud data + ancillary
processed onto the nsrdb grid).
fill_all : bool
Flag to fill all cloud properties for all timesteps where
cloud_type is cloudy.
model_path : dict | None
kwargs for the ``MultiCloudsModel.load`` method. Specifies
``cloud_prop_model_path`` for the cloud property model and optionally
``cloud_type_model_path`` for a cloud type model. Each value is
typically a path to a .pkl file with an accompanying .json file in
the same directory. None will try to use the default model path
from the mlclouds project directory.
var_meta : str | pd.DataFrame | None
CSV file or dataframe containing meta data for all NSRDB variables.
Defaults to the NSRDB var meta csv in git repo.
sza_lim : int, optional
Solar zenith angle limit below which missing cloud property data
will be gap filled. By default 90 to fill all missing daylight data
low_mem : bool
Option to run predictions in low memory mode. Typically the
memory bloat during prediction is:
(n_time x n_sites x n_nodes_per_layer). low_mem=True will
reduce this to (1000 x n_nodes_per_layer)
Returns
-------
data_model : nsrdb.data_model.DataModel
DataModel object with processed source data (cloud data + ancillary
processed onto the nsrdb grid). The cloud property datasets
(cloud_type, cld_opd_dcomp, cld_reff_dcomp, cloud_fill_flag) are
now cleaned.
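Examples
--------
A minimal sketch, assuming ``data_model`` is an in-memory
nsrdb.data_model.DataModel instance with cloud and ancillary data
already processed onto the nsrdb grid:

>>> data_model = MLCloudsFill.clean_data_model(data_model)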
"""
obj = cls(
None, fill_all=fill_all, model_path=model_path, var_meta=var_meta
)
logger.info('Preparing data for MLCloudsFill to clean data model')
ctype = data_model['cloud_type']
sza = data_model['solar_zenith_angle']
ctype, fill_flag = CloudGapFill.fill_cloud_type(ctype, fill_flag=None)
feature_data = {'cloud_type': ctype, 'solar_zenith_angle': sza}
logger.info('Loading feature data.')
dsets = obj._phygnn_model.feature_names + obj._phygnn_model.label_names
for dset in dsets:
if dset not in feature_data and dset in data_model:
feature_data[dset] = data_model[dset]
mem = psutil.virtual_memory()
logger.info(
'Feature data loaded for data model cleaning. '
'Memory utilization is {:.3f} GB out of {:.3f} GB total '
'({:.2f}% used).'.format(
mem.used / 1e9, mem.total / 1e9, 100 * mem.used / mem.total
)
)
feature_data, fill_flag = obj.clean_feature_data(
feature_data, fill_flag, sza_lim=sza_lim
)
logger.info('Completed MLClouds data prep')
predicted = obj.predict_cld_properties(feature_data, low_mem=low_mem)
filled = obj.fill_bad_cld_properties(predicted, feature_data)
feature_data['cloud_fill_flag'] = fill_flag
for k, v in feature_data.items():
logger.info(
'Sending cleaned feature dataset "{}" to data model '
'with shape {}'.format(k, v.shape)
)
data_model[k] = v
for k, v in filled.items():
logger.info(
'Sending cleaned cloud property dataset "{}" to '
'data model with shape {}'.format(k, v.shape)
)
data_model[k] = v
logger.info('Finished MLClouds fill of data model object.')
return data_model
@classmethod
def merra_clouds(
cls,
h5_source,
var_meta=None,
merra_fill_flag=8,
fill_all=False,
model_path=None,
):
"""Quick check to see if cloud data is from a merra source in which
case it should be gap-free and cloud_fill_flag will be written with all
8's
Parameters
----------
h5_source : str
Path to directory containing multi-file resource file sets.
Available formats:
/h5_dir/
/h5_dir/prefix*suffix
var_meta : str | pd.DataFrame | None
CSV file or dataframe containing meta data for all NSRDB variables.
Defaults to the NSRDB var meta csv in git repo.
merra_fill_flag : int
Integer fill flag representing where merra data was used as source
cloud data.
fill_all : bool
Flag to fill all cloud properties for all timesteps where
cloud_type is cloudy.
model_path : dict | None
kwargs for the ``MultiCloudsModel.load`` method. Specifies
``cloud_prop_model_path`` for the cloud property model and optionally
``cloud_type_model_path`` for a cloud type model. Each value is
typically a path to a .pkl file with an accompanying .json file in
the same directory. None will try to use the default model path
from the mlclouds project directory.
Returns
-------
is_merra : bool
Flag that is True if cloud data is from merra
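Examples
--------
A minimal sketch of short-circuiting the gap fill when the cloud
source is MERRA (hypothetical directory path):

>>> if not MLCloudsFill.merra_clouds('./nsrdb_2020_h5_dir/'):
...     MLCloudsFill.run('./nsrdb_2020_h5_dir/')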
"""
mlclouds = cls(
h5_source,
var_meta=var_meta,
model_path=model_path,
fill_all=fill_all,
)
with MultiFileNSRDB(h5_source) as res:
attrs = res.attrs.get('cld_opd_dcomp', {})
if 'merra' in attrs.get('data_source', '').lower():
logger.info(
'Found cloud data from MERRA2 for source: {}'.format(h5_source)
)
fill_flag_arr = mlclouds.init_clean_arrays()[1]
fill_flag_arr[:] = merra_fill_flag
mlclouds.write_fill_flag(fill_flag_arr)
return True
return False
@classmethod
def run(
cls,
h5_source,
fill_all=False,
model_path=None,
var_meta=None,
sza_lim=90,
col_chunk=None,
max_workers=None,
low_mem=False,
):
"""
Fill cloud properties using phygnn predictions. Original files will be
archived to a new "raw/" sub-directory
Parameters
----------
h5_source : str
Path to directory containing multi-file resource file sets.
Available formats:
/h5_dir/
/h5_dir/prefix*suffix
fill_all : bool
Flag to fill all cloud properties for all timesteps where
cloud_type is cloudy.
model_path : dict | None
kwargs for the ``MultiCloudsModel.load`` method. Specifies
``cloud_prop_model_path`` for the cloud property model and optionally
``cloud_type_model_path`` for a cloud type model. Each value is
typically a path to a .pkl file with an accompanying .json file in
the same directory. None will try to use the default model path
from the mlclouds project directory.
var_meta : str | pd.DataFrame | None
CSV file or dataframe containing meta data for all NSRDB variables.
Defaults to the NSRDB var meta csv in git repo.
sza_lim : int, optional
Solar zenith angle limit below which missing cloud property data
will be gap filled. By default 90 to fill all missing daylight data
col_chunk : None | int
Optional chunking method to gap fill one column chunk at a time
to reduce memory requirements. If provided, this should be an
integer specifying how many columns to work on at one time.
max_workers : None | int
Maximum workers for running mlclouds in parallel. 1 is serial and
None uses all available workers.
low_mem : bool
Option to run predictions in low memory mode. Typically the
memory bloat during prediction is:
(n_time x n_sites x n_nodes_per_layer). low_mem=True will
reduce this to (1000 x n_nodes_per_layer)
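Examples
--------
A minimal end-to-end sketch with a hypothetical multi-file NSRDB
directory, gap filling in 5000-site column chunks across all
available workers:

>>> MLCloudsFill.run('./nsrdb_2020_h5_dir/', col_chunk=5000,
...                  max_workers=None)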
"""
logger.info(
f'Running MLCloudsFill with h5_source: {h5_source}. Running '
f'MLCloudsFill with model: {model_path}. Running MLCloudsFill '
f'with col_chunk: {col_chunk}.'
)
obj = cls(
h5_source,
fill_all=fill_all,
model_path=model_path,
var_meta=var_meta,
)
obj.preflight()
obj.archive_cld_properties()
clean_data, fill_flag = obj.init_clean_arrays()
if col_chunk is None:
slices = [slice(None)]
logger.info(
'MLClouds gap fill is being run without col_chunk for '
f'full data shape {obj._res_shape} all on one process. If you '
'see memory errors, try setting the col_chunk input to '
'distribute the problem across multiple small workers.'
)
else:
columns = np.arange(obj._res_shape[1])
N = np.ceil(len(columns) / col_chunk)
arrays = np.array_split(columns, N)
slices = [slice(a[0], 1 + a[-1]) for a in arrays]
logger.info(
'MLClouds gap fill will be run across the full data '
f'column shape {len(columns)} in {len(slices)} column chunks '
f'with approximately {col_chunk} sites per chunk.'
)
if max_workers == 1:
for col_slice in slices:
out = obj.prep_chunk(
h5_source,
model_path=model_path,
var_meta=var_meta,
sza_lim=sza_lim,
col_slice=col_slice,
)
i_features, i_clean, i_flag = out
out = obj.process_chunk(
i_features,
i_clean,
i_flag,
col_slice,
clean_data,
fill_flag,
low_mem=low_mem,
)
clean_data, fill_flag = out
del i_features, i_clean, i_flag
else:
futures = {}
logger.info(
'Starting process pool for parallel phygnn cloud '
f'fill with {max_workers} workers.'
)
loggers = ['nsrdb', 'rex', 'phygnn']
with SpawnProcessPool(
loggers=loggers, max_workers=max_workers
) as exe:
for col_slice in slices:
future = exe.submit(
obj.prep_chunk,
h5_source,
model_path=model_path,
var_meta=var_meta,
sza_lim=sza_lim,
col_slice=col_slice,
)
futures[future] = col_slice
logger.info(f'Kicked off {len(futures)} futures.')
for i, future in enumerate(as_completed(futures)):
col_slice = futures[future]
i_features, i_clean, i_flag = future.result()
out = obj.process_chunk(
i_features,
i_clean,
i_flag,
col_slice,
clean_data,
fill_flag,
low_mem=low_mem,
)
clean_data, fill_flag = out
del i_features, i_clean, i_flag, future
mem = psutil.virtual_memory()
logger.info(
'MLCloudsFill futures completed: '
'{} out of {}. '
'Current memory usage is '
'{:.3f} GB out of {:.3f} GB total.'.format(
i + 1,
len(futures),
mem.used / 1e9,
mem.total / 1e9,
)
)
obj.write_filled_data(clean_data, col_slice=slice(None))
obj.write_fill_flag(fill_flag, col_slice=slice(None))
obj.mark_complete_archived_files()
logger.info(
'Completed MLCloudsFill for h5_source: {}'.format(h5_source)
)