Source code for revrt.spatial_characterization.cli

"""revrt zonal stats command line interface (CLI)"""

import logging
from pathlib import Path

import rioxarray
import pandas as pd
import geopandas as gpd
from dask.distributed import Client
from gaps.config import load_config
from gaps.cli import CLICommandFromFunction

from revrt.spatial_characterization.zonal import ZonalStats
from revrt.utilities import buffer_routes


# Module-level logger, named after this module's import path
logger = logging.getLogger(__name__)


def buffered_route_characterizations(
    geotiff_fp,
    route_fp,
    row_widths=None,
    row_width_ranges=None,
    multiplier_scalar=1.0,
    prefix=None,
    copy_properties=None,
    parallel=False,
    row_width_key="voltage",
    chunks="auto",
    **kwargs,
):
    """Characterize buffered routes against a raster layer

    The routes are re-projected to the raster CRS and buffered (via
    :func:`revrt.utilities.buffer_routes`) before zonal statistics are
    computed for each of them.

    Parameters
    ----------
    geotiff_fp : path-like
        Path to the raster file.
    route_fp : path-like
        Path to the vector file of routes. Must contain a "geometry"
        column and the `row_width_key` column (used to map to path
        ROW width).
    row_widths : dict, optional
        A dictionary specifying the row widths in the following
        format: ``{"row_width_id": row_width_meters}``. The
        ``row_width_id`` is a value used to match each route with a
        particular ROW width (this is typically a voltage). The value
        should be found under the ``row_width_key`` entry of the
        ``route_fp``.

        .. IMPORTANT::
           At least one of `row_widths` or `row_width_ranges` must be
           provided.

        By default, ``None``.
    row_width_ranges : list, optional
        Optional list of dictionaries, where each dictionary contains
        the keys "min", "max", and "width". This can be used to
        specify row widths based on ranges of values (e.g. voltage).
        For example, the following input::

            [
                {"min": 0, "max": 70, "width": 20},
                {"min": 70, "max": 150, "width": 30},
                {"min": 200, "max": 350, "width": 40},
                {"min": 400, "max": 500, "width": 50},
            ]

        would map voltages in the range ``0 <= volt < 70`` to a row
        width of 20 meters, ``70 <= volt < 150`` to a row width of 30
        meters, ``200 <= volt < 350`` to a row width of 40 meters,
        and so-on.

        .. IMPORTANT::
           Any values in the `row_widths` dict will take precedence
           over these ranges. So if a voltage of 138 kV is mapped to a
           row width of 25 meters in the `row_widths` dict, that value
           will be used instead of the 30 meter width specified by the
           ranges above.

        By default, ``None``.
    multiplier_scalar : float, optional
        Optional multiplier value to apply to layer before computing
        statistics. This is useful if you want to scale the values in
        the raster before computing statistics. By default, ``1.0``.
    prefix : str, optional
        A string representing a prefix to add to each stat name. If
        you wish to have the prefix separated by a delimiter, you must
        include it in this string (e.g. ``prefix="test_"``).
        By default, ``None``.
    copy_properties : iterable of str, optional
        Iterable of columns names to copy over from the zone feature.
        By default, ``None``.
    parallel : bool, optional
        Option to perform processing in parallel using dask.
        By default, ``False``.
    row_width_key : str, default="voltage"
        Name of column in vector file of routes used to map to the
        ROW widths. By default, ``"voltage"``.
    chunks : tuple or str, default="auto"
        ``chunks`` keyword argument to pass down to
        :func:`rioxarray.open_rasterio`. Use this to control the Dask
        chunk size. By default, ``"auto"``.
    **kwargs
        Extra keyword arguments used to initialize
        :class:`~revrt.spatial_characterization.zonal.ZonalStats`.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame containing computed characteristics/stats.
        Nested stat entries are flattened into columns whose name
        parts are joined with ``"_"``.
    """
    # Open the raster with the requested chunking, then scale it
    raster = rioxarray.open_rasterio(geotiff_fp, chunks=chunks)
    raster = raster * multiplier_scalar
    logger.debug("Tiff properties:\n%r", raster)
    logger.debug(
        "Tiff chunksizes:\n%r", raster.chunksizes
    )  # cspell:disable-line

    # Routes must share the raster CRS before they are buffered
    buffered = buffer_routes(
        gpd.read_file(route_fp).to_crs(raster.rio.crs),
        row_widths=row_widths,
        row_width_ranges=row_width_ranges,
        row_width_key=row_width_key,
    )

    logger.info("Initializing zonal stats with kwargs:\n%s", kwargs)
    calculator = ZonalStats(**kwargs)

    logger.info("Computing stats...")
    records = calculator.from_array(
        zones=buffered,
        raster_array=raster,
        affine_transform=raster.rio.transform(),
        prefix=prefix,
        copy_properties=copy_properties,
        parallel=parallel,
    )
    # Materialize the per-route results and flatten them into columns
    return pd.json_normalize(list(records), sep="_")
def _route_characterizations_from_config(
    out_dir,
    _stat_kwargs,
    _row_widths=None,
    _row_width_ranges=None,
    _default_route_fp=None,
    _default_copy_properties=None,
    _default_row_width_key=None,
    _default_chunks=None,
    max_workers=1,
    tag=None,
    memory_limit_per_worker="auto",
):
    """Compute route characterizations/statistics

    Parameters
    ----------
    max_workers : int, optional
        Number of parallel workers to use for computation. If ``None``
        or >1, processing is performed in parallel (using Dask). If
        your paths span large areas, keep this value low (~10) to
        avoid running into memory errors. By default, ``1``.
    memory_limit_per_worker : str, float, int, or None, default="auto"
        Sets the memory limit *per worker*. This only applies if
        ``max_workers != 1``. If ``None`` or ``0``, no limit is
        applied. If ``"auto"``, the total system memory is split
        evenly between the workers. If a float, that fraction of the
        system memory is used *per worker*. If a string giving a
        number of bytes (like "1GiB"), that amount is used *per
        worker*. If an int, that number of bytes is used *per worker*.
        By default, ``"auto"``.

    Returns
    -------
    str
        Path to the CSV file containing the computed statistics.
    """
    # NOTE(review): only the user-facing parameters are documented in
    # the docstring above. The underscore-prefixed arguments mirror the
    # keys that `_preprocess_stats_config` writes into the config;
    # presumably the CLI framework forwards them - confirm before
    # adding them to the docstring.
    tag = tag or ""

    # Work on a shallow copy so the caller's dictionary is not mutated
    # when the defaults are filled in below
    stat_kwargs = dict(_stat_kwargs)
    stat_kwargs.setdefault("route_fp", _default_route_fp)
    stat_kwargs.setdefault("copy_properties", _default_copy_properties)
    stat_kwargs.setdefault(
        "row_width_key", _default_row_width_key or "voltage"
    )
    stat_kwargs.setdefault("chunks", _default_chunks or "auto")

    # Output name: characterized[_<raster stem>][_<route stem>][tag].csv
    raster_name = stat_kwargs.get("geotiff_fp")
    raster_name = f"_{Path(raster_name).stem}" if raster_name else ""
    route_name = stat_kwargs.get("route_fp")
    route_name = f"_{Path(route_name).stem}" if route_name else ""
    out_fp = Path(out_dir) / f"characterized{raster_name}{route_name}{tag}.csv"

    logger.debug(
        "Running with max_workers=%r and memory_limit_per_worker=%r",
        max_workers,
        memory_limit_per_worker,
    )

    # Anything other than exactly one worker (including ``None``)
    # triggers parallel processing
    parallel = max_workers != 1
    client = None
    try:
        if parallel:
            client = Client(
                n_workers=max_workers, memory_limit=memory_limit_per_worker
            )
            logger.info(
                "Dask client created with %s workers and %s memory limit per "
                "worker",
                max_workers,
                memory_limit_per_worker,
            )
            logger.info("Dashboard link: %s", client.dashboard_link)

        out_data = buffered_route_characterizations(
            row_widths=_row_widths,
            row_width_ranges=_row_width_ranges,
            parallel=parallel,
            **stat_kwargs,
        )
    finally:
        # Always release the Dask client created above, even if the
        # computation fails; otherwise it is never closed
        if client is not None:
            client.close()

    out_data.to_csv(out_fp, index=False)
    return str(out_fp)


def _preprocess_stats_config(
    config,
    layers,
    default_route_fp=None,
    default_copy_properties=None,
    default_row_width_key=None,
    default_chunks=None,
    row_widths=None,
    row_width_ranges=None,
):
    """Preprocess user config

    Parameters
    ----------
    config : dict
        User configuration parsed as (nested) dict.
    layers : dict or list of dict
        A single dictionary or a list of dictionaries specifying the
        statistics to compute. Each dictionary should contain the
        following keys:

            - geotiff_fp: (REQUIRED) Path to the raster file.
            - route_fp: (REQUIRED) Path to the vector file of routes.
              Must contain a "geometry" column and the `row_width_key`
              column (used to map to path ROW width).
            - stats: (OPTIONAL) Names of all statistics to compute.
              Statistics must be one of the members of
              :class:`~revrt.spatial_characterization.stats.Stat` or
              :class:`~revrt.spatial_characterization.stats.FractionalStat`,
              or must start with the ``percentile_`` prefix and end
              with an int or float representing the percentile to
              compute (e.g. ``percentile_10.5``). If only one
              statistic is to be computed, you can provide it directly
              as a string. Otherwise, provide a list of statistic
              names or a string with the names separated by a space.
              You can also provide the string ``"ALL"`` or ``"*"`` to
              specify that all statistics should be computed (i.e. all
              options from *both*
              :class:`~revrt.spatial_characterization.stats.Stat` and
              :class:`~revrt.spatial_characterization.stats.FractionalStat`).
              If no input, empty input, or ``None`` is provided, then
              only the base stats ("count", "min", "max", "mean") are
              configured. To summarize, all of the following are valid
              inputs:

                - ``stats: "*"`` or ``stats="ALL"`` or ``stats="All"``
                - ``stats: "min"``
                - ``stats: "min max"``
                - ``stats: ["min"]``
                - ``stats: ["min", "max", "percentile_10.5"]``

            - nodata : (OPTIONAL) Value in the raster that represents
              `nodata`. This value will not show up in any statistics
              except for the `nodata` statistic itself, which computes
              the number of `nodata` values within the buffered
              routes. Note that this value is used **in addition to**
              any `NODATA` value in the raster's metadata.
            - all_touched : (OPTIONAL) Boolean flag indicating whether
              to include every raster cell touched by a geometry
              (``True``), or only those having a center point within
              the polygon (``False``). By default, ``True``.
            - category_map : (OPTIONAL) Dictionary mapping raster
              values to new names. If given, this mapping will be
              applied to the pixel count dictionary, so you can use it
              to map raster values to human-readable category names.
            - multiplier_scalar: (OPTIONAL) Optional multiplier value
              to apply to layer before computing statistics. This is
              useful if you want to scale the values in the raster
              before computing statistics. By default, ``1.0``.
            - prefix: (OPTIONAL) A string representing a prefix to add
              to each stat name. If you wish to have the prefix
              separated by a delimiter, you must include it in this
              string (e.g. ``prefix="test_"``).
            - copy_properties: (OPTIONAL) List of columns names to
              copy over from the vector file of routes.
            - row_width_key: (OPTIONAL) Name of column in vector file
              of routes used to map to the ROW widths.
              By default, ``"voltage"``.
            - chunks : (OPTIONAL) ``chunks`` keyword argument to pass
              down to :func:`rioxarray.open_rasterio`. Use this to
              control the Dask chunk size. By default, ``"auto"``.

    default_route_fp : path-like, optional
        Default path to the vector file of routes. This will be used
        *only if* no `route_fp` is provided in a layer's stats
        dictionary. Must contain a "geometry" column and the
        `row_width_key` column (used to map to path ROW width).
        By default, ``None``.
    default_copy_properties : iterable of str, optional
        Default iterable of columns names to copy over from the zone
        feature. This will be used *only if* no `copy_properties` is
        provided in a layer's stats dictionary. By default, ``None``.
    default_row_width_key : str, optional
        Default name of column in vector file of routes used to map to
        the ROW widths. This will be used *only if* no `row_width_key`
        is provided in a layer's stats dictionary.
        By default, ``None``.
    default_chunks : tuple or str, optional
        Default ``chunks`` keyword argument to pass down to
        :func:`rioxarray.open_rasterio`. This will be used *only if*
        no `chunks` is provided in a layer's stats dictionary. Use
        this to control the Dask chunk size. By default, ``None``,
        which uses ``"auto"`` as the final chunk input.
    row_widths : dict or path-like, optional
        A dictionary specifying the row widths in the following
        format: ``{"row_width_id": row_width_meters}``. The
        ``row_width_id`` is a value used to match each route with a
        particular ROW width (this is typically a voltage). The value
        should be found under the ``row_width_key`` entry of the
        ``route_fp``.

        .. IMPORTANT::
           At least one of `row_widths` or `row_width_ranges` must be
           provided.

        .. WARNING::
           Routes without a valid voltage in the `row_widths` or
           `row_width_ranges` input will not be characterized.

        If a path is provided, it should point to a JSON file
        containing the row width dictionary as specified above.
        By default, ``None``.
    row_width_ranges : list or path-like, optional
        Optional list of dictionaries, where each dictionary contains
        the keys "min", "max", and "width". This can be used to
        specify row widths based on ranges of values (e.g. voltage).
        For example, the following input::

            [
                {"min": 0, "max": 70, "width": 20},
                {"min": 70, "max": 150, "width": 30},
                {"min": 200, "max": 350, "width": 40},
                {"min": 400, "max": 500, "width": 50},
            ]

        would map voltages in the range ``0 <= volt < 70`` to a row
        width of 20 meters, ``70 <= volt < 150`` to a row width of 30
        meters, ``200 <= volt < 350`` to a row width of 40 meters,
        and so-on.

        .. IMPORTANT::
           Any values in the `row_widths` dict will take precedence
           over these ranges. So if a voltage of 138 kV is mapped to a
           row width of 25 meters in the `row_widths` dict, that value
           will be used instead of the 30 meter width specified by the
           ranges above.

        If a path is provided, it should point to a JSON file
        containing the list of dictionaries. By default, ``None``.

    Returns
    -------
    dict
        The input `config`, updated in place with the
        underscore-prefixed keys set by this function.
    """
    for key, user_input in (
        ("_row_widths", row_widths),
        ("_row_width_ranges", row_width_ranges),
    ):
        # File paths (plain strings or ``pathlib.Path`` objects) are
        # replaced by the content of the file they point to; any other
        # input is stored as given
        if isinstance(user_input, (str, Path)):
            config[key] = load_config(user_input)
        else:
            config[key] = user_input

    # A single layer dictionary is treated as a one-item list
    if isinstance(layers, dict):
        layers = [layers]

    config["_stat_kwargs"] = layers
    config["_default_route_fp"] = default_route_fp
    config["_default_copy_properties"] = default_copy_properties
    config["_default_row_width_key"] = default_row_width_key
    config["_default_chunks"] = default_chunks
    return config


# CLI command definition; "_stat_kwargs" (the list of layer dictionaries
# set by the preprocessor) is registered as the key to split on
route_characterizations_command = CLICommandFromFunction(
    _route_characterizations_from_config,
    name="route-characterization",
    add_collect=False,
    split_keys=["_stat_kwargs"],
    config_preprocessor=_preprocess_stats_config,
)