# Source code for revrt.spatial_characterization.cli

"""revrt zonal stats command line interface (CLI)"""

import logging
from pathlib import Path

import rioxarray
import pandas as pd
import geopandas as gpd
from dask.distributed import Client
from gaps.config import load_config
from gaps.cli import CLICommandFromFunction

from revrt.spatial_characterization.zonal import ZonalStats
from revrt.utilities import buffer_routes


logger = logging.getLogger(__name__)



# [docs]
def buffered_route_characterizations(
    geotiff_fp,
    route_fp,
    row_widths=None,
    row_width_ranges=None,
    multiplier_scalar=1.0,
    prefix=None,
    copy_properties=None,
    parallel=False,
    row_width_key="voltage",
    chunks="auto",
    **kwargs,
):
    """Characterize buffered routes against a raster layer

    The raster is opened and scaled by `multiplier_scalar`, the routes
    are read and re-projected to the raster CRS, and every route is
    buffered before zonal statistics are computed over it.

    Parameters
    ----------
    geotiff_fp : path-like
        Path to the raster file.
    route_fp : path-like
        Path to the vector file of routes. Must contain a "geometry"
        column as well as the `row_width_key` column (the latter is
        used to look up the ROW width of each path).
    row_widths : dict, optional
        Mapping of the form ``{"row_width_id": row_width_meters}``.
        The ``row_width_id`` is the value used to pair a route with a
        ROW width (typically a voltage) and should be found under the
        ``row_width_key`` entry of the ``route_fp``.

        .. IMPORTANT::
            At least one of `row_widths` or `row_width_ranges` must be
            provided.

        By default, ``None``.
    row_width_ranges : list, optional
        List of dictionaries, each with the keys "min", "max", and
        "width". Use this to assign row widths based on ranges of
        values (e.g. voltage). For example, the following input::

            [
                {"min": 0, "max": 70, "width": 20},
                {"min": 70, "max": 150, "width": 30},
                {"min": 200, "max": 350, "width": 40},
                {"min": 400, "max": 500, "width": 50},
            ]

        maps voltages in the range ``0 <= volt < 70`` to a row width
        of 20 meters, ``70 <= volt < 150`` to a row width of 30
        meters, ``200 <= volt < 350`` to a row width of 40 meters,
        and so-on.

        .. IMPORTANT::
            Entries of the `row_widths` dict take precedence over
            these ranges. So if a voltage of 138 kV is mapped to a
            row width of 25 meters in the `row_widths` dict, that
            value is used instead of the 30 meter width given by the
            ranges above.

        By default, ``None``.
    multiplier_scalar : float, optional
        Value that the raster layer is multiplied by before any
        statistics are computed. Use this to scale the raster values.
        By default, ``1.0``.
    prefix : str, optional
        Prefix to prepend to each stat name. Any delimiter between
        the prefix and the stat name must be part of this string
        (e.g. ``prefix="test_"``). By default, ``None``.
    copy_properties : iterable of str, optional
        Names of columns to copy over from the zone feature.
        By default, ``None``.
    parallel : bool, optional
        Option to perform processing in parallel using dask.
        By default, ``False``.
    row_width_key : str, default="voltage"
        Name of the column in the vector file of routes that is used
        to map to the ROW widths. By default, ``"voltage"``.
    chunks : tuple or str, default="auto"
        ``chunks`` keyword argument handed to
        :func:`rioxarray.open_rasterio`. Use this to control the Dask
        chunk size. By default, ``"auto"``.
    **kwargs
        Extra keyword arguments used to initialize
        :class:`~revrt.spatial_characterization.zonal.ZonalStats`.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame containing computed characteristics/stats.
    """
    # Open the raster with the requested chunking, then scale it
    raster = rioxarray.open_rasterio(geotiff_fp, chunks=chunks)
    raster = raster * multiplier_scalar
    logger.debug("Tiff properties:\n%r", raster)
    logger.debug("Tiff chunksizes:\n%r", raster.chunksizes)  # cspell:disable-line

    # Routes have to share the raster CRS before they are buffered
    raw_routes = gpd.read_file(route_fp)
    projected_routes = raw_routes.to_crs(raster.rio.crs)
    buffered_routes = buffer_routes(
        projected_routes,
        row_widths=row_widths,
        row_width_ranges=row_width_ranges,
        row_width_key=row_width_key,
    )

    logger.info("Initializing zonal stats with kwargs:\n%s", kwargs)
    zonal_stats = ZonalStats(**kwargs)

    logger.info("Computing stats...")
    route_stats = zonal_stats.from_array(
        zones=buffered_routes,
        raster_array=raster,
        affine_transform=raster.rio.transform(),
        prefix=prefix,
        copy_properties=copy_properties,
        parallel=parallel,
    )

    # Collect every result before flattening nested entries into
    # "_"-separated column names
    records = list(route_stats)
    return pd.json_normalize(records, sep="_")



def _route_characterizations_from_config(
    out_dir,
    _stat_kwargs,
    _row_widths=None,
    _row_width_ranges=None,
    _default_route_fp=None,
    _default_copy_properties=None,
    _default_row_width_key=None,
    _default_chunks=None,
    max_workers=1,
    tag=None,
    memory_limit_per_worker="auto",
):
    """Compute route characterizations/statistics

    Parameters
    ----------
    max_workers : int, optional
        Number of parallel workers to use for computation. If ``None``
        or >1, processing is performed in parallel (using Dask). If your
        paths span large areas, keep this value low (~10) to avoid
        running into memory errors. By default, ``1``.
    memory_limit_per_worker : str, float, int, or None, default="auto"
        Sets the memory limit *per worker*. This only applies if
        ``max_workers != 1``. If ``None`` or ``0``, no limit is applied.
        If ``"auto"``, the total system memory is split evenly between
        the workers. If a float, that fraction of the system memory is
        used *per worker*. If a string giving a number  of bytes (like
        "1GiB"), that amount is used *per worker*. If an int, that
        number of bytes is used *per worker*. By default, ``"auto"``

    Returns
    -------
    str
        Path to the CSV file that the computed statistics were
        written to.
    """
    # The remaining arguments are deliberately left out of the
    # "Parameters" section above:
    #   - `out_dir` and `tag` are only used to build the output path
    #     (presumably supplied by the CLI framework - confirm).
    #   - `_stat_kwargs` holds the keyword arguments for a single call
    #     to `buffered_route_characterizations`.
    #   - `_row_widths` / `_row_width_ranges` are forwarded as-is.
    #   - the `_default_*` values only fill in keys that are missing
    #     from `_stat_kwargs`.
    tag = tag or ""

    # Work on a shallow copy so that the caller's dictionary is not
    # modified when the defaults are filled in
    stat_kwargs = dict(_stat_kwargs)
    stat_kwargs.setdefault("route_fp", _default_route_fp)
    stat_kwargs.setdefault("copy_properties", _default_copy_properties)
    stat_kwargs.setdefault(
        "row_width_key", _default_row_width_key or "voltage"
    )
    stat_kwargs.setdefault("chunks", _default_chunks or "auto")

    # Output file name: characterized[_<raster>][_<routes>][<tag>].csv
    raster_name = stat_kwargs.get("geotiff_fp")
    raster_name = f"_{Path(raster_name).stem}" if raster_name else ""
    route_name = stat_kwargs.get("route_fp")
    route_name = f"_{Path(route_name).stem}" if route_name else ""
    out_fp = Path(out_dir) / f"characterized{raster_name}{route_name}{tag}.csv"

    logger.debug(
        "Running with max_workers=%r and memory_limit_per_worker=%r",
        max_workers,
        memory_limit_per_worker,
    )

    # A Dask client is only spun up for parallel runs; remember it so
    # that it can be shut down once the computation is over
    client = None
    if max_workers != 1:
        client = Client(
            n_workers=max_workers, memory_limit=memory_limit_per_worker
        )
        logger.info(
            "Dask client created with %s workers and %s memory limit per "
            "worker",
            max_workers,
            memory_limit_per_worker,
        )
        logger.info("Dashboard link: %s", client.dashboard_link)

    try:
        out_data = buffered_route_characterizations(
            row_widths=_row_widths,
            row_width_ranges=_row_width_ranges,
            parallel=client is not None,
            **stat_kwargs,
        )
    finally:
        # Release the workers whether or not the computation succeeded
        if client is not None:
            client.close()

    out_data.to_csv(out_fp, index=False)
    return str(out_fp)


def _preprocess_stats_config(
    config,
    layers,
    default_route_fp=None,
    default_copy_properties=None,
    default_row_width_key=None,
    default_chunks=None,
    row_widths=None,
    row_width_ranges=None,
):
    """Preprocess user config

    Parameters
    ----------
    config : dict
        User configuration parsed as (nested) dict.
    layers : dict or list of dict
        A single dictionary or a list of dictionaries specifying the
        statistics to compute. Each dictionary should contain the
        following keys:

            - geotiff_fp: (REQUIRED) Path to the raster file.
            - route_fp: (REQUIRED) Path to the vector file of routes.
              Must contain a "geometry" column and the `row_width_key`
              column (used to map to path ROW width).
            - stats: (OPTIONAL) Names of all statistics to compute.
              Statistics must be one of the members of
              :class:`~revrt.spatial_characterization.stats.Stat` or
              :class:`~revrt.spatial_characterization.stats.FractionalStat`,
              or must start with the ``percentile_`` prefix and end with
              an int or float representing the percentile to compute
              (e.g. ``percentile_10.5``). If only one statistic is to be
              computed, you can provide it directly as a string.
              Otherwise, provide a list of statistic names or a string
              with the names separated by a space. You can also provide
              the string ``"ALL"`` or ``"*"`` to specify that all
              statistics should be computed (i.e. all options from
              *both*
              :class:`~revrt.spatial_characterization.stats.Stat` and
              :class:`~revrt.spatial_characterization.stats.FractionalStat`).
              If no input, empty input, or ``None`` is provided, then
              only the base stats ("count", "min", "max", "mean") are
              configured. To summarize, all of the following are valid
              inputs:

                - ``stats: "*"`` or ``stats="ALL"`` or ``stats="All"``
                - ``stats: "min"``
                - ``stats: "min max"``
                - ``stats: ["min"]``
                - ``stats: ["min", "max", "percentile_10.5"]``

            - nodata : (OPTIONAL) Value in the raster that represents
              `nodata`. This value will not show up in any statistics
              except for the `nodata` statistic itself, which computes
              the number of `nodata` values within the buffered routes.
              Note that this value is used **in addition to** any
              `NODATA` value in the raster's metadata.
            - all_touched : (OPTIONAL) Boolean flag indicating whether
              to include every raster cell touched by a geometry
              (``True``), or only those having a center point within the
              polygon (``False``). By default, ``True``.
            - category_map : (OPTIONAL) Dictionary mapping raster values
              to new names. If given, this mapping will be applied to
              the pixel count dictionary, so you can use it to map
              raster values to human-readable category names.
            - multiplier_scalar: (OPTIONAL) Optional multiplier value to
              apply to layer before computing statistics. This is useful
              if you want to scale the values in the raster before
              computing statistics. By default, ``1.0``.
            - prefix: (OPTIONAL) A string representing a prefix to add
              to each stat name. If you wish to have the prefix
              separated by a delimiter, you must include it in this
              string (e.g. ``prefix="test_"``).
            - copy_properties: (OPTIONAL) List of columns names to copy
              over from the vector file of routes.
            - row_width_key: (OPTIONAL) Name of column in vector file of
              routes used to map to the ROW widths.
              By default, ``"voltage"``.
            - chunks : (OPTIONAL) ``chunks`` keyword argument to pass
              down to :func:`rioxarray.open_rasterio`. Use this to
              control the Dask chunk size. By default, ``"auto"``.

    default_route_fp : path-like, optional
        Default path to the vector file of routes. This will be used
        *only if* no `route_fp` is provided in a layer's stats
        dictionary. Must contain a "geometry" column and the
        `row_width_key` column (used to map to path ROW width).
        By default, ``None``.
    default_copy_properties : iterable of str, optional
        Default iterable of columns names to copy over from the zone
        feature. This will be used *only if* no `copy_properties` is
        provided in a layer's stats dictionary. By default, ``None``.
    default_row_width_key : str, optional
        Default name of column in vector file of routes used to map to
        the ROW widths. This will be used *only if* no `row_width_key`
        is provided in a layer's stats dictionary. By default, ``None``.
    default_chunks : tuple or str, optional
        Default ``chunks`` keyword argument to pass down to
        :func:`rioxarray.open_rasterio`. This will be used *only if* no
        `chunks` is provided in a layer's stats dictionary. Use this to
        control the Dask chunk size. By default, ``None``, which uses
        ``"auto"`` as the final chunk input.
    row_widths : dict or path-like, optional
        A dictionary specifying the row widths in the following format:
        ``{"row_width_id": row_width_meters}``. The ``row_width_id`` is
        a value used to match each route with a particular ROW width
        (this is typically a voltage). The value should be found under
        the ``row_width_key`` entry of the ``route_fp``.

        .. IMPORTANT::
            At least one of `row_widths` or `row_width_ranges` must be
            provided.

        .. WARNING::
           Routes without a valid voltage in the `row_widths` or
           `row_width_ranges` input will not be characterized.

        If a path is provided, it should point to a JSON file containing
        the row width dictionary as specified above.
        By default, ``None``.
    row_width_ranges : list, optional
        Optional list of dictionaries, where each dictionary contains
        the keys "min", "max", and "width". This can be used to specify
        row widths based on ranges of values (e.g. voltage). For
        example, the following input::

            [
                {"min": 0, "max": 70, "width": 20},
                {"min": 70, "max": 150, "width": 30},
                {"min": 200, "max": 350, "width": 40},
                {"min": 400, "max": 500, "width": 50},
            ]

        would map voltages in the range ``0 <= volt < 70`` to a row
        width of 20 meters, ``70 <= volt < 150`` to a row width of 30
        meters, ``200 <= volt < 350`` to a row width of 40 meters,
        and so-on.

        .. IMPORTANT::
            Any values in the `row_widths` dict will take precedence
            over these ranges. So if a voltage of 138 kV is mapped to a
            row width of 25 meters in the `row_widths` dict, that value
            will be used instead of the 30 meter width specified by the
            ranges above.

        If a path is provided, it should point to a JSON file containing
        the list of dictionaries. By default, ``None``.
    """
    for key, user_input in (
        ("_row_widths", row_widths),
        ("_row_width_ranges", row_width_ranges),
    ):
        if isinstance(user_input, str):
            user_input = load_config(user_input)  # noqa: PLW2901

        config[key] = user_input

    if isinstance(layers, dict):
        layers = [layers]

    config["_stat_kwargs"] = layers
    config["_default_route_fp"] = default_route_fp
    config["_default_copy_properties"] = default_copy_properties
    config["_default_row_width_key"] = default_row_width_key
    config["_default_chunks"] = default_chunks
    return config


# CLI command object wrapping the config-driven characterization
# function under the name "route-characterization".
route_characterizations_command = CLICommandFromFunction(
    _route_characterizations_from_config,
    name="route-characterization",
    # The preprocessor stores the user's layers under "_stat_kwargs" ...
    config_preprocessor=_preprocess_stats_config,
    # ... which is the key this command is split on (presumably one
    # job per layer - confirm against the gaps documentation)
    split_keys=["_stat_kwargs"],
    # No collect step is requested for this command
    add_collect=False,
)