import logging
import warnings
from typing import Any, Dict, List, Optional, Tuple
import inspect
import gymnasium as gym
import numpy as np
from ray.rllib.env.env_context import EnvContext
from ray.rllib.utils.spaces.repeated import Repeated
from graphenv.vertex import V
logger = logging.getLogger(__name__)
class GraphEnv(gym.Env):
    """Gymnasium environment for traversing a graph.

    The current vertex is the environment state, and its successor (child)
    vertices are the available actions.

    GraphEnv uses composition to supply the per-vertex model of type Vertex,
    which defines the graph via its `_get_children()` method.

    The `env_config` dictionary should contain the following keys::

        state (Vertex): Current vertex
        max_num_children (int): maximum number of children considered at a time.

    Args:
        env_config (dict): A dictionary of parameters, required to conform with
            rllib's environment initialization.
    """

    #: graphenv.vertex.Vertex: current vertex
    state: V

    #: int: maximum number of actions considered at a time
    max_num_children: int

    #: the observation space of the graph environment
    observation_space: gym.Space

    #: the action space, a Discrete space over `max_num_children`
    action_space: gym.Space

    # For environment rendering
    metadata: Dict[str, Any] = {"render_modes": ["human", None]}
    render_mode: Optional[str] = None

    def __init__(self, env_config: EnvContext) -> None:
        """Build the observation and action spaces from `env_config`.

        Args:
            env_config (EnvContext): Must provide the keys ``"state"`` (the
                starting vertex) and ``"max_num_children"``.
        """
        logger.debug("entering graphenv construction")
        self.state = env_config["state"]
        self.max_num_children = env_config["max_num_children"]

        # One observation slot for the current vertex plus one per child.
        num_vertex_observations = 1 + self.max_num_children
        self.observation_space = Repeated(
            self.state.observation_space, num_vertex_observations
        )
        self.action_space = gym.spaces.Discrete(self.max_num_children)
        logger.debug("leaving graphenv construction")

    # RLlib 2.3.1 does not yet support setting the 'seed' here. Using kwargs
    # quiets the warning:
    # "Seeding will take place using 'env.seed()' and the info dict will not be
    # returned from reset."
    # def reset(self, *, seed=None, options=None) -> Tuple[Dict[str, np.ndarray], Dict]:
    def reset(self, **kwargs) -> Tuple[Dict[str, np.ndarray], Dict]:
        """Reset this state to the root vertex.

        It is possible for state.root to return different root vertices on
        each call.

        Args:
            **kwargs: Accepted for compatibility with the Gymnasium ``reset``
                signature (e.g. ``seed``, ``options``); not used here.

        Returns:
            Tuple[Dict[str, np.ndarray], Dict]: Observation of the root vertex
            and an info dictionary.
        """
        self.state = self.state.root
        # NOTE(review): the second tuple element was lost from the extracted
        # source; `state.info` matches the annotated (observation, info)
        # return type -- confirm against graphenv.vertex.Vertex.
        return self.make_observation(), self.state.info

    def step(
        self, action: int
    ) -> Tuple[Dict[str, np.ndarray], float, bool, bool, dict]:
        """Step the environment to a new state by taking an action.

        In the case of GraphEnv, the action specifies which next vertex to
        move to and this method advances the environment to that vertex.

        Args:
            action (int): The index of the child vertex of self.state to move to.

        Raises:
            RuntimeError: When the current state has more than
                `max_num_children` children, or when action lies outside the
                action space.

        Returns:
            Tuple[Dict[str, np.ndarray], float, bool, bool, dict]: Tuple of:
                a dictionary of the new state's observation,
                the reward received by moving to the new state's vertex,
                a bool which is true iff the new state is a terminal vertex,
                a bool which is true if the search is truncated,
                a dictionary of debugging information related to this call
        """
        if len(self.state.children) > self.max_num_children:
            raise RuntimeError(
                f"State {self.state} has {len(self.state.children)} children "
                f"(> {self.max_num_children})"
            )

        if action not in self.action_space:
            raise RuntimeError(
                f"Action {action} outside the action space of state {self.state}: "
                f"{len(self.state.children)} max children"
            )

        try:
            # Move the state to the next action
            self.state = self.state.children[action]
        except IndexError:
            # The action is inside the action space but indexes a child that
            # does not exist (a masked action). Stay on the current vertex.
            # Skip the warning message if the call came from rllib's precheck
            # function, which probes the environment with arbitrary actions.
            called_from_precheck = any(
                frame.function == "check_gym_environments"
                for frame in inspect.stack()
            )
            if not called_from_precheck:
                # NOTE(review): the call wrapping this message and its second
                # argument were lost from the extracted source; RuntimeWarning
                # is assumed -- confirm.
                warnings.warn(
                    "Attempting to choose a masked child state. This is either due to "
                    "rllib's env pre_check module, or due to a failure of the policy model "
                    "to mask invalid actions. Returning the current state to satisfy the "
                    "pre_check module.",
                    RuntimeWarning,
                )

        # In RLlib 2.3, the config options "no_done_at_end", "horizon", and
        # "soft_horizon" are no longer supported according to the migration
        # guide. Instead, wrap your gymnasium environment with a TimeLimit
        # wrapper, which will set truncated according to the number of
        # timesteps (see the Gymnasium TimeLimit wrapper documentation).
        truncated = False

        # NOTE(review): the tuple elements were lost from the extracted source;
        # reward / terminal / info are assumed to be Vertex attributes, in the
        # order given by the return annotation -- confirm against
        # graphenv.vertex.Vertex.
        result = (
            self.make_observation(),
            self.state.reward,
            self.state.terminal,
            truncated,
            self.state.info,
        )
        # NOTE(review): the call wrapping this message was lost from the
        # extracted source; logger.debug is assumed -- confirm.
        logger.debug(
            "%s: %s %s, %s, %s",
            type(self),
            result[1],
            result[2],
            result[3],
            len(self.state.children),
        )
        return result

    def make_observation(self) -> List[Any]:
        """Make an observation for this state.

        The observation includes the current state and each possible action
        (child vertex). The current state is the 0th entry of the returned
        list, and the children are offset by one index to accommodate that.

        Returns:
            List[Any]: The current state's observation followed by the
            observation of each child vertex.
        """
        assert (
            len(self.state.children) <= self.max_num_children
        ), f"{self.state} exceeds the maximum number of children"
        return [state.observation for state in (self.state, *self.state.children)]

    def render(self, mode: str = "human") -> Any:
        """Delegates to Vertex.render()

        Args:
            mode (str): Render mode; only ``"human"`` triggers rendering.

        Returns:
            Any: Whatever the vertex's ``render()`` returns for ``"human"``
            mode, otherwise None.
        """
        if mode == "human":
            return self.state.render()