TSP Environment Rendering
Here we demonstrate some additional usage of graphenv for visualizing environment solutions.
[1]:
import subprocess
from copy import copy
from itertools import chain
from pathlib import Path
from tempfile import TemporaryDirectory
import numpy as np
from graphenv.examples.tsp.graph_utils import make_complete_planar_graph
from graphenv.examples.tsp.tsp_state import TSPState
from graphenv.graph_env import GraphEnv
%matplotlib inline
[2]:
N = 50
# Create the TSPState and GraphEnv objects
state = TSPState(lambda: make_complete_planar_graph(N, seed=1))
env = GraphEnv(
    {
        "state": state,
        "max_num_children": N,
    }
)
[3]:
%%capture
# Reset the environment and initialize the observation, reward, and done fields
obs, info = env.reset()
greedy_reward = 0
done = False
with TemporaryDirectory(dir=".") as tmpdir:

    while not done:

        # Select the action with the minimum distance from the parent node
        action = np.argmin([x["parent_dist"] for x in obs[1:]])

        # Get the observation for the next set of candidate nodes,
        # incremental reward, and done flags
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Append the step's reward to the running total
        greedy_reward += reward

        # Render the intermediate environment and save as an image
        fig, ax = env.render()
        ax.set_title(f"distance = {greedy_reward:.3f}")
        fig.savefig(Path(tmpdir, f"img{len(env.state.tour):01d}.jpg"), dpi=100)

    # Convert the list of frames to a video
    subprocess.run(
        "ffmpeg -i img%01d.jpg -vcodec libx264 -acodec libfaac -y ../movie.mp4",
        shell=True,
        check=True,
        cwd=tmpdir,
        capture_output=True,
    )
    subprocess.run(
        "ffmpeg -i img%01d.jpg -vcodec libvpx -acodec libvorbis -y ../movie.webm",
        shell=True,
        check=True,
        cwd=tmpdir,
        capture_output=True,
    )
Beam Search
GraphEnvs can be copied using Python's shallow copy() function, which makes it possible to parallelize search strategies. Here we demonstrate this functionality in a simple beam search, where the top K shortest partial tours are expanded on each iteration.
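Before the full beam search, here is a minimal sketch of the copying behavior (relying on the fact, noted in the expand() comment below, that each copy maintains its own tour): two copies of a freshly reset environment can be stepped along different branches without affecting one another.

obs, info = env.reset()
env_a, env_b = copy(env), copy(env)
env_a.step(0)  # advance one copy to its first candidate node
env_b.step(1)  # and the other copy to a different candidate
print(env_a.state.tour, env_b.state.tour)  # the partial tours now differ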
[5]:
def expand(env, previous_reward):
    """Given an environment (and its cumulative reward), return possible next actions
    and their associated rewards.

    Note, here we avoid calling env.step(), as the OpenAI Gym API would require
    enumerating the full search tree for the child nodes.
    """
    return (
        {
            "action_index": i,
            "reward": previous_reward + child.reward,
            "env": copy(env),  # Here, we copy the environment to maintain separate tours
        }
        for i, child in enumerate(env.state.children)
    )


def beam_search(env_list, k):
    """Expand the given (env, reward) pairs, and keep only the top K results for the
    next iteration.
    """
    top_actions = sorted(
        chain(*(expand(env, previous_reward) for env, previous_reward in env_list)),
        key=lambda x: x["reward"],
        reverse=True,
    )[:k]

    for entry in top_actions:
        obs, reward, terminated, truncated, info = entry["env"].step(entry["action_index"])
        done = terminated or truncated

    return [(entry["env"], entry["reward"]) for entry in top_actions], done
[6]:
obs, info = env.reset()
env_list = [(env, 0)]
done = False
# A beam search with K=1 should be equivalent to greedy search
while not done:
    env_list, done = beam_search(env_list, 1)

assert np.isclose(env_list[0][1], greedy_reward)
[7]:
obs, info = env.reset()
env_list = [(env, 0)]
done = False
# Here, we show a beam search with K=50 that beats the greedy baseline
while not done:
    env_list, done = beam_search(env_list, 50)
[8]:
env_list[:5]
[8]:
[(<graphenv.graph_env.GraphEnv at 0x15a0ab8b0>, -6.946512200757427),
(<graphenv.graph_env.GraphEnv at 0x15aba0280>, -7.01145372119443),
(<graphenv.graph_env.GraphEnv at 0x15aba0340>, -7.0149204046663485),
(<graphenv.graph_env.GraphEnv at 0x15aba5d30>, -7.050288524545745),
(<graphenv.graph_env.GraphEnv at 0x15aba5bb0>, -7.0804851031169225)]
[9]:
fig, ax = env_list[0][0].render()
_ = ax.set_title(f"reward = {env_list[0][1]:.3f}")
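If desired, this best-of-beam tour can be saved to disk in the same way as the greedy frames above (the filename here is arbitrary):

fig.savefig("beam_search_tour.png", dpi=100)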