
Update torchrl==0.3.0 tutorials #2759


Merged: 10 commits merged on Feb 5, 2024
Changes from 4 commits
41 changes: 14 additions & 27 deletions advanced_source/coding_ddpg.py
@@ -65,22 +65,24 @@

# sphinx_gallery_start_ignore
import warnings

warnings.filterwarnings("ignore")
import multiprocessing
from torch import multiprocessing

# TorchRL prefers the spawn start method, which restricts the creation of
# ``~torchrl.envs.ParallelEnv`` to the ``__main__`` guard. For ease of reading,
# this tutorial switches to fork, which is also the default start method in
# Google's Colaboratory.
try:
multiprocessing.set_start_method("fork")
except RuntimeError:
assert multiprocessing.get_start_method() == "fork"
pass

# sphinx_gallery_end_ignore
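
###############################################################################
# A minimal sketch of the spawn-safe alternative (assuming a Gym backend is
# installed; ``Pendulum-v1`` is purely illustrative): with the default ``spawn``
# start method, parallel environments should only be created under the
# ``__main__`` guard.
#
# .. code-block::
#
#    from torchrl.envs import ParallelEnv
#    from torchrl.envs.libs.gym import GymEnv
#
#    if __name__ == "__main__":
#        # worker processes re-import this module, so creation must be guarded
#        parallel_env = ParallelEnv(2, lambda: GymEnv("Pendulum-v1"))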


import torchrl
import torch
import tqdm
from typing import Tuple


###############################################################################
# We will execute the policy on CUDA if available
@@ -245,23 +247,23 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams):
value_key = "state_action_value"
if value_type == ValueEstimators.TD1:
self._value_estimator = TD1Estimator(
value_network=self.actor_critic, value_key=value_key, **hp
value_network=self.actor_critic, **hp
)
elif value_type == ValueEstimators.TD0:
self._value_estimator = TD0Estimator(
value_network=self.actor_critic, value_key=value_key, **hp
value_network=self.actor_critic, **hp
)
elif value_type == ValueEstimators.GAE:
raise NotImplementedError(
f"Value type {value_type} it not implemented for loss {type(self)}."
)
elif value_type == ValueEstimators.TDLambda:
self._value_estimator = TDLambdaEstimator(
value_network=self.actor_critic, value_key=value_key, **hp
value_network=self.actor_critic, **hp
)
else:
raise NotImplementedError(f"Unknown value type {value_type}")

self._value_estimator.set_keys(value=value_key)
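
###############################################################################
# A short usage sketch (assuming ``loss`` is an instance of the DDPG loss built
# in this tutorial): the estimator type and its hyperparameters are chosen at
# configuration time, and the value key is wired through ``set_keys`` as above.
#
# .. code-block::
#
#    loss.make_value_estimator(ValueEstimators.TDLambda, gamma=0.99, lmbda=0.95)
#    # or, for a one-step bootstrap:
#    # loss.make_value_estimator(ValueEstimators.TD0, gamma=0.99)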

###############################################################################
# The ``make_value_estimator`` method can but does not need to be called: if
@@ -311,7 +313,7 @@ def _loss_actor(
def _loss_value(
self,
tensordict,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
):
td_copy = tensordict.clone()

# V(s, a)
@@ -349,7 +351,7 @@ def _loss_value(
# value and actor loss, collect the cost values and write them in a ``TensorDict``
# delivered to the user.

from tensordict.tensordict import TensorDict, TensorDictBase
from tensordict import TensorDict, TensorDictBase
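
###############################################################################
# For illustration, a loss ``TensorDict`` built with the new top-level import
# could look like the following (a sketch; the actual keys and values are
# produced by the ``_forward`` method below):
#
# .. code-block::
#
#    losses = TensorDict(
#        {"loss_actor": torch.zeros(()), "loss_value": torch.zeros(())},
#        batch_size=[],
#    )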


def _forward(self, input_tensordict: TensorDictBase) -> TensorDict:
@@ -457,6 +459,7 @@ def make_env(from_pixels=False):
raise NotImplementedError

env_kwargs = {
"device": device,
"from_pixels": from_pixels,
"pixels_only": from_pixels,
"frame_skip": 2,
@@ -519,16 +522,6 @@ def make_transformed_env(
# syntax.
env.append_transform(RewardScaling(loc=0.0, scale=reward_scaling))

double_to_float_list = []
double_to_float_inv_list = []
if env_library is DMControlEnv:
# ``DMControl`` requires double-precision
double_to_float_list += [
"reward",
"action",
]
double_to_float_inv_list += ["action"]

# We concatenate all states into a single "observation_vector"
# even if there is a single tensor, it'll be renamed in "observation_vector".
# This facilitates the downstream operations as we know the name of the
@@ -544,11 +537,8 @@ def make_transformed_env(
# version of the transform
env.append_transform(ObservationNorm(in_keys=[out_key], standard_normal=True))

double_to_float_list.append(out_key)
env.append_transform(
DoubleToFloat(
in_keys=double_to_float_list, in_keys_inv=double_to_float_inv_list
)
DoubleToFloat()
)
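
# Note: a keyless ``DoubleToFloat()`` (torchrl 0.3) should cast every
# double-precision entry of the spec to ``float32``, which is why the explicit
# key lists are no longer needed here. A roughly equivalent explicit form
# (a sketch for the DMControl case handled above) would be:
#
#   env.append_transform(DoubleToFloat(in_keys=[out_key], in_keys_inv=["action"]))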

env.append_transform(StepCounter(max_frames_per_traj))
@@ -874,9 +864,6 @@ def make_ddpg_actor(
reset_at_each_iter=False,
split_trajs=False,
device=collector_device,
# device for execution
storing_device=collector_device,
# device where data will be stored and passed
exploration_type=ExplorationType.RANDOM,
)

32 changes: 25 additions & 7 deletions advanced_source/pendulum.py
@@ -10,7 +10,7 @@
is an integral part of reinforcement learning and control engineering.

TorchRL provides a set of tools to do this in multiple contexts.
This tutorial demonstrates how to use PyTorch and TorchRL to code a pendulum
This tutorial demonstrates how to use PyTorch and TorchRL to code a pendulum
simulator from the ground up.
It is freely inspired by the Pendulum-v1 implementation from `OpenAI-Gym/Farama-Gymnasium
control library <https://github.com/Farama-Foundation/Gymnasium>`__.
@@ -49,9 +49,9 @@
# cover a broader range of features of the environment API in TorchRL.
#
# Modeling stateless environments gives users full control over the input and
# outputs of the simulator: one can reset an experiment at any stage or actively
# modify the dynamics from the outside. However, it assumes that we have some control
# over a task, which may not always be the case: solving a problem where we cannot
# outputs of the simulator: one can reset an experiment at any stage or actively
# modify the dynamics from the outside. However, it assumes that we have some control
# over a task, which may not always be the case: solving a problem where we cannot
# control the current state is more challenging but has a much wider set of applications.
#
# Another advantage of stateless environments is that they can enable
@@ -73,14 +73,31 @@
# simulation graph.
# * Finally, we will train a simple policy to solve the system we implemented.
#

# sphinx_gallery_start_ignore
import warnings

warnings.filterwarnings("ignore")
from torch import multiprocessing

# TorchRL prefers the spawn start method, which restricts the creation of
# ``~torchrl.envs.ParallelEnv`` to the ``__main__`` guard. For ease of reading,
# this tutorial switches to fork, which is also the default start method in
# Google's Colaboratory.
try:
multiprocessing.set_start_method("fork")
except RuntimeError:
pass

# sphinx_gallery_end_ignore

from collections import defaultdict
from typing import Optional

import numpy as np
import torch
import tqdm
from tensordict import TensorDict, TensorDictBase
from tensordict.nn import TensorDictModule
from tensordict.tensordict import TensorDict, TensorDictBase
from torch import nn

from torchrl.data import BoundedTensorSpec, CompositeSpec, UnboundedContinuousTensorSpec
@@ -167,7 +184,7 @@
# of :meth:`~torchrl.envs.EnvBase.step` in the input ``tensordict`` to enforce
# input/output consistency.
#
# Typically, for stateful environments, this will look like this:
# Typically, for stateful environments, this will look like this:
#
# .. code-block::
#
@@ -221,6 +238,7 @@
# needed as the state needs to be read from the environment.
#


def _step(tensordict):
th, thdot = tensordict["th"], tensordict["thdot"] # th := theta

@@ -896,7 +914,7 @@ def plot():
######################################################################
# Conclusion
# ----------
#
#
# In this tutorial, we have learned how to code a stateless environment from
# scratch. We touched the subjects of:
#
4 changes: 2 additions & 2 deletions intermediate_source/mario_rl_tutorial.py
@@ -32,8 +32,8 @@
#
# %%bash
# pip install gym-super-mario-bros==7.4.0
# pip install tensordict==0.2.0
# pip install torchrl==0.2.0
# pip install tensordict==0.3.0
# pip install torchrl==0.3.0
#

import torch
61 changes: 31 additions & 30 deletions intermediate_source/reinforcement_ppo.py
@@ -104,6 +104,22 @@
# description and more about the algorithm itself.
#

# sphinx_gallery_start_ignore
import warnings

warnings.filterwarnings("ignore")
from torch import multiprocessing

# TorchRL prefers the spawn start method, which restricts the creation of
# ``~torchrl.envs.ParallelEnv`` to the ``__main__`` guard. For ease of reading,
# this tutorial switches to fork, which is also the default start method in
# Google's Colaboratory.
try:
multiprocessing.set_start_method("fork")
except RuntimeError:
pass

# sphinx_gallery_end_ignore

from collections import defaultdict

import matplotlib.pyplot as plt
@@ -118,7 +134,7 @@
from torchrl.envs import (Compose, DoubleToFloat, ObservationNorm, StepCounter,
TransformedEnv)
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import check_env_specs, set_exploration_mode
from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value import GAE
@@ -152,22 +168,10 @@
# use. In general, the goal of an RL algorithm is to learn to solve the task
# as fast as it can in terms of environment interactions: the lower the ``total_frames``
# the better.
# We also define a ``frame_skip``: in some contexts, repeating the same action
# multiple times over the course of a trajectory may be beneficial as it makes
# the behavior more consistent and less erratic. However, "skipping"
# too many frames will hamper training by reducing the reactivity of the actor
# to observation changes.
#
# When using ``frame_skip`` it is good practice to
# correct the other frame counts by the number of frames we are grouping
# together. If we configure a total count of X frames for training but
use a ``frame_skip`` of Y, we will actually be collecting ``X*Y`` frames in total,
# which exceeds our predefined budget.
#
frame_skip = 1
frames_per_batch = 1000 // frame_skip
#
frames_per_batch = 1000
# For a complete training, bring the number of frames up to 1M
total_frames = 50_000 // frame_skip
total_frames = 50_000

######################################################################
# PPO parameters
@@ -196,14 +200,14 @@
#
# In RL, an *environment* is usually the way we refer to a simulator or a
# control system. Various libraries provide simulation environments for reinforcement
# learning, including Gymnasium (previously OpenAI Gym), DeepMind Control Suite, and
# learning, including Gymnasium (previously OpenAI Gym), DeepMind control suite, and
# many others.
# As a general library, TorchRL's goal is to provide an interchangeable interface
# to a large panel of RL simulators, allowing you to easily swap one environment
# with another. For example, creating a wrapped gym environment can be achieved with few characters:
#

base_env = GymEnv("InvertedDoublePendulum-v4", device=device, frame_skip=frame_skip)
base_env = GymEnv("InvertedDoublePendulum-v4", device=device)

######################################################################
# There are a few things to notice in this code: first, we created
@@ -262,7 +266,7 @@
Compose(
# normalize observations
ObservationNorm(in_keys=["observation"]),
DoubleToFloat(in_keys=["observation"]),
DoubleToFloat(),
StepCounter(),
),
)
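
###############################################################################
# A practical reminder (a sketch, assuming the rest of the tutorial is
# unchanged): an ``ObservationNorm`` appended without ``loc``/``scale`` must
# have its statistics initialized from data before the environment is used,
# for example:
#
# .. code-block::
#
#    env.transform[0].init_stats(num_iter=1000, reduce_dim=0, cat_dim=0)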
@@ -410,8 +414,8 @@
in_keys=["loc", "scale"],
distribution_class=TanhNormal,
distribution_kwargs={
"min": env.action_spec.space.minimum,
"max": env.action_spec.space.maximum,
"min": env.action_spec.space.low,
"max": env.action_spec.space.high,
},
return_log_prob=True,
# we'll need the log-prob for the numerator of the importance weights
@@ -514,7 +518,7 @@
#

replay_buffer = ReplayBuffer(
storage=LazyTensorStorage(frames_per_batch),
storage=LazyTensorStorage(max_size=frames_per_batch),
sampler=SamplerWithoutReplacement(),
)
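
###############################################################################
# A short usage sketch (assuming ``tensordict_data`` is a batch coming from the
# collector, as in the training loop below): the batch is flattened, pushed
# into the buffer, then sampled without replacement.
#
# .. code-block::
#
#    replay_buffer.extend(tensordict_data.reshape(-1).cpu())
#    subdata = replay_buffer.sample(64)  # 64 is an illustrative sub-batch size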

@@ -546,16 +550,13 @@
)

loss_module = ClipPPOLoss(
actor=policy_module,
critic=value_module,
advantage_key="advantage",
actor_network=policy_module,
critic_network=value_module,
clip_epsilon=clip_epsilon,
entropy_bonus=bool(entropy_eps),
entropy_coef=entropy_eps,
# these keys match by default but we set this for completeness
value_target_key=advantage_module.value_target_key,
critic_coef=1.0,
gamma=0.99,
loss_critic_type="smooth_l1",
)
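
###############################################################################
# Note that ``gamma`` is no longer passed to the loss: it is configured on the
# value estimator (the ``GAE`` module above). If non-default tensordict keys
# were needed, they could be wired after construction along these lines (a
# sketch; the defaults already match GAE's outputs):
#
# .. code-block::
#
#    loss_module.set_keys(
#        advantage="advantage",
#        value_target=advantage_module.value_target_key,
#    )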

@@ -586,7 +587,7 @@


logs = defaultdict(list)
pbar = tqdm(total=total_frames * frame_skip)
pbar = tqdm(total=total_frames)
eval_str = ""

# We iterate over the collector until it reaches the total number of frames it was
Expand Down Expand Up @@ -618,7 +619,7 @@
optim.zero_grad()

logs["reward"].append(tensordict_data["next", "reward"].mean().item())
pbar.update(tensordict_data.numel() * frame_skip)
pbar.update(tensordict_data.numel())
cum_reward_str = (
f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
)
@@ -633,7 +634,7 @@
# number of steps (1000, which is our ``env`` horizon).
# The ``rollout`` method of the ``env`` can take a policy as argument:
# it will then execute this policy at each step.
with set_exploration_mode("mean"), torch.no_grad():
with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
# execute a rollout with the trained policy
eval_rollout = env.rollout(1000, policy_module)
logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
4 changes: 2 additions & 2 deletions requirements.txt
@@ -25,8 +25,8 @@ tensorboard
jinja2==3.1.3
pytorch-lightning
torchx
torchrl==0.2.1
tensordict==0.2.1
torchrl==0.3.0
tensordict==0.3.0
ax-platform
nbformat>=4.2.0
datasets