
Update torchrl==0.3.0 tutorials #2759


Merged: 10 commits merged on Feb 5, 2024
Changes from 4 commits
41 changes: 14 additions & 27 deletions advanced_source/coding_ddpg.py
@@ -65,22 +65,24 @@

# sphinx_gallery_start_ignore
import warnings

warnings.filterwarnings("ignore")
import multiprocessing
from torch import multiprocessing

# TorchRL prefers the spawn start method, which restricts the creation of
# ``~torchrl.envs.ParallelEnv`` to the ``__main__`` guard. For ease of reading,
# this tutorial switches to fork, which is also the default start method in
# Google's Colaboratory.
try:
multiprocessing.set_start_method("fork")
except RuntimeError:
assert multiprocessing.get_start_method() == "fork"
pass

# sphinx_gallery_end_ignore
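
###############################################################################
# A minimal sketch of the spawn-safe alternative (assuming a Gym backend is
# installed; ``Pendulum-v1`` is purely illustrative): with the default ``spawn``
# start method, parallel environments should only be created under the
# ``__main__`` guard.
#
# .. code-block::
#
#    from torchrl.envs import ParallelEnv
#    from torchrl.envs.libs.gym import GymEnv
#
#    if __name__ == "__main__":
#        # worker processes re-import this module, so creation must be guarded
#        parallel_env = ParallelEnv(2, lambda: GymEnv("Pendulum-v1"))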


import torchrl
import torch
import tqdm
from typing import Tuple


###############################################################################
# We will execute the policy on CUDA if available
@@ -245,23 +247,23 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams):
value_key = "state_action_value"
if value_type == ValueEstimators.TD1:
self._value_estimator = TD1Estimator(
value_network=self.actor_critic, value_key=value_key, **hp
value_network=self.actor_critic, **hp
)
elif value_type == ValueEstimators.TD0:
self._value_estimator = TD0Estimator(
value_network=self.actor_critic, value_key=value_key, **hp
value_network=self.actor_critic, **hp
)
elif value_type == ValueEstimators.GAE:
raise NotImplementedError(
f"Value type {value_type} it not implemented for loss {type(self)}."
)
elif value_type == ValueEstimators.TDLambda:
self._value_estimator = TDLambdaEstimator(
value_network=self.actor_critic, value_key=value_key, **hp
value_network=self.actor_critic, **hp
)
else:
raise NotImplementedError(f"Unknown value type {value_type}")

self._value_estimator.set_keys(value=value_key)
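
###############################################################################
# A short usage sketch (assuming ``loss`` is an instance of the DDPG loss built
# in this tutorial): the estimator type and its hyperparameters are chosen at
# configuration time, and the value key is wired through ``set_keys`` as above.
#
# .. code-block::
#
#    loss.make_value_estimator(ValueEstimators.TDLambda, gamma=0.99, lmbda=0.95)
#    # or, for a one-step bootstrap:
#    # loss.make_value_estimator(ValueEstimators.TD0, gamma=0.99)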

###############################################################################
# The ``make_value_estimator`` method can but does not need to be called: if
@@ -311,7 +313,7 @@ def _loss_actor(
def _loss_value(
self,
tensordict,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
):
td_copy = tensordict.clone()

# V(s, a)
@@ -349,7 +351,7 @@ def _loss_value(
# value and actor loss, collect the cost values and write them in a ``TensorDict``
# delivered to the user.

from tensordict.tensordict import TensorDict, TensorDictBase
from tensordict import TensorDict, TensorDictBase
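
###############################################################################
# For illustration, a loss ``TensorDict`` built with the new top-level import
# could look like the following (a sketch; the actual keys and values are
# produced by the ``_forward`` method below):
#
# .. code-block::
#
#    losses = TensorDict(
#        {"loss_actor": torch.zeros(()), "loss_value": torch.zeros(())},
#        batch_size=[],
#    )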


def _forward(self, input_tensordict: TensorDictBase) -> TensorDict:
@@ -457,6 +459,7 @@ def make_env(from_pixels=False):
raise NotImplementedError

env_kwargs = {
"device": device,
"from_pixels": from_pixels,
"pixels_only": from_pixels,
"frame_skip": 2,
@@ -519,16 +522,6 @@ def make_transformed_env(
# syntax.
env.append_transform(RewardScaling(loc=0.0, scale=reward_scaling))

double_to_float_list = []
double_to_float_inv_list = []
if env_library is DMControlEnv:
# ``DMControl`` requires double-precision
double_to_float_list += [
"reward",
"action",
]
double_to_float_inv_list += ["action"]

# We concatenate all states into a single "observation_vector"
# even if there is a single tensor, it'll be renamed in "observation_vector".
# This facilitates the downstream operations as we know the name of the
@@ -544,11 +537,8 @@ def make_transformed_env(
# version of the transform
env.append_transform(ObservationNorm(in_keys=[out_key], standard_normal=True))

double_to_float_list.append(out_key)
env.append_transform(
DoubleToFloat(
in_keys=double_to_float_list, in_keys_inv=double_to_float_inv_list
)
DoubleToFloat()
)
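
# Note: a keyless ``DoubleToFloat()`` (torchrl 0.3) should cast every
# double-precision entry of the spec to ``float32``, which is why the explicit
# key lists are no longer needed here. A roughly equivalent explicit form
# (a sketch for the DMControl case handled above) would be:
#
#   env.append_transform(DoubleToFloat(in_keys=[out_key], in_keys_inv=["action"]))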

env.append_transform(StepCounter(max_frames_per_traj))
@@ -874,9 +864,6 @@ def make_ddpg_actor(
reset_at_each_iter=False,
split_trajs=False,
device=collector_device,
# device for execution
storing_device=collector_device,
# device where data will be stored and passed
exploration_type=ExplorationType.RANDOM,
)

32 changes: 25 additions & 7 deletions advanced_source/pendulum.py
@@ -10,7 +10,7 @@
is an integral part of reinforcement learning and control engineering.

TorchRL provides a set of tools to do this in multiple contexts.
This tutorial demonstrates how to use PyTorch and TorchRL to code a pendulum
This tutorial demonstrates how to use PyTorch and TorchRL to code a pendulum
simulator from the ground up.
It is freely inspired by the Pendulum-v1 implementation from `OpenAI-Gym/Farama-Gymnasium
control library <https://github.com/Farama-Foundation/Gymnasium>`__.
@@ -49,9 +49,9 @@
# cover a broader range of features of the environment API in TorchRL.
#
# Modeling stateless environments gives users full control over the input and
# outputs of the simulator: one can reset an experiment at any stage or actively
# modify the dynamics from the outside. However, it assumes that we have some control
# over a task, which may not always be the case: solving a problem where we cannot
# outputs of the simulator: one can reset an experiment at any stage or actively
# modify the dynamics from the outside. However, it assumes that we have some control
# over a task, which may not always be the case: solving a problem where we cannot
# control the current state is more challenging but has a much wider set of applications.
#
# Another advantage of stateless environments is that they can enable
@@ -73,14 +73,31 @@
# simulation graph.
# * Finally, we will train a simple policy to solve the system we implemented.
#

# sphinx_gallery_start_ignore
import warnings

warnings.filterwarnings("ignore")
from torch import multiprocessing

# TorchRL prefers the spawn start method, which restricts the creation of
# ``~torchrl.envs.ParallelEnv`` to the ``__main__`` guard. For ease of reading,
# this tutorial switches to fork, which is also the default start method in
# Google's Colaboratory.
try:
multiprocessing.set_start_method("fork")
except RuntimeError:
pass

# sphinx_gallery_end_ignore

from collections import defaultdict
from typing import Optional

import numpy as np
import torch
import tqdm
from tensordict import TensorDict, TensorDictBase
from tensordict.nn import TensorDictModule
from tensordict.tensordict import TensorDict, TensorDictBase
from torch import nn

from torchrl.data import BoundedTensorSpec, CompositeSpec, UnboundedContinuousTensorSpec
@@ -167,7 +184,7 @@
# of :meth:`~torchrl.envs.EnvBase.step` in the input ``tensordict`` to enforce
# input/output consistency.
#
# Typically, for stateful environments, this will look like this:
# Typically, for stateful environments, this will look like this:
#
# .. code-block::
#
@@ -221,6 +238,7 @@
# needed as the state needs to be read from the environment.
#


def _step(tensordict):
th, thdot = tensordict["th"], tensordict["thdot"] # th := theta

@@ -896,7 +914,7 @@ def plot():
######################################################################
# Conclusion
# ----------
#
#
# In this tutorial, we have learned how to code a stateless environment from
# scratch. We touched the subjects of:
#
4 changes: 2 additions & 2 deletions intermediate_source/mario_rl_tutorial.py
@@ -32,8 +32,8 @@
#
# %%bash
# pip install gym-super-mario-bros==7.4.0
# pip install tensordict==0.2.0
# pip install torchrl==0.2.0
# pip install tensordict==0.3.0
# pip install torchrl==0.3.0
#

import torch
61 changes: 31 additions & 30 deletions intermediate_source/reinforcement_ppo.py
@@ -104,6 +104,22 @@
# description and more about the algorithm itself.
#

# sphinx_gallery_start_ignore
import warnings

warnings.filterwarnings("ignore")
from torch import multiprocessing

# TorchRL prefers the spawn start method, which restricts the creation of
# ``~torchrl.envs.ParallelEnv`` to the ``__main__`` guard. For ease of reading,
# this tutorial switches to fork, which is also the default start method in
# Google's Colaboratory.
try:
multiprocessing.set_start_method("fork")
except RuntimeError:
pass

# sphinx_gallery_end_ignore

from collections import defaultdict

import matplotlib.pyplot as plt
@@ -118,7 +134,7 @@
from torchrl.envs import (Compose, DoubleToFloat, ObservationNorm, StepCounter,
TransformedEnv)
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import check_env_specs, set_exploration_mode
from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value import GAE
@@ -152,22 +168,10 @@
# use. In general, the goal of an RL algorithm is to learn to solve the task
# as fast as it can in terms of environment interactions: the lower the ``total_frames``
# the better.
# We also define a ``frame_skip``: in some contexts, repeating the same action
# multiple times over the course of a trajectory may be beneficial as it makes
# the behavior more consistent and less erratic. However, "skipping"
# too many frames will hamper training by reducing the reactivity of the actor
# to observation changes.
#
# When using ``frame_skip`` it is good practice to
# correct the other frame counts by the number of frames we are grouping
# together. If we configure a total count of X frames for training but
use a ``frame_skip`` of Y, we will actually be collecting ``X*Y`` frames in total,
# which exceeds our predefined budget.
#
frame_skip = 1
frames_per_batch = 1000 // frame_skip
#
frames_per_batch = 1000
# For a complete training, bring the number of frames up to 1M
total_frames = 50_000 // frame_skip
total_frames = 50_000

######################################################################
# PPO parameters
@@ -196,14 +200,14 @@
#
# In RL, an *environment* is usually the way we refer to a simulator or a
# control system. Various libraries provide simulation environments for reinforcement
# learning, including Gymnasium (previously OpenAI Gym), DeepMind Control Suite, and
# learning, including Gymnasium (previously OpenAI Gym), DeepMind control suite, and
# many others.
# As a general library, TorchRL's goal is to provide an interchangeable interface
# to a large panel of RL simulators, allowing you to easily swap one environment
# with another. For example, creating a wrapped gym environment can be achieved with few characters:
#

base_env = GymEnv("InvertedDoublePendulum-v4", device=device, frame_skip=frame_skip)
base_env = GymEnv("InvertedDoublePendulum-v4", device=device)

######################################################################
# There are a few things to notice in this code: first, we created
@@ -262,7 +266,7 @@
Compose(
# normalize observations
ObservationNorm(in_keys=["observation"]),
DoubleToFloat(in_keys=["observation"]),
DoubleToFloat(),
StepCounter(),
),
)
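
###############################################################################
# A practical reminder (a sketch, assuming the rest of the tutorial is
# unchanged): an ``ObservationNorm`` appended without ``loc``/``scale`` must
# have its statistics initialized from data before the environment is used,
# for example:
#
# .. code-block::
#
#    env.transform[0].init_stats(num_iter=1000, reduce_dim=0, cat_dim=0)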
@@ -410,8 +414,8 @@
in_keys=["loc", "scale"],
distribution_class=TanhNormal,
distribution_kwargs={
"min": env.action_spec.space.minimum,
"max": env.action_spec.space.maximum,
"min": env.action_spec.space.low,
"max": env.action_spec.space.high,
},
return_log_prob=True,
# we'll need the log-prob for the numerator of the importance weights
@@ -514,7 +518,7 @@
#

replay_buffer = ReplayBuffer(
storage=LazyTensorStorage(frames_per_batch),
storage=LazyTensorStorage(max_size=frames_per_batch),
sampler=SamplerWithoutReplacement(),
)
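
###############################################################################
# A short usage sketch (assuming ``tensordict_data`` is a batch coming from the
# collector, as in the training loop below): the batch is flattened, pushed
# into the buffer, then sampled without replacement.
#
# .. code-block::
#
#    replay_buffer.extend(tensordict_data.reshape(-1).cpu())
#    subdata = replay_buffer.sample(64)  # 64 is an illustrative sub-batch size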

@@ -546,16 +550,13 @@
)

loss_module = ClipPPOLoss(
actor=policy_module,
critic=value_module,
advantage_key="advantage",
actor_network=policy_module,
critic_network=value_module,
clip_epsilon=clip_epsilon,
entropy_bonus=bool(entropy_eps),
entropy_coef=entropy_eps,
# these keys match by default but we set this for completeness
value_target_key=advantage_module.value_target_key,
critic_coef=1.0,
gamma=0.99,
loss_critic_type="smooth_l1",
)
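
###############################################################################
# Note that ``gamma`` is no longer passed to the loss: it is configured on the
# value estimator (the ``GAE`` module above). If non-default tensordict keys
# were needed, they could be wired after construction along these lines (a
# sketch; the defaults already match GAE's outputs):
#
# .. code-block::
#
#    loss_module.set_keys(
#        advantage="advantage",
#        value_target=advantage_module.value_target_key,
#    )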

@@ -586,7 +587,7 @@


logs = defaultdict(list)
pbar = tqdm(total=total_frames * frame_skip)
pbar = tqdm(total=total_frames)
eval_str = ""

# We iterate over the collector until it reaches the total number of frames it was
Expand Down Expand Up @@ -618,7 +619,7 @@
optim.zero_grad()

logs["reward"].append(tensordict_data["next", "reward"].mean().item())
pbar.update(tensordict_data.numel() * frame_skip)
pbar.update(tensordict_data.numel())
cum_reward_str = (
f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
)
@@ -633,7 +634,7 @@
# number of steps (1000, which is our ``env`` horizon).
# The ``rollout`` method of the ``env`` can take a policy as argument:
# it will then execute this policy at each step.
with set_exploration_mode("mean"), torch.no_grad():
with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
# execute a rollout with the trained policy
eval_rollout = env.rollout(1000, policy_module)
logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
4 changes: 2 additions & 2 deletions requirements.txt
@@ -25,8 +25,8 @@ tensorboard
jinja2==3.1.3
pytorch-lightning
torchx
torchrl==0.2.1
tensordict==0.2.1
torchrl==0.3.0
tensordict==0.3.0
ax-platform
nbformat>=4.2.0
datasets