Rework the run loop #921

Merged (40 commits, Jul 29, 2023)
Commits
a80db65
bump version compat
HenriDeh Jul 6, 2023
d5209fe
bump version
HenriDeh Jul 6, 2023
a4e24ce
simplify run loop and compat with traj 0.2
HenriDeh Jul 6, 2023
597092b
rename to agent base
HenriDeh Jul 6, 2023
d2a2aaa
optional push at end of episode
HenriDeh Jul 6, 2023
629db63
use new SARST name
HenriDeh Jul 7, 2023
1deca16
bump compats
HenriDeh Jul 7, 2023
815e363
update MA plan
HenriDeh Jul 7, 2023
f930627
fix typing
HenriDeh Jul 7, 2023
6305bb0
agent typing
HenriDeh Jul 7, 2023
b2c9d1b
fix precompile
HenriDeh Jul 7, 2023
522da77
fix first tests
HenriDeh Jul 7, 2023
1ce5f53
Update docs/src/How_to_implement_a_new_algorithm.md
jeremiahpslewis Jul 11, 2023
f668bf6
Update docs/src/Zoo_Algorithms/MPO.md
jeremiahpslewis Jul 11, 2023
e695cad
deactivate VPG and TRPO
HenriDeh Jul 11, 2023
218e86b
Merge branch 'loop-traj' of https://github.com/JuliaReinforcementLear…
HenriDeh Jul 11, 2023
4198403
export sample
HenriDeh Jul 12, 2023
302f885
Merge branch 'main' into loop-traj
HenriDeh Jul 12, 2023
e8ed7b6
fix NFQ
HenriDeh Jul 12, 2023
605bcc9
remove comments
HenriDeh Jul 12, 2023
48c5173
change the MA loop
HenriDeh Jul 12, 2023
8e77654
simultaneous agents
HenriDeh Jul 12, 2023
7c25e95
Move MA stuff to proper file
HenriDeh Jul 12, 2023
a2948fa
Merge branch 'loop-traj' of https://github.com/JuliaReinforcementLear…
HenriDeh Jul 12, 2023
875da76
fix ambiguity
HenriDeh Jul 12, 2023
36d13f9
Bump RLTraj to bug fix version
jeremiahpslewis Jul 25, 2023
6460cd9
Merge branch 'main' into loop-traj
jeremiahpslewis Jul 25, 2023
21ca014
Fix type name
jeremiahpslewis Jul 26, 2023
ea877fc
Merge branch 'main' into loop-traj
jeremiahpslewis Jul 26, 2023
b938655
Drop player to clean up dispatch
jeremiahpslewis Jul 26, 2023
e710880
Add back player
jeremiahpslewis Jul 26, 2023
6ee0306
Fix state push!
jeremiahpslewis Jul 26, 2023
45df594
Update multi_agent.jl
jeremiahpslewis Jul 26, 2023
7b4f964
Broaden type signature
jeremiahpslewis Jul 26, 2023
3d95881
type signature tweak
jeremiahpslewis Jul 26, 2023
d696d78
type tweak
jeremiahpslewis Jul 26, 2023
c7d856d
Update src/ReinforcementLearningCore/Project.toml
jeremiahpslewis Jul 29, 2023
ff4b98f
Minor tweaks
jeremiahpslewis Jul 29, 2023
550222b
Require RLTraj bug fix
jeremiahpslewis Jul 29, 2023
aa55642
Require bug-fixed RLTrajectories
jeremiahpslewis Jul 29, 2023
@@ -14804,7 +14804,7 @@ <h2 id="Understand-the-Trajectories">Understand the <em>Trajectories</em><a clas
<div class="prompt input_prompt">In&nbsp;[28]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-julia"><pre><span></span><span class="n">t</span> <span class="o">=</span> <span class="n">Trajectories</span><span class="o">.</span><span class="n">CircularArraySARTTraces</span><span class="p">(;</span><span class="n">capacity</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<div class=" highlight hl-julia"><pre><span></span><span class="n">t</span> <span class="o">=</span> <span class="n">Trajectories</span><span class="o">.</span><span class="n">CircularArraySARTSTraces</span><span class="p">(;</span><span class="n">capacity</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
</pre></div>

</div>
2 changes: 1 addition & 1 deletion docs/src/How_to_implement_a_new_algorithm.md
@@ -94,7 +94,7 @@ A `Trajectory` is composed of three elements: a `container`, a `controller`, and

The container is typically an `AbstractTraces`, an object that store a set of `Trace` in a structured manner. You can either define your own (and contribute it to the package if it is likely to be usable for other algorithms), or use a predefined one if it exists.

The most common `AbstractTraces` object is the `CircularArraySARTTraces`, this is a container of a fixed length that stores the following traces: `:state` (S), `:action` (A), `:reward` (R), `:terminal` (T), which toghether are aliased to `SART = (:state, :action, :reward, :terminal)`. Let us see how it is constructed in this simplified version as an example of how to build a custom trace.
The most common `AbstractTraces` object is the `CircularArraySARTSTraces`, this is a container of a fixed length that stores the following traces: `:state` (S), `:action` (A), `:reward` (R), `:terminal` (T), which together are aliased to `SART = (:state, :action, :reward, :terminal)`. Let us see how it is constructed in this simplified version as an example of how to build a custom trace.

```julia
function (capacity, state_size, state_eltype, action_size, action_eltype, reward_eltype)
    # … (rest of the constructor collapsed in the diff view)
```
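For context, the collapsed constructor above assembles the container from ReinforcementLearningTrajectories primitives. Below is a rough, hypothetical sketch of the idea only: the helper name `make_sarts_traces` and the keyword defaults are assumptions, and the real `CircularArraySARTSTraces` definition in ReinforcementLearningTrajectories.jl may differ in detail.

```julia
using ReinforcementLearningTrajectories
using CircularArrayBuffers: CircularArrayBuffer

# Hypothetical helper: a SARTS-style container where :state/:next_state are two
# offset views into one circular buffer (a multiplexed trace), while :action,
# :reward and :terminal are plain circular traces.
function make_sarts_traces(; capacity, state = Float32 => (), action = Int => (),
                           reward = Float32 => (), terminal = Bool => ())
    state_eltype, state_size = state
    action_eltype, action_size = action
    reward_eltype, reward_size = reward
    terminal_eltype, terminal_size = terminal

    MultiplexTraces{(:state, :next_state)}(
        CircularArrayBuffer{state_eltype}(state_size..., capacity + 1),
    ) +
    Traces(
        action   = CircularArrayBuffer{action_eltype}(action_size..., capacity),
        reward   = CircularArrayBuffer{reward_eltype}(reward_size..., capacity),
        terminal = CircularArrayBuffer{terminal_eltype}(terminal_size..., capacity),
    )
end
```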
2 changes: 1 addition & 1 deletion docs/src/Zoo_Algorithms/MPO.md
@@ -56,7 +56,7 @@ The next step is to wrap this policy into an `Agent`. An agent is a combination

```julia
trajectory = Trajectory(
CircularArraySARTTraces(capacity = 1000, state = Float32 => (4,),action = Float32 => (1,)),
CircularArraySARTSTraces(capacity = 1000, state = Float32 => (4,), action = Float32 => (1,)),
MetaSampler(
actor = MultiBatchSampler(BatchSampler{(:state,)}(32), 10),
critic = MultiBatchSampler(BatchSampler{SS′ART}(32), 1000)
    # … (rest of the Trajectory construction collapsed in the diff view)
```
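The MPO page then wraps this trajectory together with the policy into an `Agent`, using the `policy`/`trajectory` keyword constructor touched by this PR. A minimal sketch, where `mpo_policy` stands for the policy built earlier on that page (the name is an assumption here):

```julia
# `mpo_policy` is assumed to be the MPO policy constructed earlier in the MPO docs.
agent = Agent(policy = mpo_policy, trajectory = trajectory)

# `run` then drives the stages: PreEpisodeStage caches the initial state,
# PostActStage stores (state, action, reward, terminal), and PostEpisodeStage
# optionally adds the next action for trajectories that have a :next_action trace.
```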
4 changes: 2 additions & 2 deletions src/ReinforcementLearningCore/Project.toml
@@ -1,6 +1,6 @@
name = "ReinforcementLearningCore"
uuid = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
version = "0.11.3"
version = "0.12.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -40,7 +40,7 @@ Parsers = "2"
ProgressMeter = "1"
Reexport = "1"
ReinforcementLearningBase = "0.12"
ReinforcementLearningTrajectories = "^0.1.9"
ReinforcementLearningTrajectories = "^0.3.2"
StatsBase = "0.32, 0.33, 0.34"
TimerOutputs = "0.5"
UnicodePlots = "1.3, 2, 3"
@@ -3,7 +3,6 @@ module ReinforcementLearningCore
using TimerOutputs
using ReinforcementLearningBase
using Reexport

const RLCore = ReinforcementLearningCore

export RLCore
8 changes: 2 additions & 6 deletions src/ReinforcementLearningCore/src/core/run.jl
@@ -102,21 +102,17 @@ function _run(policy::AbstractPolicy,
action = @timeit_debug timer "plan!" RLBase.plan!(policy, env)
@timeit_debug timer "act!" act!(env, action)

@timeit_debug timer "push!(policy) PostActStage" push!(policy, PostActStage(), env)
@timeit_debug timer "push!(policy) PostActStage" push!(policy, PostActStage(), env, action)
@timeit_debug timer "optimise! PostActStage" optimise!(policy, PostActStage())
@timeit_debug timer "push!(hook) PostActStage" push!(hook, PostActStage(), policy, env)

if check_stop(stop_condition, policy, env)
is_stop = true
@timeit_debug timer "push!(policy) PreActStage" push!(policy, PreActStage(), env)
@timeit_debug timer "optimise! PreActStage" optimise!(policy, PreActStage())
@timeit_debug timer "push!(hook) PreActStage" push!(hook, PreActStage(), policy, env)
@timeit_debug timer "plan!" RLBase.plan!(policy, env) # let the policy see the last observation
break
end
end # end of an episode

@timeit_debug timer "push!(policy) PostEpisodeStage" push!(policy, PostEpisodeStage(), env) # let the policy see the last observation
@timeit_debug timer "push!(policy) PostEpisodeStage" push!(policy, PostEpisodeStage(), env)
@timeit_debug timer "optimise! PostEpisodeStage" optimise!(policy, PostEpisodeStage())
@timeit_debug timer "push!(hook) PostEpisodeStage" push!(hook, PostEpisodeStage(), policy, env)

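Stripped of the `@timeit_debug` instrumentation, the reworked single-agent episode logic looks roughly like the sketch below. Only the body shown in the hunk is taken from the diff; the episode setup, loop condition, and the `check_stop` helper are paraphrased assumptions about the surrounding code that this PR did not change.

```julia
using ReinforcementLearningBase, ReinforcementLearningCore

# De-instrumented sketch of one episode of the reworked run loop.
function run_episode_sketch(policy, env, stop_condition, hook)
    is_stop = false
    reset!(env)
    push!(policy, PreEpisodeStage(), env)
    push!(hook, PreEpisodeStage(), policy, env)

    while !is_terminated(env)                        # loop condition paraphrased
        push!(policy, PreActStage(), env)
        optimise!(policy, PreActStage())
        push!(hook, PreActStage(), policy, env)

        action = RLBase.plan!(policy, env)
        act!(env, action)

        push!(policy, PostActStage(), env, action)   # the chosen action is now passed to the policy
        optimise!(policy, PostActStage())
        push!(hook, PostActStage(), policy, env)

        if check_stop(stop_condition, policy, env)   # internal helper used in the diff
            is_stop = true
            break                                    # no extra PreActStage push / plan! on stop anymore
        end
    end

    push!(policy, PostEpisodeStage(), env)           # no trailing plan! "to see the last observation"
    optimise!(policy, PostEpisodeStage())
    push!(hook, PostEpisodeStage(), policy, env)
    return is_stop
end
```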
2 changes: 2 additions & 0 deletions src/ReinforcementLearningCore/src/core/stages.jl
@@ -17,7 +17,9 @@ struct PreActStage <: AbstractStage end
struct PostActStage <: AbstractStage end

Base.push!(p::AbstractPolicy, ::AbstractStage, ::AbstractEnv) = nothing
Base.push!(p::AbstractPolicy, ::PostActStage, ::AbstractEnv, action) = nothing
Base.push!(p::AbstractPolicy, ::AbstractStage, ::AbstractEnv, ::Symbol) = nothing
Base.push!(p::AbstractPolicy, ::PostActStage, ::AbstractEnv, action, ::Symbol) = nothing
Review comment (Member):
This is begging for us to create an action type, but that's something for another PR. :)


RLBase.optimise!(policy::P, ::S) where {P<:AbstractPolicy,S<:AbstractStage} = nothing

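Picking up the reviewer's suggestion above, a dedicated action type could eventually replace the loosely typed `action` argument. This is a purely hypothetical sketch; nothing like it exists in the PR, and the wrapper name is invented for illustration.

```julia
using ReinforcementLearningBase, ReinforcementLearningCore

# Hypothetical only: a thin wrapper so that stage callbacks can dispatch on
# "an action" instead of accepting any value. Not part of this PR.
struct EnvAction{A}
    action::A
end

Base.push!(p::AbstractPolicy, ::PostActStage, ::AbstractEnv, ::EnvAction) = nothing
```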
2 changes: 1 addition & 1 deletion src/ReinforcementLearningCore/src/policies/agent/agent.jl
@@ -1,3 +1,3 @@
include("base.jl")
include("agent_base.jl")
include("agent_srt_cache.jl")
include("multi_agent.jl")
64 changes: 64 additions & 0 deletions src/ReinforcementLearningCore/src/policies/agent/agent_base.jl
@@ -0,0 +1,64 @@
export Agent

using Base.Threads: @spawn

using Functors: @functor
import Base.push!
"""
Agent(;policy, trajectory) <: AbstractPolicy

A wrapper of an `AbstractPolicy`. Generally speaking, it does nothing but to
update the trajectory and policy appropriately in different stages. Agent
is a Callable and its call method accepts varargs and keyword arguments to be
passed to the policy.

"""
mutable struct Agent{P,T} <: AbstractPolicy
policy::P
trajectory::T

function Agent(policy::P, trajectory::T) where {P<:AbstractPolicy, T<:Trajectory}
agent = new{P,T}(policy, trajectory)

if TrajectoryStyle(trajectory) === AsyncTrajectoryStyle()
bind(trajectory, @spawn(optimise!(policy, trajectory)))
end
agent
end
end

Agent(;policy, trajectory) = Agent(policy, trajectory)

RLBase.optimise!(agent::Agent, stage::S) where {S<:AbstractStage} = RLBase.optimise!(TrajectoryStyle(agent.trajectory), agent, stage)
RLBase.optimise!(::SyncTrajectoryStyle, agent::Agent, stage::S) where {S<:AbstractStage} = RLBase.optimise!(agent.policy, stage, agent.trajectory)

# already spawn a task to optimise inner policy when initializing the agent
RLBase.optimise!(::AsyncTrajectoryStyle, agent::Agent, stage::S) where {S<:AbstractStage} = nothing

#by default, optimise does nothing at all stage
function RLBase.optimise!(policy::AbstractPolicy, stage::AbstractStage, trajectory::Trajectory) end

@functor Agent (policy,)

function Base.push!(agent::Agent, ::PreEpisodeStage, env::AbstractEnv)
push!(agent.trajectory, (state = state(env),))
end

# !!! TODO: In async scenarios, parameters of the policy may still be updating
# (partially), which will result to incorrect action. This should be addressed
# in Oolong.jl with a wrapper
function RLBase.plan!(agent::Agent, env::AbstractEnv)
RLBase.plan!(agent.policy, env)
end

function Base.push!(agent::Agent, ::PostActStage, env::AbstractEnv, action)
next_state = state(env)
push!(agent.trajectory, (state = next_state, action = action, reward = reward(env), terminal = is_terminated(env)))
Review comment (Contributor):
Maybe I got something wrong, but shouldn't next_state be stored in the next_state field of the trajectory? next_state is the successor of the state before the action was taken in the environment, right?

Reply (Member, author):
It's the same. Both names point to the same Trace in the trajectory.

Reply (Contributor):
Right. It is the multiplex trace, right?

Reply (Member, author):
Yes

end
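As the conversation above notes, `:state` and `:next_state` are a multiplexed trace: pushing the post-action `state(env)` under `:state` also serves as the `:next_state` of the previous row. A small sketch, using the keyword format shown in the docs diffs above (the exact indexing behavior may differ slightly by ReinforcementLearningTrajectories version):

```julia
using ReinforcementLearningTrajectories

t = CircularArraySARTSTraces(; capacity = 10, state = Int => (), action = Int => ())
push!(t, (state = 1,))                                               # initial state (PreEpisodeStage)
push!(t, (state = 2, action = 1, reward = 1.0f0, terminal = false))  # full row (PostActStage)

# Both keys read from the same circular buffer, offset by one step:
# t[:state][1] == 1 and t[:next_state][1] == 2 are the expected values.
```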

function Base.push!(agent::Agent, ::PostEpisodeStage, env::AbstractEnv)
if haskey(agent.trajectory, :next_action)
Review comment (Contributor):
So if the episode finished (whether truncated or terminated), we query the policy to plan another step. Shouldn't we also check that the environment is not terminated? If it is, it makes no sense to plan an action.

Reply (Member, author):
Indeed, it wouldn't make sense, but if your environment has terminal states at all, then you should not use a trajectory that has a next_action key. That's how I thought about it. If we added that check, it would allow the user to have an incorrect trajectory without an error being thrown, and the buffer would accumulate mistakes.

action = RLBase.plan!(agent.policy, env)
push!(agent.trajectory, PartialNamedTuple((action = action, )))
end
end
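To see the new `Agent` end to end, here is a hedged usage sketch. The CartPole setup and the sampler/controller choice are illustrative assumptions (a random policy never triggers `optimise!`, so the sampler is never actually used); the per-stage calls at the bottom mirror what `run` does internally in this PR.

```julia
using ReinforcementLearningBase, ReinforcementLearningCore
using ReinforcementLearningEnvironments: CartPoleEnv
using ReinforcementLearningTrajectories

env = CartPoleEnv()
agent = Agent(
    policy = RandomPolicy(action_space(env)),
    trajectory = Trajectory(
        CircularArraySARTSTraces(; capacity = 1_000, state = Float32 => (4,), action = Int => ()),
        BatchSampler{SS′ART}(32),          # sampler/controller are illustrative choices
        InsertSampleRatioController(),
    ),
)

# The same calls `run` makes internally, one step at a time:
push!(agent, PreEpisodeStage(), env)          # caches the initial state
action = RLBase.plan!(agent, env)             # delegates to agent.policy
act!(env, action)
push!(agent, PostActStage(), env, action)     # stores (state, action, reward, terminal)
push!(agent, PostEpisodeStage(), env)         # pushes :next_action only if that trace exists
```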
@@ -27,12 +27,12 @@ struct SART{S,A,R,T}
end

# This method is used to push a state and action to a trace
function Base.push!(ts::Union{CircularArraySARTTraces,ElasticArraySARTTraces}, xs::SA)
function Base.push!(ts::Union{CircularArraySARTSTraces,ElasticArraySARTTraces}, xs::SA)
push!(ts.traces[1].trace, xs.state)
push!(ts.traces[2].trace, xs.action)
end

function Base.push!(ts::Union{CircularArraySARTTraces,ElasticArraySARTTraces}, xs::SART)
function Base.push!(ts::Union{CircularArraySARTSTraces,ElasticArraySARTTraces}, xs::SART)
push!(ts.traces[1].trace, xs.state)
push!(ts.traces[2].trace, xs.action)
push!(ts.traces[3], xs.reward)
89 changes: 0 additions & 89 deletions src/ReinforcementLearningCore/src/policies/agent/base.jl

This file was deleted.

49 changes: 33 additions & 16 deletions src/ReinforcementLearningCore/src/policies/agent/multi_agent.jl
@@ -125,18 +125,12 @@ function Base.run(
action = @timeit_debug timer "plan!" RLBase.plan!(policy, env)
@timeit_debug timer "act!" act!(env, action)



@timeit_debug timer "push!(policy) PostActStage" push!(policy, PostActStage(), env)
@timeit_debug timer "push!(policy) PostActStage" push!(policy, PostActStage(), env, action)
@timeit_debug timer "optimise! PostActStage" optimise!(policy, PostActStage())
@timeit_debug timer "push!(hook) PostActStage" push!(hook, PostActStage(), policy, env)

if check_stop(stop_condition, policy, env)
is_stop = true
@timeit_debug timer "push!(policy) PreActStage" push!(multiagent_policy, PreActStage(), env)
@timeit_debug timer "optimise! PreActStage" optimise!(multiagent_policy, PreActStage())
@timeit_debug timer "push!(hook) PreActStage" push!(multiagent_hook, PreActStage(), policy, env)
@timeit_debug timer "plan!" RLBase.plan!(multiagent_policy, env) # let the policy see the last observation
break
end

Expand Down Expand Up @@ -191,21 +185,43 @@ function Base.push!(multiagent::MultiAgentPolicy, stage::S, env::E) where {S<:Ab
end
end

# Like in the single-agent case, push! at the PreActStage() calls push! on each player with the state of the environment
function Base.push!(multiagent::MultiAgentPolicy{names, T}, ::PreActStage, env::E) where {E<:AbstractEnv, names, T <: Agent}
# Like in the single-agent case, push! at the PostActStage() calls push! on each player.
function Base.push!(agent::Agent, ::PreEpisodeStage, env::AbstractEnv, player::Symbol)
push!(agent.trajectory, (state = state(env, player),))
end

function Base.push!(multiagent::MultiAgentPolicy, s::PreEpisodeStage, env::E) where {E<:AbstractEnv}
for player in players(env)
push!(multiagent[player], state(env, player))
push!(multiagent[player], s, env, player)
end
end

# Like in the single-agent case, push! at the PostActStage() calls push! on each player with the reward and termination status of the environment
function Base.push!(multiagent::MultiAgentPolicy{names, T}, ::PostActStage, env::E) where {E<:AbstractEnv, names, T <: Agent}
for player in players(env)
push!(multiagent[player].cache, reward(env, player), is_terminated(env))
function RLBase.plan!(agent::Agent, env::AbstractEnv, player::Symbol)
RLBase.plan!(agent.policy, env, player)
end

# Like in the single-agent case, push! at the PostActStage() calls push! on each player to store the action, reward, next_state, and terminal signal.
function Base.push!(multiagent::MultiAgentPolicy, ::PostActStage, env::E, actions) where {E<:AbstractEnv}
for (player, action) in zip(players(env), actions)
next_state = state(env, player)
observation = (
state = next_state,
action = action,
reward = reward(env, player),
terminal = is_terminated(env)
)
push!(multiagent[player].trajectory, observation)
end
end

function Base.push!(agent::Agent, ::PostEpisodeStage, env::AbstractEnv, p::Symbol)
if haskey(agent.trajectory, :next_action)
action = RLBase.plan!(agent.policy, env, p)
push!(agent.trajectory, PartialNamedTuple((action = action, )))
end
end

function Base.push!(hook::MultiAgentHook, stage::S, multiagent::MultiAgentPolicy, env::E) where {E<:AbstractEnv,S<:AbstractStage}
function Base.push!(hook::MultiAgentHook, stage::S, multiagent::MultiAgentPolicy, env::E) where {E<:AbstractEnv, S<:AbstractStage}
for player in players(env)
push!(hook[player], stage, multiagent[player], env, player)
end
@@ -227,8 +243,9 @@ function Base.push!(composed_hook::ComposedHook{T},
_push!(stage, policy, env, player, composed_hook.hooks...)
end

#For simultaneous players, plan! returns a Tuple of actions.
function RLBase.plan!(multiagent::MultiAgentPolicy, env::E) where {E<:AbstractEnv}
return (RLBase.plan!(multiagent[player], env, player) for player in players(env))
return Tuple(RLBase.plan!(multiagent[player], env, player) for player in players(env))
end

function RLBase.optimise!(multiagent::MultiAgentPolicy, stage::S) where {S<:AbstractStage}
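For simultaneous environments the flow mirrors the single-agent loop, except that `plan!` gathers one action per player into a `Tuple` and the `PostActStage` push fans the per-player rewards and terminal flags back out. A sketch of one step, assuming `multiagent_policy` is an already-constructed `MultiAgentPolicy` of per-player `Agent`s and `env` is a simultaneous `AbstractEnv`:

```julia
# One simultaneous step (names taken from the diff; constructing the policy/env is omitted):
push!(multiagent_policy, PreEpisodeStage(), env)          # each player caches state(env, player)

actions = RLBase.plan!(multiagent_policy, env)            # Tuple of actions, one per player
act!(env, actions)

push!(multiagent_policy, PostActStage(), env, actions)    # per player: (state, action, reward, terminal)
optimise!(multiagent_policy, PostActStage())
```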