
Commit 208cfb7

Refactor TRPO and VPG with EpisodesSampler (#952)
* Change qpolicy default update stage
* Add a docstring
* qbasedpolicy dispatches on learner
* default to nothing
* update docs
* bump versions to require RLTraj 0.3.3
* use EpisodesSampler in experiments
* Require latest Zoo version
* And bump Exp version
* refactor VPG
* rebump compats and versions
* refactor TRPO
* add cuDNN
* use correct traj
* activate tests...
* include algos
* use stack for dimensions agnosticity
* use stack with trpo
* move slow runtests
* fix dimensions
* comment back the algos
* lower NFQ batchsize
* Update src/ReinforcementLearningZoo/Project.toml (Co-authored-by: Jeremiah <[email protected]>)
* Update src/ReinforcementLearningCore/Project.toml (Co-authored-by: Jeremiah <[email protected]>)
* Update src/ReinforcementLearningExperiments/Project.toml (Co-authored-by: Jeremiah <[email protected]>)

---------

Co-authored-by: Jeremiah <[email protected]>
1 parent f258a84 commit 208cfb7

10 files changed: +49, −29 lines

src/ReinforcementLearningCore/Project.toml

Lines changed: 2 additions & 1 deletion

@@ -25,6 +25,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [compat]
 AbstractTrees = "0.3, 0.4"
@@ -42,7 +43,7 @@ Parsers = "2"
 ProgressMeter = "1"
 Reexport = "1"
 ReinforcementLearningBase = "0.12"
-ReinforcementLearningTrajectories = "^0.3.2"
+ReinforcementLearningTrajectories = "^0.3.3"
 StatsBase = "0.32, 0.33, 0.34"
 TimerOutputs = "0.5"
 UnicodePlots = "1.3, 2, 3"

src/ReinforcementLearningExperiments/Project.toml

Lines changed: 2 additions & 1 deletion

@@ -14,6 +14,7 @@ ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
 ReinforcementLearningZoo = "d607f57d-ee1e-4ba7-bcf2-7734c1e31854"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Weave = "44d3d7a6-8a23-5bf8-98c5-b353f8df5ec9"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [compat]
 Distributions = "0.25"
@@ -22,7 +23,7 @@ Reexport = "1"
 ReinforcementLearningBase = "0.12"
 ReinforcementLearningCore = "0.12, 0.13"
 ReinforcementLearningEnvironments = "0.8"
-ReinforcementLearningZoo = "0.7, 0.8"
+ReinforcementLearningZoo = "^0.8.3"
 StableRNGs = "1"
 Weave = "0.10"
 julia = "1.9"

src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ function RLCore.Experiment(
             action=Float32 => (na,),
         ),
         sampler=BatchSampler{SS′ART}(
-            batch_size=10_000,
+            batch_size=128,
             rng=rng
         ),
         controller=InsertSampleRatioController(

src/ReinforcementLearningExperiments/deps/experiments/experiments/Policy Gradient/JuliaRL_TRPO_CartPole.jl

Lines changed: 2 additions & 1 deletion

@@ -45,7 +45,8 @@ function RLCore.Experiment(
            ),
            rng=rng,
        ),
-        trajectory=Trajectory(container=Episode(ElasticArraySARTTraces(state=Float32 => (ns,))))
+        trajectory=Trajectory(container=CircularArraySARTSTraces(capacity = 10000, state=Float32 => (ns,)), sampler = EpisodesSampler(), controller = InsertSampleRatioController(ratio = 1/10000))
+        #Note: an EpisodeSamplerRatioController would be more adapted here.
     )
     stop_condition = StopAfterEpisode(100, is_show_progress=!haskey(ENV, "CI"))
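For context, here is how the new trajectory could be built in isolation. This is only a sketch: it assumes the constructors are used exactly as in the diff above, that the names are available via `using ReinforcementLearningTrajectories` (in the experiments they come in through the RL.jl packages), and that `ns = 4` (CartPole's observation size).

```julia
using ReinforcementLearningTrajectories  # Trajectory, EpisodesSampler, etc.

ns = 4  # assumed CartPole state dimension, for illustration only

trajectory = Trajectory(
    container = CircularArraySARTSTraces(capacity = 10_000, state = Float32 => (ns,)),
    sampler = EpisodesSampler(),  # hands back the stored episodes rather than random transitions
    controller = InsertSampleRatioController(ratio = 1 / 10_000),  # roughly one sampling pass per 10_000 inserts
)
```

As the in-diff note says, an episode-count-based controller would fit this setup better; the ratio-based controller is used as an approximation that fires after about one full buffer of insertions.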

src/ReinforcementLearningExperiments/deps/experiments/experiments/Policy Gradient/JuliaRL_VPG_CartPole.jl

Lines changed: 2 additions & 1 deletion

@@ -46,7 +46,8 @@ function RLCore.Experiment(
            γ=0.99f0,
            rng=rng,
        ),
-        trajectory=Trajectory(container=Episode(ElasticArraySARTTraces(state=Float32 => (ns,))))
+        trajectory=Trajectory(container=CircularArraySARTSTraces(capacity = 10000, state=Float32 => (ns,)), sampler = EpisodesSampler(), controller = InsertSampleRatioController(ratio = 1/10000))
+        #Note: an EpisodeSamplerRatioController would be more adapted here.
     )
     stop_condition = StopAfterEpisode(500, is_show_progress=!haskey(ENV, "CI"))

src/ReinforcementLearningExperiments/test/runtests.jl

Lines changed: 3 additions & 2 deletions

@@ -3,15 +3,16 @@ using CUDA
 
 CUDA.allowscalar(false)
 
-run(E`JuliaRL_NFQ_CartPole`)
 run(E`JuliaRL_BasicDQN_CartPole`)
 run(E`JuliaRL_DQN_CartPole`)
+run(E`JuliaRL_NFQ_CartPole`)
 # run(E`JuliaRL_PrioritizedDQN_CartPole`)
 run(E`JuliaRL_QRDQN_CartPole`)
 run(E`JuliaRL_REMDQN_CartPole`)
 run(E`JuliaRL_IQN_CartPole`)
 run(E`JuliaRL_Rainbow_CartPole`)
-# run(E`JuliaRL_VPG_CartPole`)
+#run(E`JuliaRL_VPG_CartPole`)
+#run(E`JuliaRL_TRPO_CartPole`)
 run(E`JuliaRL_MPODiscrete_CartPole`)
 run(E`JuliaRL_MPOContinuous_CartPole`)
 run(E`JuliaRL_MPOCovariance_CartPole`)

src/ReinforcementLearningZoo/Project.toml

Lines changed: 2 additions & 1 deletion

@@ -17,6 +17,7 @@ ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
 ReinforcementLearningCore = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [compat]
 CUDA = "4"
@@ -28,7 +29,7 @@ LogExpFunctions = "0.3"
 NNlib = "0.8, 0.9"
 Optim = "1"
 ReinforcementLearningBase = "0.12"
-ReinforcementLearningCore = "0.12, 0.13"
+ReinforcementLearningCore = "^0.12.3, 0.13"
 StatsBase = "0.33, 0.34"
 Zygote = "0.6"
 julia = "1.9"
Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 # include("run.jl")
 include("util.jl")
-# include("vpg.jl")
+#include("vpg.jl")
 # include("A2C.jl")
 # include("ppo.jl")
 # include("A2CGAE.jl")
@@ -10,5 +10,5 @@ include("util.jl")
 # include("sac.jl")
 # include("maddpg.jl")
 # include("vmpo.jl")
-# include("trpo.jl")
+#include("trpo.jl")
 include("mpo.jl")

src/ReinforcementLearningZoo/src/algorithms/policy_gradient/trpo.jl

Lines changed: 16 additions & 6 deletions

@@ -39,13 +39,23 @@ function Base.push!(p::Agent{<:TRPO}, ::PostEpisodeStage, env::AbstractEnv)
     empty!(p.trajectory.container)
 end
 
-RLBase.optimise!(::Agent{<:TRPO}, ::PostActStage) = nothing
-
-function RLBase.optimise!(π::TRPO, ::PostActStage, episode::Episode)
-    gain = discount_rewards(episode[:reward][:], π.γ)
-    for inds in Iterators.partition(shuffle(π.rng, 1:length(episode)), π.batch_size)
-        RLBase.optimise!(π, (state=episode[:state][inds], action=episode[:action][inds], gain=gain[inds]))
+function RLBase.optimise!(p::TRPO, ::PostEpisodeStage, trajectory::Trajectory)
+    has_optimized = false
+    for batch in trajectory #batch is a vector of Episode
+        gains = vcat(discount_rewards(ep[:reward], p.γ) for ep in batch)
+        states = reduce(ep[:state] for ep in batch) do s, s2
+            cat(s,s2, dims = ndims(first(batch[:state])))
+        end
+        actions = reduce(ep[:action] for ep in batch) do s, s2
+            cat(s, s2, dims = ndims(first(batch[:action])))
+        end
+        for inds in Iterators.partition(shuffle(p.rng, eachindex(gains)), p.batch_size)
+            RLBase.optimise!(p, (state=selectdim(states,ndims(states),inds), action=selectdim(actions,ndims(actions),inds), gain=gains[inds]))
+        end
+        has_optimized = true
     end
+    has_optimized && empty!(trajectory.container)
+    return nothing
 end
 
 function RLBase.optimise!(p::TRPO, ::PostActStage, batch::NamedTuple{(:state, :action, :gain)})
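The rewritten `optimise!` concatenates every episode's `:state` and `:action` traces along their last dimension and then slices minibatches with `selectdim`, so the same code path works whether states are vectors or images. Below is a minimal standalone sketch of that pattern using only Base Julia; the episode shapes are made up for illustration.

```julia
using Random

# Two fake episodes with 4-dimensional observations: state arrays are (ns, T).
ep1_states, ep2_states = rand(Float32, 4, 5), rand(Float32, 4, 3)
ep1_actions, ep2_actions = rand(1:2, 5), rand(1:2, 3)

# Concatenate along the last axis, whatever the rank of the state array is.
states  = cat(ep1_states, ep2_states; dims = ndims(ep1_states))    # (4, 8)
actions = cat(ep1_actions, ep2_actions; dims = ndims(ep1_actions)) # (8,)

# Shuffle the step indices and cut them into minibatches along that last axis.
rng, batch_size = Xoshiro(1), 4
for inds in Iterators.partition(shuffle(rng, eachindex(actions)), batch_size)
    s = selectdim(states, ndims(states), inds)  # view over the selected steps
    a = actions[inds]
    @show size(s) length(a)
end
```

Indexing the last dimension via `ndims` is what keeps the code dimension-agnostic: a (ns, T) state trace and an (H, W, C, T) image trace are handled identically.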

src/ReinforcementLearningZoo/src/algorithms/policy_gradient/vpg.jl

Lines changed: 17 additions & 13 deletions

@@ -31,26 +31,30 @@ function RLBase.plan!(π::VPG, env::AbstractEnv)
 end
 
 function RLBase.optimise!(p::VPG, ::PostEpisodeStage, trajectory::Trajectory)
-    trajectory.container[] = true
-    for batch in trajectory
-        RLBase.optimise!(p, batch)
-    end
-    empty!(trajectory.container)
-end
-
-function RLBase.optimise!(π::VPG, episode::Episode)
-    gain = discount_rewards(episode[:reward][:], π.γ)
-    for inds in Iterators.partition(shuffle(π.rng, 1:length(episode)), π.batch_size)
-        RLBase.optimise!(π, (state=episode[:state][inds], action=episode[:action][inds], gain=gain[inds]))
+    has_optimized = false
+    for batch in trajectory #batch is a vector of Episode
+        gains = vcat(discount_rewards(ep[:reward], p.γ) for ep in batch)
+        states = reduce(ep[:state] for ep in batch) do s, s2
+            cat(s,s2, dims = ndims(first(batch[:state])))
+        end
+        actions = reduce(ep[:action] for ep in batch) do s, s2
+            cat(s, s2, dims = ndims(first(batch[:action])))
+        end
+        for inds in Iterators.partition(shuffle(p.rng, eachindex(gains)), p.batch_size)
+            RLBase.optimise!(p, (state=selectdim(states,ndims(states),inds), action=selectdim(actions,ndims(actions),inds), gain=gains[inds]))
+        end
+        has_optimized = true
     end
+    has_optimized && empty!(trajectory.container)
+    return nothing
 end
 
 function RLBase.optimise!(p::VPG, batch::NamedTuple{(:state, :action, :gain)})
     A = p.approximator
     B = p.baseline
-    s, a, g = map(Array, batch) # !!! FIXME
+    s, a, g = batch[:state], batch[:action], batch[:gain]
     local δ
-
+    println(s)
     if isnothing(B)
         δ = normalise(g)
         loss = 0
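Both refactored methods rely on `discount_rewards` to turn an episode's reward vector into per-step discounted returns before the steps are shuffled into minibatches. As a point of reference, a self-contained version of that computation might look like the sketch below; this is an illustrative stand-in, not the helper defined in the Zoo's util.jl.

```julia
# Hypothetical stand-in for the Zoo's `discount_rewards`: reward-to-go returns
# G_t = r_t + γ * G_{t+1}, computed in a single backward pass over the episode.
function discounted_returns(rewards::AbstractVector{T}, γ::T) where {T<:AbstractFloat}
    gains = similar(rewards)
    running = zero(T)
    for t in length(rewards):-1:1
        running = rewards[t] + γ * running
        gains[t] = running
    end
    return gains
end

discounted_returns(Float32[1, 1, 1], 0.99f0)  # ≈ Float32[2.9701, 1.99, 1.0]
```

The per-episode gain vectors are then concatenated with the states and actions of the whole sampled batch, which is why the inner loop can shuffle over `eachindex(gains)` directly.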
