-@testset "Samplers" begin
+import ReinforcementLearningTrajectories.fetch
+#@testset "Samplers" begin
 @testset "BatchSampler" begin
     sz = 32
     s = BatchSampler(sz)
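BatchSampler(sz) draws fixed-size batches uniformly at random. A minimal usage sketch, assuming the same sample(sampler, container) entry point exercised further down; the Traces container here is illustrative only:

    t = Traces(reward = 1:10, terminal = falses(10))
    xs = RLTrajectories.StatsBase.sample(BatchSampler(4), t)  # named tuple with 4 uniformly drawn rows per trace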
 ⋮
 
 #! format: off
 @testset "NStepSampler" begin
-    γ = 0.9
+    γ = 0.99
     n_stack = 2
     n_horizon = 3
-    batch_size = 4
-
-    t1 = MultiplexTraces{(:state, :next_state)}(1:10) +
-        MultiplexTraces{(:action, :next_action)}(iseven.(1:10)) +
-        Traces(
-            reward=1:9,
-            terminal=Bool[0, 0, 0, 1, 0, 0, 0, 0, 1],
-        )
-
-    s1 = NStepBatchSampler(n=n_horizon, γ=γ, stack_size=n_stack, batch_size=batch_size)
-
-    xs = RLTrajectories.StatsBase.sample(s1, t1)
-
-    @test size(xs.state) == (n_stack, batch_size)
-    @test size(xs.next_state) == (n_stack, batch_size)
-    @test size(xs.action) == (batch_size,)
-    @test size(xs.reward) == (batch_size,)
-    @test size(xs.terminal) == (batch_size,)
-
-
-    state_size = (2, 3)
-    n_state = reduce(*, state_size)
-    total_length = 10
-    t2 = MultiplexTraces{(:state, :next_state)}(
-            reshape(1:n_state * total_length, state_size..., total_length)
-        ) +
-        MultiplexTraces{(:action, :next_action)}(iseven.(1:total_length)) +
-        Traces(
-            reward=1:total_length-1,
-            terminal=Bool[0, 0, 0, 1, 0, 0, 0, 0, 1],
-        )
-
-    xs2 = RLTrajectories.StatsBase.sample(s1, t2)
-
-    @test size(xs2.state) == (state_size..., n_stack, batch_size)
-    @test size(xs2.next_state) == (state_size..., n_stack, batch_size)
-    @test size(xs2.action) == (batch_size,)
-    @test size(xs2.reward) == (batch_size,)
-    @test size(xs2.terminal) == (batch_size,)
-
-    inds = [3, 5, 7]
-    xs3 = RLTrajectories.StatsBase.sample(s1, t2, Val(SS′ART), inds)
-
-    @test xs3.state == cat(
-        (
-            reshape(n_state * (i - n_stack) + 1 : n_state * i, state_size..., n_stack)
-            for i in inds
-        )...;
-        dims=length(state_size) + 2
-    )
-
-    @test xs3.next_state == xs3.state .+ (n_state * n_horizon)
-    @test xs3.action == iseven.(inds)
-    @test xs3.terminal == [any(t2[:terminal][i : i+n_horizon-1]) for i in inds]
-
-    # manual calculation
-    @test xs3.reward[1] ≈ 3 + γ * 4 # terminated at step 4
-    @test xs3.reward[2] ≈ 5 + γ * (6 + γ * 7)
-    @test xs3.reward[3] ≈ 7 + γ * (8 + γ * 9)
-end
-#! format: on
-
-@testset "Trajectory with CircularPrioritizedTraces and NStepBatchSampler" begin
-    n = 1
-    γ = 0.99f0
-
-    t = Trajectory(
-        container=CircularPrioritizedTraces(
-            CircularArraySARTSTraces(
-                capacity=5,
-                state=Float32 => (4,),
-            );
-            default_priority=100.0f0
-        ),
-        sampler=NStepBatchSampler{SS′ART}(
-            n=n,
-            γ=γ,
-            batch_size=32,
-        ),
-        controller=InsertSampleRatioController(
-            threshold=100,
-            n_inserted=-1
-        )
-    )
+    batch_size = 1000
+    eb = EpisodesBuffer(CircularArraySARTSATraces(capacity=10))
+    s1 = NStepBatchSampler(eb, n=n_horizon, γ=γ, stack_size=n_stack, batch_size=batch_size)
 
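The reward expectations further down assume the truncated n-step discounted return. A plain-Julia sketch of that quantity; nstep_return is an illustrative helper, not a package function, and n is taken as already capped at the episode boundary (compare the ns vector below):

    # Discounted sum of up to n rewards starting at step t.
    nstep_return(rs, t, n, γ) = sum(γ^(k - 1) * rs[t + k - 1] for k in 1:n)

    nstep_return(1:5, 2, 3, 0.99)  # == 2 + 0.99*3 + 0.99^2*4, matching the reward test below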
-    push!(t, (state = 1, action = true))
-    for i = 1:9
-        push!(t, (state = i+1, action = true, reward = i, terminal = false))
+    push!(eb, (state = 1, action = 1))
+    for i = 1:5
+        push!(eb, (state = i+1, action = i+1, reward = i, terminal = i == 5))
     end
-
-    b = RLTrajectories.StatsBase.sample(t)
-    @test haskey(b, :priority)
-    @test sum(b.action .== 0) == 0
-end
-
-
-@testset "Trajectory with CircularArraySARTSTraces and NStepBatchSampler" begin
-    n = 1
-    γ = 0.99f0
-
-    t = Trajectory(
-        container=CircularArraySARTSTraces(
-            capacity=5,
-            state=Float32 => (4,),
-        ),
-        sampler=NStepBatchSampler{SS′ART}(
-            n=n,
-            γ=γ,
-            batch_size=32,
-        ),
-        controller=InsertSampleRatioController(
-            threshold=100,
-            n_inserted=-1
-        )
-    )
-
-    push!(t, (state = 1, action = true))
-    for i = 1:9
-        push!(t, (state = i+1, action = true, reward = i, terminal = false))
+    push!(eb, (state = 7, action = 7))
+    for (j, i) = enumerate(8:11)
+        push!(eb, (state = i, action = i, reward = i-1, terminal = false))
+    end
+    weights, ns = ReinforcementLearningTrajectories.valid_range(s1, eb)
+    @test weights == [0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]
+    @test ns == [3, 3, 3, 2, 1, -1, 3, 3, 2, 1, 0] # the -1 arises because ep_lengths[6] belongs to the second episode while step_numbers[6] still belongs to the first
+    inds = [i for i in eachindex(weights) if weights[i] == 1]
+    batch = sample(s1, eb)
+    for key in keys(eb)
+        @test haskey(batch, key)
     end
 
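One reading of the expected weights and ns, based on the pushes above (the interpretation of the zeroed positions is an assumption about the sampler's validity rule, not package documentation):

    # position:  1  2  3  4  5  6 | 7  8  9 10 11
    # state:     1  2  3  4  5  6 | 7  8  9 10 11
    #            --- episode 1 ---|--- episode 2 ---
    # weights is 0 where no valid sample can start: positions 1 and 7
    # lack a previous frame for stack_size = 2, position 6 is the
    # boundary slot between episodes, and position 11 has no step
    # ahead of it to bootstrap on. ns caps the lookahead at
    # min(n_horizon, steps remaining in the episode).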
+    # state: samples with stack_size
+    states = ReinforcementLearningTrajectories.fetch(s1, eb[:state], Val(:state), inds, ns[inds])
+    @test states == [1 2 3 4 7 8 9;
+                     2 3 4 5 8 9 10]
+    @test all(in(eachcol(states)), unique(eachcol(batch[:state])))
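The expected states matrix can be reproduced with a plain gather; stack_states is an illustrative helper, not part of the package:

    # Each sampled state is the window of the last stack_size frames
    # ending at the sampled index, one column per index.
    stack_states(trace, inds, stack_size) =
        hcat((trace[i - stack_size + 1 : i] for i in inds)...)

    stack_states(collect(1:11), [2, 3, 4, 5, 8, 9, 10], 2)
    # 2×7 Matrix equal to the expected states above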
+    # next_state: samples with stack_size, n steps forward
+    next_states = ReinforcementLearningTrajectories.fetch(s1, eb[:next_state], Val(:next_state), inds, ns[inds])
+    @test next_states == [4 5 5 5 10 10 10;
+                          5 6 6 6 11 11 11]
+    @test all(in(eachcol(next_states)), unique(eachcol(batch[:next_state])))
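The expected next_states follow from index arithmetic: the lookahead is ns[inds] steps, capped at each episode's end, so the stacked windows end at

    [2, 3, 4, 5, 8, 9, 10] .+ [3, 3, 2, 1, 3, 2, 1]  # inds .+ ns[inds] == [5, 6, 6, 6, 11, 11, 11]

and stacking frames at those endpoints reproduces the matrix above.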
+    # action: samples normally
+    actions = ReinforcementLearningTrajectories.fetch(s1, eb[:action], Val(:action), inds, ns[inds])
+    @test actions == inds
+    @test all(in(actions), unique(batch[:action]))
+    # next_action: a multiplexed trace, so it is automatically fetched n steps forward
+    next_actions = ReinforcementLearningTrajectories.fetch(s1, eb[:next_action], Val(:next_action), inds, ns[inds])
+    @test next_actions == [5, 6, 6, 6, 11, 11, 11]
+    @test all(in(next_actions), unique(batch[:next_action]))
+    # reward: discounted sum over the n-step horizon
+    rewards = ReinforcementLearningTrajectories.fetch(s1, eb[:reward], Val(:reward), inds, ns[inds])
+    @test rewards ≈ [2 + 0.99*3 + 0.99^2*4, 3 + 0.99*4 + 0.99^2*5, 4 + 0.99*5, 5, 8 + 0.99*9 + 0.99^2*10, 9 + 0.99*10, 10]
+    @test all(in(rewards), unique(batch[:reward]))
+    # terminal: fetched n steps forward
+    terminals = ReinforcementLearningTrajectories.fetch(s1, eb[:terminal], Val(:terminal), inds, ns[inds])
+    @test terminals == [0, 1, 1, 1, 0, 0, 0]
+
+    ### CircularPrioritizedTraces and NStepBatchSampler
+    γ = 0.99
+    n_horizon = 3
+    batch_size = 4
+    eb = EpisodesBuffer(CircularPrioritizedTraces(CircularArraySARTSATraces(capacity=10), default_priority = 10f0))
+    s1 = NStepBatchSampler(eb, n=n_horizon, γ=γ, batch_size=batch_size)
 
-    b = RLTrajectories.StatsBase.sample(t)
-    @test sum(b.action .== 0) == 0
+    push!(eb, (state = 1, action = 1))
+    for i = 1:5
+        push!(eb, (state = i+1, action = i+1, reward = i, terminal = i == 5))
+    end
+    push!(eb, (state = 7, action = 7))
+    for (j, i) = enumerate(8:11)
+        push!(eb, (state = i, action = i, reward = i-1, terminal = false))
+    end
+    weights, ns = ReinforcementLearningTrajectories.valid_range(s1, eb)
+    inds = [i for i in eachindex(weights) if weights[i] == 1]
+    batch = sample(s1, eb)
+    for key in (keys(eb)..., :key, :priority)
+        @test haskey(batch, key)
+    end
 end
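With CircularPrioritizedTraces the batch additionally carries :key and :priority, presumably so a learner can write updated priorities back after computing TD errors. The selection itself is priority-proportional; a self-contained sketch of that rule in plain StatsBase, not the package internals:

    using StatsBase: Weights, sample
    priorities = fill(10f0, 7)                 # fresh entries start at default_priority
    idx = sample(1:7, Weights(priorities), 4)  # draws stay uniform until priorities diverge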
 
 @testset "EpisodesSampler" begin