
Automatic JuliaFormatter.jl run #108

Merged: 1 commit, merged on Dec 15, 2020
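
This PR was produced by the repository's formatting automation. For anyone reproducing such a run locally, here is a minimal sketch using JuliaFormatter's exported `format` function; the bot's exact options are not recorded in this PR, so the bare call below is an assumption, not its actual configuration.

```julia
# Minimal local reproduction sketch (assumed invocation; the bot's exact
# options are not part of this PR).
using JuliaFormatter

# `format` rewrites .jl files in place and returns `true` when every file
# was already formatted, `false` if it had to change anything.
if !format("src")
    @info "Files under src/ were reformatted; review and commit the diff."
end
```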
31 changes: 20 additions & 11 deletions src/base.jl
@@ -6,7 +6,8 @@ function env_traits()
[eval(x) for x in RLBase.ENV_API if endswith(String(x), "Style")]
end

Base.show(io::IO, t::MIME"text/plain", env::AbstractEnv) = show(io, MIME"text/markdown"(), env)
Base.show(io::IO, t::MIME"text/plain", env::AbstractEnv) =
show(io, MIME"text/markdown"(), env)

function Base.show(io::IO, t::MIME"text/markdown", env::AbstractEnv)
show(io, t, Markdown.parse("""
@@ -62,7 +63,11 @@ function test_interfaces(env)

rng = Random.MersenneTwister(666)

@info "testing $(nameof(env)), you need to manually check these traits to make sure they are implemented correctly!" NumAgentStyle(env) DynamicStyle(env) ActionStyle(env) InformationStyle(env) StateStyle(env) RewardStyle(env) UtilityStyle(env) ChanceStyle(env)
@info "testing $(nameof(env)), you need to manually check these traits to make sure they are implemented correctly!" NumAgentStyle(
env,
) DynamicStyle(env) ActionStyle(env) InformationStyle(env) StateStyle(env) RewardStyle(
env,
) UtilityStyle(env) ChanceStyle(env)

reset!(env)

@@ -99,7 +104,7 @@ function test_interfaces(env)

@testset "SingleAgent" begin
if NumAgentStyle(env) === SINGLE_AGENT
total_reward = 0.
total_reward = 0.0
while !is_terminated(env)
if StateStyle(env) isa Tuple
for ss in StateStyle(env)
@@ -111,7 +116,8 @@
if ActionStyle(env) === MINIMAL_ACTION_SET
action_space(env) == legal_action_space
elseif ActionStyle(env) === FULL_ACTION_SET
@test legal_action_space(env) == action_space(env)[legal_action_space_mask(env)]
@test legal_action_space(env) ==
action_space(env)[legal_action_space_mask(env)]
else
@error "TODO:"
end
@@ -133,7 +139,7 @@
@testset "MultiAgent" begin
if NumAgentStyle(env) isa MultiAgent
reset!(env)
rewards = [0. for p in players(env)]
rewards = [0.0 for p in players(env)]
while !is_terminated(env)
if InformationStyle(env) === PERFECT_INFORMATION
for p in players(env)
@@ -142,7 +148,7 @@ end
end
a = rand(rng, legal_action_space(env))
env(a)
for (i,p) in enumerate(players(env))
for (i, p) in enumerate(players(env))
@test state(env, p) ∈ state_space(env, p)
rewards[i] += reward(env, p)
end
@@ -158,7 +164,7 @@
# @test isempty(legal_action_space(env, p))
end
if RewardStyle(env) === TERMINAL_REWARD
for (p,r) in zip(players(env), rewards)
for (p, r) in zip(players(env), rewards)
@test r == reward(env, p)
end
end
@@ -207,10 +213,10 @@ function gen_traits_table(io, envs)
print(io, "<th> $(i) </th>")
end

for k in sort(collect(keys(trait_dict)), by=nameof)
for k in sort(collect(keys(trait_dict)), by = nameof)
vs = trait_dict[k]
print(io, "<tr> <th rowspan=\"$(length(vs))\"> $(nameof(k)) </th>")
for (i,v) in enumerate(vs)
for (i, v) in enumerate(vs)
if i != 1
print(io, "<tr> ")
end
@@ -239,7 +245,10 @@ function gen_traits_table(io, envs)

print(io, "<ol>")
for env in envs
println(io, "<li> <a href=\"https://github.com/JuliaReinforcementLearning/ReinforcementLearningBase.jl/tree/master/src/examples/$(nameof(env)).jl\"> $(nameof(env)) </a></li>")
println(
io,
"<li> <a href=\"https://github.com/JuliaReinforcementLearning/ReinforcementLearningBase.jl/tree/master/src/examples/$(nameof(env)).jl\"> $(nameof(env)) </a></li>",
)
end
print(io, "</ol>")
end
@@ -255,4 +264,4 @@ watch https://github.com/JuliaMath/IntervalSets.jl/issues/66
"""
function Base.in(x::AbstractArray, s::Array{<:Interval})
size(x) == size(s) && all(x .∈ s)
end
end
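
Aside, not part of the diff: the `Base.in` method at the bottom of this file tests a point against a box of intervals, coordinate by coordinate. A hedged usage sketch with hypothetical values, assuming ReinforcementLearningBase and IntervalSets are loaded:

```julia
# Usage sketch for the Base.in extension above (hypothetical values).
using IntervalSets  # provides the `..` interval constructor

x = [0.5, 2.0]              # a point
s = [0.0..1.0, 0.0..3.0]    # one Interval per coordinate
@assert x in s              # true: sizes match and each coordinate lies in its interval
```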
54 changes: 28 additions & 26 deletions src/examples/KuhnPokerEnv.jl
@@ -1,16 +1,16 @@
export KuhnPokerEnv

const KUHN_POKER_CARDS = (:J, :Q, :K)
const KUHN_POKER_CARD_COMBINATIONS = ((:J, :Q), (:J, :K), (:Q, :J), (:Q, :K), (:K, :J), (:K, :Q))
const KUHN_POKER_CARD_COMBINATIONS =
((:J, :Q), (:J, :K), (:Q, :J), (:Q, :K), (:K, :J), (:K, :Q))
const KUHN_POKER_ACTIONS = (:pass, :bet)
const KUHN_POKER_STATES = (
(),
map(tuple, KUHN_POKER_CARDS)...,
KUHN_POKER_CARD_COMBINATIONS...,
(
(cards..., actions...)
for cards in ((), map(tuple, KUHN_POKER_CARDS)...)
for actions in (
(cards..., actions...) for cards in ((), map(tuple, KUHN_POKER_CARDS)...) for
actions in (
(),
(:bet,),
(:bet, :bet),
@@ -21,7 +21,7 @@ const KUHN_POKER_STATES = (
(:pass, :bet, :pass),
(:pass, :bet, :bet),
)
)...
)...,
)

"""
@@ -34,28 +34,24 @@ const KUHN_POKER_REWARD_TABLE = Dict(
(:Q, :K, :bet, :bet) => -2,
(:K, :J, :bet, :bet) => 2,
(:K, :Q, :bet, :bet) => 2,

(:J, :Q, :bet, :pass) => 1,
(:J, :K, :bet, :pass) => 1,
(:Q, :J, :bet, :pass) => 1,
(:Q, :K, :bet, :pass) => 1,
(:K, :J, :bet, :pass) => 1,
(:K, :Q, :bet, :pass) => 1,

(:J, :Q, :pass, :pass) => -1,
(:J, :K, :pass, :pass) => -1,
(:Q, :J, :pass, :pass) => 1,
(:Q, :K, :pass, :pass) => -1,
(:K, :J, :pass, :pass) => 1,
(:K, :Q, :pass, :pass) => 1,

(:J, :Q, :pass, :bet, :pass) => -1,
(:J, :K, :pass, :bet, :pass) => -1,
(:Q, :J, :pass, :bet, :pass) => -1,
(:Q, :K, :pass, :bet, :pass) => -1,
(:K, :J, :pass, :bet, :pass) => -1,
(:K, :Q, :pass, :bet, :pass) => -1,

(:J, :Q, :pass, :bet, :bet) => -2,
(:J, :K, :pass, :bet, :bet) => -2,
(:Q, :J, :pass, :bet, :bet) => 2,
@@ -88,7 +84,9 @@ function reset!(env::KuhnPokerEnv)
empty!(env.actions)
end

is_terminated(env::KuhnPokerEnv) = length(env.actions) == 2 && (env.actions[1] == :bet || env.actions[2] == :pass) || length(env.actions) == 3
is_terminated(env::KuhnPokerEnv) =
length(env.actions) == 2 && (env.actions[1] == :bet || env.actions[2] == :pass) ||
length(env.actions) == 3
players(env::KuhnPokerEnv) = 1:2

function state(env::KuhnPokerEnv, ::InformationSet{Tuple{Vararg{Symbol}}}, p::Int)
@@ -99,13 +97,16 @@ function state(env::KuhnPokerEnv, ::InformationSet{Tuple{Vararg{Symbol}}}, p::In
end
end

state(env::KuhnPokerEnv, ::InformationSet{Tuple{Vararg{Symbol}}}, ::ChancePlayer) = Tuple(env.cards)
state_space(env::KuhnPokerEnv, ::InformationSet{Tuple{Vararg{Symbol}}}, p) = KUHN_POKER_STATES
state(env::KuhnPokerEnv, ::InformationSet{Tuple{Vararg{Symbol}}}, ::ChancePlayer) =
Tuple(env.cards)
state_space(env::KuhnPokerEnv, ::InformationSet{Tuple{Vararg{Symbol}}}, p) =
KUHN_POKER_STATES

action_space(env::KuhnPokerEnv, ::Int) = Base.OneTo(length(KUHN_POKER_ACTIONS))
action_space(env::KuhnPokerEnv, ::ChancePlayer) = Base.OneTo(length(KUHN_POKER_CARDS))

legal_action_space(env::KuhnPokerEnv, p::ChancePlayer) = [x for x in action_space(env,p) if KUHN_POKER_CARDS[x] ∉ env.cards]
legal_action_space(env::KuhnPokerEnv, p::ChancePlayer) =
[x for x in action_space(env, p) if KUHN_POKER_CARDS[x] ∉ env.cards]

function legal_action_space_mask(env::KuhnPokerEnv, p::ChancePlayer)
m = fill(true, 3)
@@ -115,9 +116,9 @@ end

function prob(env::KuhnPokerEnv, ::ChancePlayer)
if length(env.cards) == 0
fill(1/3, 3)
fill(1 / 3, 3)
elseif length(env.cards) == 1
p = fill(1/2, 3)
p = fill(1 / 2, 3)
p[env.cards[1]] = 0
else
@error "it's not chance player's turn!"
@@ -138,16 +139,17 @@ function reward(env::KuhnPokerEnv, p)
end
end

current_player(env::KuhnPokerEnv) = if length(env.cards) < 2
CHANCE_PLAYER
elseif length(env.actions) == 0
1
elseif length(env.actions) == 1
2
elseif length(env.actions) == 2
1
else
end
current_player(env::KuhnPokerEnv) =
if length(env.cards) < 2
CHANCE_PLAYER
elseif length(env.actions) == 0
1
elseif length(env.actions) == 1
2
elseif length(env.actions) == 2
1
else
end

NumAgentStyle(::KuhnPokerEnv) = MultiAgent(2)
DynamicStyle(::KuhnPokerEnv) = SEQUENTIAL
@@ -156,4 +158,4 @@ InformationStyle(::KuhnPokerEnv) = IMPERFECT_INFORMATION
StateStyle(::KuhnPokerEnv) = InformationSet{Tuple{Vararg{Symbol}}}()
RewardStyle(::KuhnPokerEnv) = TERMINAL_REWARD
UtilityStyle(::KuhnPokerEnv) = ZERO_SUM
ChanceStyle(::KuhnPokerEnv) = EXPLICIT_STOCHASTIC
ChanceStyle(::KuhnPokerEnv) = EXPLICIT_STOCHASTIC
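
To sanity-check the reformatted `is_terminated` rule above: a Kuhn poker hand ends after two actions unless the sequence is (:pass, :bet), which grants the first player one more decision. A small self-contained check, not part of the diff:

```julia
# Terminal action sequences in Kuhn poker, per the is_terminated rule above.
is_done(actions) =
    length(actions) == 2 && (actions[1] == :bet || actions[2] == :pass) ||
    length(actions) == 3

@assert is_done((:bet, :bet)) && is_done((:bet, :pass)) && is_done((:pass, :pass))
@assert !is_done((:pass, :bet))                 # player 1 still gets to respond
@assert is_done((:pass, :bet, :pass)) && is_done((:pass, :bet, :bet))
```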
12 changes: 6 additions & 6 deletions src/examples/MontyHallEnv.jl
@@ -1,13 +1,13 @@
export MontyHallEnv

const REWARD_OF_GOAT = 10.
const REWARD_OF_CAR = 1_000.
const REWARD_OF_GOAT = 10.0
const REWARD_OF_CAR = 1_000.0

mutable struct MontyHallEnv <: AbstractEnv
doors::Vector{Symbol}
rng::AbstractRNG
guest_action::Union{Nothing,Int}
host_action::Union{Nothing, Int}
host_action::Union{Nothing,Int}
reward::Union{Nothing,Float64}
end

@@ -24,7 +24,7 @@ Quoted from [wiki](https://en.wikipedia.org/wiki/Monty_Hall_problem):

Here we'll introduce the first environment which is of [`FULL_ACTION_SET`](@ref).
"""
function MontyHallEnv(;rng=Random.GLOBAL_RNG)
function MontyHallEnv(; rng = Random.GLOBAL_RNG)
doors = fill(:goat, 3)
doors[rand(rng, 1:3)] = :car
MontyHallEnv(doors, rng, nothing, nothing, nothing)
@@ -94,7 +94,7 @@ function (env::MontyHallEnv)(action)
end
end

reward(env::MontyHallEnv) = isnothing(env.reward) ? 0. : env.reward
reward(env::MontyHallEnv) = isnothing(env.reward) ? 0.0 : env.reward

is_terminated(env::MontyHallEnv) = !isnothing(env.reward)

@@ -113,4 +113,4 @@ InformationStyle(::MontyHallEnv) = IMPERFECT_INFORMATION # the distribution of
StateStyle(::MontyHallEnv) = Observation{Int}()
RewardStyle(::MontyHallEnv) = TERMINAL_REWARD
UtilityStyle(::MontyHallEnv) = GENERAL_SUM
ChanceStyle(::MontyHallEnv) = STOCHASTIC # the same action lead to different reward each time.
ChanceStyle(::MontyHallEnv) = STOCHASTIC # the same action lead to different reward each time.
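
The docstring above introduces MontyHallEnv as the first [`FULL_ACTION_SET`](@ref) environment: its legal actions can shrink to a strict subset of the full action space once the host opens a door. A hedged interaction sketch, assuming ReinforcementLearningBase is loaded:

```julia
env = MontyHallEnv()
env(rand(legal_action_space(env)))  # guest's first pick; the host then opens a goat door
# Invariant checked by test_interfaces for FULL_ACTION_SET environments:
@assert legal_action_space(env) == action_space(env)[legal_action_space_mask(env)]
```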
12 changes: 3 additions & 9 deletions src/examples/MultiArmBanditsEnv.jl
@@ -20,15 +20,9 @@ This is a **one-shot** game. The environment terminates immediately after taking
in an action. Here we use it to demonstrate how to write a customized
environment with only minimal interfaces defined.
"""
function MultiArmBanditsEnv(;true_reward=0., k = 10, rng=Random.GLOBAL_RNG)
function MultiArmBanditsEnv(; true_reward = 0.0, k = 10, rng = Random.GLOBAL_RNG)
true_values = true_reward .+ randn(rng, k)
MultiArmBanditsEnv(
true_reward,
true_values,
rng,
0.,
false
)
MultiArmBanditsEnv(true_reward, true_values, rng, 0.0, false)
end

"""
Expand Down Expand Up @@ -103,4 +97,4 @@ InformationStyle(::MultiArmBanditsEnv) = IMPERFECT_INFORMATION # the distributi
StateStyle(::MultiArmBanditsEnv) = Observation{Int}()
RewardStyle(::MultiArmBanditsEnv) = TERMINAL_REWARD
UtilityStyle(::MultiArmBanditsEnv) = GENERAL_SUM
ChanceStyle(::MultiArmBanditsEnv) = STOCHASTIC # the same action lead to different reward each time.
ChanceStyle(::MultiArmBanditsEnv) = STOCHASTIC # the same action lead to different reward each time.
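
As the docstring notes, MultiArmBanditsEnv is one-shot: a single pull ends the episode. A hedged sketch, assuming ReinforcementLearningBase is loaded:

```julia
env = MultiArmBanditsEnv()      # defaults from the diff: true_reward = 0.0, k = 10
env(rand(action_space(env)))    # pull one arm...
@assert is_terminated(env)      # ...and the episode ends immediately
reward(env)                     # noisy reward drawn around that arm's true value
```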
14 changes: 8 additions & 6 deletions src/examples/PigEnv.jl
@@ -17,7 +17,7 @@ See [wiki](https://en.wikipedia.org/wiki/Pig_(dice_game)) for explanation of thi

Here we use it to demonstrate how to write a game with more than 2 players.
"""
PigEnv(;n_players=2) = PigEnv{n_players}(zeros(Int, n_players), 1, false, 0)
PigEnv(; n_players = 2) = PigEnv{n_players}(zeros(Int, n_players), 1, false, 0)

function reset!(env::PigEnv)
fill!(env.scores, 0)
@@ -26,15 +26,17 @@ function reset!(env::PigEnv)
env.tmp_score = 0
end

current_player(env::PigEnv) = env.is_chance_player_active ? CHANCE_PLAYER : env.current_player
current_player(env::PigEnv) =
env.is_chance_player_active ? CHANCE_PLAYER : env.current_player
players(env::PigEnv) = 1:length(env.scores)
action_space(env::PigEnv, ::Int) = (:roll, :hold)
action_space(env::PigEnv, ::ChancePlayer) = Base.OneTo(PIG_N_SIDES)

prob(env::PigEnv, ::ChancePlayer) = fill(1/6, 6) # TODO: uniform distribution, more memory efficient
prob(env::PigEnv, ::ChancePlayer) = fill(1 / 6, 6) # TODO: uniform distribution, more memory efficient

state(env::PigEnv, ::Observation{Vector{Int}}, p) = env.scores
state_space(env::PigEnv, ::Observation, p) = [0..(PIG_TARGET_SCORE+PIG_N_SIDES-1) for _ in env.scores]
state_space(env::PigEnv, ::Observation, p) =
[0..(PIG_TARGET_SCORE + PIG_N_SIDES - 1) for _ in env.scores]

is_terminated(env::PigEnv) = any(s >= PIG_TARGET_SCORE for s in env.scores)

@@ -75,11 +77,11 @@ function (env::PigEnv)(action, ::ChancePlayer)
end
end

NumAgentStyle(::PigEnv{N}) where N = MultiAgent(N)
NumAgentStyle(::PigEnv{N}) where {N} = MultiAgent(N)
DynamicStyle(::PigEnv) = SEQUENTIAL
ActionStyle(::PigEnv) = MINIMAL_ACTION_SET
InformationStyle(::PigEnv) = PERFECT_INFORMATION
StateStyle(::PigEnv) = Observation{Vector{Int}}()
RewardStyle(::PigEnv) = TERMINAL_REWARD
UtilityStyle(::PigEnv) = CONSTANT_SUM
ChanceStyle(::PigEnv) = EXPLICIT_STOCHASTIC
ChanceStyle(::PigEnv) = EXPLICIT_STOCHASTIC
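
The TODO next to `prob(env::PigEnv, ::ChancePlayer)` asks for a more memory-efficient uniform distribution than `fill(1 / 6, 6)`. One possible direction, sketched with FillArrays as a suggestion rather than anything this PR does:

```julia
# Hedged sketch for the TODO above: FillArrays.Fill stores one value plus a
# length, so the uniform die distribution needs O(1) memory.
using FillArrays

die_probs = Fill(1 / 6, 6)    # behaves like a 6-element AbstractVector
@assert sum(die_probs) ≈ 1.0
```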
10 changes: 5 additions & 5 deletions src/examples/RandomWalk1D.jl
@@ -16,7 +16,7 @@ Compared to the [`MultiArmBanditsEnv`](@ref):
Base.@kwdef mutable struct RandomWalk1D <: AbstractEnv
rewards::Pair{Float64,Float64} = -1.0 => 1.0
N::Int = 7
start_pos::Int = (N+1) ÷ 2
start_pos::Int = (N + 1) ÷ 2
pos::Int = start_pos
end

@@ -28,9 +28,9 @@ action_space(::RandomWalk1D) = Base.OneTo(length(ACTIONS_OF_RANDOMWALK1D))

function (env::RandomWalk1D)(action::Symbol)
if action == :left
env.pos = max(env.pos-1, 1)
env.pos = max(env.pos - 1, 1)
elseif action == :right
env.pos = min(env.pos+1, env.N)
env.pos = min(env.pos + 1, env.N)
else
@error "invalid action: $action"
end
@@ -47,7 +47,7 @@ function reward(env::RandomWalk1D)
elseif env.pos == env.N
last(env.rewards)
else
0.
0.0
end
end

@@ -58,4 +58,4 @@ InformationStyle(::RandomWalk1D) = PERFECT_INFORMATION
StateStyle(::RandomWalk1D) = Observation{Int}()
RewardStyle(::RandomWalk1D) = TERMINAL_REWARD
UtilityStyle(::RandomWalk1D) = GENERAL_SUM
ChanceStyle(::RandomWalk1D) = DETERMINISTIC
ChanceStyle(::RandomWalk1D) = DETERMINISTIC
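
Finally, a hedged end-to-end rollout of RandomWalk1D using the common RLBase interface (assumes ReinforcementLearningBase is loaded; `is_terminated` is not shown in this file's hunks but is part of the standard AbstractEnv API):

```julia
env = RandomWalk1D()                 # N = 7, start at pos (N + 1) ÷ 2 = 4
while !is_terminated(env)
    env(rand([:left, :right]))       # the functor above accepts :left / :right
end
@assert reward(env) in (-1.0, 1.0)   # terminal reward from env.rewards
```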