Skip to content
This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit 82a21f3

Browse files
authored
update Hanabi.jl (#12)
* update Hanabi.jl
* support Julia 1.0
* update README.md
1 parent d5836d8 commit 82a21f3

File tree

4 files changed

+93
-83
lines changed

4 files changed

+93
-83
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ language: julia
33
os:
44
- linux
55
julia:
6+
- 1.0
67
- 1.1
78
- nightly
89
notifications:

README.md

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,24 +10,16 @@ Install:
1010
(v1.1) pkg> add https://github.com/JuliaReinforcementLearning/ReinforcementLearningEnvironments.jl
1111
```
1212

13-
**TODO:**
14-
15-
- [x] Add a Docker file for quick test.
16-
```
17-
$ docker run -it --rm juliareinforcementlearning/reinforcementlearningenvironments
18-
```
19-
- [ ] Add benchmarks.
20-
2113
## API
2214

2315
| Method | Description |
2416
| :--- | :--------- |
2517
| `observe(env, observer=:default)` | Return the observation of `env` from the view of `observer`|
2618
| `reset!(env)` | Reset `env` to an initial state|
2719
| `interact!(env, action)` | Send `action` to `env`. For some multi-agent environments, `action` can be a dictionary of actions from different agents|
20+
| **Optional Methods** | |
2821
| `action_space(env)` | Return the action space of `env` |
2922
| `observation_space(env)` | Return the observation space of `env`|
30-
| **Optional Methods** | |
3123
| `render(env)` | Show the current state of environment |
3224

3325
## Supported Environments
@@ -59,7 +51,7 @@ By default, only some basic environments are installed. If you want to use some
5951
| `AtariEnv` | [ArcadeLearningEnvironment.jl](https://github.com/JuliaReinforcementLearning/ArcadeLearningEnvironment.jl) | |
6052
| `ViZDoomEnv` | [ViZDoom.jl](https://github.com/JuliaReinforcementLearning/ViZDoom.jl) | Currently only a basic environment is supported. (By calling `basic_ViZDoom_env()`)|
6153
| `GymEnv` | [PyCall.jl](https://github.com/JuliaPy/PyCall.jl) | You need to manually install `gym` first |
62-
| `HanabiEnv` | [Hanabi.jl](https://github.com/JuliaReinforcementLearning/Hanabi.jl) | `Hanabi.jl` hasn't been registered yet. Install by `pkg> add https://github.com/JuliaReinforcementLearning/Hanabi.jl` |
54+
| `HanabiEnv` | [Hanabi.jl](https://github.com/JuliaReinforcementLearning/Hanabi.jl) | Hanabi is a turn-based multi-player environment; the API is slightly different from the environments above.|
6355

6456
**TODO:**
6557

@@ -72,7 +64,7 @@ Take the `AtariEnv` for example:
7264

7365
1. Install this package by:
7466
```julia
75-
(v1.1) pkg> add https://github.com/JuliaReinforcementLearning/ReinforcementLearningEnvironments.jl
67+
(v1.1) pkg> add ReinforcementLearningEnvironments
7668
```
7769
2. Install corresponding dependent package by:
7870
```julia

src/environments/hanabi.jl

Lines changed: 89 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
using Hanabi
22

3-
export HanabiEnv, legal_actions
3+
export HanabiEnv, legal_actions, observe, reset!, interact!
44
export PlayCard, DiscardCard, RevealColor, RevealRank, parse_move
5+
export cur_player, get_score, get_fireworks, encode_observation, encode_observation!, legal_actions!, legal_actions, get_cur_player
56

67
@enum HANABI_OBSERVATION_ENCODER_TYPE CANONICAL
78
@enum COLOR R Y G W B
@@ -11,43 +12,58 @@ export PlayCard, DiscardCard, RevealColor, RevealRank, parse_move
1112
const CHANCE_PLAYER_ID = -1
1213
const COLORS_DICT = Dict(string(x) => x for x in instances(COLOR))
1314

15+
###
### finalizers
###

# Register a finalizer on each wrapper `Ref` so the object owned by the
# Hanabi C library is released once the Julia-side value becomes
# unreachable.  The corresponding `delete_*` function is passed directly
# instead of being wrapped in an anonymous closure.
move_finalizer(x) = finalizer(delete_move, x)
history_item_finalizer(x) = finalizer(delete_history_item, x)
game_finalizer(x) = finalizer(delete_game, x)
observation_finalizer(x) = finalizer(delete_observation, x)
observation_encoder_finalizer(x) = finalizer(delete_observation_encoder, x)
state_finalizer(x) = finalizer(delete_state, x)
25+
1426
###
1527
### moves
1628
###
1729

1830
"""
    PlayCard(card_idx::Int)

Create a Hanabi move that plays the card at 1-based hand position
`card_idx`.  The returned `Ref` is registered with a finalizer.
"""
function PlayCard(card_idx::Int)
    move = Ref{HanabiMove}()
    get_play_move(card_idx - 1, move)  # the C API uses 0-based positions
    move_finalizer(move)
    return move
end
2336

2437
"""
    DiscardCard(card_idx::Int)

Create a Hanabi move that discards the card at 1-based hand position
`card_idx`.  The returned `Ref` is registered with a finalizer.
"""
function DiscardCard(card_idx::Int)
    move = Ref{HanabiMove}()
    get_discard_move(card_idx - 1, move)  # the C API uses 0-based positions
    move_finalizer(move)
    return move
end
2943

3044
"""
    RevealColor(target_offset::Int, color::COLOR)

Create a hint move revealing `color` in the hand of the player at
`target_offset` (offset relative to the current player, per the Hanabi
C API).  The returned `Ref` is registered with a finalizer.
"""
function RevealColor(target_offset::Int, color::COLOR)
    move = Ref{HanabiMove}()
    get_reveal_color_move(target_offset, color, move)
    move_finalizer(move)
    return move
end
3550

3651
"""
    RevealRank(target_offset::Int, rank::Int)

Create a hint move revealing the 1-based `rank` in the hand of the player
at `target_offset` (offset relative to the current player, per the Hanabi
C API).  The returned `Ref` is registered with a finalizer.
"""
function RevealRank(target_offset::Int, rank::Int)
    move = Ref{HanabiMove}()
    get_reveal_rank_move(target_offset, rank - 1, move)  # C API ranks are 0-based
    move_finalizer(move)
    return move
end
4157

4258
"""
    parse_move(s::String)

Parse a textual move description into the corresponding Hanabi move, or
return `nothing` when `s` matches none of the recognized forms:
`PlayCard(i)`, `DiscardCard(i)`, `RevealColor(t,C)` or `RevealRank(t,r)`.
"""
function parse_move(s::String)
    # Try each recognized pattern in turn; the first match wins.
    m = match(r"PlayCard\((?<card_idx>[1-5])\)", s)
    m === nothing || return PlayCard(parse(Int, m[:card_idx]))
    m = match(r"DiscardCard\((?<card_idx>[1-5])\)", s)
    m === nothing || return DiscardCard(parse(Int, m[:card_idx]))
    m = match(r"RevealColor\((?<target>[1-5]),(?<color>[RYGWB])\)", s)
    m === nothing || return RevealColor(parse(Int, m[:target]), COLORS_DICT[m[:color]])
    m = match(r"RevealRank\((?<target>[1-5]),(?<rank>[1-5])\)", s)
    m === nothing || return RevealRank(parse(Int, m[:target]), parse(Int, m[:rank]))
    return nothing
end
5369

@@ -73,9 +89,7 @@ end
7389

7490
"""
7591
HanabiEnv(;kw...)
76-
7792
Default game params:
78-
7993
random_start_player = false,
8094
seed = -1,
8195
max_life_tokens = 3,
@@ -86,13 +100,12 @@ colors = 5,
86100
observation_type = 1,
87101
players = 2
88102
"""
89-
mutable struct HanabiEnv <: AbstractEnv
103+
mutable struct HanabiEnv
90104
game::Base.RefValue{Hanabi.LibHanabi.PyHanabiGame}
91105
state::Base.RefValue{Hanabi.LibHanabi.PyHanabiState}
92106
moves::Vector{Base.RefValue{Hanabi.LibHanabi.PyHanabiMove}}
93107
observation_encoder::Base.RefValue{Hanabi.LibHanabi.PyHanabiObservationEncoder}
94-
observation_space::MultiDiscreteSpace{Int, 1}
95-
action_space::DiscreteSpace{Int}
108+
observation_length::Int
96109
reward::HanabiResult
97110

98111
function HanabiEnv(;kw...)
@@ -105,29 +118,30 @@ mutable struct HanabiEnv <: AbstractEnv
105118
new_game(game, length(params), params)
106119
end
107120

121+
game_finalizer(game)
122+
108123
state = Ref{HanabiState}()
124+
new_state(game, state)
125+
state_finalizer(state)
109126

110127
observation_encoder = Ref{HanabiObservationEncoder}()
111128
new_observation_encoder(observation_encoder, game, CANONICAL)
129+
observation_encoder_finalizer(observation_encoder)
112130
observation_length = parse(Int, unsafe_string(observation_shape(observation_encoder)))
113-
observation_space = MultiDiscreteSpace(ones(Int, observation_length), zeros(Int, observation_length))
114131

115132
n_moves = max_moves(game)
116-
action_space = DiscreteSpace(Int(n_moves))
117133
moves = [Ref{HanabiMove}() for _ in 1:n_moves]
118134
for i in 1:n_moves
119135
get_move_by_uid(game, i-1, moves[i])
136+
move_finalizer(moves[i])
120137
end
121138

122-
env = new(game, state, moves, observation_encoder, observation_space, action_space, HanabiResult(Int32(0), Int32(0)))
139+
env = new(game, state, moves, observation_encoder, observation_length, HanabiResult(Int32(0), Int32(0)))
123140
reset!(env) # reset immediately
124141
env
125142
end
126143
end
127144

128-
observation_space(env::HanabiEnv) = env.observation_space
129-
action_space(env::HanabiEnv) = env.action_space
130-
131145
# Wrap `x` between two bars of 25 repetitions of `sep` (section-header helper).
function line_sep(x, sep="=")
    bar = repeat(sep, 25)
    return bar * x * bar
end
132146

133147
function Base.show(io::IO, env::HanabiEnv)
@@ -139,11 +153,22 @@ function Base.show(io::IO, env::HanabiEnv)
139153
""")
140154
end
141155

156+
"""
    highlight(s)

Return a copy of `s` where each Hanabi color letter (`R`, `G`, `B`, `Y`,
`W`) is wrapped in the matching ANSI color escape followed by a reset to
the default color.
"""
function highlight(s)
    # The escape sequences contain no uppercase letters, so sequential
    # replacement cannot re-match earlier substitutions.
    for (letter, colorname) in ("R" => :red, "G" => :green, "B" => :blue,
                                "Y" => :yellow, "W" => :white)
        colored = Base.text_colors[colorname] * letter * Base.text_colors[:default]
        s = replace(s, letter => colored)
    end
    return s
end
164+
142165
# Pretty-printers for the wrapped C handles: the game prints its parameter
# string; state and observation print their textual rendering on a new
# line with the card letters colorized via `highlight`.
function Base.show(io::IO, game::Base.RefValue{Hanabi.LibHanabi.PyHanabiGame})
    print(io, unsafe_string(game_param_string(game)))
end

function Base.show(io::IO, state::Base.RefValue{Hanabi.LibHanabi.PyHanabiState})
    print(io, highlight("\n" * unsafe_string(state_to_string(state))))
end

function Base.show(io::IO, obs::Base.RefValue{Hanabi.LibHanabi.PyHanabiObservation})
    print(io, highlight("\n" * unsafe_string(obs_to_string(obs))))
end
145168

146169
function reset!(env::HanabiEnv)
170+
env.state = Ref{HanabiState}()
171+
state_finalizer(env.state)
147172
new_state(env.game, env.state)
148173
while state_cur_player(env.state) == CHANCE_PLAYER_ID
149174
state_deal_random_card(env.state)
@@ -167,27 +192,27 @@ function interact!(env::HanabiEnv, move::Base.RefValue{Hanabi.LibHanabi.PyHanabi
167192
new_score = state_score(env.state)
168193
env.reward.player = player
169194
env.reward.score_gain = new_score - old_score
195+
nothing
196+
end
170197

171-
observation = Ref{HanabiObservation}()
172-
new_observation(env.state, player, observation)
198+
function observe(env::HanabiEnv, observer=state_cur_player(env.state))
199+
raw_obs = Ref{HanabiObservation}()
200+
observation_finalizer(raw_obs)
201+
new_observation(env.state, observer, raw_obs)
173202

174-
(observation = _encode_observation(observation, env),
175-
reward = env.reward.score_gain,
203+
(observation = raw_obs,
204+
reward = env.reward.player == observer ? env.reward.score_gain : Int32(0),
176205
isdone = state_end_of_game_status(env.state) != Int(NOT_FINISHED),
177-
raw_obs = observation)
206+
game = env.game)
178207
end
179208

180-
function observe(env::HanabiEnv, observer=state_cur_player(env.state))
181-
observation = Ref{HanabiObservation}()
182-
new_observation(env.state, observer, observation)
183-
(observation = _encode_observation(observation, env),
184-
reward = env.reward.player == observer ? env.reward.score_gain : Int32(0),
185-
isdone = state_end_of_game_status(env.state) != Int(NOT_FINISHED),
186-
raw_obs = observation)
209+
"""
    encode_observation(obs, env)

Encode the raw observation `obs` into a freshly allocated `Vector{Int32}`
of length `env.observation_length` and return it.
"""
function encode_observation(obs, env)
    buffer = Vector{Int32}(undef, env.observation_length)
    encode_obs(env.observation_encoder, obs, buffer)
    return buffer
end
188214

189-
function _encode_observation(obs, env)
190-
encoding = Vector{Int32}(undef, length(env.observation_space.low))
215+
"""
    encode_observation!(obs, env, encoding)

Encode the raw observation `obs` into the preallocated buffer `encoding`
in place and return `encoding`.
"""
function encode_observation!(obs, env, encoding)
    encode_obs(env.observation_encoder, obs, encoding)
    return encoding
end
@@ -196,6 +221,9 @@ end
196221
### Some Useful APIs
197222
###
198223

224+
# Total score of the current game state.
function get_score(env::HanabiEnv)
    return state_score(env.state)
end

# 0-based id of the player whose turn it is.
function cur_player(env::HanabiEnv)
    return state_cur_player(env.state)
end
226+
199227
function legal_actions(env::HanabiEnv)
200228
actions = Int32[]
201229
for (i, move) in enumerate(env.moves)
@@ -206,44 +234,39 @@ function legal_actions(env::HanabiEnv)
206234
actions
207235
end
208236

209-
function get_card_knowledge(obs)
210-
knowledges = []
211-
for pid in 0:obs_num_players(obs)-1
212-
hand_kd = []
213-
for i in 0:obs_get_hand_size(obs, pid) - 1
214-
kd = Ref{HanabiCardKnowledge}()
215-
obs_get_hand_card_knowledge(obs, pid, i, kd)
216-
push!(
217-
hand_kd,
218-
Dict{String, Any}(
219-
"color" => color_was_hinted(kd) > 0 ? COLOR(known_color(kd)) : nothing,
220-
"rank" => rank_was_hinted(kd) > 0 ? known_rank(kd) : nothing))
221-
end
222-
push!(knowledges, hand_kd)
237+
# Convenience dispatches: a `Bool` vector gets `true`/`false` legality
# flags; a numeric vector gets `zero(T)` for legal moves and `typemin(T)`
# for illegal ones (usable as an additive mask).
legal_actions!(env::HanabiEnv, actions::AbstractVector{Bool}) = legal_actions!(env, actions, true, false)
legal_actions!(env::HanabiEnv, actions::AbstractVector{T}) where T<:Number = legal_actions!(env, actions, zero(T), typemin(T))

"""
    legal_actions!(env::HanabiEnv, actions, legal_value, illegal_value)

Fill `actions` in place so that entry `i` is `legal_value` when move `i`
is legal in the current state of `env` and `illegal_value` otherwise;
return `actions`.
"""
function legal_actions!(env::HanabiEnv, actions, legal_value, illegal_value)
    for idx in eachindex(env.moves)
        is_legal = move_is_legal(env.state, env.moves[idx])
        actions[idx] = is_legal ? legal_value : illegal_value
    end
    return actions
end
226246

227-
function observed_hands(obs)
228-
hands = Vector{HanabiCard}[]
229-
for pid in 0:obs_num_players(obs)-1
230-
cards = HanabiCard[]
231-
for i in 0:obs_get_hand_size(obs, pid)-1
232-
card_ref = Ref{HanabiCard}()
233-
obs_get_hand_card(obs, pid, i, card_ref)
234-
push!(cards, card_ref[])
235-
end
236-
push!(hands, cards)
237-
end
238-
hands
247+
"""
    get_hand_card_knowledge(obs, pid, i)

Return a `Ref` to the hint knowledge about card `i` in player `pid`'s
hand as recorded in observation `obs` (both `pid` and `i` are 0-based,
matching the C API).
"""
function get_hand_card_knowledge(obs, pid, i)
    kd = Ref{HanabiCardKnowledge}()
    obs_get_hand_card_knowledge(obs, pid, i, kd)
    return kd
end
252+
253+
"""
    get_hand_card(obs, pid, i)

Return the card at position `i` of player `pid`'s hand in observation
`obs` (both indices 0-based, matching the C API).  The `Ref` is
dereferenced, so the bare card value is returned.
"""
function get_hand_card(obs, pid, i)
    card = Ref{HanabiCard}()
    obs_get_hand_card(obs, pid, i, card)
    return card[]
end
240258

241-
function discard_pile(obs)
242-
cards = HanabiCard[]
243-
for i in 0:obs_discard_pile_size(obs)-1
244-
card_ref = Ref{HanabiCard}()
245-
obs_get_discard(obs, i, card_ref)
246-
push!(cards, card_ref[])
259+
# 1-based rank accessors.  For card knowledge the rank is only known once
# it has been hinted; otherwise `nothing` is returned.
function rank(knowledge::Base.RefValue{Hanabi.LibHanabi.PyHanabiCardKnowledge})
    return rank_was_hinted(knowledge) != 0 ? known_rank(knowledge) + 1 : nothing
end
rank(card::Hanabi.LibHanabi.PyHanabiCard) = card.rank + 1

# Color accessors, mirroring `rank`: `nothing` until the color is hinted.
function color(knowledge::Base.RefValue{Hanabi.LibHanabi.PyHanabiCardKnowledge})
    return color_was_hinted(knowledge) != 0 ? COLOR(known_color(knowledge)) : nothing
end
color(card::Hanabi.LibHanabi.PyHanabiCard) = COLOR(card.color)
263+
264+
"""
    get_fireworks(game, observation)

Return a `Dict{COLOR, Int}` mapping every color of `game` to its firework
level as read from `observation`, shifted to be 1-based (a value of 1
means no card of that color has been played).
"""
function get_fireworks(game, observation)
    fireworks = Dict{COLOR, Int}()
    for color_id in 0:(num_colors(game) - 1)
        fireworks[COLOR(color_id)] = obs_fireworks(observation, color_id) + 1
    end
    return fireworks
end
271+
272+
# 1-based index of the player to act; `cur_player` returns the raw
# 0-based id used by the C API.
get_cur_player(env) = 1 + cur_player(env)

test/environments.jl

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,11 @@
2020

2121
function basic_env_test(env::HanabiEnv, n=100)
2222
reset!(env)
23-
os = observation_space(env)
24-
as = action_space(env)
25-
@test os isa AbstractSpace
26-
@test as isa AbstractSpace
2723
@test reset!(env) == nothing
2824
for _ in 1:n
2925
a = rand(legal_actions(env))
30-
@test a in as
3126
interact!(env, a)
3227
obs, reward, isdone = observe(env)
33-
@test obs in os
3428
if isdone
3529
reset!(env)
3630
end

0 commit comments

Comments (0)