This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit 1246031

Multi agent changes (#82)
* add snake game
* update readme
* ignore SnakeGameEnv in test
* refactor OpenSpiel a little
* finish CFR
* comment out tests related to SnakeGames for CI
* update dependency of RLBase
1 parent 2444330 commit 1246031

File tree (4 files changed: +62 -45)

- Project.toml
- src/environments/open_spiel.jl
- src/environments/structs.jl
- test/runtests.jl

Project.toml

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 [compat]
 GR = "0.46, 0.47, 0.48, 0.49, 0.50, 0.51"
 OrdinaryDiffEq = "5"
-ReinforcementLearningBase = "0.8"
+ReinforcementLearningBase = "0.8.1"
 Requires = "1.0"
 StatsBase = "0.32, 0.33"
 julia = "1.3"
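For context on the bump: Julia's Pkg treats a bare `"0.8.1"` compat entry as the caret range `[0.8.1, 0.9.0)`, so this change only admits RLBase releases that already ship the state-style and `ActionProbPair` machinery used in the diff below (that this API landed in 0.8.1 is an inference from this commit, not something stated in it). A throwaway sketch to see what the resolver picks:

```julia
# Hypothetical one-off check; normally the [compat] entry above is all you need.
using Pkg
Pkg.activate(; temp = true)   # scratch environment (Julia 1.5+)
Pkg.add(Pkg.PackageSpec(name = "ReinforcementLearningBase", version = v"0.8.1"))
Pkg.status("ReinforcementLearningBase")
```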

src/environments/open_spiel.jl

Lines changed: 59 additions & 42 deletions
@@ -3,6 +3,8 @@ import .OpenSpiel:
     get_type,
     provides_information_state_tensor,
     provides_observation_tensor,
+    provides_information_state_string,
+    provides_observation_string,
     dynamics,
     new_initial_state,
     chance_mode,
@@ -36,35 +38,32 @@ using StatsBase: sample, weights
 # Arguments

 - `name`::`String`, you can call `ReinforcementLearningEnvironments.OpenSpiel.registered_names()` to see all the supported names. Note that the name can contains parameters, like `"goofspiel(imp_info=True,num_cards=4,points_order=descending)"`. Because the parameters part is parsed by the backend C++ code, the bool variable must be `True` or `False` (instead of `true` or `false`). Another approach is to just specify parameters in `kwargs` in the Julia style.
-- `state_type`::`Union{Symbol,Nothing}`, Supported values are [`:information`](https://github.com/deepmind/open_spiel/blob/1ad92a54f3b800394b2bc7f178ccdff62d8369e1/open_spiel/spiel.h#L342-L367), [`:observation`](https://github.com/deepmind/open_spiel/blob/1ad92a54f3b800394b2bc7f178ccdff62d8369e1/open_spiel/spiel.h#L397-L408) or `nothing`. The default value is `nothing`, which means `:information` if the game ` provides_information_state_tensor`. If not, it means `:observation`.
+- `default_state_style`::`Union{AbstractStateStyle,Nothing}`, Supported values are [`Information{<:Union{String,Array}}`](https://github.com/deepmind/open_spiel/blob/1ad92a54f3b800394b2bc7f178ccdff62d8369e1/open_spiel/spiel.h#L342-L367), [`Observation{<:Union{String,Array}}`](https://github.com/deepmind/open_spiel/blob/1ad92a54f3b800394b2bc7f178ccdff62d8369e1/open_spiel/spiel.h#L397-L408) or `nothing`.
 - `rng::AbstractRNG`, used to initial the `rng` for chance nodes. And the `rng` will only be used if the environment contains chance node, else it is set to `nothing`. To set the seed of inner environment, you may check the documentation of each specific game. Usually adding a keyword argument named `seed` should work.
 - `is_chance_agent_required::Bool=false`, by default, no chance agent is required. An internal `rng` will be used to automatically generate actions for chance node. If set to `true`, you need to feed the action of chance agent to environment explicitly. And the `seed` will be ignored.
 """
 function OpenSpielEnv(
     name;
     rng = Random.GLOBAL_RNG,
-    state_type = nothing,
+    default_state_style = nothing,
     is_chance_agent_required = false,
     kwargs...,
 )
     game = load_game(name; kwargs...)
     game_type = get_type(game)

-    has_info_state = provides_information_state_tensor(game_type)
-    has_obs_state = provides_observation_tensor(game_type)
-    has_info_state ||
-        has_obs_state ||
-        @error "the environment neither provides information tensor nor provides observation tensor"
-    if isnothing(state_type)
-        state_type = has_info_state ? :information : :observation
-    end
-
-    if state_type == :observation
-        has_obs_state || @error "the environment doesn't support state_typeof $state_type"
-    elseif state_type == :information
-        has_info_state || @error "the environment doesn't support state_typeof $state_type"
-    else
-        @error "unknown state_type $state_type"
+    if isnothing(default_state_style)
+        default_state_style = if provides_information_state_string(game_type)
+            RLBase.Information{String}()
+        elseif provides_information_state_tensor(game_type)
+            RLBase.Information{Array}()
+        elseif provides_observation_tensor(game_type)
+            Observation{Array}()
+        elseif provides_observation_string(game_type)
+            Observation{String}()
+        else
+            nothing
+        end
     end

     state = new_initial_state(game)
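A usage sketch of the reworked constructor (hedged: `kuhn_poker` is a standard OpenSpiel game name, and the `RLBase` alias is assumed to match the one used inside the package):

```julia
using ReinforcementLearningEnvironments
using OpenSpiel                       # loading OpenSpiel activates the glue code above
import ReinforcementLearningBase
const RLBase = ReinforcementLearningBase

# Let the constructor probe the game and pick a default state style:
env = OpenSpielEnv("kuhn_poker")

# Or pin the representation explicitly:
env = OpenSpielEnv("kuhn_poker"; default_state_style = RLBase.Information{String}())
```

Note the probing order in the fallback above: string-valued information states are preferred, which presumably suits the tabular CFR work this PR mentions, and `nothing` is only reached when the game advertises no representation at all.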
@@ -103,7 +102,7 @@ function OpenSpielEnv(
     end

     env =
-        OpenSpielEnv{state_type,Tuple{c,d,i,n,r,u},typeof(state),typeof(game),typeof(rng)}(
+        OpenSpielEnv{Tuple{default_state_style,c,d,i,n,r,u},typeof(state),typeof(game),typeof(rng)}(
             state,
             game,
             rng,
@@ -113,14 +112,15 @@ function OpenSpielEnv(
 end

 RLBase.ActionStyle(env::OpenSpielEnv) = FULL_ACTION_SET
-RLBase.ChanceStyle(env::OpenSpielEnv{S,Tuple{C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = C
-RLBase.InformationStyle(env::OpenSpielEnv{S,Tuple{C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = I
-RLBase.NumAgentStyle(env::OpenSpielEnv{S,Tuple{C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = N
-RLBase.RewardStyle(env::OpenSpielEnv{S,Tuple{C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = R
-RLBase.UtilityStyle(env::OpenSpielEnv{S,Tuple{C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = U
+RLBase.ChanceStyle(env::OpenSpielEnv{Tuple{S,C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = C
+RLBase.InformationStyle(env::OpenSpielEnv{Tuple{S,C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = I
+RLBase.NumAgentStyle(env::OpenSpielEnv{Tuple{S,C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = N
+RLBase.RewardStyle(env::OpenSpielEnv{Tuple{S,C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = R
+RLBase.UtilityStyle(env::OpenSpielEnv{Tuple{S,C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = U
+RLBase.DefaultStateStyle(env::OpenSpielEnv{Tuple{S,C,D,I,N,R,U}}) where {S,C,D,I,N,R,U} = S

-Base.copy(env::OpenSpielEnv{S,T,ST,G,R}) where {S,T,ST,G,R} =
-    OpenSpielEnv{S,T,ST,G,R}(copy(env.state), env.game, env.rng)
+Base.copy(env::OpenSpielEnv{T,ST,G,R}) where {T,ST,G,R} =
+    OpenSpielEnv{T,ST,G,R}(copy(env.state), env.game, env.rng)

 function RLBase.reset!(env::OpenSpielEnv)
     state = new_initial_state(env.game)
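Why these signatures work at all: trait instances like `RLBase.Information{String}()` are singleton isbits values, and Julia permits isbits values as type parameters, including inside a `Tuple` type. Each one-liner above just destructures the tuple parameter by dispatch. A toy, self-contained reproduction (the names here are stand-ins, not the RLBase types):

```julia
# Stand-ins for RLBase's trait singletons:
struct Deterministic end
struct Stochastic end

struct ToyEnv{T} end   # T plays the role of Tuple{S,C,D,I,N,R,U}

# Recover the second slot of the tuple parameter purely by dispatch:
chance_style(::ToyEnv{Tuple{S,C}}) where {S,C} = C

env = ToyEnv{Tuple{Deterministic(), Stochastic()}}()
chance_style(env)      # -> Stochastic(); resolved statically, no runtime lookup
```

Folding the former `S` parameter into the same tuple is also what lets `Base.copy` here and the struct definition in `structs.jl` below drop one type parameter.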
@@ -132,55 +132,72 @@ _sample_external_events!(::Nothing, state) = nothing

 function _sample_external_events!(rng::AbstractRNG, state)
     while is_chance_node(state)
-        outcomes_with_probs = chance_outcomes(state)
-        actions, probs = zip(outcomes_with_probs...)
-        action = actions[sample(rng, weights(collect(probs)))]
-        apply_action(state, action)
+        apply_action(state, rand(rng, reinterpret(ActionProbPair{Int, Float64}, chance_outcomes(state))).action)
     end
 end

-function (env::OpenSpielEnv)(action)
+function (env::OpenSpielEnv)(action::Int)
     apply_action(env.state, action)
     ChanceStyle(env) === STOCHASTIC && _sample_external_events!(env.rng, env.state)
 end

-RLBase.get_actions(env::OpenSpielEnv) = 0:num_distinct_actions(env.game)-1
 RLBase.get_current_player(env::OpenSpielEnv) = current_player(env.state)
 RLBase.get_chance_player(env::OpenSpielEnv) = convert(Int, OpenSpiel.CHANCE_PLAYER)
-RLBase.get_players(env::OpenSpielEnv) = 0:(num_players(env.game)-1)
-
-function Random.seed!(env::OpenSpielEnv, seed)
-    if ChanceStyle(env) === STOCHASTIC
-        Random.seed!(env.rng, seed)
-    else
-        @error "only environments of STOCHASTIC are supported, perhaps initialize the environment with a seed argument instead?"
-    end
-end
+RLBase.get_players(env::OpenSpielEnv) = get_players(env, ChanceStyle(env))
+RLBase.get_players(env::OpenSpielEnv, ::Any) = 0:(num_players(env.game)-1)
+RLBase.get_players(env::OpenSpielEnv, ::Union{ExplicitStochastic, SampledStochastic}) = (get_chance_player(env), 0:(num_players(env.game)-1)...)
+RLBase.get_num_players(env::OpenSpielEnv) = length(get_players(env))
+
+function RLBase.get_actions(env::OpenSpielEnv, player)
+    if player == get_chance_player(env)
+        reinterpret(ActionProbPair{Int, Float64}, chance_outcomes(env.state))
+    else
+        0:num_distinct_actions(env.game)-1
+    end
+end

-RLBase.get_legal_actions(env::OpenSpielEnv, player) = legal_actions(env.state, player)
+function RLBase.get_legal_actions(env::OpenSpielEnv, player)
+    if player == get_chance_player(env)
+        reinterpret(ActionProbPair{Int, Float64}, chance_outcomes(env.state))
+    else
+        legal_actions(env.state, player)
+    end
+end

 function RLBase.get_legal_actions_mask(env::OpenSpielEnv, player)
-    n = player == convert(Int, OpenSpiel.CHANCE_PLAYER) ? max_chance_outcomes(env.game) :
-        num_distinct_actions(env.game)
+    n = player == get_chance_player(env) ? max_chance_outcomes(env.game) : num_distinct_actions(env.game)
     mask = BitArray(undef, n)
     for a in legal_actions(env.state, player)
         mask[a+1] = true
     end
     mask
 end

+function Random.seed!(env::OpenSpielEnv, seed)
+    if ChanceStyle(env) === STOCHASTIC
+        Random.seed!(env.rng, seed)
+    else
+        @error "only environments of STOCHASTIC are supported, perhaps initialize the environment with a seed argument instead?"
+    end
+end
+
 RLBase.get_terminal(env::OpenSpielEnv, player) = OpenSpiel.is_terminal(env.state)

 function RLBase.get_reward(env::OpenSpielEnv, player)
     if DynamicStyle(env) === SIMULTANEOUS &&
        player == convert(Int, OpenSpiel.SIMULTANEOUS_PLAYER)
         rewards(env.state)
+    elseif player == get_chance_player(env)
+        0 # ??? type stable
     else
         player_reward(env.state, player)
     end
 end

-RLBase.get_state(env::OpenSpielEnv) = env.state
-RLBase.get_state(env::OpenSpielEnv, player::Integer) = env.state
+RLBase.get_state(env::OpenSpielEnv, player::Integer) = get_state(env, DefaultStateStyle(env), player)
+RLBase.get_state(env::OpenSpielEnv, ::RLBase.Information{String}, player) = information_state_string(env.state, player)
+RLBase.get_state(env::OpenSpielEnv, ::RLBase.Information{Array}, player) = information_state_tensor(env.state, player)
+RLBase.get_state(env::OpenSpielEnv, ::Observation{String}, player) = observation_string(env.state, player)
+RLBase.get_state(env::OpenSpielEnv, ::Observation{Array}, player) = observation_tensor(env.state, player)

 RLBase.get_history(env::OpenSpielEnv) = history(env.state)
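The one-line body of `_sample_external_events!` packs in the key trick of this hunk: `chance_outcomes` yields action/probability pairs, `reinterpret` views them as RLBase's `ActionProbPair{Int,Float64}`, and `rand` on that view is expected to draw an outcome proportionally to its probability (presumably the overload added in RLBase 0.8.1, hence the compat bump; not verifiable from this diff alone). A standalone sketch of the equivalent weighted draw, mirroring the deleted `zip`/`sample`/`weights` lines:

```julia
using Random
using StatsBase: sample, weights

# Stand-in for RLBase.ActionProbPair{Int,Float64}:
struct ActionProb
    action::Int
    prob::Float64
end

# Draw one action with probability proportional to `prob`:
draw(rng::AbstractRNG, outcomes::Vector{ActionProb}) =
    outcomes[sample(rng, weights([o.prob for o in outcomes]))].action

rng = MersenneTwister(123)
outcomes = [ActionProb(0, 0.2), ActionProb(1, 0.5), ActionProb(2, 0.3)]
draw(rng, outcomes)   # returns 0, 1, or 2, weighted 0.2 / 0.5 / 0.3
```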

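Likewise, a short usage sketch for the new state accessors (signatures taken from the hunk above; `env` as constructed earlier):

```julia
# Default representation for player 0, routed through DefaultStateStyle(env):
s = get_state(env, 0)

# Bypass the default and choose a representation per call:
s_str    = get_state(env, RLBase.Information{String}(), 0)  # info-set string
s_tensor = get_state(env, RLBase.Observation{Array}(), 0)   # observation tensor
```

Dispatching on a style singleton keeps all four OpenSpiel accessors (`information_state_string`, `information_state_tensor`, `observation_string`, `observation_tensor`) behind the single `get_state` entry point.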
src/environments/structs.jl

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ mutable struct MDPEnv{M,S,A,R} <: AbstractEnv
 end
 export MDPEnv

-mutable struct OpenSpielEnv{S,T,ST,G,R} <: AbstractEnv
+mutable struct OpenSpielEnv{T,ST,G,R} <: AbstractEnv
     state::ST
     game::G
     rng::R

test/runtests.jl

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ using PyCall
 using POMDPs
 using POMDPModels
 using OpenSpiel
-using SnakeGames
+# using SnakeGames
 using Random

 @testset "ReinforcementLearningEnvironments" begin
