export MADDPGManager

"""
    MADDPGManager(; agents::Dict{<:Any, <:Agent}, args...)

Multi-agent Deep Deterministic Policy Gradient (MADDPG) implemented in Julia. Currently it only works for simultaneous games with discrete action spaces.
See the paper https://arxiv.org/abs/1706.02275 for more details.

# Keyword arguments

- `agents::Dict{<:Any, <:Agent{<:NamedPolicy{<:DDPGPolicy, <:Any}, <:AbstractTrajectory}}`, each agent collects its own information. When the policy is updated, each agent's `critic` assembles all agents' transitions to update its own network.
- `batch_size::Int`, the number of transitions sampled from each agent's trajectory per update.
- `update_freq::Int`, update the policy every `update_freq` steps.
- `update_step::Int`, a counter of the steps seen so far.
- `rng::AbstractRNG`.
"""
mutable struct MADDPGManager{P<:DDPGPolicy, T<:AbstractTrajectory, N<:Any} <: AbstractPolicy
    agents::Dict{<:N, <:Agent{<:NamedPolicy{<:P, <:N}, <:T}}
    batch_size::Int
    update_freq::Int
    update_step::Int
    rng::AbstractRNG
end

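# A rough construction sketch (not part of the original file): each player gets its own
# `Agent` wrapping a `NamedPolicy{<:DDPGPolicy}` together with a trajectory, and the
# manager is then built with the struct's default positional constructor.
# `build_ddpg_policy` and `build_trajectory` are hypothetical helpers standing in for
# the usual DDPG actor/critic and replay-buffer setup.
#
#     agents = Dict(
#         player => Agent(
#             policy = NamedPolicy(player, build_ddpg_policy(env, player)),
#             trajectory = build_trajectory(env, player),
#         )
#         for player in players(env) if player != chance_player(env)
#     )
#     manager = MADDPGManager(agents, 128, 100, 0, Random.GLOBAL_RNG)
#     # fields: agents, batch_size, update_freq, update_step (starts at 0), rng
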
# For simultaneous games with a discrete action space.
function (π::MADDPGManager)(env::AbstractEnv)
    while current_player(env) == chance_player(env)
        env |> legal_action_space |> rand |> env # sample a random action for the chance player.
    end
    # round each agent's continuous DDPG output up to a discrete action.
    Dict((player, ceil(agent.policy(env))) for (player, agent) in π.agents)
end

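# For example, with two (hypothetical) players the call above returns a joint action like
#     Dict(:player1 => 2.0, :player2 => 1.0)
# i.e. one rounded action per agent, keyed by player.
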
function (π::MADDPGManager)(stage::Union{PreEpisodeStage, PostActStage}, env::AbstractEnv)
    # only the trajectories need to be updated here.
    for (_, agent) in π.agents
        update!(agent.trajectory, agent.policy, env, stage)
    end
end

function (π::MADDPGManager)(stage::PreActStage, env::AbstractEnv, actions)
    # update each agent's trajectory.
    for (player, agent) in π.agents
        update!(agent.trajectory, agent.policy, env, stage, actions[player])
    end

    # update policy
    update!(π)
end

function (π::MADDPGManager)(stage::PostEpisodeStage, env::AbstractEnv)
    # collect the final state and a dummy action into each agent's trajectory.
    for (_, agent) in π.agents
        update!(agent.trajectory, agent.policy, env, stage)
    end

    # update policy
    update!(π)
end

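# These stage methods are the hooks invoked by the standard `run` loop between steps,
# so a minimal training sketch (assuming `env` is a compatible simultaneous game and
# `manager` was built as in the sketch above) could look like:
#
#     run(manager, env, StopAfterEpisode(10_000), EmptyHook())
#
# `StopAfterEpisode` and `EmptyHook` come from ReinforcementLearningCore; any other
# stop condition or hook can be substituted.
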
# update the policy of each agent.
function RLBase.update!(π::MADDPGManager)
    π.update_step += 1
    π.update_step % π.update_freq == 0 || return

    # wait until every agent has collected enough experience.
    for (_, agent) in π.agents
        length(agent.trajectory) > agent.policy.policy.update_after || return
        length(agent.trajectory) > π.batch_size || return
    end

    # get training data; sample the same indices from every agent's trajectory
    # so that the transitions stay aligned across agents.
    temp_player = collect(keys(π.agents))[1]
    t = π.agents[temp_player].trajectory
    inds = rand(π.rng, 1:length(t), π.batch_size)
    batches = Dict((player, RLCore.fetch!(BatchSampler{SARTS}(π.batch_size), agent.trajectory, inds))
        for (player, agent) in π.agents)

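    # `batches[player]` holds this player's sampled SARTS traces, i.e. the keys used
    # below: :state, :action, :reward, :terminal and :next_state.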
    # get s, a, s′ for the critic (all agents' information stacked together).
    s = Flux.stack((batches[player][:state] for (player, _) in π.agents), 1)
    a = Flux.stack((batches[player][:action] for (player, _) in π.agents), 1)
    s′ = Flux.stack((batches[player][:next_state] for (player, _) in π.agents), 1)

    # actions from the behavior actors, for training the behavior_actor.
    mu_actions = Flux.stack(
        ((
            batches[player][:state] |> # get personal state information
            x -> send_to_device(device(agent.policy.policy.behavior_actor), x) |>
            agent.policy.policy.behavior_actor |> send_to_host
        ) for (player, agent) in π.agents), 1
    )
    # actions from the target actors, for training the behavior_critic.
    new_actions = Flux.stack(
        ((
            batches[player][:next_state] |> # get personal next_state information
            x -> send_to_device(device(agent.policy.policy.target_actor), x) |>
            agent.policy.policy.target_actor |> send_to_host
        ) for (player, agent) in π.agents), 1
    )

    for (player, agent) in π.agents
        p = agent.policy.policy # get the DDPGPolicy struct.
        A = p.behavior_actor
        C = p.behavior_critic
        Aₜ = p.target_actor
        Cₜ = p.target_critic

        γ = p.γ
        ρ = p.ρ

        _device(x) = send_to_device(device(A), x)

        # Note: A, C, Aₜ and Cₜ are assumed to be on the same device by default.
        s, a, s′ = _device((s, a, s′))
        mu_actions = _device(mu_actions)
        new_actions = _device(new_actions)
        r = _device(batches[player][:reward])
        t = _device(batches[player][:terminal])

        # TD target computed from the target critic and the target actors.
        qₜ = Cₜ(vcat(s′, new_actions)) |> vec
        y = r .+ γ .* (1 .- t) .* qₜ

        gs1 = gradient(Flux.params(C)) do
            q = C(vcat(s, a)) |> vec
            loss = mean((y .- q) .^ 2)
            ignore() do
                p.critic_loss = loss
            end
            loss
        end

        update!(C, gs1)

        gs2 = gradient(Flux.params(A)) do
            loss = -mean(C(vcat(s, mu_actions)))
            ignore() do
                p.actor_loss = loss
            end
            loss
        end

        update!(A, gs2)

        # polyak averaging of the target networks.
        for (dest, src) in zip(Flux.params([Aₜ, Cₜ]), Flux.params([A, C]))
            dest .= ρ .* dest .+ (1 - ρ) .* src
        end
    end
end
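
# In equation form, the per-agent update above (a summary, not additional code) computes,
# with s, a, s′ stacking all agents' states and actions:
#
#     y   = r + γ * (1 - t) * Cₜ(s′, Aₜ(s′))    # TD target (t is the terminal flag)
#     L_C = mean((y - C(s, a))^2)               # critic loss
#     L_A = -mean(C(s, μ(s)))                   # actor loss, μ = the behavior actors
#     θₜ .= ρ * θₜ + (1 - ρ) * θ                # soft update of each target network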