export MADDPGManager

"""
    MADDPGManager(; agents::Dict{<:Any, <:Agent}, args...)

Multi-agent Deep Deterministic Policy Gradient (MADDPG) implemented in Julia. Currently it only works for simultaneous games with discrete action spaces.
See the paper https://arxiv.org/abs/1706.02275 for more details.

# Keyword arguments

- `agents::Dict{<:Any, <:Agent{<:NamedPolicy{<:DDPGPolicy, <:Any}, <:AbstractTrajectory}}`, each agent collects its own information. When the policy is updated, each agent's `critic` assembles all agents' transitions to update its own network.
- `batch_size::Int`, the number of transitions sampled from each agent's trajectory per update.
- `update_freq::Int`, update the policy every `update_freq` steps.
- `update_step::Int`, a counter of the steps seen so far.
- `rng::AbstractRNG`.
"""
mutable struct MADDPGManager{P<:DDPGPolicy, T<:AbstractTrajectory, N<:Any} <: AbstractPolicy
    agents::Dict{<:N, <:Agent{<:NamedPolicy{<:P, <:N}, <:T}}
    batch_size::Int
    update_freq::Int
    update_step::Int
    rng::AbstractRNG
end

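# A rough construction sketch (not part of the original file): each player gets its own
# `Agent` wrapping a `NamedPolicy{<:DDPGPolicy}` together with a trajectory, and the
# manager is then built with the struct's default positional constructor.
# `build_ddpg_policy` and `build_trajectory` are hypothetical helpers standing in for
# the usual DDPG actor/critic and replay-buffer setup.
#
#     agents = Dict(
#         player => Agent(
#             policy = NamedPolicy(player, build_ddpg_policy(env, player)),
#             trajectory = build_trajectory(env, player),
#         )
#         for player in players(env) if player != chance_player(env)
#     )
#     manager = MADDPGManager(agents, 128, 100, 0, Random.GLOBAL_RNG)
#     # fields: agents, batch_size, update_freq, update_step (starts at 0), rng
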
# For simultaneous games with a discrete action space.
function (π::MADDPGManager)(env::AbstractEnv)
    while current_player(env) == chance_player(env)
        env |> legal_action_space |> rand |> env # sample a random action for the chance player.
    end
    # round each agent's continuous DDPG output up to a discrete action.
    Dict((player, ceil(agent.policy(env))) for (player, agent) in π.agents)
end

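# For example, with two (hypothetical) players the call above returns a joint action like
#     Dict(:player1 => 2.0, :player2 => 1.0)
# i.e. one rounded action per agent, keyed by player.
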
function (π::MADDPGManager)(stage::Union{PreEpisodeStage, PostActStage}, env::AbstractEnv)
    # only the trajectories need to be updated here.
    for (_, agent) in π.agents
        update!(agent.trajectory, agent.policy, env, stage)
    end
end

function (π::MADDPGManager)(stage::PreActStage, env::AbstractEnv, actions)
    # update each agent's trajectory.
    for (player, agent) in π.agents
        update!(agent.trajectory, agent.policy, env, stage, actions[player])
    end

    # update policy
    update!(π)
end

function (π::MADDPGManager)(stage::PostEpisodeStage, env::AbstractEnv)
    # collect the final state and a dummy action into each agent's trajectory.
    for (_, agent) in π.agents
        update!(agent.trajectory, agent.policy, env, stage)
    end

    # update policy
    update!(π)
end

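# These stage methods are the hooks invoked by the standard `run` loop between steps,
# so a minimal training sketch (assuming `env` is a compatible simultaneous game and
# `manager` was built as in the sketch above) could look like:
#
#     run(manager, env, StopAfterEpisode(10_000), EmptyHook())
#
# `StopAfterEpisode` and `EmptyHook` come from ReinforcementLearningCore; any other
# stop condition or hook can be substituted.
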
# update the policy of each agent.
function RLBase.update!(π::MADDPGManager)
    π.update_step += 1
    π.update_step % π.update_freq == 0 || return

    # wait until every agent has collected enough experience.
    for (_, agent) in π.agents
        length(agent.trajectory) > agent.policy.policy.update_after || return
        length(agent.trajectory) > π.batch_size || return
    end

    # get training data; sample the same indices from every agent's trajectory
    # so that the transitions stay aligned across agents.
    temp_player = collect(keys(π.agents))[1]
    t = π.agents[temp_player].trajectory
    inds = rand(π.rng, 1:length(t), π.batch_size)
    batches = Dict((player, RLCore.fetch!(BatchSampler{SARTS}(π.batch_size), agent.trajectory, inds))
        for (player, agent) in π.agents)

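    # `batches[player]` holds this player's sampled SARTS traces, i.e. the keys used
    # below: :state, :action, :reward, :terminal and :next_state.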
    # get s, a, s′ for the critic (all agents' information stacked together).
    s = Flux.stack((batches[player][:state] for (player, _) in π.agents), 1)
    a = Flux.stack((batches[player][:action] for (player, _) in π.agents), 1)
    s′ = Flux.stack((batches[player][:next_state] for (player, _) in π.agents), 1)

    # actions from the behavior actors, for training the behavior_actor.
    mu_actions = Flux.stack(
        ((
            batches[player][:state] |> # get personal state information
            x -> send_to_device(device(agent.policy.policy.behavior_actor), x) |>
            agent.policy.policy.behavior_actor |> send_to_host
        ) for (player, agent) in π.agents), 1
    )
    # actions from the target actors, for training the behavior_critic.
    new_actions = Flux.stack(
        ((
            batches[player][:next_state] |> # get personal next_state information
            x -> send_to_device(device(agent.policy.policy.target_actor), x) |>
            agent.policy.policy.target_actor |> send_to_host
        ) for (player, agent) in π.agents), 1
    )

    for (player, agent) in π.agents
        p = agent.policy.policy # get the DDPGPolicy struct.
        A = p.behavior_actor
        C = p.behavior_critic
        Aₜ = p.target_actor
        Cₜ = p.target_critic

        γ = p.γ
        ρ = p.ρ

        _device(x) = send_to_device(device(A), x)

        # Note: A, C, Aₜ and Cₜ are assumed to be on the same device by default.
        s, a, s′ = _device((s, a, s′))
        mu_actions = _device(mu_actions)
        new_actions = _device(new_actions)
        r = _device(batches[player][:reward])
        t = _device(batches[player][:terminal])

        # TD target computed from the target critic and the target actors.
        qₜ = Cₜ(vcat(s′, new_actions)) |> vec
        y = r .+ γ .* (1 .- t) .* qₜ

        gs1 = gradient(Flux.params(C)) do
            q = C(vcat(s, a)) |> vec
            loss = mean((y .- q) .^ 2)
            ignore() do
                p.critic_loss = loss
            end
            loss
        end

        update!(C, gs1)

        gs2 = gradient(Flux.params(A)) do
            loss = -mean(C(vcat(s, mu_actions)))
            ignore() do
                p.actor_loss = loss
            end
            loss
        end

        update!(A, gs2)

        # polyak averaging of the target networks.
        for (dest, src) in zip(Flux.params([Aₜ, Cₜ]), Flux.params([A, C]))
            dest .= ρ .* dest .+ (1 - ρ) .* src
        end
    end
end
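
# In equation form, the per-agent update above (a summary, not additional code) computes,
# with s, a, s′ stacking all agents' states and actions:
#
#     y   = r + γ * (1 - t) * Cₜ(s′, Aₜ(s′))    # TD target (t is the terminal flag)
#     L_C = mean((y - C(s, a))^2)               # critic loss
#     L_A = -mean(C(s, μ(s)))                   # actor loss, μ = the behavior actors
#     θₜ .= ρ * θₜ + (1 - ρ) * θ                # soft update of each target network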