add MADDPG algorithm #444

Merged: 6 commits, merged on Aug 12, 2021

5 changes: 3 additions & 2 deletions .cspell/cspell.json
@@ -120,7 +120,8 @@
"Norouzi",
"gzopen",
"turbulences",
"Decompressor"
"Decompressor",
"MADDPG"
],
"ignoreWords": [],
"minWordLength": 5,
@@ -143,4 +144,4 @@
"\\{%.*%\\}", // liquid syntax
"/^\\s*```[\\s\\S]*?^\\s*```/gm" // Another attempt at markdown code blocks. https://github.com/streetsidesoftware/vscode-spell-checker/issues/202#issuecomment-377477473
]
}
}
123 changes: 123 additions & 0 deletions docs/experiments/experiments/Policy Gradient/JuliaRL_MADDPG_KuhnPoker.jl
@@ -0,0 +1,123 @@
# ---
# title: JuliaRL\_MADDPG\_KuhnPoker
# cover: assets/JuliaRL_MADDPG_KuhnPoker.png
# description: MADDPG applied to KuhnPoker
# date: 2021-08-09
# author: "[Peter Chen](https://github.com/peterchen96)"
# ---

#+ tangle=true
using ReinforcementLearning
using StableRNGs
using Flux
using IntervalSets

mutable struct ResultNEpisode <: AbstractHook
    eval_freq::Int
    episode_counter::Int
    episode::Vector{Int}
    results::Vector{Float64}
end

function (hook::ResultNEpisode)(::PostEpisodeStage, policy, env)
    hook.episode_counter += 1
    if hook.episode_counter % hook.eval_freq == 0
        push!(hook.episode, hook.episode_counter)
        push!(hook.results, reward(env, 1))
    end
end

function RL.Experiment(
    ::Val{:JuliaRL},
    ::Val{:MADDPG},
    ::Val{:KuhnPoker},
    ::Nothing;
    seed=123,
)
    rng = StableRNG(seed)
    env = KuhnPokerEnv()
    wrapped_env = ActionTransformedEnv(
        StateTransformedEnv(
            env;
            state_mapping = s -> [findfirst(==(s), state_space(env))],
            state_space_mapping = ss -> [[findfirst(==(s), state_space(env))] for s in state_space(env)]
        ),
        ## add a dummy action for the other agent.
        action_mapping = x -> length(x) == 1 ? x : Int(x[current_player(env)] + 1),
    )
    ns, na = 1, 1
    n_players = 2

    init = glorot_uniform(rng)

    create_actor() = Chain(
        Dense(ns, 64, relu; init = init),
        Dense(64, 64, relu; init = init),
        Dense(64, na, tanh; init = init),
    )

    create_critic() = Chain(
        Dense(n_players * ns + n_players * na, 64, relu; init = init),
        Dense(64, 64, relu; init = init),
        Dense(64, 1; init = init),
    )


    policy = DDPGPolicy(
        behavior_actor = NeuralNetworkApproximator(
            model = create_actor(),
            optimizer = ADAM(),
        ),
        behavior_critic = NeuralNetworkApproximator(
            model = create_critic(),
            optimizer = ADAM(),
        ),
        target_actor = NeuralNetworkApproximator(
            model = create_actor(),
            optimizer = ADAM(),
        ),
        target_critic = NeuralNetworkApproximator(
            model = create_critic(),
            optimizer = ADAM(),
        ),
        γ = 0.99f0,
        ρ = 0.995f0,
        na = na,
        start_steps = 1000,
        start_policy = RandomPolicy(-0.9..0.9; rng = rng),
        update_after = 1000,
        act_limit = 0.9,
        act_noise = 0.1,
        rng = rng,
    )
    trajectory = CircularArraySARTTrajectory(
        capacity = 10000, # replay buffer capacity
        state = Vector{Int} => (ns, ),
        action = Float32 => (na, ),
    )

    agents = MADDPGManager(
        Dict((player, Agent(
            policy = NamedPolicy(player, deepcopy(policy)),
            trajectory = deepcopy(trajectory),
        )) for player in players(env) if player != chance_player(env)),
        128, # batch_size
        128, # update_freq
        0, # step_counter
        rng
    )

    stop_condition = StopAfterEpisode(100_000, is_show_progress=!haskey(ENV, "CI"))
    hook = ResultNEpisode(1000, 0, [], [])
    Experiment(agents, wrapped_env, stop_condition, hook, "# run MADDPG on KuhnPokerEnv")
end

#+ tangle=false
using Plots
ex = E`JuliaRL_MADDPG_KuhnPoker`
run(ex)
scatter(ex.hook.episode, ex.hook.results, xaxis=:log, xlabel="episode", ylabel="reward of player 1")

savefig("assets/JuliaRL_MADDPG_KuhnPoker.png") #hide

# ![](assets/JuliaRL_MADDPG_KuhnPoker.png)
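
A note on the action encoding used above: the DDPG actor outputs a continuous value (clamped to roughly [-0.9, 0.9] by `act_limit`), `MADDPGManager` applies `ceil` to it (see `maddpg.jl` below), and the wrapper's `action_mapping` then shifts the result by one to obtain a legal Kuhn Poker action index. A minimal sketch of that arithmetic, where the helper `to_discrete_action` is illustrative and not part of this PR:

```julia
# Mirrors `ceil(agent.policy(env))` in MADDPGManager combined with the
# experiment's `action_mapping = x -> ... Int(x[current_player(env)] + 1)`.
to_discrete_action(raw) = Int(ceil(raw) + 1)

to_discrete_action(-0.73)  # => 1  (negative actor output selects the first action)
to_discrete_action(0.42)   # => 2  (positive actor output selects the second action)
```
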
1 change: 1 addition & 0 deletions docs/experiments/experiments/Policy Gradient/config.json
@@ -4,6 +4,7 @@
"JuliaRL_A2C_CartPole.jl",
"JuliaRL_A2CGAE_CartPole.jl",
"JuliaRL_DDPG_Pendulum.jl",
"JuliaRL_MADDPG_KuhnPoker.jl",
"JuliaRL_MAC_CartPole.jl",
"JuliaRL_PPO_CartPole.jl",
"JuliaRL_PPO_Pendulum.jl",
145 changes: 145 additions & 0 deletions src/ReinforcementLearningZoo/src/algorithms/policy_gradient/maddpg.jl
@@ -0,0 +1,145 @@
export MADDPGManager

"""
    MADDPGManager(agents::Dict{<:Any, <:Agent}, batch_size::Int, update_freq::Int, update_step::Int, rng::AbstractRNG)

Multi-agent Deep Deterministic Policy Gradient (MADDPG) implemented in Julia. This implementation only works for simultaneous games whose action space is discrete.
See the paper https://arxiv.org/abs/1706.02275 for more details.

# Arguments
- `agents::Dict{<:Any, <:Agent{<:NamedPolicy{<:DDPGPolicy, <:Any}, <:AbstractTrajectory}}`, where each agent collects its own information. When updating the policy, each `critic` assembles all agents' trajectories to update its own network.
- `batch_size::Int`
- `update_freq::Int`
- `update_step::Int`, counts the number of update steps.
- `rng::AbstractRNG`.
"""
mutable struct MADDPGManager{P<:DDPGPolicy, T<:AbstractTrajectory, N<:Any} <: AbstractPolicy
    agents::Dict{<:N, <:Agent{<:NamedPolicy{<:P, <:N}, <:T}}
    batch_size::Int
    update_freq::Int
    update_step::Int
    rng::AbstractRNG
end

# for simultaneous games with a discrete action space.
function (π::MADDPGManager)(env::AbstractEnv)
    while current_player(env) == chance_player(env)
        env |> legal_action_space |> rand |> env
    end
    Dict((player, ceil(agent.policy(env))) for (player, agent) in π.agents)
end

function (π::MADDPGManager)(stage::Union{PreEpisodeStage, PostActStage}, env::AbstractEnv)
    # only need to update trajectory.
    for (_, agent) in π.agents
        update!(agent.trajectory, agent.policy, env, stage)
    end
end

function (π::MADDPGManager)(stage::PreActStage, env::AbstractEnv, actions)
    # update each agent's trajectory.
    for (player, agent) in π.agents
        update!(agent.trajectory, agent.policy, env, stage, actions[player])
    end

    # update policy
    update!(π)
end

function (π::MADDPGManager)(stage::PostEpisodeStage, env::AbstractEnv)
    # collect the final state and a dummy action into each agent's trajectory.
    for (_, agent) in π.agents
        update!(agent.trajectory, agent.policy, env, stage)
    end

    # update policy
    update!(π)
end

# update policy
function RLBase.update!(π::MADDPGManager)
    π.update_step += 1
    π.update_step % π.update_freq == 0 || return

    for (_, agent) in π.agents
        length(agent.trajectory) > agent.policy.policy.update_after || return
        length(agent.trajectory) > π.batch_size || return
    end

    # get training data
    temp_player = collect(keys(π.agents))[1]
    t = π.agents[temp_player].trajectory
    inds = rand(π.rng, 1:length(t), π.batch_size)
    batches = Dict((player, RLCore.fetch!(BatchSampler{SARTS}(π.batch_size), agent.trajectory, inds))
        for (player, agent) in π.agents)

[Inline review comment] The hardcoded SARTS will make the algorithm work only on environments of MINIMAL_ACTION_SET.

    # get s, a, s′ for critic
    s = Flux.stack((batches[player][:state] for (player, _) in π.agents), 1)
    a = Flux.stack((batches[player][:action] for (player, _) in π.agents), 1)
    s′ = Flux.stack((batches[player][:next_state] for (player, _) in π.agents), 1)

    # for training behavior_actor
    mu_actions = Flux.stack(
        ((
            batches[player][:state] |> # get personal state information
            x -> send_to_device(device(agent.policy.policy.behavior_actor), x) |>
            agent.policy.policy.behavior_actor |> send_to_host
        ) for (player, agent) in π.agents), 1
    )
    # for training behavior_critic
    new_actions = Flux.stack(
        ((
            batches[player][:next_state] |> # get personal next_state information
            x -> send_to_device(device(agent.policy.policy.target_actor), x) |>
            agent.policy.policy.target_actor |> send_to_host
        ) for (player, agent) in π.agents), 1
    )

    for (player, agent) in π.agents
        p = agent.policy.policy # get the DDPGPolicy struct
        A = p.behavior_actor
        C = p.behavior_critic
        Aₜ = p.target_actor
        Cₜ = p.target_critic

        γ = p.γ
        ρ = p.ρ

        _device(x) = send_to_device(device(A), x)

        # Note that A, C, Aₜ, Cₜ are assumed to live on the same device.
        s, a, s′ = _device((s, a, s′))
        mu_actions = _device(mu_actions)
        new_actions = _device(new_actions)
        r = _device(batches[player][:reward])
        t = _device(batches[player][:terminal])

        qₜ = Cₜ(vcat(s′, new_actions)) |> vec
        y = r .+ γ .* (1 .- t) .* qₜ

        gs1 = gradient(Flux.params(C)) do
            q = C(vcat(s, a)) |> vec
            loss = mean((y .- q) .^ 2)
            ignore() do
                p.critic_loss = loss
            end
            loss
        end

        update!(C, gs1)

        gs2 = gradient(Flux.params(A)) do
            loss = -mean(C(vcat(s, mu_actions)))
            ignore() do
                p.actor_loss = loss
            end
            loss
        end

        update!(A, gs2)

        # polyak averaging of the target networks
        for (dest, src) in zip(Flux.params([Aₜ, Cₜ]), Flux.params([A, C]))
            dest .= ρ .* dest .+ (1 - ρ) .* src
        end
    end
end
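
For reference, the loop above implements the standard MADDPG updates with a centralized critic over all players' states and actions. In the notation of the code (γ the discount factor, ρ the polyak factor, t the terminal flag):

```math
\begin{aligned}
y &= r + \gamma\,(1 - t)\,Q_{\mathrm{targ}}\big(s',\, \mu_{\mathrm{targ}}(s')\big) \\
L_{\mathrm{critic}} &= \mathrm{mean}\big[(y - Q(s, a))^{2}\big] \\
L_{\mathrm{actor}} &= -\,\mathrm{mean}\big[Q(s,\, \mu(s))\big] \\
\theta_{\mathrm{targ}} &\leftarrow \rho\,\theta_{\mathrm{targ}} + (1 - \rho)\,\theta
\end{aligned}
```

Here s, a, and s′ stack all agents' states and actions (the `Flux.stack` calls), μ and μ_targ denote the stacked behavior and target actors, and the last line is the polyak averaging applied to both target networks.
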
1 change: 1 addition & 0 deletions src/ReinforcementLearningZoo/src/algorithms/policy_gradient/policy_gradient.jl
@@ -7,3 +7,4 @@ include("MAC.jl")
include("ddpg.jl")
include("td3.jl")
include("sac.jl")
include("maddpg.jl")