Chapter 8.2 Dyna: Integrated Planning, Acting, and Learning
To demonstrate the flexibility of ReinforcementLearning.jl, a `DynaAgent` is also included, and in this notebook we'll explore its performance.
```julia
using ReinforcementLearning
```
The Maze Environment
In this chapter, the authors introduce a specific maze environment, so let's define it by implementing the interfaces of ReinforcementLearning.jl. The maze is a 6×9 grid with a few wall cells; the agent starts from a fixed cell, can move one cell left, right, up, or down, and receives a reward of 1 when it reaches the goal, which ends the episode.
```julia
const LRUD = [
    CartesianIndex(0, -1),  # left
    CartesianIndex(0, 1),   # right
    CartesianIndex(-1, 0),  # up
    CartesianIndex(1, 0),   # down
]
```
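Each action is simply a `CartesianIndex` offset added to the agent's current position. A quick sketch, assuming the constant above has been evaluated:

```julia
# Action 2 ("right") increases the column index by one.
CartesianIndex(3, 1) + LRUD[2]  # CartesianIndex(3, 2)
```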
```julia
begin
    mutable struct MazeEnv <: AbstractEnv
        walls::Set{CartesianIndex{2}}
        position::CartesianIndex{2}
        start::CartesianIndex{2}
        goal::CartesianIndex{2}
        NX::Int
        NY::Int
    end

    function MazeEnv()
        walls = Set([
            [CartesianIndex(i, 3) for i = 2:4]
            CartesianIndex(5, 6)
            [CartesianIndex(j, 8) for j = 1:3]
        ])
        start = CartesianIndex(3, 1)
        goal = CartesianIndex(1, 9)
        MazeEnv(walls, start, start, goal, 6, 9)
    end

    # Applying an action moves the agent one cell in the chosen direction,
    # unless the target cell is a wall or lies outside the 6×9 grid.
    function (env::MazeEnv)(a::Int)
        p = env.position + LRUD[a]
        if p == env.goal
            env.position = env.goal
        elseif !(p ∈ env.walls)
            env.position = CartesianIndex(min(max(p[1], 1), env.NX), min(max(p[2], 1), env.NY))
        end
        nothing
    end
end
```
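To see the dynamics in isolation, here is a minimal interaction sketch, assuming the definitions above have been evaluated:

```julia
env = MazeEnv()
env.position   # CartesianIndex(3, 1), the start cell
env(2)         # action 2: move right
env.position   # CartesianIndex(3, 2)
env(2)         # moving right again would enter the wall at (3, 3),
env.position   # so the position is unchanged: CartesianIndex(3, 2)
```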
```julia
RLBase.state_space(env::MazeEnv) = Base.OneTo(env.NX * env.NY)
```
```julia
RLBase.action_space(env::MazeEnv) = Base.OneTo(length(LRUD))
```
```julia
# Reward is 1.0 on reaching the goal and 0.0 otherwise.
RLBase.reward(env::MazeEnv) = Float64(env.position == env.goal)
```
```julia
# The state is the column-major linear index of the current position.
RLBase.state(env::MazeEnv) = (env.position[2] - 1) * env.NX + env.position[1]
```
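This linear index is why the environment summary shown further below reports the initial state as 3. A quick check, assuming `state` is exported by ReinforcementLearning.jl as usual:

```julia
state(MazeEnv())  # (1 - 1) * 6 + 3 == 3 for the start cell (3, 1)
```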
```julia
RLBase.is_terminated(env::MazeEnv) = env.position == env.goal
```
```julia
RLBase.reset!(env::MazeEnv) = env.position = env.start
```
```julia
begin
    import Base: *

    # Blow a single cell up into an n×n block of cells.
    function extend(p::CartesianIndex{2}, n::Int)
        x, y = Tuple(p)
        [CartesianIndex(n * (x - 1) + i, n * (y - 1) + j) for i = 1:n for j = 1:n]
    end

    # Map a cell of the original maze to the top-left cell of its n×n block.
    function remap(p::CartesianIndex{2}, n::Int)
        x, y = Tuple(p)
        CartesianIndex((x - 1) * n + 1, (y - 1) * n + 1)
    end

    # `env * n` returns a copy of the maze scaled up by a factor of n in each direction.
    function *(env::MazeEnv, n::Int)
        walls = Set{CartesianIndex{2}}(ww for w in env.walls for ww in extend(w, n))
        start, position, goal = remap(env.start, n), remap(env.position, n), remap(env.goal, n)
        NX, NY = env.NX * n, env.NY * n
        MazeEnv(walls, position, start, goal, NX, NY)
    end
end
```
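The scaling operator is used later in Example 8.4 to build larger mazes. A small sketch of what it produces, given the definitions above:

```julia
big = MazeEnv() * 2         # a 12 × 18 maze with the same layout
length(state_space(big))    # 216 states instead of 54
```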
```julia
x = MazeEnv()
```

Displaying the freshly constructed environment prints its summary:

# MazeEnv

## Traits

| Trait Type        | Value                                            |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle     | ReinforcementLearningBase.SingleAgent()          |
| DynamicStyle      | ReinforcementLearningBase.Sequential()           |
| InformationStyle  | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle       | ReinforcementLearningBase.Stochastic()           |
| RewardStyle       | ReinforcementLearningBase.StepReward()           |
| UtilityStyle      | ReinforcementLearningBase.GeneralSum()           |
| ActionStyle       | ReinforcementLearningBase.MinimalActionSet()     |
| StateStyle        | ReinforcementLearningBase.Observation{Any}()     |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Any}()     |

## Is Environment Terminated?

No

## State Space

`Base.OneTo(54)`

## Action Space

`Base.OneTo(4)`

## Current State

```
3
```
Figure 8.2

To reproduce Figure 8.2, we train Dyna-Q agents with 0, 5, and 50 planning steps per real step, record the steps taken in each of 50 episodes, and average the curves over 30 independent runs.
```julia
using Flux  # provides the Descent optimiser used by TabularQApproximator
```
```julia
# Train a Dyna-Q agent with `n` planning steps for 50 episodes
# and return the number of steps taken in each episode.
function plan_step(n)
    env = MazeEnv()
    ns = length(state_space(env))
    na = length(action_space(env))

    agent = DynaAgent(
        policy=QBasedPolicy(
            learner=TDLearner(
                approximator=TabularQApproximator(
                    n_state=ns,
                    n_action=na,
                    opt=Descent(0.1)
                ),
                γ=0.1,
                method=:SARS
            ),
            explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true)
        ),
        model=ExperienceBasedSamplingModel(),
        trajectory=VectorSARTTrajectory(),
        plan_step=n
    )

    hook = StepsPerEpisode()
    run(agent, env, StopAfterEpisode(50), hook)
    hook.steps
end
```
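A single call trains one agent and returns its per-episode step counts, which the figure below averages over 30 runs. Usage sketch:

```julia
steps = plan_step(5)
length(steps)  # 50, one entry per episode
```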
```julia
using Plots
```
```julia
using Statistics  # provides `mean`
```
```julia
begin
    fig_8_2 = plot(legend=:topright)
    for n in [0, 5, 50]
        plot!(fig_8_2, mean(plan_step(n) for _ in 1:30), label="plan_step = $n")
    end
    fig_8_2
end
```
Figure 8.4

Figure 8.4 uses the blocking maze: after 1000 real steps the wall is changed by `change_walls` so that the previously learned path is blocked, and training continues for another 2000 steps. We compare the cumulative reward of Dyna-Q (with an `ExperienceBasedSamplingModel`) and Dyna-Q+ (with a `TimeBasedSamplingModel`), averaged over 30 runs.
```julia
function cumulative_dyna_reward(model, walls, nstep1, change, nstep2)
    # The blocking/shortcut mazes start at (6, 4) with the goal at (1, 9).
    env = MazeEnv(
        walls,
        CartesianIndex(6, 4),
        CartesianIndex(6, 4),
        CartesianIndex(1, 9),
        6,
        9
    )
    ns = length(state_space(env))
    na = length(action_space(env))
    agent = DynaAgent(
        policy=QBasedPolicy(
            learner=TDLearner(
                approximator=TabularQApproximator(
                    n_state=ns,
                    n_action=na,
                    opt=Descent(1.)
                ),
                γ=0.95,
                method=:SARS
            ),
            explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true)
        ),
        model=model,
        trajectory=VectorSARTTrajectory(),
        plan_step=10
    )

    hook = StepsPerEpisode()
    # Train for `nstep1` steps, modify the walls, then train for another `nstep2` steps.
    run(agent, env, StopAfterStep(nstep1; is_show_progress=false), hook)
    change(env.walls)
    run(agent, env, StopAfterStep(nstep2; is_show_progress=false), hook)

    # Expand the per-episode step counts into a per-step curve: every step of
    # episode i is assigned the value i (the number of unit rewards collected
    # by the end of that episode), and the unfinished final episode is padded
    # with the total number of completed episodes.
    cumulative_reward = []
    for (i, n) in enumerate(hook.steps)
        for _ in 1:n
            push!(cumulative_reward, i)
        end
    end
    for _ in (nstep1+nstep2):-1:length(cumulative_reward)
        push!(cumulative_reward, length(hook.steps))
    end
    cumulative_reward
end
```
```julia
# Initial wall of the blocking maze: row 4, columns 1–8, leaving a gap on the right.
walls() = Set([CartesianIndex(4, j) for j in 1:8])
```
```julia
function change_walls(walls)
    pop!(walls, CartesianIndex(4, 1))   # open a gap at the left end of the wall
    push!(walls, CartesianIndex(4, 9))  # close the gap at the right end
end
```
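As a small sanity check of the wall change (using only the helpers defined above), the opening moves from the right-hand end of the wall to the left-hand end, blocking the path the agent has already learned:

```julia
w = walls()
change_walls(w)
CartesianIndex(4, 1) in w, CartesianIndex(4, 9) in w  # (false, true)
```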
```julia
begin
    fig_8_4 = plot(legend=:topleft)
    plot!(fig_8_4, mean(cumulative_dyna_reward(ExperienceBasedSamplingModel(), walls(), 1000, change_walls, 2000) for _ in 1:30), label="Dyna-Q")
    plot!(fig_8_4, mean(cumulative_dyna_reward(TimeBasedSamplingModel(;n_actions=4), walls(), 1000, change_walls, 2000) for _ in 1:30), label="Dyna-Q+")
    fig_8_4
end
```
Figure 8.5

Figure 8.5 uses the shortcut maze: after 3000 steps, `new_change_walls` opens a gap at the right-hand end of the wall, creating a shorter path to the goal. Dyna-Q+ uses a `TimeBasedSamplingModel` with exploration bonus κ = 1e-3 to encourage re-trying actions that have not been taken for a long time.
```julia
# Initial wall of the shortcut maze: row 4, columns 2–9, so the only opening is on the left.
new_walls() = Set([CartesianIndex(4, j) for j in 2:9])
```
```julia
function new_change_walls(walls)
    pop!(walls, CartesianIndex(4, 9))  # open a shortcut at the right end, nearer the goal
end
```
```julia
begin
    fig_8_5 = plot(legend=:topleft)
    plot!(fig_8_5, mean(cumulative_dyna_reward(ExperienceBasedSamplingModel(), new_walls(), 3000, new_change_walls, 3000) for _ in 1:50), label="Dyna-Q")
    plot!(fig_8_5, mean(cumulative_dyna_reward(TimeBasedSamplingModel(n_actions=4, κ = 1e-3), new_walls(), 3000, new_change_walls, 3000) for _ in 1:50), label="Dyna-Q+")
    fig_8_5
end
```
Example 8.4

Example 8.4 compares Dyna-Q against prioritized sweeping (a `PrioritizedSweepingSamplingModel`) on scaled-up versions of the maze, with scale factors from 1 to 6, by counting how many model updates each agent needs before it completes an episode in near-optimal time.
```julia
function run_once(model, ratio=1)
    env = MazeEnv() * ratio
    ns = length(state_space(env))
    na = length(action_space(env))
    agent = DynaAgent(
        policy=QBasedPolicy(
            learner=TDLearner(
                approximator=TabularQApproximator(
                    n_state=ns,
                    n_action=na,
                    opt=Descent(0.5)
                ),
                γ=0.95,
                method=:SARS
            ),
            explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true)
        ),
        model=model,
        trajectory=VectorSARTTrajectory(),
        plan_step=5
    )
    hook = StepsPerEpisode()
    # Stop as soon as an episode finishes within 1.2 × (14 · ratio) steps,
    # i.e. close to the optimal path length of the scaled maze.
    run(agent, env, (args...) -> length(hook.steps) > 0 && hook.steps[end] <= 14 * ratio * 1.2, hook)
    model.sample_count
end
```
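Each call reports `model.sample_count`, the number of model samples used in planning before the stopping criterion is met. For instance (usage sketch, assuming the definitions above):

```julia
run_once(ExperienceBasedSamplingModel(), 2)  # planning samples needed on the 12 × 18 maze
```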
```julia
begin
    p = plot(legend=:topleft)
    plot!(mean([run_once(ExperienceBasedSamplingModel(), ratio) for ratio in 1:6] for _ in 1:5), label="Dyna", yscale=:log10)
    plot!(mean([run_once(PrioritizedSweepingSamplingModel(), ratio) for ratio in 1:6] for _ in 1:5), label="Prioritized", yscale=:log10)
    p
end
```