begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
end
To describe the Grid World in Example 3.5, we'll create a distributional environment model. Here, distributional means that, given a state-action pair, the model returns the possible next states, rewards, termination info, and the corresponding probabilities.
begin
    function nextstep(s::CartesianIndex{2}, a::CartesianIndex{2})
        if s == CartesianIndex(1, 2)
            # special state A: every action yields +10 and teleports to A′ at (5, 2)
            r, s′ = 10., CartesianIndex(5, 2)
        elseif s == CartesianIndex(1, 4)
            # special state B: every action yields +5 and teleports to B′ at (3, 4)
            r, s′ = 5., CartesianIndex(3, 4)
        else
            s′ = s + a
            if 1 ≤ s′[1] ≤ 5 && 1 ≤ s′[2] ≤ 5
                r = 0.
            else
                # moving off the grid: stay in place with reward -1
                r = -1.
                s′ = s
            end
        end
        # a single deterministic transition: (reward, is_terminated, next_state) => probability
        [(r, false, LinearIndices((5, 5))[s′]) => 1.0]
    end

    ACTIONS = (
        CartesianIndex(-1, 0),
        CartesianIndex(1, 0),
        CartesianIndex(0, 1),
        CartesianIndex(0, -1),
    )

    struct GridWorldModel <: AbstractEnvironmentModel end

    function (m::GridWorldModel)(s, a)
        nextstep(CartesianIndices((5, 5))[s], ACTIONS[a])
    end

    RLBase.state_space(m::GridWorldModel) = Base.OneTo(5 * 5)
    RLBase.action_space(m::GridWorldModel) = Base.OneTo(length(ACTIONS))
end
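Before using the model, it helps to query it by hand. States are numbered with LinearIndices((5, 5)) (column-major), so linear index 6 is the special state A at CartesianIndex(1, 2). A quick illustration (the variable m and the expected outputs in the comments are ours, derived by tracing nextstep above):

m = GridWorldModel()

# From state A (linear index 6), every action teleports to CartesianIndex(5, 2),
# i.e. linear index 10, with reward 10 and no termination.
m(6, 1)  # expected: [(10.0, false, 10) => 1.0]

# From the top-left corner (linear index 1), action 1 = CartesianIndex(-1, 0) would leave the grid,
# so the agent stays in place and receives a reward of -1.
m(1, 1)  # expected: [(-1.0, false, 1) => 1.0]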
V = TabularVApproximator(;n_state=25, opt=Descent(1.0))
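The TabularVApproximator simply stores one estimate per state in the vector V.table (which we reshape at the end), and Descent(1.0) is the Flux optimizer used to apply updates. Right after construction every entry is zero, which we can verify directly (a trivial sanity check, not part of the original notebook):

length(V.table)       # 25, one entry per state
all(iszero, V.table)  # true — no state has been evaluated yet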
policy_evaluation!(
    V = V,
    π = RandomPolicy(Base.OneTo(4)),
    model = GridWorldModel(),
    γ = 0.9
)
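policy_evaluation! performs iterative policy evaluation: it sweeps over all states and repeatedly replaces each value with the expected one-step return under the random policy until the estimates stop changing. A minimal, self-contained sketch of that computation using only the GridWorldModel defined above (the function name and stopping threshold are ours, and the library's implementation may differ in details):

function evaluate_random_policy(model; γ = 0.9, n_states = 25, n_actions = 4, θ = 1e-8)
    v = zeros(n_states)
    while true
        Δ = 0.0
        for s in 1:n_states
            # Bellman expectation backup: average the one-step return over the 4 equally likely actions
            new_v = 0.0
            for a in 1:n_actions
                for (transition, p) in model(s, a)
                    r, t, s′ = transition
                    new_v += p * (r + γ * (1 - t) * v[s′]) / n_actions
                end
            end
            Δ = max(Δ, abs(new_v - v[s]))
            v[s] = new_v
        end
        Δ < θ && return v
    end
end

evaluate_random_policy(GridWorldModel())  # ≈ V.table after the call above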
The learned state values, arranged on the 5×5 grid (these match the state-value function reported for Example 3.5):

5×5 Array{Float64,2}:
  3.30943    8.78964   4.42795    5.32267    1.49249
  1.52199    2.99266   2.25045    1.90786    0.54769
  0.0512075  0.738502  0.673411   0.358465  -0.40287
 -0.973217  -0.435172 -0.354592  -0.585334  -1.18281
 -1.85733   -1.34491  -1.22898   -1.42265   -1.97492
table = reshape(V.table, 5, 5)
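Since the linear state index was generated with LinearIndices((5, 5)), this reshape puts each value back at its grid coordinates: table[row, col] is the value of the state at CartesianIndex(row, col). For instance (values read off the table above):

table[1, 2]  # ≈ 8.79, the special +10 state A
table[1, 4]  # ≈ 5.32, the special +5 state B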
heatmap(table, yflip=true)