Chapter 6.7 Maximization Bias and Double Learning
In Example 6.7, the authors introduce an MDP designed to compare the performance of Q-Learning and Double-Q-Learning. This environment is a bit special compared to the ones we have seen before: in the first step, only `LEFT` and `RIGHT` are allowed, while in the second step (reached by choosing `LEFT`), there are 10 valid actions. Environments whose set of legal actions depends on the current state are said to have the `FULL_ACTION_SET` action style.
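Before building the environment, a quick numerical sketch helps show where maximization bias comes from. The snippet below is purely illustrative (the function name and sample sizes are our own choices): it estimates the value of the second state by taking the maximum over ten sample means, each drawn from the same N(-0.1, 1) reward distribution used in this example. Although every true action value is -0.1, the maximum of the noisy estimates is usually well above zero, which is exactly what misleads plain Q-Learning into preferring `LEFT`.

```julia
using Statistics

# Each of the 10 actions in the second state has true value -0.1, but we only
# see a few noisy samples per action. Taking a max over the noisy sample means
# is biased upwards.
function biased_estimate(; n_actions = 10, n_samples = 5)
    sample_means = [mean(randn(n_samples) .- 0.1) for _ in 1:n_actions]
    maximum(sample_means)
end

mean(biased_estimate() for _ in 1:10_000)  # clearly positive, even though every true value is -0.1
```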
```julia
begin
    using ReinforcementLearning
    using Statistics
    using Flux
    using Plots
end
```
```julia
"""
states:
    1: A
    2: B
    3: terminal
actions:
    1: left
    2: right
"""
Base.@kwdef mutable struct MaximizationBiasEnv <: AbstractEnv
    position::Int = 1
    reward::Float64 = 0.0
end
```
```julia
RLBase.state_space(env::MaximizationBiasEnv) = Base.OneTo(3)
```
```julia
RLBase.action_space(env::MaximizationBiasEnv) = Base.OneTo(10)
```
```julia
RLBase.ActionStyle(env::MaximizationBiasEnv) = FULL_ACTION_SET
```
```julia
const LEFT = 1
const RIGHT = 2
```
```julia
function RLBase.legal_action_space(env::MaximizationBiasEnv)
    if env.position == 1
        (LEFT, RIGHT)
    else
        Base.OneTo(10)
    end
end
```
```julia
function RLBase.legal_action_space_mask(env::MaximizationBiasEnv)
    m = fill(false, 10)
    if env.position == 1
        m[LEFT] = true
        m[RIGHT] = true
    else
        m .= true
    end
    m
end
```
```julia
function (env::MaximizationBiasEnv)(a::Int)
    if env.position == 1
        if a == LEFT
            # A --LEFT--> B, reward 0
            env.position = 2
            env.reward = 0.0
        else
            # A --RIGHT--> terminal, reward 0
            env.position = 3
            env.reward = 0.0
        end
    elseif env.position == 2
        # every action in B terminates with reward drawn from N(-0.1, 1)
        env.position = 3
        env.reward = randn() - 0.1
    end
    nothing
end
```
```julia
function RLBase.reset!(env::MaximizationBiasEnv)
    env.position = 1
    env.reward = 0.0
    nothing
end
```
```julia
RLBase.reward(env::MaximizationBiasEnv) = env.reward
```
```julia
RLBase.is_terminated(env::MaximizationBiasEnv) = env.position == 3
```
```julia
RLBase.state(env::MaximizationBiasEnv) = env.position
```
Now the environment is fully defined. Displaying it gives a summary of its traits:
# MaximizationBiasEnv
## Traits
| Trait Type | Value |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle | ReinforcementLearningBase.SingleAgent() |
| DynamicStyle | ReinforcementLearningBase.Sequential() |
| InformationStyle | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle | ReinforcementLearningBase.Stochastic() |
| RewardStyle | ReinforcementLearningBase.StepReward() |
| UtilityStyle | ReinforcementLearningBase.GeneralSum() |
| ActionStyle | ReinforcementLearningBase.FullActionSet() |
| StateStyle | ReinforcementLearningBase.Observation{Any}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Any}() |
## Is Environment Terminated?
No
## State Space
`Base.OneTo(3)`
## Action Space
`Base.OneTo(10)`
## Current State
```
1
```
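As a quick sanity check, here is a hand-driven episode (illustrative only; the `env` instance below is a throwaway, separate from the `world` created next):

```julia
env = MaximizationBiasEnv()
reset!(env)
legal_action_space(env)   # (LEFT, RIGHT): only two legal actions in state A
env(LEFT)                 # move to state B, reward 0.0
legal_action_space(env)   # Base.OneTo(10): all ten actions are legal in state B
env(rand(1:10))           # any action ends the episode with reward ~ N(-0.1, 1)
is_terminated(env), reward(env)
```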
```julia
world = MaximizationBiasEnv()
```
```julia
NS, NA = length(state_space(world)), length(action_space(world))  # (3, 10)
```
To calculate the percentage of choosing the `LEFT` action in the first step, we'll create a customized hook here:
```julia
Base.@kwdef mutable struct CountOfLeft <: AbstractHook
    counts::Vector{Bool} = []
end
```
```julia
function (f::CountOfLeft)(::PreActStage, agent, env, action)
    if state(env) == 1
        push!(f.counts, action == LEFT)
    end
end
```
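During `run`, ReinforcementLearning.jl invokes the hook at each stage, and `PreActStage` fires just before the agent acts, so the `state(env) == 1` check records only decisions made in the first state. The snippet below simply calls the hook by hand to show the bookkeeping (illustrative only; in a real run the agent is passed instead of `nothing`):

```julia
h = CountOfLeft()
demo_env = MaximizationBiasEnv()
h(PreActStage(), nothing, demo_env, LEFT)    # state A: records whether LEFT was chosen
h(PreActStage(), nothing, demo_env, RIGHT)
h.counts                                     # contains [true, false]
```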
Next we create two agent factories, one for Q-Learning and one for Double-Q-Learning.
```julia
create_double_Q_agent() = Agent(
    policy=QBasedPolicy(
        learner=DoubleLearner(
            L1=TDLearner(
                approximator=TabularQApproximator(;
                    n_state=NS,
                    n_action=NA,
                    opt=Descent(0.1),
                ),
                method=:SARS,
                γ=1.0,
                n=0,
            ),
            L2=TDLearner(
                approximator=TabularQApproximator(;
                    n_state=NS,
                    n_action=NA,
                    opt=Descent(0.1),
                ),
                method=:SARS,
                γ=1.0,
                n=0,
            ),
        ),
        explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true),
    ),
    trajectory=VectorSARTTrajectory(),
)
```
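The key idea of `DoubleLearner` is that it keeps two independent estimators and, on each update, lets one pick the greedy action while the other evaluates it, which removes the maximization bias. The sketch below is our own illustration of the tabular update rule from Section 6.7, not the library's internal implementation; the `(n_action, n_state)` table layout and the helper name are assumptions:

```julia
# One double Q-learning backup for a transition (s, a, r, s′), assuming
# tables Q1 and Q2 of size (n_action, n_state), step size α and discount γ.
function double_q_update!(Q1, Q2, s, a, r, s′, done; α = 0.1, γ = 1.0)
    # Flip a coin to decide which table gets updated.
    A, B = rand(Bool) ? (Q1, Q2) : (Q2, Q1)
    target = if done
        r
    else
        a_star = argmax(view(A, :, s′))  # A selects the greedy action ...
        r + γ * B[a_star, s′]            # ... B evaluates it
    end
    A[a, s] += α * (target - A[a, s])
    nothing
end
```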
```julia
create_Q_agent() = Agent(
    policy=QBasedPolicy(
        learner=TDLearner(
            approximator=TabularQApproximator(; n_state=NS, n_action=NA, opt=Descent(0.1)),
            method=:SARS,
            γ=1.0,
            n=0,
        ),
        explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true),
    ),
    trajectory=VectorSARTTrajectory(),
)
```
```julia
begin
    DQ_stats = []
    for _ in 1:1000
        hook = CountOfLeft()
        run(create_double_Q_agent(), world, StopAfterEpisode(300), hook)
        push!(DQ_stats, hook.counts)
    end
    plot(mean(DQ_stats), legend=:topright, label="double q")

    Q_stats = []
    for _ in 1:1000
        hook = CountOfLeft()
        run(create_Q_agent(), world, StopAfterEpisode(300), hook)
        push!(Q_stats, hook.counts)
    end
    plot!(mean(Q_stats), legend=:topright, label="q")

    # With ε = 0.1, the optimal ε-greedy policy takes LEFT only 5% of the time.
    hline!([0.05], linestyle=:dash, label="optimal")
end
```