```julia
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
    using StatsBase
end
```
## Left Right Environment

Two states, two actions. From state 1, action 2 (right) terminates the episode immediately with reward 0; action 1 (left) returns to state 1 with probability 0.9 and reward 0, or terminates with probability 0.1 and reward 1.
```julia
begin
    Base.@kwdef mutable struct LeftRightEnv <: AbstractEnv
        reward::Float64 = 0.0
        current_state::Int = 1
    end

    RLBase.state_space(env::LeftRightEnv) = Base.OneTo(2)
    RLBase.action_space(env::LeftRightEnv) = Base.OneTo(2)

    function (env::LeftRightEnv)(a::Int)
        if a == 2
            # Going right always terminates with zero reward.
            env.reward = 0.0
            env.current_state = 2
        else
            # Going left: stay in state 1 with probability 0.9 (reward 0),
            # terminate with probability 0.1 (reward 1).
            s = sample(Weights([0.9, 0.1], 1.0))
            if s == 1
                env.reward = 0.0
                env.current_state = 1
            else
                env.reward = 1.0
                env.current_state = 2
            end
        end
    end

    function RLBase.reset!(env::LeftRightEnv)
        env.current_state = 1
        env.reward = 0.0
    end

    RLBase.reward(env::LeftRightEnv) = env.reward
    RLBase.is_terminated(env::LeftRightEnv) = env.current_state == 2
    RLBase.state(env::LeftRightEnv) = env.current_state
end
```
# LeftRightEnv
## Traits
| Trait Type | Value |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle | ReinforcementLearningBase.SingleAgent() |
| DynamicStyle | ReinforcementLearningBase.Sequential() |
| InformationStyle | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle | ReinforcementLearningBase.Stochastic() |
| RewardStyle | ReinforcementLearningBase.StepReward() |
| UtilityStyle | ReinforcementLearningBase.GeneralSum() |
| ActionStyle | ReinforcementLearningBase.MinimalActionSet() |
| StateStyle | ReinforcementLearningBase.Observation{Any}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Any}() |
## Is Environment Terminated?
No
## State Space
`Base.OneTo(2)`
## Action Space
`Base.OneTo(2)`
## Current State
```
1
```
```julia
world = LeftRightEnv()
```
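As a quick sanity check, the environment can be stepped by hand using only the methods defined above. Taking action 1 is stochastic, so the printed values vary between runs:

```julia
begin
    reset!(world)
    world(2)   # going right always terminates immediately with reward 0
    @show state(world) reward(world) is_terminated(world)

    reset!(world)
    world(1)   # going left: 90% back to state 1 (reward 0), 10% terminate with reward 1
    @show state(world) reward(world) is_terminated(world)
end
```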
```julia
begin
    ns = length(state_space(world))
    na = length(action_space(world))
end
```
```
VBasedPolicy
├─ learner => MonteCarloLearner
│  ├─ approximator
│  │  ├─ 1
│  │  │  └─ TabularApproximator
│  │  │     ├─ table => 2-element Array{Float64,1}
│  │  │     └─ optimizer => Descent
│  │  │        └─ eta => 1.0
│  │  └─ 2
│  │     └─ TabularApproximator
│  │        ├─ table => 2-element Array{Float64,1}
│  │        └─ optimizer => InvDecay
│  │           ├─ gamma => 1.0
│  │           └─ state => IdDict
│  ├─ γ => 1.0
│  ├─ kind => ReinforcementLearningZoo.FirstVisit
│  └─ sampling => ReinforcementLearningZoo.OrdinaryImportanceSampling
└─ mapping => Main.var"#1#2"
```
```julia
π_t = VBasedPolicy(
    learner=MonteCarloLearner(
        approximator=(
            TabularVApproximator(;n_state=ns, opt=Descent(1.0)),  # V, the value estimate
            TabularVApproximator(;n_state=ns, opt=InvDecay(1.0))  # Returns, the running average of returns
        ),
        kind=FIRST_VISIT,
        sampling=ORDINARY_IMPORTANCE_SAMPLING,
        γ=1.0
    ),
    mapping=(env, V) -> 1  # the target policy always chooses action 1 (left)
)
```
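Of the two tabular approximators, the first (`Descent(1.0)`) holds the value estimate `V`, while the second (`InvDecay(1.0)`, effectively a 1/n step size) keeps a running average of the importance-weighted returns. With `ORDINARY_IMPORTANCE_SAMPLING`, that average is the first-visit estimator below, where `𝒯(s)` is the set of first visits to state `s`, `T(t)` the end of the episode containing step `t`, and `ρ` the importance-sampling ratio between the target policy `π` and the behavior policy `b`:

```math
V(s) \doteq \frac{\sum_{t \in \mathcal{T}(s)} \rho_{t:T(t)-1} \, G_t}{|\mathcal{T}(s)|},
\qquad
\rho_{t:T-1} \doteq \prod_{k=t}^{T-1} \frac{\pi(A_k \mid S_k)}{b(A_k \mid S_k)}
```

Because the target policy here always goes left, the products of per-step ratios can grow very large when it is later evaluated off-policy from a more exploratory behavior policy, which is what makes this estimate so noisy.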
```julia
# A little ad-hoc here: the off-policy learner needs prob(π_t, s, a) for the
# importance-sampling ratio, and the target policy deterministically picks action 1.
RLBase.prob(::typeof(π_t), s, a) = a == 1 ? 1.0 : 0.0
```
```julia
# Hook that records, after every episode, the current estimate of the value of state 1.
struct CollectValue <: AbstractHook
    values::Vector{Float64}
    CollectValue() = new([])
end
```
```julia
# After each episode, read state 1's value from the second approximator (the
# running average of importance-weighted returns) of the target policy.
(f::CollectValue)(::PostEpisodeStage, agent, env) =
    push!(f.values, agent.policy.π_target.learner.approximator[2](1))
```
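These two definitions only set up the recording hook; to see the estimate evolve, the target policy still has to be trained off-policy against a behavior policy. The sketch below shows one way to wire that up, assuming the `OffPolicy`, `RandomPolicy`, `Agent`, `VectorWSARTTrajectory`, and `StopAfterEpisode` names from the ReinforcementLearning.jl release this notebook targets (exact constructor names differ between releases); the episode count is an arbitrary choice for illustration.

```julia
begin
    hook = CollectValue()
    agent = Agent(
        # The hook reads agent.policy.π_target, so the target policy is wrapped
        # together with a uniformly random behavior policy.
        policy=OffPolicy(π_t, RandomPolicy(action_space(world))),
        # Assumed trajectory type: the off-policy Monte Carlo update needs a
        # trajectory that also stores the sampling weight.
        trajectory=VectorWSARTTrajectory()
    )
    run(agent, world, StopAfterEpisode(10_000), hook)
    plot(hook.values, legend=false, xlabel="Episode", ylabel="estimate of V(1)")
end
```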