Chapter 10.3 Access Control
In Chapter 10.3 of *Reinforcement Learning: An Introduction*, the Differential semi-gradient Sarsa algorithm for estimating q̂ ≈ q⋆ is introduced. This algorithm is not included in ReinforcementLearning.jl, so here we'll use it as an example to demonstrate how easy it is to extend the components in ReinforcementLearning.jl.
```julia
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
end
```
Implement the DifferentialTDLearner

First let's define a `DifferentialTDLearner`. It will be used to estimate Q values, so we have to implement `(L::DifferentialTDLearner)(env::AbstractEnv)`, which simply forwards the current state to the inner approximator (for a `TabularQApproximator`, this returns the estimated value of each action in that state).
```julia
Base.@kwdef mutable struct DifferentialTDLearner{A<:AbstractApproximator} <: AbstractLearner
    approximator::A
    β::Float64        # step size for updating the average reward estimate
    R̄::Float64 = 0.0  # running estimate of the average reward per step
end
```
```julia
(L::DifferentialTDLearner)(env::AbstractEnv) = L.approximator(state(env))
```
Now, based on the definition of this algorithm in the book, we can implement the updating logic as follows:
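For reference, the per-step update of differential semi-gradient Sarsa is (the step size α for the weights is handled by the approximator's optimizer below):

$$
\begin{aligned}
\delta_t &= R_{t+1} - \bar{R}_t + \hat{q}(S_{t+1}, A_{t+1}, \mathbf{w}_t) - \hat{q}(S_t, A_t, \mathbf{w}_t)\\
\bar{R}_{t+1} &= \bar{R}_t + \beta\,\delta_t\\
\mathbf{w}_{t+1} &= \mathbf{w}_t + \alpha\,\delta_t\,\nabla\hat{q}(S_t, A_t, \mathbf{w}_t)
\end{aligned}
$$

The `(1 - t)` factor in the code below just zeroes the bootstrap term on terminal transitions; it never triggers in this continuing task.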
```julia
function RLBase.update!(L::DifferentialTDLearner, transition::Tuple)
    s, a, r, t, s′, a′ = transition
    β, Q = L.β, L.approximator
    δ = r - L.R̄ + (1 - t) * Q(s′, a′) - Q(s, a)
    L.R̄ += β * δ
    # The inner optimizer performs gradient *descent*, so passing -δ
    # results in the intended Q(s, a) += α * δ update.
    update!(Q, (s, a) => -δ)
end
```
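As a quick sanity check (not part of the original experiment, and assuming the default zero-initialized table of `TabularQApproximator`), we can apply a single update by hand:

```julia
learner = DifferentialTDLearner(
    approximator = TabularQApproximator(; n_state = 5, n_action = 2, opt = Descent(0.1)),
    β = 0.1,
)

update!(learner, (1, 1, 1.0, false, 2, 2))  # (s, a, r, t, s′, a′)

learner.R̄                   # ≈ 0.1, since δ = 1.0 on an all-zero table
learner.approximator(1, 1)  # ≈ 0.1 after one gradient step with α = 0.1
```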
Next, we dispatch some runtime logic to our specific learner to make sure the above `update!` function is called at the right time.
```julia
begin
    function RLBase.update!(
        p::DifferentialTDLearner,
        t::AbstractTrajectory,
        e::AbstractEnv,
        s::PreActStage,
    )
        # Skip the very first step of an episode, when no complete
        # transition (s, a, r, t, s′, a′) is available yet.
        if length(t[:terminal]) > 0
            update!(
                p,
                (
                    t[:state][end-1],
                    t[:action][end-1],
                    t[:reward][end],
                    t[:terminal][end],
                    t[:state][end],
                    t[:action][end],
                ),
            )
        end
    end

    function RLBase.update!(
        p::DifferentialTDLearner,
        t::AbstractTrajectory,
        e::AbstractEnv,
        s::PostEpisodeStage,
    )
        update!(
            p,
            (
                t[:state][end-1],
                t[:action][end-1],
                t[:reward][end],
                t[:terminal][end],
                t[:state][end],
                t[:action][end],
            ),
        )
    end
end
```
The two methods above specify that the `DifferentialTDLearner` is updated only at the `PreActStage` and the `PostEpisodeStage`. Also note that only the most recent transition is ever needed, so we can `empty!` the trajectory at the start of each episode:
```julia
function RLBase.update!(
    t::AbstractTrajectory,
    ::QBasedPolicy{<:DifferentialTDLearner},
    ::AbstractEnv,
    ::PreEpisodeStage,
)
    empty!(t)
end
```
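The traces of a `VectorSARTTrajectory` are plain vectors, so `empty!` simply clears them. A tiny illustration (not part of the experiment):

```julia
traj = VectorSARTTrajectory()
push!(traj[:state], 1)
push!(traj[:action], 1)
length(traj[:state])  # 1
empty!(traj)          # what our PreEpisodeStage method does
length(traj[:state])  # 0
```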
Access Control Environment
Before evaluating the learner implemented above, we first need to define the environment.
```julia
begin
    using Distributions

    const N_SERVERS = 10
    const PRIORITIES = [1.0, 2.0, 4.0, 8.0]
    const FREE_PROB = 0.06
    const ACCEPT_REJECT = (:accept, :reject)
    const CUSTOMERS = 1:length(PRIORITIES)
    # Map (number of free servers, customer priority) pairs to integer state ids.
    const TRANSFORMER = LinearIndices((0:N_SERVERS, CUSTOMERS))

    Base.@kwdef mutable struct AccessControlEnv <: AbstractEnv
        n_servers::Int = 10
        n_free_servers::Int = 0
        customer::Int = rand(CUSTOMERS)
        reward::Float64 = 0.0
    end

    RLBase.state_space(env::AccessControlEnv) = Base.OneTo(length(TRANSFORMER))
    RLBase.action_space(env::AccessControlEnv) = Base.OneTo(2)

    function (env::AccessControlEnv)(a)
        action, reward = ACCEPT_REJECT[a], 0.0
        if env.n_free_servers > 0 && action == :accept
            env.n_free_servers -= 1
            reward = PRIORITIES[env.customer]
        end
        # Each busy server finishes independently with probability FREE_PROB.
        env.n_free_servers += rand(Binomial(env.n_servers - env.n_free_servers, FREE_PROB))
        env.customer = rand(CUSTOMERS)
        env.reward = reward
        nothing
    end

    RLBase.reward(env::AccessControlEnv) = env.reward
    RLBase.is_terminated(env::AccessControlEnv) = false
    RLBase.state(env::AccessControlEnv) =
        TRANSFORMER[CartesianIndex(env.n_free_servers + 1, env.customer)]

    function RLBase.reset!(env::AccessControlEnv)
        env.n_free_servers = env.n_servers
        env.customer = rand(CUSTOMERS)
        env.reward = 0.0
        nothing
    end
end
```
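Before plugging the environment into an agent, we can step through it by hand (purely illustrative; the exact reward depends on the randomly drawn customer):

```julia
env = AccessControlEnv()
reset!(env)          # all servers start free
state(env)           # an integer in 1:44 encoding (free servers, customer)
env(1)               # :accept the current customer
reward(env)          # the accepted customer's priority, since a server was free
is_terminated(env)   # false — this is a continuing task
```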
Now create the environment instance used in the experiment:

```julia
world = AccessControlEnv()
```

Displaying `world` shows a summary of the environment:

# AccessControlEnv

## Traits

| Trait Type        |                                             Value |
|:----------------- | -------------------------------------------------:|
| NumAgentStyle     |           ReinforcementLearningBase.SingleAgent() |
| DynamicStyle      |            ReinforcementLearningBase.Sequential() |
| InformationStyle  |  ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle       |            ReinforcementLearningBase.Stochastic() |
| RewardStyle       |            ReinforcementLearningBase.StepReward() |
| UtilityStyle      |            ReinforcementLearningBase.GeneralSum() |
| ActionStyle       |      ReinforcementLearningBase.MinimalActionSet() |
| StateStyle        |      ReinforcementLearningBase.Observation{Any}() |
| DefaultStateStyle |      ReinforcementLearningBase.Observation{Any}() |

## Is Environment Terminated?

No

## State Space

`Base.OneTo(44)`

## Action Space

`Base.OneTo(2)`

## Current State

```
23
```

The sizes of the state and action spaces are what we need to construct the approximator:

```julia
NS = length(state_space(world))   # 44
NA = length(action_space(world))  # 2
```
Next, put everything together into an `Agent`, which combines a `QBasedPolicy` (our `DifferentialTDLearner` plus an `EpsilonGreedyExplorer`) with a `VectorSARTTrajectory`:

```julia
agent = Agent(
    policy = QBasedPolicy(
        learner = DifferentialTDLearner(
            approximator = TabularQApproximator(
                ; n_state = NS,
                n_action = NA,
                opt = Descent(0.01),
            ),
            β = 0.01,
        ),
        explorer = EpsilonGreedyExplorer(0.1),
    ),
    trajectory = VectorSARTTrajectory(),
)
```

The resulting agent looks like this:

```
Agent
├─ policy => QBasedPolicy
│  ├─ learner => DifferentialTDLearner
│  │  ├─ approximator => TabularApproximator
│  │  │  ├─ table => 2×44 Array{Float64,2}
│  │  │  └─ optimizer => Descent
│  │  │     └─ eta => 0.01
│  │  ├─ β => 0.01
│  │  └─ R̄ => 0.0
│  └─ explorer => EpsilonGreedyExplorer
│     ├─ ϵ_stable => 0.1
│     ├─ ϵ_init => 1.0
│     ├─ warmup_steps => 0
│     ├─ decay_steps => 0
│     ├─ step => 1
│     ├─ rng => Random._GLOBAL_RNG
│     └─ is_training => true
└─ trajectory => Trajectory
   └─ traces => NamedTuple
      ├─ state => 0-element Array{Int64,1}
      ├─ action => 0-element Array{Int64,1}
      ├─ reward => 0-element Array{Float32,1}
      └─ terminal => 0-element Array{Bool,1}
```
Now run the experiment for `2 * 10^6` steps:

```julia
run(agent, world, StopAfterStep(2 * 10^6; is_show_progress=false))
```
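After training, the learner's estimate of the long-run average reward per step is stored on the learner itself (shown here only as an access pattern; the value depends on the run):

```julia
agent.policy.learner.R̄  # estimated average reward per step
```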
Finally, plot the estimated value of the best action for each number of free servers, one curve per customer priority (compare with the corresponding figure in the book):

```julia
begin
    p = plot(legend = :bottomright)
    for i in 1:length(PRIORITIES)
        plot!(
            p,
            [
                agent.policy.learner.approximator(TRANSFORMER[CartesianIndex(n + 1, i)]) |> maximum
                for n in 1:N_SERVERS
            ],
            label = "priority = $(PRIORITIES[i])",
        )
    end
    p
end
```