diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl
index d9690a031..1a8f0d070 100644
--- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl
+++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl
@@ -21,31 +21,32 @@ function RLCore.Experiment(
     ::Val{:CartPole},
     seed = 123,
 )
+
     rng = StableRNG(seed)
     env = CartPoleEnv(; T=Float32, rng=rng)
-    ns, na = length(state(env)), length(first(action_space(env)))
+    ns, na = length(state(env)), length(action_space(env))
 
     agent = Agent(
         policy=QBasedPolicy(
             learner=NFQ(
-                action_space=action_space(env),
                 approximator=Approximator(
                     model=Chain(
-                        Dense(ns+na, 5, σ; init=glorot_uniform(rng)),
-                        Dense(5, 5, σ; init=glorot_uniform(rng)),
-                        Dense(5, 1; init=glorot_uniform(rng)),
+                        Dense(ns, 32, σ; init=glorot_uniform(rng)),
+                        Dense(32, 32, relu; init=glorot_uniform(rng)),
+                        Dense(32, na; init=glorot_uniform(rng)),
                     ),
-                    optimiser=RMSProp()
+                    optimiser=RMSProp(),
                 ),
                 loss_function=mse,
-                epochs=100,
+                epochs=10,
                 num_iterations=10,
                 γ = 0.95f0
             ),
             explorer=EpsilonGreedyExplorer(
                 kind=:exp,
                 ϵ_stable=0.001,
-                warmup_steps=500,
+                warmup_steps=1000,
+                decay_steps=3000,
                 rng=rng,
             ),
         ),
@@ -53,22 +54,24 @@ function RLCore.Experiment(
             container=CircularArraySARTSTraces(
                 capacity=10_000,
                 state=Float32 => (ns,),
-                action=Float32 => (na,),
             ),
             sampler=BatchSampler{SS′ART}(
                 batch_size=128,
                 rng=rng
            ),
             controller=InsertSampleRatioController(
-                threshold=100,
-                n_inserted=-1
+                threshold=1000,
+                ratio=1/10,
+                n_sampled=-1
             )
         )
     )
 
     stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
     hook = TotalRewardPerEpisode()
+
     Experiment(agent, env, stop_condition, hook)
+
 end
 
 #+ tangle=false
diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl
index 4270a3882..4e54ca808 100644
--- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl
+++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl
@@ -21,7 +21,6 @@ Neural Fitted Q-iteration as implemented in [1]
 [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. https://doi.org/10.1007/11564096_32
 """
 Base.@kwdef struct NFQ{A, R, F} <: AbstractLearner
-    action_space::AbstractVector
     approximator::A
     num_iterations::Integer = 20
     epochs::Integer = 100
@@ -34,27 +33,30 @@ end
 
 RLCore.forward(L::NFQ, s::AbstractArray) = RLCore.forward(L.approximator, s)
 
-function RLCore.forward(learner::NFQ, env::AbstractEnv)
-    as = action_space(env)
-    return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec
+function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory)
+    for batch in trajectory
+        optimise!(learner, batch)
+    end
 end
 
-function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory)
+function RLBase.optimise!(learner::NFQ, batch::NamedTuple)
     Q = learner.approximator
     γ = learner.γ
     loss_func = learner.loss_function
 
-    as = learner.action_space
-    las = length(as)
-    batch = ReinforcementLearningTrajectories.StatsBase.sample(trajectory)
-    (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]]
-    a = Float32.(a)
-    s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss))
-    for i = 1:learner.num_iterations
+    (s, a, r, s′) = batch[[:state, :action, :reward, :next_state]]
+    a = CartesianIndex.(a, 1:length(a))
+    s, a, r, s′ = map(x->send_to_device(device(Q), x), (s, a, r, s′))
+    for _ = 1:learner.num_iterations
+        q′ = vec(maximum(RLCore.forward(Q, s′); dims=1))
+        G = r .+ γ .* q′
         # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples
-        G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec)
         for _ = 1:learner.epochs
-            Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, a), transpose(G))], Q.optimiser)
+            gs = gradient(params(Q)) do
+                q = RLCore.forward(Q, s)[a]
+                loss_func(G, q)
+            end
+            RLBase.optimise!(Q, gs)
        end
     end
 end
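
The refactored learner is plain fitted Q-iteration on a state-only network: each outer iteration recomputes the targets G = r + γ max_a′ Q(s′, a′) with the current network, then regresses Q(s, a) at the taken actions onto G for `epochs` gradient steps. The sketch below reproduces that loop with vanilla Flux on a random toy batch; the sizes (ns, na, nbatch), the toy data, and the explicit-style Flux.setup/Flux.update! calls are illustrative assumptions, not the package's Approximator/optimise! machinery.

# Illustrative only: fitted Q-iteration inner loop with a state-only Q-network.
using Flux

ns, na, nbatch = 4, 2, 32                       # CartPole-like sizes (assumed)
Q   = Chain(Dense(ns, 32, σ), Dense(32, 32, relu), Dense(32, na))
opt = Flux.setup(RMSProp(), Q)

s  = rand(Float32, ns, nbatch)                  # states
a  = rand(1:na, nbatch)                         # actions taken
r  = rand(Float32, nbatch)                      # rewards
s′ = rand(Float32, ns, nbatch)                  # next states
γ  = 0.95f0

idx = CartesianIndex.(a, 1:nbatch)              # selects Q(s, a) per column
for _ in 1:10                                   # num_iterations: refit the targets
    G = r .+ γ .* vec(maximum(Q(s′); dims=1))   # greedy one-step targets
    for _ in 1:10                               # epochs: regress Q(s, a) onto G
        grads = Flux.gradient(Q) do m
            Flux.mse(m(s)[idx], G)
        end
        Flux.update!(opt, Q, grads[1])
    end
end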