JuliaReinforcementLearning · HenriDeh · Sep 28, 2023 · Sep 25, 2023 · Sep 25, 2023 · Sep 25, 2023
diff --git a/...ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/...ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl
@@ -21,54 +21,57 @@ function RLCore.Experiment(
     ::Val{:CartPole},
     seed = 123,
 )
+
     rng = StableRNG(seed)
     env = CartPoleEnv(; T=Float32, rng=rng)
-    ns, na = length(state(env)), length(first(action_space(env)))
+    ns, na = length(state(env)), length(action_space(env))
 
     agent = Agent(
         policy=QBasedPolicy(
             learner=NFQ(
-                action_space=action_space(env),
                 approximator=Approximator(
                     model=Chain(
-                        Dense(ns+na, 5, σ; init=glorot_uniform(rng)),
-                        Dense(5, 5, σ; init=glorot_uniform(rng)),
-                        Dense(5, 1; init=glorot_uniform(rng)),
+                        Dense(ns, 32, σ; init=glorot_uniform(rng)),
+                        Dense(32, 32, relu; init=glorot_uniform(rng)),
+                        Dense(32, na; init=glorot_uniform(rng)),
                     ),
-                    optimiser=RMSProp()
+                    optimiser=RMSProp(),
                 ),
                 loss_function=mse,
-                epochs=100,
+                epochs=10,
                 num_iterations=10,
                 γ = 0.95f0
             ),
             explorer=EpsilonGreedyExplorer(
                 kind=:exp,
                 ϵ_stable=0.001,
-                warmup_steps=500,
+                warmup_steps=1000,
+                decay_steps=3000,
                 rng=rng,
             ),
         ),
         trajectory=Trajectory(
             container=CircularArraySARTSTraces(
                 capacity=10_000,
                 state=Float32 => (ns,),
-                action=Float32 => (na,),
             ),
             sampler=BatchSampler{SS′ART}(
                 batch_size=128,
                 rng=rng
             ),
             controller=InsertSampleRatioController(
-                threshold=100,
-                n_inserted=-1
+                threshold=1000,
+                ratio=1/10,
+                n_sampled=-1
             )
         )
     )
 
     stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
     hook = TotalRewardPerEpisode()
+
     Experiment(agent, env, stop_condition, hook)
+
     end
 
 #+ tangle=false

diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl
@@ -21,7 +21,6 @@ Neural Fitted Q-iteration as implemented in [1]
 [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. https://doi.org/10.1007/11564096_32
 """
 Base.@kwdef struct NFQ{A, R, F} <: AbstractLearner
-    action_space::AbstractVector
     approximator::A
     num_iterations::Integer = 20
     epochs::Integer = 100
@@ -34,27 +33,37 @@ end
 
 RLCore.forward(L::NFQ, s::AbstractArray) = RLCore.forward(L.approximator, s)
 
-function RLCore.forward(learner::NFQ, env::AbstractEnv)
-    as = action_space(env)
-    return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec 
+# At the post-episode stage, run one fitting pass for every batch that `trajectory` yields.
+function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory)
+    for batch in trajectory
+        optimise!(learner, batch)
+    end
 end
 
-function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory)
+# Fit the approximator on one `batch`: each of the `num_iterations` rounds computes the
+# targets `G` once and then takes `epochs` gradient steps towards them.
+function RLBase.optimise!(learner::NFQ, batch::NamedTuple)
     Q = learner.approximator
     γ = learner.γ
     loss_func = learner.loss_function
-    as = learner.action_space
-    las = length(as)
-    batch = ReinforcementLearningTrajectories.StatsBase.sample(trajectory)
 
-    (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]]
-    a = Float32.(a)
-    s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss))
-    for i = 1:learner.num_iterations
-        # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples
-        G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec)
+    (s, a, r, s′) = batch[[:state, :action, :reward, :next_state]]
+    # (action, sample) index pairs; used below to pick Q(s, a) out of the network output.
+    a = CartesianIndex.(a, 1:length(a))
+    s, a, r, s′ = map(x->send_to_device(device(Q), x), (s, a, r, s′))
+    for _ = 1:learner.num_iterations
+        # Targets are computed outside the gradient block, so they stay fixed for all epochs.
+        # NOTE(review): only r and s′ enter the target (no terminal flag is read), so it also
+        # bootstraps from the next state of terminal transitions; confirm this is intended.
+        q′ = vec(maximum(RLCore.forward(Q, s′); dims=1))
+        G = r .+ γ .* q′
         for _ = 1:learner.epochs
-            Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, a), transpose(G))], Q.optimiser)
+            gs = gradient(params(Q)) do
+                q = RLCore.forward(Q, s)[a]
+                # Prediction first, target second, as in the replaced `Flux.train!` call.
+                loss_func(q, G)
+            end
+            RLBase.optimise!(Q, gs)
         end
     end
 end