From a23b1559ae551b58643ef8b87b00e8cfbc97c736 Mon Sep 17 00:00:00 2001 From: "Randall C. O'Reilly" Date: Fri, 22 Nov 2024 02:24:05 -0800 Subject: [PATCH] NewState implemented in GPU kernel; now learning as effectively in GPU as CPU. also fixed tests --- axon/act-layer.go | 38 +++++++++++-- axon/act-layer.goal | 38 +++++++++++-- axon/act-net.go | 41 ++++++++++++++ axon/act-net.goal | 41 ++++++++++++++ axon/act-path.go | 14 ++--- axon/act-path.goal | 14 ++--- axon/act.go | 22 ++++---- axon/act.goal | 22 ++++---- axon/basic_test.go | 19 ++++--- axon/basic_test.goal | 21 ++++--- axon/gosl.go | 129 +++++++++++++++++++++++++++++++++++++++++++ axon/init-layer.go | 61 +------------------- axon/init-layer.goal | 62 +-------------------- axon/init-net.go | 27 ++------- axon/init-net.goal | 26 ++------- axon/init-path.go | 2 +- axon/init-path.goal | 2 +- 17 files changed, 349 insertions(+), 230 deletions(-) diff --git a/axon/act-layer.go b/axon/act-layer.go index 4bd57daa..7589aef5 100644 --- a/axon/act-layer.go +++ b/axon/act-layer.go @@ -850,6 +850,38 @@ func (ly *LayerParams) CyclePostVSPatchLayer(ctx *Context, pi, di uint32, spi in //////// Phase timescale +// NewStateLayer does NewState at the layer level, called +func (ly *LayerParams) NewStateLayer(ctx *Context) { + actMinusAvg := float32(0) + actPlusAvg := float32(0) + np := uint32(ly.Indexes.NPools) + + for di := uint32(0); di < ctx.NData; di++ { + lpi := ly.PoolIndex(0) + + actMinusAvg += PoolAvgMax(AMAct, AMMinus, Avg, lpi, di) + actPlusAvg += PoolAvgMax(AMAct, AMPlus, Avg, lpi, di) + + ly.Acts.Clamp.IsInput.SetBool(ly.IsInput()) + ly.Acts.Clamp.IsTarget.SetBool(ly.IsTarget()) + LayerStates.Set(-1.0, int(ly.Index), int(LayerRT), int(di)) + + for spi := uint32(0); spi < np; spi++ { + pi := ly.PoolIndex(spi) + ly.NewStatePool(ctx, pi, di) // also calls DecayState on pool + } + } + + // note: long-running averages must be based on aggregate data, drive adaptation + // of Gi layer inhibition. + davg := 1 / float32(ctx.NData) + actMinusAvg *= davg + actPlusAvg *= davg + for di := uint32(0); di < ctx.NData; di++ { + ly.NewStateLayerActAvg(ctx, di, actMinusAvg, actPlusAvg) + } +} + // NewStateLayerActAvg updates ActAvg.ActMAvg and ActPAvg based on current values // that have been averaged across NData already. 
func (ly *LayerParams) NewStateLayerActAvg(ctx *Context, di uint32, actMinusAvg, actPlusAvg float32) { @@ -861,12 +893,6 @@ func (ly *LayerParams) NewStateLayerActAvg(ctx *Context, di uint32, actMinusAvg, LayerStates.Set(pavg, int(ly.Index), int(LayerActPAvg), int(di)) } -func (ly *LayerParams) NewStateLayer(ctx *Context, di uint32) { - ly.Acts.Clamp.IsInput.SetBool(ly.IsInput()) - ly.Acts.Clamp.IsTarget.SetBool(ly.IsTarget()) - LayerStates.Set(-1, int(ly.Index), int(LayerRT), int(di)) -} - func (ly *LayerParams) NewStatePool(ctx *Context, pi, di uint32) { PoolsInt.Set(0, int(pi), int(Clamped), int(di)) if ly.Acts.Clamp.Add.IsFalse() && ly.Acts.Clamp.IsInput.IsTrue() { diff --git a/axon/act-layer.goal b/axon/act-layer.goal index f65bb6c3..68c08590 100644 --- a/axon/act-layer.goal +++ b/axon/act-layer.goal @@ -849,6 +849,38 @@ func (ly *LayerParams) CyclePostVSPatchLayer(ctx *Context, pi, di uint32, spi in //////// Phase timescale +// NewStateLayer does NewState at the layer level, called +func (ly *LayerParams) NewStateLayer(ctx *Context) { + actMinusAvg := float32(0) + actPlusAvg := float32(0) + np := uint32(ly.Indexes.NPools) + + for di := uint32(0); di < ctx.NData; di++ { + lpi := ly.PoolIndex(0) + + actMinusAvg += PoolAvgMax(AMAct, AMMinus, Avg, lpi, di) + actPlusAvg += PoolAvgMax(AMAct, AMPlus, Avg, lpi, di) + + ly.Acts.Clamp.IsInput.SetBool(ly.IsInput()) + ly.Acts.Clamp.IsTarget.SetBool(ly.IsTarget()) + LayerStates[ly.Index, LayerRT, di] = -1.0 + + for spi := uint32(0); spi < np; spi++ { + pi := ly.PoolIndex(spi) + ly.NewStatePool(ctx, pi, di) // also calls DecayState on pool + } + } + + // note: long-running averages must be based on aggregate data, drive adaptation + // of Gi layer inhibition. + davg := 1 / float32(ctx.NData) + actMinusAvg *= davg + actPlusAvg *= davg + for di := uint32(0); di < ctx.NData; di++ { + ly.NewStateLayerActAvg(ctx, di, actMinusAvg, actPlusAvg) + } +} + // NewStateLayerActAvg updates ActAvg.ActMAvg and ActPAvg based on current values // that have been averaged across NData already. func (ly *LayerParams) NewStateLayerActAvg(ctx *Context, di uint32, actMinusAvg, actPlusAvg float32) { @@ -860,12 +892,6 @@ func (ly *LayerParams) NewStateLayerActAvg(ctx *Context, di uint32, actMinusAvg, LayerStates[ly.Index, LayerActPAvg, di] = pavg } -func (ly *LayerParams) NewStateLayer(ctx *Context, di uint32) { - ly.Acts.Clamp.IsInput.SetBool(ly.IsInput()) - ly.Acts.Clamp.IsTarget.SetBool(ly.IsTarget()) - LayerStates[ly.Index, LayerRT, di] = -1 -} - func (ly *LayerParams) NewStatePool(ctx *Context, pi, di uint32) { PoolsInt[pi, Clamped, di] = 0 if ly.Acts.Clamp.Add.IsFalse() && ly.Acts.Clamp.IsInput.IsTrue() { diff --git a/axon/act-net.go b/axon/act-net.go index efd9af07..6dc32d5d 100644 --- a/axon/act-net.go +++ b/axon/act-net.go @@ -6,6 +6,8 @@ package axon +import "cogentcore.org/core/enums" + // todo: don't even need layer-level ultimately. // Cycle runs n cycles of activation updating. @@ -61,6 +63,21 @@ func (nt *Network) Cycle(ncyc int, getNeurons bool) { // } } +// NewState handles all initialization at start of new input pattern. +// This is called *before* applying external input data and operates across +// all data parallel values. The current Context.NData should be set +// properly prior to calling this and subsequent Cycle methods. 
+func (nt *Network) NewState(mode enums.Enum, testing bool) { + nix := nt.NetIxs() + ctx := nt.Context() + nd := int(nix.NNeurons * ctx.NData) + ctx.NewState(mode, testing) + ToGPUCtxGlobal() + RunNewStateLayer(int(nix.NLayers)) + RunNewStateNeuron(nd) + RunInitGBuffsPath(int(nix.NPaths)) +} + // InitExt initializes external input state. // Call prior to applying external inputs to layers. func (nt *Network) InitExt() { @@ -295,6 +312,30 @@ func ApplyExtsNeuron(i uint32) { //gosl:kernel Layers[li].ApplyExtsNeuron(ni, di) } +// NewStateLayer is the kernel over Layers (not Data) +// which does new state on pools as well. +func NewStateLayer(li uint32) { //gosl:kernel + ctx := GetCtx(0) + Layers[li].NewStateLayer(ctx) +} + +// NewStateNeuron is the kernel over Neurons * Data to +// do new state on neurons (decay). +func NewStateNeuron(i uint32) { //gosl:kernel + ctx := GetCtx(0) + di := ctx.DataIndex(i) + ni := ctx.ItemIndex(i) + li := NeuronIxs.Value(int(ni), int(NrnLayIndex)) + Layers[li].NewStateNeuron(ctx, ni, di) +} + +// InitGBuffsPath is the kernel over Paths to +// initialize PathGBuf, PathGSyns. +func InitGBuffsPath(pti uint32) { //gosl:kernel + ctx := GetCtx(0) + Paths[pti].InitGBuffs(ctx) +} + // MinusPhasePool is the kernel over Pools * Data to // do pool-level updating after end of minus phase. func MinusPhasePool(i uint32) { //gosl:kernel diff --git a/axon/act-net.goal b/axon/act-net.goal index 4a7f5dc7..8ec23efb 100644 --- a/axon/act-net.goal +++ b/axon/act-net.goal @@ -4,6 +4,8 @@ package axon +import "cogentcore.org/core/enums" + // todo: don't even need layer-level ultimately. // Cycle runs n cycles of activation updating. @@ -55,6 +57,21 @@ func (nt *Network) Cycle(ncyc int, getNeurons bool) { // } } +// NewState handles all initialization at start of new input pattern. +// This is called *before* applying external input data and operates across +// all data parallel values. The current Context.NData should be set +// properly prior to calling this and subsequent Cycle methods. +func (nt *Network) NewState(mode enums.Enum, testing bool) { + nix := nt.NetIxs() + ctx := nt.Context() + nd := int(nix.NNeurons * ctx.NData) + ctx.NewState(mode, testing) + ToGPUCtxGlobal() + RunNewStateLayer(int(nix.NLayers)) + RunNewStateNeuron(nd) + RunInitGBuffsPath(int(nix.NPaths)) +} + // InitExt initializes external input state. // Call prior to applying external inputs to layers. func (nt *Network) InitExt() { @@ -289,6 +306,30 @@ func ApplyExtsNeuron(i uint32) { //gosl:kernel Layers[li].ApplyExtsNeuron(ni, di) } +// NewStateLayer is the kernel over Layers (not Data) +// which does new state on pools as well. +func NewStateLayer(li uint32) { //gosl:kernel + ctx := GetCtx(0) + Layers[li].NewStateLayer(ctx) +} + +// NewStateNeuron is the kernel over Neurons * Data to +// do new state on neurons (decay). +func NewStateNeuron(i uint32) { //gosl:kernel + ctx := GetCtx(0) + di := ctx.DataIndex(i) + ni := ctx.ItemIndex(i) + li := NeuronIxs[ni, NrnLayIndex] + Layers[li].NewStateNeuron(ctx, ni, di) +} + +// InitGBuffsPath is the kernel over Paths to +// initialize PathGBuf, PathGSyns. +func InitGBuffsPath(pti uint32) { //gosl:kernel + ctx := GetCtx(0) + Paths[pti].InitGBuffs(ctx) +} + // MinusPhasePool is the kernel over Pools * Data to // do pool-level updating after end of minus phase. 
func MinusPhasePool(i uint32) { //gosl:kernel diff --git a/axon/act-path.go b/axon/act-path.go index 8dff0181..ba6a6d96 100644 --- a/axon/act-path.go +++ b/axon/act-path.go @@ -306,21 +306,21 @@ func (pt *PathParams) SendSpike(ctx *Context, ni, di, lni uint32) { // This is not typically needed (called during InitWeights, InitActs) // but can be called when needed. Must be called to completely initialize // prior activity, e.g., full Glong clearing. -func (pt *PathParams) InitGBuffs() { +func (pt *PathParams) InitGBuffs(ctx *Context) { nix := GetNetworkIxs(0) maxd := nix.MaxData mdel := nix.MaxDelay + 1 rnn := pt.Indexes.RecvNeurN npst := pt.Indexes.NPathNeurSt - for dl := range mdel { - for ri := range rnn { - for di := range maxd { - PathGBuf.Set(0.0, int(npst+ri), int(dl), int(di)) + for dl := uint32(0); dl < mdel; dl++ { + for ri := uint32(0); ri < rnn; ri++ { + for di := uint32(0); di < maxd; di++ { + PathGBuf.Set(0, int(npst+ri), int(dl), int(di)) } } } - for ri := range rnn { - for di := range maxd { + for ri := uint32(0); ri < rnn; ri++ { + for di := uint32(0); di < maxd; di++ { PathGSyns.Set(0.0, int(npst+ri), int(di)) } } diff --git a/axon/act-path.goal b/axon/act-path.goal index b3ca91c1..abdec7e3 100644 --- a/axon/act-path.goal +++ b/axon/act-path.goal @@ -304,21 +304,21 @@ func (pt *PathParams) SendSpike(ctx *Context, ni, di, lni uint32) { // This is not typically needed (called during InitWeights, InitActs) // but can be called when needed. Must be called to completely initialize // prior activity, e.g., full Glong clearing. -func (pt *PathParams) InitGBuffs() { +func (pt *PathParams) InitGBuffs(ctx *Context) { nix := GetNetworkIxs(0) maxd := nix.MaxData mdel := nix.MaxDelay + 1 rnn := pt.Indexes.RecvNeurN npst := pt.Indexes.NPathNeurSt - for dl := range mdel { - for ri := range rnn { - for di := range maxd { - PathGBuf[npst+ri, dl, di] = 0.0 + for dl := uint32(0); dl < mdel; dl++ { + for ri := uint32(0); ri < rnn; ri++ { + for di := uint32(0); di < maxd; di++ { + PathGBuf[npst+ri, dl, di] = 0 } } } - for ri := range rnn { - for di := range maxd { + for ri := uint32(0); ri < rnn; ri++ { + for di := uint32(0); di < maxd; di++ { PathGSyns[npst+ri, di] = 0.0 } } diff --git a/axon/act.go b/axon/act.go index be5ec412..f6ed1209 100644 --- a/axon/act.go +++ b/axon/act.go @@ -891,15 +891,15 @@ func (ac *ActParams) DecayAHP(ctx *Context, ni, di uint32, decay float32) { // Called with ac.Decay.Act by Layer during NewState func (ac *ActParams) DecayState(ctx *Context, ni, di uint32, decay, glong, ahp float32) { // always reset these -- otherwise get insanely large values that take forever to update - Neurons.Set(-1, int(ni), int(ISIAvg), int(di)) + Neurons.Set(-1.0, int(ni), int(ISIAvg), int(di)) Neurons.Set(ac.Init.Act, int(ni), int(ActInt), int(di)) - Neurons.Set(0, int(ni), int(Spiked), int(di)) + Neurons.Set(0.0, int(ni), int(Spiked), int(di)) for i := range 8 { Neurons.Set(0.0, int(ni), int(SpkBin0+NeuronVars(i)), int(di)) } if decay > 0 { // no-op for most, but not all.. 
- Neurons.Set(0, int(ni), int(Spike), int(di)) + Neurons.Set(0.0, int(ni), int(Spike), int(di)) Neurons.SetSub(decay*(Neurons.Value(int(ni), int(Act), int(di))-ac.Init.Act), int(ni), int(Act), int(di)) Neurons.SetSub(decay*(Neurons.Value(int(ni), int(ActInt), int(di))-ac.Init.Act), int(ni), int(ActInt), int(di)) Neurons.SetSub(decay*(Neurons.Value(int(ni), int(GeSyn), int(di))-NeuronAvgs.Value(int(ni), int(GeBase))), int(ni), int(GeSyn), int(di)) @@ -945,14 +945,14 @@ func (ac *ActParams) DecayState(ctx *Context, ni, di uint32, decay, glong, ahp f ac.DecayLearnCa(ctx, ni, di, ac.Decay.LearnCa) } - Neurons.Set(0, int(ni), int(Inet), int(di)) - Neurons.Set(0, int(ni), int(GeRaw), int(di)) - Neurons.Set(0, int(ni), int(GiRaw), int(di)) - Neurons.Set(0, int(ni), int(GModRaw), int(di)) - Neurons.Set(0, int(ni), int(GModSyn), int(di)) - Neurons.Set(0, int(ni), int(GMaintRaw), int(di)) - Neurons.Set(0, int(ni), int(SSGiDend), int(di)) - Neurons.Set(0, int(ni), int(GeExt), int(di)) + Neurons.Set(0.0, int(ni), int(Inet), int(di)) + Neurons.Set(0.0, int(ni), int(GeRaw), int(di)) + Neurons.Set(0.0, int(ni), int(GiRaw), int(di)) + Neurons.Set(0.0, int(ni), int(GModRaw), int(di)) + Neurons.Set(0.0, int(ni), int(GModSyn), int(di)) + Neurons.Set(0.0, int(ni), int(GMaintRaw), int(di)) + Neurons.Set(0.0, int(ni), int(SSGiDend), int(di)) + Neurons.Set(0.0, int(ni), int(GeExt), int(di)) Neurons.SetSub(glong*Neurons.Value(int(ni), int(CtxtGeOrig), int(di)), int(ni), int(CtxtGeOrig), int(di)) } diff --git a/axon/act.goal b/axon/act.goal index 5f624b2f..55efcf4b 100644 --- a/axon/act.goal +++ b/axon/act.goal @@ -889,15 +889,15 @@ func (ac *ActParams) DecayAHP(ctx *Context, ni, di uint32, decay float32) { // Called with ac.Decay.Act by Layer during NewState func (ac *ActParams) DecayState(ctx *Context, ni, di uint32, decay, glong, ahp float32) { // always reset these -- otherwise get insanely large values that take forever to update - Neurons[ni, ISIAvg, di] = -1 + Neurons[ni, ISIAvg, di] = -1.0 Neurons[ni, ActInt, di] = ac.Init.Act - Neurons[ni, Spiked, di] = 0 + Neurons[ni, Spiked, di] = 0.0 for i := range 8 { Neurons[ni, SpkBin0+NeuronVars(i), di] = 0.0 } if decay > 0 { // no-op for most, but not all.. 
- Neurons[ni, Spike, di] = 0 + Neurons[ni, Spike, di] = 0.0 Neurons[ni, Act, di] -= decay * (Neurons[ni, Act, di] - ac.Init.Act) Neurons[ni, ActInt, di] -= decay * (Neurons[ni, ActInt, di] - ac.Init.Act) Neurons[ni, GeSyn, di] -= decay * (Neurons[ni, GeSyn, di] - NeuronAvgs[ni, GeBase]) @@ -943,14 +943,14 @@ func (ac *ActParams) DecayState(ctx *Context, ni, di uint32, decay, glong, ahp f ac.DecayLearnCa(ctx, ni, di, ac.Decay.LearnCa) } - Neurons[ni, Inet, di] = 0 - Neurons[ni, GeRaw, di] = 0 - Neurons[ni, GiRaw, di] = 0 - Neurons[ni, GModRaw, di] = 0 - Neurons[ni, GModSyn, di] = 0 - Neurons[ni, GMaintRaw, di] = 0 - Neurons[ni, SSGiDend, di] = 0 - Neurons[ni, GeExt, di] = 0 + Neurons[ni, Inet, di] = 0.0 + Neurons[ni, GeRaw, di] = 0.0 + Neurons[ni, GiRaw, di] = 0.0 + Neurons[ni, GModRaw, di] = 0.0 + Neurons[ni, GModSyn, di] = 0.0 + Neurons[ni, GMaintRaw, di] = 0.0 + Neurons[ni, SSGiDend, di] = 0.0 + Neurons[ni, GeExt, di] = 0.0 Neurons[ni, CtxtGeOrig, di] -= glong * Neurons[ni, CtxtGeOrig, di] } diff --git a/axon/basic_test.go b/axon/basic_test.go index ad9b0ec3..5b95f768 100644 --- a/axon/basic_test.go +++ b/axon/basic_test.go @@ -395,10 +395,11 @@ func TestNetActShort(t *testing.T) { } func TestGPUAct(t *testing.T) { - if os.Getenv("TEST_GPU") != "true" { - t.Skip("Set TEST_GPU env var to run GPU tests") - } + // if os.Getenv("TEST_GPU") != "true" { + // t.Skip("Set TEST_GPU env var to run GPU tests") + // } NetActTestShort(t, Tol6, true) + // NetActTest(t, Tol6, true) } // NetActTest runs an activation test on the network and checks @@ -523,6 +524,7 @@ func NetActTest(t *testing.T, tol float32, gpu bool) { } GPURelease() + UseGPU = false } // NetActTestShort runs an activation test on the network and checks @@ -619,6 +621,7 @@ func NetActTestShort(t *testing.T, tol float32, gpu bool) { testNet.PlusPhase() } GPURelease() + UseGPU = false } // ReportValDiffs -- reports diffs between a, b values at given tolerance @@ -767,6 +770,7 @@ func RunDebugAct(t *testing.T, testNet *Network, printValues bool, gpu bool, ini } GPURelease() + UseGPU = false return valMap } @@ -959,6 +963,7 @@ func NetTestLearn(t *testing.T, tol float32, gpu bool) { CompareFloats(tol, outwt, outWts, "outWts", t) GPURelease() + UseGPU = false } func TestNetRLRate(t *testing.T) { @@ -986,7 +991,7 @@ func NetTestRLRate(t *testing.T, tol float32, gpu bool) { printCycs := false printQtrs := false - patHidRLRates := []float32{5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 9.572107e-05, 0.005967056, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 8.116122e-05, 8.9521294e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 7.533143e-05, 0.003373957} + patHidRLRates := []float32{5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 0.000100934616, 0.0029769302, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 8.4988904e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 7.806076e-05, 0.002506172} // these are organized by pattern within and then by test iteration (params) outer // only the single active synapse is represented -- one per pattern @@ -1002,9 +1007,9 @@ func NetTestRLRate(t *testing.T, tol float32, gpu bool) { q3outCaP := make([]float32, 4*nLearnParams) q3outCaD := make([]float32, 4*nLearnParams) - hidDwts := []float32{1.708368e-07, 2.9798115e-05, 3.612578e-07, 1.5011935e-05} + hidDwts := []float32{1.708368e-07, 1.486611e-05, 2.0177201e-07, 1.1150851e-05} outDwts := []float32{0.0009000063, 0.005000009, 0.0022999954, 0.0025999974} - hidWts := []float32{0.5000011, 0.5001788, 0.50000215, 0.5000901} 
+ hidWts := []float32{0.5000011, 0.50008905, 0.5000011, 0.5000669} outWts := []float32{0.5053999, 0.52996504, 0.51379675, 0.51559514} hiddwt := make([]float32, 4*nLearnParams) @@ -1296,7 +1301,7 @@ func TestDebugLearn(t *testing.T) { func TestNDataLearn(t *testing.T) { nd1Values := NetDebugLearn(t, false, false, 1, 1, true, false, false) nd4Values := NetDebugLearn(t, false, false, 4, 4, true, false, false) - ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4", "DWt", "ActAvg", "DTrgAvg", "LayerAvgMaxGeM", "LayerAvgMaxGiM") // todo: LayerAvgMaxGeM and GiM need further investigation + ReportValDiffs(t, Tol7, nd1Values, nd4Values, "nData = 1", "nData = 4", "DWt", "ActAvg", "DTrgAvg", "LayerAvgMaxGeM", "LayerAvgMaxGiM") // todo: LayerAvgMaxGeM and GiM need further investigation } func TestNDataMaxDataLearn(t *testing.T) { diff --git a/axon/basic_test.goal b/axon/basic_test.goal index 9bd17041..8809f078 100644 --- a/axon/basic_test.goal +++ b/axon/basic_test.goal @@ -395,10 +395,11 @@ func TestNetActShort(t *testing.T) { } func TestGPUAct(t *testing.T) { - if os.Getenv("TEST_GPU") != "true" { - t.Skip("Set TEST_GPU env var to run GPU tests") - } + // if os.Getenv("TEST_GPU") != "true" { + // t.Skip("Set TEST_GPU env var to run GPU tests") + // } NetActTestShort(t, Tol6, true) + // NetActTest(t, Tol6, true) } // NetActTest runs an activation test on the network and checks @@ -523,6 +524,7 @@ func NetActTest(t *testing.T, tol float32, gpu bool) { } GPURelease() + UseGPU = false } // NetActTestShort runs an activation test on the network and checks @@ -619,6 +621,7 @@ func NetActTestShort(t *testing.T, tol float32, gpu bool) { testNet.PlusPhase() } GPURelease() + UseGPU = false } // ReportValDiffs -- reports diffs between a, b values at given tolerance @@ -767,6 +770,7 @@ func RunDebugAct(t *testing.T, testNet *Network, printValues bool, gpu bool, ini } GPURelease() + UseGPU = false return valMap } @@ -959,6 +963,7 @@ func NetTestLearn(t *testing.T, tol float32, gpu bool) { CompareFloats(tol, outwt, outWts, "outWts", t) GPURelease() + UseGPU = false } func TestNetRLRate(t *testing.T) { @@ -986,8 +991,8 @@ func NetTestRLRate(t *testing.T, tol float32, gpu bool) { printCycs := false printQtrs := false - patHidRLRates := []float32{5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 9.572107e-05, 0.005967056, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 8.116122e-05, 8.9521294e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 7.533143e-05, 0.003373957} - + patHidRLRates := []float32{5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 0.000100934616, 0.0029769302, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 8.4988904e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 5.0000002e-05, 7.806076e-05, 0.002506172} + // these are organized by pattern within and then by test iteration (params) outer // only the single active synapse is represented -- one per pattern // if there are differences, they will multiply over patterns and layers.. 
@@ -1002,9 +1007,9 @@ func NetTestRLRate(t *testing.T, tol float32, gpu bool) { q3outCaP := make([]float32, 4*nLearnParams) q3outCaD := make([]float32, 4*nLearnParams) - hidDwts := []float32{1.708368e-07, 2.9798115e-05, 3.612578e-07, 1.5011935e-05} + hidDwts := []float32{1.708368e-07, 1.486611e-05, 2.0177201e-07, 1.1150851e-05} outDwts := []float32{0.0009000063, 0.005000009, 0.0022999954, 0.0025999974} - hidWts := []float32{0.5000011, 0.5001788, 0.50000215, 0.5000901} + hidWts := []float32{0.5000011, 0.50008905, 0.5000011, 0.5000669} outWts := []float32{0.5053999, 0.52996504, 0.51379675, 0.51559514} hiddwt := make([]float32, 4*nLearnParams) @@ -1296,7 +1301,7 @@ func TestDebugLearn(t *testing.T) { func TestNDataLearn(t *testing.T) { nd1Values := NetDebugLearn(t, false, false, 1, 1, true, false, false) nd4Values := NetDebugLearn(t, false, false, 4, 4, true, false, false) - ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4", "DWt", "ActAvg", "DTrgAvg", "LayerAvgMaxGeM", "LayerAvgMaxGiM") // todo: LayerAvgMaxGeM and GiM need further investigation + ReportValDiffs(t, Tol7, nd1Values, nd4Values, "nData = 1", "nData = 4", "DWt", "ActAvg", "DTrgAvg", "LayerAvgMaxGeM", "LayerAvgMaxGiM") // todo: LayerAvgMaxGeM and GiM need further investigation } func TestNDataMaxDataLearn(t *testing.T) { diff --git a/axon/gosl.go b/axon/gosl.go index f026a0fe..5a6ba35f 100644 --- a/axon/gosl.go +++ b/axon/gosl.go @@ -71,9 +71,12 @@ func GPUInit() { gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtSyn.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/GPUTestWrite.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/GatherSpikes.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/InitGBuffsPath.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/LayerGi.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhaseNeuron.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhasePool.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/NewStateLayer.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/NewStateNeuron.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseNeuron.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhasePool.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseStartNeuron.wgsl", sy) @@ -574,6 +577,48 @@ func RunOneGatherSpikes(n int, syncVars ...GPUVars) { RunGatherSpikesCPU(n) } } +// RunInitGBuffsPath runs the InitGBuffsPath kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// Can call multiple Run* kernels in a row, which are then all launched +// in the same command submission on the GPU, which is by far the most efficient. +// MUST call RunDone (with optional vars to sync) after all Run calls. +// Alternatively, a single-shot RunOneInitGBuffsPath call does Run and Done for a +// single run-and-sync case. +func RunInitGBuffsPath(n int) { + if UseGPU { + RunInitGBuffsPathGPU(n) + } else { + RunInitGBuffsPathCPU(n) + } +} + +// RunInitGBuffsPathGPU runs the InitGBuffsPath kernel on the GPU. See [RunInitGBuffsPath] for more info. +func RunInitGBuffsPathGPU(n int) { + sy := GPUSystem + pl := sy.ComputePipelines["InitGBuffsPath"] + ce, _ := sy.BeginComputePass() + pl.Dispatch1D(ce, n, 64) +} + +// RunInitGBuffsPathCPU runs the InitGBuffsPath kernel on the CPU. 
+func RunInitGBuffsPathCPU(n int) { + gpu.VectorizeFunc(0, n, InitGBuffsPath) +} + +// RunOneInitGBuffsPath runs the InitGBuffsPath kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// This version then calls RunDone with the given variables to sync +// after the Run, for a single-shot Run-and-Done call. If multiple kernels +// can be run in sequence, it is much more efficient to do multiple Run* +// calls followed by a RunDone call. +func RunOneInitGBuffsPath(n int, syncVars ...GPUVars) { + if UseGPU { + RunInitGBuffsPathGPU(n) + RunDone(syncVars...) + } else { + RunInitGBuffsPathCPU(n) + } +} // RunLayerGi runs the LayerGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched @@ -700,6 +745,90 @@ func RunOneMinusPhasePool(n int, syncVars ...GPUVars) { RunMinusPhasePoolCPU(n) } } +// RunNewStateLayer runs the NewStateLayer kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// Can call multiple Run* kernels in a row, which are then all launched +// in the same command submission on the GPU, which is by far the most efficient. +// MUST call RunDone (with optional vars to sync) after all Run calls. +// Alternatively, a single-shot RunOneNewStateLayer call does Run and Done for a +// single run-and-sync case. +func RunNewStateLayer(n int) { + if UseGPU { + RunNewStateLayerGPU(n) + } else { + RunNewStateLayerCPU(n) + } +} + +// RunNewStateLayerGPU runs the NewStateLayer kernel on the GPU. See [RunNewStateLayer] for more info. +func RunNewStateLayerGPU(n int) { + sy := GPUSystem + pl := sy.ComputePipelines["NewStateLayer"] + ce, _ := sy.BeginComputePass() + pl.Dispatch1D(ce, n, 64) +} + +// RunNewStateLayerCPU runs the NewStateLayer kernel on the CPU. +func RunNewStateLayerCPU(n int) { + gpu.VectorizeFunc(0, n, NewStateLayer) +} + +// RunOneNewStateLayer runs the NewStateLayer kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// This version then calls RunDone with the given variables to sync +// after the Run, for a single-shot Run-and-Done call. If multiple kernels +// can be run in sequence, it is much more efficient to do multiple Run* +// calls followed by a RunDone call. +func RunOneNewStateLayer(n int, syncVars ...GPUVars) { + if UseGPU { + RunNewStateLayerGPU(n) + RunDone(syncVars...) + } else { + RunNewStateLayerCPU(n) + } +} +// RunNewStateNeuron runs the NewStateNeuron kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// Can call multiple Run* kernels in a row, which are then all launched +// in the same command submission on the GPU, which is by far the most efficient. +// MUST call RunDone (with optional vars to sync) after all Run calls. +// Alternatively, a single-shot RunOneNewStateNeuron call does Run and Done for a +// single run-and-sync case. +func RunNewStateNeuron(n int) { + if UseGPU { + RunNewStateNeuronGPU(n) + } else { + RunNewStateNeuronCPU(n) + } +} + +// RunNewStateNeuronGPU runs the NewStateNeuron kernel on the GPU. See [RunNewStateNeuron] for more info. +func RunNewStateNeuronGPU(n int) { + sy := GPUSystem + pl := sy.ComputePipelines["NewStateNeuron"] + ce, _ := sy.BeginComputePass() + pl.Dispatch1D(ce, n, 64) +} + +// RunNewStateNeuronCPU runs the NewStateNeuron kernel on the CPU. 
+func RunNewStateNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, NewStateNeuron) +} + +// RunOneNewStateNeuron runs the NewStateNeuron kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// This version then calls RunDone with the given variables to sync +// after the Run, for a single-shot Run-and-Done call. If multiple kernels +// can be run in sequence, it is much more efficient to do multiple Run* +// calls followed by a RunDone call. +func RunOneNewStateNeuron(n int, syncVars ...GPUVars) { + if UseGPU { + RunNewStateNeuronGPU(n) + RunDone(syncVars...) + } else { + RunNewStateNeuronCPU(n) + } +} // RunPlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched diff --git a/axon/init-layer.go b/axon/init-layer.go index 46559272..e8951dd2 100644 --- a/axon/init-layer.go +++ b/axon/init-layer.go @@ -10,8 +10,6 @@ import ( "cogentcore.org/core/base/randx" ) -// todo: all of this could be done on LayerParams / gpu - // InitWeights initializes the weight values in the network, i.e., resetting learning // Also calls InitActs func (ly *Layer) InitWeights(ctx *Context, nt *Network) { //types:add @@ -184,18 +182,7 @@ func (ly *Layer) InitActs(ctx *Context) { //types:add // Target layers are dynamically updated } } - ly.InitPathGBuffs(ctx) -} - -// InitPathGBuffs initializes the pathway-level conductance buffers and -// conductance integration values for receiving pathways in this layer. -func (ly *Layer) InitPathGBuffs(ctx *Context) { - for _, pt := range ly.RecvPaths { - if pt.Off { - continue - } - pt.Params.InitGBuffs() - } + // ly.InitPathGBuffs(ctx) } // InitWeightsSym initializes the weight symmetry -- higher layers copy weights from lower layers @@ -297,52 +284,6 @@ func (ly *Layer) InitGScale(ctx *Context) { } } -// NewState handles all initialization at start of new input pattern. -// Does NOT call InitGScale() -func (ly *Layer) NewState(ctx *Context) { - nn := ly.NNeurons - np := ly.NPools - - actMinusAvg := float32(0) - actPlusAvg := float32(0) - - for di := uint32(0); di < ctx.NData; di++ { - lpi := ly.Params.PoolIndex(0) - - actMinusAvg += PoolAvgMax(AMAct, AMMinus, Avg, lpi, di) - actPlusAvg += PoolAvgMax(AMAct, AMPlus, Avg, lpi, di) - - ly.Params.NewStateLayer(ctx, di) - - for spi := uint32(0); spi < np; spi++ { - pi := ly.Params.PoolIndex(spi) - ly.Params.NewStatePool(ctx, pi, di) // also calls DecayState on pool - } - - for lni := uint32(0); lni < nn; lni++ { - ni := ly.NeurStIndex + lni - if NeuronIsOff(ni) { - continue - } - // note: this calls the basic neuron-level DecayState - ly.Params.NewStateNeuron(ctx, ni, di) - } - } - - // note: long-running averages must be based on aggregate data, drive adaptation - // of Gi layer inhibition. 
- davg := 1 / float32(ctx.NData) - actMinusAvg *= davg - actPlusAvg *= davg - for di := uint32(0); di < ctx.NData; di++ { - ly.Params.NewStateLayerActAvg(ctx, di, actMinusAvg, actPlusAvg) - } - - // note: would be somewhat more expensive to only clear the di specific subset - // but all di are decayed every trial anyway so no big deal - ly.InitPathGBuffs(ctx) -} - // NewStateNeurons only calls the neurons part of new state -- for misbehaving GPU func (ly *Layer) NewStateNeurons(ctx *Context) { nn := ly.NNeurons diff --git a/axon/init-layer.goal b/axon/init-layer.goal index 227d68a5..76eee1c5 100644 --- a/axon/init-layer.goal +++ b/axon/init-layer.goal @@ -8,8 +8,6 @@ import ( "cogentcore.org/core/base/randx" ) -// todo: all of this could be done on LayerParams / gpu - // InitWeights initializes the weight values in the network, i.e., resetting learning // Also calls InitActs func (ly *Layer) InitWeights(ctx *Context, nt *Network) { //types:add @@ -182,18 +180,7 @@ func (ly *Layer) InitActs(ctx *Context) { //types:add // Target layers are dynamically updated } } - ly.InitPathGBuffs(ctx) -} - -// InitPathGBuffs initializes the pathway-level conductance buffers and -// conductance integration values for receiving pathways in this layer. -func (ly *Layer) InitPathGBuffs(ctx *Context) { - for _, pt := range ly.RecvPaths { - if pt.Off { - continue - } - pt.Params.InitGBuffs() - } + // ly.InitPathGBuffs(ctx) } // InitWeightsSym initializes the weight symmetry -- higher layers copy weights from lower layers @@ -294,52 +281,6 @@ func (ly *Layer) InitGScale(ctx *Context) { } } -// NewState handles all initialization at start of new input pattern. -// Does NOT call InitGScale() -func (ly *Layer) NewState(ctx *Context) { - nn := ly.NNeurons - np := ly.NPools - - actMinusAvg := float32(0) - actPlusAvg := float32(0) - - for di := uint32(0); di < ctx.NData; di++ { - lpi := ly.Params.PoolIndex(0) - - actMinusAvg += PoolAvgMax(AMAct, AMMinus, Avg, lpi, di) - actPlusAvg += PoolAvgMax(AMAct, AMPlus, Avg, lpi, di) - - ly.Params.NewStateLayer(ctx, di) - - for spi := uint32(0); spi < np; spi++ { - pi := ly.Params.PoolIndex(spi) - ly.Params.NewStatePool(ctx, pi, di) // also calls DecayState on pool - } - - for lni := uint32(0); lni < nn; lni++ { - ni := ly.NeurStIndex + lni - if NeuronIsOff(ni) { - continue - } - // note: this calls the basic neuron-level DecayState - ly.Params.NewStateNeuron(ctx, ni, di) - } - } - - // note: long-running averages must be based on aggregate data, drive adaptation - // of Gi layer inhibition. - davg := 1 / float32(ctx.NData) - actMinusAvg *= davg - actPlusAvg *= davg - for di := uint32(0); di < ctx.NData; di++ { - ly.Params.NewStateLayerActAvg(ctx, di, actMinusAvg, actPlusAvg) - } - - // note: would be somewhat more expensive to only clear the di specific subset - // but all di are decayed every trial anyway so no big deal - ly.InitPathGBuffs(ctx) -} - // NewStateNeurons only calls the neurons part of new state -- for misbehaving GPU func (ly *Layer) NewStateNeurons(ctx *Context) { nn := ly.NNeurons @@ -420,3 +361,4 @@ func (ly *Layer) DecayStateNeuronsAll(ctx *Context, decay, glong, ahp float32) { } } + diff --git a/axon/init-net.go b/axon/init-net.go index c3f61f8a..5b50cd05 100644 --- a/axon/init-net.go +++ b/axon/init-net.go @@ -7,7 +7,6 @@ package axon import ( - "cogentcore.org/core/enums" "cogentcore.org/core/tensor" "github.com/emer/emergent/v2/paths" ) @@ -27,27 +26,6 @@ func GlobalsReset() { } } -// NewState handles all initialization at start of new input pattern. 
-// This is called *before* applying external input data and operates across -// all data parallel values. The current Context.NData should be set -// properly prior to calling this and subsequent Cycle methods. -func (nt *Network) NewState(mode enums.Enum, testing bool) { - // if nt.GPU.On { // todo: this has a bug in neuron-level access in updating SpkPrv - // - // nt.GPU.RunNewState() - // return - // } - ctx := nt.Context() - ctx.NewState(mode, testing) - for _, ly := range nt.Layers { - if ly.Off { - continue - } - ly.NewState(ctx) - } - ToGPULayersNeurons() -} - // InitWeights initializes synaptic weights and all other associated long-term state variables // including running-average state values (e.g., layer running average activations etc) func (nt *Network) InitWeights() { //types:add @@ -144,7 +122,7 @@ func (nt *Network) DecayState(decay, glong, ahp float32) { ly.DecayState(ctx, di, decay, glong, ahp) } } - // nt.GPU.SyncStateToGPU() + ToGPULayersNeurons() } // DecayStateByType decays activation state for given layer types @@ -180,6 +158,7 @@ func (nt *Network) DecayStateLayers(decay, glong, ahp float32, layers ...string) ly.DecayState(ctx, di, decay, glong, ahp) } } + ToGPULayersNeurons() // nt.GPU.SyncStateToGPU() } @@ -193,6 +172,8 @@ func (nt *Network) InitActs() { //types:add } ly.InitActs(ctx) } + ToGPULayersNeurons() + ToGPU(PathGBufVar, PathGSynsVar) // nt.GPU.SyncStateToGPU() // todo: // nt.GPU.SyncGBufToGPU() // zeros everyone } diff --git a/axon/init-net.goal b/axon/init-net.goal index b4e11a65..0d6217d0 100644 --- a/axon/init-net.goal +++ b/axon/init-net.goal @@ -5,7 +5,6 @@ package axon import ( - "cogentcore.org/core/enums" "cogentcore.org/core/tensor" "github.com/emer/emergent/v2/paths" ) @@ -25,26 +24,6 @@ func GlobalsReset() { } } -// NewState handles all initialization at start of new input pattern. -// This is called *before* applying external input data and operates across -// all data parallel values. The current Context.NData should be set -// properly prior to calling this and subsequent Cycle methods. 
-func (nt *Network) NewState(mode enums.Enum, testing bool) { - // if nt.GPU.On { // todo: this has a bug in neuron-level access in updating SpkPrv - // nt.GPU.RunNewState() - // return - // } - ctx := nt.Context() - ctx.NewState(mode, testing) - for _, ly := range nt.Layers { - if ly.Off { - continue - } - ly.NewState(ctx) - } - ToGPULayersNeurons() -} - // InitWeights initializes synaptic weights and all other associated long-term state variables // including running-average state values (e.g., layer running average activations etc) func (nt *Network) InitWeights() { //types:add @@ -141,7 +120,7 @@ func (nt *Network) DecayState(decay, glong, ahp float32) { ly.DecayState(ctx, di, decay, glong, ahp) } } - // nt.GPU.SyncStateToGPU() + ToGPULayersNeurons() } // DecayStateByType decays activation state for given layer types @@ -177,6 +156,7 @@ func (nt *Network) DecayStateLayers(decay, glong, ahp float32, layers ...string) ly.DecayState(ctx, di, decay, glong, ahp) } } + ToGPULayersNeurons() // nt.GPU.SyncStateToGPU() } @@ -190,6 +170,8 @@ func (nt *Network) InitActs() { //types:add } ly.InitActs(ctx) } + ToGPULayersNeurons() + ToGPU(PathGBufVar, PathGSynsVar) // nt.GPU.SyncStateToGPU() // todo: // nt.GPU.SyncGBufToGPU() // zeros everyone } diff --git a/axon/init-path.go b/axon/init-path.go index c97dd48b..7f7149a1 100644 --- a/axon/init-path.go +++ b/axon/init-path.go @@ -114,7 +114,7 @@ func (pt *Path) InitWeightsSyn(ctx *Context, syni uint32, rnd randx.Rand, mean, // enforcing current constraints. func (pt *Path) InitWeights(ctx *Context, nt *Network) { pt.Params.Learn.LRate.Init() - pt.Params.InitGBuffs() + pt.Params.InitGBuffs(ctx) rlay := pt.Recv spct := pt.Params.SWts.Init.SPct if rlay.Params.IsTarget() { diff --git a/axon/init-path.goal b/axon/init-path.goal index 44472e66..976a11a2 100644 --- a/axon/init-path.goal +++ b/axon/init-path.goal @@ -112,7 +112,7 @@ func (pt *Path) InitWeightsSyn(ctx *Context, syni uint32, rnd randx.Rand, mean, // enforcing current constraints. func (pt *Path) InitWeights(ctx *Context, nt *Network) { pt.Params.Learn.LRate.Init() - pt.Params.InitGBuffs() + pt.Params.InitGBuffs(ctx) rlay := pt.Recv spct := pt.Params.SWts.Init.SPct if rlay.Params.IsTarget() {