(ns mzero.ai.ann.neanderthal-impl
"Implementation of ANN propagation using pure clojure & neanderthal.
Backprop follows
https://fleuret.org/dlc/materials/dlc-handout-3-6-backprop.pdf
When making a forward pass, the size of the batch is stored in
`current-batch-size`"
(:require [clojure.spec.alpha :as s]
[mzero.ai.ann.ann :as mzann]
[mzero.ai.ann.common :as mzc]
[mzero.ai.ann.label-distributions :as mzld]
[mzero.ai.ann.network :as mzn]
[uncomplicate.commons.core :refer [release Releaseable with-release]]
[uncomplicate.neanderthal.core :as nc]
[uncomplicate.neanderthal.vect-math :as nvm]
[uncomplicate.neanderthal.native :as nn]))
(s/def ::weights
(-> nc/matrix?
(s/and (mzc/per-element-spec float?)
#(s/valid? ::mzn/dimension (nc/mrows %))
#(s/valid? ::mzn/dimension (nc/ncols %))
(fn [m]
(comment "At least a non-0 val per neuron (row)")
(every? #(pos? (nc/asum %)) (nc/rows m))))))
(s/def ::inputs (-> nc/matrix?
(s/and (mzc/per-element-spec ::mzn/neural-value)
#(s/valid? ::mzn/dimension (nc/dim %)))))
(s/def ::raw-outputs (-> nc/matrix?
(s/and (mzc/per-element-spec float?)
#(s/valid? ::mzn/dimension (nc/dim %)))))
(s/def ::outputs ::inputs)
(s/def ::raw-output-gradients ::raw-outputs)
(s/def ::layer
  (-> (s/keys :req [::weights ::inputs ::outputs ::raw-outputs ::raw-output-gradients])
      (s/and
       (fn [{:keys [::weights ::inputs ::outputs ::raw-outputs ::raw-output-gradients]}]
         (comment "Dimensions fit -- weights are stored output-dim x input-dim,
         as required by `raw-outputs = weights * inputs` in `wtimesx!`")
         (and (= (nc/mrows inputs) (nc/ncols weights))
              (= (nc/mrows weights)
                 (nc/mrows raw-outputs)
                 (nc/mrows outputs)
                 (nc/mrows raw-output-gradients)))))))
(s/def ::layers
(-> (s/every ::layer :max-count mzn/max-layer-depth)
(s/and
(fn [layers]
(comment "Each layer's output is the next layer's input")
(reduce #(if (identical? (-> %1 ::outputs) (-> %2 ::inputs)) %2 false)
layers)))))
(defn- tensor-fit
  "Return a submatrix of `tensor` that fits `batch-size` in terms of number
  of columns. For activations and inputs with variable batch size."
  [tensor batch-size]
  (nc/submatrix tensor (nc/mrows tensor) batch-size))
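;; A hedged REPL sketch of `tensor-fit` (illustrative sizes): the
;; returned submatrix is a view sharing memory with the original
;; matrix, so narrowing to the batch size copies no data.
(comment
  (with-release [m (nc/ge nn/native-float 4 8)]
    (nc/ncols (tensor-fit m 3)))) ;; => 3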
(defn- wtimesx!
  [weights inputs raw-outputs plusb?]
  (let [;; sets the whole last row of the input matrix to 1.0 so that
        ;; the last column of `weights` acts as the bias; nc/entry! is
        ;; applied to the full row view because individual coordinates
        ;; cannot be set on GPU tensors
        add1-at-end
        (fn [input]
          (nc/entry! (nc/row input (dec (nc/mrows input))) 1)
          input)]
    (nc/mm! 1.0 weights
            (cond-> inputs plusb? add1-at-end)
            0.0 raw-outputs)))
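;; A hedged sketch of the bias trick (illustrative values): with the
;; last input row forced to 1, the last column of `weights` plays the
;; role of the bias b, so a single mm! computes w*x+b.
(comment
  (with-release [w (nc/ge nn/native-float 2 3 [[1 0] [0 1] [5 7]]) ;; last col = b
                 x (nc/ge nn/native-float 3 1 [[2 4 0]]) ;; last row becomes 1
                 out (nc/ge nn/native-float 2 1)]
    (wtimesx! w x out true)
    [(nc/entry out 0 0) (nc/entry out 1 0)])) ;; => [7.0 11.0], i.e. x + b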
(defn- batch-size-layers
"Return layers whose activation fields (inputs, raw-outs, outs, rogs)
are dimensioned to the given batch size. The dimensioning is done
with neanderthal views and does not entail any actual data resizing"
[layers batch-size]
(let [batch-size-layer
(fn [layer]
(reduce #(update %1 %2 tensor-fit batch-size)
layer
[::inputs ::raw-outputs ::raw-output-gradients ::outputs]))]
(map batch-size-layer layers)))
(defn sequential-forward-pass!
  "Forward pass: run the network of `layers` on an initial
  `input-tensor`. Almost everything in `layers` is mutated.
  For each layer, compute weights * inputs, then apply the activation
  function. If called with the `plusb?` flag on, reset the last
  element of a layer's input to 1 before processing it. This computes
  `w*x+b` rather than `w*x`, provided the rest of the code is aware
  that inputs' last elements are dedicated to this use.
  This pass is called sequential because layers are computed in turn
  from first to last, with each layer using the new value of its
  previous layer's output, rather than being computed all at once
  using their previous layers' old outputs."
[{:as ann-impl :keys [layers] {:keys [::mzann/af!]} :act-fns} input-tensor plusb?]
(let [current-batch-size (count input-tensor)
batch-sized-layers (batch-size-layers layers current-batch-size)
forward-pass-on-layer!
(fn [{:as layer :keys [::outputs ::raw-outputs ::weights ::inputs]}]
(wtimesx! weights inputs raw-outputs plusb?)
(af! ann-impl raw-outputs outputs))]
(nc/transfer! input-tensor (-> batch-sized-layers first ::inputs))
(run! forward-pass-on-layer! batch-sized-layers)
(assoc ann-impl :current-batch-size current-batch-size)))
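;; Hedged usage sketch: assuming `ann-impl` is an initialized
;; NeanderthalImpl (its :act-fns supplying ::mzann/af!) whose first
;; layer takes 2-dimensional inputs, a batch of two input columns is
;; propagated and the batch size is remembered on the result.
(comment
  (-> (sequential-forward-pass! ann-impl [[0.5 0.2] [0.1 0.9]] true)
      :current-batch-size)) ;; => 2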
(defn- compute-intermediate-layer-gradients
  "Compute gradients of `next-layer` given `prev-layers`, whose gradients
  have already been computed (only the first one is used). Since
  backprop walks the layers in reverse, `prev-layers` are the layers
  closer to the output. Return the layers seq with `next-layer`
  cons'ed on."
[ann-impl prev-layers {:as next-layer :keys [::raw-outputs]}]
(let [af-deriv! (-> ann-impl :act-fns ::mzann/af-deriv!)
{:keys [::raw-output-gradients ::weights]} (first prev-layers)
compute-raw-output-gradients
#(-> (nc/mm! 1.0 (nc/trans weights)
raw-output-gradients
0.0 %)
(nvm/mul! (af-deriv! ann-impl raw-outputs)))]
(-> next-layer
(update ::raw-output-gradients compute-raw-output-gradients)
(cons prev-layers))))
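;; The update above is the standard backprop recurrence (cf. the
;; Fleuret handout referenced in the ns docstring):
;; rog_l = (W_{l+1}^T * rog_{l+1}) .* af'(raw-outputs_l)
;; where .* is the elementwise product computed by nvm/mul!.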
(defn- compute-last-layer-gradients
  "Compute gradients for the last layer (derivative of the
  loss). See [ce-loss-deriv](doc/ce-loss-deriv.pdf)
  and [ce-loss-multiclass](doc/ce-loss-multiclass.pdf)."
[{:as last-layer :keys [::raw-outputs]}
target-distribution-tensor
loss-gradient-fn]
(update last-layer ::raw-output-gradients
(fn [rog]
(-> raw-outputs
mzc/tens->vec
(loss-gradient-fn target-distribution-tensor)
(nc/transfer! rog)))))
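;; Note: for a softmax output with cross-entropy loss, the well-known
;; closed form (cf. the pdfs referenced above) that `loss-gradient-fn`
;; is expected to compute is
;; d(loss)/d(raw-output_i) = p_i - t_i
;; i.e. the predicted distribution minus the target distribution.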
(defn- compute-gradients!
[{:as ann-impl :keys [loss-gradient-fn current-batch-size]}
target-distribution-tensor]
(let [;; NB: shadows the defn- of the same name; the call below still
      ;; resolves to the var, since a let binding is not in scope
      ;; within its own init expression
      compute-last-layer-gradients
      (fn [layers]
(-> (compute-last-layer-gradients (first layers)
target-distribution-tensor
loss-gradient-fn)
(cons (rest layers))))
reduce-layer
(fn [prev-layers next-layer]
(compute-intermediate-layer-gradients ann-impl prev-layers next-layer))
compute-other-layers-gradients
(fn [layers]
(reduce reduce-layer (vector (first layers)) (rest layers)))]
(-> ann-impl :layers reverse ;; backprop starts from the last layer & goes to the first
(batch-size-layers current-batch-size)
compute-last-layer-gradients
compute-other-layers-gradients)))
(defn- update-weights!
  [layers discount-factor-matrix]
  (let [update-layer-weights
        (fn [{:as layer :keys [::raw-output-gradients ::inputs]}]
          ;; discounting is not implemented: anything other than an
          ;; all-ones discount factor (i.e. no discount) is rejected
          (when (not-every? #(= % 1.0) discount-factor-matrix)
            (throw (ex-info
                    "Discount factor not implemented, 1.0-valued vector required"
                    {:discount discount-factor-matrix})))
          (update layer ::weights #(nc/mm! (- mzann/step-size)
                                           raw-output-gradients
                                           (nc/trans inputs)
                                           %)))]
    (mapv update-layer-weights layers)))
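;; The update above is plain gradient descent on the weights,
;; W <- W - step-size * rog * inputs^T,
;; the matrix product summing each batch column's outer-product
;; contribution into the weight matrix.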
(defn- convert-layer-without-inputs
[{:as layer :keys [::mzn/weights]} factory]
;; NB: nc/ge understands a nested vectors source as a list of cols
(let [output-dim (count (first weights))]
{::weights (nc/ge factory weights)
::raw-outputs (nc/ge factory output-dim mzann/max-batch-size)
::raw-output-gradients (nc/ge factory output-dim mzann/max-batch-size)
::outputs (nc/ge factory output-dim mzann/max-batch-size)}))
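;; A hedged illustration of the column convention noted above: a
;; source of 2 nested vectors of 3 elements each yields a 3x2 matrix.
(comment
  (with-release [m (nc/ge nn/native-float [[1 2 3] [4 5 6]])]
    [(nc/mrows m) (nc/ncols m)])) ;; => [3 2]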
(defn- convert-to-ndt-layers
[layers factory]
(let [input-dim (-> layers first ::mzn/inputs count)
converted-layers
(map #(convert-layer-without-inputs % factory) layers)
initial-layer
(assoc (first converted-layers)
::inputs
(nc/ge factory input-dim mzann/max-batch-size))
plug-layers
(fn [already-plugged layer]
(conj already-plugged
(assoc layer ::inputs (-> already-plugged last ::outputs))))]
(reduce plug-layers [initial-layer] (rest converted-layers))))
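;; Hedged check of the plugging invariant (assuming `nested-layers` is
;; a valid ::mzn/layers value): consecutive layers share the very same
;; tensor as output/input, as the ::layers spec requires.
(comment
  (let [ls (convert-to-ndt-layers nested-layers nn/native-float)]
    (identical? (::outputs (first ls)) (::inputs (second ls))))) ;; => true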
(defn- computation-setup!
"Setup the ann for incoming computations. Some computation modes
require a specific setup, such as CUDA"
[this]
(when-let [computation-setup-fn
(-> this :computation-mode :computation-setup-fn)]
(computation-setup-fn this)))
(def default-opts {:computation-mode {:type :cpu :factory nn/native-float}})
(defn- diagonal-matrix [ann-impl vector]
(nc/gd (-> ann-impl :computation-mode :factory) (count vector) vector))
(defrecord NeanderthalImpl []
mzann/ANN
(-tens->vec [this ndt-tensor]
(computation-setup! this)
(mzc/tens->vec ndt-tensor))
(-initialize [this layers opts]
(let [this-with-opts (merge this default-opts opts)
factory (-> this-with-opts :computation-mode :factory)
max-sized-matrix #(nc/ge factory mzn/max-dimension mzn/max-dimension)]
(assoc this-with-opts
:layers (convert-to-ndt-layers layers factory)
:ones (nc/entry! (max-sized-matrix) 1)
:zeros (max-sized-matrix)
:buffer (max-sized-matrix)
:current-batch-size nil)))
(-forward-pass! [this inputs]
(computation-setup! this)
(sequential-forward-pass! this inputs true))
(-backward-pass! [this target-distribution-tensor discount-factor]
(assert (= (-> this :current-batch-size) (count target-distribution-tensor)))
(computation-setup! this)
(-> (compute-gradients! this target-distribution-tensor)
(update-weights! discount-factor))
this)
(-backward-pass! [this input-tensor target-distribution-tensor discount-factor]
(-> (mzann/-forward-pass! this input-tensor)
(mzann/-backward-pass! target-distribution-tensor discount-factor)))
(-layer-data [{:as this :keys [layers current-batch-size]} lindex lkey]
(computation-setup! this)
(let [weights (-> layers (nth lindex) ::weights)
mrows (nc/mrows weights) ncols (nc/ncols weights)
last? (= lindex (dec (count layers)))
this-ns "mzero.ai.ann.neanderthal-impl"]
(case lkey ;; weight and bias live in the single matrix ::weights:
  ;; bias is the last column (it multiplies the 1-valued last input
  ;; row); for non-last layers, the last weight row only feeds the
  ;; next layer's bias slot, so it is dropped
  "weight" (nc/submatrix weights (if last? mrows (dec mrows)) (dec ncols))
  "bias" (nc/subvector (nc/col weights (dec ncols)) 0 (if last? mrows (dec mrows)))
(tensor-fit (get-in layers [lindex (keyword this-ns lkey)])
current-batch-size))))
(nb-layers [this] (-> this :layers count))
(clear-neuron! [this lindex nindex]
(computation-setup! this)
(let [weights
(-> this :layers
(nth (mod lindex (-> this :layers count)))
::weights)]
(->> (nc/row weights (mod nindex (nc/mrows weights)))
(nc/scal! 0.0))))
Releaseable
(release [this]
(let [release-layer
(fn [layer] (doseq [item (vals layer)] (release item)))]
(doseq [layer (-> this :layers)] (release-layer layer))
(release (-> this :ones))
    (release (-> this :zeros))
    ;; :buffer is allocated in -initialize; release it as well
    (release (-> this :buffer))))