Diffstat (limited to 'src/lib/src/train.c')
-rw-r--r--  src/lib/src/train.c  182
1 file changed, 104 insertions(+), 78 deletions(-)
diff --git a/src/lib/src/train.c b/src/lib/src/train.c
index dc93f0f..98f58ad 100644
--- a/src/lib/src/train.c
+++ b/src/lib/src/train.c
@@ -38,7 +38,7 @@ typedef struct nnSigmoidGradientElements {
 /// each layer. A data type is defined for these because we allocate all the
 /// required memory up front before entering the training loop.
 typedef struct nnGradientElements {
-  nnActivation type;
+  nnLayerType type;
   // Gradient vector, same size as the layer.
   // This will contain the gradient expression except for the output value of
   // the previous layer.
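Note: the hunks below access the network through net->layers[l].type, net->layers[l].linear, and gradient_elems[l].sigmoid.ones instead of the old parallel weights/biases/activations arrays. The actual type definitions live in the library's headers and are not part of this diff; what follows is only a minimal sketch, inferred from those uses, of what the layer types might look like. Every field or value not visible in the diff is an assumption.

// Sketch only; inferred from usage in this diff, not copied from the headers.
typedef enum nnLayerType { nnLinear, nnRelu, nnSigmoid } nnLayerType;

// Hypothetical shape of a linear (fully-connected) layer.
typedef struct nnLinearImpl {
  nnMatrix weights;  // layer_input_size x layer_output_size
  nnMatrix biases;   // 1 x layer_output_size
} nnLinearImpl;

// Hypothetical tagged union over layer kinds.
typedef struct nnLayerImpl {
  nnLayerType type;
  union {
    nnLinearImpl linear;  // Valid only when type == nnLinear.
  };
} nnLayerImpl;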
@@ -57,10 +57,27 @@ void nnInitNet(
   mt19937_64_init(&rng, seed);
 
   for (int l = 0; l < net->num_layers; ++l) {
-    nnMatrix* weights = &net->weights[l];
-    nnMatrix* biases = &net->biases[l];
+    // Get the layer's weights and biases, if any.
+    nnMatrix* weights = 0;
+    nnMatrix* biases = 0;
+    switch (net->layers[l].type) {
+      case nnLinear: {
+        nnLinearImpl* linear = &net->layers[l].linear;
+
+        weights = &linear->weights;
+        biases = &linear->biases;
+        break;
+      }
+      // Activations.
+      case nnRelu:
+      case nnSigmoid:
+        break;
+    }
+    if (!weights || !biases) {
+      continue;
+    }
 
-    const R layer_size = (R)nnLayerInputSize(weights);
+    const R layer_size = (R)nnLayerInputSize(net, l);
     const R scale = 1. / layer_size;
     const R stdev = 1. / sqrt((R)layer_size);
     const R sigma = stdev * stdev;
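Note: the values computed at the end of this hunk (scale = 1/layer_size, stdev = 1/sqrt(layer_size), sigma = stdev * stdev) suggest that linear layers are initialized with weight variance inversely proportional to their fan-in:

  \sigma_w = \frac{1}{\sqrt{n_{\text{in}}}}, \qquad \operatorname{Var}[w_{ij}] = \sigma_w^2 = \frac{1}{n_{\text{in}}}

This is a common choice that keeps the variance of a layer's pre-activations roughly independent of its width, assuming roughly unit-variance inputs; the lines that actually consume scale and sigma fall outside this hunk, so this reading is an inference rather than something the diff states.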
@@ -128,9 +145,6 @@ void nnTrain(
   // with one sample at a time.
   nnMatrix* errors = calloc(net->num_layers, sizeof(nnMatrix));
 
-  // Allocate the weight transpose matrices up front for backpropagation.
-  // nnMatrix* weights_T = calloc(net->num_layers, sizeof(nnMatrix));
-
   // Allocate the weight delta matrices.
   nnMatrix* weight_deltas = calloc(net->num_layers, sizeof(nnMatrix));
 
@@ -144,30 +158,24 @@ void nnTrain(
   nnMatrix* outputs_T = calloc(net->num_layers, sizeof(nnMatrix));
 
   assert(errors != 0);
-  // assert(weights_T != 0);
   assert(weight_deltas != 0);
   assert(gradient_elems);
   assert(outputs_T);
 
   for (int l = 0; l < net->num_layers; ++l) {
-    const nnMatrix* layer_weights = &net->weights[l];
-    const int layer_output_size = net->weights[l].cols;
-    const nnActivation activation = net->activations[l];
-
-    errors[l] = nnMatrixMake(1, layer_weights->cols);
-
-    // weights_T[l] = nnMatrixMake(layer_weights->cols, layer_weights->rows);
-    // nnMatrixTranspose(layer_weights, &weights_T[l]);
-
-    weight_deltas[l] = nnMatrixMake(layer_weights->rows, layer_weights->cols);
-
-    outputs_T[l] = nnMatrixMake(layer_output_size, 1);
+    const int layer_input_size = nnLayerInputSize(net, l);
+    const int layer_output_size = nnLayerOutputSize(net, l);
+    const nnLayerImpl* layer = &net->layers[l];
+
+    errors[l] = nnMatrixMake(1, layer_output_size);
+    weight_deltas[l] = nnMatrixMake(layer_input_size, layer_output_size);
+    outputs_T[l] = nnMatrixMake(layer_output_size, 1);
 
     // Allocate the gradient elements and vectors for weight delta calculation.
     nnGradientElements* elems = &gradient_elems[l];
-    elems->type = activation;
-    switch (activation) {
-      case nnIdentity:
+    elems->type = layer->type;
+    switch (layer->type) {
+      case nnLinear:
        break; // Gradient vector will be borrowed, no need to allocate.
 
      case nnSigmoid:
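Note: nnLayerInputSize(net, l) and nnLayerOutputSize(net, l) replace the old direct reads of net->weights[l].rows and .cols. Their bodies are not part of this diff; below is a minimal sketch of how such helpers could behave, assuming the hypothetical nnLayerImpl layout sketched earlier, that a linear layer's weight matrix is input_size x output_size (as the old code and the weight_deltas allocation imply), and that activation layers preserve the size of their input. The nnNetwork type name and the net->input_size field are placeholders, not names taken from the library.

// Sketch only; the real helpers are defined elsewhere in the library.
int nnLayerOutputSize(const nnNetwork* net, int l);

int nnLayerInputSize(const nnNetwork* net, int l) {
  const nnLayerImpl* layer = &net->layers[l];
  switch (layer->type) {
    case nnLinear:
      return layer->linear.weights.rows;
    case nnRelu:
    case nnSigmoid:
      // Activations are element-wise: input size == previous layer's output.
      return (l > 0) ? nnLayerOutputSize(net, l - 1) : net->input_size;
  }
  return 0;
}

int nnLayerOutputSize(const nnNetwork* net, int l) {
  const nnLayerImpl* layer = &net->layers[l];
  return (layer->type == nnLinear) ? layer->linear.weights.cols
                                   : nnLayerInputSize(net, l);
}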
@@ -208,6 +216,7 @@ void nnTrain(
 
   // For now, we train with one sample at a time.
   for (int sample = 0; sample < inputs->rows; ++sample) {
+    // TODO: Introduce a BorrowMut.
     // Slice the input and target matrices with the batch size.
     // We are not mutating the inputs, but we need the cast to borrow.
     nnMatrix training_inputs =
@@ -219,15 +228,16 @@ void nnTrain(
     // Assuming one training input per iteration for now.
     nnMatrixTranspose(&training_inputs, &training_inputs_T);
 
-    // Run a forward pass and compute the output layer error relevant to the
-    // derivative: o-t.
-    //   Error: (t-o)^2
-    //   dE/do = -2(t-o)
-    //         = +2(o-t)
+    // Forward pass.
+    nnQuery(net, query, &training_inputs);
+
+    // Compute the error derivative: o-t.
+    //   Error: 1/2 (t-o)^2
+    //   dE/do = -(t-o)
+    //         = +(o-t)
     // Note that we compute o-t instead to remove that outer negative sign.
     // The 2 is dropped because we are only interested in the direction of the
     // gradient. The learning rate controls the magnitude.
-    nnQuery(net, query, &training_inputs);
     nnMatrixSub(
         training_outputs, &training_targets, &errors[net->num_layers - 1]);
 
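Note: with the corrected loss comment, the output-layer error follows directly from the squared-error derivative for a single output value:

  E = \tfrac{1}{2}(t - o)^2, \qquad \frac{\partial E}{\partial o} = -(t - o) = o - t

The 1/2 cancels the exponent, so errors[net->num_layers - 1] can be filled with o - t directly, which is exactly what the nnMatrixSub(training_outputs, &training_targets, ...) call computes.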
@@ -236,68 +246,86 @@ void nnTrain(
       nnMatrixTranspose(&query->layer_outputs[l], &outputs_T[l]);
     }
 
-    // Update weights and biases for each internal layer, backpropagating
+    // Update weights and biases for each internal layer, back-propagating
     // errors along the way.
     for (int l = net->num_layers - 1; l >= 0; --l) {
       const nnMatrix* layer_output = &query->layer_outputs[l];
-      nnMatrix* layer_weights = &net->weights[l];
-      nnMatrix* layer_biases = &net->biases[l];
-      nnGradientElements* elems = &gradient_elems[l];
-      nnMatrix* gradient = &elems->gradient;
-      const nnActivation activation = net->activations[l];
-
-      // Compute the gradient (the part of the expression that does not
-      // contain the output of the previous layer).
+      nnGradientElements* elems = &gradient_elems[l];
+      nnMatrix* gradient = &elems->gradient;
+      nnLayerImpl* layer = &net->layers[l];
+
+      // Compute this layer's gradient.
+      //
+      // By "gradient" we mean the expression common to the weights and bias
+      // gradients. This is the part of the expression that does not contain
+      // this layer's input.
       //
-      //   Identity: G = error_k
-      //   Sigmoid: G = error_k * output_k * (1 - output_k).
-      //   Relu: G = error_k * (output_k > 0 ? 1 : 0)
-      switch (activation) {
-        case nnIdentity:
+      //   Linear: G = id
+      //   Relu: G = (output_k > 0 ? 1 : 0)
+      //   Sigmoid: G = output_k * (1 - output_k)
+      switch (layer->type) {
+        case nnLinear: {
          // TODO: Just copy the pointer?
          *gradient = nnMatrixBorrow(&errors[l]);
          break;
+        }
+        case nnRelu:
+          nnMatrixGt(layer_output, 0, gradient);
+          break;
        case nnSigmoid:
          nnMatrixSub(&elems->sigmoid.ones, layer_output, gradient);
          nnMatrixMulPairs(layer_output, gradient, gradient);
-          nnMatrixMulPairs(&errors[l], gradient, gradient);
-          break;
-        case nnRelu:
-          nnMatrixGt(layer_output, 0, gradient);
-          nnMatrixMulPairs(&errors[l], gradient, gradient);
          break;
       }
 
-      // Outer product to compute the weight deltas.
-      const nnMatrix* output_T =
-          (l == 0) ? &training_inputs_T : &outputs_T[l - 1];
-      nnMatrixMul(output_T, gradient, &weight_deltas[l]);
-
-      // Backpropagate the error before updating weights.
+      // Back-propagate the error.
+      //
+      // This combines this layer's gradient with the back-propagated error,
+      // which is the combination of the gradients of subsequent layers down
+      // to the output layer error.
+      //
+      // Note that this step uses the layer's original weights.
      if (l > 0) {
-        // G * W^T == G *^T W.
-        // nnMatrixMul(gradient, &weights_T[l], &errors[l-1]);
-        nnMatrixMulRows(gradient, layer_weights, &errors[l - 1]);
+        switch (layer->type) {
+          case nnLinear: {
+            const nnMatrix* layer_weights = &layer->linear.weights;
+            // E * W^T == E *^T W.
+            // Using nnMatrixMulRows, we avoid having to transpose the weight
+            // matrix.
+            nnMatrixMulRows(&errors[l], layer_weights, &errors[l - 1]);
+            break;
+          }
+          // For activations, the error back-propagates as is but multiplied by
+          // the layer's gradient.
+          case nnRelu:
+          case nnSigmoid:
+            nnMatrixMulPairs(&errors[l], gradient, &errors[l - 1]);
+            break;
+        }
      }
 
-      // Update weights.
-      nnMatrixScale(&weight_deltas[l], params->learning_rate);
-      // The gradient has a negative sign from -(t - o), but we have computed
-      // e = o - t instead, so we can subtract directly.
-      // nnMatrixAdd(layer_weights, &weight_deltas[l], layer_weights);
-      nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);
-
-      // Update weight transpose matrix for the next training iteration.
-      // nnMatrixTranspose(layer_weights, &weights_T[l]);
-
-      // Update biases.
-      // This is the same formula as for weights, except that the o_j term is
-      // just 1. We can simply re-use the gradient that we have already
-      // computed for the weight update.
-      // nnMatrixMulAdd(layer_biases, gradient, params->learning_rate,
-      // layer_biases);
-      nnMatrixMulSub(
-          layer_biases, gradient, params->learning_rate, layer_biases);
+      // Update layer weights.
+      if (layer->type == nnLinear) {
+        nnLinearImpl* linear = &layer->linear;
+        nnMatrix* layer_weights = &linear->weights;
+        nnMatrix* layer_biases = &linear->biases;
+
+        // Outer product to compute the weight deltas.
+        // This layer's input is the previous layer's output.
+        const nnMatrix* input_T =
+            (l == 0) ? &training_inputs_T : &outputs_T[l - 1];
+        nnMatrixMul(input_T, gradient, &weight_deltas[l]);
+
+        // Update weights.
+        nnMatrixScale(&weight_deltas[l], params->learning_rate);
+        nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);
+
+        // Update biases.
+        // This is the same formula as for weights, except that the o_j term
+        // is just 1.
+        nnMatrixMulSub(
+            layer_biases, gradient, params->learning_rate, layer_biases);
+      }
     }
 
     // TODO: Add this under a verbose debugging mode.
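Note: the restructured loop is the usual chain-rule split between linear layers and element-wise activations. Writing \delta_l for errors[l], x for the layer's input (the previous layer's output, or the training input when l == 0), and \eta for the learning rate, the operations in this hunk correspond to:

  Linear layer, y = xW + b:  \Delta W = \eta\, x^{\top} \delta_l, \qquad \Delta b = \eta\, \delta_l, \qquad \delta_{l-1} = \delta_l W^{\top}
  Activation, a = g(x):      \delta_{l-1} = \delta_l \odot g'(x), with g' as listed in the gradient comment above

Because \delta_l already carries the o - t sign, the weight and bias deltas are subtracted (nnMatrixSub, nnMatrixMulSub) rather than added. The \delta_l W^{\top} product is evaluated with nnMatrixMulRows so the weight matrix never has to be materialized in transposed form, which is why the commented-out weights_T buffers could be removed earlier in this change.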
@@ -334,12 +362,11 @@ void nnTrain(
   for (int l = 0; l < net->num_layers; ++l) {
     nnMatrixDel(&errors[l]);
     nnMatrixDel(&outputs_T[l]);
-    // nnMatrixDel(&weights_T[l]);
     nnMatrixDel(&weight_deltas[l]);
 
     nnGradientElements* elems = &gradient_elems[l];
     switch (elems->type) {
-      case nnIdentity:
+      case nnLinear:
        break; // Gradient vector is borrowed, no need to deallocate.
 
      case nnSigmoid:
@@ -355,7 +382,6 @@ void nnTrain(
   nnMatrixDel(&training_inputs_T);
   free(errors);
   free(outputs_T);
-  // free(weights_T);
   free(weight_deltas);
   free(gradient_elems);
 }