Diffstat (limited to 'src/lib/src/train.c')
-rw-r--r--  src/lib/src/train.c  182
1 file changed, 104 insertions, 78 deletions
diff --git a/src/lib/src/train.c b/src/lib/src/train.c
index dc93f0f..98f58ad 100644
--- a/src/lib/src/train.c
+++ b/src/lib/src/train.c
@@ -38,7 +38,7 @@ typedef struct nnSigmoidGradientElements {
 /// each layer. A data type is defined for these because we allocate all the
 /// required memory up front before entering the training loop.
 typedef struct nnGradientElements {
-  nnActivation type;
+  nnLayerType type;
   // Gradient vector, same size as the layer.
   // This will contain the gradient expression except for the output value of
   // the previous layer.
@@ -57,10 +57,27 @@ void nnInitNet(
   mt19937_64_init(&rng, seed);
 
   for (int l = 0; l < net->num_layers; ++l) {
-    nnMatrix* weights = &net->weights[l];
-    nnMatrix* biases = &net->biases[l];
+    // Get the layer's weights and biases, if any.
+    nnMatrix* weights = 0;
+    nnMatrix* biases = 0;
+    switch (net->layers[l].type) {
+      case nnLinear: {
+        nnLinearImpl* linear = &net->layers[l].linear;
+
+        weights = &linear->weights;
+        biases = &linear->biases;
+        break;
+      }
+      // Activations.
+      case nnRelu:
+      case nnSigmoid:
+        break;
+    }
+    if (!weights || !biases) {
+      continue;
+    }
 
-    const R layer_size = (R)nnLayerInputSize(weights);
+    const R layer_size = (R)nnLayerInputSize(net, l);
     const R scale = 1. / layer_size;
     const R stdev = 1. / sqrt((R)layer_size);
     const R sigma = stdev * stdev;
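The scale, stdev, and sigma values above are derived from the layer's fan-in, but the sampling code that consumes them falls outside the diff context. A minimal standalone sketch of the same 1/sqrt(fan-in) idea, with plain float arrays and rand() standing in for nnMatrix and the mt19937_64 RNG (both assumptions, not the library's actual initializer):

#include <math.h>
#include <stdlib.h>

/* Fill a linear layer with small weights scaled by 1/sqrt(fan-in) and zero
 * biases. Uniform samples in [-stdev, +stdev] are used here for simplicity;
 * the real code may draw from a different distribution. */
static void init_linear_sketch(
    float* weights, float* biases, int inputs, int outputs) {
  const float stdev = 1.0f / sqrtf((float)inputs);
  for (int i = 0; i < inputs * outputs; ++i) {
    const float u = (float)rand() / (float)RAND_MAX; /* u in [0, 1] */
    weights[i] = (2.0f * u - 1.0f) * stdev;          /* in [-stdev, +stdev] */
  }
  for (int j = 0; j < outputs; ++j) {
    biases[j] = 0.0f;
  }
}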
@@ -128,9 +145,6 @@ void nnTrain(
   // with one sample at a time.
   nnMatrix* errors = calloc(net->num_layers, sizeof(nnMatrix));
 
-  // Allocate the weight transpose matrices up front for backpropagation.
-  // nnMatrix* weights_T = calloc(net->num_layers, sizeof(nnMatrix));
-
   // Allocate the weight delta matrices.
   nnMatrix* weight_deltas = calloc(net->num_layers, sizeof(nnMatrix));
 
@@ -144,30 +158,24 @@ void nnTrain(
   nnMatrix* outputs_T = calloc(net->num_layers, sizeof(nnMatrix));
 
   assert(errors != 0);
-  // assert(weights_T != 0);
   assert(weight_deltas != 0);
   assert(gradient_elems);
   assert(outputs_T);
 
   for (int l = 0; l < net->num_layers; ++l) {
-    const nnMatrix* layer_weights = &net->weights[l];
-    const int layer_output_size = net->weights[l].cols;
-    const nnActivation activation = net->activations[l];
-
-    errors[l] = nnMatrixMake(1, layer_weights->cols);
-
-    // weights_T[l] = nnMatrixMake(layer_weights->cols, layer_weights->rows);
-    // nnMatrixTranspose(layer_weights, &weights_T[l]);
-
-    weight_deltas[l] = nnMatrixMake(layer_weights->rows, layer_weights->cols);
+    const int layer_input_size = nnLayerInputSize(net, l);
+    const int layer_output_size = nnLayerOutputSize(net, l);
+    const nnLayerImpl* layer = &net->layers[l];
 
-    outputs_T[l] = nnMatrixMake(layer_output_size, 1);
+    errors[l] = nnMatrixMake(1, layer_output_size);
+    weight_deltas[l] = nnMatrixMake(layer_input_size, layer_output_size);
+    outputs_T[l] = nnMatrixMake(layer_output_size, 1);
 
     // Allocate the gradient elements and vectors for weight delta calculation.
     nnGradientElements* elems = &gradient_elems[l];
-    elems->type = activation;
-    switch (activation) {
-      case nnIdentity:
+    elems->type = layer->type;
+    switch (layer->type) {
+      case nnLinear:
         break;  // Gradient vector will be borrowed, no need to allocate.
 
       case nnSigmoid:
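The shapes allocated above follow a row-vector convention: errors[l] is 1 x output, weight_deltas[l] is input x output (the same shape as the layer's weights), and outputs_T[l] is the transposed output column. The weight-delta shape is exactly what the outer product in the later update step produces; a standalone sketch with plain row-major arrays rather than the nnMatrix API (an assumption for illustration only):

/* Outer product: an (input_size x 1) column times a (1 x output_size) row
 * yields an input_size x output_size matrix, matching the weight shape. */
static void outer_product_sketch(
    const float* input_T,  /* input_size values: the layer's input, transposed */
    int input_size,
    const float* gradient, /* output_size values */
    int output_size,
    float* deltas          /* input_size x output_size, row-major */) {
  for (int i = 0; i < input_size; ++i) {
    for (int j = 0; j < output_size; ++j) {
      deltas[i * output_size + j] = input_T[i] * gradient[j];
    }
  }
}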
@@ -208,6 +216,7 @@ void nnTrain(
 
   // For now, we train with one sample at a time.
   for (int sample = 0; sample < inputs->rows; ++sample) {
+    // TODO: Introduce a BorrowMut.
     // Slice the input and target matrices with the batch size.
     // We are not mutating the inputs, but we need the cast to borrow.
     nnMatrix training_inputs =
@@ -219,15 +228,16 @@ void nnTrain(
     // Assuming one training input per iteration for now.
     nnMatrixTranspose(&training_inputs, &training_inputs_T);
 
-    // Run a forward pass and compute the output layer error relevant to the
-    // derivative: o-t.
-    // Error: (t-o)^2
-    // dE/do = -2(t-o)
-    //       = +2(o-t)
+    // Forward pass.
+    nnQuery(net, query, &training_inputs);
+
+    // Compute the error derivative: o-t.
+    // Error: 1/2 (t-o)^2
+    // dE/do = -(t-o)
+    //       = +(o-t)
     // Note that we compute o-t instead to remove that outer negative sign.
     // The 2 is dropped because we are only interested in the direction of the
     // gradient. The learning rate controls the magnitude.
-    nnQuery(net, query, &training_inputs);
     nnMatrixSub(
         training_outputs, &training_targets, &errors[net->num_layers - 1]);
 
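Written out, the sign convention in the rewritten comment comes from the per-sample quadratic loss; with the 1/2 factor the 2 from the square cancels, so only the sign remains and the learning rate controls the step size:

E = \tfrac{1}{2}(t - o)^2, \qquad
\frac{\partial E}{\partial o} = -(t - o) = o - t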
@@ -236,68 +246,86 @@ void nnTrain(
       nnMatrixTranspose(&query->layer_outputs[l], &outputs_T[l]);
     }
 
-    // Update weights and biases for each internal layer, backpropagating
+    // Update weights and biases for each internal layer, back-propagating
     // errors along the way.
     for (int l = net->num_layers - 1; l >= 0; --l) {
       const nnMatrix* layer_output = &query->layer_outputs[l];
-      nnMatrix* layer_weights = &net->weights[l];
-      nnMatrix* layer_biases = &net->biases[l];
-      nnGradientElements* elems = &gradient_elems[l];
-      nnMatrix* gradient = &elems->gradient;
-      const nnActivation activation = net->activations[l];
-
-      // Compute the gradient (the part of the expression that does not
-      // contain the output of the previous layer).
+      nnGradientElements* elems = &gradient_elems[l];
+      nnMatrix* gradient = &elems->gradient;
+      nnLayerImpl* layer = &net->layers[l];
+
+      // Compute this layer's gradient.
+      //
+      // By "gradient" we mean the expression common to the weights and bias
+      // gradients. This is the part of the expression that does not contain
+      // this layer's input.
       //
-      //   Identity: G = error_k
-      //   Sigmoid:  G = error_k * output_k * (1 - output_k).
-      //   Relu:     G = error_k * (output_k > 0 ? 1 : 0)
-      switch (activation) {
-        case nnIdentity:
+      //   Linear:  G = id
+      //   Relu:    G = (output_k > 0 ? 1 : 0)
+      //   Sigmoid: G = output_k * (1 - output_k)
+      switch (layer->type) {
+        case nnLinear: {
           // TODO: Just copy the pointer?
           *gradient = nnMatrixBorrow(&errors[l]);
           break;
+        }
+        case nnRelu:
+          nnMatrixGt(layer_output, 0, gradient);
+          break;
         case nnSigmoid:
           nnMatrixSub(&elems->sigmoid.ones, layer_output, gradient);
           nnMatrixMulPairs(layer_output, gradient, gradient);
-          nnMatrixMulPairs(&errors[l], gradient, gradient);
-          break;
-        case nnRelu:
-          nnMatrixGt(layer_output, 0, gradient);
-          nnMatrixMulPairs(&errors[l], gradient, gradient);
           break;
       }
 
-      // Outer product to compute the weight deltas.
-      const nnMatrix* output_T =
-          (l == 0) ? &training_inputs_T : &outputs_T[l - 1];
-      nnMatrixMul(output_T, gradient, &weight_deltas[l]);
-
-      // Backpropagate the error before updating weights.
+      // Back-propagate the error.
+      //
+      // This combines this layer's gradient with the back-propagated error,
+      // which is the combination of the gradients of subsequent layers down
+      // to the output layer error.
+      //
+      // Note that this step uses the layer's original weights.
       if (l > 0) {
-        // G * W^T == G *^T W.
-        // nnMatrixMul(gradient, &weights_T[l], &errors[l-1]);
-        nnMatrixMulRows(gradient, layer_weights, &errors[l - 1]);
+        switch (layer->type) {
+          case nnLinear: {
+            const nnMatrix* layer_weights = &layer->linear.weights;
+            // E * W^T == E *^T W.
+            // Using nnMatrixMulRows, we avoid having to transpose the weight
+            // matrix.
+            nnMatrixMulRows(&errors[l], layer_weights, &errors[l - 1]);
+            break;
+          }
+          // For activations, the error back-propagates as is but multiplied by
+          // the layer's gradient.
+          case nnRelu:
+          case nnSigmoid:
+            nnMatrixMulPairs(&errors[l], gradient, &errors[l - 1]);
+            break;
+        }
       }
 
-      // Update weights.
-      nnMatrixScale(&weight_deltas[l], params->learning_rate);
-      // The gradient has a negative sign from -(t - o), but we have computed
-      // e = o - t instead, so we can subtract directly.
-      // nnMatrixAdd(layer_weights, &weight_deltas[l], layer_weights);
-      nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);
-
-      // Update weight transpose matrix for the next training iteration.
-      // nnMatrixTranspose(layer_weights, &weights_T[l]);
-
-      // Update biases.
-      // This is the same formula as for weights, except that the o_j term is
-      // just 1. We can simply re-use the gradient that we have already
-      // computed for the weight update.
-      // nnMatrixMulAdd(layer_biases, gradient, params->learning_rate,
-      // layer_biases);
-      nnMatrixMulSub(
-          layer_biases, gradient, params->learning_rate, layer_biases);
+      // Update layer weights.
+      if (layer->type == nnLinear) {
+        nnLinearImpl* linear = &layer->linear;
+        nnMatrix* layer_weights = &linear->weights;
+        nnMatrix* layer_biases = &linear->biases;
+
+        // Outer product to compute the weight deltas.
+        // This layer's input is the previous layer's output.
+        const nnMatrix* input_T =
+            (l == 0) ? &training_inputs_T : &outputs_T[l - 1];
+        nnMatrixMul(input_T, gradient, &weight_deltas[l]);
+
+        // Update weights.
+        nnMatrixScale(&weight_deltas[l], params->learning_rate);
+        nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);
+
+        // Update biases.
+        // This is the same formula as for weights, except that the o_j term
+        // is just 1.
+        nnMatrixMulSub(
+            layer_biases, gradient, params->learning_rate, layer_biases);
+      }
     }
 
     // TODO: Add this under a verbose debugging mode.
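The new back-propagation branch for linear layers relies on the identity noted in the comment: multiplying the error row vector by W^T is the same as taking dot products of the error with the rows of W, so the transpose never needs to be materialized. A standalone illustration with plain row-major arrays; the signature below is hypothetical and does not mirror nnMatrixMulRows:

/* prev_error = E * W^T without forming W^T.
 * E is 1 x output_size; W is input_size x output_size, row-major.
 * (E * W^T)[i] = sum_j E[j] * W[i][j], i.e. a dot product of E with row i
 * of W, which is contiguous in row-major storage. */
static void mul_rows_sketch(
    const float* E, int output_size,
    const float* W, int input_size,
    float* prev_error /* 1 x input_size */) {
  for (int i = 0; i < input_size; ++i) {
    float acc = 0.0f;
    for (int j = 0; j < output_size; ++j) {
      acc += E[j] * W[i * output_size + j];
    }
    prev_error[i] = acc;
  }
}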
@@ -334,12 +362,11 @@ void nnTrain(
   for (int l = 0; l < net->num_layers; ++l) {
     nnMatrixDel(&errors[l]);
     nnMatrixDel(&outputs_T[l]);
-    // nnMatrixDel(&weights_T[l]);
     nnMatrixDel(&weight_deltas[l]);
 
     nnGradientElements* elems = &gradient_elems[l];
     switch (elems->type) {
-      case nnIdentity:
+      case nnLinear:
         break;  // Gradient vector is borrowed, no need to deallocate.
 
       case nnSigmoid:
@@ -355,7 +382,6 @@ void nnTrain(
   nnMatrixDel(&training_inputs_T);
   free(errors);
   free(outputs_T);
-  // free(weights_T);
   free(weight_deltas);
   free(gradient_elems);
 }