Diffstat (limited to 'src/lib/src/train.c')
-rw-r--r--  src/lib/src/train.c  182
1 file changed, 104 insertions(+), 78 deletions(-)
diff --git a/src/lib/src/train.c b/src/lib/src/train.c
index dc93f0f..98f58ad 100644
--- a/src/lib/src/train.c
+++ b/src/lib/src/train.c
@@ -38,7 +38,7 @@ typedef struct nnSigmoidGradientElements {
 /// each layer. A data type is defined for these because we allocate all the
 /// required memory up front before entering the training loop.
 typedef struct nnGradientElements {
-  nnActivation type;
+  nnLayerType type;
   // Gradient vector, same size as the layer.
   // This will contain the gradient expression except for the output value of
   // the previous layer.
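Note: the hunks below access the network through net->layers[l].type, net->layers[l].linear, and gradient_elems[l].sigmoid.ones instead of the old parallel weights/biases/activations arrays. The actual type definitions live in the library's headers and are not part of this diff; what follows is only a minimal sketch, inferred from those uses, of what the layer types might look like. Every field or value not visible in the diff is an assumption.

// Sketch only; inferred from usage in this diff, not copied from the headers.
typedef enum nnLayerType { nnLinear, nnRelu, nnSigmoid } nnLayerType;

// Hypothetical shape of a linear (fully-connected) layer.
typedef struct nnLinearImpl {
  nnMatrix weights;  // layer_input_size x layer_output_size
  nnMatrix biases;   // 1 x layer_output_size
} nnLinearImpl;

// Hypothetical tagged union over layer kinds.
typedef struct nnLayerImpl {
  nnLayerType type;
  union {
    nnLinearImpl linear;  // Valid only when type == nnLinear.
  };
} nnLayerImpl;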
@@ -57,10 +57,27 @@ void nnInitNet(
   mt19937_64_init(&rng, seed);
 
   for (int l = 0; l < net->num_layers; ++l) {
-    nnMatrix* weights = &net->weights[l];
-    nnMatrix* biases = &net->biases[l];
+    // Get the layer's weights and biases, if any.
+    nnMatrix* weights = 0;
+    nnMatrix* biases = 0;
+    switch (net->layers[l].type) {
+      case nnLinear: {
+        nnLinearImpl* linear = &net->layers[l].linear;
+
+        weights = &linear->weights;
+        biases = &linear->biases;
+        break;
+      }
+      // Activations.
+      case nnRelu:
+      case nnSigmoid:
+        break;
+    }
+    if (!weights || !biases) {
+      continue;
+    }
 
-    const R layer_size = (R)nnLayerInputSize(weights);
+    const R layer_size = (R)nnLayerInputSize(net, l);
     const R scale = 1. / layer_size;
     const R stdev = 1. / sqrt((R)layer_size);
     const R sigma = stdev * stdev;
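Note: the values computed at the end of this hunk (scale = 1/layer_size, stdev = 1/sqrt(layer_size), sigma = stdev * stdev) suggest that linear layers are initialized with weight variance inversely proportional to their fan-in:

  \sigma_w = \frac{1}{\sqrt{n_{\text{in}}}}, \qquad \operatorname{Var}[w_{ij}] = \sigma_w^2 = \frac{1}{n_{\text{in}}}

This is a common choice that keeps the variance of a layer's pre-activations roughly independent of its width, assuming roughly unit-variance inputs; the lines that actually consume scale and sigma fall outside this hunk, so this reading is an inference rather than something the diff states.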
@@ -128,9 +145,6 @@ void nnTrain(
   // with one sample at a time.
   nnMatrix* errors = calloc(net->num_layers, sizeof(nnMatrix));
 
-  // Allocate the weight transpose matrices up front for backpropagation.
-  // nnMatrix* weights_T = calloc(net->num_layers, sizeof(nnMatrix));
-
   // Allocate the weight delta matrices.
   nnMatrix* weight_deltas = calloc(net->num_layers, sizeof(nnMatrix));
 
@@ -144,30 +158,24 @@ void nnTrain(
   nnMatrix* outputs_T = calloc(net->num_layers, sizeof(nnMatrix));
 
   assert(errors != 0);
-  // assert(weights_T != 0);
   assert(weight_deltas != 0);
   assert(gradient_elems);
   assert(outputs_T);
 
   for (int l = 0; l < net->num_layers; ++l) {
-    const nnMatrix* layer_weights = &net->weights[l];
-    const int layer_output_size = net->weights[l].cols;
-    const nnActivation activation = net->activations[l];
-
-    errors[l] = nnMatrixMake(1, layer_weights->cols);
-
-    // weights_T[l] = nnMatrixMake(layer_weights->cols, layer_weights->rows);
-    // nnMatrixTranspose(layer_weights, &weights_T[l]);
-
-    weight_deltas[l] = nnMatrixMake(layer_weights->rows, layer_weights->cols);
-
-    outputs_T[l] = nnMatrixMake(layer_output_size, 1);
+    const int layer_input_size = nnLayerInputSize(net, l);
+    const int layer_output_size = nnLayerOutputSize(net, l);
+    const nnLayerImpl* layer = &net->layers[l];
+
+    errors[l] = nnMatrixMake(1, layer_output_size);
+    weight_deltas[l] = nnMatrixMake(layer_input_size, layer_output_size);
+    outputs_T[l] = nnMatrixMake(layer_output_size, 1);
 
     // Allocate the gradient elements and vectors for weight delta calculation.
     nnGradientElements* elems = &gradient_elems[l];
-    elems->type = activation;
-    switch (activation) {
-      case nnIdentity:
+    elems->type = layer->type;
+    switch (layer->type) {
+      case nnLinear:
        break; // Gradient vector will be borrowed, no need to allocate.
 
      case nnSigmoid:
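Note: nnLayerInputSize(net, l) and nnLayerOutputSize(net, l) replace the old direct reads of net->weights[l].rows and .cols. Their bodies are not part of this diff; below is a minimal sketch of how such helpers could behave, assuming the hypothetical nnLayerImpl layout sketched earlier, that a linear layer's weight matrix is input_size x output_size (as the old code and the weight_deltas allocation imply), and that activation layers preserve the size of their input. The nnNetwork type name and the net->input_size field are placeholders, not names taken from the library.

// Sketch only; the real helpers are defined elsewhere in the library.
int nnLayerOutputSize(const nnNetwork* net, int l);

int nnLayerInputSize(const nnNetwork* net, int l) {
  const nnLayerImpl* layer = &net->layers[l];
  switch (layer->type) {
    case nnLinear:
      return layer->linear.weights.rows;
    case nnRelu:
    case nnSigmoid:
      // Activations are element-wise: input size == previous layer's output.
      return (l > 0) ? nnLayerOutputSize(net, l - 1) : net->input_size;
  }
  return 0;
}

int nnLayerOutputSize(const nnNetwork* net, int l) {
  const nnLayerImpl* layer = &net->layers[l];
  return (layer->type == nnLinear) ? layer->linear.weights.cols
                                   : nnLayerInputSize(net, l);
}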
@@ -208,6 +216,7 @@ void nnTrain(
 
   // For now, we train with one sample at a time.
   for (int sample = 0; sample < inputs->rows; ++sample) {
+    // TODO: Introduce a BorrowMut.
     // Slice the input and target matrices with the batch size.
     // We are not mutating the inputs, but we need the cast to borrow.
     nnMatrix training_inputs =
@@ -219,15 +228,16 @@ void nnTrain(
     // Assuming one training input per iteration for now.
     nnMatrixTranspose(&training_inputs, &training_inputs_T);
 
-    // Run a forward pass and compute the output layer error relevant to the
-    // derivative: o-t.
-    //   Error: (t-o)^2
-    //   dE/do = -2(t-o)
-    //         = +2(o-t)
+    // Forward pass.
+    nnQuery(net, query, &training_inputs);
+
+    // Compute the error derivative: o-t.
+    //   Error: 1/2 (t-o)^2
+    //   dE/do = -(t-o)
+    //         = +(o-t)
     // Note that we compute o-t instead to remove that outer negative sign.
     // The 2 is dropped because we are only interested in the direction of the
     // gradient. The learning rate controls the magnitude.
-    nnQuery(net, query, &training_inputs);
     nnMatrixSub(
         training_outputs, &training_targets, &errors[net->num_layers - 1]);
 
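Note: with the corrected loss comment, the output-layer error follows directly from the squared-error derivative for a single output value:

  E = \tfrac{1}{2}(t - o)^2, \qquad \frac{\partial E}{\partial o} = -(t - o) = o - t

The 1/2 cancels the exponent, so errors[net->num_layers - 1] can be filled with o - t directly, which is exactly what the nnMatrixSub(training_outputs, &training_targets, ...) call computes.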
@@ -236,68 +246,86 @@ void nnTrain(
       nnMatrixTranspose(&query->layer_outputs[l], &outputs_T[l]);
     }
 
-    // Update weights and biases for each internal layer, backpropagating
+    // Update weights and biases for each internal layer, back-propagating
     // errors along the way.
     for (int l = net->num_layers - 1; l >= 0; --l) {
       const nnMatrix* layer_output = &query->layer_outputs[l];
-      nnMatrix* layer_weights = &net->weights[l];
-      nnMatrix* layer_biases = &net->biases[l];
-      nnGradientElements* elems = &gradient_elems[l];
-      nnMatrix* gradient = &elems->gradient;
-      const nnActivation activation = net->activations[l];
-
-      // Compute the gradient (the part of the expression that does not
-      // contain the output of the previous layer).
+      nnGradientElements* elems = &gradient_elems[l];
+      nnMatrix* gradient = &elems->gradient;
+      nnLayerImpl* layer = &net->layers[l];
+
+      // Compute this layer's gradient.
+      //
+      // By "gradient" we mean the expression common to the weights and bias
+      // gradients. This is the part of the expression that does not contain
+      // this layer's input.
       //
-      //   Identity: G = error_k
-      //   Sigmoid: G = error_k * output_k * (1 - output_k).
-      //   Relu: G = error_k * (output_k > 0 ? 1 : 0)
-      switch (activation) {
-        case nnIdentity:
+      //   Linear: G = id
+      //   Relu: G = (output_k > 0 ? 1 : 0)
+      //   Sigmoid: G = output_k * (1 - output_k)
+      switch (layer->type) {
+        case nnLinear: {
          // TODO: Just copy the pointer?
          *gradient = nnMatrixBorrow(&errors[l]);
          break;
+        }
+        case nnRelu:
+          nnMatrixGt(layer_output, 0, gradient);
+          break;
        case nnSigmoid:
          nnMatrixSub(&elems->sigmoid.ones, layer_output, gradient);
          nnMatrixMulPairs(layer_output, gradient, gradient);
-          nnMatrixMulPairs(&errors[l], gradient, gradient);
-          break;
-        case nnRelu:
-          nnMatrixGt(layer_output, 0, gradient);
-          nnMatrixMulPairs(&errors[l], gradient, gradient);
          break;
       }
 
-      // Outer product to compute the weight deltas.
-      const nnMatrix* output_T =
-          (l == 0) ? &training_inputs_T : &outputs_T[l - 1];
-      nnMatrixMul(output_T, gradient, &weight_deltas[l]);
-
-      // Backpropagate the error before updating weights.
+      // Back-propagate the error.
+      //
+      // This combines this layer's gradient with the back-propagated error,
+      // which is the combination of the gradients of subsequent layers down
+      // to the output layer error.
+      //
+      // Note that this step uses the layer's original weights.
      if (l > 0) {
-        // G * W^T == G *^T W.
-        // nnMatrixMul(gradient, &weights_T[l], &errors[l-1]);
-        nnMatrixMulRows(gradient, layer_weights, &errors[l - 1]);
+        switch (layer->type) {
+          case nnLinear: {
+            const nnMatrix* layer_weights = &layer->linear.weights;
+            // E * W^T == E *^T W.
+            // Using nnMatrixMulRows, we avoid having to transpose the weight
+            // matrix.
+            nnMatrixMulRows(&errors[l], layer_weights, &errors[l - 1]);
+            break;
+          }
+          // For activations, the error back-propagates as is but multiplied by
+          // the layer's gradient.
+          case nnRelu:
+          case nnSigmoid:
+            nnMatrixMulPairs(&errors[l], gradient, &errors[l - 1]);
+            break;
+        }
      }
 
-      // Update weights.
-      nnMatrixScale(&weight_deltas[l], params->learning_rate);
-      // The gradient has a negative sign from -(t - o), but we have computed
-      // e = o - t instead, so we can subtract directly.
-      // nnMatrixAdd(layer_weights, &weight_deltas[l], layer_weights);
-      nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);
-
-      // Update weight transpose matrix for the next training iteration.
-      // nnMatrixTranspose(layer_weights, &weights_T[l]);
-
-      // Update biases.
-      // This is the same formula as for weights, except that the o_j term is
-      // just 1. We can simply re-use the gradient that we have already
-      // computed for the weight update.
-      // nnMatrixMulAdd(layer_biases, gradient, params->learning_rate,
-      // layer_biases);
-      nnMatrixMulSub(
-          layer_biases, gradient, params->learning_rate, layer_biases);
+      // Update layer weights.
+      if (layer->type == nnLinear) {
+        nnLinearImpl* linear = &layer->linear;
+        nnMatrix* layer_weights = &linear->weights;
+        nnMatrix* layer_biases = &linear->biases;
+
+        // Outer product to compute the weight deltas.
+        // This layer's input is the previous layer's output.
+        const nnMatrix* input_T =
+            (l == 0) ? &training_inputs_T : &outputs_T[l - 1];
+        nnMatrixMul(input_T, gradient, &weight_deltas[l]);
+
+        // Update weights.
+        nnMatrixScale(&weight_deltas[l], params->learning_rate);
+        nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);
+
+        // Update biases.
+        // This is the same formula as for weights, except that the o_j term
+        // is just 1.
+        nnMatrixMulSub(
+            layer_biases, gradient, params->learning_rate, layer_biases);
+      }
     }
 
     // TODO: Add this under a verbose debugging mode.
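Note: the restructured loop is the usual chain-rule split between linear layers and element-wise activations. Writing \delta_l for errors[l], x for the layer's input (the previous layer's output, or the training input when l == 0), and \eta for the learning rate, the operations in this hunk correspond to:

  Linear layer, y = xW + b:  \Delta W = \eta\, x^{\top} \delta_l, \qquad \Delta b = \eta\, \delta_l, \qquad \delta_{l-1} = \delta_l W^{\top}
  Activation, a = g(x):      \delta_{l-1} = \delta_l \odot g'(x), with g' as listed in the gradient comment above

Because \delta_l already carries the o - t sign, the weight and bias deltas are subtracted (nnMatrixSub, nnMatrixMulSub) rather than added. The \delta_l W^{\top} product is evaluated with nnMatrixMulRows so the weight matrix never has to be materialized in transposed form, which is why the commented-out weights_T buffers could be removed earlier in this change.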
@@ -334,12 +362,11 @@ void nnTrain(
   for (int l = 0; l < net->num_layers; ++l) {
     nnMatrixDel(&errors[l]);
     nnMatrixDel(&outputs_T[l]);
-    // nnMatrixDel(&weights_T[l]);
     nnMatrixDel(&weight_deltas[l]);
 
     nnGradientElements* elems = &gradient_elems[l];
     switch (elems->type) {
-      case nnIdentity:
+      case nnLinear:
        break; // Gradient vector is borrowed, no need to deallocate.
 
      case nnSigmoid:
@@ -355,7 +382,6 @@ void nnTrain(
   nnMatrixDel(&training_inputs_T);
   free(errors);
   free(outputs_T);
-  // free(weights_T);
   free(weight_deltas);
   free(gradient_elems);
 }