Diffstat (limited to 'src/lib')
-rw-r--r-- | src/lib/include/neuralnet/matrix.h |  6
-rw-r--r-- | src/lib/src/matrix.c               | 29
-rw-r--r-- | src/lib/src/train.c                | 18
3 files changed, 45 insertions, 8 deletions
diff --git a/src/lib/include/neuralnet/matrix.h b/src/lib/include/neuralnet/matrix.h
index 9816b81..0cb40cf 100644
--- a/src/lib/include/neuralnet/matrix.h
+++ b/src/lib/include/neuralnet/matrix.h
@@ -52,6 +52,12 @@ void nnMatrixInitConstant(nnMatrix*, R value);
 /// Multiply two matrices.
 void nnMatrixMul(const nnMatrix* left, const nnMatrix* right, nnMatrix* out);
 
+/// Multiply two matrices, row variant.
+///
+/// This function multiplies two matrices row-by-row instead of row-by-column.
+/// nnMatrixMul(A, B, O) == nnMatrixMulRows(A, B^T, O).
+void nnMatrixMulRows(const nnMatrix* left, const nnMatrix* right, nnMatrix* out);
+
 /// Matrix multiply-add.
 ///
 /// out = left + (right * scale)
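
The documented identity is easy to check directly. Below is a minimal sketch (illustration only, not part of the commit). It assumes the header is reachable as neuralnet/matrix.h, that R is a floating-point scalar, and that nnMatrix stores its elements row-major in `values`, as matrix.c does; small integer inputs keep the floating-point comparison exact.

#include <assert.h>
#include <stdio.h>

#include <neuralnet/matrix.h>

int main(void) {
  nnMatrix A  = nnMatrixMake(2, 3);
  nnMatrix B  = nnMatrixMake(3, 2);
  nnMatrix Bt = nnMatrixMake(2, 3);  // B^T
  nnMatrix O1 = nnMatrixMake(2, 2);
  nnMatrix O2 = nnMatrixMake(2, 2);

  for (int i = 0; i < 6; ++i) {
    A.values[i] = (R)(i + 1);  // A = [1 2 3; 4 5 6]
    B.values[i] = (R)(6 - i);  // B = [6 5; 4 3; 2 1]
  }
  nnMatrixTranspose(&B, &Bt);

  nnMatrixMul(&A, &B, &O1);       // row-by-column: A * B
  nnMatrixMulRows(&A, &Bt, &O2);  // row-by-row against B^T

  for (int i = 0; i < 4; ++i) {
    assert(O1.values[i] == O2.values[i]);  // exact for small integers
  }
  printf("nnMatrixMulRows(A, B^T) == nnMatrixMul(A, B)\n");

  nnMatrixDel(&A);
  nnMatrixDel(&B);
  nnMatrixDel(&Bt);
  nnMatrixDel(&O1);
  nnMatrixDel(&O2);
  return 0;
}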
diff --git a/src/lib/src/matrix.c b/src/lib/src/matrix.c
index a7a4ce6..29cdec5 100644
--- a/src/lib/src/matrix.c
+++ b/src/lib/src/matrix.c
@@ -150,6 +150,35 @@ void nnMatrixMul(const nnMatrix* left, const nnMatrix* right, nnMatrix* out) {
   }
 }
 
+void nnMatrixMulRows(const nnMatrix* left, const nnMatrix* right, nnMatrix* out) {
+  assert(left != 0);
+  assert(right != 0);
+  assert(out != 0);
+  assert(out != left);
+  assert(out != right);
+  assert(left->cols == right->cols);
+  assert(out->rows == left->rows);
+  assert(out->cols == right->rows);
+
+  R* out_value = out->values;
+
+  for (int i = 0; i < left->rows; ++i) {
+    const R* left_row = &left->values[i * left->cols];
+    const R* right_value = right->values;
+
+    for (int j = 0; j < right->rows; ++j) {
+      *out_value = 0;
+
+      // Vector dot product.
+      for (int k = 0; k < left->cols; ++k) {
+        *out_value += left_row[k] * *right_value++;
+      }
+
+      out_value++;
+    }
+  }
+}
+
 void nnMatrixMulAdd(const nnMatrix* left, const nnMatrix* right, R scale, nnMatrix* out) {
   assert(left);
   assert(right);
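
The train.c changes below are the payoff: with the row variant, the per-layer transposed weight matrices no longer need to be allocated, re-transposed after every weight update, and freed. A secondary effect, visible in the inner loop above, is the access pattern: *right_value++ walks the right operand sequentially, whereas a row-by-column product strides through it. The two inner loops, contrasted as standalone helpers (hypothetical names, illustration only; assumes R from neuralnet/matrix.h):

#include <neuralnet/matrix.h>

// Row-by-column: b is read with stride cols_b, touching a new cache line
// on nearly every step once the matrix is wider than one cache line.
static R dot_row_col(const R* a_row, const R* b, int cols_a, int cols_b, int j) {
  R sum = 0;
  for (int k = 0; k < cols_a; ++k) {
    sum += a_row[k] * b[k * cols_b + j];
  }
  return sum;
}

// Row-by-row (what nnMatrixMulRows does): both operands advance one element
// at a time, giving two sequential streams that prefetch well.
static R dot_row_row(const R* a_row, const R* b_row, int n) {
  R sum = 0;
  for (int k = 0; k < n; ++k) {
    sum += a_row[k] * b_row[k];
  }
  return sum;
}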
diff --git a/src/lib/src/train.c b/src/lib/src/train.c
index 027de66..3061a99 100644
--- a/src/lib/src/train.c
+++ b/src/lib/src/train.c
@@ -129,7 +129,7 @@ void nnTrain(
   nnMatrix* errors = calloc(net->num_layers, sizeof(nnMatrix));
 
   // Allocate the weight transpose matrices up front for backpropagation.
-  nnMatrix* weights_T = calloc(net->num_layers, sizeof(nnMatrix));
+  //nnMatrix* weights_T = calloc(net->num_layers, sizeof(nnMatrix));
 
   // Allocate the weight delta matrices.
   nnMatrix* weight_deltas = calloc(net->num_layers, sizeof(nnMatrix));
@@ -143,7 +143,7 @@ void nnTrain(
   nnMatrix* outputs_T = calloc(net->num_layers, sizeof(nnMatrix));
 
   assert(errors != 0);
-  assert(weights_T != 0);
+  //assert(weights_T != 0);
   assert(weight_deltas != 0);
   assert(gradient_elems);
   assert(outputs_T);
@@ -155,8 +155,8 @@ void nnTrain(
 
     errors[l] = nnMatrixMake(1, layer_weights->cols);
 
-    weights_T[l] = nnMatrixMake(layer_weights->cols, layer_weights->rows);
-    nnMatrixTranspose(layer_weights, &weights_T[l]);
+    //weights_T[l] = nnMatrixMake(layer_weights->cols, layer_weights->rows);
+    //nnMatrixTranspose(layer_weights, &weights_T[l]);
 
     weight_deltas[l] = nnMatrixMake(layer_weights->rows, layer_weights->cols);
 
@@ -267,7 +267,9 @@ void nnTrain(
 
       // Backpropagate the error before updating weights.
       if (l > 0) {
-        nnMatrixMul(gradient, &weights_T[l], &errors[l-1]);
+        // G * W^T == G *^T W.
+        //nnMatrixMul(gradient, &weights_T[l], &errors[l-1]);
+        nnMatrixMulRows(gradient, layer_weights, &errors[l-1]);
       }
 
       // Update weights.
@@ -278,7 +280,7 @@ void nnTrain(
       nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);
 
       // Update weight transpose matrix for the next training iteration.
-      nnMatrixTranspose(layer_weights, &weights_T[l]);
+      //nnMatrixTranspose(layer_weights, &weights_T[l]);
 
       // Update biases.
       // This is the same formula as for weights, except that the o_j term is
@@ -319,7 +321,7 @@ void nnTrain(
   for (int l = 0; l < net->num_layers; ++l) {
     nnMatrixDel(&errors[l]);
     nnMatrixDel(&outputs_T[l]);
-    nnMatrixDel(&weights_T[l]);
+    //nnMatrixDel(&weights_T[l]);
     nnMatrixDel(&weight_deltas[l]);
 
     nnGradientElements* elems = &gradient_elems[l];
@@ -340,7 +342,7 @@ void nnTrain(
   nnMatrixDel(&training_inputs_T);
   free(errors);
   free(outputs_T);
-  free(weights_T);
+  //free(weights_T);
   free(weight_deltas);
   free(gradient_elems);
 }
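
For reference, the identity behind the `G * W^T == G *^T W` comment in the backpropagation hunk: for any matrices G and W with matching inner dimensions,

  (G W^T)[i][j] = sum_k G[i][k] * (W^T)[k][j] = sum_k G[i][k] * W[j][k],

which is exactly the dot product of row i of G with row j of W that nnMatrixMulRows computes. Passing the untransposed layer_weights therefore yields the same errors[l-1] as the old nnMatrixMul against weights_T[l], while letting the training loop drop the transpose allocations, the per-iteration nnMatrixTranspose calls, and the matching cleanup.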