24 files changed, 2593 insertions, 1 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..a060fab
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,6 @@
+cmake_minimum_required(VERSION 3.0)
+project(neuralnet)
+add_subdirectory(src/lib)
+add_subdirectory(src/bin)
diff --git a/LICENSE b/LICENSE
index 0c97efd..3471388 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,7 +1,7 @@
 GNU AFFERO GENERAL PUBLIC LICENSE
 Version 3, 19 November 2007
-Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+Copyright (C) 2022 Marc Sunet <https://shellblade.net/>
 Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
diff --git a/src/bin/CMakeLists.txt b/src/bin/CMakeLists.txt
new file mode 100644
index 0000000..051a56f
--- /dev/null
+++ b/src/bin/CMakeLists.txt
@@ -0,0 +1,3 @@
+cmake_minimum_required(VERSION 3.0)
+add_subdirectory(mnist)
diff --git a/src/bin/mnist/CMakeLists.txt b/src/bin/mnist/CMakeLists.txt
new file mode 100644
index 0000000..a6c54f2
--- /dev/null
+++ b/src/bin/mnist/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.0)
+add_executable(mnist
+  src/main.c)
+target_link_libraries(mnist PRIVATE
+  neuralnet
+  bsd
+  z)
+target_compile_options(mnist PRIVATE -Wall -Wextra)
diff --git a/src/bin/mnist/src/main.c b/src/bin/mnist/src/main.c
new file mode 100644
index 0000000..4d268ac
--- /dev/null
+++ b/src/bin/mnist/src/main.c
@@ -0,0 +1,473 @@
+#include <neuralnet/matrix.h>
+#include <neuralnet/neuralnet.h>
+#include <neuralnet/train.h>
+#include <zlib.h>
+#include <assert.h>
+#include <bsd/string.h>
+#include <linux/limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+static const int TRAIN_ITERATIONS = 100;
+static const int32_t IMAGE_FILE_MAGIC = 0x00000803;
+static const int32_t LABEL_FILE_MAGIC = 0x00000801;
+// Inputs of 0 cancel weights during training. This value is used to rescale the
+// input pixels from [0,255] to [PIXEL_LOWER_BOUND, 1.0].
+static const double PIXEL_LOWER_BOUND = 0.01;
+// Scale the outputs to (0,1) since the sigmoid cannot produce 0 or 1.
+static const double LABEL_LOWER_BOUND = 0.01;
+static const double LABEL_UPPER_BOUND = 0.99;
+// Epsilon used to compare R values.
+static const double EPS = 1e-10;
+#define min(a,b) ((a) < (b) ? (a) : (b))
+typedef struct ImageSet {
+  nnMatrix images;  // Images flattened into row vectors of the matrix.
+  nnMatrix labels;  // One-hot-encoded labels.
+  int count;        // Number of images and labels.
+  int rows;         // Rows in an image.
+  int cols;         // Columns in an image.
+} ImageSet;
+static void usage(const char* argv0) {
+  fprintf(stderr, "Usage: %s <path to mnist files directory> [num images]\n", argv0);
+  fprintf(stderr, "\n");
+  fprintf(stderr, "  Use -1 for [num images] to use all the images in the data set\n");
+}
+static bool R_eq(R a, R b) {
+  return fabs(a-b) <= EPS;
+}
+static void PrintImage(const nnMatrix* images, int rows, int cols, int image_index) {
+  assert(images);
+  assert((0 <= image_index) && (image_index < images->rows));
+  // Top line.
+  for (int j = 0; j < cols/2; ++j) {
+    printf(" -");
+  }
+  printf("\n");
+  // Image.
+  const R* value = nnMatrixRow(images, image_index);
+  for (int i = 0; i < rows; ++i) {
+    printf("|");
+    for (int j = 0; j < cols; ++j) {
+      if (*value > 0.8) {
+        printf("#");
+      } else if (*value > 0.5) {
+        printf("*");
+      }
+      else if (*value > PIXEL_LOWER_BOUND) {
+        printf(":");
+      } else if (*value == 0.0) {
+        // Values should not be exactly 0, otherwise they cancel out weights
+        // during training.
+        printf("X");
+      } else {
+        printf(" ");
+      }
+      value++;
+    }
+    printf("|\n");
+  }
+  // Bottom line.
+  for (int j = 0; j < cols/2; ++j) {
+    printf(" -");
+  }
+  printf("\n");
+}
+static void PrintLabel(const nnMatrix* labels, int label_index) {
+  assert(labels);
+  assert((0 <= label_index) && (label_index < labels->rows));
+  // Compute the label from the one-hot encoding.
+  const R* value = nnMatrixRow(labels, label_index);
+  int label = -1;
+  for (int i = 0; i < 10; ++i) {
+    if (R_eq(*value++, LABEL_UPPER_BOUND)) {
+      label = i;
+      break;
+    }
+  }
+  assert((0 <= label) && (label <= 9));
+  printf("Label: %d ( ", label);
+  value = nnMatrixRow(labels, label_index);
+  for (int i = 0; i < 10; ++i) {
+    printf("%.3f ", *value++);
+  }
+  printf(")\n");
+}
+static R lerp(R a, R b, R t) {
+  return a + t*(b-a);
+}
+/// Rescales a pixel from [0,255] to [PIXEL_LOWER_BOUND, 1.0].
+static R FormatPixel(uint8_t pixel) {
+  const R value = (R)(pixel) / 255.0 * (1.0 - PIXEL_LOWER_BOUND) + PIXEL_LOWER_BOUND;
+  assert(value >= PIXEL_LOWER_BOUND);
+  assert(value <= 1.0);
+  return value;
+}
+/// Rescales a one-hot-encoded label value to (0,1).
+static R FormatLabel(R label) {
+  const R value = lerp(LABEL_LOWER_BOUND, LABEL_UPPER_BOUND, label);
+  assert(value > 0.0);
+  assert(value < 1.0);
+  return value;
+}
+static int32_t ReverseEndian32(int32_t x) {
+  const int32_t x0 = x & 0xff;
+  const int32_t x1 = (x >> 8) & 0xff;
+  const int32_t x2 = (x >> 16) & 0xff;
+  const int32_t x3 = (x >> 24) & 0xff;
+  return (x0 << 24) | (x1 << 16) | (x2 << 8) | x3;
+}
+static void ImageToMatrix(
+    const uint8_t* pixels, int num_pixels, int row, nnMatrix* images) {
+  assert(pixels);
+  assert(images);
+  for (int i = 0; i < num_pixels; ++i) {
+    const R pixel = FormatPixel(pixels[i]);
+    nnMatrixSet(images, row, i, pixel);
+  }
+}
+static bool ReadImages(gzFile images_file, int max_num_images, ImageSet* image_set) {
+  assert(images_file != Z_NULL);
+  assert(image_set);
+  bool success = false;
+  uint8_t* pixels = 0;
+  int32_t magic, total_images, rows, cols;
+  if ( (gzread(images_file, (char*)&magic, sizeof(int32_t)) != sizeof(int32_t)) ||
+       (gzread(images_file, (char*)&total_images, sizeof(int32_t)) != sizeof(int32_t)) ||
+       (gzread(images_file, (char*)&rows, sizeof(int32_t)) != sizeof(int32_t)) ||
+       (gzread(images_file, (char*)&cols, sizeof(int32_t)) != sizeof(int32_t)) ) {
+    fprintf(stderr, "Failed to read header\n");
+    goto cleanup;
+  }
+  magic = ReverseEndian32(magic);
+  total_images = ReverseEndian32(total_images);
+  rows = ReverseEndian32(rows);
+  cols = ReverseEndian32(cols);
+  if (magic != IMAGE_FILE_MAGIC) {
+    fprintf(stderr, "Magic number mismatch. Got %x, expected: %x\n",
+      magic, IMAGE_FILE_MAGIC);
+    goto cleanup;
+  }
+  printf("Magic: %.8x\nTotal images: %d\nRows: %d\nCols: %d\n",
+    magic, total_images, rows, cols);
+  total_images = max_num_images >= 0 ? min(total_images, max_num_images) : total_images;
+  // Images are flattened into single row vectors.
+  const int num_pixels = rows * cols;
+  image_set->images = nnMatrixMake(total_images, num_pixels);
+  image_set->count = total_images;
+  image_set->rows = rows;
+  image_set->cols = cols;
+  pixels = calloc(1, num_pixels);
+  if (!pixels) {
+    fprintf(stderr, "Failed to allocate image buffer\n");
+    goto cleanup;
+  }
+  for (int i = 0; i < total_images; ++i) {
+    const int bytes_read = gzread(images_file, pixels, num_pixels);
+    if (bytes_read < num_pixels) {
+      fprintf(stderr, "Failed to read image %d\n", i);
+      goto cleanup;
+    }
+    ImageToMatrix(pixels, num_pixels, i, &image_set->images);
+  }
+  success = true;
+cleanup:
+  if (pixels) {
+    free(pixels);
+  }
+  if (!success) {
+    nnMatrixDel(&image_set->images);
+  }
+  return success;
+}
+static void OneHotEncode(const uint8_t* labels_bytes, int num_labels, nnMatrix* labels) {
+  assert(labels_bytes);
+  assert(labels);
+  assert(labels->rows == num_labels);
+  assert(labels->cols == 10);
+  static const R one_hot[10][10] = {
+    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0 },
+    { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 1, 0, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+  };
+  R* value = labels->values;
+  for (int i = 0; i < num_labels; ++i) {
+    const uint8_t label = labels_bytes[i];
+    const R* one_hot_value = one_hot[label];
+    for (int j = 0; j < 10; ++j) {
+      *value++ = FormatLabel(*one_hot_value++);
+    }
+  }
+}
+static int OneHotDecode(const nnMatrix* label_matrix) {
+  assert(label_matrix);
+  assert(label_matrix->cols == 1);
+  assert(label_matrix->rows == 10);
+  R max_value = 0;
+  int pos_max = 0;
+  for (int i = 0; i < 10; ++i) {
+    const R value = nnMatrixAt(label_matrix, 0, i);
+    if (value > max_value) {
+      max_value = value;
+      pos_max = i;
+    }
+  }
+  assert(pos_max >= 0);
+  assert(pos_max <= 10);
+  return pos_max;
+}
+static bool ReadLabels(gzFile labels_file, int max_num_labels, ImageSet* image_set) {
+  assert(labels_file != Z_NULL);
+  assert(image_set != 0);
+  bool success = false;
+  uint8_t* labels = 0;
+  int32_t magic, total_labels;
+  if ( (gzread(labels_file, (char*)&magic, sizeof(int32_t)) != sizeof(int32_t)) ||
+       (gzread(labels_file, (char*)&total_labels, sizeof(int32_t)) != sizeof(int32_t)) ) {
+    fprintf(stderr, "Failed to read header\n");
+    goto cleanup;
+  }
+  magic = ReverseEndian32(magic);
+  total_labels = ReverseEndian32(total_labels);
+  if (magic != LABEL_FILE_MAGIC) {
+    fprintf(stderr, "Magic number mismatch. Got %x, expected: %x\n",
+      magic, LABEL_FILE_MAGIC);
+    goto cleanup;
+  }
+  printf("Magic: %.8x\nTotal labels: %d\n", magic, total_labels);
+  total_labels = max_num_labels >= 0 ? min(total_labels, max_num_labels) : total_labels;
+  assert(image_set->count == total_labels);
+  // One-hot encoding of labels, 10 values (digits) per label.
+  image_set->labels = nnMatrixMake(total_labels, 10);
+  labels = calloc(total_labels, sizeof(uint8_t));
+  if (!labels) {
+    fprintf(stderr, "Failed to allocate labels buffer\n");
+    goto cleanup;
+  }
+  if (gzread(labels_file, labels, total_labels * sizeof(uint8_t)) != total_labels) {
+    fprintf(stderr, "Failed to read labels\n");
+    goto cleanup;
+  }
+  OneHotEncode(labels, total_labels, &image_set->labels);
+  success = true;
+cleanup:
+  if (labels) {
+    free(labels);
+  }
+  if (!success) {
+    nnMatrixDel(&image_set->labels);
+  }
+  return success;
+}
+int main(int argc, const char** argv) {
+  if (argc < 2) {
+    usage(argv[0]);
+    return 1;
+  }
+  bool success = false;
+  gzFile train_images_file = Z_NULL;
+  gzFile train_labels_file = Z_NULL;
+  gzFile test_images_file  = Z_NULL;
+  gzFile test_labels_file  = Z_NULL;
+  ImageSet train_set = { 0 };
+  ImageSet test_set  = { 0 };
+  nnNeuralNetwork* net = 0;
+  nnQueryObject* query = 0;
+  const char* mnist_files_dir = argv[1];
+  const int max_num_images = argc > 2 ? atoi(argv[2]) : -1;
+  char train_labels_path[PATH_MAX];
+  char train_images_path[PATH_MAX];
+  char test_labels_path[PATH_MAX];
+  char test_images_path[PATH_MAX];
+  strlcpy(train_labels_path, mnist_files_dir, PATH_MAX);
+  strlcpy(train_images_path, mnist_files_dir, PATH_MAX);
+  strlcpy(test_labels_path,  mnist_files_dir, PATH_MAX);
+  strlcpy(test_images_path,  mnist_files_dir, PATH_MAX);
+  strlcat(train_labels_path, "/train-labels-idx1-ubyte.gz", PATH_MAX);
+  strlcat(train_images_path, "/train-images-idx3-ubyte.gz", PATH_MAX);
+  strlcat(test_labels_path,  "/t10k-labels-idx1-ubyte.gz",  PATH_MAX);
+  strlcat(test_images_path,  "/t10k-images-idx3-ubyte.gz",  PATH_MAX);
+  train_images_file = gzopen(train_images_path, "r");
+  if (train_images_file == Z_NULL) {
+    fprintf(stderr, "Failed to open file: %s\n", train_images_path);
+    goto cleanup;
+  }
+  train_labels_file = gzopen(train_labels_path, "r");
+  if (train_labels_file == Z_NULL) {
+    fprintf(stderr, "Failed to open file: %s\n", train_labels_path);
+    goto cleanup;
+  }
+  test_images_file = gzopen(test_images_path, "r");
+  if (test_images_file == Z_NULL) {
+    fprintf(stderr, "Failed to open file: %s\n", test_images_path);
+    goto cleanup;
+  }
+  test_labels_file = gzopen(test_labels_path, "r");
+  if (test_labels_file == Z_NULL) {
+    fprintf(stderr, "Failed to open file: %s\n", test_labels_path);
+    goto cleanup;
+  }
+  if (!ReadImages(train_images_file, max_num_images, &train_set)) {
+    goto cleanup;
+  }
+  if (!ReadLabels(train_labels_file, max_num_images, &train_set)) {
+    goto cleanup;
+  }
+  if (!ReadImages(test_images_file, max_num_images, &test_set)) {
+    goto cleanup;
+  }
+  if (!ReadLabels(test_labels_file, max_num_images, &test_set)) {
+    goto cleanup;
+  }
+  printf("\nTraining image/label pair examples:\n");
+  for (int i = 0; i < min(3, train_set.images.rows); ++i) {
+    PrintImage(&train_set.images, train_set.rows, train_set.cols, i);
+    PrintLabel(&train_set.labels, i);
+    printf("\n");
+  }
+  // Network definition.
+  const int image_size_pixels = train_set.rows * train_set.cols;
+  const int num_layers = 2;
+  const int layer_sizes[3] = { image_size_pixels, 100, 10 };
+  const nnActivation layer_activations[2] = { nnSigmoid, nnSigmoid };
+  if (!(net = nnMakeNet(num_layers, layer_sizes, layer_activations))) {
+    fprintf(stderr, "Failed to create neural network\n");
+    goto cleanup;
+  }
+  // Train.
+  printf("Training with up to %d images from the data set\n\n", max_num_images);
+  const nnTrainingParams training_params = {
+    .learning_rate = 0.1,
+    .max_iterations = TRAIN_ITERATIONS,
+    .seed = 0,
+    .weight_init = nnWeightInitNormal,
+    .debug = true,
+  };
+  nnTrain(net, &train_set.images, &train_set.labels, &training_params);
+  // Test.
+  int hits = 0;
+  query = nnMakeQueryObject(net, /*num_inputs=*/1);
+  for (int i = 0; i < test_set.count; ++i) {
+    const nnMatrix test_image = nnMatrixBorrowRows(&test_set.images, i, 1);
+    const nnMatrix test_label = nnMatrixBorrowRows(&test_set.labels, i, 1);
+    nnQuery(net, query, &test_image);
+    const int test_label_expected = OneHotDecode(&test_label);
+    const int test_label_actual   = OneHotDecode(nnNetOutputs(query));
+    if (test_label_actual == test_label_expected) {
+      ++hits;
+    }
+  }
+  const R hit_ratio = (R)hits / (R)test_set.count;
+  printf("Test images: %d\n", test_set.count);
+  printf("Hits: %d/%d (%.3f%%)\n", hits, test_set.count, hit_ratio*100);
+  success = true;
+cleanup:
+  if (query) {
+    nnDeleteQueryObject(&query);
+  }
+  if (net) {
+    nnDeleteNet(&net);
+  }
+  nnMatrixDel(&train_set.images);
+  nnMatrixDel(&test_set.images);
+  if (train_images_file != Z_NULL) {
+    gzclose(train_images_file);
+  }
+  if (train_labels_file != Z_NULL) {
+    gzclose(train_labels_file);
+  }
+  if (test_images_file != Z_NULL) {
+    gzclose(test_images_file);
+  }
+  if (test_labels_file != Z_NULL) {
+    gzclose(test_labels_file);
+  }
+  return success ? 0 : 1;
+}
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
new file mode 100644
index 0000000..9e0e924
--- /dev/null
+++ b/src/lib/CMakeLists.txt
@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.0)
+# Library
+add_library(neuralnet
+  src/matrix.c
+  src/neuralnet.c
+  src/train.c)
+target_include_directories(neuralnet PUBLIC
+  include)
+target_link_libraries(neuralnet PRIVATE
+  math  # System math library.
+  random)
+target_compile_options(neuralnet PRIVATE -Wall -Wextra)
+# Test
+add_executable(neuralnet-test
+  test/matrix_test.c
+  test/neuralnet_test.c
+  test/test_main.c
+  test/train_linear_perceptron_test.c
+  test/train_linear_perceptron_non_origin_test.c
+  test/train_sigmoid_test.c
+  test/train_xor_test.c)
+target_link_libraries(neuralnet-test PRIVATE
+  neuralnet)
+# So that we can include header files from the private implementation.
+target_include_directories(neuralnet-test PRIVATE
+  src)
+target_compile_options(neuralnet-test PRIVATE -DUNIT_TEST -Wall -Wextra)
diff --git a/src/lib/include/neuralnet/matrix.h b/src/lib/include/neuralnet/matrix.h
new file mode 100644
index 0000000..9816b81
--- /dev/null
+++ b/src/lib/include/neuralnet/matrix.h
@@ -0,0 +1,111 @@
+#pragma once
+#include <neuralnet/types.h>
+#include <assert.h>
+/// NxM matrix.
+typedef struct nnMatrix {
+  int rows;
+  int cols;
+  R*  values;
+} nnMatrix;
+/// Construct a matrix.
+nnMatrix nnMatrixMake(int rows, int cols);
+/// Delete a matrix and free its internal memory.
+void nnMatrixDel(nnMatrix*);
+/// Move a matrix.
+///
+/// |in| is an empty matrix after the move.
+/// |out| is a matrix like |in| before the move.
+void nnMatrixMove(nnMatrix* in, nnMatrix* out);
+/// Deep-copy a matrix.
+void nnMatrixCopy(const nnMatrix* in, nnMatrix* out);
+/// Write the matrix values into an array in a row-major fashion.
+void nnMatrixToArray(const nnMatrix* in, R* out);
+/// Write the given row of a matrix into an array.
+void nnMatrixRowToArray(const nnMatrix* in, int row, R* out);
+/// Copy a column from a source to a target matrix.
+void nnMatrixCopyCol(const nnMatrix* in, nnMatrix* out, int col_in, int col_out);
+/// Mutable borrow of a matrix.
+nnMatrix nnMatrixBorrow(nnMatrix* in);
+/// Mutable borrow of a subrange of rows of a matrix.
+nnMatrix nnMatrixBorrowRows(nnMatrix* in, int row_start, int num_rows);
+/// Initialize the matrix from an array of values.
+///
+/// The array must hold values in a row-major fashion.
+void nnMatrixInit(nnMatrix*, const R* values);
+/// Initialize all matrix values to a given constant.
+void nnMatrixInitConstant(nnMatrix*, R value);
+/// Multiply two matrices.
+void nnMatrixMul(const nnMatrix* left, const nnMatrix* right, nnMatrix* out);
+/// Matrix multiply-add.
+///
+/// out = left + (right * scale)
+void nnMatrixMulAdd(const nnMatrix* left, const nnMatrix* right, R scale, nnMatrix* out);
+/// Matrix multiply-subtract.
+///
+/// out = left - (right * scale)
+void nnMatrixMulSub(const nnMatrix* left, const nnMatrix* right, R scale, nnMatrix* out);
+/// Hadamard product of two matrices.
+void nnMatrixMulPairs(const nnMatrix* left, const nnMatrix* right, nnMatrix* out);
+/// Add two matrices.
+void nnMatrixAdd(const nnMatrix* left, const nnMatrix* right, nnMatrix* out);
+/// Subtract two matrices.
+void nnMatrixSub(const nnMatrix* left, const nnMatrix* right, nnMatrix* out);
+/// Adds a row vector to all rows of the matrix.
+void nnMatrixAddRow(const nnMatrix* matrix, const nnMatrix* row, nnMatrix* out);
+/// Scale a matrix.
+void nnMatrixScale(nnMatrix*, R scale);
+/// Transpose a matrix.
+/// |in| must be different than |out|.
+void nnMatrixTranspose(const nnMatrix* in, nnMatrix* out);
+/// Threshold the values of a matrix using a greater-than operator.
+///
+/// out[x,y] = 1 if in[x,y] > threshold else 0
+void nnMatrixGt(const nnMatrix* in, R threshold, nnMatrix* out);
+/// Return the matrix value at the given row and column.
+static inline R nnMatrixAt(const nnMatrix* matrix, int row, int col) {
+  assert(matrix);
+  return matrix->values[row * matrix->cols + col];
+}
+/// Set the matrix value at the given row and column.
+static inline void nnMatrixSet(nnMatrix* matrix, int row, int col, R value) {
+  assert(matrix);
+  matrix->values[row * matrix->cols + col] = value;
+}
+/// Return a pointer to the given row in the matrix.
+static inline const R* nnMatrixRow(const nnMatrix* matrix, int row) {
+  assert(matrix);
+  return &matrix->values[row * matrix->cols];
+}
+/// Return a mutable pointer to the given row in the matrix.
+static inline R* nnMatrixRow_mut(nnMatrix* matrix, int row) {
+  assert(matrix);
+  return &matrix->values[row * matrix->cols];
+}
diff --git a/src/lib/include/neuralnet/neuralnet.h b/src/lib/include/neuralnet/neuralnet.h
new file mode 100644
index 0000000..1cf1c53
--- /dev/null
+++ b/src/lib/include/neuralnet/neuralnet.h
@@ -0,0 +1,64 @@
+#pragma once
+#include <neuralnet/types.h>
+typedef struct nnMatrix nnMatrix;
+typedef struct nnNeuralNetwork nnNeuralNetwork;
+typedef struct nnQueryObject nnQueryObject;
+/// Neuron activation.
+typedef enum nnActivation {
+  nnIdentity,
+  nnSigmoid,
+  nnRelu,
+} nnActivation;
+/// Create a network.
+nnNeuralNetwork* nnMakeNet(int num_layers, const int* layer_sizes, const nnActivation* activations);
+/// Delete the network and free its internal memory.
+void nnDeleteNet(nnNeuralNetwork**);
+/// Set the network's weights.
+void nnSetWeights(nnNeuralNetwork*, const R* weights);
+/// Set the network's biases.
+void nnSetBiases(nnNeuralNetwork*, const R* biases);
+/// Query the network.
+///
+/// |input| is a matrix of inputs, one row per input and as many columns as the
+/// input's dimension.
+///
+/// The query object's output matrix (see nnQueryOutputs()) is a matrix of
+/// outputs, one row per output and as many columns as the output's dimension.
+void nnQuery(const nnNeuralNetwork*, nnQueryObject*, const nnMatrix* input);
+/// Query the network, array version.
+void nnQueryArray(const nnNeuralNetwork*, nnQueryObject*, const R* input, R* output);
+/// Create a query object.
+///
+/// The query object holds all the internal memory required to query a network.
+/// Query objects allocate all memory up front so that network queries can run
+/// without additional memory allocation.
+nnQueryObject* nnMakeQueryObject(const nnNeuralNetwork*, int num_inputs);
+/// Delete the query object and free its internal memory.
+void nnDeleteQueryObject(nnQueryObject**);
+/// Return the outputs of the query.
+const nnMatrix* nnNetOutputs(const nnQueryObject*);
+/// Return the network's input size.
+int nnNetInputSize(const nnNeuralNetwork*);
+/// Return the network's output size.
+int nnNetOutputSize(const nnNeuralNetwork*);
+/// Return the layer's input size.
+int nnLayerInputSize(const nnMatrix* weights);
+/// Return the layer's output size.
+int nnLayerOutputSize(const nnMatrix* weights);
diff --git a/src/lib/include/neuralnet/train.h b/src/lib/include/neuralnet/train.h
new file mode 100644
index 0000000..79f8e7b
--- /dev/null
+++ b/src/lib/include/neuralnet/train.h
@@ -0,0 +1,42 @@
+#pragma once
+#include <neuralnet/neuralnet.h>
+#include <stdbool.h>
+#include <stdint.h>
+typedef struct nnMatrix nnMatrix;
+/// Weight initialization strategy.
+///
+/// Note that regardless of strategy, a layer's weights are scaled by the
+/// layer's size. This is to avoid saturation when, e.g., using a sigmoid
+/// activation with many inputs. Thus, a (0,1) initialization is really
+/// (0,scale), for example.
+typedef enum nnWeightInitStrategy {
+  nnWeightInit01,      // (0,1) range.
+  nnWeightInit11,      // (-1,+1) range.
+  nnWeightInitNormal,  // Normal distribution.
+} nnWeightInitStrategy;
+/// Network training parameters.
+typedef struct nnTrainingParams {
+  R learning_rate;
+  int max_iterations;
+  uint64_t seed;
+  nnWeightInitStrategy weight_init;
+  bool debug;
+} nnTrainingParams;
+/// Train the network.
+///
+/// |inputs| is a matrix of inputs, one row per input and as many columns as
+/// the input's dimension.
+///
+/// |targets| is a matrix of targets, one row per target and as many columns as
+/// the target's dimension.
+void nnTrain(
+  nnNeuralNetwork*,
+  const nnMatrix* inputs,
+  const nnMatrix* targets,
+  const nnTrainingParams*);
diff --git a/src/lib/include/neuralnet/types.h b/src/lib/include/neuralnet/types.h
new file mode 100644
index 0000000..e8d3942
--- /dev/null
+++ b/src/lib/include/neuralnet/types.h
@@ -0,0 +1,3 @@
+#pragma once
+typedef double R;
diff --git a/src/lib/src/activation.h b/src/lib/src/activation.h
new file mode 100644
index 0000000..42ab73f
--- /dev/null
+++ b/src/lib/src/activation.h
@@ -0,0 +1,21 @@
+#pragma once
+#include <neuralnet/types.h>
+#include <math.h>
+static inline R sigmoid(R x) {
+  return 1. / (1. + exp(-x));
+}
+static inline R relu(R x) {
+  return fmax(0, x);
+}
+#define NN_MAP_ARRAY(f, in, out, size) \
+  for (int i = 0; i < size; ++i) { \
+    out[i] = f(in[i]); \
+  }
+#define sigmoid_array(in, out, size) NN_MAP_ARRAY(sigmoid, in, out, size)
+#define relu_array(in, out, size)    NN_MAP_ARRAY(relu, in, out, size)
diff --git a/src/lib/src/matrix.c b/src/lib/src/matrix.c
new file mode 100644
index 0000000..a7a4ce6
--- /dev/null
+++ b/src/lib/src/matrix.c
@@ -0,0 +1,298 @@
+#include <neuralnet/matrix.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+nnMatrix nnMatrixMake(int rows, int cols) {
+  R* values = calloc(rows * cols, sizeof(R));
+  assert(values != 0);
+  return (nnMatrix) {
+    .rows = rows,
+    .cols = cols,
+    .values = values,
+  };
+}
+void nnMatrixDel(nnMatrix* matrix) {
+  assert(matrix != 0);
+  if (matrix->values != 0) {
+    free(matrix->values);
+    matrix->values = 0;
+    matrix->rows = 0;
+    matrix->cols = 0;
+  }
+}
+void nnMatrixMove(nnMatrix* in, nnMatrix* out) {
+  assert(in);
+  assert(out);
+  out->rows = in->rows;
+  out->cols = in->cols;
+  out->values = in->values;
+  in->rows = 0;
+  in->cols = 0;
+  in->values = 0;
+}
+void nnMatrixCopy(const nnMatrix* in, nnMatrix* out) {
+  assert(in);
+  assert(out);
+  assert(in->rows == out->rows);
+  assert(in->cols == out->cols);
+  const R* in_value = in->values;
+  R* out_value = out->values;
+  for (int i = 0; i < in->rows * in->cols; ++i) {
+    *out_value++ = *in_value++;
+  }
+}
+void nnMatrixToArray(const nnMatrix* in, R* out) {
+  assert(in);
+  assert(out);
+  const R* values = in->values;
+  for (int i = 0; i < in->rows * in->cols; ++i) {
+    *out++ = *values++;
+  }
+}
+void nnMatrixRowToArray(const nnMatrix* in, int row, R* out) {
+  assert(in);
+  assert(out);
+  const R* values = in->values + row * in->cols;
+  for (int i = 0; i < in->cols; ++i) {
+    *out++ = *values++;
+  }
+}
+void nnMatrixCopyCol(const nnMatrix* in, nnMatrix* out, int col_in, int col_out) {
+  assert(in);
+  assert(out);
+  assert(in->rows == out->rows);
+  assert(col_in < in->cols);
+  assert(col_out < out->cols);
+  for (int row = 0; row < in->rows; ++row) {
+    nnMatrixSet(out, row, col_out, nnMatrixAt(in, row, col_in));
+  }
+}
+nnMatrix nnMatrixBorrow(nnMatrix* in) {
+  assert(in);
+  nnMatrix out;
+  out.rows = in->rows;
+  out.cols = in->cols;
+  out.values = in->values;
+  return out;
+}
+nnMatrix nnMatrixBorrowRows(nnMatrix* in, int row_start, int num_rows) {
+  assert(in);
+  assert(row_start < in->rows);
+  assert(row_start + num_rows <= in->rows);
+  nnMatrix out;
+  out.rows = num_rows;
+  out.cols = in->cols;
+  out.values = nnMatrixRow_mut(in, row_start);
+  return out;
+}
+void nnMatrixInit(nnMatrix* matrix, const R* values) {
+  assert(matrix);
+  assert(values);
+  memcpy(matrix->values, values, matrix->rows * matrix->cols * sizeof(R));
+}
+void nnMatrixInitConstant(nnMatrix* matrix, R value) {
+  assert(matrix);
+  for (int i = 0; i < matrix->rows * matrix->cols; ++i) {
+    matrix->values[i] = value;
+  }
+}
+void nnMatrixMul(const nnMatrix* left, const nnMatrix* right, nnMatrix* out) {
+  assert(left != 0);
+  assert(right != 0);
+  assert(out != 0);
+  assert(out != left);
+  assert(out != right);
+  assert(left->cols == right->rows);
+  assert(out->rows == left->rows);
+  assert(out->cols == right->cols);
+  R* out_value = out->values;
+  for (int i = 0; i < left->rows; ++i) {
+    const R* left_row = &left->values[i * left->cols];
+    for (int j = 0; j < right->cols; ++j) {
+      const R* right_col = &right->values[j];
+      *out_value = 0;
+      // Vector dot product.
+      for (int k = 0; k < left->cols; ++k) {
+        *out_value += left_row[k] * right_col[0];
+        right_col += right->cols;  // Next row in the column.
+      }
+      out_value++;
+    }
+  }
+}
+void nnMatrixMulAdd(const nnMatrix* left, const nnMatrix* right, R scale, nnMatrix* out) {
+  assert(left);
+  assert(right);
+  assert(out);
+  assert(left->rows == right->rows);
+  assert(left->cols == right->cols);
+  assert(left->rows == out->rows);
+  assert(left->cols == out->cols);
+  const R* left_value = left->values;
+  const R* right_value = right->values;
+  R* out_value = out->values;
+  for (int i = 0; i < left->rows * left->cols; ++i) {
+    *out_value++ = *left_value++ + *right_value++ * scale;
+  }
+}
+void nnMatrixMulSub(const nnMatrix* left, const nnMatrix* right, R scale, nnMatrix* out) {
+  assert(left);
+  assert(right);
+  assert(out);
+  assert(left->rows == right->rows);
+  assert(left->cols == right->cols);
+  assert(left->rows == out->rows);
+  assert(left->cols == out->cols);
+  const R* left_value = left->values;
+  const R* right_value = right->values;
+  R* out_value = out->values;
+  for (int i = 0; i < left->rows * left->cols; ++i) {
+    *out_value++ = *left_value++ - *right_value++ * scale;
+  }
+}
+void nnMatrixMulPairs(const nnMatrix* left, const nnMatrix* right, nnMatrix* out) {
+  assert(left != 0);
+  assert(right != 0);
+  assert(out != 0);
+  assert(left->rows == right->rows);
+  assert(left->cols == right->cols);
+  assert(left->rows == out->rows);
+  assert(left->cols == out->cols);
+  R* left_value = left->values;
+  R* right_value = right->values;
+  R* out_value = out->values;
+  for (int i = 0; i < left->rows * left->cols; ++i) {
+    *out_value++ = *left_value++ * *right_value++;
+  }
+}
+void nnMatrixAdd(const nnMatrix* left, const nnMatrix* right, nnMatrix* out) {
+  assert(left);
+  assert(right);
+  assert(out);
+  assert(left->rows == right->rows);
+  assert(left->cols == right->cols);
+  assert(left->rows == out->rows);
+  assert(left->cols == out->cols);
+  const R* left_value = left->values;
+  const R* right_value = right->values;
+  R* out_value = out->values;
+  for (int i = 0; i < left->rows * left->cols; ++i) {
+    *out_value++ = *left_value++ + *right_value++;
+  }
+}
+void nnMatrixSub(const nnMatrix* left, const nnMatrix* right, nnMatrix* out) {
+  assert(left);
+  assert(right);
+  assert(out);
+  assert(left->rows == right->rows);
+  assert(left->cols == right->cols);
+  assert(left->rows == out->rows);
+  assert(left->cols == out->cols);
+  const R* left_value = left->values;
+  const R* right_value = right->values;
+  R* out_value = out->values;
+  for (int i = 0; i < left->rows * left->cols; ++i) {
+    *out_value++ = *left_value++ - *right_value++;
+  }
+}
+void nnMatrixAddRow(const nnMatrix* matrix, const nnMatrix* row, nnMatrix* out) {
+  assert(matrix);
+  assert(row);
+  assert(out);
+  assert(row->rows == 1);
+  assert(matrix->cols == row->cols);
+  assert(matrix->rows == out->rows);
+  assert(matrix->cols == out->cols);
+  const R* matrix_value = matrix->values;
+  R* out_value = out->values;
+  for (int i = 0; i < matrix->rows; ++i) {
+    const R* row_value = row->values;
+    for (int j = 0; j < row->cols; ++j) {
+      *out_value++ = *matrix_value++ + *row_value++;
+    }
+  }
+}
+void nnMatrixScale(nnMatrix* matrix, R scale) {
+  assert(matrix);
+  R* value = matrix->values;
+  for (int i = 0; i < matrix->rows * matrix->cols; ++i) {
+    *value++ *= scale;
+  }
+}
+void nnMatrixTranspose(const nnMatrix* in, nnMatrix* out) {
+  assert(in);
+  assert(out);
+  assert(in != out);
+  assert(in->rows == out->cols);
+  assert(in->cols == out->rows);
+  for (int i = 0; i < in->rows; ++i) {
+    for (int j = 0; j < in->cols; ++j) {
+      nnMatrixSet(out, j, i, nnMatrixAt(in, i, j));
+    }
+  }
+}
+void nnMatrixGt(const nnMatrix* in, R threshold, nnMatrix* out) {
+  assert(in);
+  assert(out);
+  assert(in->rows == out->rows);
+  assert(in->cols == out->cols);
+  const R* in_value = in->values;
+  R* out_value = out->values;
+  for (int i = 0; i < in->rows * in->cols; ++i) {
+    *out_value++ = (*in_value++) > threshold ? 1 : 0;
+  }
+}
diff --git a/src/lib/src/neuralnet.c b/src/lib/src/neuralnet.c
new file mode 100644
index 0000000..cac611a
--- /dev/null
+++ b/src/lib/src/neuralnet.c
@@ -0,0 +1,228 @@
+#include <neuralnet/neuralnet.h>
+#include <neuralnet/matrix.h>
+#include "activation.h"
+#include "neuralnet_impl.h"
+#include <assert.h>
+#include <stdlib.h>
+nnNeuralNetwork* nnMakeNet(int num_layers, const int* layer_sizes, const nnActivation* activations) {
+  assert(num_layers > 0);
+  assert(layer_sizes);
+  assert(activations);
+  nnNeuralNetwork* net = calloc(1, sizeof(nnNeuralNetwork));
+  if (net == 0) {
+    return 0;
+  }
+  net->num_layers = num_layers;
+  net->weights = calloc(num_layers, sizeof(nnMatrix));
+  net->biases  = calloc(num_layers, sizeof(nnMatrix));
+  net->activations = calloc(num_layers, sizeof(nnActivation));
+  if ( (net->weights == 0) || (net->biases == 0) || (net->activations == 0) ) {
+    nnDeleteNet(&net);
+    return 0;
+  }
+  for (int l = 0; l < num_layers; ++l) {
+    // layer_sizes = { input layer size, first hidden layer size, ...}
+    const int layer_input_size  = layer_sizes[l];
+    const int layer_output_size = layer_sizes[l+1];
+    // We store the transpose of the weight matrix as written in textbooks.
+    // Our vectors are row vectors and the matrices row-major.
+    const int rows = layer_input_size;
+    const int cols = layer_output_size;
+    net->weights[l] = nnMatrixMake(rows, cols);
+    net->biases[l]  = nnMatrixMake(1, cols);
+    net->activations[l] = activations[l];
+  }
+  return net;
+}
+void nnDeleteNet(nnNeuralNetwork** net) {
+  if ( (!net) || (!(*net)) ) {
+    return;
+  }
+  if ((*net)->weights != 0) {
+    for (int l = 0; l < (*net)->num_layers; ++l) {
+      nnMatrixDel(&(*net)->weights[l]);
+    }
+    free((*net)->weights);
+    (*net)->weights = 0;
+  }
+  if ((*net)->biases != 0) {
+    for (int l = 0; l < (*net)->num_layers; ++l) {
+      nnMatrixDel(&(*net)->biases[l]);
+    }
+    free((*net)->biases);
+    (*net)->biases = 0;
+  }
+  if ((*net)->activations) {
+    free((*net)->activations);
+    (*net)->activations = 0;
+  }
+  free(*net);
+  *net = 0;
+}
+void nnSetWeights(nnNeuralNetwork* net, const R* weights) {
+  assert(net);
+  assert(weights);
+  for (int l = 0; l < net->num_layers; ++l) {
+    nnMatrix* layer_weights = &net->weights[l];
+    R* layer_values = layer_weights->values;
+    for (int j = 0; j < layer_weights->rows * layer_weights->cols; ++j) {
+      *layer_values++ = *weights++;
+    }
+  }
+}
+void nnSetBiases(nnNeuralNetwork* net, const R* biases) {
+  assert(net);
+  assert(biases);
+  for (int l = 0; l < net->num_layers; ++l) {
+    nnMatrix* layer_biases = &net->biases[l];
+    R* layer_values = layer_biases->values;
+    for (int j = 0; j < layer_biases->rows * layer_biases->cols; ++j) {
+      *layer_values++ = *biases++;
+    }
+  }
+}
+void nnQuery(const nnNeuralNetwork* net, nnQueryObject* query, const nnMatrix* input) {
+  assert(net);
+  assert(query);
+  assert(input);
+  assert(net->num_layers == query->num_layers);
+  assert(input->rows <= query->network_outputs->rows);
+  assert(input->cols == nnNetInputSize(net));
+  for (int i = 0; i < input->rows; ++i) {
+    // Not mutating the input, but we need the cast to borrow.
+    nnMatrix input_vector = nnMatrixBorrowRows((nnMatrix*)input, i, 1);
+    for (int l = 0; l < net->num_layers; ++l) {
+      const nnMatrix* layer_weights = &net->weights[l];
+      const nnMatrix* layer_biases  = &net->biases[l];
+      // Y^T = (W*X)^T = X^T*W^T
+      //
+      // TODO: If we had a row-row matrix multiplication, we could compute:
+      //   Y^T = W ** X^T
+      // The row-row multiplication could be more cache-friendly. We just need
+      // to store W as is, without transposing.
+      // We could also rewrite the original Mul function to go row x row,
+      // decomposing the multiplication. Preserving the original meaning of Mul
+      // makes everything clearer.
+      nnMatrix output_vector = nnMatrixBorrowRows(&query->layer_outputs[l], i, 1);
+      nnMatrixMul(&input_vector, layer_weights, &output_vector);
+      nnMatrixAddRow(&output_vector, layer_biases, &output_vector);
+      switch (net->activations[l]) {
+        case nnIdentity:
+          break;  // Nothing to do for the identity function.
+        case nnSigmoid:
+          sigmoid_array(output_vector.values, output_vector.values, output_vector.cols);
+          break;
+        case nnRelu:
+          relu_array(output_vector.values, output_vector.values, output_vector.cols);
+          break;
+        default:
+          assert(0);
+      }
+      input_vector = output_vector;  // Borrow.
+    }
+  }
+}
+void nnQueryArray(const nnNeuralNetwork* net, nnQueryObject* query, const R* input, R* output) {
+  assert(net);
+  assert(query);
+  assert(input);
+  assert(output);
+  assert(net->num_layers > 0);
+  nnMatrix input_vector = nnMatrixMake(net->weights[0].cols, 1);
+  nnMatrixInit(&input_vector, input);
+  nnQuery(net, query, &input_vector);
+  nnMatrixRowToArray(query->network_outputs, 0, output);
+}
+nnQueryObject* nnMakeQueryObject(const nnNeuralNetwork* net, int num_inputs) {
+  assert(net);
+  assert(num_inputs > 0);
+  assert(net->num_layers > 0);
+  nnQueryObject* query = calloc(1, sizeof(nnQueryObject));
+  if (!query) {
+    return 0;
+  }
+  query->num_layers = net->num_layers;
+  // Allocate the intermediate layer output matrices.
+  query->layer_outputs = calloc(net->num_layers, sizeof(nnMatrix));
+  if (!query->layer_outputs) {
+    free(query);
+    return 0;
+  }
+  for (int l = 0; l < net->num_layers; ++l) {
+      const nnMatrix* layer_weights = &net->weights[l];
+      const int layer_output_size = nnLayerOutputSize(layer_weights);
+      query->layer_outputs[l] = nnMatrixMake(num_inputs, layer_output_size);
+  }
+  query->network_outputs = &query->layer_outputs[net->num_layers - 1];
+  return query;
+}
+void nnDeleteQueryObject(nnQueryObject** query) {
+  if ( (!query) || (!(*query)) ) {
+    return;
+  }
+  if ((*query)->layer_outputs != 0) {
+    for (int l = 0; l < (*query)->num_layers; ++l) {
+      nnMatrixDel(&(*query)->layer_outputs[l]);
+    }
+  }
+  free((*query)->layer_outputs);
+  free(*query);
+  *query = 0;
+}
+const nnMatrix* nnNetOutputs(const nnQueryObject* query) {
+  assert(query);
+  return query->network_outputs;
+}
+int nnNetInputSize(const nnNeuralNetwork* net) {
+  assert(net);
+  assert(net->num_layers > 0);
+  return net->weights[0].rows;
+}
+int nnNetOutputSize(const nnNeuralNetwork* net) {
+  assert(net);
+  assert(net->num_layers > 0);
+  return net->weights[net->num_layers - 1].cols;
+}
+int nnLayerInputSize(const nnMatrix* weights) {
+  assert(weights);
+  return weights->rows;
+}
+int nnLayerOutputSize(const nnMatrix* weights) {
+  assert(weights);
+  return weights->cols;
+}
diff --git a/src/lib/src/neuralnet_impl.h b/src/lib/src/neuralnet_impl.h
new file mode 100644
index 0000000..26107b5
--- /dev/null
+++ b/src/lib/src/neuralnet_impl.h
@@ -0,0 +1,36 @@
+#pragma once
+#include <neuralnet/matrix.h>
+/// Neural network object.
+///
+/// We store the transposes of the weight matrices so that we can do forward
+/// passes with a minimal amount of work. That is, if in paper we write:
+///
+///   [w11 w21]
+///   [w12 w22]
+///
+/// then the weight matrix in memory is stored as the following array:
+///
+///   w11 w12 w21 w22
+typedef struct nnNeuralNetwork {
+  int num_layers;             // Number of non-input layers (hidden + output).
+  nnMatrix* weights;          // One matrix per non-input layer.
+  nnMatrix* biases;           // One vector per non-input layer.
+  nnActivation* activations;  // One per non-input layer.
+} nnNeuralNetwork;
+/// A query object that holds all the memory necessary to query a network.
+///
+/// |layer_outputs| is an array of matrices of intermediate layer outputs. There
+/// is one matrix per intermediate layer. Each matrix holds the layer's output,
+/// with one row per input, and as many columns as the layer's output size (the
+/// output vector is transposed.)
+///
+/// |network_outputs| points to the last output matrix in |layer_outputs| for
+/// convenience.
+typedef struct nnQueryObject {
+  int       num_layers;
+  nnMatrix* layer_outputs;    // Output matrices, one output per layer.
+  nnMatrix* network_outputs;  // Points to the last output matrix.
+} nnTrainingQueryObject;
diff --git a/src/lib/src/train.c b/src/lib/src/train.c
new file mode 100644
index 0000000..027de66
--- /dev/null
+++ b/src/lib/src/train.c
@@ -0,0 +1,346 @@
+#include <neuralnet/train.h>
+#include <neuralnet/matrix.h>
+#include "neuralnet_impl.h"
+#include <random/mt19937-64.h>
+#include <random/normal.h>
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#define LOGD printf
+// If debug mode is requested, we will show progress every this many iterations.
+static const int PROGRESS_THRESHOLD = 5;  // %
+/// Computes the total MSE from the output error matrix.
+R ComputeMSE(const nnMatrix* errors) {
+  R sum_sq = 0;
+  const int N = errors->rows * errors->cols;
+  const R* value = errors->values;
+  for (int i = 0; i < N; ++i) {
+    sum_sq += *value * *value;
+    value++;
+  }
+  return sum_sq / (R)N;
+}
+/// Holds the bits required to compute a sigmoid gradient.
+typedef struct nnSigmoidGradientElements {
+  nnMatrix ones;      // A vector of just ones, same size as the layer.
+} nnSigmoidGradientElements;
+/// Holds the various elements required to compute gradients. These depend on
+/// what activation function are used, so they'll potentially be different for
+/// each layer. A data type is defined for these because we allocate all the
+/// required memory up front before entering the training loop.
+typedef struct nnGradientElements {
+  nnActivation type;
+  // Gradient vector, same size as the layer.
+  // This will contain the gradient expression except for the output value of
+  // the previous layer.
+  nnMatrix gradient;
+  union {
+    nnSigmoidGradientElements sigmoid;
+  };
+} nnGradientElements;
+// Initialize the network's weights randomly and set their biases to 0.
+void nnInitNet(nnNeuralNetwork* net, uint64_t seed, const nnWeightInitStrategy strategy) {
+  assert(net);
+  mt19937_64 rng = mt19937_64_make();
+  mt19937_64_init(&rng, seed);
+  for (int l = 0; l < net->num_layers; ++l) {
+    nnMatrix* weights = &net->weights[l];
+    nnMatrix* biases  = &net->biases[l];
+    const R layer_size = (R)nnLayerInputSize(weights);
+    const R scale = 1. / layer_size;
+    const R stdev = 1. / sqrt((R)layer_size);
+    const R sigma = stdev * stdev;
+    R* value = weights->values;
+    for (int k = 0; k < weights->rows * weights->cols; ++k) {
+      switch (strategy) {
+        case nnWeightInit01: {
+          const R x01 = mt19937_64_gen_real3(&rng);  // (0, +1) interval.
+          *value++ = scale * x01;
+          break;
+        }
+        case nnWeightInit11: {
+          const R x11 = mt19937_64_gen_real4(&rng);  // (-1, +1) interval.
+          *value++ = scale * x11;
+          break;
+        }
+        case nnWeightInitNormal:
+          // Using initialization with a normal distribution of standard
+          // deviation 1 / sqrt(num_layer_weights) to prevent saturation when
+          // multiplying inputs.
+          const R u01 = mt19937_64_gen_real3(&rng);  // (0, +1) interval.
+          const R v01 = mt19937_64_gen_real3(&rng);  // (0, +1) interval.
+          R z0, z1;
+          normal2(u01, v01, &z0, &z1);
+          z0 = normal_transform(z0, /*mu=*/0, sigma);
+          z1 = normal_transform(z1, /*mu=*/0, sigma);
+          *value++ = z0;
+          if (k < weights->rows * weights->cols - 1) {
+            *value++ = z1;
+            ++k;
+          }
+          break;
+        default:
+          assert(false);
+      }
+    }
+    // Initialize biases.
+    // 0 is used so that functions originally go through the origin.
+    value = biases->values;
+    for (int k = 0; k < biases->rows * biases->cols; ++k, ++value) {
+      *value = 0;
+    }
+  }
+}
+// |inputs|  has one row vector per sample.
+// |targets| has one row vector per sample.
+//
+// For now, each iteration trains with one sample (row) at a time.
+void nnTrain(
+    nnNeuralNetwork* net,
+    const nnMatrix* inputs,
+    const nnMatrix* targets,
+    const nnTrainingParams* params) {
+  assert(net);
+  assert(inputs);
+  assert(targets);
+  assert(params);
+  assert(nnNetOutputSize(net) == targets->cols);
+  assert(net->num_layers > 0);
+  // Allocate error vectors to hold the backpropagated error values.
+  // For now, these are one row vector per layer, meaning that we will train
+  // with one sample at a time.
+  nnMatrix* errors = calloc(net->num_layers, sizeof(nnMatrix));
+  // Allocate the weight transpose matrices up front for backpropagation.
+  nnMatrix* weights_T = calloc(net->num_layers, sizeof(nnMatrix));
+  // Allocate the weight delta matrices.
+  nnMatrix* weight_deltas = calloc(net->num_layers, sizeof(nnMatrix));
+  // Allocate the data structures required to compute gradients.
+  // This depends on each layer's activation type.
+  nnGradientElements* gradient_elems = calloc(net->num_layers, sizeof(nnGradientElements));
+  // Allocate the output transpose vectors for weight delta calculation.
+  // This is one column vector per layer.
+  nnMatrix* outputs_T = calloc(net->num_layers, sizeof(nnMatrix));
+  assert(errors != 0);
+  assert(weights_T != 0);
+  assert(weight_deltas != 0);
+  assert(gradient_elems);
+  assert(outputs_T);
+  for (int l = 0; l < net->num_layers; ++l) {
+    const nnMatrix* layer_weights = &net->weights[l];
+    const int layer_output_size = net->weights[l].cols;
+    const nnActivation activation = net->activations[l];
+    errors[l] = nnMatrixMake(1, layer_weights->cols);
+    weights_T[l] = nnMatrixMake(layer_weights->cols, layer_weights->rows);
+    nnMatrixTranspose(layer_weights, &weights_T[l]);
+    weight_deltas[l] = nnMatrixMake(layer_weights->rows, layer_weights->cols);
+    outputs_T[l] = nnMatrixMake(layer_output_size, 1);
+    // Allocate the gradient elements and vectors for weight delta calculation.
+    nnGradientElements* elems = &gradient_elems[l];
+    elems->type = activation;
+    switch (activation) {
+      case nnIdentity:
+        break;  // Gradient vector will be borrowed, no need to allocate.
+      case nnSigmoid:
+        elems->gradient = nnMatrixMake(1, layer_output_size);
+        // Allocate the 1s vectors.
+        elems->sigmoid.ones = nnMatrixMake(1, layer_output_size);
+        nnMatrixInitConstant(&elems->sigmoid.ones, 1);
+        break;
+      case nnRelu:
+        elems->gradient = nnMatrixMake(1, layer_output_size);
+        break;
+    }
+  }
+  // Construct the query object with a size of 1 since we are training with one
+  // sample at a time.
+  nnQueryObject* query = nnMakeQueryObject(net, 1);
+  // Network outputs are given by the query object. Every network query updates
+  // the outputs.
+  const nnMatrix* const training_outputs = query->network_outputs;
+  // A vector to store the training input transposed.
+  nnMatrix training_inputs_T = nnMatrixMake(inputs->cols, 1);
+  // If debug mode is requested, we will show progress every Nth iteration.
+  const int progress_frame =
+    (params->max_iterations < PROGRESS_THRESHOLD)
+    ? 1
+    : (params->max_iterations * PROGRESS_THRESHOLD / 100);
+  // --- TRAIN
+  nnInitNet(net, params->seed, params->weight_init);
+  for (int iter = 0; iter < params->max_iterations; ++iter) {
+    // For now, we train with one sample at a time.
+    for (int sample = 0; sample < inputs->rows; ++sample) {
+      // Slice the input and target matrices with the batch size.
+      // We are not mutating the inputs, but we need the cast to borrow.
+      nnMatrix training_inputs  = nnMatrixBorrowRows((nnMatrix*)inputs,  sample, 1);
+      nnMatrix training_targets = nnMatrixBorrowRows((nnMatrix*)targets, sample, 1);
+      // Will need the input transposed for backpropagation.
+      // Assuming one training input per iteration for now.
+      nnMatrixTranspose(&training_inputs, &training_inputs_T);
+      // Run a forward pass and compute the output layer error.
+      // We don't square the error here; instead, we just compute t-o, which is
+      // part of the derivative, -2(t-o). Also, we compute o-t instead to
+      // remove that outer negative sign.
+      nnQuery(net, query, &training_inputs);
+      //nnMatrixSub(&training_targets, training_outputs, &errors[net->num_layers - 1]);
+      nnMatrixSub(training_outputs, &training_targets, &errors[net->num_layers - 1]);
+      // Update outputs_T, which we need during weight updates.
+      for (int l = 0; l < net->num_layers; ++l) {
+        nnMatrixTranspose(&query->layer_outputs[l], &outputs_T[l]);
+      }
+      // Update weights and biases for each internal layer, backpropagating
+      // errors along the way.
+      for (int l = net->num_layers - 1; l >= 0; --l) {
+        const nnMatrix* layer_output = &query->layer_outputs[l];
+        nnMatrix* layer_weights = &net->weights[l];
+        nnMatrix* layer_biases  = &net->biases[l];
+        nnGradientElements* elems = &gradient_elems[l];
+        nnMatrix* gradient = &elems->gradient;
+        const nnActivation activation = net->activations[l];
+        // Compute the gradient (the part of the expression that does not
+        // contain the output of the previous layer).
+        //
+        // Identity: G = error_k
+        // Sigmoid:  G = error_k * output_k * (1 - output_k).
+        // Relu:     G = error_k * (output_k > 0 ? 1 : 0)
+        switch (activation) {
+          case nnIdentity:
+            // TODO: Just copy the pointer?
+            *gradient = nnMatrixBorrow(&errors[l]);
+            break;
+          case nnSigmoid:
+            nnMatrixSub(&elems->sigmoid.ones, layer_output, gradient);
+            nnMatrixMulPairs(layer_output, gradient, gradient);
+            nnMatrixMulPairs(&errors[l], gradient, gradient);
+            break;
+          case nnRelu:
+            nnMatrixGt(layer_output, 0, gradient);
+            nnMatrixMulPairs(&errors[l], gradient, gradient);
+            break;
+        }
+        // Outer product to compute the weight deltas.
+        const nnMatrix* output_T = (l == 0) ? &training_inputs_T : &outputs_T[l-1];
+        nnMatrixMul(output_T, gradient, &weight_deltas[l]);
+        // Backpropagate the error before updating weights.
+        if (l > 0) {
+          nnMatrixMul(gradient, &weights_T[l], &errors[l-1]);
+        }
+        // Update weights.
+        nnMatrixScale(&weight_deltas[l], params->learning_rate);
+        // The gradient has a negative sign from -(t - o), but we have computed
+        // e = o - t instead, so we can subtract directly.
+        //nnMatrixAdd(layer_weights, &weight_deltas[l], layer_weights);
+        nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);
+        // Update weight transpose matrix for the next training iteration.
+        nnMatrixTranspose(layer_weights, &weights_T[l]);
+        // Update biases.
+        // This is the same formula as for weights, except that the o_j term is
+        // just 1. We can simply re-use the gradient that we have already
+        // computed for the weight update.
+        //nnMatrixMulAdd(layer_biases, gradient, params->learning_rate, layer_biases);
+        nnMatrixMulSub(layer_biases, gradient, params->learning_rate, layer_biases);
+      }
+      // TODO: Add this under a verbose debugging mode.
+      // if (params->debug) {
+      //   LOGD("Iter: %d, Sample: %d, Error: %f\n", iter, sample, ComputeMSE(&errors[net->num_layers - 1]));
+      //   LOGD("TGT: ");
+      //   for (int i = 0; i < training_targets.cols; ++i) {
+      //     printf("%.3f  ", training_targets.values[i]);
+      //   }
+      //   printf("\n");
+      //   LOGD("OUT: ");
+      //   for (int i = 0; i < training_outputs->cols; ++i) {
+      //     printf("%.3f  ", training_outputs->values[i]);
+      //   }
+      //   printf("\n");
+      // }
+    }
+    if (params->debug && ((iter % progress_frame) == 0)) {
+      LOGD("Iter: %d/%d, Error: %f\n",
+        iter, params->max_iterations, ComputeMSE(&errors[net->num_layers - 1]));
+    }
+  }
+  // Print the final error.
+  if (params->debug) {
+    LOGD("Iter: %d/%d, Error: %f\n",
+      params->max_iterations, params->max_iterations, ComputeMSE(&errors[net->num_layers - 1]));
+  }
+  for (int l = 0; l < net->num_layers; ++l) {
+    nnMatrixDel(&errors[l]);
+    nnMatrixDel(&outputs_T[l]);
+    nnMatrixDel(&weights_T[l]);
+    nnMatrixDel(&weight_deltas[l]);
+    nnGradientElements* elems = &gradient_elems[l];
+    switch (elems->type) {
+      case nnIdentity:
+        break;  // Gradient vector is borrowed, no need to deallocate.
+      case nnSigmoid:
+        nnMatrixDel(&elems->gradient);
+        nnMatrixDel(&elems->sigmoid.ones);
+        break;
+      case nnRelu:
+        nnMatrixDel(&elems->gradient);
+        break;
+    }
+  }
+  nnMatrixDel(&training_inputs_T);
+  free(errors);
+  free(outputs_T);
+  free(weights_T);
+  free(weight_deltas);
+  free(gradient_elems);
+}
diff --git a/src/lib/test/matrix_test.c b/src/lib/test/matrix_test.c
new file mode 100644
index 0000000..8191c97
--- /dev/null
+++ b/src/lib/test/matrix_test.c
@@ -0,0 +1,350 @@
+#include <neuralnet/matrix.h>
+#include "test.h"
+#include "test_util.h"
+#include <assert.h>
+#include <stdlib.h>
+// static void PrintMatrix(const nnMatrix* matrix) {
+//   assert(matrix);
+//   for (int i = 0; i < matrix->rows; ++i) {
+//     for (int j = 0; j < matrix->cols; ++j) {
+//       printf("%f ", nnMatrixAt(matrix, i, j));
+//     }
+//     printf("\n");
+//   }
+// }
+TEST_CASE(nnMatrixMake_1x1) {
+  nnMatrix A = nnMatrixMake(1, 1);
+  TEST_EQUAL(A.rows, 1);
+  TEST_EQUAL(A.cols, 1);
+}
+TEST_CASE(nnMatrixMake_3x1) {
+  nnMatrix A = nnMatrixMake(3, 1);
+  TEST_EQUAL(A.rows, 3);
+  TEST_EQUAL(A.cols, 1);
+}
+TEST_CASE(nnMatrixInit_3x1) {
+  nnMatrix A = nnMatrixMake(3, 1);
+  nnMatrixInit(&A, (R[]) { 1, 2, 3 });
+  TEST_EQUAL(A.values[0], 1);
+  TEST_EQUAL(A.values[1], 2);
+  TEST_EQUAL(A.values[2], 3);
+}
+TEST_CASE(nnMatrixCopyCol_test) {
+  nnMatrix A = nnMatrixMake(3, 2);
+  nnMatrix B = nnMatrixMake(3, 1);
+  nnMatrixInit(&A, (R[]) {
+    1, 2,
+    3, 4,
+    5, 6,
+  });
+  nnMatrixCopyCol(&A, &B, 1, 0);
+  TEST_EQUAL(nnMatrixAt(&B, 0, 0), 2);
+  TEST_EQUAL(nnMatrixAt(&B, 1, 0), 4);
+  TEST_EQUAL(nnMatrixAt(&B, 2, 0), 6);
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+}
+TEST_CASE(nnMatrixMul_square_3x3) {
+  nnMatrix A = nnMatrixMake(3, 3);
+  nnMatrix B = nnMatrixMake(3, 3);
+  nnMatrix O = nnMatrixMake(3, 3);
+  nnMatrixInit(&A, (const R[]){
+    1, 2, 3,
+    4, 5, 6,
+    7, 8, 9,
+  });
+  nnMatrixInit(&B, (const R[]){
+    2, 4, 3,
+    6, 8, 5,
+    1, 7, 9,
+  });
+  nnMatrixMul(&A, &B, &O);
+  const R expected[3][3] = {
+    { 17, 41, 40 },
+    { 44, 98, 91 },
+    { 71, 155, 142 },
+  };
+  for (int i = 0; i < O.rows; ++i) {
+    for (int j = 0; j < O.cols; ++j) {
+      TEST_TRUE(double_eq(nnMatrixAt(&O, i, j), expected[i][j], EPS));
+    }
+  }
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+  nnMatrixDel(&O);
+}
+TEST_CASE(nnMatrixMul_non_square_2x3_3x1) {
+  nnMatrix A = nnMatrixMake(2, 3);
+  nnMatrix B = nnMatrixMake(3, 1);
+  nnMatrix O = nnMatrixMake(2, 1);
+  nnMatrixInit(&A, (const R[]){
+    1, 2, 3,
+    4, 5, 6,
+  });
+  nnMatrixInit(&B, (const R[]){
+    2,
+    6,
+    1,
+  });
+  nnMatrixMul(&A, &B, &O);
+  const R expected[2][1] = {
+    { 17 },
+    { 44 },
+  };
+  for (int i = 0; i < O.rows; ++i) {
+    for (int j = 0; j < O.cols; ++j) {
+      TEST_TRUE(double_eq(nnMatrixAt(&O, i, j), expected[i][j], EPS));
+    }
+  }
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+  nnMatrixDel(&O);
+}
+TEST_CASE(nnMatrixMulAdd_test) {
+  nnMatrix A = nnMatrixMake(2, 3);
+  nnMatrix B = nnMatrixMake(2, 3);
+  nnMatrix O = nnMatrixMake(2, 3);
+  const R scale = 2;
+  nnMatrixInit(&A, (const R[]){
+    1, 2, 3,
+    4, 5, 6,
+  });
+  nnMatrixInit(&B, (const R[]){
+    2, 3, 1,
+    7, 4, 3
+  });
+  nnMatrixMulAdd(&A, &B, scale, &O);  // O = A + B * scale
+  const R expected[2][3] = {
+    { 5, 8, 5 },
+    { 18, 13, 12 },
+  };
+  for (int i = 0; i < O.rows; ++i) {
+    for (int j = 0; j < O.cols; ++j) {
+      TEST_TRUE(double_eq(nnMatrixAt(&O, i, j), expected[i][j], EPS));
+    }
+  }
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+  nnMatrixDel(&O);
+}
+TEST_CASE(nnMatrixMulSub_test) {
+  nnMatrix A = nnMatrixMake(2, 3);
+  nnMatrix B = nnMatrixMake(2, 3);
+  nnMatrix O = nnMatrixMake(2, 3);
+  const R scale = 2;
+  nnMatrixInit(&A, (const R[]){
+    1, 2, 3,
+    4, 5, 6,
+  });
+  nnMatrixInit(&B, (const R[]){
+    2, 3, 1,
+    7, 4, 3
+  });
+  nnMatrixMulSub(&A, &B, scale, &O);  // O = A - B * scale
+  const R expected[2][3] = {
+    { -3, -4, 1 },
+    { -10, -3, 0 },
+  };
+  for (int i = 0; i < O.rows; ++i) {
+    for (int j = 0; j < O.cols; ++j) {
+      TEST_TRUE(double_eq(nnMatrixAt(&O, i, j), expected[i][j], EPS));
+    }
+  }
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+  nnMatrixDel(&O);
+}
+TEST_CASE(nnMatrixMulPairs_2x3) {
+  nnMatrix A = nnMatrixMake(2, 3);
+  nnMatrix B = nnMatrixMake(2, 3);
+  nnMatrix O = nnMatrixMake(2, 3);
+  nnMatrixInit(&A, (const R[]){
+    1, 2, 3,
+    4, 5, 6,
+  });
+  nnMatrixInit(&B, (const R[]){
+    2, 3, 1,
+    7, 4, 3
+  });
+  nnMatrixMulPairs(&A, &B, &O);
+  const R expected[2][3] = {
+    { 2, 6, 3 },
+    { 28, 20, 18 },
+  };
+  for (int i = 0; i < O.rows; ++i) {
+    for (int j = 0; j < O.cols; ++j) {
+      TEST_TRUE(double_eq(nnMatrixAt(&O, i, j), expected[i][j], EPS));
+    }
+  }
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+  nnMatrixDel(&O);
+}
+TEST_CASE(nnMatrixAdd_square_2x2) {
+  nnMatrix A = nnMatrixMake(2, 2);
+  nnMatrix B = nnMatrixMake(2, 2);
+  nnMatrix C = nnMatrixMake(2, 2);
+  nnMatrixInit(&A, (R[]) {
+    1, 2,
+    3, 4,
+  });
+  nnMatrixInit(&B, (R[]) {
+    2, 1,
+    5, 3,
+  });
+  nnMatrixAdd(&A, &B, &C);
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 0, 0), 3, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 0, 1), 3, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 1, 0), 8, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 1, 1), 7, EPS));
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+  nnMatrixDel(&C);
+}
+TEST_CASE(nnMatrixSub_square_2x2) {
+  nnMatrix A = nnMatrixMake(2, 2);
+  nnMatrix B = nnMatrixMake(2, 2);
+  nnMatrix C = nnMatrixMake(2, 2);
+  nnMatrixInit(&A, (R[]) {
+    1, 2,
+    3, 4,
+  });
+  nnMatrixInit(&B, (R[]) {
+    2, 1,
+    5, 3,
+  });
+  nnMatrixSub(&A, &B, &C);
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 0, 0), -1, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 0, 1), +1, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 1, 0), -2, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 1, 1), +1, EPS));
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+  nnMatrixDel(&C);
+}
+TEST_CASE(nnMatrixAddRow_test) {
+  nnMatrix A = nnMatrixMake(2, 3);
+  nnMatrix B = nnMatrixMake(1, 3);
+  nnMatrix C = nnMatrixMake(2, 3);
+  nnMatrixInit(&A, (R[]) {
+    1, 2, 3,
+    4, 5, 6,
+  });
+  nnMatrixInit(&B, (R[]) {
+    2, 1, 3,
+  });
+  nnMatrixAddRow(&A, &B, &C);
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 0, 0), 3, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 0, 1), 3, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 0, 2), 6, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 1, 0), 6, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 1, 1), 6, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&C, 1, 2), 9, EPS));
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+  nnMatrixDel(&C);
+}
+TEST_CASE(nnMatrixTranspose_square_2x2) {
+  nnMatrix A = nnMatrixMake(2, 2);
+  nnMatrix B = nnMatrixMake(2, 2);
+  nnMatrixInit(&A, (R[]) {
+    1, 2,
+    3, 4
+  });
+  nnMatrixTranspose(&A, &B);
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 0, 0), 1, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 0, 1), 3, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 1, 0), 2, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 1, 1), 4, EPS));
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+}
+TEST_CASE(nnMatrixTranspose_non_square_2x1) {
+  nnMatrix A = nnMatrixMake(2, 1);
+  nnMatrix B = nnMatrixMake(1, 2);
+  nnMatrixInit(&A, (R[]) {
+    1,
+    3,
+  });
+  nnMatrixTranspose(&A, &B);
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 0, 0), 1, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 0, 1), 3, EPS));
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+}
+TEST_CASE(nnMatrixGt_test) {
+  nnMatrix A = nnMatrixMake(2, 3);
+  nnMatrix B = nnMatrixMake(2, 3);
+  nnMatrixInit(&A, (R[]) {
+    -3, 2, 0,
+    4, -1, 5
+  });
+  nnMatrixGt(&A, 0, &B);
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 0, 0), 0, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 0, 1), 1, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 0, 2), 0, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 1, 0), 1, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 1, 1), 0, EPS));
+  TEST_TRUE(double_eq(nnMatrixAt(&B, 1, 2), 1, EPS));
+  nnMatrixDel(&A);
+  nnMatrixDel(&B);
+}
diff --git a/src/lib/test/neuralnet_test.c b/src/lib/test/neuralnet_test.c
new file mode 100644
index 0000000..14d9438
--- /dev/null
+++ b/src/lib/test/neuralnet_test.c
@@ -0,0 +1,92 @@
+#include <neuralnet/neuralnet.h>
+#include <neuralnet/matrix.h>
+#include "activation.h"
+#include "neuralnet_impl.h"
+#include "test.h"
+#include "test_util.h"
+#include <assert.h>
+TEST_CASE(neuralnet_perceptron_test) {
+  const int num_layers = 1;
+  const int layer_sizes[] = { 1, 1 };
+  const nnActivation layer_activations[] = { nnSigmoid };
+  const R weights[] = { 0.3 };
+  nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations);
+  assert(net);
+  nnSetWeights(net, weights);
+  nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/1);
+  const R input[] = { 0.9 };
+  R output[1];
+  nnQueryArray(net, query, input, output);
+  const R expected_output = sigmoid(input[0] * weights[0]);
+  printf("\nOutput: %f, Expected: %f\n", output[0], expected_output);
+  TEST_TRUE(double_eq(output[0], expected_output, EPS));
+  nnDeleteQueryObject(&query);
+  nnDeleteNet(&net);
+}
+TEST_CASE(neuralnet_xor_test) {
+  const int num_layers = 2;
+  const int layer_sizes[] = { 2, 2, 1 };
+  const nnActivation layer_activations[] = { nnRelu, nnIdentity };
+  const R weights[] = {
+    1, 1, 1, 1,  // First (hidden) layer.
+    1, -2        // Second (output) layer.
+  };
+  const R biases[] = {
+    0, -1,  // First (hidden) layer.
+    0       // Second (output) layer.
+  };
+  nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations);
+  assert(net);
+  nnSetWeights(net, weights);
+  nnSetBiases(net, biases);
+  // First layer weights.
+  TEST_EQUAL(nnMatrixAt(&net->weights[0], 0, 0), 1);
+  TEST_EQUAL(nnMatrixAt(&net->weights[0], 0, 1), 1);
+  TEST_EQUAL(nnMatrixAt(&net->weights[0], 0, 2), 1);
+  TEST_EQUAL(nnMatrixAt(&net->weights[0], 0, 3), 1);
+  // Second layer weights.
+  TEST_EQUAL(nnMatrixAt(&net->weights[1], 0, 0), 1);
+  TEST_EQUAL(nnMatrixAt(&net->weights[1], 0, 1), -2);
+  // First layer biases.
+  TEST_EQUAL(nnMatrixAt(&net->biases[0], 0, 0), 0);
+  TEST_EQUAL(nnMatrixAt(&net->biases[0], 0, 1), -1);
+  // Second layer biases.
+  TEST_EQUAL(nnMatrixAt(&net->biases[1], 0, 0), 0);
+  // Test.
+  #define M 4
+  nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/M);
+  const R test_inputs[M][2] = { { 0., 0. }, { 1., 0. }, { 0., 1. }, { 1., 1. } };
+  nnMatrix test_inputs_matrix = nnMatrixMake(M, 2);
+  nnMatrixInit(&test_inputs_matrix, (const R*)test_inputs);
+  nnQuery(net, query, &test_inputs_matrix);
+  const R expected_outputs[M] = { 0., 1., 1., 0. };
+  for (int i = 0; i < M; ++i) {
+    const R test_output = nnMatrixAt(nnNetOutputs(query), i, 0);
+    printf("\nInput: (%f, %f), Output: %f, Expected: %f\n",
+      test_inputs[i][0], test_inputs[i][1], test_output, expected_outputs[i]);
+  }
+  for (int i = 0; i < M; ++i) {
+    const R test_output = nnMatrixAt(nnNetOutputs(query), i, 0);
+    TEST_TRUE(double_eq(test_output, expected_outputs[i], OUTPUT_EPS));
+  }
+  nnDeleteQueryObject(&query);
+  nnDeleteNet(&net);
+}
diff --git a/src/lib/test/test.h b/src/lib/test/test.h
new file mode 100644
index 0000000..fd8dc22
--- /dev/null
+++ b/src/lib/test/test.h
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: MIT
+#pragma once
+#ifdef UNIT_TEST
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if defined(__DragonFly__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) ||     \
+    defined(__NetBSD__) || defined(__OpenBSD__)
+#define USE_SYSCTL_FOR_ARGS 1
+// clang-format off
+#include <sys/types.h>
+#include <sys/sysctl.h>
+// clang-format on
+#include <unistd.h>        // getpid
+#endif
+struct test_file_metadata;
+struct test_failure {
+        bool present;
+        const char *message;
+        const char *file;
+        int line;
+};
+struct test_case_metadata {
+        void (*fn)(struct test_case_metadata *, struct test_file_metadata *);
+        struct test_failure failure;
+        const char *name;
+        struct test_case_metadata *next;
+};
+struct test_file_metadata {
+        bool registered;
+        const char *name;
+        struct test_file_metadata *next;
+        struct test_case_metadata *tests;
+};
+struct test_file_metadata __attribute__((weak)) * test_file_head;
+#define SET_FAILURE(_message)                                                             \
+        metadata->failure = (struct test_failure) {                                       \
+                .message = _message, .file = __FILE__, .line = __LINE__, .present = true, \
+        }
+#define TEST_EQUAL(a, b)                                                                 \
+        do {                                                                             \
+                if ((a) != (b)) {                                                        \
+                        SET_FAILURE(#a " != " #b);                                       \
+                        return;                                                          \
+                }                                                                        \
+        } while (0)
+#define TEST_TRUE(a)                                                                     \
+        do {                                                                             \
+                if (!(a)) {                                                              \
+                        SET_FAILURE(#a " is not true");                                  \
+                        return;                                                          \
+                }                                                                        \
+        } while (0)
+#define TEST_STREQUAL(a, b)                                                              \
+        do {                                                                             \
+                if (strcmp(a, b) != 0) {                                                 \
+                        SET_FAILURE(#a " != " #b);                                       \
+                        return;                                                          \
+                }                                                                        \
+        } while (0)
+#define TEST_CASE(_name)                                                                  \
+        static void __test_h_##_name(struct test_case_metadata *,                         \
+                                     struct test_file_metadata *);                        \
+        static struct test_file_metadata __test_h_file;                                   \
+        static struct test_case_metadata __test_h_meta_##_name = {                        \
+            .name = #_name,                                                               \
+            .fn = __test_h_##_name,                                                       \
+        };                                                                                \
+        static void __attribute__((constructor(101))) __test_h_##_name##_register(void) { \
+                __test_h_meta_##_name.next = __test_h_file.tests;                         \
+                __test_h_file.tests = &__test_h_meta_##_name;                             \
+                if (!__test_h_file.registered) {                                          \
+                        __test_h_file.name = __FILE__;                                    \
+                        __test_h_file.next = test_file_head;                              \
+                        test_file_head = &__test_h_file;                                  \
+                        __test_h_file.registered = true;                                  \
+                }                                                                         \
+        }                                                                                 \
+        static void __test_h_##_name(                                                     \
+            struct test_case_metadata *metadata __attribute__((unused)),                  \
+            struct test_file_metadata *file_metadata __attribute__((unused)))
+extern void __attribute__((weak)) (*test_h_unittest_setup)(void);
+/// Run defined tests, return true if all tests succeeds
+/// @param[out] tests_run if not NULL, set to whether tests were run
+static inline void __attribute__((constructor(102))) run_tests(void) {
+        bool should_run = false;
+#ifdef USE_SYSCTL_FOR_ARGS
+        int mib[] = {
+                CTL_KERN,
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+                KERN_PROC_ARGS,
+                getpid(),
+                KERN_PROC_ARGV,
+#else
+                KERN_PROC,
+                KERN_PROC_ARGS,
+                getpid(),
+#endif
+        };
+        char *arg = NULL;
+        size_t arglen;
+        sysctl(mib, sizeof(mib) / sizeof(mib[0]), NULL, &arglen, NULL, 0);
+        arg = malloc(arglen);
+        sysctl(mib, sizeof(mib) / sizeof(mib[0]), arg, &arglen, NULL, 0);
+#else
+        FILE *cmdlinef = fopen("/proc/self/cmdline", "r");
+        char *arg = NULL;
+        int arglen;
+        fscanf(cmdlinef, "%ms%n", &arg, &arglen);
+        fclose(cmdlinef);
+#endif
+        for (char *pos = arg; pos < arg + arglen; pos += strlen(pos) + 1) {
+                if (strcmp(pos, "--unittest") == 0) {
+                        should_run = true;
+                        break;
+                }
+        }
+        free(arg);
+        if (!should_run) {
+                return;
+        }
+        if (&test_h_unittest_setup) {
+                test_h_unittest_setup();
+        }
+        struct test_file_metadata *i = test_file_head;
+        int failed = 0, success = 0;
+        while (i) {
+                fprintf(stderr, "Running tests from %s:\n", i->name);
+                struct test_case_metadata *j = i->tests;
+                while (j) {
+                        fprintf(stderr, "\t%s ... ", j->name);
+                        j->failure.present = false;
+                        j->fn(j, i);
+                        if (j->failure.present) {
+                                fprintf(stderr, "failed (%s at %s:%d)\n", j->failure.message,
+                                        j->failure.file, j->failure.line);
+                                failed++;
+                        } else {
+                                fprintf(stderr, "passed\n");
+                                success++;
+                        }
+                        j = j->next;
+                }
+                fprintf(stderr, "\n");
+                i = i->next;
+        }
+        int total = failed + success;
+        fprintf(stderr, "Test results: passed %d/%d, failed %d/%d\n", success, total,
+                failed, total);
+        exit(failed == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+#else
+#include <stdbool.h>
+#define TEST_CASE(name) static void __attribute__((unused)) __test_h_##name(void)
+#define TEST_EQUAL(a, b)                                                                 \
+        (void)(a);                                                                       \
+        (void)(b)
+#define TEST_TRUE(a) (void)(a)
+#define TEST_STREQUAL(a, b)                                                              \
+        (void)(a);                                                                       \
+        (void)(b)
+#endif
diff --git a/src/lib/test/test_main.c b/src/lib/test/test_main.c
new file mode 100644
index 0000000..4cce7f6
--- /dev/null
+++ b/src/lib/test/test_main.c
@@ -0,0 +1,3 @@
+int main() {
+  return 0;
+}
diff --git a/src/lib/test/test_util.h b/src/lib/test/test_util.h
new file mode 100644
index 0000000..8abb99a
--- /dev/null
+++ b/src/lib/test/test_util.h
@@ -0,0 +1,22 @@
+#pragma once
+#include <neuralnet/types.h>
+#include <math.h>
+// General epsilon for comparing values.
+static const R EPS = 1e-10;
+// Epsilon for comparing network weights after training.
+static const R WEIGHT_EPS = 0.01;
+// Epsilon for comparing network outputs after training.
+static const R OUTPUT_EPS = 0.01;
+static inline bool double_eq(double a, double b, double eps) {
+  return fabs(a - b) <= eps;
+}
+static inline R lerp(R a, R b, R t) {
+  return a + t*(b-a);
+}
diff --git a/src/lib/test/train_linear_perceptron_non_origin_test.c b/src/lib/test/train_linear_perceptron_non_origin_test.c
new file mode 100644
index 0000000..5a320ac
--- /dev/null
+++ b/src/lib/test/train_linear_perceptron_non_origin_test.c
@@ -0,0 +1,67 @@
+#include <neuralnet/train.h>
+#include <neuralnet/matrix.h>
+#include <neuralnet/neuralnet.h>
+#include "activation.h"
+#include "neuralnet_impl.h"
+#include "test.h"
+#include "test_util.h"
+#include <assert.h>
+TEST_CASE(neuralnet_train_linear_perceptron_non_origin_test) {
+  const int num_layers = 1;
+  const int layer_sizes[] = { 1, 1 };
+  const nnActivation layer_activations[] = { nnIdentity };
+  nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations);
+  assert(net);
+  // Train.
+  // Try to learn the Y = 2X + 1 line.
+  #define N 2
+  const R inputs[N]  = { 0., 1. };
+  const R targets[N] = { 1., 3. };
+  nnMatrix inputs_matrix  = nnMatrixMake(N, 1);
+  nnMatrix targets_matrix = nnMatrixMake(N, 1);
+  nnMatrixInit(&inputs_matrix, inputs);
+  nnMatrixInit(&targets_matrix, targets);
+  nnTrainingParams params = {
+    .learning_rate = 0.7,
+    .max_iterations = 20,
+    .seed = 0,
+    .weight_init = nnWeightInit01,
+    .debug = false,
+  };
+  nnTrain(net, &inputs_matrix, &targets_matrix, &params);
+  const R weight = nnMatrixAt(&net->weights[0], 0, 0);
+  const R expected_weight = 2.0;
+  printf("\nTrained network weight: %f, Expected: %f\n", weight, expected_weight);
+  TEST_TRUE(double_eq(weight, expected_weight, WEIGHT_EPS));
+  const R bias = nnMatrixAt(&net->biases[0], 0, 0);
+  const R expected_bias = 1.0;
+  printf("Trained network bias: %f, Expected: %f\n", bias, expected_bias);
+  TEST_TRUE(double_eq(bias, expected_bias, WEIGHT_EPS));
+  // Test.
+  nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/1);
+  const R test_input[] = { 2.3 };
+  R test_output[1];
+  nnQueryArray(net, query, test_input, test_output);
+  const R expected_output = test_input[0] * expected_weight + expected_bias;
+  printf("Output: %f, Expected: %f\n", test_output[0], expected_output);
+  TEST_TRUE(double_eq(test_output[0], expected_output, OUTPUT_EPS));
+  nnDeleteQueryObject(&query);
+  nnDeleteNet(&net);
+}
diff --git a/src/lib/test/train_linear_perceptron_test.c b/src/lib/test/train_linear_perceptron_test.c
new file mode 100644
index 0000000..2b1336d
--- /dev/null
+++ b/src/lib/test/train_linear_perceptron_test.c
@@ -0,0 +1,62 @@
+#include <neuralnet/train.h>
+#include <neuralnet/matrix.h>
+#include <neuralnet/neuralnet.h>
+#include "activation.h"
+#include "neuralnet_impl.h"
+#include "test.h"
+#include "test_util.h"
+#include <assert.h>
+TEST_CASE(neuralnet_train_linear_perceptron_test) {
+  const int num_layers = 1;
+  const int layer_sizes[] = { 1, 1 };
+  const nnActivation layer_activations[] = { nnIdentity };
+  nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations);
+  assert(net);
+  // Train.
+  // Try to learn the Y=X line.
+  #define N 2
+  const R inputs[N]  = { 0., 1. };
+  const R targets[N] = { 0., 1. };
+  nnMatrix inputs_matrix  = nnMatrixMake(N, 1);
+  nnMatrix targets_matrix = nnMatrixMake(N, 1);
+  nnMatrixInit(&inputs_matrix, inputs);
+  nnMatrixInit(&targets_matrix, targets);
+  nnTrainingParams params = {
+    .learning_rate = 0.7,
+    .max_iterations = 10,
+    .seed = 0,
+    .weight_init = nnWeightInit01,
+    .debug = false,
+  };
+  nnTrain(net, &inputs_matrix, &targets_matrix, &params);
+  const R weight = nnMatrixAt(&net->weights[0], 0, 0);
+  const R expected_weight = 1.0;
+  printf("\nTrained network weight: %f, Expected: %f\n", weight, expected_weight);
+  TEST_TRUE(double_eq(weight, expected_weight, WEIGHT_EPS));
+  // Test.
+  nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/1);
+  const R test_input[] = { 2.3 };
+  R test_output[1];
+  nnQueryArray(net, query, test_input, test_output);
+  const R expected_output = test_input[0];
+  printf("Output: %f, Expected: %f\n", test_output[0], expected_output);
+  TEST_TRUE(double_eq(test_output[0], expected_output, OUTPUT_EPS));
+  nnDeleteQueryObject(&query);
+  nnDeleteNet(&net);
+}
diff --git a/src/lib/test/train_sigmoid_test.c b/src/lib/test/train_sigmoid_test.c
new file mode 100644
index 0000000..588e7ca
--- /dev/null
+++ b/src/lib/test/train_sigmoid_test.c
@@ -0,0 +1,66 @@
+#include <neuralnet/train.h>
+#include <neuralnet/matrix.h>
+#include <neuralnet/neuralnet.h>
+#include "activation.h"
+#include "neuralnet_impl.h"
+#include "test.h"
+#include "test_util.h"
+#include <assert.h>
+TEST_CASE(neuralnet_train_sigmoid_test) {
+  const int num_layers = 1;
+  const int layer_sizes[] = { 1, 1 };
+  const nnActivation layer_activations[] = { nnSigmoid };
+  nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations);
+  assert(net);
+  // Train.
+  // Try to learn the sigmoid function.
+  #define N 3
+  R inputs[N];
+  R targets[N];
+  for (int i = 0; i < N; ++i) {
+    inputs[i] = lerp(-1, +1, (R)i / (R)(N-1));
+    targets[i] = sigmoid(inputs[i]);
+  }
+  nnMatrix inputs_matrix  = nnMatrixMake(N, 1);
+  nnMatrix targets_matrix = nnMatrixMake(N, 1);
+  nnMatrixInit(&inputs_matrix, inputs);
+  nnMatrixInit(&targets_matrix, targets);
+  nnTrainingParams params = {
+    .learning_rate = 0.9,
+    .max_iterations = 100,
+    .seed = 0,
+    .weight_init = nnWeightInit01,
+    .debug = false,
+  };
+  nnTrain(net, &inputs_matrix, &targets_matrix, &params);
+  const R weight = nnMatrixAt(&net->weights[0], 0, 0);
+  const R expected_weight = 1.0;
+  printf("\nTrained network weight: %f, Expected: %f\n", weight, expected_weight);
+  TEST_TRUE(double_eq(weight, expected_weight, WEIGHT_EPS));
+  // Test.
+  nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/1);
+  const R test_input[] = { 0.3 };
+  R test_output[1];
+  nnQueryArray(net, query, test_input, test_output);
+  const R expected_output = 0.574442516811659;  // sigmoid(0.3)
+  printf("Output: %f, Expected: %f\n", test_output[0], expected_output);
+  TEST_TRUE(double_eq(test_output[0], expected_output, OUTPUT_EPS));
+  nnDeleteQueryObject(&query);
+  nnDeleteNet(&net);
+}
diff --git a/src/lib/test/train_xor_test.c b/src/lib/test/train_xor_test.c
new file mode 100644
index 0000000..6ddc6e0
--- /dev/null
+++ b/src/lib/test/train_xor_test.c
@@ -0,0 +1,66 @@
+#include <neuralnet/train.h>
+#include <neuralnet/matrix.h>
+#include <neuralnet/neuralnet.h>
+#include "activation.h"
+#include "neuralnet_impl.h"
+#include "test.h"
+#include "test_util.h"
+#include <assert.h>
+TEST_CASE(neuralnet_train_xor_test) {
+  const int num_layers = 2;
+  const int layer_sizes[] = { 2, 2, 1 };
+  const nnActivation layer_activations[] = { nnRelu, nnIdentity };
+  nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations);
+  assert(net);
+  // Train.
+  #define N 4
+  const R inputs[N][2]  = { { 0., 0. }, { 0., 1. }, { 1., 0. }, { 1., 1. } };
+  const R targets[N] = { 0., 1., 1., 0. };
+  nnMatrix inputs_matrix  = nnMatrixMake(N, 2);
+  nnMatrix targets_matrix = nnMatrixMake(N, 1);
+  nnMatrixInit(&inputs_matrix, (const R*)inputs);
+  nnMatrixInit(&targets_matrix, targets);
+  nnTrainingParams params = {
+    .learning_rate = 0.1,
+    .max_iterations = 500,
+    .seed = 0,
+    .weight_init = nnWeightInit01,
+    .debug = false,
+  };
+  nnTrain(net, &inputs_matrix, &targets_matrix, &params);
+  // Test.
+  #define M 4
+  nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/M);
+  const R test_inputs[M][2] = { { 0., 0. }, { 1., 0. }, { 0., 1. }, { 1., 1. } };
+  nnMatrix test_inputs_matrix = nnMatrixMake(M, 2);
+  nnMatrixInit(&test_inputs_matrix, (const R*)test_inputs);
+  nnQuery(net, query, &test_inputs_matrix);
+  const R expected_outputs[M] = { 0., 1., 1., 0. };
+  for (int i = 0; i < M; ++i) {
+    const R test_output = nnMatrixAt(nnNetOutputs(query), i, 0);
+    printf("\nInput: (%f, %f), Output: %f, Expected: %f\n",
+      test_inputs[i][0], test_inputs[i][1], test_output, expected_outputs[i]);
+  }
+  for (int i = 0; i < M; ++i) {
+    const R test_output = nnMatrixAt(nnNetOutputs(query), i, 0);
+    TEST_TRUE(double_eq(test_output, expected_outputs[i], OUTPUT_EPS));
+  }
+  nnDeleteQueryObject(&query);
+  nnDeleteNet(&net);
+}