From 653e98e029a0d0f110b0ac599e50406060bb0f87 Mon Sep 17 00:00:00 2001 From: 3gg <3gg@shellblade.net> Date: Sat, 16 Dec 2023 10:21:16 -0800 Subject: Decouple activations from linear layer. --- src/lib/include/neuralnet/matrix.h | 3 + src/lib/include/neuralnet/neuralnet.h | 51 +++-- src/lib/src/activation.h | 4 +- src/lib/src/matrix.c | 6 + src/lib/src/neuralnet.c | 218 +++++++++++---------- src/lib/src/neuralnet_impl.h | 35 ++-- src/lib/src/train.c | 182 +++++++++-------- src/lib/test/neuralnet_test.c | 103 ++++++---- .../test/train_linear_perceptron_non_origin_test.c | 46 ++--- src/lib/test/train_linear_perceptron_test.c | 44 +++-- src/lib/test/train_sigmoid_test.c | 46 +++-- src/lib/test/train_xor_test.c | 55 ++++-- 12 files changed, 451 insertions(+), 342 deletions(-) (limited to 'src/lib') diff --git a/src/lib/include/neuralnet/matrix.h b/src/lib/include/neuralnet/matrix.h index b7281bf..f80b985 100644 --- a/src/lib/include/neuralnet/matrix.h +++ b/src/lib/include/neuralnet/matrix.h @@ -17,6 +17,9 @@ nnMatrix nnMatrixMake(int rows, int cols); /// Delete a matrix and free its internal memory. void nnMatrixDel(nnMatrix*); +/// Construct a matrix from an array of values. +nnMatrix nnMatrixFromArray(int rows, int cols, const R values[]); + /// Move a matrix. /// /// |in| is an empty matrix after the move. diff --git a/src/lib/include/neuralnet/neuralnet.h b/src/lib/include/neuralnet/neuralnet.h index 05c9406..f122c2a 100644 --- a/src/lib/include/neuralnet/neuralnet.h +++ b/src/lib/include/neuralnet/neuralnet.h @@ -1,32 +1,45 @@ #pragma once +#include #include -typedef struct nnMatrix nnMatrix; - typedef struct nnNeuralNetwork nnNeuralNetwork; typedef struct nnQueryObject nnQueryObject; -/// Neuron activation. -typedef enum nnActivation { - nnIdentity, +/// Linear layer parameters. +/// +/// Either one of the following must be set: +/// a) Training: input and output sizes. +/// b) Inference: weights + biases. +typedef struct nnLinearParams { + int input_size; + int output_size; + nnMatrix weights; + nnMatrix biases; +} nnLinearParams; + +/// Layer type. +typedef enum nnLayerType { + nnLinear, nnSigmoid, nnRelu, -} nnActivation; +} nnLayerType; + +/// Neural network layer. +typedef struct nnLayer { + nnLayerType type; + union { + nnLinearParams linear; + }; +} nnLayer; /// Create a network. nnNeuralNetwork* nnMakeNet( - int num_layers, const int* layer_sizes, const nnActivation* activations); + const nnLayer* layers, int num_layers, int input_size); /// Delete the network and free its internal memory. void nnDeleteNet(nnNeuralNetwork**); -/// Set the network's weights. -void nnSetWeights(nnNeuralNetwork*, const R* weights); - -/// Set the network's biases. -void nnSetBiases(nnNeuralNetwork*, const R* biases); - /// Query the network. /// /// |input| is a matrix of inputs, one row per input and as many columns as the @@ -42,10 +55,10 @@ void nnQueryArray( /// Create a query object. /// -/// The query object holds all the internal memory required to query a network. -/// Query objects allocate all memory up front so that network queries can run -/// without additional memory allocation. -nnQueryObject* nnMakeQueryObject(const nnNeuralNetwork*, int num_inputs); +/// The query object holds all the internal memory required to query a network +/// with batches of the given size. Memory is allocated up front so that network +/// queries can run without additional memory allocation. 
+nnQueryObject* nnMakeQueryObject(const nnNeuralNetwork*, int batch_size); /// Delete the query object and free its internal memory. void nnDeleteQueryObject(nnQueryObject**); @@ -60,7 +73,7 @@ int nnNetInputSize(const nnNeuralNetwork*); int nnNetOutputSize(const nnNeuralNetwork*); /// Return the layer's input size. -int nnLayerInputSize(const nnMatrix* weights); +int nnLayerInputSize(const nnNeuralNetwork*, int layer); /// Return the layer's output size. -int nnLayerOutputSize(const nnMatrix* weights); +int nnLayerOutputSize(const nnNeuralNetwork*, int layer); diff --git a/src/lib/src/activation.h b/src/lib/src/activation.h index b56a69e..4c8a9e4 100644 --- a/src/lib/src/activation.h +++ b/src/lib/src/activation.h @@ -9,8 +9,8 @@ static inline R sigmoid(R x) { return 1. / (1. + exp(-x)); } static inline R relu(R x) { return fmax(0, x); } #define NN_MAP_ARRAY(f, in, out, size) \ - for (int i = 0; i < size; ++i) { \ - out[i] = f(in[i]); \ + for (int ii = 0; ii < size; ++ii) { \ + out[ii] = f(in[ii]); \ } #define sigmoid_array(in, out, size) NN_MAP_ARRAY(sigmoid, in, out, size) diff --git a/src/lib/src/matrix.c b/src/lib/src/matrix.c index d98c8bb..d5c3fcc 100644 --- a/src/lib/src/matrix.c +++ b/src/lib/src/matrix.c @@ -26,6 +26,12 @@ void nnMatrixDel(nnMatrix* matrix) { } } +nnMatrix nnMatrixFromArray(int rows, int cols, const R values[]) { + nnMatrix m = nnMatrixMake(rows, cols); + nnMatrixInit(&m, values); + return m; +} + void nnMatrixMove(nnMatrix* in, nnMatrix* out) { assert(in); assert(out); diff --git a/src/lib/src/neuralnet.c b/src/lib/src/neuralnet.c index a5fc59b..4322b8c 100644 --- a/src/lib/src/neuralnet.c +++ b/src/lib/src/neuralnet.c @@ -7,11 +7,65 @@ #include #include +static void MakeLayerImpl( + int prev_layer_output_size, const nnLayer* layer, nnLayerImpl* impl) { + impl->type = layer->type; + + switch (layer->type) { + case nnLinear: { + const nnLinearParams* params = &layer->linear; + nnLinearImpl* linear = &impl->linear; + + if ((params->input_size > 0) && (params->output_size > 0)) { + const int rows = params->input_size; + const int cols = params->output_size; + linear->weights = nnMatrixMake(rows, cols); + linear->biases = nnMatrixMake(1, cols); + linear->owned = true; + } else { + linear->weights = params->weights; + linear->biases = params->biases; + linear->owned = false; + } + + impl->input_size = linear->weights.rows; + impl->output_size = linear->weights.cols; + + break; + } + + // Activation layers. + case nnRelu: + case nnSigmoid: + impl->input_size = prev_layer_output_size; + impl->output_size = prev_layer_output_size; + break; + } +} + +static void DeleteLayer(nnLayerImpl* layer) { + switch (layer->type) { + case nnLinear: { + nnLinearImpl* linear = &layer->linear; + if (linear->owned) { + nnMatrixDel(&linear->weights); + nnMatrixDel(&linear->biases); + } + break; + } + + // No parameters for these layers. 
+ case nnRelu: + case nnSigmoid: + break; + } +} + nnNeuralNetwork* nnMakeNet( - int num_layers, const int* layer_sizes, const nnActivation* activations) { + const nnLayer* layers, int num_layers, int input_size) { + assert(layers); assert(num_layers > 0); - assert(layer_sizes); - assert(activations); + assert(input_size > 0); nnNeuralNetwork* net = calloc(1, sizeof(nnNeuralNetwork)); if (net == 0) { @@ -20,84 +74,38 @@ nnNeuralNetwork* nnMakeNet( net->num_layers = num_layers; - net->weights = calloc(num_layers, sizeof(nnMatrix)); - net->biases = calloc(num_layers, sizeof(nnMatrix)); - net->activations = calloc(num_layers, sizeof(nnActivation)); - if ((net->weights == 0) || (net->biases == 0) || (net->activations == 0)) { + net->layers = calloc(num_layers, sizeof(nnLayerImpl)); + if (net->layers == 0) { nnDeleteNet(&net); return 0; } + int prev_layer_output_size = input_size; for (int l = 0; l < num_layers; ++l) { - // layer_sizes = { input layer size, first hidden layer size, ...} - const int layer_input_size = layer_sizes[l]; - const int layer_output_size = layer_sizes[l + 1]; - - // We store the transpose of the weight matrix as written in textbooks. - // Our vectors are row vectors and the matrices row-major. - const int rows = layer_input_size; - const int cols = layer_output_size; - - net->weights[l] = nnMatrixMake(rows, cols); - net->biases[l] = nnMatrixMake(1, cols); - net->activations[l] = activations[l]; + MakeLayerImpl(prev_layer_output_size, &layers[l], &net->layers[l]); + prev_layer_output_size = net->layers[l].output_size; } return net; } -void nnDeleteNet(nnNeuralNetwork** net) { - if ((!net) || (!(*net))) { +void nnDeleteNet(nnNeuralNetwork** ppNet) { + if ((!ppNet) || (!(*ppNet))) { return; } - if ((*net)->weights != 0) { - for (int l = 0; l < (*net)->num_layers; ++l) { - nnMatrixDel(&(*net)->weights[l]); - } - free((*net)->weights); - (*net)->weights = 0; - } - if ((*net)->biases != 0) { - for (int l = 0; l < (*net)->num_layers; ++l) { - nnMatrixDel(&(*net)->biases[l]); - } - free((*net)->biases); - (*net)->biases = 0; - } - if ((*net)->activations) { - free((*net)->activations); - (*net)->activations = 0; - } - free(*net); - *net = 0; -} - -void nnSetWeights(nnNeuralNetwork* net, const R* weights) { - assert(net); - assert(weights); + nnNeuralNetwork* net = *ppNet; for (int l = 0; l < net->num_layers; ++l) { - nnMatrix* layer_weights = &net->weights[l]; - R* layer_values = layer_weights->values; - - for (int j = 0; j < layer_weights->rows * layer_weights->cols; ++j) { - *layer_values++ = *weights++; - } + DeleteLayer(&net->layers[l]); } -} - -void nnSetBiases(nnNeuralNetwork* net, const R* biases) { - assert(net); - assert(biases); - - for (int l = 0; l < net->num_layers; ++l) { - nnMatrix* layer_biases = &net->biases[l]; - R* layer_values = layer_biases->values; - for (int j = 0; j < layer_biases->rows * layer_biases->cols; ++j) { - *layer_values++ = *biases++; - } + if (net->layers) { + free(net->layers); + net->layers = 0; } + + free(net); + *ppNet = 0; } void nnQuery( @@ -114,35 +122,40 @@ void nnQuery( nnMatrix input_vector = nnMatrixBorrowRows((nnMatrix*)input, i, 1); for (int l = 0; l < net->num_layers; ++l) { - const nnMatrix* layer_weights = &net->weights[l]; - const nnMatrix* layer_biases = &net->biases[l]; - // Y^T = (W*X)^T = X^T*W^T - // - // TODO: If we had a row-row matrix multiplication, we could compute: - // Y^T = W ** X^T - // The row-row multiplication could be more cache-friendly. We just need - // to store W as is, without transposing. 
- // We could also rewrite the original Mul function to go row x row, - // decomposing the multiplication. Preserving the original meaning of Mul - // makes everything clearer. nnMatrix output_vector = nnMatrixBorrowRows(&query->layer_outputs[l], i, 1); - nnMatrixMul(&input_vector, layer_weights, &output_vector); - nnMatrixAddRow(&output_vector, layer_biases, &output_vector); - switch (net->activations[l]) { - case nnIdentity: - break; // Nothing to do for the identity function. - case nnSigmoid: - sigmoid_array( - output_vector.values, output_vector.values, output_vector.cols); + switch (net->layers[l].type) { + case nnLinear: { + const nnLinearImpl* linear = &net->layers[l].linear; + const nnMatrix* layer_weights = &linear->weights; + const nnMatrix* layer_biases = &linear->biases; + + // Y^T = (W*X)^T = X^T*W^T + // + // TODO: If we had a row-row matrix multiplication, we could compute: + // Y^T = W ** X^T + // + // The row-row multiplication could be more cache-friendly. We just need + // to store W as is, without transposing. + // + // We could also rewrite the original Mul function to go row x row, + // decomposing the multiplication. Preserving the original meaning of + // Mul makes everything clearer. + nnMatrixMul(&input_vector, layer_weights, &output_vector); + nnMatrixAddRow(&output_vector, layer_biases, &output_vector); break; + } case nnRelu: + assert(input_vector.cols == output_vector.cols); relu_array( - output_vector.values, output_vector.values, output_vector.cols); + input_vector.values, output_vector.values, output_vector.cols); + break; + case nnSigmoid: + assert(input_vector.cols == output_vector.cols); + sigmoid_array( + input_vector.values, output_vector.values, output_vector.cols); break; - default: - assert(0); } input_vector = output_vector; // Borrow. 
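With the row-vector convention used above, a linear layer in the forward pass computes y = x*W + b, where W is stored as an input_size x output_size matrix (the transpose of the textbook column-vector layout), while an activation layer maps its input elementwise, y[i] = f(x[i]), so its output width equals its input width.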
@@ -159,15 +172,15 @@ void nnQueryArray( assert(output); assert(net->num_layers > 0); - nnMatrix input_vector = nnMatrixMake(net->weights[0].cols, 1); + nnMatrix input_vector = nnMatrixMake(1, nnNetInputSize(net)); nnMatrixInit(&input_vector, input); nnQuery(net, query, &input_vector); nnMatrixRowToArray(query->network_outputs, 0, output); } -nnQueryObject* nnMakeQueryObject(const nnNeuralNetwork* net, int num_inputs) { +nnQueryObject* nnMakeQueryObject(const nnNeuralNetwork* net, int batch_size) { assert(net); - assert(num_inputs > 0); + assert(batch_size > 0); assert(net->num_layers > 0); nnQueryObject* query = calloc(1, sizeof(nnQueryObject)); @@ -183,11 +196,12 @@ nnQueryObject* nnMakeQueryObject(const nnNeuralNetwork* net, int num_inputs) { free(query); return 0; } + for (int l = 0; l < net->num_layers; ++l) { - const nnMatrix* layer_weights = &net->weights[l]; - const int layer_output_size = nnLayerOutputSize(layer_weights); - query->layer_outputs[l] = nnMatrixMake(num_inputs, layer_output_size); + const int layer_output_size = nnLayerOutputSize(net, l); + query->layer_outputs[l] = nnMatrixMake(batch_size, layer_output_size); } + query->network_outputs = &query->layer_outputs[net->num_layers - 1]; return query; @@ -213,23 +227,19 @@ const nnMatrix* nnNetOutputs(const nnQueryObject* query) { } int nnNetInputSize(const nnNeuralNetwork* net) { - assert(net); - assert(net->num_layers > 0); - return net->weights[0].rows; + return nnLayerInputSize(net, 0); } int nnNetOutputSize(const nnNeuralNetwork* net) { - assert(net); - assert(net->num_layers > 0); - return net->weights[net->num_layers - 1].cols; + return nnLayerOutputSize(net, net->num_layers - 1); } -int nnLayerInputSize(const nnMatrix* weights) { - assert(weights); - return weights->rows; +int nnLayerInputSize(const nnNeuralNetwork* net, int layer) { + assert(net); + return net->layers[layer].input_size; } -int nnLayerOutputSize(const nnMatrix* weights) { - assert(weights); - return weights->cols; +int nnLayerOutputSize(const nnNeuralNetwork* net, int layer) { + assert(net); + return net->layers[layer].output_size; } diff --git a/src/lib/src/neuralnet_impl.h b/src/lib/src/neuralnet_impl.h index f5a9c63..935c5ea 100644 --- a/src/lib/src/neuralnet_impl.h +++ b/src/lib/src/neuralnet_impl.h @@ -2,22 +2,29 @@ #include +#include + +/// Linear layer parameters. +typedef struct nnLinearImpl { + nnMatrix weights; + nnMatrix biases; + bool owned; /// Whether the library owns the weights and biases. +} nnLinearImpl; + +/// Neural network layer. +typedef struct nnLayerImpl { + nnLayerType type; + int input_size; + int output_size; + union { + nnLinearImpl linear; + }; +} nnLayerImpl; + /// Neural network object. -/// -/// We store the transposes of the weight matrices so that we can do forward -/// passes with a minimal amount of work. That is, if in paper we write: -/// -/// [w11 w21] -/// [w12 w22] -/// -/// then the weight matrix in memory is stored as the following array: -/// -/// w11 w12 w21 w22 typedef struct nnNeuralNetwork { - int num_layers; // Number of non-input layers (hidden + output). - nnMatrix* weights; // One matrix per non-input layer. - nnMatrix* biases; // One vector per non-input layer. - nnActivation* activations; // One per non-input layer. + int num_layers; // Number of non-input layers (hidden + output). + nnLayerImpl* layers; // One per non-input layer. } nnNeuralNetwork; /// A query object that holds all the memory necessary to query a network. 
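For illustration, here is a minimal sketch of how a caller builds and queries a network with the new descriptor-based API. It mirrors the perceptron case in src/lib/test/neuralnet_test.c below; the weight and bias values, the example() wrapper, and the include paths are illustrative assumptions rather than part of this change.

#include <neuralnet/matrix.h>
#include <neuralnet/neuralnet.h>

// Sketch: a 1-input perceptron described as a linear layer followed by a
// sigmoid activation. Values are arbitrary examples.
static void example(void) {
  const R weights[] = {0.3};
  const R biases[]  = {0.0};
  const nnLayer layers[] = {
      // Inference-style construction: weights and biases are given explicitly,
      // so the library borrows them (nnLinearImpl.owned == false).
      {.type = nnLinear,
       .linear = {.weights = nnMatrixFromArray(1, 1, weights),
                  .biases  = nnMatrixFromArray(1, 1, biases)}},
      {.type = nnSigmoid},
  };

  nnNeuralNetwork* net   = nnMakeNet(layers, /*num_layers=*/2, /*input_size=*/1);
  nnQueryObject*   query = nnMakeQueryObject(net, /*batch_size=*/1);

  const R input[] = {0.9};
  R       output[1];
  nnQueryArray(net, query, input, output); // output[0] == sigmoid(0.9 * 0.3)

  nnDeleteQueryObject(&query);
  nnDeleteNet(&net); // Does not free the caller-provided weight/bias matrices.
}

For training, a linear layer can instead specify only input_size and output_size and let the library allocate and initialize the weights, as in the train_*_test.c files below.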
diff --git a/src/lib/src/train.c b/src/lib/src/train.c index dc93f0f..98f58ad 100644 --- a/src/lib/src/train.c +++ b/src/lib/src/train.c @@ -38,7 +38,7 @@ typedef struct nnSigmoidGradientElements { /// each layer. A data type is defined for these because we allocate all the /// required memory up front before entering the training loop. typedef struct nnGradientElements { - nnActivation type; + nnLayerType type; // Gradient vector, same size as the layer. // This will contain the gradient expression except for the output value of // the previous layer. @@ -57,10 +57,27 @@ void nnInitNet( mt19937_64_init(&rng, seed); for (int l = 0; l < net->num_layers; ++l) { - nnMatrix* weights = &net->weights[l]; - nnMatrix* biases = &net->biases[l]; + // Get the layer's weights and biases, if any. + nnMatrix* weights = 0; + nnMatrix* biases = 0; + switch (net->layers[l].type) { + case nnLinear: { + nnLinearImpl* linear = &net->layers[l].linear; + + weights = &linear->weights; + biases = &linear->biases; + break; + } + // Activations. + case nnRelu: + case nnSigmoid: + break; + } + if (!weights || !biases) { + continue; + } - const R layer_size = (R)nnLayerInputSize(weights); + const R layer_size = (R)nnLayerInputSize(net, l); const R scale = 1. / layer_size; const R stdev = 1. / sqrt((R)layer_size); const R sigma = stdev * stdev; @@ -128,9 +145,6 @@ void nnTrain( // with one sample at a time. nnMatrix* errors = calloc(net->num_layers, sizeof(nnMatrix)); - // Allocate the weight transpose matrices up front for backpropagation. - // nnMatrix* weights_T = calloc(net->num_layers, sizeof(nnMatrix)); - // Allocate the weight delta matrices. nnMatrix* weight_deltas = calloc(net->num_layers, sizeof(nnMatrix)); @@ -144,30 +158,24 @@ void nnTrain( nnMatrix* outputs_T = calloc(net->num_layers, sizeof(nnMatrix)); assert(errors != 0); - // assert(weights_T != 0); assert(weight_deltas != 0); assert(gradient_elems); assert(outputs_T); for (int l = 0; l < net->num_layers; ++l) { - const nnMatrix* layer_weights = &net->weights[l]; - const int layer_output_size = net->weights[l].cols; - const nnActivation activation = net->activations[l]; - - errors[l] = nnMatrixMake(1, layer_weights->cols); - - // weights_T[l] = nnMatrixMake(layer_weights->cols, layer_weights->rows); - // nnMatrixTranspose(layer_weights, &weights_T[l]); - - weight_deltas[l] = nnMatrixMake(layer_weights->rows, layer_weights->cols); + const int layer_input_size = nnLayerInputSize(net, l); + const int layer_output_size = nnLayerOutputSize(net, l); + const nnLayerImpl* layer = &net->layers[l]; - outputs_T[l] = nnMatrixMake(layer_output_size, 1); + errors[l] = nnMatrixMake(1, layer_output_size); + weight_deltas[l] = nnMatrixMake(layer_input_size, layer_output_size); + outputs_T[l] = nnMatrixMake(layer_output_size, 1); // Allocate the gradient elements and vectors for weight delta calculation. nnGradientElements* elems = &gradient_elems[l]; - elems->type = activation; - switch (activation) { - case nnIdentity: + elems->type = layer->type; + switch (layer->type) { + case nnLinear: break; // Gradient vector will be borrowed, no need to allocate. case nnSigmoid: @@ -208,6 +216,7 @@ void nnTrain( // For now, we train with one sample at a time. for (int sample = 0; sample < inputs->rows; ++sample) { + // TODO: Introduce a BorrowMut. // Slice the input and target matrices with the batch size. // We are not mutating the inputs, but we need the cast to borrow. 
nnMatrix training_inputs = @@ -219,15 +228,16 @@ void nnTrain( // Assuming one training input per iteration for now. nnMatrixTranspose(&training_inputs, &training_inputs_T); - // Run a forward pass and compute the output layer error relevant to the - // derivative: o-t. - // Error: (t-o)^2 - // dE/do = -2(t-o) - // = +2(o-t) + // Forward pass. + nnQuery(net, query, &training_inputs); + + // Compute the error derivative: o-t. + // Error: 1/2 (t-o)^2 + // dE/do = -(t-o) + // = +(o-t) // Note that we compute o-t instead to remove that outer negative sign. // The 2 is dropped because we are only interested in the direction of the // gradient. The learning rate controls the magnitude. - nnQuery(net, query, &training_inputs); nnMatrixSub( training_outputs, &training_targets, &errors[net->num_layers - 1]); @@ -236,68 +246,86 @@ void nnTrain( nnMatrixTranspose(&query->layer_outputs[l], &outputs_T[l]); } - // Update weights and biases for each internal layer, backpropagating + // Update weights and biases for each internal layer, back-propagating // errors along the way. for (int l = net->num_layers - 1; l >= 0; --l) { - const nnMatrix* layer_output = &query->layer_outputs[l]; - nnMatrix* layer_weights = &net->weights[l]; - nnMatrix* layer_biases = &net->biases[l]; - nnGradientElements* elems = &gradient_elems[l]; - nnMatrix* gradient = &elems->gradient; - const nnActivation activation = net->activations[l]; - - // Compute the gradient (the part of the expression that does not - // contain the output of the previous layer). + const nnMatrix* layer_output = &query->layer_outputs[l]; + nnGradientElements* elems = &gradient_elems[l]; + nnMatrix* gradient = &elems->gradient; + nnLayerImpl* layer = &net->layers[l]; + + // Compute this layer's gradient. + // + // By "gradient" we mean the expression common to the weights and bias + // gradients. This is the part of the expression that does not contain + // this layer's input. // - // Identity: G = error_k - // Sigmoid: G = error_k * output_k * (1 - output_k). - // Relu: G = error_k * (output_k > 0 ? 1 : 0) - switch (activation) { - case nnIdentity: + // Linear: G = id + // Relu: G = (output_k > 0 ? 1 : 0) + // Sigmoid: G = output_k * (1 - output_k) + switch (layer->type) { + case nnLinear: { // TODO: Just copy the pointer? *gradient = nnMatrixBorrow(&errors[l]); break; + } + case nnRelu: + nnMatrixGt(layer_output, 0, gradient); + break; case nnSigmoid: nnMatrixSub(&elems->sigmoid.ones, layer_output, gradient); nnMatrixMulPairs(layer_output, gradient, gradient); - nnMatrixMulPairs(&errors[l], gradient, gradient); - break; - case nnRelu: - nnMatrixGt(layer_output, 0, gradient); - nnMatrixMulPairs(&errors[l], gradient, gradient); break; } - // Outer product to compute the weight deltas. - const nnMatrix* output_T = - (l == 0) ? &training_inputs_T : &outputs_T[l - 1]; - nnMatrixMul(output_T, gradient, &weight_deltas[l]); - - // Backpropagate the error before updating weights. + // Back-propagate the error. + // + // This combines this layer's gradient with the back-propagated error, + // which is the combination of the gradients of subsequent layers down + // to the output layer error. + // + // Note that this step uses the layer's original weights. if (l > 0) { - // G * W^T == G *^T W. - // nnMatrixMul(gradient, &weights_T[l], &errors[l-1]); - nnMatrixMulRows(gradient, layer_weights, &errors[l - 1]); + switch (layer->type) { + case nnLinear: { + const nnMatrix* layer_weights = &layer->linear.weights; + // E * W^T == E *^T W. 
+ // Using nnMatrixMulRows, we avoid having to transpose the weight + // matrix. + nnMatrixMulRows(&errors[l], layer_weights, &errors[l - 1]); + break; + } + // For activations, the error back-propagates as is but multiplied by + // the layer's gradient. + case nnRelu: + case nnSigmoid: + nnMatrixMulPairs(&errors[l], gradient, &errors[l - 1]); + break; + } } - // Update weights. - nnMatrixScale(&weight_deltas[l], params->learning_rate); - // The gradient has a negative sign from -(t - o), but we have computed - // e = o - t instead, so we can subtract directly. - // nnMatrixAdd(layer_weights, &weight_deltas[l], layer_weights); - nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights); - - // Update weight transpose matrix for the next training iteration. - // nnMatrixTranspose(layer_weights, &weights_T[l]); - - // Update biases. - // This is the same formula as for weights, except that the o_j term is - // just 1. We can simply re-use the gradient that we have already - // computed for the weight update. - // nnMatrixMulAdd(layer_biases, gradient, params->learning_rate, - // layer_biases); - nnMatrixMulSub( - layer_biases, gradient, params->learning_rate, layer_biases); + // Update layer weights. + if (layer->type == nnLinear) { + nnLinearImpl* linear = &layer->linear; + nnMatrix* layer_weights = &linear->weights; + nnMatrix* layer_biases = &linear->biases; + + // Outer product to compute the weight deltas. + // This layer's input is the previous layer's output. + const nnMatrix* input_T = + (l == 0) ? &training_inputs_T : &outputs_T[l - 1]; + nnMatrixMul(input_T, gradient, &weight_deltas[l]); + + // Update weights. + nnMatrixScale(&weight_deltas[l], params->learning_rate); + nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights); + + // Update biases. + // This is the same formula as for weights, except that the o_j term + // is just 1. + nnMatrixMulSub( + layer_biases, gradient, params->learning_rate, layer_biases); + } } // TODO: Add this under a verbose debugging mode. @@ -334,12 +362,11 @@ void nnTrain( for (int l = 0; l < net->num_layers; ++l) { nnMatrixDel(&errors[l]); nnMatrixDel(&outputs_T[l]); - // nnMatrixDel(&weights_T[l]); nnMatrixDel(&weight_deltas[l]); nnGradientElements* elems = &gradient_elems[l]; switch (elems->type) { - case nnIdentity: + case nnLinear: break; // Gradient vector is borrowed, no need to deallocate. 
case nnSigmoid: @@ -355,7 +382,6 @@ void nnTrain( nnMatrixDel(&training_inputs_T); free(errors); free(outputs_T); - // free(weights_T); free(weight_deltas); free(gradient_elems); } diff --git a/src/lib/test/neuralnet_test.c b/src/lib/test/neuralnet_test.c index 14d9438..0f8d7b8 100644 --- a/src/lib/test/neuralnet_test.c +++ b/src/lib/test/neuralnet_test.c @@ -1,8 +1,8 @@ #include -#include #include "activation.h" #include "neuralnet_impl.h" +#include #include "test.h" #include "test_util.h" @@ -10,23 +10,31 @@ #include TEST_CASE(neuralnet_perceptron_test) { - const int num_layers = 1; - const int layer_sizes[] = { 1, 1 }; - const nnActivation layer_activations[] = { nnSigmoid }; - const R weights[] = { 0.3 }; + const int num_layers = 2; + const int input_size = 1; + const R weights[] = {0.3}; + const R biases[] = {0.0}; + const nnLayer layers[] = { + {.type = nnLinear, + .linear = + {.weights = nnMatrixFromArray(1, 1, weights), + .biases = nnMatrixFromArray(1, 1, biases)}}, + {.type = nnSigmoid}, + }; - nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations); + nnNeuralNetwork* net = nnMakeNet(layers, num_layers, input_size); assert(net); - nnSetWeights(net, weights); - nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/1); + nnQueryObject* query = nnMakeQueryObject(net, 1); - const R input[] = { 0.9 }; - R output[1]; + const R input[] = {0.9}; + R output[1]; nnQueryArray(net, query, input, output); const R expected_output = sigmoid(input[0] * weights[0]); - printf("\nOutput: %f, Expected: %f\n", output[0], expected_output); + printf( + "\n[neuralnet_perceptron_test] Output: %f, Expected: %f\n", output[0], + expected_output); TEST_TRUE(double_eq(output[0], expected_output, EPS)); nnDeleteQueryObject(&query); @@ -34,53 +42,66 @@ TEST_CASE(neuralnet_perceptron_test) { } TEST_CASE(neuralnet_xor_test) { - const int num_layers = 2; - const int layer_sizes[] = { 2, 2, 1 }; - const nnActivation layer_activations[] = { nnRelu, nnIdentity }; - const R weights[] = { - 1, 1, 1, 1, // First (hidden) layer. - 1, -2 // Second (output) layer. - }; - const R biases[] = { - 0, -1, // First (hidden) layer. - 0 // Second (output) layer. + // First (hidden) layer. + const R weights0[] = {1, 1, 1, 1}; + const R biases0[] = {0, -1}; + // Second (output) layer. + const R weights1[] = {1, -2}; + const R biases1[] = {0}; + // Network. + const int num_layers = 3; + const int input_size = 2; + const nnLayer layers[] = { + {.type = nnLinear, + .linear = + {.weights = nnMatrixFromArray(2, 2, weights0), + .biases = nnMatrixFromArray(1, 2, biases0)}}, + {.type = nnRelu}, + {.type = nnLinear, + .linear = + {.weights = nnMatrixFromArray(2, 1, weights1), + .biases = nnMatrixFromArray(1, 1, biases1)}}, }; - nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations); + nnNeuralNetwork* net = nnMakeNet(layers, num_layers, input_size); assert(net); - nnSetWeights(net, weights); - nnSetBiases(net, biases); // First layer weights. - TEST_EQUAL(nnMatrixAt(&net->weights[0], 0, 0), 1); - TEST_EQUAL(nnMatrixAt(&net->weights[0], 0, 1), 1); - TEST_EQUAL(nnMatrixAt(&net->weights[0], 0, 2), 1); - TEST_EQUAL(nnMatrixAt(&net->weights[0], 0, 3), 1); - // Second layer weights. 
- TEST_EQUAL(nnMatrixAt(&net->weights[1], 0, 0), 1); - TEST_EQUAL(nnMatrixAt(&net->weights[1], 0, 1), -2); + TEST_EQUAL(nnMatrixAt(&net->layers[0].linear.weights, 0, 0), 1); + TEST_EQUAL(nnMatrixAt(&net->layers[0].linear.weights, 0, 1), 1); + TEST_EQUAL(nnMatrixAt(&net->layers[0].linear.weights, 0, 2), 1); + TEST_EQUAL(nnMatrixAt(&net->layers[0].linear.weights, 0, 3), 1); + // Second linear layer (third layer) weights. + TEST_EQUAL(nnMatrixAt(&net->layers[2].linear.weights, 0, 0), 1); + TEST_EQUAL(nnMatrixAt(&net->layers[2].linear.weights, 0, 1), -2); // First layer biases. - TEST_EQUAL(nnMatrixAt(&net->biases[0], 0, 0), 0); - TEST_EQUAL(nnMatrixAt(&net->biases[0], 0, 1), -1); - // Second layer biases. - TEST_EQUAL(nnMatrixAt(&net->biases[1], 0, 0), 0); + TEST_EQUAL(nnMatrixAt(&net->layers[0].linear.biases, 0, 0), 0); + TEST_EQUAL(nnMatrixAt(&net->layers[0].linear.biases, 0, 1), -1); + // Second linear layer (third layer) biases. + TEST_EQUAL(nnMatrixAt(&net->layers[2].linear.biases, 0, 0), 0); // Test. - #define M 4 +#define M 4 - nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/M); + nnQueryObject* query = nnMakeQueryObject(net, M); - const R test_inputs[M][2] = { { 0., 0. }, { 1., 0. }, { 0., 1. }, { 1., 1. } }; + const R test_inputs[M][2] = { + {0., 0.}, + {1., 0.}, + {0., 1.}, + {1., 1.} + }; nnMatrix test_inputs_matrix = nnMatrixMake(M, 2); nnMatrixInit(&test_inputs_matrix, (const R*)test_inputs); nnQuery(net, query, &test_inputs_matrix); - const R expected_outputs[M] = { 0., 1., 1., 0. }; + const R expected_outputs[M] = {0., 1., 1., 0.}; for (int i = 0; i < M; ++i) { const R test_output = nnMatrixAt(nnNetOutputs(query), i, 0); - printf("\nInput: (%f, %f), Output: %f, Expected: %f\n", - test_inputs[i][0], test_inputs[i][1], test_output, expected_outputs[i]); + printf( + "\nInput: (%f, %f), Output: %f, Expected: %f\n", test_inputs[i][0], + test_inputs[i][1], test_output, expected_outputs[i]); } for (int i = 0; i < M; ++i) { const R test_output = nnMatrixAt(nnNetOutputs(query), i, 0); diff --git a/src/lib/test/train_linear_perceptron_non_origin_test.c b/src/lib/test/train_linear_perceptron_non_origin_test.c index 5a320ac..40a42e0 100644 --- a/src/lib/test/train_linear_perceptron_non_origin_test.c +++ b/src/lib/test/train_linear_perceptron_non_origin_test.c @@ -1,9 +1,8 @@ #include +#include "neuralnet_impl.h" #include #include -#include "activation.h" -#include "neuralnet_impl.h" #include "test.h" #include "test_util.h" @@ -11,19 +10,21 @@ #include TEST_CASE(neuralnet_train_linear_perceptron_non_origin_test) { - const int num_layers = 1; - const int layer_sizes[] = { 1, 1 }; - const nnActivation layer_activations[] = { nnIdentity }; + const int num_layers = 1; + const int input_size = 1; + const nnLayer layers[] = { + {.type = nnLinear, .linear = {.input_size = 1, .output_size = 1}} + }; - nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations); + nnNeuralNetwork* net = nnMakeNet(layers, num_layers, input_size); assert(net); - // Train. +// Train. - // Try to learn the Y = 2X + 1 line. - #define N 2 - const R inputs[N] = { 0., 1. }; - const R targets[N] = { 1., 3. }; +// Try to learn the Y = 2X + 1 line. 
+#define N 2 + const R inputs[N] = {0., 1.}; + const R targets[N] = {1., 3.}; nnMatrix inputs_matrix = nnMatrixMake(N, 1); nnMatrix targets_matrix = nnMatrixMake(N, 1); @@ -31,31 +32,32 @@ TEST_CASE(neuralnet_train_linear_perceptron_non_origin_test) { nnMatrixInit(&targets_matrix, targets); nnTrainingParams params = { - .learning_rate = 0.7, - .max_iterations = 20, - .seed = 0, - .weight_init = nnWeightInit01, - .debug = false, + .learning_rate = 0.7, + .max_iterations = 20, + .seed = 0, + .weight_init = nnWeightInit01, + .debug = false, }; nnTrain(net, &inputs_matrix, &targets_matrix, ¶ms); - const R weight = nnMatrixAt(&net->weights[0], 0, 0); + const R weight = nnMatrixAt(&net->layers[0].linear.weights, 0, 0); const R expected_weight = 2.0; - printf("\nTrained network weight: %f, Expected: %f\n", weight, expected_weight); + printf( + "\nTrained network weight: %f, Expected: %f\n", weight, expected_weight); TEST_TRUE(double_eq(weight, expected_weight, WEIGHT_EPS)); - const R bias = nnMatrixAt(&net->biases[0], 0, 0); + const R bias = nnMatrixAt(&net->layers[0].linear.biases, 0, 0); const R expected_bias = 1.0; printf("Trained network bias: %f, Expected: %f\n", bias, expected_bias); TEST_TRUE(double_eq(bias, expected_bias, WEIGHT_EPS)); // Test. - nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/1); + nnQueryObject* query = nnMakeQueryObject(net, 1); - const R test_input[] = { 2.3 }; - R test_output[1]; + const R test_input[] = {2.3}; + R test_output[1]; nnQueryArray(net, query, test_input, test_output); const R expected_output = test_input[0] * expected_weight + expected_bias; diff --git a/src/lib/test/train_linear_perceptron_test.c b/src/lib/test/train_linear_perceptron_test.c index 2b1336d..667643b 100644 --- a/src/lib/test/train_linear_perceptron_test.c +++ b/src/lib/test/train_linear_perceptron_test.c @@ -1,9 +1,8 @@ #include +#include "neuralnet_impl.h" #include #include -#include "activation.h" -#include "neuralnet_impl.h" #include "test.h" #include "test_util.h" @@ -11,19 +10,21 @@ #include TEST_CASE(neuralnet_train_linear_perceptron_test) { - const int num_layers = 1; - const int layer_sizes[] = { 1, 1 }; - const nnActivation layer_activations[] = { nnIdentity }; + const int num_layers = 1; + const int input_size = 1; + const nnLayer layers[] = { + {.type = nnLinear, .linear = {.input_size = 1, .output_size = 1}} + }; - nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations); + nnNeuralNetwork* net = nnMakeNet(layers, num_layers, input_size); assert(net); - // Train. +// Train. - // Try to learn the Y=X line. - #define N 2 - const R inputs[N] = { 0., 1. }; - const R targets[N] = { 0., 1. }; +// Try to learn the Y=X line. 
+#define N 2 + const R inputs[N] = {0., 1.}; + const R targets[N] = {0., 1.}; nnMatrix inputs_matrix = nnMatrixMake(N, 1); nnMatrix targets_matrix = nnMatrixMake(N, 1); @@ -31,26 +32,27 @@ TEST_CASE(neuralnet_train_linear_perceptron_test) { nnMatrixInit(&targets_matrix, targets); nnTrainingParams params = { - .learning_rate = 0.7, - .max_iterations = 10, - .seed = 0, - .weight_init = nnWeightInit01, - .debug = false, + .learning_rate = 0.7, + .max_iterations = 10, + .seed = 0, + .weight_init = nnWeightInit01, + .debug = false, }; nnTrain(net, &inputs_matrix, &targets_matrix, ¶ms); - const R weight = nnMatrixAt(&net->weights[0], 0, 0); + const R weight = nnMatrixAt(&net->layers[0].linear.weights, 0, 0); const R expected_weight = 1.0; - printf("\nTrained network weight: %f, Expected: %f\n", weight, expected_weight); + printf( + "\nTrained network weight: %f, Expected: %f\n", weight, expected_weight); TEST_TRUE(double_eq(weight, expected_weight, WEIGHT_EPS)); // Test. - nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/1); + nnQueryObject* query = nnMakeQueryObject(net, 1); - const R test_input[] = { 2.3 }; - R test_output[1]; + const R test_input[] = {2.3}; + R test_output[1]; nnQueryArray(net, query, test_input, test_output); const R expected_output = test_input[0]; diff --git a/src/lib/test/train_sigmoid_test.c b/src/lib/test/train_sigmoid_test.c index 588e7ca..39a84b0 100644 --- a/src/lib/test/train_sigmoid_test.c +++ b/src/lib/test/train_sigmoid_test.c @@ -1,9 +1,9 @@ #include -#include -#include #include "activation.h" #include "neuralnet_impl.h" +#include +#include #include "test.h" #include "test_util.h" @@ -11,21 +11,24 @@ #include TEST_CASE(neuralnet_train_sigmoid_test) { - const int num_layers = 1; - const int layer_sizes[] = { 1, 1 }; - const nnActivation layer_activations[] = { nnSigmoid }; + const int num_layers = 2; + const int input_size = 1; + const nnLayer layers[] = { + {.type = nnLinear, .linear = {.input_size = 1, .output_size = 1}}, + {.type = nnSigmoid}, + }; - nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations); + nnNeuralNetwork* net = nnMakeNet(layers, num_layers, input_size); assert(net); - // Train. +// Train. - // Try to learn the sigmoid function. - #define N 3 +// Try to learn the sigmoid function. +#define N 3 R inputs[N]; R targets[N]; for (int i = 0; i < N; ++i) { - inputs[i] = lerp(-1, +1, (R)i / (R)(N-1)); + inputs[i] = lerp(-1, +1, (R)i / (R)(N - 1)); targets[i] = sigmoid(inputs[i]); } @@ -35,29 +38,30 @@ TEST_CASE(neuralnet_train_sigmoid_test) { nnMatrixInit(&targets_matrix, targets); nnTrainingParams params = { - .learning_rate = 0.9, - .max_iterations = 100, - .seed = 0, - .weight_init = nnWeightInit01, - .debug = false, + .learning_rate = 0.9, + .max_iterations = 100, + .seed = 0, + .weight_init = nnWeightInit01, + .debug = false, }; nnTrain(net, &inputs_matrix, &targets_matrix, ¶ms); - const R weight = nnMatrixAt(&net->weights[0], 0, 0); + const R weight = nnMatrixAt(&net->layers[0].linear.weights, 0, 0); const R expected_weight = 1.0; - printf("\nTrained network weight: %f, Expected: %f\n", weight, expected_weight); + printf( + "\nTrained network weight: %f, Expected: %f\n", weight, expected_weight); TEST_TRUE(double_eq(weight, expected_weight, WEIGHT_EPS)); // Test. 
- nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/1); + nnQueryObject* query = nnMakeQueryObject(net, 1); - const R test_input[] = { 0.3 }; - R test_output[1]; + const R test_input[] = {0.3}; + R test_output[1]; nnQueryArray(net, query, test_input, test_output); - const R expected_output = 0.574442516811659; // sigmoid(0.3) + const R expected_output = 0.574442516811659; // sigmoid(0.3) printf("Output: %f, Expected: %f\n", test_output[0], expected_output); TEST_TRUE(double_eq(test_output[0], expected_output, OUTPUT_EPS)); diff --git a/src/lib/test/train_xor_test.c b/src/lib/test/train_xor_test.c index 6ddc6e0..78695a3 100644 --- a/src/lib/test/train_xor_test.c +++ b/src/lib/test/train_xor_test.c @@ -1,9 +1,9 @@ #include -#include -#include #include "activation.h" #include "neuralnet_impl.h" +#include +#include #include "test.h" #include "test_util.h" @@ -11,18 +11,27 @@ #include TEST_CASE(neuralnet_train_xor_test) { - const int num_layers = 2; - const int layer_sizes[] = { 2, 2, 1 }; - const nnActivation layer_activations[] = { nnRelu, nnIdentity }; + const int num_layers = 3; + const int input_size = 2; + const nnLayer layers[] = { + {.type = nnLinear, .linear = {.input_size = 2, .output_size = 2}}, + {.type = nnRelu}, + {.type = nnLinear, .linear = {.input_size = 2, .output_size = 1}} + }; - nnNeuralNetwork* net = nnMakeNet(num_layers, layer_sizes, layer_activations); + nnNeuralNetwork* net = nnMakeNet(layers, num_layers, input_size); assert(net); // Train. - #define N 4 - const R inputs[N][2] = { { 0., 0. }, { 0., 1. }, { 1., 0. }, { 1., 1. } }; - const R targets[N] = { 0., 1., 1., 0. }; +#define N 4 + const R inputs[N][2] = { + {0., 0.}, + {0., 1.}, + {1., 0.}, + {1., 1.} + }; + const R targets[N] = {0., 1., 1., 0.}; nnMatrix inputs_matrix = nnMatrixMake(N, 2); nnMatrix targets_matrix = nnMatrixMake(N, 1); @@ -30,31 +39,37 @@ TEST_CASE(neuralnet_train_xor_test) { nnMatrixInit(&targets_matrix, targets); nnTrainingParams params = { - .learning_rate = 0.1, - .max_iterations = 500, - .seed = 0, - .weight_init = nnWeightInit01, - .debug = false, + .learning_rate = 0.1, + .max_iterations = 500, + .seed = 0, + .weight_init = nnWeightInit01, + .debug = false, }; nnTrain(net, &inputs_matrix, &targets_matrix, ¶ms); // Test. - #define M 4 +#define M 4 - nnQueryObject* query = nnMakeQueryObject(net, /*num_inputs=*/M); + nnQueryObject* query = nnMakeQueryObject(net, M); - const R test_inputs[M][2] = { { 0., 0. }, { 1., 0. }, { 0., 1. }, { 1., 1. } }; + const R test_inputs[M][2] = { + {0., 0.}, + {1., 0.}, + {0., 1.}, + {1., 1.} + }; nnMatrix test_inputs_matrix = nnMatrixMake(M, 2); nnMatrixInit(&test_inputs_matrix, (const R*)test_inputs); nnQuery(net, query, &test_inputs_matrix); - const R expected_outputs[M] = { 0., 1., 1., 0. }; + const R expected_outputs[M] = {0., 1., 1., 0.}; for (int i = 0; i < M; ++i) { const R test_output = nnMatrixAt(nnNetOutputs(query), i, 0); - printf("\nInput: (%f, %f), Output: %f, Expected: %f\n", - test_inputs[i][0], test_inputs[i][1], test_output, expected_outputs[i]); + printf( + "\nInput: (%f, %f), Output: %f, Expected: %f\n", test_inputs[i][0], + test_inputs[i][1], test_output, expected_outputs[i]); } for (int i = 0; i < M; ++i) { const R test_output = nnMatrixAt(nnNetOutputs(query), i, 0); -- cgit v1.2.3
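For reference, the per-sample update that nnTrain performs can be written out as follows, where e_l is the error at layer l, x_l the layer's input, o_l its output, and eta the learning rate (notation follows the comments in src/lib/src/train.c above):

  E     = 1/2 * (t - o)^2
  e_out = dE/do = o - t                       (output-layer error)

  Linear layer:  dW_l    = eta * x_l^T * e_l  (outer product)
                 db_l    = eta * e_l
                 W_l    <- W_l - dW_l
                 b_l    <- b_l - db_l
                 e_{l-1} = e_l * W_l^T        (back-propagated error)

  Activation:    e_{l-1} = e_l .* g(o_l)      (elementwise product)
                 g(o)    = o .* (1 - o)       for nnSigmoid
                 g(o)    = (o > 0 ? 1 : 0)    for nnRelu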