#include "neuralnet_impl.h"

// Standard headers for assert, sqrt, calloc/free, printf, and uint64_t; the
// project-specific matrix and RNG declarations are assumed to come in through
// "neuralnet_impl.h".
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define LOGD printf

// If debug mode is requested, we will show progress every time this percentage
// of the total iterations has elapsed.
static const int PROGRESS_THRESHOLD = 5;  // %

/// Computes the MSE of the output error matrix.
R ComputeMSE(const nnMatrix* errors) {
  R sum_sq = 0;
  const int N = errors->rows * errors->cols;
  const R* value = errors->values;
  for (int i = 0; i < N; ++i) {
    sum_sq += *value * *value;
    value++;
  }
  return sum_sq / (R)N;
}

/// Holds the bits required to compute a sigmoid gradient.
typedef struct nnSigmoidGradientElements {
  nnMatrix ones;  // A vector of just ones, same size as the layer.
} nnSigmoidGradientElements;

/// Holds the various elements required to compute gradients. These depend on
/// which activation function is used, so they will potentially be different
/// for each layer. A data type is defined for these because we allocate all
/// the required memory up front before entering the training loop.
typedef struct nnGradientElements {
  nnActivation type;

  // Gradient vector, same size as the layer.
  // This will contain the gradient expression except for the output value of
  // the previous layer.
  nnMatrix gradient;

  union {
    nnSigmoidGradientElements sigmoid;
  };
} nnGradientElements;

// Initialize the network's weights randomly and set their biases to 0.
void nnInitNet(
    nnNeuralNetwork* net, uint64_t seed, const nnWeightInitStrategy strategy) {
  assert(net);

  mt19937_64 rng = mt19937_64_make();
  mt19937_64_init(&rng, seed);

  for (int l = 0; l < net->num_layers; ++l) {
    nnMatrix* weights = &net->weights[l];
    nnMatrix* biases = &net->biases[l];

    const R layer_size = (R)nnLayerInputSize(weights);
    const R scale = 1. / layer_size;
    const R stdev = 1. / sqrt((R)layer_size);
    const R sigma = stdev * stdev;

    R* value = weights->values;
    for (int k = 0; k < weights->rows * weights->cols; ++k) {
      switch (strategy) {
        case nnWeightInit01: {
          const R x01 = mt19937_64_gen_real3(&rng);  // (0, +1) interval.
          *value++ = scale * x01;
          break;
        }
        case nnWeightInit11: {
          const R x11 = mt19937_64_gen_real4(&rng);  // (-1, +1) interval.
          *value++ = scale * x11;
          break;
        }
        case nnWeightInitNormal: {
          // Initialize with a normal distribution of standard deviation
          // 1 / sqrt(layer_input_size) to prevent saturation when multiplying
          // inputs.
          const R u01 = mt19937_64_gen_real3(&rng);  // (0, +1) interval.
          const R v01 = mt19937_64_gen_real3(&rng);  // (0, +1) interval.
          R z0, z1;
          normal2(u01, v01, &z0, &z1);
          z0 = normal_transform(z0, /*mu=*/0, sigma);
          z1 = normal_transform(z1, /*mu=*/0, sigma);
          *value++ = z0;
          if (k < weights->rows * weights->cols - 1) {
            *value++ = z1;
            ++k;
          }
          break;
        }
        default:
          assert(false);
      }
    }

    // Initialize biases.
    // 0 is used so that the layer functions initially go through the origin.
    value = biases->values;
    for (int k = 0; k < biases->rows * biases->cols; ++k, ++value) {
      *value = 0;
    }
  }
}

// |inputs| has one row vector per sample.
// |targets| has one row vector per sample.
//
// For now, each iteration trains with one sample (row) at a time.
void nnTrain(
    nnNeuralNetwork* net, const nnMatrix* inputs, const nnMatrix* targets,
    const nnTrainingParams* params) {
  assert(net);
  assert(inputs);
  assert(targets);
  assert(params);
  assert(nnNetOutputSize(net) == targets->cols);
  assert(net->num_layers > 0);

  // Allocate error vectors to hold the backpropagated error values.
  // For now, these are one row vector per layer, meaning that we will train
  // with one sample at a time.
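  // Each errors[l] is therefore a 1 x layer_output_size row vector, matching
  // the row-vector layout of a single sample's outputs (see the allocations
  // below).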
  nnMatrix* errors = calloc(net->num_layers, sizeof(nnMatrix));

  // Allocate the weight transpose matrices up front for backpropagation.
  nnMatrix* weights_T = calloc(net->num_layers, sizeof(nnMatrix));

  // Allocate the weight delta matrices.
  nnMatrix* weight_deltas = calloc(net->num_layers, sizeof(nnMatrix));

  // Allocate the data structures required to compute gradients.
  // This depends on each layer's activation type.
  nnGradientElements* gradient_elems =
      calloc(net->num_layers, sizeof(nnGradientElements));

  // Allocate the output transpose vectors for weight delta calculation.
  // This is one column vector per layer.
  nnMatrix* outputs_T = calloc(net->num_layers, sizeof(nnMatrix));

  assert(errors != 0);
  assert(weights_T != 0);
  assert(weight_deltas != 0);
  assert(gradient_elems != 0);
  assert(outputs_T != 0);

  for (int l = 0; l < net->num_layers; ++l) {
    const nnMatrix* layer_weights = &net->weights[l];
    const int layer_output_size = net->weights[l].cols;
    const nnActivation activation = net->activations[l];

    errors[l] = nnMatrixMake(1, layer_weights->cols);

    weights_T[l] = nnMatrixMake(layer_weights->cols, layer_weights->rows);
    nnMatrixTranspose(layer_weights, &weights_T[l]);

    weight_deltas[l] = nnMatrixMake(layer_weights->rows, layer_weights->cols);

    outputs_T[l] = nnMatrixMake(layer_output_size, 1);

    // Allocate the gradient elements and vectors for weight delta calculation.
    nnGradientElements* elems = &gradient_elems[l];
    elems->type = activation;
    switch (activation) {
      case nnIdentity:
        // Gradient vector will be borrowed, no need to allocate.
        break;

      case nnSigmoid:
        elems->gradient = nnMatrixMake(1, layer_output_size);
        // Allocate the 1s vector.
        elems->sigmoid.ones = nnMatrixMake(1, layer_output_size);
        nnMatrixInitConstant(&elems->sigmoid.ones, 1);
        break;

      case nnRelu:
        elems->gradient = nnMatrixMake(1, layer_output_size);
        break;
    }
  }

  // Construct the query object with a size of 1 since we are training with one
  // sample at a time.
  nnQueryObject* query = nnMakeQueryObject(net, 1);

  // Network outputs are given by the query object. Every network query updates
  // the outputs.
  const nnMatrix* const training_outputs = query->network_outputs;

  // A vector to store the training input transposed.
  nnMatrix training_inputs_T = nnMatrixMake(inputs->cols, 1);

  // If debug mode is requested, we will show progress every Nth iteration.
  // Clamp the frame to 1 to avoid a modulo-by-zero for small iteration counts.
  const int progress_percent =
      params->max_iterations * PROGRESS_THRESHOLD / 100;
  const int progress_frame = (progress_percent < 1) ? 1 : progress_percent;

  // --- TRAIN

  nnInitNet(net, params->seed, params->weight_init);

  for (int iter = 0; iter < params->max_iterations; ++iter) {
    // For now, we train with one sample at a time.
    for (int sample = 0; sample < inputs->rows; ++sample) {
      // Slice the input and target matrices with the batch size.
      // We are not mutating the inputs, but we need the cast to borrow.
      nnMatrix training_inputs =
          nnMatrixBorrowRows((nnMatrix*)inputs, sample, 1);
      nnMatrix training_targets =
          nnMatrixBorrowRows((nnMatrix*)targets, sample, 1);

      // Will need the input transposed for backpropagation.
      // Assuming one training input per iteration for now.
      nnMatrixTranspose(&training_inputs, &training_inputs_T);

      // Run a forward pass and compute the output layer error.
      // We don't square the error here; we just compute t-o, which is part of
      // the derivative, -2(t-o). We compute o-t instead to remove the outer
      // negative sign.
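      // Worked out: for E = (t - o)^2 we have dE/do = -2(t - o) = 2(o - t), so
      // storing e = o - t (dropping the constant factor) lets the update step
      // below subtract the scaled weight deltas instead of adding them.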
      nnQuery(net, query, &training_inputs);
      //nnMatrixSub(&training_targets, training_outputs, &errors[net->num_layers - 1]);
      nnMatrixSub(
          training_outputs, &training_targets, &errors[net->num_layers - 1]);

      // Update outputs_T, which we need during weight updates.
      for (int l = 0; l < net->num_layers; ++l) {
        nnMatrixTranspose(&query->layer_outputs[l], &outputs_T[l]);
      }

      // Update weights and biases for each internal layer, backpropagating
      // errors along the way.
      for (int l = net->num_layers - 1; l >= 0; --l) {
        const nnMatrix* layer_output = &query->layer_outputs[l];
        nnMatrix* layer_weights = &net->weights[l];
        nnMatrix* layer_biases = &net->biases[l];

        nnGradientElements* elems = &gradient_elems[l];
        nnMatrix* gradient = &elems->gradient;
        const nnActivation activation = net->activations[l];

        // Compute the gradient (the part of the expression that does not
        // contain the output of the previous layer).
        //
        // Identity: G = error_k
        // Sigmoid:  G = error_k * output_k * (1 - output_k)
        // Relu:     G = error_k * (output_k > 0 ? 1 : 0)
        switch (activation) {
          case nnIdentity:
            // TODO: Just copy the pointer?
            *gradient = nnMatrixBorrow(&errors[l]);
            break;
          case nnSigmoid:
            nnMatrixSub(&elems->sigmoid.ones, layer_output, gradient);
            nnMatrixMulPairs(layer_output, gradient, gradient);
            nnMatrixMulPairs(&errors[l], gradient, gradient);
            break;
          case nnRelu:
            nnMatrixGt(layer_output, 0, gradient);
            nnMatrixMulPairs(&errors[l], gradient, gradient);
            break;
        }

        // Outer product to compute the weight deltas.
        const nnMatrix* output_T =
            (l == 0) ? &training_inputs_T : &outputs_T[l - 1];
        nnMatrixMul(output_T, gradient, &weight_deltas[l]);

        // Backpropagate the error before updating weights.
        if (l > 0) {
          nnMatrixMul(gradient, &weights_T[l], &errors[l - 1]);
        }

        // Update weights.
        nnMatrixScale(&weight_deltas[l], params->learning_rate);
        // The gradient has a negative sign from -(t - o), but we have computed
        // e = o - t instead, so we can subtract directly.
        //nnMatrixAdd(layer_weights, &weight_deltas[l], layer_weights);
        nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);

        // Update the weight transpose matrix for the next training iteration.
        nnMatrixTranspose(layer_weights, &weights_T[l]);

        // Update biases.
        // This is the same formula as for weights, except that the o_j term is
        // just 1. We can simply re-use the gradient that we have already
        // computed for the weight update.
        //nnMatrixMulAdd(layer_biases, gradient, params->learning_rate, layer_biases);
        nnMatrixMulSub(
            layer_biases, gradient, params->learning_rate, layer_biases);
      }

      // TODO: Add this under a verbose debugging mode.
      // if (params->debug) {
      //   LOGD("Iter: %d, Sample: %d, Error: %f\n",
      //        iter, sample, ComputeMSE(&errors[net->num_layers - 1]));
      //   LOGD("TGT: ");
      //   for (int i = 0; i < training_targets.cols; ++i) {
      //     printf("%.3f ", training_targets.values[i]);
      //   }
      //   printf("\n");
      //   LOGD("OUT: ");
      //   for (int i = 0; i < training_outputs->cols; ++i) {
      //     printf("%.3f ", training_outputs->values[i]);
      //   }
      //   printf("\n");
      // }
    }

    if (params->debug && ((iter % progress_frame) == 0)) {
      LOGD(
          "Iter: %d/%d, Error: %f\n", iter, params->max_iterations,
          ComputeMSE(&errors[net->num_layers - 1]));
    }
  }

  // Print the final error.
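  // This is the MSE of the output-layer error left over from the last sample
  // of the final iteration, computed with the same ComputeMSE() used for the
  // per-frame progress logs above.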
  if (params->debug) {
    LOGD(
        "Iter: %d/%d, Error: %f\n", params->max_iterations,
        params->max_iterations, ComputeMSE(&errors[net->num_layers - 1]));
  }

  for (int l = 0; l < net->num_layers; ++l) {
    nnMatrixDel(&errors[l]);
    nnMatrixDel(&outputs_T[l]);
    nnMatrixDel(&weights_T[l]);
    nnMatrixDel(&weight_deltas[l]);

    nnGradientElements* elems = &gradient_elems[l];
    switch (elems->type) {
      case nnIdentity:
        // Gradient vector is borrowed, no need to deallocate.
        break;

      case nnSigmoid:
        nnMatrixDel(&elems->gradient);
        nnMatrixDel(&elems->sigmoid.ones);
        break;

      case nnRelu:
        nnMatrixDel(&elems->gradient);
        break;
    }
  }

  nnMatrixDel(&training_inputs_T);

  free(errors);
  free(outputs_T);
  free(weights_T);
  free(weight_deltas);
  free(gradient_elems);
}
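
// Example usage sketch (illustrative only): it assumes an nnNeuralNetwork*
// |net| constructed elsewhere with input/output sizes matching the data, and
// uses only the types, functions, and parameter fields referenced in this
// file; the values are placeholders.
//
//   nnMatrix inputs  = nnMatrixMake(/*rows=*/num_samples, /*cols=*/input_size);
//   nnMatrix targets = nnMatrixMake(/*rows=*/num_samples, /*cols=*/output_size);
//   // ... fill |inputs| and |targets|, one sample per row ...
//
//   const nnTrainingParams params = {
//       .learning_rate  = 0.1,
//       .max_iterations = 1000,
//       .seed           = 42,
//       .weight_init    = nnWeightInitNormal,
//       .debug          = true,
//   };
//   nnTrain(net, &inputs, &targets, &params);
//
//   nnMatrixDel(&inputs);
//   nnMatrixDel(&targets);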