#include "neuralnet_impl.h"

// Standard headers for assert, sqrt, calloc/free, printf, and uint64_t; the
// project-specific matrix and RNG declarations are assumed to come in through
// "neuralnet_impl.h".
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define LOGD printf

// If debug mode is requested, we will show progress every time this percentage
// of the total iterations has elapsed.
static const int PROGRESS_THRESHOLD = 5;  // %

/// Computes the MSE of the output error matrix.
R ComputeMSE(const nnMatrix* errors) {
  R sum_sq = 0;
  const int N = errors->rows * errors->cols;
  const R* value = errors->values;
  for (int i = 0; i < N; ++i) {
    sum_sq += *value * *value;
    value++;
  }
  return sum_sq / (R)N;
}

/// Holds the bits required to compute a sigmoid gradient.
typedef struct nnSigmoidGradientElements {
  nnMatrix ones;  // A vector of just ones, same size as the layer.
} nnSigmoidGradientElements;

/// Holds the various elements required to compute gradients. These depend on
/// which activation function is used, so they will potentially be different
/// for each layer. A data type is defined for these because we allocate all
/// the required memory up front before entering the training loop.
typedef struct nnGradientElements {
  nnActivation type;

  // Gradient vector, same size as the layer.
  // This will contain the gradient expression except for the output value of
  // the previous layer.
  nnMatrix gradient;

  union {
    nnSigmoidGradientElements sigmoid;
  };
} nnGradientElements;

// Initialize the network's weights randomly and set their biases to 0.
void nnInitNet(
    nnNeuralNetwork* net, uint64_t seed, const nnWeightInitStrategy strategy) {
  assert(net);

  mt19937_64 rng = mt19937_64_make();
  mt19937_64_init(&rng, seed);

  for (int l = 0; l < net->num_layers; ++l) {
    nnMatrix* weights = &net->weights[l];
    nnMatrix* biases = &net->biases[l];

    const R layer_size = (R)nnLayerInputSize(weights);
    const R scale = 1. / layer_size;
    const R stdev = 1. / sqrt((R)layer_size);
    const R sigma = stdev * stdev;

    R* value = weights->values;
    for (int k = 0; k < weights->rows * weights->cols; ++k) {
      switch (strategy) {
        case nnWeightInit01: {
          const R x01 = mt19937_64_gen_real3(&rng);  // (0, +1) interval.
          *value++ = scale * x01;
          break;
        }
        case nnWeightInit11: {
          const R x11 = mt19937_64_gen_real4(&rng);  // (-1, +1) interval.
          *value++ = scale * x11;
          break;
        }
        case nnWeightInitNormal: {
          // Initialize with a normal distribution of standard deviation
          // 1 / sqrt(layer_input_size) to prevent saturation when multiplying
          // inputs.
          const R u01 = mt19937_64_gen_real3(&rng);  // (0, +1) interval.
          const R v01 = mt19937_64_gen_real3(&rng);  // (0, +1) interval.
          R z0, z1;
          normal2(u01, v01, &z0, &z1);
          z0 = normal_transform(z0, /*mu=*/0, sigma);
          z1 = normal_transform(z1, /*mu=*/0, sigma);
          *value++ = z0;
          if (k < weights->rows * weights->cols - 1) {
            *value++ = z1;
            ++k;
          }
          break;
        }
        default:
          assert(false);
      }
    }

    // Initialize biases.
    // 0 is used so that the layer functions initially go through the origin.
    value = biases->values;
    for (int k = 0; k < biases->rows * biases->cols; ++k, ++value) {
      *value = 0;
    }
  }
}

// |inputs| has one row vector per sample.
// |targets| has one row vector per sample.
//
// For now, each iteration trains with one sample (row) at a time.
void nnTrain(
    nnNeuralNetwork* net, const nnMatrix* inputs, const nnMatrix* targets,
    const nnTrainingParams* params) {
  assert(net);
  assert(inputs);
  assert(targets);
  assert(params);
  assert(nnNetOutputSize(net) == targets->cols);
  assert(net->num_layers > 0);

  // Allocate error vectors to hold the backpropagated error values.
  // For now, these are one row vector per layer, meaning that we will train
  // with one sample at a time.
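  // Each errors[l] is therefore a 1 x layer_output_size row vector, matching
  // the row-vector layout of a single sample's outputs (see the allocations
  // below).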
  nnMatrix* errors = calloc(net->num_layers, sizeof(nnMatrix));

  // Allocate the weight transpose matrices up front for backpropagation.
  nnMatrix* weights_T = calloc(net->num_layers, sizeof(nnMatrix));

  // Allocate the weight delta matrices.
  nnMatrix* weight_deltas = calloc(net->num_layers, sizeof(nnMatrix));

  // Allocate the data structures required to compute gradients.
  // This depends on each layer's activation type.
  nnGradientElements* gradient_elems =
      calloc(net->num_layers, sizeof(nnGradientElements));

  // Allocate the output transpose vectors for weight delta calculation.
  // This is one column vector per layer.
  nnMatrix* outputs_T = calloc(net->num_layers, sizeof(nnMatrix));

  assert(errors != 0);
  assert(weights_T != 0);
  assert(weight_deltas != 0);
  assert(gradient_elems != 0);
  assert(outputs_T != 0);

  for (int l = 0; l < net->num_layers; ++l) {
    const nnMatrix* layer_weights = &net->weights[l];
    const int layer_output_size = net->weights[l].cols;
    const nnActivation activation = net->activations[l];

    errors[l] = nnMatrixMake(1, layer_weights->cols);

    weights_T[l] = nnMatrixMake(layer_weights->cols, layer_weights->rows);
    nnMatrixTranspose(layer_weights, &weights_T[l]);

    weight_deltas[l] = nnMatrixMake(layer_weights->rows, layer_weights->cols);

    outputs_T[l] = nnMatrixMake(layer_output_size, 1);

    // Allocate the gradient elements and vectors for weight delta calculation.
    nnGradientElements* elems = &gradient_elems[l];
    elems->type = activation;
    switch (activation) {
      case nnIdentity:
        // Gradient vector will be borrowed, no need to allocate.
        break;

      case nnSigmoid:
        elems->gradient = nnMatrixMake(1, layer_output_size);
        // Allocate the 1s vector.
        elems->sigmoid.ones = nnMatrixMake(1, layer_output_size);
        nnMatrixInitConstant(&elems->sigmoid.ones, 1);
        break;

      case nnRelu:
        elems->gradient = nnMatrixMake(1, layer_output_size);
        break;
    }
  }

  // Construct the query object with a size of 1 since we are training with one
  // sample at a time.
  nnQueryObject* query = nnMakeQueryObject(net, 1);

  // Network outputs are given by the query object. Every network query updates
  // the outputs.
  const nnMatrix* const training_outputs = query->network_outputs;

  // A vector to store the training input transposed.
  nnMatrix training_inputs_T = nnMatrixMake(inputs->cols, 1);

  // If debug mode is requested, we will show progress every Nth iteration.
  // Clamp the frame to 1 to avoid a modulo-by-zero for small iteration counts.
  const int progress_percent =
      params->max_iterations * PROGRESS_THRESHOLD / 100;
  const int progress_frame = (progress_percent < 1) ? 1 : progress_percent;

  // --- TRAIN

  nnInitNet(net, params->seed, params->weight_init);

  for (int iter = 0; iter < params->max_iterations; ++iter) {
    // For now, we train with one sample at a time.
    for (int sample = 0; sample < inputs->rows; ++sample) {
      // Slice the input and target matrices with the batch size.
      // We are not mutating the inputs, but we need the cast to borrow.
      nnMatrix training_inputs =
          nnMatrixBorrowRows((nnMatrix*)inputs, sample, 1);
      nnMatrix training_targets =
          nnMatrixBorrowRows((nnMatrix*)targets, sample, 1);

      // Will need the input transposed for backpropagation.
      // Assuming one training input per iteration for now.
      nnMatrixTranspose(&training_inputs, &training_inputs_T);

      // Run a forward pass and compute the output layer error.
      // We don't square the error here; we just compute t-o, which is part of
      // the derivative, -2(t-o). We compute o-t instead to remove the outer
      // negative sign.
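      // Worked out: for E = (t - o)^2 we have dE/do = -2(t - o) = 2(o - t), so
      // storing e = o - t (dropping the constant factor) lets the update step
      // below subtract the scaled weight deltas instead of adding them.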
      nnQuery(net, query, &training_inputs);
      //nnMatrixSub(&training_targets, training_outputs, &errors[net->num_layers - 1]);
      nnMatrixSub(
          training_outputs, &training_targets, &errors[net->num_layers - 1]);

      // Update outputs_T, which we need during weight updates.
      for (int l = 0; l < net->num_layers; ++l) {
        nnMatrixTranspose(&query->layer_outputs[l], &outputs_T[l]);
      }

      // Update weights and biases for each internal layer, backpropagating
      // errors along the way.
      for (int l = net->num_layers - 1; l >= 0; --l) {
        const nnMatrix* layer_output = &query->layer_outputs[l];
        nnMatrix* layer_weights = &net->weights[l];
        nnMatrix* layer_biases = &net->biases[l];

        nnGradientElements* elems = &gradient_elems[l];
        nnMatrix* gradient = &elems->gradient;
        const nnActivation activation = net->activations[l];

        // Compute the gradient (the part of the expression that does not
        // contain the output of the previous layer).
        //
        // Identity: G = error_k
        // Sigmoid:  G = error_k * output_k * (1 - output_k)
        // Relu:     G = error_k * (output_k > 0 ? 1 : 0)
        switch (activation) {
          case nnIdentity:
            // TODO: Just copy the pointer?
            *gradient = nnMatrixBorrow(&errors[l]);
            break;
          case nnSigmoid:
            nnMatrixSub(&elems->sigmoid.ones, layer_output, gradient);
            nnMatrixMulPairs(layer_output, gradient, gradient);
            nnMatrixMulPairs(&errors[l], gradient, gradient);
            break;
          case nnRelu:
            nnMatrixGt(layer_output, 0, gradient);
            nnMatrixMulPairs(&errors[l], gradient, gradient);
            break;
        }

        // Outer product to compute the weight deltas.
        const nnMatrix* output_T =
            (l == 0) ? &training_inputs_T : &outputs_T[l - 1];
        nnMatrixMul(output_T, gradient, &weight_deltas[l]);

        // Backpropagate the error before updating weights.
        if (l > 0) {
          nnMatrixMul(gradient, &weights_T[l], &errors[l - 1]);
        }

        // Update weights.
        nnMatrixScale(&weight_deltas[l], params->learning_rate);
        // The gradient has a negative sign from -(t - o), but we have computed
        // e = o - t instead, so we can subtract directly.
        //nnMatrixAdd(layer_weights, &weight_deltas[l], layer_weights);
        nnMatrixSub(layer_weights, &weight_deltas[l], layer_weights);

        // Update the weight transpose matrix for the next training iteration.
        nnMatrixTranspose(layer_weights, &weights_T[l]);

        // Update biases.
        // This is the same formula as for weights, except that the o_j term is
        // just 1. We can simply re-use the gradient that we have already
        // computed for the weight update.
        //nnMatrixMulAdd(layer_biases, gradient, params->learning_rate, layer_biases);
        nnMatrixMulSub(
            layer_biases, gradient, params->learning_rate, layer_biases);
      }

      // TODO: Add this under a verbose debugging mode.
      // if (params->debug) {
      //   LOGD("Iter: %d, Sample: %d, Error: %f\n",
      //        iter, sample, ComputeMSE(&errors[net->num_layers - 1]));
      //   LOGD("TGT: ");
      //   for (int i = 0; i < training_targets.cols; ++i) {
      //     printf("%.3f ", training_targets.values[i]);
      //   }
      //   printf("\n");
      //   LOGD("OUT: ");
      //   for (int i = 0; i < training_outputs->cols; ++i) {
      //     printf("%.3f ", training_outputs->values[i]);
      //   }
      //   printf("\n");
      // }
    }

    if (params->debug && ((iter % progress_frame) == 0)) {
      LOGD(
          "Iter: %d/%d, Error: %f\n", iter, params->max_iterations,
          ComputeMSE(&errors[net->num_layers - 1]));
    }
  }

  // Print the final error.
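  // This is the MSE of the output-layer error left over from the last sample
  // of the final iteration, computed with the same ComputeMSE() used for the
  // per-frame progress logs above.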
  if (params->debug) {
    LOGD(
        "Iter: %d/%d, Error: %f\n", params->max_iterations,
        params->max_iterations, ComputeMSE(&errors[net->num_layers - 1]));
  }

  for (int l = 0; l < net->num_layers; ++l) {
    nnMatrixDel(&errors[l]);
    nnMatrixDel(&outputs_T[l]);
    nnMatrixDel(&weights_T[l]);
    nnMatrixDel(&weight_deltas[l]);

    nnGradientElements* elems = &gradient_elems[l];
    switch (elems->type) {
      case nnIdentity:
        // Gradient vector is borrowed, no need to deallocate.
        break;

      case nnSigmoid:
        nnMatrixDel(&elems->gradient);
        nnMatrixDel(&elems->sigmoid.ones);
        break;

      case nnRelu:
        nnMatrixDel(&elems->gradient);
        break;
    }
  }

  nnMatrixDel(&training_inputs_T);

  free(errors);
  free(outputs_T);
  free(weights_T);
  free(weight_deltas);
  free(gradient_elems);
}
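
// Example usage sketch (illustrative only): it assumes an nnNeuralNetwork*
// |net| constructed elsewhere with input/output sizes matching the data, and
// uses only the types, functions, and parameter fields referenced in this
// file; the values are placeholders.
//
//   nnMatrix inputs  = nnMatrixMake(/*rows=*/num_samples, /*cols=*/input_size);
//   nnMatrix targets = nnMatrixMake(/*rows=*/num_samples, /*cols=*/output_size);
//   // ... fill |inputs| and |targets|, one sample per row ...
//
//   const nnTrainingParams params = {
//       .learning_rate  = 0.1,
//       .max_iterations = 1000,
//       .seed           = 42,
//       .weight_init    = nnWeightInitNormal,
//       .debug          = true,
//   };
//   nnTrain(net, &inputs, &targets, &params);
//
//   nnMatrixDel(&inputs);
//   nnMatrixDel(&targets);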