#!/usr/bin/python3
# vim: set fileencoding=utf-8 :
import logging

import numpy as np

logger = logging.getLogger(__name__)


class Network:
    """ a simple neural network implementation """

    def __init__(self, layers, activations, seed=None):
        assert len(activations) == len(layers) - 1
        self.activations = []
        self.biases = []
        self.costs = []
        self.derivatives = []
        self.epochs = 0
        self.layers = layers
        self.weights = []

        # look up the activation functions and their derivatives on this class
        for name in activations:
            activation = getattr(self, f'activation_{name}')
            derivative = getattr(self, f'derivative_{name}')
            self.activations.append(activation)
            self.derivatives.append(derivative)

        # initialize weights and biases
        previous = None
        if seed is not None:
            # seed the random number generator to get reproducible results
            np.random.seed(seed)

        for i, layer in enumerate(layers):
            if i == 0:
                previous = layer
                continue

            # initialize with random parameters in [-1, 1) (to break symmetry)
            self.biases.append(2 * np.random.random((layer, 1)) - 1)
            self.weights.append(2 * np.random.random((layer, previous)) - 1)
            previous = layer

    def train(self, x_ik, y_ik, epochs=10000, learning_rate=0.1):
        for epoch in range(epochs):
            self.epochs += 1

            # propagate all data and obtain the results of the NN
            # for the whole dataset (usually done in batches)
            z_ik, a_ik = self.forward_propagation(x_ik, train=True)

            # calculate the gradients via back propagation
            partial_w, partial_b = self.back_propagation(y_ik, z_ik, a_ik)

            # modify weights and biases by the calculated gradients
            self.weights = [w - learning_rate * dw for w, dw in zip(self.weights, partial_w)]
            self.biases = [b - learning_rate * db for b, db in zip(self.biases, partial_b)]

    def back_propagation(self, y_ik, z_ik, a_ik):
        """ calculate gradients of the weights and biases """
        # calculate batch size and shape target data
        y_ik = np.array(y_ik)
        batch_size = y_ik.size // self.layers[-1]
        y_ik = y_ik.reshape((self.layers[-1], batch_size))

        # save derivatives for each layer
        dw = []  # dC/dW
        db = []  # dC/dB
        d_ik = list(None for _ in range(len(self.layers) - 1))  # prefill

        # please refer to the course to understand the maths
        # the selected variable names and indices are consistent with the course
        # all variables are lists of matrices; in matrix multiplication the
        # indices change: a_ij * b_jk = c_ik (and b_jk * a_ij != c_ik)
        # the order of multiplication matters here, as well as the "order" of the indices
        # - if the formula says: a_ij * b_kj = c_ki
        # - you need to transpose a_ij and calculate: b_kj * (a_ij).T

        # last layer: error of the output times the derivative of its activation
        d_ik[-1] = (a_ik[-1] - y_ik) * self.derivatives[-1](z_ik[-1])

        for l in reversed(range(1, len(d_ik))):
            # back propagation: push the error one layer towards the input
            d_ik[l - 1] = self.weights[l].T.dot(d_ik[l]) * self.derivatives[l - 1](z_ik[l - 1])

        # calculate gradients, averaged over the batch
        for layer, delta in enumerate(d_ik, 1):
            dw.append(np.dot(delta, a_ik[layer - 1].T) / float(batch_size))
            db.append(np.dot(delta, np.ones((batch_size, 1))) / float(batch_size))

        # return the derivatives with respect to the weight matrices and biases
        return dw, db

    def forward_propagation(self, x_ik, train=False):
        """ calculate the results of the neural network """
        # calculate batch size and shape input data
        a = np.array(x_ik)
        batch_size = a.size // self.layers[0]
        if a.shape != (self.layers[0], batch_size):
            # logger.warning(
            #     "x_ik needs to be reshaped from %s to (%s, %s)",
            #     a.shape,
            #     self.layers[0],
            #     batch_size,
            # )
            a = a.reshape((self.layers[0], batch_size))

        # save pre-activations and activations for each layer
        z_ik = []
        a_ik = [a]

        # do forward propagation
        for b, w, phi in zip(self.biases, self.weights, self.activations):
            z = np.dot(w, a) + b
            z_ik.append(z)
            a = phi(z)
            a_ik.append(a)

        if train:
            return z_ik, a_ik

        # a is transposed to change its shape:
        # each row now represents the solution for the corresponding input sample
        return a.T

    @staticmethod
    def activation_sigmoid(x):
        """ Sigmoid activation function """
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def activation_linear(x):
        """ Linear activation function """
        return x

    @staticmethod
    def activation_relu(x):
        """ ReLU activation function """
        return np.where(x >= 0, x, 0)

    @staticmethod
    def activation_tanh(x):
        """ Tanh activation function """
        return np.tanh(x)

    @staticmethod
    def activation_leakyrelu(x):
        """ Leaky ReLU activation function """
        return np.where(x >= 0, x, x * 0.01)

    @classmethod
    def derivative_sigmoid(cls, x):
        """ first derivative of the sigmoid activation function """
        return cls.activation_sigmoid(x) * (1 - cls.activation_sigmoid(x))

    @staticmethod
    def derivative_linear(x):
        """ first derivative of the linear activation function """
        return np.ones(x.shape)

    @staticmethod
    def derivative_relu(x):
        """ first derivative of the ReLU activation function """
        return np.where(x > 0, 1, 0)

    @staticmethod
    def derivative_tanh(x):
        """ first derivative of the Tanh activation function """
        return 1 - np.tanh(x)**2

    @staticmethod
    def derivative_leakyrelu(x):
        """ first derivative of the leaky ReLU activation function """
        # slope 0.01 for negative inputs, matching activation_leakyrelu
        return np.where(x > 0, 1, 0.01)


if __name__ == '__main__':
    nn = Network([1, 2, 1], activations=["sigmoid", "linear"], seed=1)
    x = [0.4, 0.5, 0.7, 0.9]
    y = [0.2, 0.6, 0.5, 0.2]
    for n in range(11):
        nn.train(x, y, epochs=1, learning_rate=1.0)
        print(f'{nn.epochs:d}')
    print(nn.weights)
    print(nn.biases)
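
    # A minimal usage sketch, not part of the original demo: after training,
    # forward_propagation() called without train=True returns the network
    # outputs with one row per input sample, so they can be compared against
    # the targets y. Variable names below (predictions, x_n, y_n, p_n) are
    # illustrative additions, not names from the original script.
    predictions = nn.forward_propagation(x)
    for x_n, y_n, p_n in zip(x, y, predictions):
        print(f'input={x_n:.2f} target={y_n:.2f} prediction={p_n[0]:.4f}')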