#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 29 18:43:24 2021

This is an example of a feedforward 3-layer network coded from scratch.
This is not a good example of Python style, because I want to highlight the
linear algebra coming from the layers and the training. The code should read
just like the linear algebra equations in the notes.

Built-ins used: train_test_split, StandardScaler, confusion_matrix
"""
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler


def sigmoid(z):
    """The sigmoid function."""
    return 1.0/(1.0+np.exp(-z))


def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z)*(1-sigmoid(z))


#%% Load and prep the data
Z=load_iris()
X=Z.data
t=Z.target
temp=np.identity(3)
T=temp[:,t]                 # one-hot encode the targets: T is 3 x 150
scaler=StandardScaler()
Xs=scaler.fit_transform(X)
del Z,temp

Xtrain, Xtest, Ttrain, Ttest = train_test_split(Xs, T.T, test_size=0.3,
                                                random_state=42)
# Store the data with one sample per column, to match the notes.
Xtrain=Xtrain.T
Xtest=Xtest.T
Ttrain=Ttrain.T
Ttest=Ttest.T

#%% Training setup for the neural network:
alpha=0.1       # Learning rate for Stochastic Gradient Descent
NumEpochs=800   # Number of epochs
xdim,Numpts=Xtrain.shape
tdim,temp=Ttrain.shape
NetNodes=[xdim, 10, tdim]   # input, hidden, and output layer sizes

#%% Define/Initialize structures of the network (probably optional)
P1=np.zeros((NetNodes[1],1))
S1=np.zeros((NetNodes[1],1))
dS1=np.zeros((NetNodes[1],1))
Delta1=np.zeros((NetNodes[1],1))
P2=np.zeros((NetNodes[2],1))
S2=np.zeros((NetNodes[2],1))
dS2=np.zeros((NetNodes[2],1))
Delta2=np.zeros((NetNodes[2],1))

W1=np.random.randn(NetNodes[1],NetNodes[0])
b1=np.random.randn(NetNodes[1],1)
W2=np.random.randn(NetNodes[2],NetNodes[1])
b2=np.random.randn(NetNodes[2],1)

err=np.zeros((1,NumEpochs))

#%% Main training loop
for j in range(NumEpochs):
    err[0,j]=0                          # accumulates error over all data points
    for k in range(Numpts):
        # Forward pass (sigmoid hidden layer, linear output layer):
        temp=Xtrain[:,k]
        P1=W1 @ temp[:,np.newaxis] + b1
        S1=sigmoid(P1)
        dS1=sigmoid_prime(P1)
        P2=W2 @ S1 + b2
        S2=P2                           # identity activation on the output
        dS2=np.ones(P2.shape)           # derivative of the identity is all ones
        # Backwards pass:
        temp2=Ttrain[:,k]
        Delta2=temp2[:,np.newaxis]-S2
        Delta1=(W2.T @ Delta2)*dS1
        # Update weights and biases:
        dW1=Delta1 @ temp[:,np.newaxis].T
        db1=Delta1
        dW2=Delta2 @ S1.T
        db2=Delta2
        W1=W1+alpha*dW1
        b1=b1+alpha*db1
        W2=W2+alpha*dW2
        b2=b2+alpha*db2
        err[0,j]=err[0,j]+np.linalg.norm(temp2[:,np.newaxis]-S2)

# Although we shouldn't really report this, sometimes it's good to know that
# the error on the training set is small, to be sure the algorithm is correct.
Ztrain=W2 @ sigmoid(W1 @ Xtrain + b1) + b2
t1=np.argmax(Ztrain,axis=0)
t2=np.argmax(Ttrain,axis=0)
Ct=confusion_matrix(t2,t1)              # confusion_matrix(y_true, y_pred)
print('Confusion matrix on the training set - only for code verification')
print(Ct)

# Now compute our "real" estimate of the error:
Zout=W2 @ sigmoid(W1 @ Xtest + b1) + b2
t1=np.argmax(Zout,axis=0)
t2=np.argmax(Ttest,axis=0)
C=confusion_matrix(t2,t1)
print('Confusion matrix on the test set')
print(C)
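
#%% Optional: plot the per-epoch training error
# A minimal sketch, assuming matplotlib is available (it is not imported above).
# err[0, j] holds the accumulated per-sample error norm for epoch j, so this
# just visualizes how the SGD error decreases over training.
import matplotlib.pyplot as plt

plt.plot(np.arange(NumEpochs), err[0, :])
plt.xlabel('Epoch')
plt.ylabel('Accumulated error norm')
plt.title('SGD training error per epoch')
plt.show()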