Neural Network from Scratch

This code demonstrates how to build a neural network from scratch without relying on any deep-learning framework. (of course we use NumPy :) )

1) Addition and Multiplication are represented as Gates
2) Sigmoid and Tanh are implemented as separate activation units
3) Softmax with Cross Entropy Loss is used.
   Refer to the differentiation of the Cross Entropy Loss to grasp the details of the Softmax class (a small numeric check of that gradient follows this list)
4) Complete Backpropagation is implemented using a backward() method (similar to PyTorch)
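
For point 3, here is a minimal, self-contained sketch (independent of the classes defined below, with made-up scores) of the key fact: the gradient of the cross-entropy loss with respect to the softmax inputs is the vector of softmax probabilities with 1 subtracted at the true class. It is verified against central finite differences.

import numpy as np

scores = np.array([[2.0, 1.0, 0.1]])    # raw class scores for one example (made-up numbers)
y_true = np.array([0])                  # index of the correct class

def cross_entropy(s):
    p = np.exp(s) / np.sum(np.exp(s), axis=1, keepdims=True)
    return -np.log(p[range(s.shape[0]), y_true]).sum()

# analytic gradient: softmax probabilities with 1 subtracted at the true class
probs = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)
analytic = probs.copy()
analytic[range(scores.shape[0]), y_true] -= 1

# central finite differences for comparison
eps = 1e-6
numeric = np.zeros_like(scores)
for j in range(scores.shape[1]):
    bump = np.zeros_like(scores)
    bump[0, j] = eps
    numeric[0, j] = (cross_entropy(scores + bump) - cross_entropy(scores - bump)) / (2 * eps)

print(np.allclose(analytic, numeric, atol=1e-5))    # expected: True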
In [1]:
import numpy as np
class MultiplyGate:
    def forward(self,W,X):
        # X: (batch, in_dim), W: (in_dim, out_dim) -> returns X.W
        return np.dot(X,W)
    
    def backward(self,W,X,dZ):
        # gradients of X.W with respect to W and X, given the upstream gradient dZ
        dW = np.dot(X.T,dZ)
        dX = np.dot(dZ,W.T)
        return dW,dX
    
class AddGate:
    def forward(self,X,b):
        # broadcasts the bias row vector b over the batch
        return X+b
    
    def backward(self,X,b,dZ):
        # the bias gradient sums dZ over the batch; dX passes dZ through unchanged
        db = np.dot(np.ones((1,dZ.shape[0])),dZ)
        dX = dZ*np.ones_like(X)
        return db,dX
        
class Sigmoid:
    def forward(self,X):
        return 1.0/(1.0+np.exp(-X))
    def backward(self,X,top_diff):
        # sigmoid'(x) = s*(1-s), chained with the upstream gradient
        output = self.forward(X)
        return output*(1-output)*top_diff

class Tanh:
    def forward(self,X):
        return np.tanh(X)
    def backward(self,X,top_diff):
        # tanh'(x) = 1 - tanh(x)^2, chained with the upstream gradient
        output = self.forward(X)
        return (1.0-np.square(output))*top_diff
    
class Softmax:
    def predict(self,X):
        # row-wise softmax over the class scores
        exp_scores = np.exp(X)
        return exp_scores/np.sum(exp_scores,axis=1,keepdims=True)
    
    def loss(self,X,y):
        # average cross-entropy loss over the batch
        num_examples = X.shape[0]
        probs = self.predict(X)
        correct_logprobs = -np.log(probs[range(num_examples),y])
        data_loss = np.sum(correct_logprobs)
        return (1./num_examples)*data_loss
    
    def diff(self,X,y):
        # gradient of the cross-entropy loss w.r.t. the scores: probs - one_hot(y)
        num_examples = X.shape[0]
        probs = self.predict(X)
        probs[range(num_examples),y] -= 1
        return probs
    
    
    
        

    
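As a quick, optional sanity check of the backward() methods above (not part of the original notebook), the analytic gradient of a single MultiplyGate -> AddGate -> Tanh layer can be compared against central finite differences on tiny made-up arrays; the shapes, seed, and tolerance are arbitrary choices for illustration.

np.random.seed(1)
X_toy = np.random.randn(4, 3)      # 4 examples, 3 features (arbitrary sizes)
W_toy = np.random.randn(3, 2)
b_toy = np.random.randn(1, 2)

mul_gate, add_gate, tanh_unit = MultiplyGate(), AddGate(), Tanh()

def layer_sum(W_):
    # scalar function: sum of the layer output, so the upstream gradient is all ones
    return np.sum(tanh_unit.forward(add_gate.forward(mul_gate.forward(W_, X_toy), b_toy)))

# analytic gradient w.r.t. W_toy via the backward() methods
mul = mul_gate.forward(W_toy, X_toy)
add = add_gate.forward(mul, b_toy)
out = tanh_unit.forward(add)
dadd = tanh_unit.backward(add, np.ones_like(out))
db, dmul = add_gate.backward(mul, b_toy, dadd)
dW, dX = mul_gate.backward(W_toy, X_toy, dmul)

# central finite differences w.r.t. W_toy for comparison
eps = 1e-6
dW_num = np.zeros_like(W_toy)
for i in range(W_toy.shape[0]):
    for j in range(W_toy.shape[1]):
        bump = np.zeros_like(W_toy)
        bump[i, j] = eps
        dW_num[i, j] = (layer_sum(W_toy + bump) - layer_sum(W_toy - bump)) / (2 * eps)

print(np.allclose(dW, dW_num, atol=1e-5))    # expected: True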
In [2]:
class Model:
    def __init__(self,layers_dim):
        self.W=[]
        self.b=[]
        for i in range(len(layers_dim)-1):
            self.W.append(np.random.randn(layers_dim[i],layers_dim[i+1])/np.sqrt(layers_dim[i]))
            self.b.append(np.random.randn(layers_dim[i+1]).reshape(1,layers_dim[i+1]))
            
    def calculate_loss(self,X,y):
        mulGate = MultiplyGate()
        addGate = AddGate()
        layer = Tanh()
        softmaxOutput = Softmax()
        
        input = X
        for i in range(len(self.W)):
            mul = mulGate.forward(self.W[i],input)
            add = addGate.forward(mul,self.b[i])
            input = layer.forward(add)
        
        return softmaxOutput.loss(input,y)
    
    def predict(self,X):
        mulGate = MultiplyGate()
        addGate = AddGate()
        layer = Tanh()
        softmaxOutput = Softmax()
        
        input = X
        for i in range(len(self.W)):
            mul = mulGate.forward(self.W[i],input)
            add = addGate.forward(mul,self.b[i])
            input = layer.forward(add)
        probs = softmaxOutput.predict(input)
        
        return np.argmax(probs,axis=1)
    
    def train(self, X, y, num_passes=20000, epsilon=0.01, reg_lambda=0.01, print_loss=False):
        mulGate = MultiplyGate()
        addGate = AddGate()
        layer = Tanh()
        softmaxOutput = Softmax()

        for epoch in range(num_passes):
            # Forward propagation
            input = X
            forward = [(None, None, input)]
            for i in range(len(self.W)):
                mul = mulGate.forward(self.W[i], input)
                add = addGate.forward(mul, self.b[i])
                input = layer.forward(add)
                forward.append((mul, add, input))

            # Back propagation
            dtanh = softmaxOutput.diff(forward[len(forward)-1][2], y)
            for i in range(len(forward)-1, 0, -1):
                dadd = layer.backward(forward[i][1], dtanh)
                db, dmul = addGate.backward(forward[i][0], self.b[i-1], dadd)
                dW, dtanh = mulGate.backward(self.W[i-1], forward[i-1][2], dmul)
                # Add regularization to the weight gradient (bias terms are not regularized)
                dW += reg_lambda * self.W[i-1]
                # Gradient descent parameter update
                self.b[i-1] += -epsilon * db
                self.W[i-1] += -epsilon * dW

            if print_loss and epoch % 1000 == 0:
                print("Loss after iteration %i: %f" %(epoch, self.calculate_loss(X, y)))
        
        
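Before training, one optional sanity check (an addition, not in the original notebook) is that the loss of a freshly initialized Model on random data sits near ln(number of classes), since an untrained network should output roughly uniform class probabilities. The toy data below is made up purely for this check.

np.random.seed(0)
X_check = np.random.randn(50, 2)          # 50 random 2-D points (made up)
y_check = np.random.randint(0, 2, 50)     # random binary labels
sanity_model = Model([2, 3, 2])
# expected: close to ln(2) ~ 0.69 when the outputs are near-uniform (exact value depends on the random init)
print(sanity_model.calculate_loss(X_check, y_check))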
In [13]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model

# Generate a dataset and plot it
np.random.seed(0)
X, y = sklearn.datasets.make_moons(200, noise=0.20)
plt.scatter(X[:,0], X[:,1], s=40, c=y, cmap=plt.cm.Spectral)
plt.show()

layers_dim = [2,3,2]

model = Model(layers_dim)
model.train(X, y, num_passes=3000, epsilon=0.01, reg_lambda=0.01, print_loss=True)
Loss after iteration 0: 0.700172
Loss after iteration 1000: 0.322803
Loss after iteration 2000: 0.192783
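A simple way to quantify the fit (not shown in the original run) is the training-set accuracy obtained from predict():

train_acc = np.mean(model.predict(X) == y)
print("Training accuracy: %.3f" % train_acc)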
In [14]:
import matplotlib.pyplot as plt
def plot_decision_boundary(pred_func, X, y):
    # Set min and max values and give it some padding
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
In [15]:
plot_decision_boundary(lambda x: model.predict(x), X, y)
plt.title("Decision Boundary for hidden layer size 3")
plt.show()