CSC 578 Spring 2024

NN578_network2.ipynb

In [1]:
import random
import json
import numpy as np

random.seed(17)

Definitions of cost functions (as function classes)

In [2]:
class QuadraticCost(object):
    @staticmethod
    def fn(a, y):
        """Return the cost associated with an output ``a`` and desired output ``y``."""
        return 0.5*np.linalg.norm(y-a)**2

    @staticmethod
    def derivative(a, y):
        """Return the first derivative of the function."""
        return -(y-a)

class CrossEntropyCost(object):
    @staticmethod
    def fn(a, y):
        """Return the cost associated with an output ``a`` and desired output
        ``y``.  Note that np.nan_to_num is used to ensure numerical
        stability.  In particular, if both ``a`` and ``y`` have a 1.0
        in the same slot, then the expression (1-y)*np.log(1-a)
        returns nan.  The np.nan_to_num ensures that that is converted
        to the correct value (0.0)."""
        return np.sum(np.nan_to_num(-y*np.log(a)-(1-y)*np.log(1-a)))

    @staticmethod
    def derivative(a, y):
        """Return the first derivative of the function."""
        ###
        ### FILL IN HERE
        ###
        pass

class LogLikelihood(object):
    @staticmethod
    def fn(a, y):
        ###
        ### FILL IN HERE
        ###
        pass

    @staticmethod
    def derivative(a, y):
        ###
        ### FILL IN HERE
        ###
        pass
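
# A minimal reference sketch (hypothetical helpers, not necessarily the expected
# solution) of the pieces left as FILL IN HERE above, assuming ``a`` and ``y``
# are column vectors and ``y`` is one-hot.
def _example_cross_entropy_derivative(a, y):
    # dC/da for cross-entropy is -(y/a) + (1-y)/(1-a); nan_to_num guards 0/0.
    return np.nan_to_num(-(y / a) + (1 - y) / (1 - a))

def _example_log_likelihood_cost(a, y):
    # C = -ln(a_k), where k is the index of the correct class in one-hot ``y``.
    return float(-np.log(a[np.argmax(y), 0]))

def _example_log_likelihood_derivative(a, y):
    # dC/da is -1/a_k at the correct class and 0 elsewhere, i.e. -y/a elementwise.
    return np.nan_to_num(-y / a)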

Definitions of activation functions (as function classes)

In [3]:
class Sigmoid(object):
    @staticmethod
    def fn(z):
        """The sigmoid function."""
        return 1.0/(1.0+np.exp(-z))

    @classmethod
    def derivative(cls,z):
        """Derivative of the sigmoid function."""
        return cls.fn(z)*(1-cls.fn(z))

class Softmax(object):
    @staticmethod
    def fn(z):
        """The softmax of vector z.
        Parameter z is an array of shape (len(z), 1)."""
        ###
        ### FILL IN HERE
        ###
        pass

    @classmethod
    def derivative(cls,z):
        """Derivative of the softmax.
        REMEMBER the derivative is an N*N matrix."""
        a = cls.fn(z) # obtain the softmax vector
        return np.diagflat(a) - np.dot(a, a.T)
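
# A minimal sketch (hypothetical helper, not the required implementation) of a
# numerically stable softmax for a column vector z: shift by max(z) before
# exponentiating so large entries do not overflow; the result sums to 1.
def _example_softmax(z):
    e = np.exp(z - np.max(z))   # shape preserved: (len(z), 1)
    return e / np.sum(e)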

class Tanh(object):
    @staticmethod
    def fn(z):
        """The tanh function."""
        ###
        ### FILL IN HERE
        ###
        pass

    @classmethod
    def derivative(cls,z):
        """Derivative of the tanh function."""
        ###
        ### FILL IN HERE
        ###
        pass
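
# A minimal sketch (hypothetical helpers): NumPy already provides np.tanh, and
# the derivative of tanh(z) is 1 - tanh(z)**2.
def _example_tanh(z):
    return np.tanh(z)

def _example_tanh_derivative(z):
    return 1.0 - np.tanh(z)**2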

class ReLU(object):
    @staticmethod
    def fn(z):
        """The ReLU function."""
        ###
        ### FILL IN HERE
        ###
        pass

    @classmethod
    def derivative(cls,z):
        """Derivative of the ReLU function."""
        ###
        ### FILL IN HERE
        ###
        pass
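
# A minimal sketch (hypothetical helpers): ReLU is max(0, z) elementwise; its
# derivative is 1 where z > 0 and 0 elsewhere (the value at z == 0 is a
# convention, taken to be 0 here).
def _example_relu(z):
    return np.maximum(0.0, z)

def _example_relu_derivative(z):
    return (z > 0).astype(float)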

class LeakyReLU(object):
    @staticmethod  # you could also make it a classmethod if you like
    def fn(z):
        """The LeakyReLU function."""
        ###
        ### FILL IN HERE
        ###
        pass

    @classmethod  # you could also make it a staticmethod if you like
    def derivative(cls,z):
        """Derivative of the LeakyReLU function."""
        ###
        ### FILL IN HERE
        ###
        pass
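
# A minimal sketch (hypothetical helpers), assuming a fixed negative-side slope
# of 0.01; the slope value is an assumption, not specified above.
def _example_leaky_relu(z, alpha=0.01):
    return np.where(z > 0, z, alpha * z)

def _example_leaky_relu_derivative(z, alpha=0.01):
    return np.where(z > 0, 1.0, alpha)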

The main Network class

In [4]:
class Network(object):

    def __init__(self, sizes):
        """The list ``sizes`` contains the number of neurons in the
        respective layers of the network, for example [2, 3, 1].
        The biases and weights are initialized by a separate function,
        ``default_weight_initializer``.  The cost and activation functions
        are set separately via ``set_model_parameters``."""
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.default_weight_initializer()

    def default_weight_initializer(self):
        """Initialize each weight using a Gaussian distribution with mean 0
        and standard deviation 1, over the square root of the number of
        weights connecting to the same neuron -- changed from network.py."""
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)/np.sqrt(x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    ## CHANGES NEEDED.
    ## The original code allows any activation function for the output layer.
    ## Change so that if Tanh is passed for act_output, print an error message
    ## 'Error: Tanh cannot be used for output layer.  Changing to Sigmoid..'
    ## and do just that in the function.
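    ## One possible shape (a sketch, not necessarily the expected solution):
    ##     if act_output == Tanh:
    ##         print('Error: Tanh cannot be used for output layer.  Changing to Sigmoid..')
    ##         act_output = Sigmoid
    ## placed before the act_output assignment below.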
    def set_model_parameters(self, cost=CrossEntropyCost, act_hidden=Sigmoid,
                              act_output=None):
        self.cost=cost
        self.act_hidden = act_hidden
        if act_output is None:
            self.act_output = self.act_hidden
        else:
            self.act_output = act_output

    def set_compile_parameters(self, regularization=None, lmbda=0.0,
                              dropoutpercent=0.0):
        """Function for setting compilation hyperparameters."""
        self.regularization = regularization
        self.lmbda = lmbda
        self.dropoutpercent = dropoutpercent

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            evaluation_data=None,
            regularization=None, lmbda=0.0, dropoutpercent=0.0):
        """Train the neural network using mini-batch stochastic gradient
        descent.  The ``training_data`` is a list of tuples ``(x, y)``
        representing the training inputs and the desired outputs, and
        same for the ``evaluation_data``."""
        # This sets the compilation hyperparameters!
        self.set_compile_parameters(regularization, lmbda, dropoutpercent)

        if evaluation_data:
            n_data = len(evaluation_data)

        n = len(training_data)
        training_cost, training_accuracy = [], []
        evaluation_cost, evaluation_accuracy = [], []

        for j in range(epochs):
            #random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(
                    mini_batch, eta, len(training_data))

            ## Evaluation for both training and evaluation datasets
            cost = self.total_cost(training_data)
            training_cost.append(cost)

            accuracy = self.accuracy(training_data)
            training_accuracy.append(accuracy)

            if evaluation_data:
              cost = self.total_cost(evaluation_data)
              evaluation_cost.append(cost)
              accuracy = self.accuracy(evaluation_data)
              evaluation_accuracy.append(accuracy)

        # After all epochs are done,
        print("Training {} epochs complete.\n".format(epochs))
        return training_cost, training_accuracy, \
                evaluation_cost, evaluation_accuracy

    ##  CHANGES NEEDED.
    ##  This original code is hard-coding the L2 norm.  You need to change
    ##  so that the parameter self.regularization is used and do the
    ##  appropriate regularization.
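    ##  One possible shape (a sketch, assuming self.regularization takes the
    ##  values None, 'L1' or 'L2'):
    ##      L2:   w -> (1 - eta*self.lmbda/n)*w - (eta/len(mini_batch))*nw
    ##      L1:   w -> w - eta*(self.lmbda/n)*np.sign(w) - (eta/len(mini_batch))*nw
    ##      None: w -> w - (eta/len(mini_batch))*nw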
    def update_mini_batch(self, mini_batch, eta, n):
        """Update the network's weights and biases by applying gradient
        descent using backpropagation to a single mini batch.  The
        ``mini_batch`` is a list of tuples ``(x, y)``, ``eta`` is the
        learning rate, and ``n`` is the total size of the training data set."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [(1-eta*(self.lmbda/n))*w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x.  ``nabla_b`` and
        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
        to ``self.biases`` and ``self.weights``."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # feedforward
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            ## SIMILAR CHANGE TO feedforward() IS NEEDED,
            ## to apply the activation function for the output layer
            ## (act_output) to the output layer(!)
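            ## e.g. (a sketch, one way to do it):
            ##     act = self.act_output if len(zs) == self.num_layers - 1 else self.act_hidden
            ##     activation = act.fn(z)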
            activation = (self.act_hidden).fn(z)
            activations.append(activation)

        # backward pass
        ## Cost and activation functions are parameterized now.
        ## Call the activation function of the output layer with z.
        a_prime = (self.act_output).derivative(zs[-1]) # nt: da/dz
        c_prime = (self.cost).derivative(activations[-1], y) # nt: dC/da

        # Compute delta -- separate case for Softmax
        if self.act_output == Softmax:
            delta = np.dot(a_prime, c_prime)
        else:
            delta = c_prime * a_prime # nt: dC/da * da/dz

        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book.  Here,
        # l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on.  It's a renumbering of the
        # scheme in the book, used here to take advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            z = zs[-l]
            ## Changed to call the activation function of the
            ## hidden layer with z.
            sp = (self.act_hidden).derivative(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def accuracy(self, data):
        """Return the number of inputs in ``data`` for which the neural
        network outputs the correct result. The neural network's
        output is assumed to be the index of whichever neuron in the
        final layer has the highest activation.
        """
        results = [(np.argmax(self.feedforward(x)), np.argmax(y))
                   for (x, y) in data]
        return sum(int(x == y) for (x, y) in results)

    ## CHANGES NEEDED.
    ##  This original code is hard-coding the L2 norm.  You need to change
    ##  so that the parameter self.regularization is used and do the
    ##  appropriate regularization.
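    ##  One possible shape (a sketch, assuming the same None/'L1'/'L2' values):
    ##      L2:   cost += 0.5*(self.lmbda/len(data))*sum(np.linalg.norm(w)**2 for w in self.weights)
    ##      L1:   cost += (self.lmbda/len(data))*sum(np.sum(np.abs(w)) for w in self.weights)
    ##      None: no penalty term is added.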
    def total_cost(self, data):
        """Return the total cost for the data set ``data``."""
        cost = 0.0
        for x, y in data:
            a = self.feedforward(x)
            cost += self.cost.fn(a, y)/len(data)
        cost += 0.5*(self.lmbda/len(data))*sum(
            np.linalg.norm(w)**2 for w in self.weights)
        return cost

    def feedforward(self, a):
        """Return the output of the network if ``a`` is input."""
        for b, w in zip(self.biases, self.weights):
            ## FURTHER CHANGES NEEDED.
            ## The function is changed from ch1 'network.py' to apply the
            ## activation function (act_hidden) for the hidden layers.
            ## But NOTE that this is still incorrect because the output
            ## layer has to be applied with a different activation function
            ## (act_output).
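            ## e.g. (a sketch): enumerate the (b, w) pairs and apply
            ## self.act_output.fn on the final pair, self.act_hidden.fn otherwise.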
            a = (self.act_hidden).fn(np.dot(w, a)+b)
        return a

    def save(self, filename):
        """Save the neural network to the file ``filename``."""
        data = {"sizes": self.sizes,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases],
                "cost": str(self.cost.__name__)}
        with open(filename, "w") as f:
            json.dump(data, f)

    @classmethod
    def load_network(cls, filename):
        """Load a neural network from a json file ``filename``.  Returns an
        instance of Network. """
        with open(filename, "r") as f:
            data = json.load(f)
        net = cls(data["sizes"])
        net.weights = [np.array(w) for w in data["weights"]]
        net.biases = [np.array(b) for b in data["biases"]]
        return net
In [5]:
import pandas as pd

def my_load_csv(fname, input_size, target_size, seednum=17):
    ''' Function to load the data from a csv file.  Note the target (y)
        is assumed to be already in the one-hot-vector notation.
        Also each instance in the returned data is made into column vectors.'''
    # Read in the data into pandas dataframe
    df = pd.read_csv(fname, header=None)

    # If a seed is given, shuffle the rows reproducibly; otherwise do not shuffle.
    if seednum:
        df = df.sample(frac=1, random_state=seednum)

    # Separate the X and Y parts
    X = df[df.columns[:input_size]].values.tolist()
    Y = df[df.columns[-target_size:]].values.tolist()

    # Combine the parts for each instance and put all in a list.
    # Note: x and y are both converted into a column vector/array.
    dataset = [(np.reshape(x, (input_size, 1)), np.reshape(y, (target_size, 1)))
               for x, y in zip(X, Y)]
    return dataset
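
A minimal usage sketch (the file name, layer sizes, and hyperparameters below are placeholders, and it assumes the FILL IN HERE stubs above have been completed):

In [6]:
# Load a dataset whose rows are input_size feature columns followed by a
# target_size-column one-hot target, then train a small network on it.
data = my_load_csv('some_dataset.csv', input_size=4, target_size=3)  # hypothetical file
net = Network([4, 10, 3])
net.set_model_parameters(cost=CrossEntropyCost, act_hidden=Sigmoid, act_output=Sigmoid)
results = net.SGD(data[:100], 30, 10, 0.5, evaluation_data=data[100:],
                  regularization=None, lmbda=0.0)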