class Network(object):
def __init__(self, sizes):
"""The list ``sizes`` contains the number of neurons in the
respective layers of the network, for example [2, 3, 1].
The biases and weights are initialized in a separate function.
Model parameters are set here."""
self.num_layers = len(sizes)
self.sizes = sizes
self.default_weight_initializer()
def default_weight_initializer(self):
"""Initialize each weight using a Gaussian distribution with mean 0
and standard deviation 1, over the square root of the number of
weights connecting to the same neuron -- changed from network.py."""
self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
self.weights = [np.random.randn(y, x)/np.sqrt(x)
for x, y in zip(self.sizes[:-1], self.sizes[1:])]
## Tanh is not accepted as the output-layer activation.  When Tanh is passed
## for act_output, the message
## 'Error: Tanh cannot be used for output layer. Changing to Sigmoid..'
## is printed and Sigmoid is used for the output layer instead.
def set_model_parameters(self, cost=CrossEntropyCost, act_hidden=Sigmoid,
                         act_output=None):
    """Store the cost function and the activation functions of the model.

    ``cost`` is kept as ``self.cost`` and ``act_hidden`` as
    ``self.act_hidden``.  ``act_output`` is kept as ``self.act_output``;
    when it is omitted (``None``) the hidden-layer activation is reused
    for the output layer.  If Tanh is passed as ``act_output`` an error
    message is printed and Sigmoid is stored instead."""
    self.cost = cost
    self.act_hidden = act_hidden
    if act_output is None:
        # NOTE(review): only an explicitly passed act_output is checked
        # for Tanh; this fallback copies act_hidden unchanged.
        self.act_output = self.act_hidden
    elif getattr(act_output, "__name__", None) == "Tanh":
        # Matched by class name so that this method only depends on names
        # already used in its signature -- assumes the activation is
        # passed as a class called ``Tanh``; TODO confirm.
        print("Error: Tanh cannot be used for output layer. "
              "Changing to Sigmoid..")
        self.act_output = Sigmoid
    else:
        self.act_output = act_output
def set_compile_parameters(self, regularization=None, lmbda=0.0,
dropoutpercent=0.0):
"""Function for setting compilation hyperparameters."""
self.regularization = regularization
self.lmbda = lmbda
self.dropoutpercent = dropoutpercent
def SGD(self, training_data, epochs, mini_batch_size, eta,
evaluation_data=None,
regularization=None, lmbda=0.0, dropoutpercent=0.0):
"""Train the neural network using mini-batch stochastic gradient
descent. The ``training_data`` is a list of tuples ``(x, y)``
representing the training inputs and the desired outputs, and
same for the ``evaluation_data``."""
# This sets the compilation hyperparameters!
self.set_compile_parameters(regularization, lmbda, dropoutpercent)
if evaluation_data:
n_data = len(evaluation_data)
n = len(training_data)
training_cost, training_accuracy = [], []
evaluation_cost, evaluation_accuracy = [], []
for j in range(epochs):
#random.shuffle(training_data)
mini_batches = [
training_data[k:k+mini_batch_size]
for k in range(0, n, mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(
mini_batch, eta, len(training_data))
## Evaluation for both training and evaluation datasets
cost = self.total_cost(training_data)
training_cost.append(cost)
accuracy = self.accuracy(training_data)
training_accuracy.append(accuracy)
if evaluation_data:
cost = self.total_cost(evaluation_data)
evaluation_cost.append(cost)
accuracy = self.accuracy(evaluation_data)
evaluation_accuracy.append(accuracy)
# After all epochs are done,
print ("Training {} epochs complete.\n".format(epochs))
return training_cost, training_accuracy, \
evaluation_cost, evaluation_accuracy
## CHANGES NEEDED.
## This original code is hard-coding the L2 norm. You need to change
## so that the parameter self.regularization is used and do the
## appropriate regularization.
def update_mini_batch(self, mini_batch, eta, n):
"""Update the network's weights and biases by applying gradient
descent using backpropagation to a single mini batch. The
``mini_batch`` is a list of tuples ``(x, y)``, ``eta`` is the
learning rate, and ``n`` is the total size of the training data set."""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [(1-eta*(self.lmbda/n))*w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)]
self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
    """Return a tuple ``(nabla_b, nabla_w)`` representing the
    gradient for the cost function C_x.  ``nabla_b`` and
    ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
    to ``self.biases`` and ``self.weights``.

    ``x`` is one input and ``y`` its desired output.  The forward pass
    applies ``self.act_hidden`` to every layer except the last one, which
    uses ``self.act_output``, so that it agrees with the output-layer
    derivative taken in the backward pass."""
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    # feedforward
    activation = x
    activations = [x]  # list to store all the activations, layer by layer
    zs = []  # list to store all the z vectors, layer by layer
    last = len(self.weights) - 1  # index of the output layer's weights
    for i, (b, w) in enumerate(zip(self.biases, self.weights)):
        z = np.dot(w, activation)+b
        zs.append(z)
        # The output layer gets its own activation function (act_output);
        # all earlier layers use the hidden-layer one (act_hidden).
        layer_act = self.act_output if i == last else self.act_hidden
        activation = layer_act.fn(z)
        activations.append(activation)
    # backward pass
    ## Cost and activation functions are parameterized now.
    ## Call the activation function of the output layer with z.
    a_prime = (self.act_output).derivative(zs[-1])  # nt: da/dz
    c_prime = (self.cost).derivative(activations[-1], y)  # nt: dC/da
    # Compute delta -- separate case for Softmax, whose derivative is
    # combined with dC/da by a matrix product instead of elementwise.
    if self.act_output == Softmax:
        delta = np.dot(a_prime, c_prime)
    else:
        delta = c_prime * a_prime  # nt: dC/da * da/dz
    nabla_b[-1] = delta
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())
    # Note that the variable l in the loop below is used a little
    # differently to the notation in Chapter 2 of the book.  Here,
    # l = 1 means the last layer of neurons, l = 2 is the
    # second-last layer, and so on.  It's a renumbering of the
    # scheme in the book, used here to take advantage of the fact
    # that Python can use negative indices in lists.
    for l in range(2, self.num_layers):
        z = zs[-l]
        ## Hidden layers use the derivative of the hidden-layer
        ## activation function at z.
        sp = (self.act_hidden).derivative(z)
        delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
        nabla_b[-l] = delta
        nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
    return (nabla_b, nabla_w)
def accuracy(self, data):
"""Return the number of inputs in ``data`` for which the neural
network outputs the correct result. The neural network's
output is assumed to be the index of whichever neuron in the
final layer has the highest activation.
"""
results = [(np.argmax(self.feedforward(x)), np.argmax(y))
for (x, y) in data]
return sum(int(x == y) for (x, y) in results)
## CHANGES NEEDED.
## This original code is hard-coding the L2 norm. You need to change
## so that the parameter self.regularization is used and do the
## appropriate regularization.
def total_cost(self, data):
"""Return the total cost for the data set ``data``."""
cost = 0.0
for x, y in data:
a = self.feedforward(x)
cost += self.cost.fn(a, y)/len(data)
cost += 0.5*(self.lmbda/len(data))*sum(
np.linalg.norm(w)**2 for w in self.weights)
return cost
def feedforward(self, a):
"""Return the output of the network if ``a`` is input."""
for b, w in zip(self.biases, self.weights):
## FURTHER CHANGES NEEDED.
## The function is changed from ch1 'network.py' to apply the
## activation function (act_hidden) for the hidden layers.
## But NOTE that this is still incorrect because the output
## layer has to be applied with a different activation function
## (act_output).
a = (self.act_hidden).fn(np.dot(w, a)+b)
return a
def save(self, filename):
"""Save the neural network to the file ``filename``."""
data = {"sizes": self.sizes,
"weights": [w.tolist() for w in self.weights],
"biases": [b.tolist() for b in self.biases],
"cost": str(self.cost.__name__)}
f = open(filename, "w")
json.dump(data, f)
f.close()
@classmethod
def load_network(cls, filename):
"""Load a neural network from a json file ``filename``. Returns an
instance of Network. """
f = open(filename, "r")
data = json.load(f)
f.close()
net = cls(data["sizes"])
net.weights = [np.array(w) for w in data["weights"]]
net.biases = [np.array(b) for b in data["biases"]]
return net