Import the necessary libraries, including NumPy, Theano, and Matplotlib, along with the get_normalized_data and y2indicator helpers from util. Load the normalized data and set the hyperparameters, then define the error_rate, relu, and main functions. Inside main, create the Theano variables and expressions and define the cost function and the prediction. Next, define the training expressions, compile the training function, and compile a second function that returns the cost and prediction so it can be evaluated over the whole test set. Train the neural network with a loop over epochs and batches, printing the cost and error rate at regular intervals. Finally, plot the collected cost values and display the graph.
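The listing below imports get_normalized_data and y2indicator from a util module that is not shown in this section. As a rough guide only, here is a minimal sketch of what such helpers might look like; the CSV path, the loading code, and the normalization scheme are assumptions inferred from how the functions are used below, not the actual util implementation.

import numpy as np

def get_normalized_data(path='train.csv'):  # hypothetical file path
    # Sketch: load the data, shuffle it, and standardize each feature column.
    data = np.loadtxt(path, delimiter=',', skiprows=1)
    np.random.shuffle(data)
    X = data[:, 1:].astype(np.float64)  # feature (pixel) values
    Y = data[:, 0].astype(np.int32)     # class labels
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    std[std == 0] = 1                   # avoid division by zero for constant columns
    X = (X - mu) / std
    return X, Y

def y2indicator(y):
    # Convert an array of N class labels into an N x K one-hot indicator matrix.
    y = y.astype(np.int32)
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y] = 1
    return ind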
import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
from util import get_normalized_data, y2indicator


def error_rate(p, t):
    return np.mean(p != t)


def relu(a):
    return a * (a > 0)


def main():
    # Step 1: Get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 20
    print_period = 10
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / 28
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # Step 2: Define Theano variables and expressions
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    # We can use the built-in Theano functions to do ReLU and softmax.
    # T.nnet.relu was only added in Theano 0.7.1, so we use our own relu()
    # in case your version doesn't have it.
    thZ = relu(thX.dot(W1) + b1)
    thY = T.nnet.softmax(thZ.dot(W2) + b2)

    # Define the cost function and prediction
    cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())
    prediction = T.argmax(thY, axis=1)

    # Step 3: Training expressions and functions
    # We can just include regularization as part of the cost because it is also
    # automatically differentiated!
    update_W1 = W1 - lr*T.grad(cost, W1)
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )

    # Create another function because we want the cost and prediction
    # over the whole test set, not just one batch
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                LL.append(cost_val)

    plt.plot(LL)
    plt.show()
    # Exercise: how would you incorporate momentum into the gradient descent
    # procedure? (See the sketch after this listing.)


if __name__ == '__main__':
    main()
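The exercise at the end of the listing asks how momentum could be incorporated into the gradient descent procedure. Here is a minimal sketch, assuming the same shared variables, cost, and learning rate as above; the velocity variables dW1, db1, dW2, db2 and the momentum coefficient mu are illustrative additions, not part of the original code.

mu = 0.9  # momentum coefficient (an assumed value)

# One velocity term per parameter, initialized to zero
dW1 = theano.shared(np.zeros_like(W1_init), 'dW1')
db1 = theano.shared(np.zeros_like(b1_init), 'db1')
dW2 = theano.shared(np.zeros_like(W2_init), 'dW2')
db2 = theano.shared(np.zeros_like(b2_init), 'db2')

# New velocity = a fraction of the old velocity minus the gradient step
update_dW1 = mu*dW1 - lr*T.grad(cost, W1)
update_db1 = mu*db1 - lr*T.grad(cost, b1)
update_dW2 = mu*dW2 - lr*T.grad(cost, W2)
update_db2 = mu*db2 - lr*T.grad(cost, b2)

# Each parameter moves by its new velocity; each velocity is stored for the next step
train = theano.function(
    inputs=[thX, thT],
    updates=[
        (W1, W1 + update_dW1), (dW1, update_dW1),
        (b1, b1 + update_db1), (db1, update_db1),
        (W2, W2 + update_dW2), (dW2, update_dW2),
        (b2, b2 + update_db2), (db2, update_db2),
    ],
)

The rest of the training loop stays exactly the same; only the compiled train function differs.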
Alternate Theano Code: The listing above defines its own relu() because T.nnet.relu was only added in Theano 0.7.1. If your installation has it, you can use the built-in version instead; the only change to the listing is the hidden-layer expression, and everything else stays the same:
thZ = T.nnet.relu(thX.dot(W1) + b1)
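For reference, the cost expression in the listing is the categorical cross-entropy over the one-hot targets plus an L2 penalty on every weight and bias, and each parameter is updated by plain gradient descent. In math form, with reg written as $\lambda$ and lr as $\eta$:

$$J = -\sum_{n=1}^{N} \sum_{k=1}^{K} T_{nk} \log Y_{nk} + \lambda \left( \lVert W_1 \rVert^2 + \lVert b_1 \rVert^2 + \lVert W_2 \rVert^2 + \lVert b_2 \rVert^2 \right)$$

$$\theta \leftarrow \theta - \eta \, \frac{\partial J}{\partial \theta}, \qquad \theta \in \{W_1, b_1, W_2, b_2\}$$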