import numpy as np
import time
from matplotlib import pyplot


# save theta to p5_params.npz so that it can be used by easynn
def save_theta(theta):
    f1_W, f1_b, f2_W, f2_b = theta

    np.savez_compressed("p5_params.npz", **{
        "f1.weight": f1_W,
        "f1.bias": f1_b,
        "f2.weight": f2_W,
        "f2.bias": f2_b
    })


# initialize theta using uniform distribution [-bound, bound]
# return theta as (f1_W, f1_b, f2_W, f2_b)
def initialize_theta(bound):
    f1_W = np.random.uniform(-bound, bound, (32, 784))
    f1_b = np.random.uniform(-bound, bound, 32)
    f2_W = np.random.uniform(-bound, bound, (10, 32))
    f2_b = np.random.uniform(-bound, bound, 10)
    return (f1_W, f1_b, f2_W, f2_b)


# forward:
#   x = Flatten(images)
#   g = Linear_f1(x)
#   h = ReLU(g)
#   z = Linear_f2(h)
# return (z, h, g, x)
def forward(images, theta):
    # number of samples
    N = images.shape[0]

    # unpack theta into f1 and f2
    f1_W, f1_b, f2_W, f2_b = theta

    # x = Flatten(images): move the channel axis first, then flatten each sample
    x = images.astype(float).transpose(0, 3, 1, 2).reshape((N, -1))

    # g = Linear_f1(x)
    g = np.zeros((N, f1_b.shape[0]))
    for i in range(N):
        g[i, :] = np.matmul(f1_W, x[i]) + f1_b

    # h = ReLU(g)
    h = g*(g > 0)

    # z = Linear_f2(h)
    z = np.zeros((N, f2_b.shape[0]))
    for i in range(N):
        z[i, :] = np.matmul(f2_W, h[i]) + f2_b

    return (z, h, g, x)


# backprop:
#   J = cross entropy between labels and softmax(z)
# return nabla_J = (dJ/df1_W, dJ/df1_b, dJ/df2_W, dJ/df2_b)
def backprop(labels, theta, z, h, g, x):
    # number of samples
    N = labels.shape[0]

    # unpack theta into f1 and f2
    f1_W, f1_b, f2_W, f2_b = theta

    # nabla_J consists of partial J to partial f1_W, f1_b, f2_W, f2_b
    p_f1_W = np.zeros(f1_W.shape)
    p_f1_b = np.zeros(f1_b.shape)
    p_f2_W = np.zeros(f2_W.shape)
    p_f2_b = np.zeros(f2_b.shape)

    for i in range(N):
        # compute the contribution to nabla_J for sample i

        # cross entropy and softmax:
        # partial J to partial z[i] is softmax(z[i]) - onehot(labels[i]),
        # scaled by 1/N for averaging over the batch
        expz = np.exp(z[i] - max(z[i]))
        p_z = expz/sum(expz)/N
        p_z[labels[i]] -= 1/N

        # z = Linear_f2(h):
        # compute partial J to partial h[i]
        # accumulate partial J to partial f2_W, f2_b
        p_h = np.dot(f2_W.T, p_z)
        p_f2_W += np.outer(p_z, h[i])
        p_f2_b += p_z

        # h = ReLU(g): compute partial J to partial g[i]
        p_g = p_h*(g[i] > 0)

        # g = Linear_f1(x): accumulate partial J to partial f1_W, f1_b
        p_f1_W += np.outer(p_g, x[i])
        p_f1_b += p_g

    return (p_f1_W, p_f1_b, p_f2_W, p_f2_b)


# apply SGD to update theta by nabla_J and the learning rate epsilon
# return updated theta
def update_theta(theta, nabla_J, epsilon):
    f1_W, f1_b, f2_W, f2_b = theta
    p_f1_W, p_f1_b, p_f2_W, p_f2_b = nabla_J

    # update the weights and biases for the first layer (f1)
    f1_W_updated = f1_W - epsilon*p_f1_W
    f1_b_updated = f1_b - epsilon*p_f1_b

    # update the weights and biases for the second layer (f2)
    f2_W_updated = f2_W - epsilon*p_f2_W
    f2_b_updated = f2_b - epsilon*p_f2_b

    return (f1_W_updated, f1_b_updated, f2_W_updated, f2_b_updated)


def print_training_hyperparams_for_session(epsilon, batch_size, bound):
    print("Starting training session with 10 epochs:")
    print("")
    print("Hyperparameters:")
    print(f"epsilon: {epsilon}")
    print(f"bound: {bound}")
    print(f"batch_size: {batch_size}")
    print("")
    print("Results:")


def plot_epoch(epochs, accuracies, epsilon, batch_size, bound):
    pyplot.figure(figsize=(10, 6))
    pyplot.plot(epochs, accuracies,
                label=f"Epsilon: {epsilon}, Batch Size: {batch_size}, Bound: {bound}")
    pyplot.xlabel('Epoch')
    pyplot.ylabel('Accuracy')
    pyplot.title('Training Accuracy over Epochs')
    pyplot.legend()
    pyplot.grid(True)
    pyplot.show()
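
# A minimal sketch of a finite-difference check for backprop, not part of the
# training pipeline above: it perturbs one entry of f2_b and compares the
# numerical slope of the averaged cross-entropy loss against the corresponding
# entry of nabla_J. The helper name check_gradient and the small random batch it
# builds are assumptions for illustration only.
def check_gradient(eps=1e-6):
    np.random.seed(0)
    images = np.random.rand(4, 28, 28, 1)      # 4 fake NHWC samples
    labels = np.random.randint(0, 10, 4)
    theta = initialize_theta(0.01)

    def loss(theta):
        z, _, _, _ = forward(images, theta)
        zs = z - z.max(axis=1, keepdims=True)   # stabilized log-softmax
        logp = zs - np.log(np.exp(zs).sum(axis=1, keepdims=True))
        return -logp[np.arange(labels.shape[0]), labels].mean()

    z, h, g, x = forward(images, theta)
    nabla_J = backprop(labels, theta, z, h, g, x)

    # numerical slope with respect to f2_b[0]
    f1_W, f1_b, f2_W, f2_b = theta
    f2_b_plus = f2_b.copy()
    f2_b_plus[0] += eps
    f2_b_minus = f2_b.copy()
    f2_b_minus[0] -= eps
    numerical = (loss((f1_W, f1_b, f2_W, f2_b_plus))
                 - loss((f1_W, f1_b, f2_W, f2_b_minus)))/(2*eps)
    analytical = nabla_J[3][0]
    print("numerical %.6e vs analytical %.6e" % (numerical, analytical))
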
def plot_all_epochs(training_results):
    pyplot.figure(figsize=(12, 8))
    for epochs, accuracies, epsilon, batch_size, bound in training_results:
        label = f"Epsilon: {epsilon}, Batch Size: {batch_size}, Bound: {bound}"
        pyplot.plot(epochs, accuracies, label=label)
    pyplot.xlabel('Epoch')
    pyplot.ylabel('Accuracy')
    pyplot.title('Training Accuracy over Epochs for Different Hyperparameters')
    pyplot.legend()
    pyplot.grid(True)
    pyplot.show()


def plot_table(training_results):
    # set up the data for the table
    cell_text = []
    columns = ['Epoch', 'Accuracy', 'Epsilon', 'Batch Size', 'Bound']
    for result in training_results:
        epochs, accuracies, epsilon, batch_size, bound = result
        for epoch, accuracy in zip(epochs, accuracies):
            cell_text.append([epoch, f"{accuracy:.3f}", epsilon, batch_size, bound])

    # determine the figure size needed for the table
    figsize = (10, len(cell_text)*0.2)
    fig, ax = pyplot.subplots(figsize=figsize)
    ax.axis('tight')
    ax.axis('off')

    # create the table
    table = ax.table(cellText=cell_text, colLabels=columns,
                     loc='center', cellLoc='center')

    # adjust table scale
    table.auto_set_font_size(False)
    table.set_fontsize(8)
    table.auto_set_column_width(col=list(range(len(columns))))

    pyplot.show()


def start_training(epsilon, batch_size, bound, mnist_train):
    # numpy random seed is set to the last 8 digits of my CWID
    np.random.seed(20497299)

    # hold out the first 1000 samples for validation
    validation_images = mnist_train["images"][:1000]
    validation_labels = mnist_train["labels"][:1000]
    training_images = mnist_train["images"][1000:]
    training_labels = mnist_train["labels"][1000:]

    # hyperparameters
    # we can experiment with these values to see if increasing or decreasing
    # them influences our accuracy; default values:
    #   bound = 1          # initial weight range
    #   epsilon = 0.00001  # learning rate
    #print_training_hyperparams_for_session(epsilon, batch_size, bound)

    # start training
    accuracies = []
    epochs = []
    start = time.time()
    theta = initialize_theta(bound)
    batches = training_images.shape[0]//batch_size
    for epoch in range(10):
        indices = np.arange(training_images.shape[0])
        np.random.shuffle(indices)
        for i in range(batches):
            batch_images = training_images[indices[i*batch_size:(i+1)*batch_size]]
            batch_labels = training_labels[indices[i*batch_size:(i+1)*batch_size]]
            z, h, g, x = forward(batch_images, theta)
            nabla_J = backprop(batch_labels, theta, z, h, g, x)
            theta = update_theta(theta, nabla_J, epsilon)

        # check accuracy using validation examples
        z, _, _, _ = forward(validation_images, theta)
        pred_labels = z.argmax(axis=1)
        accuracy = sum(pred_labels == validation_labels)/validation_images.shape[0]
        accuracies.append(accuracy)
        epochs.append(epoch)
        print("epoch %d, accuracy %.3f, time %.2f" % (
            epoch, accuracy, time.time()-start))

    #plot_epoch(epochs, accuracies, epsilon, batch_size, bound)

    # save the weights to be submitted
    save_theta(theta)

    # return this data so we can plot it with matplotlib
    return epochs, accuracies
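
# A minimal sketch of how the saved parameters could be loaded back and used for
# inference without retraining, assuming p5_params.npz and mnist_train.npz sit
# next to this script. The helper name evaluate_saved_theta is an assumption for
# illustration; the key names match what save_theta writes above.
def evaluate_saved_theta(npz_path="p5_params.npz"):
    params = np.load(npz_path)
    theta = (params["f1.weight"], params["f1.bias"],
             params["f2.weight"], params["f2.bias"])

    data = np.load("mnist_train.npz")
    images = data["images"][:1000]
    labels = data["labels"][:1000]

    # run the forward pass and report accuracy on the held-out samples
    z, _, _, _ = forward(images, theta)
    pred_labels = z.argmax(axis=1)
    accuracy = (pred_labels == labels).mean()
    print("validation accuracy of saved theta: %.3f" % accuracy)
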
def main():
    training_results = []

    # load training data once
    mnist_train = np.load("mnist_train.npz")

    # we can add to this list if we want to test combinations of hyperparameters,
    # given as (epsilon, bound, batch_size)
    hyperparams = [
        (0.00001, 1, 4),  # default params
        (0.00001, 0.1, 4),
        (0.00001, 0.5, 4),
        (0.00001, 0.7, 4),
        (0.00001, 0.01, 4),
        (0.00001, 0.01, 3),
        (0.00001, 0.01, 2),
        (0.000013, 0.012, 1),
        (0.000013, 0.012002899999999983, 1),
        (0.000013, 0.01200591999999996, 1),
    ]

    for epsilon, bound, batch_size in hyperparams:
        epochs, accuracies = start_training(epsilon, batch_size, bound, mnist_train)
        training_results.append((epochs, accuracies, epsilon, batch_size, bound))

    # uncomment if you would like to see plotted results
    #plot_all_epochs(training_results)

    # plot table for the last run (plot_table expects a list of result tuples)
    #plot_table([training_results[9]])


if __name__ == '__main__':
    main()