import numpy as np
import time
from matplotlib import pyplot


# save theta to p5_params.npz so that it can be used by easynn
def save_theta(theta):
    f1_W, f1_b, f2_W, f2_b = theta

    np.savez_compressed("p5_params.npz", **{
        "f1.weight": f1_W,
        "f1.bias": f1_b,
        "f2.weight": f2_W,
        "f2.bias": f2_b
    })


# initialize theta using uniform distribution [-bound, bound]
# return theta as (f1_W, f1_b, f2_W, f2_b)
def initialize_theta(bound):
    f1_W = np.random.uniform(-bound, bound, (32, 784))
    f1_b = np.random.uniform(-bound, bound, 32)
    f2_W = np.random.uniform(-bound, bound, (10, 32))
    f2_b = np.random.uniform(-bound, bound, 10)
    return (f1_W, f1_b, f2_W, f2_b)


# forward:
#   x = Flatten(images)
#   g = Linear_f1(x)
#   h = ReLU(g)
#   z = Linear_f2(h)
# return (z, h, g, x)
def forward(images, theta):
    # number of samples
    N = images.shape[0]

    # unpack theta into f1 and f2
    f1_W, f1_b, f2_W, f2_b = theta

    # x = Flatten(images): move the channel axis first, then flatten each sample
    x = images.astype(float).transpose(0, 3, 1, 2).reshape((N, -1))

    # g = Linear_f1(x)
    g = np.zeros((N, f1_b.shape[0]))
    for i in range(N):
        g[i, :] = np.matmul(f1_W, x[i]) + f1_b

    # h = ReLU(g)
    h = g*(g > 0)

    # z = Linear_f2(h)
    z = np.zeros((N, f2_b.shape[0]))
    for i in range(N):
        z[i, :] = np.matmul(f2_W, h[i]) + f2_b

    return (z, h, g, x)


# backprop:
#   J = cross entropy between labels and softmax(z)
# return nabla_J = (dJ/df1_W, dJ/df1_b, dJ/df2_W, dJ/df2_b)
def backprop(labels, theta, z, h, g, x):
    # number of samples
    N = labels.shape[0]

    # unpack theta into f1 and f2
    f1_W, f1_b, f2_W, f2_b = theta

    # nabla_J consists of partial J to partial f1_W, f1_b, f2_W, f2_b
    p_f1_W = np.zeros(f1_W.shape)
    p_f1_b = np.zeros(f1_b.shape)
    p_f2_W = np.zeros(f2_W.shape)
    p_f2_b = np.zeros(f2_b.shape)

    for i in range(N):
        # compute the contribution to nabla_J for sample i

        # cross entropy and softmax:
        # partial J to partial z[i] is softmax(z[i]) - onehot(labels[i]),
        # scaled by 1/N for averaging over the batch
        expz = np.exp(z[i] - max(z[i]))
        p_z = expz/sum(expz)/N
        p_z[labels[i]] -= 1/N

        # z = Linear_f2(h):
        # compute partial J to partial h[i]
        # accumulate partial J to partial f2_W, f2_b
        p_h = np.dot(f2_W.T, p_z)
        p_f2_W += np.outer(p_z, h[i])
        p_f2_b += p_z

        # h = ReLU(g): compute partial J to partial g[i]
        p_g = p_h*(g[i] > 0)

        # g = Linear_f1(x): accumulate partial J to partial f1_W, f1_b
        p_f1_W += np.outer(p_g, x[i])
        p_f1_b += p_g

    return (p_f1_W, p_f1_b, p_f2_W, p_f2_b)


# apply SGD to update theta by nabla_J and the learning rate epsilon
# return updated theta
def update_theta(theta, nabla_J, epsilon):
    f1_W, f1_b, f2_W, f2_b = theta
    p_f1_W, p_f1_b, p_f2_W, p_f2_b = nabla_J

    # update the weights and biases for the first layer (f1)
    f1_W_updated = f1_W - epsilon*p_f1_W
    f1_b_updated = f1_b - epsilon*p_f1_b

    # update the weights and biases for the second layer (f2)
    f2_W_updated = f2_W - epsilon*p_f2_W
    f2_b_updated = f2_b - epsilon*p_f2_b

    return (f1_W_updated, f1_b_updated, f2_W_updated, f2_b_updated)


def print_training_hyperparams_for_session(epsilon, batch_size, bound):
    print("Starting training session with 10 epochs:")
    print("")
    print("Hyperparameters:")
    print(f"epsilon: {epsilon}")
    print(f"bound: {bound}")
    print(f"batch_size: {batch_size}")
    print("")
    print("Results:")


def plot_epoch(epochs, accuracies, epsilon, batch_size, bound):
    pyplot.figure(figsize=(10, 6))
    pyplot.plot(epochs, accuracies,
                label=f"Epsilon: {epsilon}, Batch Size: {batch_size}, Bound: {bound}")
    pyplot.xlabel('Epoch')
    pyplot.ylabel('Accuracy')
    pyplot.title('Training Accuracy over Epochs')
    pyplot.legend()
    pyplot.grid(True)
    pyplot.show()
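
# A minimal sketch of a finite-difference check for backprop, not part of the
# training pipeline above: it perturbs one entry of f2_b and compares the
# numerical slope of the averaged cross-entropy loss against the corresponding
# entry of nabla_J. The helper name check_gradient and the small random batch it
# builds are assumptions for illustration only.
def check_gradient(eps=1e-6):
    np.random.seed(0)
    images = np.random.rand(4, 28, 28, 1)      # 4 fake NHWC samples
    labels = np.random.randint(0, 10, 4)
    theta = initialize_theta(0.01)

    def loss(theta):
        z, _, _, _ = forward(images, theta)
        zs = z - z.max(axis=1, keepdims=True)   # stabilized log-softmax
        logp = zs - np.log(np.exp(zs).sum(axis=1, keepdims=True))
        return -logp[np.arange(labels.shape[0]), labels].mean()

    z, h, g, x = forward(images, theta)
    nabla_J = backprop(labels, theta, z, h, g, x)

    # numerical slope with respect to f2_b[0]
    f1_W, f1_b, f2_W, f2_b = theta
    f2_b_plus = f2_b.copy()
    f2_b_plus[0] += eps
    f2_b_minus = f2_b.copy()
    f2_b_minus[0] -= eps
    numerical = (loss((f1_W, f1_b, f2_W, f2_b_plus))
                 - loss((f1_W, f1_b, f2_W, f2_b_minus)))/(2*eps)
    analytical = nabla_J[3][0]
    print("numerical %.6e vs analytical %.6e" % (numerical, analytical))
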
def plot_all_epochs(training_results):
    pyplot.figure(figsize=(12, 8))
    for epochs, accuracies, epsilon, batch_size, bound in training_results:
        label = f"Epsilon: {epsilon}, Batch Size: {batch_size}, Bound: {bound}"
        pyplot.plot(epochs, accuracies, label=label)
    pyplot.xlabel('Epoch')
    pyplot.ylabel('Accuracy')
    pyplot.title('Training Accuracy over Epochs for Different Hyperparameters')
    pyplot.legend()
    pyplot.grid(True)
    pyplot.show()


def plot_table(training_results):
    # set up the data for the table
    cell_text = []
    columns = ['Epoch', 'Accuracy', 'Epsilon', 'Batch Size', 'Bound']
    for result in training_results:
        epochs, accuracies, epsilon, batch_size, bound = result
        for epoch, accuracy in zip(epochs, accuracies):
            cell_text.append([epoch, f"{accuracy:.3f}", epsilon, batch_size, bound])

    # determine the figure size needed for the table
    figsize = (10, len(cell_text)*0.2)
    fig, ax = pyplot.subplots(figsize=figsize)
    ax.axis('tight')
    ax.axis('off')

    # create the table
    table = ax.table(cellText=cell_text, colLabels=columns,
                     loc='center', cellLoc='center')

    # adjust table scale
    table.auto_set_font_size(False)
    table.set_fontsize(8)
    table.auto_set_column_width(col=list(range(len(columns))))

    pyplot.show()


def start_training(epsilon, batch_size, bound, mnist_train):
    # numpy random seed is set to the last 8 digits of my CWID
    np.random.seed(20497299)

    # hold out the first 1000 samples for validation
    validation_images = mnist_train["images"][:1000]
    validation_labels = mnist_train["labels"][:1000]
    training_images = mnist_train["images"][1000:]
    training_labels = mnist_train["labels"][1000:]

    # hyperparameters
    # we can experiment with these values to see if increasing or decreasing
    # them influences our accuracy; default values:
    #   bound = 1          # initial weight range
    #   epsilon = 0.00001  # learning rate
    #print_training_hyperparams_for_session(epsilon, batch_size, bound)

    # start training
    accuracies = []
    epochs = []
    start = time.time()
    theta = initialize_theta(bound)
    batches = training_images.shape[0]//batch_size
    for epoch in range(10):
        indices = np.arange(training_images.shape[0])
        np.random.shuffle(indices)
        for i in range(batches):
            batch_images = training_images[indices[i*batch_size:(i+1)*batch_size]]
            batch_labels = training_labels[indices[i*batch_size:(i+1)*batch_size]]
            z, h, g, x = forward(batch_images, theta)
            nabla_J = backprop(batch_labels, theta, z, h, g, x)
            theta = update_theta(theta, nabla_J, epsilon)

        # check accuracy using validation examples
        z, _, _, _ = forward(validation_images, theta)
        pred_labels = z.argmax(axis=1)
        accuracy = sum(pred_labels == validation_labels)/validation_images.shape[0]
        accuracies.append(accuracy)
        epochs.append(epoch)
        print("epoch %d, accuracy %.3f, time %.2f" % (
            epoch, accuracy, time.time()-start))

    #plot_epoch(epochs, accuracies, epsilon, batch_size, bound)

    # save the weights to be submitted
    save_theta(theta)

    # return this data so we can plot it with matplotlib
    return epochs, accuracies
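
# A minimal sketch of how the saved parameters could be loaded back and used for
# inference without retraining, assuming p5_params.npz and mnist_train.npz sit
# next to this script. The helper name evaluate_saved_theta is an assumption for
# illustration; the key names match what save_theta writes above.
def evaluate_saved_theta(npz_path="p5_params.npz"):
    params = np.load(npz_path)
    theta = (params["f1.weight"], params["f1.bias"],
             params["f2.weight"], params["f2.bias"])

    data = np.load("mnist_train.npz")
    images = data["images"][:1000]
    labels = data["labels"][:1000]

    # run the forward pass and report accuracy on the held-out samples
    z, _, _, _ = forward(images, theta)
    pred_labels = z.argmax(axis=1)
    accuracy = (pred_labels == labels).mean()
    print("validation accuracy of saved theta: %.3f" % accuracy)
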
def main():
    training_results = []

    # load training data once
    mnist_train = np.load("mnist_train.npz")

    # we can add to this list if we want to test combinations of hyperparameters,
    # given as (epsilon, bound, batch_size)
    hyperparams = [
        (0.00001, 1, 4),  # default params
        (0.00001, 0.1, 4),
        (0.00001, 0.5, 4),
        (0.00001, 0.7, 4),
        (0.00001, 0.01, 4),
        (0.00001, 0.01, 3),
        (0.00001, 0.01, 2),
        (0.000013, 0.012, 1),
        (0.000013, 0.012002899999999983, 1),
        (0.000013, 0.01200591999999996, 1),
    ]

    for epsilon, bound, batch_size in hyperparams:
        epochs, accuracies = start_training(epsilon, batch_size, bound, mnist_train)
        training_results.append((epochs, accuracies, epsilon, batch_size, bound))

    # uncomment if you would like to see plotted results
    #plot_all_epochs(training_results)

    # plot table for the last run (plot_table expects a list of result tuples)
    #plot_table([training_results[9]])


if __name__ == '__main__':
    main()