diff --git a/Makefile b/Makefile new file mode 100755 index 0000000..eaa6648 --- /dev/null +++ b/Makefile @@ -0,0 +1,9 @@ +.PHONY: all libeasynn easynn_test + +all: libeasynn easynn_test + +libeasynn: + g++ -Wall src/*.cpp -fPIC -O -g -shared -o libeasynn.so + +easynn_test: libeasynn + g++ -Wall easynn_test.cpp -g -lm -L. -Wl,-rpath=. -leasynn -o easynn_test \ No newline at end of file diff --git a/easynn.py b/easynn.py new file mode 100755 index 0000000..239d50e --- /dev/null +++ b/easynn.py @@ -0,0 +1,145 @@ +class Expr: + next_id = 0 + + def __init__(self, op, inputs): + self.op = op + self.inputs = inputs + self.id = Expr.next_id + Expr.next_id += 1 + + if not isinstance(op, Op): + raise Exception("Not an operator: %s" % op) + + def __dfs_post(self, ids, visitor): + ids[self.id] = True + for expr in self.inputs: + if expr.id in ids: + continue + expr.__dfs_post(ids, visitor) + visitor(self) + + def statements(self): + lines = [] + self.__dfs_post({}, lambda that: lines.append("%s" % that)) + return "\n".join(lines) + + def __str__(self): + args = ",".join(["t%d" % expr.id for expr in self.inputs]) + return "t%d = %s(%s)" % (self.id, self.op, args) + + def __promote(r): + if isinstance(r, Expr): + return r + else: + return Const(r) + + def __add__(self, r): + return Op("", "Add", 2, {})(self, Expr.__promote(r)) + + def __sub__(self, r): + return Op("", "Sub", 2, {})(self, Expr.__promote(r)) + + def __mul__(self, r): + return Op("", "Mul", 2, {})(self, Expr.__promote(r)) + + def __neg__(self): + return Op("", "Neg", 1, {})(self) + + def compile(self, builder): + self.__dfs_post({}, lambda that: builder.append(that)) + return builder.build() + + def resolve(self, parameters): + self.__dfs_post({}, lambda that: that.op.resolve(parameters)) + return self + + +class Op: + def __init__(self, name, op_type, num_args, parameters): + self.name = name + self.op_type = op_type + self.num_args = num_args + self.parameters = parameters + + def __call__(self, *inputs): + if self.num_args >= 0 and self.num_args != len(inputs): + raise Exception("%s: need %d arguments but found %d" % (self, self.num_args, len(inputs))) + for i, expr in enumerate(inputs): + if not isinstance(expr, Expr): + raise Exception("%s: arg %d is not an expression: %s" % (self, i, expr)) + return Expr(self, inputs) + + def __str__(self): + name = "%s.%s" % (self.name, self.op_type) + if len(self.parameters) == 0: + return name + params = ",".join(["%s=%s" % (k, v.shape if hasattr(v, "shape") else v) for k, v in self.parameters.items()]) + return "%s[%s]" % (name, params) + + def resolve(self, parameters): + if self.name == "": + return + for k, v in parameters.items(): + if k.startswith(self.name+"."): + self.parameters[k[len(self.name)+1:]] = v + + +def Const(c): + if isinstance(c, (int, float)): + c = float(c) + elif hasattr(c, "shape"): + c = c.astype(float) + else: + raise Exception("Const must be float or int or ndarray: %s" % c) + + return Expr(Op("", "Const", 0, { + "value": c + }), []) + + +def Input(n): + return Expr(Op(n, "Input", 0, {}), []) + + +def Input2d(n, h, w, ic): + return Expr(Op(n, "Input2d", 0, { + "height": h, + "width": w, + "in_channels": ic + }), []) + + +def MaxPool2d(k, s): + return Op("", "MaxPool2d", 1, { + "kernel_size": k, + "stride": s + }) + + +def ReLU(): + return Op("", "ReLU", 1, {}) + + +def Flatten(): + return Op("", "Flatten", 1, {}) + + +def Conv2d(n, ic, oc, k, p = 0): + return Op(n, "Conv2d", 1, { + "in_channels": ic, + "out_channels": oc, + "kernel_size": k, + "padding": p + }) + + +def 
Linear(n, i, o): + return Op(n, "Linear", 1, { + "in_features": i, + "out_features": o + }) + + +def Show(): + return Op("", "Show", 1, {}) + diff --git a/easynn_cpp.py b/easynn_cpp.py new file mode 100755 index 0000000..50bb673 --- /dev/null +++ b/easynn_cpp.py @@ -0,0 +1,130 @@ +import numpy as np +import ctypes +import time + +_libeasynn = ctypes.CDLL("./libeasynn.so") + +_libeasynn.create_program.restype = ctypes.c_void_p + +_libeasynn.append_expression.argtypes = [ + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.c_void_p, + ctypes.c_int +] + +_libeasynn.add_op_param_double.argtypes = [ + ctypes.c_void_p, + ctypes.c_char_p, + ctypes.c_double +] + +_libeasynn.add_op_param_ndarray.argtypes = [ + ctypes.c_void_p, + ctypes.c_char_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p +] + +_libeasynn.build.restype = ctypes.c_void_p +_libeasynn.build.argtypes = [ + ctypes.c_void_p +] + +_libeasynn.add_kwargs_double.argtypes = [ + ctypes.c_void_p, + ctypes.c_char_p, + ctypes.c_double +] + +_libeasynn.add_kwargs_ndarray.argtypes = [ + ctypes.c_void_p, + ctypes.c_char_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p +] + +_libeasynn.execute.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p +] + + +def check_and_raise(ret): + if ret != 0: + raise Exception("libeasynn error: code %d" % ret) + + +def to_float_or_ndarray(c_dim, c_shape, c_data): + if c_dim.value == 0: + return c_data[0] + shape = [c_shape[k] for k in range(c_dim.value)] + N = 1 + for s in shape: + N *= s + print("shape =", shape, "N =", N) + flat = np.array([c_data[i] for i in range(N)]) + return flat.reshape(shape) + + +class Eval: + def __init__(self, evaluation): + self.evaluation = evaluation + + def __call__(self, **kwargs): + start = time.time() + for k, v in kwargs.items(): + if isinstance(v, (int, float)): + _libeasynn.add_kwargs_double( + self.evaluation, k.encode(), ctypes.c_double(v)) + elif hasattr(v, "shape"): + v = np.require(v, dtype=np.double, requirements=['C', 'O', 'W', 'A']) + _libeasynn.add_kwargs_ndarray( + self.evaluation, k.encode(), v.ndim, v.ctypes.shape, + v.ctypes.data) + else: + raise Exception("%s: kwargs must be float or int or ndarray" % k) + c_dim = ctypes.c_int() + c_shape = ctypes.POINTER(ctypes.c_size_t)() + c_data = ctypes.POINTER(ctypes.c_double)() + check_and_raise(_libeasynn.execute(self.evaluation, + ctypes.byref(c_dim), ctypes.byref(c_shape), ctypes.byref(c_data))) + res = to_float_or_ndarray(c_dim, c_shape, c_data) + t = time.time()-start + if t > 0.1: + print("c++ time %.2f" % t) + return res + + +class Builder: + def __init__(self): + self.program = _libeasynn.create_program() + + def append(self, expr): + inputs = [ex.id for ex in expr.inputs] + num_inputs = len(inputs) + op = expr.op + _libeasynn.append_expression( + self.program, expr.id, + op.name.encode(), op.op_type.encode(), + (ctypes.c_int*num_inputs)(*inputs), num_inputs) + for k, v in op.parameters.items(): + if isinstance(v, (int, float)): + check_and_raise(_libeasynn.add_op_param_double( + self.program, k.encode(), ctypes.c_double(v))) + elif hasattr(v, "shape"): + v = np.require(v, dtype=np.double, requirements=['C', 'O', 'W', 'A']) + check_and_raise(_libeasynn.add_op_param_ndarray( + self.program, k.encode(), v.ndim, v.ctypes.shape, + v.ctypes.data)) + else: + raise Exception("%s: op params must be float or int or ndarray: %s" % (expr, k)) + + def build(self): + return Eval(_libeasynn.build(self.program)) diff --git a/easynn_golden.py 
b/easynn_golden.py new file mode 100755 index 0000000..cf620ef --- /dev/null +++ b/easynn_golden.py @@ -0,0 +1,189 @@ +import numpy as np +import time + + +def Input(expr, op, args, **kwargs): + if op.name in kwargs: + c = kwargs[op.name] + if isinstance(c, (int, float)): + return float(c) + elif hasattr(c, "shape"): + return c.astype(float) + else: + raise Exception("%s: Input must be float or int or ndarray: %s" % (expr, c)) + else: + raise Exception("%s: missing input" % expr) + + +def Input2d(expr, op, args, **kwargs): + if not op.name in kwargs: + raise Exception("%s: missing input" % expr) + imgs = kwargs[op.name] + if not hasattr(imgs, "shape"): + raise Exception("%s: Input must be ndarray: %s" % (expr, imgs)) + if any([ + len(imgs.shape) != 4, + imgs.shape[1] != op.parameters["height"], + imgs.shape[2] != op.parameters["width"], + imgs.shape[3] != op.parameters["in_channels"]]): + raise Exception("%s: Invalid input size: %s" % (expr, imgs.shape)) + # NHWC => NCHW + return imgs.astype(float).transpose(0,3,1,2) + + +def Const(expr, op, args, **kwargs): + return op.parameters["value"] + + +def Neg(expr, op, args, **kwargs): + return -args[0] + + +def Add(expr, op, args, **kwargs): + a = args[0] + b = args[1] + if not hasattr(a, "shape") and not hasattr(b, "shape"): + return a+b + elif hasattr(a, "shape") and hasattr(b, "shape"): + if a.shape != b.shape: + raise Exception("%s: size mismatch: %s+%s" % (expr, a.shape, b.shape)) + return a+b + else: + raise Exception("%s: cannot mix scalar and ndarray" % expr) + + +def Sub(expr, op, args, **kwargs): + a = args[0] + b = args[1] + if not hasattr(a, "shape") and not hasattr(b, "shape"): + return a-b + elif hasattr(a, "shape") and hasattr(b, "shape"): + if a.shape != b.shape: + raise Exception("%s: size mismatch: %s-%s" % (expr, a.shape, b.shape)) + return a-b + else: + raise Exception("%s: cannot mix scalar and ndarray" % expr) + + +def Mul(expr, op, args, **kwargs): + a = args[0] + b = args[1] + if not hasattr(a, "shape") or not hasattr(b, "shape"): + return a*b + else: + if len(a.shape) != 2 or len(b.shape) != 2: + raise Exception("%s: matmul only: %s*%s" % (expr, a.shape, b.shape)) + if a.shape[1] != b.shape[0]: + raise Exception("%s: size mismatch: %s*%s" % (expr, a.shape, b.shape)) + return np.matmul(a, b) + + +def Flatten(expr, op, args, **kwargs): + x = args[0] + if not hasattr(x, "shape"): + raise Exception("%s: ndarray only: %s" % (expr, imgs)) + return x.reshape((x.shape[0], -1)) + + +def ReLU(expr, op, args, **kwargs): + x = args[0] + return x*(x > 0) + + +def Linear(expr, op, args, **kwargs): + x = args[0] + if not hasattr(x, "shape"): + raise Exception("%s: ndarray only: %s" % (expr, x)) + if "weight" not in op.parameters or "bias" not in op.parameters: + raise Exception("%s: missing weight or bias" % expr) + weight = op.parameters["weight"] + bias = op.parameters["bias"] + if not hasattr(weight, "shape") or not hasattr(bias, "shape"): + raise Exception("%s: ndarray only for weight or bias" % expr) + in_features = op.parameters["in_features"] + out_features = op.parameters["out_features"] + if any([ + len(x.shape) != 2, + x.shape[1] != in_features, + weight.shape != (out_features, in_features), + bias.shape != (out_features,)]): + raise Exception("%s: size mismatch: %s*%s+%s" % (expr, weight.shape, x.shape, bias.shape)) + return np.einsum("ni,oi->no", x, weight, + optimize = "optimal")+bias.reshape((1, out_features)) + + +def MaxPool2d(expr, op, args, **kwargs): + x = args[0] + if not hasattr(x, "shape"): + raise Exception("%s: 
ndarray only: %s" % (expr, x)) + kernel_size = op.parameters["kernel_size"] + stride = op.parameters["stride"] + if kernel_size != stride: + raise Exception("%s: kernel_size != stride" % expr) + if any([ + len(x.shape) != 4, + x.shape[2]%stride != 0, + x.shape[3]%stride != 0]): + raise Exception("%s: size mismatch: %s" % (expr, x.shape)) + new_shape = (x.shape[0], x.shape[1], x.shape[2]//stride, stride, x.shape[3]//stride, stride) + return np.nanmax(x.reshape(new_shape), axis = (3,5)) + + +def Conv2d(expr, op, args, **kwargs): + x = args[0] + if not hasattr(x, "shape"): + raise Exception("%s: ndarray only: %s" % (expr, x)) + if "weight" not in op.parameters or "bias" not in op.parameters: + raise Exception("%s: missing weight or bias" % expr) + weight = op.parameters["weight"] + bias = op.parameters["bias"] + in_channels = op.parameters["in_channels"] + out_channels = op.parameters["out_channels"] + kernel_size = op.parameters["kernel_size"] + padding = op.parameters["padding"] + if any([ + len(x.shape) != 4, + x.shape[1] != in_channels, + weight.shape != (out_channels, in_channels, kernel_size, kernel_size), + bias.shape != (out_channels,)]): + raise Exception("%s: size mismatch: %s" % (expr, x.shape)) + if padding != 0: + tmp = np.zeros((x.shape[0], x.shape[1], x.shape[2]+2*padding, x.shape[3]+2*padding)) + tmp[:, :, 1:-2, 1:-2] = x + x = tmp + conv_shape = x.shape[:2]+(x.shape[2]+1-kernel_size, x.shape[3]+1-kernel_size, kernel_size, kernel_size) + conv_strides = x.strides+x.strides[2:] + conv = np.lib.stride_tricks.as_strided(x, shape = conv_shape, strides = conv_strides, writeable = False) + return np.einsum("nihwyx,oiyx->nohw", conv, weight, + optimize = "optimal")+bias.reshape((1, out_channels, 1, 1)) + + +class Eval: + def __init__(self, program): + self.program = program + + def __call__(self, **kwargs): + start = time.time() + values = {} + for expr in self.program: + args = [values[ex.id] for ex in expr.inputs] + if expr.op.op_type not in globals(): + raise Exception("%s: not implemented" % expr) + values[expr.id] = globals()[expr.op.op_type](expr, expr.op, args, **kwargs) + #print("numpy op", expr.op.op_type, "time %.2f" % (time.time()-start)) + res = values[self.program[-1].id] + t = time.time()-start + if t > 0.1: + print("numpy time %.2f" % t) + return res + + +class Builder: + def __init__(self): + self.program = [] + + def append(self, expr): + self.program.append(expr) + + def build(self): + return Eval(self.program) diff --git a/easynn_test.cpp b/easynn_test.cpp new file mode 100755 index 0000000..db9a061 --- /dev/null +++ b/easynn_test.cpp @@ -0,0 +1,47 @@ +/** + * A simple test program helps you to debug your easynn implementation. 
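 + * It appends a single "Input" expression named "a", binds a = 5 through
 + * add_kwargs_double, runs execute, and prints the scalar result; the
 + * commented-out blocks below can be re-enabled to exercise Add, Sub and Mul.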
+ */ + +#include +#include "src/libeasynn.h" + +int main() +{ + program *prog = create_program(); + + int inputs0[] = {}; + append_expression(prog, 0, "a", "Input", inputs0, 0); + + //int inputs1[] = {0, 0}; + //append_expression(prog, 1, "", "Add", inputs1, 2); + + //int inputs2[] = {2, 2}; + //append_expression(prog, 2, "", "Mul", inputs2, 2); + + //int inputs3[] = {10, 5}; + //append_expression(prog, 3, "", "Sub", inputs3, 2); + + + evaluation *eval = build(prog); + add_kwargs_double(eval, "a", 5); + + + // tensor tests + // it is easier for me + // to just compile the library and run the python grading tests than to use this program + int dim = 0; + size_t *shape = nullptr; + double *data = nullptr; + if (execute(eval, &dim, &shape, &data) != 0) + { + printf("evaluation fails\n"); + return -1; + } + + if (dim == 0) + printf("res = %f\n", data[0]); + else + printf("result as tensor is not supported yet\n"); + + return 0; +} diff --git a/ece449.code-workspace b/ece449.code-workspace new file mode 100755 index 0000000..aeba41b --- /dev/null +++ b/ece449.code-workspace @@ -0,0 +1,12 @@ +{ + "folders": [ + { + "path": "." + } + ], + "settings": { + "files.associations": { + "array": "cpp" + } + } +} \ No newline at end of file diff --git a/grade.py b/grade.py new file mode 100755 index 0000000..911939f --- /dev/null +++ b/grade.py @@ -0,0 +1,30 @@ +import traceback +import multiprocessing + +def grade_all(name, begin, end, funcs): + grade = 0 + for q in range(begin, end): + def func(): + exit(0 if funcs["grade_Q%d" % q]() else 1) + try: + p = multiprocessing.Process(target = func) + p.start() + p.join(300) + if p.is_alive(): + p.terminate() + print("============Q%d timeout!============\n" % q, flush = True) + elif p.exitcode == 0: + print("============Q%d passed!============\n" % q, flush = True) + grade += 1 + else: + print("============Q%d failed!============\n" % q, flush = True) + except Exception as e: + print("============Q%d failed!============" % q, flush = True) + print(traceback.format_exc()) + print("Grading result: %d functions passed" % grade) + + print("*************************************************************") + print("* You may receive 0 points unless your code tests correctly *") + print("* Please commit and push your code at least daily. 
*") + print("* DO NOT CHANGE any grade_px.py for grading *") + print("*************************************************************") diff --git a/grade_p1.py b/grade_p1.py new file mode 100755 index 0000000..d4c0a20 --- /dev/null +++ b/grade_p1.py @@ -0,0 +1,104 @@ +import numpy as np +import easynn as nn +import easynn_golden as golden +import grade +import prj01 + + +# create +def grade_Q1(): + a = prj01.Q1() + if not hasattr(a, "shape"): + return False + if a.shape != (10, 5): + return False + return all([a[i, j] == i+j for i in range(10) for j in range(5)]) + + +# add +def grade_Q2(): + a = np.random.rand(100, 10) + b = np.random.rand(100, 10) + c = prj01.Q2(a, b) + if not hasattr(c, "shape"): + return False + if c.shape != (100, 10): + return False + return all([abs(c[i, j]-a[i, j]-b[i, j]) < 1e-9 for i in range(100) for j in range(10)]) + + +# mul +def grade_Q3(): + a = np.array([[i for i in range(10)]]) + b = a.transpose() + c = prj01.Q3(a, b) + d = prj01.Q3(b, a) + if not hasattr(c, "shape") or not hasattr(d, "shape"): + return False + if c.shape != (1, 1) or d.shape != (10, 10): + return False + return c[0, 0] == 285 and all([d[i, j] == i*j for i in range(10) for j in range(10)]) + + +# max +def grade_Q4(): + a = np.random.rand(100, 10) + b = prj01.Q4(a) + if len(b) != 100: + return False + return all([a[i, b[i]] >= a[i, j] for i in range(100) for j in range(10)]) + + +# solve +def grade_Q5(): + A = -np.random.rand(100, 100)+np.diag([100]*100) + b = np.random.rand(100, 1) + x = prj01.Q5(A, b) + Ax = prj01.Q3(A, x) + return np.allclose(Ax, b) + + +# a+b +def grade_Q6(): + a = np.random.rand(100, 10) + b = np.random.rand(100, 10) + c = prj01.Q6().compile(golden.Builder())(a = a, b = b) + return np.allclose(c, prj01.Q2(a, b)) + + +# a+b*c +def grade_Q7(): + a = np.random.rand(100, 50) + b = np.random.rand(100, 10) + c = np.random.rand(10, 50) + d = prj01.Q7().compile(golden.Builder())(a = a, b = b, c = c) + return np.allclose(d, prj01.Q2(a, prj01.Q3(b, c))) + + +# Ax+b +def grade_Q8(): + A = np.random.rand(100, 100) + x = np.random.rand(100, 1) + b = np.random.rand(100, 1) + y = prj01.Q8(A, b).compile(golden.Builder())(x = x) + return np.allclose(y, prj01.Q2(prj01.Q3(A, x), b)) + + +# x**n +def grade_Q9(): + x = np.random.rand(100, 100) + y = prj01.Q9(8).compile(golden.Builder())(x = x) + x2 = prj01.Q3(x, x) + x4 = prj01.Q3(x2, x2) + x8 = prj01.Q3(x4, x4) + return np.allclose(y, x8) + + +# |x| +def grade_Q10(): + x = np.random.randn(100, 100) + y = prj01.Q10().compile(golden.Builder())(x = x) + return np.allclose(y, np.abs(x)) + + +grade.grade_all("p1", 1, 11, globals()) diff --git a/grade_p2.py b/grade_p2.py new file mode 100755 index 0000000..f7ea4f2 --- /dev/null +++ b/grade_p2.py @@ -0,0 +1,89 @@ +import numpy as np +import easynn as nn +import easynn_golden as golden +import easynn_cpp as cpp +import grade + + +def random_kwargs(kwargs): + return {k: np.random.random(shape) if shape != None else np.random.random() for k, shape in kwargs.items()} + + +def is_same(p, n, **kwargs): + e0 = p.compile(golden.Builder()) + e1 = p.compile(cpp.Builder()) + nkwargs = [random_kwargs(kwargs) for i in range(n)] + return all([np.allclose(e0(**nkwargs[i]), e1(**nkwargs[i])) for i in range(n)]) + + +def grade_Q1(): + x = nn.Input("x") + return is_same(x, 1, x = None) + + +def grade_Q2(): + c = nn.Const(np.random.random()) + return is_same(c, 1) + + +def grade_Q3(): + x = nn.Input("x") + c = nn.Const(np.random.random()) + y = x+c + return is_same(y, 1, x = None) + + +def grade_Q4(): + x = 
nn.Input("x") + c = nn.Const(np.random.random()) + y = x*c + return is_same(y, 1, x = None) + + +def grade_Q5(): + x = nn.Input("x") + y = nn.Input("y") + z = x+y + return is_same(z, 1, x = None, y = None) + + +def grade_Q6(): + x = nn.Input("x") + y = nn.Input("y") + z = x*y + return is_same(z, 1, x = None, y = None) + + +def grade_Q7(): + x = nn.Input("x") + y = nn.Input("y") + z = nn.Input("z") + r = x+y*z + return is_same(r, 1, x = None, y = None, z = None) + + +def grade_Q8(): + x = nn.Input("x") + y = nn.Input("y") + z = nn.Input("z") + r = (x-y)*z + return is_same(r, 1, x = None, y = None, z = None) + + +def grade_Q9(): + x = nn.Input("x") + y = nn.Input("y") + c = nn.Const(np.random.random()) + z = x*y-x*c-y*c+c*c + return is_same(z, 10, x = None, y = None) + + +def grade_Q10(): + x = nn.Input("x") + y = nn.Input("y") + c = nn.Const(np.random.random()) + z = x*y-x*c-y*c+c*c + return is_same(z, 10, x = None, y = None) + + +grade.grade_all("p2", 1, 11, globals()) diff --git a/grade_p3.py b/grade_p3.py new file mode 100755 index 0000000..2ae9d35 --- /dev/null +++ b/grade_p3.py @@ -0,0 +1,99 @@ +import numpy as np +import easynn as nn +import easynn_golden as golden +import easynn_cpp as cpp +import grade + + +def random_kwargs(kwargs): + return {k: np.random.random(shape) if shape != None else np.random.random() for k, shape in kwargs.items()} + + +def is_same(p, n, **kwargs): + e0 = p.compile(golden.Builder()) + e1 = p.compile(cpp.Builder()) + nkwargs = [random_kwargs(kwargs) for i in range(n)] + return all([np.allclose(e0(**nkwargs[i]), e1(**nkwargs[i])) for i in range(n)]) + + +def grade_Q1(): + x = nn.Input("x") + return is_same(x, 1, x = (9,)) and is_same(x, 1, x = (9, 9)) + + +def grade_Q2(): + c1 = nn.Const(np.random.random((10,))) + c2 = nn.Const(np.random.random((10, 10))) + return is_same(c1, 1) and is_same(c2, 1) + + +def grade_Q3(): + x = nn.Input("x") + y = nn.Input("y") + z = x+y + return all([ + is_same(z, 1, x = (11,), y = (11,)), + is_same(z, 1, x = (11, 12), y = (11, 12)), + is_same(z, 1, x = (11, 12, 13), y = (11, 12, 13)), + is_same(z, 1, x = (11, 12, 13, 14), y = (11, 12, 13, 14))]) + + +def grade_Q4(): + x = nn.Input("x") + y = nn.Input("y") + z = x-y + return all([ + is_same(z, 1, x = (11,), y = (11,)), + is_same(z, 1, x = (11, 12), y = (11, 12)), + is_same(z, 1, x = (11, 12, 13), y = (11, 12, 13)), + is_same(z, 1, x = (11, 12, 13, 14), y = (11, 12, 13, 14))]) + + +def grade_Q5(): + x = nn.Input("x") + y = nn.Input("y") + z = x*y + return is_same(z, 1, x = (11, 12), y = (12, 13)) + + +def grade_Q6(): + x = nn.Input("x") + y = nn.Input("y") + z = x*y + return is_same(z, 1, x = None, y = (12, 13)) and is_same(z, 1, x = (11, 12), y = None) + + +def grade_Q7(): + x = nn.Input("x") + y = nn.Input("y") + z = nn.Input("z") + r = x+y*z + return is_same(r, 1, x = (11, 13), y = (11, 12), z = (12, 13)) + + +def grade_Q8(): + x = nn.Input("x") + y = nn.Input("y") + z = nn.Input("z") + r = (x-y)*z + return is_same(r, 1, x = (11, 12), y = (11, 12), z = (12, 13)) + + +def grade_Q9(): + x = nn.Input("x") + y = nn.Input("y") + z = nn.Input("z") + r = x*y*z + return is_same(r, 1, x = (11, 12), y = (12, 13), z = (13, 14)) + + +def grade_Q10(): + x = nn.Input("x") + y = nn.Input("y") + c1 = nn.Const(np.random.random((11, 12))) + c2 = nn.Const(np.random.random((12, 13))) + z = x*y-x*c2-c1*y+c1*c2 + return is_same(z, 1, x = (11, 12), y = (12, 13)) + + +grade.grade_all("p3", 1, 11, globals()) diff --git a/grade_p4.py b/grade_p4.py new file mode 100755 index 0000000..90f7a4e --- 
/dev/null +++ b/grade_p4.py @@ -0,0 +1,151 @@ +import numpy as np +import easynn as nn +import easynn_golden as golden +import easynn_cpp as cpp +import grade + + +def random_kwargs(kwargs): + return {k: np.random.random(shape) if shape != None else np.random.random() for k, shape in kwargs.items()} + + +def is_same(p, n, **kwargs): + e0 = p.compile(golden.Builder()) + e1 = p.compile(cpp.Builder()) + nkwargs = [random_kwargs(kwargs) for i in range(n)] + return all([np.allclose(e0(**nkwargs[i]), e1(**nkwargs[i])) for i in range(n)]) + + +def grade_Q1(): + relu = nn.ReLU() + x = relu(nn.Input("x")) + return is_same(x, 1, x = (10, 11, 12, 13)) + + +def grade_Q2(): + flatten = nn.Flatten() + x = flatten(nn.Input("x")) + return is_same(x, 1, x = (10, 11, 12, 13)) + + +def grade_Q3(): + x = nn.Input2d("images", 10, 11, 3) + return is_same(x, 1, images = (50, 10, 11, 3)) + + +def grade_Q4(): + f = nn.Linear("f", 100, 10) + x = f(nn.Input("x")) + x.resolve({ + "f.weight": np.random.random((10, 100)), + "f.bias": np.random.random((10,))}) + return is_same(x, 1, x = (50, 100)) + + +def grade_Q5(): + pool = nn.MaxPool2d(3, 3) + x = pool(nn.Input2d("x", 12, 15, 3)) + return is_same(x, 1, x = (10, 12, 15, 3)) + + +def grade_Q6(): + c = nn.Conv2d("c", 3, 16, 5) + x = c(nn.Input2d("x", 15, 20, 3)) + x.resolve({ + "c.weight": np.random.random((16, 3, 5, 5)), + "c.bias": np.random.random((16,)) + }) + return is_same(x, 1, x = (10, 15, 20, 3)) + + +def grade_Q7(): + relu = nn.ReLU() + flatten = nn.Flatten() + f1 = nn.Linear("f1", 28*28, 100) + f2 = nn.Linear("f2", 100, 10) + x = nn.Input2d("images", 28, 28, 1) + x = flatten(x) + x = f2(relu(f1(x))) + x.resolve(np.load("msimple_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"][:20] + + infer0 = x.compile(golden.Builder()) + infer1 = x.compile(cpp.Builder()) + logit0 = infer0(images = images) + logit1 = infer1(images = images) + return np.allclose(logit0, logit1) + + +def grade_Q8(): + relu = nn.ReLU() + flatten = nn.Flatten() + f1 = nn.Linear("f1", 28*28, 100) + f2 = nn.Linear("f2", 100, 10) + x = nn.Input2d("images", 28, 28, 1) + x = flatten(x) + x = f2(relu(f1(x))) + x.resolve(np.load("msimple_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"][:1000] + + infer0 = x.compile(golden.Builder()) + infer1 = x.compile(cpp.Builder()) + label0 = infer0(images = images).argmax(axis = 1) + label1 = infer1(images = images).argmax(axis = 1) + return np.allclose(label0, label1) + + +def grade_Q9(): + pool = nn.MaxPool2d(2, 2) + relu = nn.ReLU() + flatten = nn.Flatten() + + x = nn.Input2d("images", 28, 28, 1) + c1 = nn.Conv2d("c1", 1, 8, 3) # 28->26 + c2 = nn.Conv2d("c2", 8, 8, 3) # 26->24 + x = pool(relu(c2(relu(c1(x))))) # 24->12 + c3 = nn.Conv2d("c3", 8, 16, 3) # 12->10 + c4 = nn.Conv2d("c4", 16, 16, 3) # 10->8 + x = pool(relu(c4(relu(c3(x))))) # 8->4 + f = nn.Linear("f", 16*4*4, 10) + x = f(flatten(x)) + + x.resolve(np.load("mnist_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"][:20] + + infer0 = x.compile(golden.Builder()) + infer1 = x.compile(cpp.Builder()) + logit0 = infer0(images = images) + logit1 = infer1(images = images) + return np.allclose(logit0, logit1) + + +def grade_Q10(): + pool = nn.MaxPool2d(2, 2) + relu = nn.ReLU() + flatten = nn.Flatten() + + x = nn.Input2d("images", 28, 28, 1) + c1 = nn.Conv2d("c1", 1, 8, 3) # 28->26 + c2 = nn.Conv2d("c2", 8, 8, 3) # 26->24 + x = pool(relu(c2(relu(c1(x))))) # 24->12 + c3 = nn.Conv2d("c3", 8, 16, 3) # 
12->10 + c4 = nn.Conv2d("c4", 16, 16, 3) # 10->8 + x = pool(relu(c4(relu(c3(x))))) # 8->4 + f = nn.Linear("f", 16*4*4, 10) + x = f(flatten(x)) + + x.resolve(np.load("mnist_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"][:1000] + + infer0 = x.compile(golden.Builder()) + infer1 = x.compile(cpp.Builder()) + label0 = infer0(images = images).argmax(axis = 1) + label1 = infer1(images = images).argmax(axis = 1) + return np.allclose(label0, label1) + + +grade.grade_all("p4", 1, 11, globals()) diff --git a/grade_p5.py b/grade_p5.py new file mode 100755 index 0000000..eecc8a0 --- /dev/null +++ b/grade_p5.py @@ -0,0 +1,79 @@ +import numpy as np +import easynn as nn +import easynn_golden as golden +import grade + +def grade_Q1(): + relu = nn.ReLU() + flatten = nn.Flatten() + f1 = nn.Linear("f1", 28*28, 32) + f2 = nn.Linear("f2", 32, 10) + x = nn.Input2d("images", 28, 28, 1) + x = flatten(x) + x = f2(relu(f1(x))) + x.resolve(np.load("p5_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"] + labels = mnist_test["labels"] + + infer = x.compile(golden.Builder()) + pred_labels = infer(images = images).argmax(axis = 1) + count = sum(labels == pred_labels) + print(count) + return count > 8500 + +def grade_Q2(): + relu = nn.ReLU() + flatten = nn.Flatten() + f1 = nn.Linear("f1", 28*28, 32) + f2 = nn.Linear("f2", 32, 10) + x = nn.Input2d("images", 28, 28, 1) + x = flatten(x) + x = f2(relu(f1(x))) + x.resolve(np.load("p5_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"] + labels = mnist_test["labels"] + + infer = x.compile(golden.Builder()) + pred_labels = infer(images = images).argmax(axis = 1) + count = sum(labels == pred_labels) + return count > 9000 + +def grade_Q3(): + relu = nn.ReLU() + flatten = nn.Flatten() + f1 = nn.Linear("f1", 28*28, 32) + f2 = nn.Linear("f2", 32, 10) + x = nn.Input2d("images", 28, 28, 1) + x = flatten(x) + x = f2(relu(f1(x))) + x.resolve(np.load("p5_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"] + labels = mnist_test["labels"] + + infer = x.compile(golden.Builder()) + pred_labels = infer(images = images).argmax(axis = 1) + count = sum(labels == pred_labels) + return count > 9300 + +def grade_Q4(): + relu = nn.ReLU() + flatten = nn.Flatten() + f1 = nn.Linear("f1", 28*28, 32) + f2 = nn.Linear("f2", 32, 10) + x = nn.Input2d("images", 28, 28, 1) + x = flatten(x) + x = f2(relu(f1(x))) + x.resolve(np.load("p5_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"] + labels = mnist_test["labels"] + + infer = x.compile(golden.Builder()) + pred_labels = infer(images = images).argmax(axis = 1) + count = sum(labels == pred_labels) + return count > 9500 + +grade.grade_all("p5", 1, 5, globals()) diff --git a/grade_p6.py b/grade_p6.py new file mode 100755 index 0000000..3b8e2b5 --- /dev/null +++ b/grade_p6.py @@ -0,0 +1,91 @@ +import numpy as np +import easynn as nn +import easynn_golden as golden +import grade + +def grade_Q1(): + pool = nn.MaxPool2d(2, 2) + relu = nn.ReLU() + flatten = nn.Flatten() + x = nn.Input2d("images", 28, 28, 1) + c1 = nn.Conv2d("c1", 1, 8, 5) # 28->24 + x = pool(relu(c1(x))) # 24->12 + c2 = nn.Conv2d("c2", 8, 8, 5) # 12->8 + x = pool(relu(c2(x))) # 8->4 + f = nn.Linear("f", 8*4*4, 10) + x = f(flatten(x)) + x.resolve(np.load("p6_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"] + labels = mnist_test["labels"] + + infer = x.compile(golden.Builder()) + 
pred_labels = infer(images = images).argmax(axis = 1) + count = sum(labels == pred_labels) + print(count) + return count > 8500 + +def grade_Q2(): + pool = nn.MaxPool2d(2, 2) + relu = nn.ReLU() + flatten = nn.Flatten() + x = nn.Input2d("images", 28, 28, 1) + c1 = nn.Conv2d("c1", 1, 8, 5) # 28->24 + x = pool(relu(c1(x))) # 24->12 + c2 = nn.Conv2d("c2", 8, 8, 5) # 12->8 + x = pool(relu(c2(x))) # 8->4 + f = nn.Linear("f", 8*4*4, 10) + x = f(flatten(x)) + x.resolve(np.load("p6_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"] + labels = mnist_test["labels"] + + infer = x.compile(golden.Builder()) + pred_labels = infer(images = images).argmax(axis = 1) + count = sum(labels == pred_labels) + return count > 9000 + +def grade_Q3(): + pool = nn.MaxPool2d(2, 2) + relu = nn.ReLU() + flatten = nn.Flatten() + x = nn.Input2d("images", 28, 28, 1) + c1 = nn.Conv2d("c1", 1, 8, 5) # 28->24 + x = pool(relu(c1(x))) # 24->12 + c2 = nn.Conv2d("c2", 8, 8, 5) # 12->8 + x = pool(relu(c2(x))) # 8->4 + f = nn.Linear("f", 8*4*4, 10) + x = f(flatten(x)) + x.resolve(np.load("p6_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"] + labels = mnist_test["labels"] + + infer = x.compile(golden.Builder()) + pred_labels = infer(images = images).argmax(axis = 1) + count = sum(labels == pred_labels) + return count > 9300 + +def grade_Q4(): + pool = nn.MaxPool2d(2, 2) + relu = nn.ReLU() + flatten = nn.Flatten() + x = nn.Input2d("images", 28, 28, 1) + c1 = nn.Conv2d("c1", 1, 8, 5) # 28->24 + x = pool(relu(c1(x))) # 24->12 + c2 = nn.Conv2d("c2", 8, 8, 5) # 12->8 + x = pool(relu(c2(x))) # 8->4 + f = nn.Linear("f", 8*4*4, 10) + x = f(flatten(x)) + x.resolve(np.load("p6_params.npz")) + mnist_test = np.load("mnist_test.npz") + images = mnist_test["images"] + labels = mnist_test["labels"] + + infer = x.compile(golden.Builder()) + pred_labels = infer(images = images).argmax(axis = 1) + count = sum(labels == pred_labels) + return count > 9500 + +grade.grade_all("p6", 1, 5, globals()) diff --git a/mnist_params.npz b/mnist_params.npz new file mode 100755 index 0000000..56bfc1f Binary files /dev/null and b/mnist_params.npz differ diff --git a/mnist_test.npz b/mnist_test.npz new file mode 100755 index 0000000..cd0e111 Binary files /dev/null and b/mnist_test.npz differ diff --git a/mnist_train.npz b/mnist_train.npz new file mode 100755 index 0000000..9067929 Binary files /dev/null and b/mnist_train.npz differ diff --git a/msimple_params.npz b/msimple_params.npz new file mode 100755 index 0000000..9860fe8 Binary files /dev/null and b/msimple_params.npz differ diff --git a/p5_params.npz b/p5_params.npz new file mode 100755 index 0000000..17814a2 Binary files /dev/null and b/p5_params.npz differ diff --git a/prj01.py b/prj01.py new file mode 100755 index 0000000..8df0ca5 --- /dev/null +++ b/prj01.py @@ -0,0 +1,76 @@ +import numpy as np +import easynn as nn + +# Create a numpy array of 10 rows and 5 columns. +# Set the element at row i and column j to be i+j. +def Q1(): + array = np.zeros((10, 5)) + + for i in range(10): + for j in range(5): + array[i, j] = i+j + + return array + +# Add two numpy arrays together. +def Q2(a, b): + return a + b + +# Multiply two 2D numpy arrays using matrix multiplication. +def Q3(a, b): + return np.matmul(a, b) + +# For each row of a 2D numpy array, find the column index +# with the maximum element. Return all these column indices. +def Q4(a): + return [ np.argmax(row) for row in a ] + +# Solve Ax = b. 
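+# (np.linalg.solve handles this directly; it expects A to be square and
+# non-singular, which holds for the diagonally dominant A used in grade_Q5.)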
+def Q5(A, b): + return np.linalg.solve(A, b) + +# Return an EasyNN expression for a+b. +def Q6(): + a = nn.Input('a') + b = nn.Input('b') + c = a+b + return c + +# Return an EasyNN expression for a+b*c. +def Q7(): + a = nn.Input('a') + b = nn.Input('b') + c = nn.Input('c') + d = a+b*c + return d + +# Given A and b, return an EasyNN expression for Ax+b. +def Q8(A, b): + A_const = nn.Const(A) + b_const = nn.Const(b) + x_input = nn.Input("x") + + expr = ((A_const*x_input) + b_const) + + return expr + + +# Given n, return an EasyNN expression for x**n. +def Q9(n): + x = nn.Input("x") + n_const = nn.Const(n) + + expr = x + for _ in range(n - 1): + expr = expr * x + + return expr + +# the element-wise absolute value |x|. +def Q10(): + x = nn.Input("x") + zero = nn.Const(0) + max = nn.ReLU()(x) + min = x - max + abs = max - min + return abs diff --git a/prj05.py b/prj05.py new file mode 100755 index 0000000..cd2b6c9 --- /dev/null +++ b/prj05.py @@ -0,0 +1,274 @@ +import numpy as np +import time +import pdb + +from matplotlib import pyplot + +# save theta to p5_params.npz that can be used by easynn +def save_theta(theta): + f1_W, f1_b, f2_W, f2_b = theta + + np.savez_compressed("p5_params.npz", **{ + "f1.weight": f1_W, + "f1.bias": f1_b, + "f2.weight": f2_W, + "f2.bias": f2_b + }) + + +# initialize theta using uniform distribution [-bound, bound] +# return theta as (f1_W, f1_b, f2_W, f2_b) +def initialize_theta(bound): + f1_W = np.random.uniform(-bound, bound, (32, 784)) + f1_b = np.random.uniform(-bound, bound, 32) + f2_W = np.random.uniform(-bound, bound, (10, 32)) + f2_b = np.random.uniform(-bound, bound, 10) + return (f1_W, f1_b, f2_W, f2_b) + + +# forward: +# x = Flatten(images) +# g = Linear_f1(x) +# h = ReLU(g) +# z = Linear_f2(h) +# return (z, h, g, x) +def forward(images, theta): + # number of samples + N = images.shape[0] + + # unpack theta into f1 and f2 + f1_W, f1_b, f2_W, f2_b = theta + + # x = Flatten(images) + x = images.astype(float).transpose(0,3,1,2).reshape((N, -1)) + + # g = Linear_f1(x) + g = np.zeros((N, f1_b.shape[0])) + for i in range(N): + g[i, :] = np.matmul(f1_W, x[i])+f1_b + + # h = ReLU(g) + h = g*(g > 0) + + # z = Linear_f2(h) + z = np.zeros((N, f2_b.shape[0])) + for i in range(N): + z[i, :] = np.matmul(f2_W, h[i])+f2_b + + return (z, h, g, x) + + +# backprop: +# J = cross entropy between labels and softmax(z) +# return nabla_J +def backprop(labels, theta, z, h, g, x): + # number of samples + N = labels.shape[0] + + # unpack theta into f1 and f2 + f1_W, f1_b, f2_W, f2_b = theta + + # nabla_J consists of partial J to partial f1_W, f1_b, f2_W, f2_b + p_f1_W = np.zeros(f1_W.shape) + p_f1_b = np.zeros(f1_b.shape) + p_f2_W = np.zeros(f2_W.shape) + p_f2_b = np.zeros(f2_b.shape) + + for i in range(N): + # compute the contribution to nabla_J for sample i + + # cross entropy and softmax + # compute partial J to partial z[i] + # scale by 1/N for averaging + expz = np.exp(z[i]-max(z[i])) + p_z = expz/sum(expz)/N + p_z[labels[i]] -= 1/N + + # z = Linear_f2(h) + # compute partial J to partial h[i] + # accumulate partial J to partial f2_W, f2_b + p_h = np.dot(f2_W.T, p_z) + p_f2_W += np.outer(p_z, h[i]) + p_f2_b += p_z + + # h = ReLU(g) + # compute partial J to partial g[i] + p_g = p_h * (g[i] > 0) + + + # g = Linear_f1(x) + # accumulate partial J to partial f1_W, f1_b + p_f1_W += np.outer(p_g, x[i]) + p_f1_b += p_g + + return (p_f1_W, p_f1_b, p_f2_W, p_f2_b) + + +# apply SGD to update theta by nabla_J and the learning rate epsilon +# return updated theta +def update_theta(theta, 
nabla_J, epsilon): + # ToDo: modify code below as needed + #updated_theta = theta + #return updated_theta + f1_W, f1_b, f2_W, f2_b = theta + p_f1_W, p_f1_b, p_f2_W, p_f2_b = nabla_J + + # update the weights and biases for the first layer (f1) + f1_W_updated = f1_W - epsilon * p_f1_W + f1_b_updated = f1_b - epsilon * p_f1_b + + # update the weights and biases for the second layer (f2) + f2_W_updated = f2_W - epsilon * p_f2_W + f2_b_updated = f2_b - epsilon * p_f2_b + + return (f1_W_updated, f1_b_updated, f2_W_updated, f2_b_updated) + +def print_training_hyperparams_for_session(epsilon, batch_size, bound): + print("Starting training session with 10 epochs:") + print("") + print("Hyperparameters:") + print(f"epsilon: {epsilon}") + print(f"bound: {bound}") + print(f"batch_size: {batch_size}") + print("") + print("Results:") + +def plot_epoch(epochs, accuracies, epsilon, batch_size, bound): + pyplot.figure(figsize=(10, 6)) + pyplot.plot(epochs, accuracies, label=f"Epsilon: {epsilon}, Batch Size: {batch_size}, Bound: {bound}") + pyplot.xlabel('Epoch') + pyplot.ylabel('Accuracy') + pyplot.title('Training Accuracy over Epochs') + pyplot.legend() + pyplot.grid(True) + pyplot.show() + +def plot_all_epochs(training_results): + pyplot.figure(figsize=(12, 8)) + + for epochs, accuracies, epsilon, batch_size, bound in training_results: + label = f"Epsilon: {epsilon}, Batch Size: {batch_size}, Bound: {bound}" + pyplot.plot(epochs, accuracies, label=label) + + pyplot.xlabel('Epoch') + pyplot.ylabel('Accuracy') + pyplot.title('Training Accuracy over Epochs for Different Hyperparameters') + pyplot.legend() + pyplot.grid(True) + pyplot.show() + +def plot_table(training_results): + # Setting up the data for the table + cell_text = [] + columns = ['Epoch', 'Accuracy', 'Epsilon', 'Batch Size', 'Bound'] + for result in training_results: + epochs, accuracies, epsilon, batch_size, bound = result + for epoch, accuracy in zip(epochs, accuracies): + cell_text.append([epoch, f"{accuracy:.3f}", epsilon, batch_size, bound]) + + # Determine the figure size needed for the table + figsize = (10, len(cell_text) * 0.2) + fig, ax = pyplot.subplots(figsize=figsize) + ax.axis('tight') + ax.axis('off') + + # Create the table + table = ax.table(cellText=cell_text, colLabels=columns, loc='center', cellLoc='center') + + # Adjust table scale + table.auto_set_font_size(False) + table.set_fontsize(8) + table.auto_set_column_width(col=list(range(len(columns)))) + + pyplot.show() + + +def start_training(epsilon, batch_size, bound, mnist_train): + + # ToDo: set numpy random seed to the last 8 digits of your CWID + np.random.seed(20497299) + + validation_images = mnist_train["images"][:1000] + validation_labels = mnist_train["labels"][:1000] + training_images = mnist_train["images"][1000:] + training_labels = mnist_train["labels"][1000:] + + # hyperparameters + # we can experiment with these values to see if increasing or decreasing + # these values can influence our accuracy + # default values + #bound = 1 # initial weight range + #epsilon = 0.00001 # learning rate + + #print_training_hyperparams_for_session(epsilon, batch_size, bound) + # start training + + accuracies = [] + epochs = [] + start = time.time() + theta = initialize_theta(bound) + batches = training_images.shape[0]//batch_size + for epoch in range(10): + indices = np.arange(training_images.shape[0]) + np.random.shuffle(indices) + for i in range(batches): + batch_images = training_images[indices[i*batch_size:(i+1)*batch_size]] + batch_labels = 
training_labels[indices[i*batch_size:(i+1)*batch_size]] + + z, h, g, x = forward(batch_images, theta) + nabla_J = backprop(batch_labels, theta, z, h, g, x) + theta = update_theta(theta, nabla_J, epsilon) + + # check accuracy using validation examples + z, _, _, _ = forward(validation_images, theta) + pred_labels = z.argmax(axis = 1) + accuracy = sum(pred_labels == validation_labels) / validation_images.shape[0] + accuracies.append(accuracy) + epochs.append(epoch) + #count = sum(pred_labels == validation_labels) + print("epoch %d, accuracy %.3f, time %.2f" % ( + epoch, accuracy, time.time()-start)) + + #plot_epoch(epochs, accuracies, epsilon, batch_size, bound) + + # save the weights to be submitted + save_theta(theta) + + # return this data so we can plot it with matplotlib + return epochs, accuracies + +def main(): + training_results = [] + mnist_train = np.load("mnist_train.npz") + + training_results = [] + + # load training data once + mnist_train = np.load("mnist_train.npz") + + # we can add to this list if we want to test combinations of hyperparameters + hyperparams = [ + (0.00001, 1, 4), # default params + (0.00001, 0.1, 4), + (0.00001, 0.5, 4), + (0.00001, 0.7, 4), + (0.00001, 0.01, 4), + (0.00001, 0.01, 3), + (0.00001, 0.01, 2), + (0.000013, 0.012, 1), + (0.000013, 0.012002899999999983, 1), + (0.000013, 0.01200591999999996, 1), + ] + + for epsilon, bound, batch_size in hyperparams: + epochs, accuracies = start_training(epsilon, batch_size, bound, mnist_train) + training_results.append((epochs, accuracies, epsilon, batch_size, bound)) + + # uncomment if you would like to see plotted results + #plot_all_epochs(training_results) + + # plot table + #plot_table(training_results[9]) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/prj06.py b/prj06.py new file mode 100755 index 0000000..41c6ee2 --- /dev/null +++ b/prj06.py @@ -0,0 +1,374 @@ +import numpy as np +import time +import random +import matplotlib.pyplot as plt + + +# save theta to p6_params.npz that can be used by easynn +def save_theta(theta): + c1_W, c1_b, c2_W, c2_b, f_W, f_b = theta + + np.savez_compressed("p6_params.npz", **{ + "c1.weight": c1_W, + "c1.bias": c1_b, + "c2.weight": c2_W, + "c2.bias": c2_b, + "f.weight": f_W, + "f.bias": f_b + }) + + +# initialize theta using uniform distribution [-bound, bound] +# return theta as (c1_W, c1_b, c2_W, c2_b, f_W, f_b) +def initialize_theta(bound): + c1_W = np.random.uniform(-bound, bound, (8, 1, 5, 5)) + c1_b = np.random.uniform(-bound, bound, 8) + c2_W = np.random.uniform(-bound, bound, (8, 8, 5, 5)) + c2_b = np.random.uniform(-bound, bound, 8) + f_W = np.random.uniform(-bound, bound, (10, 128)) + f_b = np.random.uniform(-bound, bound, 10) + return (c1_W, c1_b, c2_W, c2_b, f_W, f_b) + + +# return out_nchw +def Conv2d(in_nchw, kernel_W, kernel_b): + OC, IC, KH, KW = kernel_W.shape + N, C, H, W = in_nchw.shape + if C != IC or kernel_b.shape != (OC,): + raise Exception("Conv2d size mismatch: %s @ (%s, %s)" % ( + in_nchw.shape, kernel_W.shape, kernel_b.shape)) + + # view in_nchw as a 6D tensor + shape = (N, IC, H-KH+1, W-KW+1, KH, KW) + strides = in_nchw.strides+in_nchw.strides[2:] + data = np.lib.stride_tricks.as_strided(in_nchw, + shape = shape, strides = strides, writeable = False) + # np.einsum("nihwyx,oiyx->nohw", data, kernel_W) + nhwo = np.tensordot(data, kernel_W, ((1,4,5), (1,2,3))) + return nhwo.transpose(0,3,1,2)+kernel_b.reshape((1, OC, 1, 1)) + +# return p_b for the whole batch +def Conv2d_backprop_b(p_out, in_nchw, kernel_W, kernel_b): + 
OC, IC, KH, KW = kernel_W.shape + N, C, H, W = in_nchw.shape + if C != IC or kernel_b.shape != (OC,) or p_out.shape != (N, OC, H-KH+1, W-KW+1): + raise Exception("Conv2d_backprop_b size mismatch: %s = %s @ (%s, %s)" % ( + p_out.shape, in_nchw.shape, kernel_W.shape, kernel_b.shape)) + + return np.einsum("nohw->o", p_out, optimize = "optimal")/N + +# return p_W for the whole batch +def Conv2d_backprop_W(p_out, in_nchw, kernel_W, kernel_b): + OC, IC, KH, KW = kernel_W.shape + N, C, H, W = in_nchw.shape + if C != IC or kernel_b.shape != (OC,) or p_out.shape != (N, OC, H-KH+1, W-KW+1): + raise Exception("Conv2d_backprop_W size mismatch: %s = %s @ (%s, %s)" % ( + p_out.shape, in_nchw.shape, kernel_W.shape, kernel_b.shape)) + + # view in_nchw as a 6D tensor + shape = (N, IC, KH, KW, H-KH+1, W-KW+1) + strides = in_nchw.strides+in_nchw.strides[2:] + data = np.lib.stride_tricks.as_strided(in_nchw, + shape = shape, strides = strides, writeable = False) + # np.einsum("nohw,niyxhw->oiyx", p_out, data) + return np.tensordot(p_out, data, ((0,2,3), (0,4,5)))/N + +# return p_in for the whole batch +def Conv2d_backprop_in(p_out, in_nchw, kernel_W, kernel_b): + OC, IC, KH, KW = kernel_W.shape + N, C, H, W = in_nchw.shape + if C != IC or kernel_b.shape != (OC,) or p_out.shape != (N, OC, H-KH+1, W-KW+1): + raise Exception("Conv2d_backprop_in size mismatch: %s = %s @ (%s, %s)" % ( + p_out.shape, in_nchw.shape, kernel_W.shape, kernel_b.shape)) + + # view p_out as a padded 6D tensor + padded = np.zeros((N, OC, H+KH-1, W+KW-1)) + padded[:, :, KH-1:-KH+1, KW-1:-KW+1] = p_out + shape = (N, IC, H, W, KH, KW) + strides = padded.strides+padded.strides[2:] + data = np.lib.stride_tricks.as_strided(padded, + shape = shape, strides = strides, writeable = False) + # np.einsum("nohwyx,oiyx->nihw", data, kernel_W) + nhwi = np.tensordot(data, kernel_W, ((1,4,5), (0,2,3))) + return nhwi.transpose((0,3,1,2)) + + +# return out_nchw +def MaxPool_2by2(in_nchw): + N, C, H, W = in_nchw.shape + shape = (N, C, H//2, 2, W//2, 2) + return np.nanmax(in_nchw.reshape(shape), axis = (3,5)) + +# return p_in for the whole batch +def MaxPool_2by2_backprop(p_out, out_nchw, in_nchw): + p_in = np.zeros(in_nchw.shape) + p_in[:, :, 0::2, 0::2] = p_out*(out_nchw == in_nchw[:, :, 0::2, 0::2]) + p_in[:, :, 0::2, 1::2] = p_out*(out_nchw == in_nchw[:, :, 0::2, 1::2]) + p_in[:, :, 1::2, 0::2] = p_out*(out_nchw == in_nchw[:, :, 1::2, 0::2]) + p_in[:, :, 1::2, 1::2] = p_out*(out_nchw == in_nchw[:, :, 1::2, 1::2]) + return p_in + + +# forward: +# x = NCHW(images) +# cx = Conv2d_c1(x) +# rx = ReLU(cx) +# y = MaxPool(rx) +# cy = Conv2d_c2(hx) +# ry = ReLU(cy) +# g = MaxPool(ry) +# h = Flatten(g) +# z = Linear_f(h) +# return (z, h, g, ry, cy, y, rx, cx, x) +def forward(images, theta): + # number of samples + N = images.shape[0] + + # unpack theta into c1, c2, and f + c1_W, c1_b, c2_W, c2_b, f_W, f_b = theta + + # x = NCHW(images) + x = images.astype(float).transpose(0,3,1,2) + + # cx = Conv2d_c1(x) + cx = Conv2d(x, c1_W, c1_b) + + # rx = ReLU(cx) + rx = cx*(cx > 0) + + # y = MaxPool(rx) + y = MaxPool_2by2(rx) + + # cy = Conv2d_c2(y) + cy = Conv2d(y, c2_W, c2_b) + + # ry = ReLU(cy) + ry = cy*(cy > 0) + + # g = MaxPool(ry) + g = MaxPool_2by2(ry) + + # h = Flatten(g) + h = g.reshape((N, -1)) + + # z = Linear_f(h) + z = np.zeros((N, f_b.shape[0])) + for i in range(N): + z[i, :] = np.matmul(f_W, h[i])+f_b + + return (z, h, g, ry, cy, y, rx, cx, x) + +def backprop(labels, theta, z, h, g, ry, cy, y, rx, cx, x): + # Number of samples + N = labels.shape[0] + + # Unpack 
theta into c1, c2, and f + c1_W, c1_b, c2_W, c2_b, f_W, f_b = theta + + # Initialize gradients for each parameter + p_c1_W, p_c1_b = np.zeros_like(c1_W), np.zeros_like(c1_b) + p_c2_W, p_c2_b = np.zeros_like(c2_W), np.zeros_like(c2_b) + p_f_W, p_f_b = np.zeros_like(f_W), np.zeros_like(f_b) + # sample-by-sample from z to h + #p_f_W = np.zeros(f_W.shape) + #p_f_b = np.zeros(f_b.shape) + p_h = np.zeros(h.shape) + + # Step 1: Backprop through linear layer + for i in range(N): + # Softmax and cross-entropy gradient + expz = np.exp(z[i] - np.max(z[i])) + softmax = expz / np.sum(expz) + softmax[labels[i]] -= 1 + p_z = softmax / N + + # Backprop through linear layer to compute gradients + p_h[i, :] = np.dot(f_W.T, p_z) + p_f_W += np.outer(p_h[i, :], p_z).T + p_f_b += p_z + + # Average the gradients over the batch + p_f_W /= N + p_f_b /= N + + # Step 2: Backprop through flatten operation + p_g = p_h.reshape(g.shape) + + # Step 3: Backprop through MaxPool layer + p_ry = MaxPool_2by2_backprop(p_g, g, ry) + + # Step 4: Backprop through ReLU layer + p_cy = (ry > 0) * p_ry + + # Step 5: Backprop through second convolutional layer + p_c2_W = Conv2d_backprop_W(p_cy, y, c2_W, c2_b) + p_c2_b = Conv2d_backprop_b(p_cy, y, c2_W, c2_b) + p_y = Conv2d_backprop_in(p_cy, y, c2_W, c2_b) + + # Step 6: Backprop through first MaxPool layer + p_rx = MaxPool_2by2_backprop(p_y, y, rx) + + # Step 7: Backprop through first ReLU layer + p_cx = (rx > 0) * p_rx + + # Step 8: Backprop through first convolutional layer + p_c1_W = Conv2d_backprop_W(p_cx, x, c1_W, c1_b) + p_c1_b = Conv2d_backprop_b(p_cx, x, c1_W, c1_b) + + return (p_c1_W, p_c1_b, p_c2_W, p_c2_b, p_f_W, p_f_b) + +# apply SGD to update theta by nabla_J and the learning rate epsilon +# return updated theta + +def update_theta(theta, nabla_J, epsilon): + # Unpack the current parameters and their corresponding gradients + c1_W, c1_b, c2_W, c2_b, f_W, f_b = theta + p_c1_W, p_c1_b, p_c2_W, p_c2_b, p_f_W, p_f_b = nabla_J + + # Update each parameter by subtracting the gradient times the learning rate + c1_W_updated = c1_W - epsilon * p_c1_W + c1_b_updated = c1_b - epsilon * p_c1_b + c2_W_updated = c2_W - epsilon * p_c2_W + c2_b_updated = c2_b - epsilon * p_c2_b + f_W_updated = f_W - epsilon * p_f_W + f_b_updated = f_b - epsilon * p_f_b + + # Return the updated parameters as a tuple + updated_theta = (c1_W_updated, c1_b_updated, c2_W_updated, c2_b_updated, f_W_updated, f_b_updated) + return updated_theta + +def sample_attempts(accuracies, num_samples=10): + total_attempts = len(accuracies) + if total_attempts <= num_samples: + return accuracies + + # Always include first and last attempts + sampled_indices = [0, total_attempts - 1] + + # Sample additional attempts, excluding the first and last + additional_samples = random.sample(range(1, total_attempts - 1), num_samples - 2) + sampled_indices.extend(additional_samples) + + # Sort the indices to maintain chronological order + sampled_indices.sort() + + # Get the sampled attempts + sampled_accuracies = [accuracies[i] for i in sampled_indices] + return sampled_accuracies + +# hyperparameters +# default params +#bound = 1 # initial weight range +#epsilon = 0.00001 # learning rate +#batch_size = 4 + +# start training +def start_training(mnist_train, bound, epsilon, batch_size): + + print(f"bound: {bound}, epsilon: {epsilon}, batch_size: {batch_size}") + validation_images = mnist_train["images"][:1000] + validation_labels = mnist_train["labels"][:1000] + training_images = mnist_train["images"][1000:21000] + training_labels = 
mnist_train["labels"][1000:21000] + #training_images = mnist_train["images"][1000:11000] + #training_labels = mnist_train["labels"][1000:11000] + + + start = time.time() + theta = initialize_theta(bound) + batches = training_images.shape[0]//batch_size + + accuracies = [] + + for epoch in range(5): + indices = np.arange(training_images.shape[0]) + np.random.shuffle(indices) + for i in range(batches): + batch_images = training_images[indices[i*batch_size:(i+1)*batch_size]] + batch_labels = training_labels[indices[i*batch_size:(i+1)*batch_size]] + + z, h, g, ry, cy, y, rx, cx, x = forward(batch_images, theta) + nabla_J = backprop(batch_labels, theta, z, h, g, ry, cy, y, rx, cx, x) + theta = update_theta(theta, nabla_J, epsilon) + + # check accuracy using validation examples + z, _, _, _, _, _, _, _, _ = forward(validation_images, theta) + pred_labels = z.argmax(axis = 1) + count = sum(pred_labels == validation_labels) + accuracy = count/validation_images.shape[0] + accuracies.append(accuracy) + print("epoch %d, accuracy %.3f, time %.2f" % ( + epoch, accuracy, time.time()-start)) + + + # save the weights to be submitted + save_theta(theta) + return accuracies + +def plot_accuracies(accuracies): + plt.figure(figsize=(12, 6)) + + for attempt, acc_list in accuracies: + plt.plot(acc_list, label=f'Attempt {attempt+1}') + + plt.title('Model Accuracy over Epochs for Sampled Attempts') + plt.xlabel('Epoch') + plt.ylabel('Accuracy') + plt.legend() + plt.grid(True) + plt.show() + +def main(): + # list of (attempt, accuracies_for_run) + accuracies = [] + + # ToDo: set numpy random seed to the last 8 digits of your CWID + np.random.seed(20497299) + + # load training data once + mnist_train = np.load("mnist_train.npz") + + max_accuracy = 0.93 # Target accuracy + max_attempts = 999999 # Maximum number of attempts to reach the target accuracy + + # initial starting values + # notes from experimentation: + + # 90 percent accuracy + # attempt: 36 + # increase bound/epsilon *= 1.01 + bound = 0.01 + epsilon = 0.0001 + batch_size = 1 + + # 90 percent accuracy if we start at initial starting values. on 36th attempt + # note: if we start at these below values we do not hit 90. 
only if we start at 0.01/0.0001 and *= by 1.01 + #bound = 0.014166027560312683 + #epsilon = 0.0001416602756031268 + #batch_size = 1 + + attempt = 0 + while attempt < max_attempts: + print(f"Attempt: {attempt} out of: {max_attempts}") + accuracies_for_run = start_training(mnist_train, bound, epsilon, batch_size) + accuracies.append((attempt, accuracies_for_run)) + + if any(i >= max_accuracy for i in accuracies_for_run): + print(f"Reached Max Accuracy: {max_accuracy}") + print(f"bound: {bound}") + print(f"epsilon: {epsilon}") + print(f"batch_size: {batch_size}") + sampled_accuracies = sample_attempts(accuracies) + plot_accuracies(sampled_accuracies) + break + + bound *= 1.0005 # Increase by 0.05% + epsilon *= 1.0005 # Increase by 0.05% + + #bound *= 1.01 # Increase by 1% + #epsilon *= 1.01 # Increase by 1% + + attempt += 1 + +if __name__ == "__main__": + main() diff --git a/src/eval_add.cpp b/src/eval_add.cpp new file mode 100755 index 0000000..ba1ca61 --- /dev/null +++ b/src/eval_add.cpp @@ -0,0 +1,17 @@ +#include "eval_add.h" + +tensor eval_add::compute(const tensor &a, const tensor &b) { + tensor c = a + b; + return c; +} + +void eval_add::eval(vars_type &variables, const kwargs_type &kwargs) { + assert(inputs_.size() == 2); + auto ita = variables.find(inputs_[0]); + auto itb = variables.find(inputs_[1]); + variables[expr_id_] = compute(ita->second,itb->second); +} + +std::shared_ptr eval_add::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_add.h b/src/eval_add.h new file mode 100755 index 0000000..0657512 --- /dev/null +++ b/src/eval_add.h @@ -0,0 +1,21 @@ +#ifndef EVAL_ADD_H +#define EVAL_ADD_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_add: public eval_op { + tensor compute(const tensor &a, const tensor &b); + std::shared_ptr clone(const expression &expr) override; +public: + eval_add() {} + eval_add(const expression &expr): eval_op(expr) {} + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("Add") == proto_map.end()); + proto_map["Add"] = std::make_shared(); // where is expr? 
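+        // (no expression yet: this entry is only a default-constructed prototype;
+        //  the real evaluator is presumably created later, when clone(expr) is
+        //  called for a concrete expression while the program is being built.)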
+ } +}; // class eval_add + +#endif \ No newline at end of file diff --git a/src/eval_const.cpp b/src/eval_const.cpp new file mode 100755 index 0000000..efedd40 --- /dev/null +++ b/src/eval_const.cpp @@ -0,0 +1,17 @@ +#include "eval_const.h" + +eval_const::eval_const(const expression &expr): eval_op(expr) { + auto params = expr.get_op_param_tensor("value"); + value_ = params; + // value_.print(); // uncomment for viewing logging details of tensor object + +} + +void eval_const::eval(vars_type &variables, const kwargs_type &kwargs) { + variables[expr_id_] = value_; + // value_.print(); // uncomment for viewing logging details of tensor object +} + +std::shared_ptr eval_const::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_const.h b/src/eval_const.h new file mode 100755 index 0000000..21e5039 --- /dev/null +++ b/src/eval_const.h @@ -0,0 +1,21 @@ +#ifndef EVAL_CONST_H +#define EVAL_CONST_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_const: public eval_op { + tensor value_; + std::shared_ptr clone(const expression &expr) override; +public: + eval_const() {} + eval_const(const expression &expr); + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("Const") == proto_map.end()); + proto_map["Const"] = std::make_shared(); // where is expr? + } +}; // class eval_const + +#endif \ No newline at end of file diff --git a/src/eval_conv2d.cpp b/src/eval_conv2d.cpp new file mode 100755 index 0000000..42b972a --- /dev/null +++ b/src/eval_conv2d.cpp @@ -0,0 +1,36 @@ +#include "eval_conv2d.h" + +eval_conv2d::eval_conv2d(const expression &expr): eval_op(expr), + in_channels(expr.get_op_param_int("in_channels")), + kernel_size(expr.get_op_param_int("kernel_size")), + out_channels(expr.get_op_param_int("out_channels")), + padding(expr.get_op_param_double("padding")), + weight(expr.get_op_param_tensor("weight")), + bias(expr.get_op_param_tensor("bias")), + stride(expr.get_op_param_int("stride")) + { + + } + +void eval_conv2d::eval(vars_type &variables, const kwargs_type &kwargs) { + if (inputs_.empty()) { + throw std::runtime_error("No input tensor for eval_conv2d."); + } + + // Get the input tensor from variables using the expression ID of the input + int inputExprId = inputs_[0]; + if (variables.find(inputExprId) == variables.end()) { + throw std::runtime_error("Input tensor not found in variables map."); + } + tensor input_tensor = variables.at(inputExprId); + + // Perform the Conv2D operation + tensor conv2d_tensor = Conv2D(input_tensor, weight, bias, in_channels, out_channels, kernel_size); + + // Store the result in the variables map using the current expression ID + variables[expr_id_] = conv2d_tensor; +} + +std::shared_ptr eval_conv2d::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_conv2d.h b/src/eval_conv2d.h new file mode 100755 index 0000000..df0685f --- /dev/null +++ b/src/eval_conv2d.h @@ -0,0 +1,38 @@ +#ifndef EVAL_CONV2D_H +#define EVAL_CONV2D_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_conv2d: public eval_op { + tensor value_; + std::shared_ptr clone(const expression &expr) override; +public: + eval_conv2d () {} + eval_conv2d(const expression &expr); + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + 
assert(proto_map.find("Conv2d") == proto_map.end()); + proto_map["Conv2d"] = std::make_shared(); // where is expr? + } +private: + int in_channels; + int kernel_size; + int out_channels; + double padding; + tensor weight; + tensor bias; + int stride; + /* + tensor weight; + tensor bias; + int kernel_size; + int stride; + int in_channels; + int out_channels; + double padding; + */ +}; // class eval_conv2d + +#endif \ No newline at end of file diff --git a/src/eval_flatten.cpp b/src/eval_flatten.cpp new file mode 100755 index 0000000..74a8076 --- /dev/null +++ b/src/eval_flatten.cpp @@ -0,0 +1,27 @@ +#include "eval_flatten.h" + +eval_flatten::eval_flatten(const expression &expr): eval_op(expr) { +} + +void eval_flatten::eval(vars_type &variables, const kwargs_type &kwargs) { + // Check that inputs_ is not empty + assert(!inputs_.empty()); + + // Retrieve the tensor from variables using the first input expression ID + const int inputExprId = inputs_[0]; + tensor t = variables.at(inputExprId); + + // For debugging: print the input tensor + std::cout << "Input Tensor for Flatten (expr_id " << inputExprId << "):" << std::endl; + //t.print(); + + // Perform the Flatten operation + tensor flattenResult = Flatten(t); + + // Store the result in variables under the key expr_id_ + variables[expr_id_] = flattenResult; +} + +std::shared_ptr eval_flatten::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_flatten.h b/src/eval_flatten.h new file mode 100755 index 0000000..d535a57 --- /dev/null +++ b/src/eval_flatten.h @@ -0,0 +1,21 @@ +#ifndef EVAL_FLATTEN_H +#define EVAL_FLATTEN_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_flatten: public eval_op { + tensor value_; + std::shared_ptr clone(const expression &expr) override; +public: + eval_flatten () {} + eval_flatten(const expression &expr); + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("Flatten") == proto_map.end()); + proto_map["Flatten"] = std::make_shared(); // where is expr? + } +}; // class eval_flatten + +#endif \ No newline at end of file diff --git a/src/eval_input.cpp b/src/eval_input.cpp new file mode 100755 index 0000000..bd4796f --- /dev/null +++ b/src/eval_input.cpp @@ -0,0 +1,13 @@ +#include "eval_input.h" + +eval_input::eval_input(const expression &expr): eval_op(expr) { +} + + +void eval_input::eval(vars_type &variables, const kwargs_type &kwargs) { + variables[expr_id_] = kwargs.at(op_name_); +} + +std::shared_ptr eval_input::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_input.h b/src/eval_input.h new file mode 100755 index 0000000..44cead6 --- /dev/null +++ b/src/eval_input.h @@ -0,0 +1,21 @@ +#ifndef EVAL_INPUT_H +#define EVAL_INPUT_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_input: public eval_op { + tensor value_; + std::shared_ptr clone(const expression &expr) override; +public: + eval_input () {} + eval_input(const expression &expr); + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("Input") == proto_map.end()); + proto_map["Input"] = std::make_shared(); // where is expr? 
+ } +}; // class eval_input + +#endif \ No newline at end of file diff --git a/src/eval_input2d.cpp b/src/eval_input2d.cpp new file mode 100755 index 0000000..43b3697 --- /dev/null +++ b/src/eval_input2d.cpp @@ -0,0 +1,23 @@ +#include "eval_input2d.h" + +eval_input2d::eval_input2d(const expression &expr): eval_op(expr), height(expr.get_op_param_int("height")), width(expr.get_op_param_int("width")) { +} + +void eval_input2d::eval(vars_type &variables, const kwargs_type &kwargs) { + // Retrieve the tensor directly from kwargs using op_name_ + tensor t = kwargs.at(op_name_); + + // For debugging: print the input tensor + std::cout << "Input Tensor for Input2d (op_name " << op_name_ << "):" << std::endl; + //t.print(); + + // Perform the Input2d operation + tensor input2dResult = Input2d(t); + + // Store the result in variables under the key expr_id_ + variables[expr_id_] = input2dResult; +} + +std::shared_ptr eval_input2d::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_input2d.h b/src/eval_input2d.h new file mode 100755 index 0000000..f509cb3 --- /dev/null +++ b/src/eval_input2d.h @@ -0,0 +1,24 @@ +#ifndef EVAL_INPUT2D_H +#define EVAL_INPUT2D_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_input2d: public eval_op { + tensor value_; + std::shared_ptr clone(const expression &expr) override; +public: + eval_input2d () {} + eval_input2d(const expression &expr); + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("Input2d") == proto_map.end()); + proto_map["Input2d"] = std::make_shared(); // where is expr? + } +private: + int height; + int width; +}; // class eval_input2d + +#endif \ No newline at end of file diff --git a/src/eval_linear.cpp b/src/eval_linear.cpp new file mode 100755 index 0000000..452233d --- /dev/null +++ b/src/eval_linear.cpp @@ -0,0 +1,27 @@ +#include "eval_linear.h" + +eval_linear::eval_linear(const expression &expr): eval_op(expr),weight(expr.get_op_param_tensor("weight")),bias(expr.get_op_param_tensor("bias")) { +} + +void eval_linear::eval(vars_type &variables, const kwargs_type &kwargs) { + if (inputs_.empty()) { + throw std::runtime_error("No input tensor for eval_linear."); + } + + int inputExprId = inputs_[0]; + if (variables.find(inputExprId) == variables.end()) { + throw std::runtime_error("Input tensor not found in variables map."); + } + + tensor x = variables.at(inputExprId); + + // Now you have x, weight, and bias available to perform the Linear operation + tensor linearResult = Linear(weight, x, bias); + + // Process the result... 
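    // Shape convention used by Linear() in tensor.cpp: x is (N, in_features), weight is
    // (out_features, in_features) and bias is (out_features,), so the result stored below
    // is (N, out_features), i.e. y = x * W^T + b broadcast across the N rows.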
+ variables[expr_id_] = linearResult; +} + +std::shared_ptr eval_linear::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_linear.h b/src/eval_linear.h new file mode 100755 index 0000000..c80278d --- /dev/null +++ b/src/eval_linear.h @@ -0,0 +1,24 @@ +#ifndef EVAL_LINEAR_H +#define EVAL_LINEAR_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_linear: public eval_op { + tensor value_; + std::shared_ptr clone(const expression &expr) override; +public: + eval_linear () {} + eval_linear(const expression &expr); + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("Linear") == proto_map.end()); + proto_map["Linear"] = std::make_shared(); // where is expr? + } +private: + tensor weight; // Add weight tensor + tensor bias; // Add bias tensor +}; // class eval_linear + +#endif \ No newline at end of file diff --git a/src/eval_maxpool2d.cpp b/src/eval_maxpool2d.cpp new file mode 100755 index 0000000..6a21cd3 --- /dev/null +++ b/src/eval_maxpool2d.cpp @@ -0,0 +1,32 @@ +#include "eval_maxpool2d.h" + +eval_maxpool2d::eval_maxpool2d(const expression &expr): eval_op(expr), kernel_size(expr.get_op_param_int("kernel_size")), stride(expr.get_op_param_int("stride")){ + // Retrieve kernel size and stride from the expression + //int kernel_size = expr.get_ker_and_str_value("kernel_size"); + //int stride = expr.get_ker_and_str_value("stride"); +} + +void eval_maxpool2d::eval(vars_type &variables, const kwargs_type &kwargs) { + // Ensure that there is at least one input (the input tensor) + if (inputs_.empty()) { + throw std::runtime_error("No input tensor for eval_maxpool2d."); + } + + int inputExprId = inputs_[0]; + if (variables.find(inputExprId) == variables.end()) { + throw std::runtime_error("Input tensor not found in variables map."); + } + + // Get the input tensor + tensor input_tensor = variables.at(inputExprId); + + // Perform the max pooling operation + tensor pooled_tensor = MaxPool2d(input_tensor, kernel_size, stride); + + // Store the result in the variables map using the expression ID + variables[expr_id_] = pooled_tensor; +} + +std::shared_ptr eval_maxpool2d::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_maxpool2d.h b/src/eval_maxpool2d.h new file mode 100755 index 0000000..f01eb4e --- /dev/null +++ b/src/eval_maxpool2d.h @@ -0,0 +1,24 @@ +#ifndef EVAL_MAXPOOL2D_H +#define EVAL_MAXPOOL2D_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_maxpool2d: public eval_op { + tensor value_; + std::shared_ptr clone(const expression &expr) override; +public: + eval_maxpool2d () {} + eval_maxpool2d(const expression &expr); + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("MaxPool2d") == proto_map.end()); + proto_map["MaxPool2d"] = std::make_shared(); // where is expr? 
+ } +private: + int kernel_size; + int stride; +}; // class eval_linear + +#endif \ No newline at end of file diff --git a/src/eval_mul.cpp b/src/eval_mul.cpp new file mode 100755 index 0000000..f7b071f --- /dev/null +++ b/src/eval_mul.cpp @@ -0,0 +1,20 @@ +#include "eval_mul.h" + +tensor eval_mul::compute(const tensor &a, const tensor &b) { + // these use our tensor operator* overload + tensor c = a * b; + return c; +} + +void eval_mul::eval(vars_type &variables, const kwargs_type &kwargs) { + assert(inputs_.size() == 2); + auto ita = variables.find(inputs_[0]); + auto itb = variables.find(inputs_[1]); + // handle errors for ita and itb + // we handle these in the operator* override instead of here + variables[expr_id_] = compute(ita->second,itb->second); +} + +std::shared_ptr eval_mul::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_mul.h b/src/eval_mul.h new file mode 100755 index 0000000..ced14bb --- /dev/null +++ b/src/eval_mul.h @@ -0,0 +1,22 @@ +#ifndef EVAL_MUL_H +#define EVAL_MUL_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_mul: public eval_op { + tensor compute(const tensor &a, const tensor &b); + std::shared_ptr clone(const expression &expr) override; +public: + eval_mul() {} + eval_mul(const expression &expr): eval_op(expr) {} + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("Mul") == proto_map.end()); + proto_map["Mul"] = std::make_shared(); // where is expr? + } + +}; // class eval_mul + +#endif \ No newline at end of file diff --git a/src/eval_op.cpp b/src/eval_op.cpp new file mode 100755 index 0000000..27fbfec --- /dev/null +++ b/src/eval_op.cpp @@ -0,0 +1,20 @@ +#include "eval_op.h" + +eval_op::eval_op(const expression &expr) +{ + // set all of our member vars one time using expr getters + expr_id_ = expr.get_expr_id(); + op_name_ = expr.get_op_name(); + op_type_ = expr.get_op_type(); + inputs_ = expr.get_inputs(); +} + + +void eval_op::eval(vars_type & variables, const kwargs_type & kwargs) +{ + assert(false); // should be provided by derived classes +} + +eval_op::~eval_op() { + +} \ No newline at end of file diff --git a/src/eval_op.h b/src/eval_op.h new file mode 100755 index 0000000..b9f7c71 --- /dev/null +++ b/src/eval_op.h @@ -0,0 +1,27 @@ +#ifndef EVAL_OP_H +#define EVAL_OP_H + +#include +#include "expression.h" +#include "assert.h" +#include + +// typedef introduces alias for types. 
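// The template arguments appear to have been stripped from this listing; given how the maps
// are used (variables keyed by expression id, kwargs keyed by input name), the originals are
// presumably:
//   typedef std::map<int, tensor> vars_type;
//   typedef std::map<std::string, tensor> kwargs_type;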
+typedef std::map vars_type; +typedef std::map kwargs_type; + +class eval_op { +protected: + friend class evaluation; + eval_op(): expr_id_(-1) {} + eval_op(const expression &expr); + int expr_id_; + std::string op_name_, op_type_; + std::vector inputs_; +public: + virtual ~eval_op(); + virtual void eval(vars_type &variables, const kwargs_type &kwargs) = 0; + virtual std::shared_ptr clone(const expression &expr) = 0; +}; //class eval_op + +#endif \ No newline at end of file diff --git a/src/eval_op_prototypes.cpp b/src/eval_op_prototypes.cpp new file mode 100755 index 0000000..b9e8109 --- /dev/null +++ b/src/eval_op_prototypes.cpp @@ -0,0 +1,46 @@ +#include "eval_op_prototypes.h" + +eval_op_prototypes::eval_op_prototypes() { + eval_const::store_prototype(proto_map_); + eval_input::store_prototype(proto_map_); + eval_add::store_prototype(proto_map_); + eval_sub::store_prototype(proto_map_); + eval_mul::store_prototype(proto_map_); + eval_relu::store_prototype(proto_map_); + eval_flatten::store_prototype(proto_map_); + eval_input2d::store_prototype(proto_map_); + eval_linear::store_prototype(proto_map_); + eval_maxpool2d::store_prototype(proto_map_); + eval_conv2d::store_prototype(proto_map_); + //log_proto_types(); // used this when troubleshooting. uncomment to enable log +} + +void eval_op_prototypes::log_proto_types() { + std::cout << "eval_op_prototypes: registering prototypes begin:" << std::endl; + std::cout << "----------------------" << std::endl; + for (const auto& pair : proto_map_) { + std::cout << "eval_op_prototypes: registered prototype: " << pair.first << std::endl; + } + std::cout << "----------------------" << std::endl; + std::cout << "eval_op_prototypes: registering prototypes complete" << std::endl; + +} + +std::shared_ptr eval_op_prototypes::locate(std::string name) +{ + auto found = proto_map_.find(name); + + if (found != proto_map_.end()) { + return found->second; + } else { + std::string error_message = "eval_op_prototypes: prototype for operator '" + name + "' not found."; + std::cerr << error_message << std::endl; + throw std::runtime_error(error_message); + } +} + +eval_op_prototypes &eval_op_prototypes::instance() +{ + static eval_op_prototypes instance; // the only instance + return instance; +} \ No newline at end of file diff --git a/src/eval_op_prototypes.h b/src/eval_op_prototypes.h new file mode 100755 index 0000000..3d57d1d --- /dev/null +++ b/src/eval_op_prototypes.h @@ -0,0 +1,25 @@ +#include "eval_op.h" +#include "eval_input.h" +#include "eval_input2d.h" +#include "eval_add.h" +#include "eval_mul.h" +#include "eval_sub.h" +#include "eval_const.h" +#include "eval_flatten.h" +#include "eval_relu.h" +#include "eval_linear.h" +#include "eval_maxpool2d.h" +#include "eval_conv2d.h" + +typedef std::map> eval_op_proto_map; + +class eval_op_prototypes { + // prevent creation of additional instances + eval_op_prototypes(const eval_op_prototypes &)=delete; + eval_op_prototypes(); + eval_op_proto_map proto_map_; + void log_proto_types(); +public: + std::shared_ptr locate(std::string name); + static eval_op_prototypes &instance(); // access the only instance +}; // class eval_op_prototypes \ No newline at end of file diff --git a/src/eval_relu.cpp b/src/eval_relu.cpp new file mode 100755 index 0000000..4fb2cf2 --- /dev/null +++ b/src/eval_relu.cpp @@ -0,0 +1,27 @@ +#include "eval_relu.h" + +eval_relu::eval_relu(const expression &expr): eval_op(expr) { +} + +void eval_relu::eval(vars_type &variables, const kwargs_type &kwargs) { + // Check that inputs_ is not empty + 
assert(!inputs_.empty()); + + // Retrieve the tensor from variables using the first input expression ID + const int inputExprId = inputs_[0]; + tensor t = variables.at(inputExprId); + + // For debugging: print the input tensor + std::cout << "Input Tensor for ReLU (expr_id " << inputExprId << "):" << std::endl; + //t.print(); + + // Perform the ReLU operation + tensor reluResult = ReLU(t); + + // Store the result in variables under the key expr_id_ + variables[expr_id_] = reluResult; +} + +std::shared_ptr eval_relu::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_relu.h b/src/eval_relu.h new file mode 100755 index 0000000..bdacac5 --- /dev/null +++ b/src/eval_relu.h @@ -0,0 +1,21 @@ +#ifndef EVAL_RELU_H +#define EVAL_RELU_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_relu: public eval_op { + tensor value_; + std::shared_ptr clone(const expression &expr) override; +public: + eval_relu () {} + eval_relu(const expression &expr); + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("ReLU") == proto_map.end()); + proto_map["ReLU"] = std::make_shared(); // where is expr? + } +}; // class eval_relu + +#endif \ No newline at end of file diff --git a/src/eval_sub.cpp b/src/eval_sub.cpp new file mode 100755 index 0000000..f3aa903 --- /dev/null +++ b/src/eval_sub.cpp @@ -0,0 +1,18 @@ +#include "eval_sub.h" + +tensor eval_sub::compute(const tensor &a, const tensor &b) { + // these use our tensor operator- overload + tensor c = a - b; + return c; +} + +void eval_sub::eval(vars_type &variables, const kwargs_type &kwargs) { + assert(inputs_.size() == 2); + auto ita = variables.find(inputs_[0]); + auto itb = variables.find(inputs_[1]); + variables[expr_id_] = compute(ita->second,itb->second); +} + +std::shared_ptr eval_sub::clone(const expression &expr) { + return std::make_shared(expr); +} \ No newline at end of file diff --git a/src/eval_sub.h b/src/eval_sub.h new file mode 100755 index 0000000..3085693 --- /dev/null +++ b/src/eval_sub.h @@ -0,0 +1,22 @@ +#ifndef EVAL_SUB_H +#define EVAL_SUB_H + +#include "eval_op.h" + +typedef std::map> eval_op_proto_map; + +class eval_sub: public eval_op { + tensor compute(const tensor &a, const tensor &b); + std::shared_ptr clone(const expression &expr) override; +public: + eval_sub() {} + eval_sub(const expression &expr): eval_op(expr) {} + void eval(vars_type &variables, const kwargs_type &kwargs) override; + static void store_prototype(eval_op_proto_map &proto_map) { + assert(proto_map.find("Sub") == proto_map.end()); + proto_map["Sub"] = std::make_shared(); // where is expr? + } + +}; // class eval_sub + +#endif \ No newline at end of file diff --git a/src/evaluation.cpp b/src/evaluation.cpp new file mode 100755 index 0000000..66aa3da --- /dev/null +++ b/src/evaluation.cpp @@ -0,0 +1,60 @@ +#include "evaluation.h" +#include "eval_op_prototypes.h" + +evaluation::evaluation(const std::vector &exprs) { + try { + std::cout << "Starting evaluation construction with " << exprs.size() << " expressions." << std::endl; + for (auto &expr: exprs) { + std::cout << "Processing expression with ID: " << expr.get_expr_id() << std::endl; + std::shared_ptr p = eval_op_prototypes::instance().locate(expr.get_op_type()); + ops_.push_back(p->clone(expr)); + std::cout << "Expression with ID: " << expr.get_expr_id() << " processed successfully." 
<< std::endl; +} + std::cout << "Evaluation construction completed successfully." << std::endl; + } catch (const std::runtime_error& e) { + std::cerr << "Failed to construct evaluation: " << e.what() << std::endl; + } +} + +void evaluation::add_kwargs_double( + const char *key, + double value) +{ + tensor scalar(value); + kwargs_[key] = scalar; +} + +void evaluation::add_kwargs_ndarray( + const char *key, + int dim, + size_t shape[], + double data[]) +{ + tensor ndarray(dim, shape, data); + kwargs_[key] = ndarray; +} + +tensor &evaluation::get_result() +{ + if (!variables_.empty()) { + int last_expr_id = std::prev(variables_.end())->first; + std::cout << "Returning result from the last expression with ID: " << last_expr_id << std::endl; + return variables_[last_expr_id]; + } + throw std::runtime_error("No result available."); +} + +int evaluation::execute() +{ + std::cout << "Executing evaluation with " << ops_.size() << " operations.\n"; + variables_.clear(); + + for (auto &op : ops_) { + std::cout << "Evaluating operation of type: " << typeid(*op).name() << "\n"; + op->eval(variables_, kwargs_); + std::cout << "Operation evaluation complete. Variables size: " << variables_.size() << "\n"; + } + + std::cout << "Evaluation complete. Total variables evaluated: " << variables_.size() << "\n"; + return 0; +} \ No newline at end of file diff --git a/src/evaluation.h b/src/evaluation.h new file mode 100755 index 0000000..3bb0058 --- /dev/null +++ b/src/evaluation.h @@ -0,0 +1,33 @@ +#ifndef EVALUATION_H +#define EVALUATION_H + +#include "expression.h" +#include "eval_op.h" + +class evaluation +{ +public: + evaluation(const std::vector &exprs); + + void add_kwargs_double( + const char *key, + double value); + + void add_kwargs_ndarray( + const char *key, + int dim, + size_t shape[], + double data[]); + + // return 0 for success + int execute(); + + tensor &get_result(); + +private: + std::vector> ops_; // instead of exprs_ + std::map variables_; + std::map kwargs_; +}; // class evaluation + +#endif // EVALUATION_H diff --git a/src/expression.cpp b/src/expression.cpp new file mode 100755 index 0000000..07c5c05 --- /dev/null +++ b/src/expression.cpp @@ -0,0 +1,112 @@ +#include "expression.h" +#include "evaluation.h" + +expression::expression( + int expr_id, + const char *op_name, + const char *op_type, + int *inputs, + int num_inputs) + : expr_id_(expr_id), op_name_(op_name), op_type_(op_type) +{ + // initialize inputs_ with the given inputs from our expression. 
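    // The inputs array arrives from the C API as a raw int buffer of input expression ids;
    // the element-by-element copy below could equivalently be written as
    //   inputs_.assign(inputs, inputs + num_inputs);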
+ for (int i = 0; i < num_inputs; ++i) + { + inputs_.push_back(inputs[i]); + } + +} + +// getter methods +int expression::get_expr_id() const +{ + return expr_id_; +} + +const std::string& expression::get_op_name() const +{ + return op_name_; +} + +const std::string& expression::get_op_type() const +{ + return op_type_; +} + +const std::vector& expression::get_inputs() const +{ + return inputs_; +} + +tensor expression::get_op_param_tensor(const std::string& key) const { + auto it = op_params_tensor_.find(key); + if (it != op_params_tensor_.end()) { + return it->second; + } else { + throw std::out_of_range("No operation parameter tensor found for key: " + key); + } +} + +double expression::get_op_param_double(const std::string& key) const { + auto it = op_params_double_.find(key); + if (it != op_params_double_.end()) { + return it->second; + } else { + return 0.0; + } +} + +int expression::get_op_param_int(const std::string& key) const { + auto it = op_params_int_.find(key); + if (it != op_params_int_.end()) { + return it->second; + } else { + return 0; + } +} + +void expression::add_op_param_double( + const char *key, + double value) +{ + + std::cout << "Adding operation parameter (double) with key: " << key << std::endl; + + const char *w = "weight"; + const char *b = "bias"; + const char *v = "value"; // for backwards compatibility with p3 tests + const char *k = "kernel_size"; + const char *s = "stride"; + const char *i = "in_channels"; + const char *o = "out_channels"; + const char *p = "padding"; + + if((*key) == (*w)|| (*key) == (*b) || (*key) == (*v)){ // handle weight, bias, value tensor values + op_params_tensor_[key] = tensor(value); + } else if ((*key) == (*k) || (*key) == (*s) || (*key) == (*i) || (*key) == (*o)) { // handle kernel_size, stride, in_channels, out_channels int values + op_params_int_[key] = int(value); + } else if ((*key) == (*p)){ // handle padding double value + op_params_double_[key] = value; + }else{ + op_params_tensor_[key] = tensor(value); + } +} + +void expression::add_op_param_ndarray( + const char *key, + int dim, + size_t shape[], + double data[]) +{ + std::cout << "Adding operation parameter (ndarray) with key: " << key << std::endl; + + const char *w = "weight"; + const char *b = "bias"; + + if((*key) == (*w)|| (*key) == (*b) ){ + op_params_tensor_[key] = tensor(dim, shape, data); + }else{ + op_params_tensor_[key] = tensor(dim, shape, data); + } + +} diff --git a/src/expression.h b/src/expression.h new file mode 100755 index 0000000..0a69142 --- /dev/null +++ b/src/expression.h @@ -0,0 +1,51 @@ +#ifndef EXPRESSION_H +#define EXPRESSION_H + +#include +#include +#include +#include "tensor.h" + +class evaluation; + +class expression +{ + friend class evaluation; +public: + expression( + int expr_id, + const char *op_name, + const char *op_type, + int *inputs, + int num_inputs); + + // Getter methods for expression attributes + int get_expr_id() const; + const std::string& get_op_name() const; + const std::string& get_op_type() const; + const std::vector& get_inputs() const; + tensor get_op_param_tensor(const std::string& key) const; + double get_op_param_double(const std::string& key) const; + int get_op_param_int(const std::string& key) const; + + void add_op_param_double( + const char *key, + double value); + + void add_op_param_ndarray( + const char *key, + int dim, + size_t shape[], + double data[]); + +private: + int expr_id_; + std::string op_name_; + std::string op_type_; + std::vector inputs_; + std::map op_params_tensor_; + std::map 
op_params_double_; + std::map op_params_int_; +}; // class expression + +#endif // EXPRESSION_H diff --git a/src/libeasynn.cpp b/src/libeasynn.cpp new file mode 100755 index 0000000..f4c05df --- /dev/null +++ b/src/libeasynn.cpp @@ -0,0 +1,109 @@ +#include + +#include "libeasynn.h" +#include "program.h" +#include "evaluation.h" + +program *create_program() +{ + program *prog = new program; + printf("created_program: program %p\n", prog); + return prog; +} + +void append_expression( + program *prog, + int expr_id, + const char *op_name, + const char *op_type, + int inputs[], + int num_inputs) +{ + printf("append_expression: program %p, expr_id %d, op_name %s, op_type %s, inputs %d (", + prog, expr_id, op_name, op_type, num_inputs); + for (int i = 0; i != num_inputs; ++i) + printf("%d,", inputs[i]); + printf(")\n"); + prog->append_expression(expr_id, op_name, op_type, inputs, num_inputs); +} + +int add_op_param_double( + program *prog, + const char *key, + double value) +{ + printf("program %p, key %s, value %f\n", + prog, key, value); + return prog->add_op_param_double(key, value); +} + +int add_op_param_ndarray( + program *prog, + const char *key, + int dim, + size_t shape[], + double data[]) +{ + printf("program %p, key %s, value %p dim %d (", + prog, key, data, dim); + for (int i = 0; i != dim; ++i) + printf("%zu,", shape[i]); + printf(")\n"); + return prog->add_op_param_ndarray(key, dim, shape, data); +} + +evaluation *build(program *prog) +{ + evaluation *eval = prog->build(); + printf("evaluation %p\n", eval); + return eval; +} + +void add_kwargs_double( + evaluation *eval, + const char *key, + double value) +{ + printf("evaluation %p, key %s, value %f\n", + eval, key, value); + eval->add_kwargs_double(key, value); +} + +void add_kwargs_ndarray( + evaluation *eval, + const char *key, + int dim, + size_t shape[], + double data[]) +{ + printf("evaluation %p, key %s, value %p dim %d (", + eval, key, data, dim); + for (int i = 0; i != dim; ++i) + printf("%zu,", shape[i]); + printf(")\n"); + eval->add_kwargs_ndarray(key, dim, shape, data); +} + +int execute( + evaluation *eval, + int *p_dim, + size_t **p_shape, + double **p_data) +{ + printf("evaluation %p, p_dim %p, p_shape %p, p_data %p\n", + eval, p_dim, p_shape, p_data); + int ret = eval->execute(); + if (ret != 0) { + printf("execution failed with return code %d\n", ret); + return ret; + } + tensor &res = eval->get_result(); + *p_dim = res.get_dim(); + *p_shape = res.get_shape_array(); + *p_data = res.get_data_array(); + //res.print(); // uncomment to view tensor data output + + printf("execution completed succesfully. 
result: dim=%d\n",*p_dim); + fflush(stdout); + return 0; +} diff --git a/src/libeasynn.h b/src/libeasynn.h new file mode 100755 index 0000000..7b31d5c --- /dev/null +++ b/src/libeasynn.h @@ -0,0 +1,63 @@ +#ifndef LIBEASYNN_H +#define LIBEASYNN_H + +class program; +class evaluation; + +// return program handle +extern "C" +program *create_program(); + +extern "C" +void append_expression( + program *prog, + int expr_id, + const char *op_name, + const char *op_type, + int inputs[], + int num_inputs); + +// return 0 for success +extern "C" +int add_op_param_double( + program *prog, + const char *key, + double value); + +// return 0 for success +extern "C" +int add_op_param_ndarray( + program *prog, + const char *key, + int dim, + size_t shape[], + double data[]); + +// return evaluation handle +extern "C" +evaluation *build( + program *prog); + +extern "C" +void add_kwargs_double( + evaluation *eval, + const char *key, + double value); + +extern "C" +void add_kwargs_ndarray( + evaluation *eval, + const char *key, + int dim, + size_t shape[], + double data[]); + +// return 0 for success +extern "C" +int execute( + evaluation *eval, + int *p_dim, + size_t **p_shape, + double **p_data); + +#endif // LIBEASYNN_H diff --git a/src/program.cpp b/src/program.cpp new file mode 100755 index 0000000..b5c9964 --- /dev/null +++ b/src/program.cpp @@ -0,0 +1,47 @@ +#include "program.h" +#include "evaluation.h" + +program::program() +{ +} + +void program::append_expression( + int expr_id, + const char *op_name, + const char *op_type, + int inputs[], + int num_inputs) +{ + // create an expression object with the provided parameters + expression expr(expr_id, op_name, op_type, inputs, num_inputs); + + // append the expression to the expressions vector + expressions_.push_back(expr); +} + +int program::add_op_param_double( + const char *key, + double value) +{ + // get end of expressions_ values and send to our add_op_param_double + expressions_.back().add_op_param_double(key, value); + return 0; +} + +int program::add_op_param_ndarray( + const char *key, + int dim, + size_t shape[], + double data[]) +{ + // get end of expressions_ values and send to our add_op_param_ndarray + expressions_.back().add_op_param_ndarray(key, dim, shape, data); + return 0; +} + + +evaluation *program::build() +{ + evaluation *eval = new evaluation(expressions_); + return eval; +} diff --git a/src/program.h b/src/program.h new file mode 100755 index 0000000..24e0ca0 --- /dev/null +++ b/src/program.h @@ -0,0 +1,38 @@ +#ifndef PROGRAM_H +#define PROGRAM_H + +#include "expression.h" + +class evaluation; + +class program +{ +public: + program(); + + void append_expression( + int expr_id, + const char *op_name, + const char *op_type, + int inputs[], + int num_inputs); + + // return 0 for success + int add_op_param_double( + const char *key, + double value); + + // return 0 for success + int add_op_param_ndarray( + const char *key, + int dim, + size_t shape[], + double data[]); + + evaluation *build(); + +private: + std::vector expressions_; // Store the expressions. 
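    // build() hands this accumulated list of expression objects to the evaluation
    // constructor, which clones one eval_op per expression in program order.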
+}; // class program + +#endif // PROGRAM_H diff --git a/src/tensor.cpp b/src/tensor.cpp new file mode 100755 index 0000000..af63e1e --- /dev/null +++ b/src/tensor.cpp @@ -0,0 +1,430 @@ +#include "tensor.h" +#include +#include +#include +#include +#include + +tensor::tensor() {} + +tensor::tensor(double v) { + shape_.push_back(1); + data_.push_back(v); +} + +tensor::tensor(int dim, std::initializer_list shape, std::initializer_list data) { + shape_.assign(shape.begin(), shape.end()); + data_.assign(data.begin(), data.end()); + dim_ = dim; +} + +size_t tensor::getTotalSize() const { + size_t totalSize = 1; + for (size_t dim : shape_) { + if (totalSize > std::numeric_limits::max() / dim) + throw std::overflow_error("Size overflow detected"); + totalSize *= dim; + } + return totalSize; +} + +tensor::tensor(int dim, size_t shape[], double data[]): + shape_(shape, shape+dim) { + dim_ = dim; + // calc N has shape[0]*shape[1]*...*shape[dim-1] + // calc N as the product of elements in the shape array + size_t N = 1; + for (int i=0; i < dim; ++i) { + N = shape[i] * N; + } + + data_.assign(data, data+N); +} + +int tensor::get_dim() const { + return shape_.size(); +} + +double tensor::item() const { + if (shape_.empty()) { + return data_[0]; + } else { + throw std::runtime_error("item() called on a non-scalar tensor."); + } +} + +double &tensor::item() { + if (shape_.empty()) { + return data_[0]; + } else { + throw std::runtime_error("item() called on a non-scalar tensor."); + } +} + +double tensor::at(size_t i) const { + assert(get_dim() == 1); + assert(i < shape_[0]); + return data_[i]; +} + +double tensor::at(size_t i, size_t j) const { + assert(get_dim() == 2); + if (shape_.size() != 2) + throw std::runtime_error("Tensor is not 2-dimensional"); + + assert((i < shape_[0]) && (j < shape_[1])); + return data_[i * shape_[1] + j]; +} + +double tensor::at(size_t i, size_t j, size_t k, size_t l) const { + assert(get_dim() == 4); // Ensure the tensor is 4D + assert(i < shape_[0] && j < shape_[1] && k < shape_[2] && l < shape_[3]); + + size_t index = i * shape_[1] * shape_[2] * shape_[3] + j * shape_[2] * shape_[3] + k * shape_[3] + l; + return data_[index]; +} + +double& tensor::at(size_t i, size_t j, size_t k, size_t l) { + assert(get_dim() == 4); // Ensure the tensor is 4D + assert(i < shape_[0] && j < shape_[1] && k < shape_[2] && l < shape_[3]); + + size_t index = i * shape_[1] * shape_[2] * shape_[3] + j * shape_[2] * shape_[3] + k * shape_[3] + l; + return data_[index]; +} + +size_t* tensor::get_shape_array() { + return shape_.empty()? nullptr: &shape_[0]; +} + +double* tensor::get_data_array() { + return &data_[0]; +} + +// This can be used for debugging across the entire program. +// prints the values of the tensor. Be careful though because +// it will clutter the screen output. 
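// Note: print() reports the dim_ member, which is only assigned in the (dim, shape, data)
// constructors; for tensors built via tensor() or tensor(double) the printed Dim value is
// whatever dim_ happens to hold, while get_dim() stays reliable because it derives from
// shape_.size().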
+void tensor::print() const { + std::cout << "Tensor - Dim: " << dim_ << ", Shape: {"; + for (const auto& s : shape_) std::cout << s << " "; + std::cout << "}, Data: {"; + for (const auto& d : data_) std::cout << d << " "; + std::cout << "}\n"; +} + +// helper template +template tensor helper(const tensor &lhs, const tensor &rhs) { + tensor result; + result.shape_ = lhs.shape_; + size_t totalSize = lhs.getTotalSize(); + result.data_.reserve(totalSize); + std::transform(lhs.data_.begin(), lhs.data_.begin() + totalSize, + rhs.data_.begin(), std::back_inserter(result.data_), T()); + return result; +} + +// operator overloads +tensor operator+(const tensor &lhs, const tensor &rhs) { + tensor tensor_a = tensor(lhs); + tensor tensor_b = tensor(rhs); + + size_t size_a = lhs.getTotalSize(); + std::vector result; + for (size_t i =0; i< size_a; ++i) { + double sum_array = tensor_a.get_data_array()[i] + tensor_b.get_data_array()[i]; + result.push_back(sum_array); + } + tensor c(tensor_a.get_dim(), tensor_a.get_shape_array(), &result[0]); + return c; +} + +tensor operator-(const tensor &lhs, const tensor &rhs) { + tensor tensor_a = tensor(lhs); + tensor tensor_b = tensor(rhs); + + size_t size_a = lhs.getTotalSize(); + std::vector result; + for (size_t i =0; i< size_a; ++i) { + double sum_array = tensor_a.get_data_array()[i] - tensor_b.get_data_array()[i]; + result.push_back(sum_array); + } + tensor c(tensor_a.get_dim(), tensor_a.get_shape_array(), &result[0]); + return c; +} + +tensor operator*(const tensor &lhs, const tensor &rhs) +{ + try { + + // scalar * scalar multiplication + if (lhs.getTotalSize() == 1 && rhs.getTotalSize() == 1) { + return tensor(lhs.data_[0] * rhs.data_[0]); + } + if (lhs.shape_.empty() || rhs.shape_.empty()) { + return tensor(); // for p3-q6, passing in None + } + + tensor result; + size_t lhsSize = lhs.getTotalSize(); + size_t rhsSize = rhs.getTotalSize(); + + // scalar * tensor multiplication + if (lhsSize == 1 || rhsSize == 1) { + double scalar = (lhsSize == 1 ? lhs : rhs).data_[0]; + const tensor &t = lhsSize == 1 ? 
rhs : lhs; + result.shape_ = t.shape_; + result.data_.reserve(lhsSize * rhsSize); + std::transform(t.data_.begin(), t.data_.end(), + std::back_inserter(result.data_), + [&scalar](double x) { return x * scalar; }); + } else { + // matrix multiplication + size_t I = lhs.shape_[0]; + size_t J = rhs.shape_[1]; + size_t K = lhs.shape_[1]; + result.shape_ = {I, J}; + size_t totalSize = result.getTotalSize(); + result.data_.resize(totalSize); + + for (size_t i = 0; i < I; ++i) { + for (size_t j = 0; j < J; ++j) { + for (size_t k = 0; k < K; ++k) { + result.data_[J * i + j] += + lhs.data_[K * i + k] * rhs.data_[J * k + j]; + } + } + } + } + return result; + } catch (const std::bad_alloc&) { + throw std::runtime_error("Memory allocation failed in tensor multiplication."); + } +} + +tensor ReLU(const tensor &x){ + tensor result; + size_t xSize = x.getTotalSize(); + result.shape_ = x.shape_; + for(size_t i = 0; i < xSize; i++ ){ + result.data_.push_back(x.data_[i]*(x.data_[i]>0)); + } + return result; + } + +tensor Flatten(const tensor &x){ + tensor result; + size_t xSize = x.getTotalSize(); + size_t shape_1 = xSize/x.shape_[0]; + std::vector shape_; + shape_.push_back(x.shape_[0]); + shape_.push_back(shape_1); + result.shape_ = shape_; + result.data_ = x.data_; + return result; + } + +tensor Input2d(const tensor &x){ + tensor result; + std::vector shape_; + shape_.push_back(x.shape_[0]); + shape_.push_back(x.shape_[3]); + shape_.push_back(x.shape_[1]); + shape_.push_back(x.shape_[2]); + result.shape_ = shape_; + for(size_t i = 0; i < x.shape_[0]; i++ ){ + for(size_t j =0; j < x.shape_[3];j++){ + for(size_t k = 0; k shape_0; + shape_0.push_back(x.shape_[1]); + shape_0.push_back(w.shape_[0]); + size_t totalSize_w = w.getTotalSize(); + result_wT.data_.reserve(totalSize_w); + result_wT.shape_ = shape_0; + for(size_t i = 0; i < w.shape_[1]; i++){ + for(size_t j = 0; j < w.shape_[0]; j++){ + result_wT.data_.push_back(w.data_[i + j*w.shape_[1]]); + } + } + tensor result_xwT; + std::vector shape_xwT; + shape_xwT.push_back(x.shape_[0]); + shape_xwT.push_back(result_wT.shape_[1]); + size_t totalSize_xwT = x.shape_[0]*result_wT.shape_[1]; + result_xwT.data_.resize(totalSize_xwT); + result_xwT.shape_ = shape_xwT; + result_xwT = x * result_wT; + + tensor result_bBrodcast; + std::vector shape_1; + shape_1.push_back(b.shape_[0]); + shape_1.push_back(1*x.shape_[0]); + result_bBrodcast.shape_ = shape_1; + for(size_t i = 0; i < result_bBrodcast.shape_[0]; i++){ + for(size_t j = 0; j < result_bBrodcast.shape_[1]; j++){ + result_bBrodcast.data_.push_back(b.data_[i]); + } + } + + tensor result_bBroadcastT; + std::vector shape_2; + size_t totalSize_b = result_bBrodcast.getTotalSize(); + result_bBrodcast.data_.resize(totalSize_b); + shape_2.push_back(result_bBrodcast.shape_[1]); + shape_2.push_back(result_bBrodcast.shape_[0]); + result_bBroadcastT.shape_ = shape_2; + for(size_t i = 0; i < result_bBrodcast.shape_[1]; i++){ + for(size_t j = 0; j < result_bBrodcast.shape_[0]; j++){ + result_bBroadcastT.data_.push_back(result_bBrodcast.data_[i + j*result_bBrodcast.shape_[1]]); + } + } + tensor result; + result.data_.resize(result_xwT.shape_[0]*result_xwT.shape_[1]); + result = result_xwT + result_bBroadcastT; + return result; +} + + +std::vector maxpooling(std::vector > map, int k_w, int k_h, int s_w, int s_h) +{ + int row = map.size(); int col = map[0].size(); + int out_row = 1+(row-k_h)/s_h, out_col = 1+(col-k_w)/s_w; + int mod_row = row%s_h, mod_col = col%s_w; + if (mod_row != 0) out_row++; + if (mod_col != 0) out_col++; + 
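    // out_row/out_col count the pooling windows along each axis: 1 + (size - kernel)/stride
    // full windows, plus one extra (partial) window whenever the size is not an exact
    // multiple of the stride.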
+ std::vector > res(out_row, std::vector(out_col, 0)); + std::vector > pad_map(map); + + for (int i=0;i temp; + for(int ii=0;ii res_; + for (auto l : res) + { + for (auto ll : l) + { + res_.push_back(ll); + } + } + return res_; +} + +tensor MaxPool2d(const tensor &x, const int kernel_size, const int stride){ + std::cout <<"======1================="< shape_; + shape_.push_back(x.shape_[0]); + shape_.push_back(x.shape_[1]); + shape_.push_back(floor(x.shape_[2]/kernel_size)); + shape_.push_back(floor(x.shape_[3]/kernel_size)); + result.shape_ = shape_; + for(size_t i = 0; i < x.shape_[0]; i++ ){ + for(size_t j =0; j < x.shape_[1];j++){ + std::vector > tmp; + for(size_t k =0; k < x.shape_[2];k++){ + std::vector tmp_; + for(size_t l =0; l < x.shape_[3];l++){ + double int_data = x.data_[l + k*x.shape_[3] + j*x.shape_[3]*x.shape_[2] + i*x.shape_[3]*x.shape_[2]*x.shape_[1] ]; + tmp_.push_back(int_data); + } + tmp.push_back(tmp_); + } + auto res = maxpooling(tmp,kernel_size,kernel_size,stride,stride); + for(size_t i = 0; i< res.size(); i++){ + result.data_.push_back(res[i]); + } + } + } + return result; + } + +tensor Conv2D(const tensor &input_tensor, + const tensor &kernel_tensor, + const tensor &bias_tensor, + int in_channels, + int out_channels, + int kernel_size) + +{ + std::cout << "In Channels: " << in_channels << ", Out Channels: " << out_channels << std::endl; + std::cout << "Kernel Size: " << kernel_size << std::endl; + + // Calculate output dimensions + int out_height = input_tensor.shape_[2] - kernel_size + 1; + int out_width = input_tensor.shape_[3] - kernel_size + 1; + + // Ensure out_height and out_width are non-negative + out_height = std::max(out_height, 0); + out_width = std::max(out_width, 0); + + // Initialize shape array for output tensor + size_t output_shape[] = {input_tensor.shape_[0], static_cast(out_channels), static_cast(out_height), static_cast(out_width)}; + + // Initialize data array for output tensor (fill with zeros) + std::vector output_data(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3], 0); + + // Create output tensor + tensor output_tensor(4, output_shape, output_data.data()); + + // Convolution operation + for (size_t n = 0; n < input_tensor.shape_[0]; ++n) { // for each batch + for (int m = 0; m < out_channels; ++m) { // for each filter + for (int y = 0; y < out_height; ++y) { + for (int x = 0; x < out_width; ++x) { + double sum = 0.0; + for (int c = 0; c < in_channels; ++c) { // for each channel + for (int ky = 0; ky < kernel_size; ++ky) { + for (int kx = 0; kx < kernel_size; ++kx) { + size_t ix = x + kx; + size_t iy = y + ky; + if (ix < input_tensor.shape_[3] && iy < input_tensor.shape_[2]) { + sum += input_tensor.at(n, c, iy, ix) * kernel_tensor.at(m, c, ky, kx); + } + } + } + } + sum += bias_tensor.at(m); // add bias + output_tensor.at(n, m, y, x) = sum; + } + } + } + } + + return output_tensor; +} \ No newline at end of file diff --git a/src/tensor.h b/src/tensor.h new file mode 100755 index 0000000..6c604b4 --- /dev/null +++ b/src/tensor.h @@ -0,0 +1,59 @@ +#ifndef TENSOR_H +#define TENSOR_H + +#include +#include +#include + +class tensor { +public: + tensor(); // scalar 0 + explicit tensor(double v); // scalar v + tensor(int dim, size_t shape[], double data[]); // From C + tensor(int dim, std::initializer_list shape, std::initializer_list data); // Conv2d + + int get_dim() const; + + // scalar only + double item() const; + double &item(); + + // more accessors + double at(size_t i) const; + double at(size_t i, size_t j) const; + 
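    // All accessors index the flat row-major data_ buffer:
    //   2-D: data_[i * shape_[1] + j]
    //   4-D: data_[((i * shape_[1] + j) * shape_[2] + k) * shape_[3] + l]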
double at(size_t i, size_t j, size_t k, size_t l) const; + double& at(size_t i, size_t j, size_t k, size_t l); + + // passing tensor back to C code + size_t *get_shape_array(); + double *get_data_array(); + + // easy way to print out the tensor for logging + void print() const; + + size_t getTotalSize() const; + + // friend is better than member, because it allows + // double + tensor (implicit conversion) + // operator overloads + friend tensor operator+(const tensor &lhs, const tensor &rhs); + friend tensor operator-(const tensor &lhs, const tensor &rhs); + friend tensor operator*(const tensor &lhs, const tensor &rhs); + + template <typename T> + friend tensor helper(const tensor &lhs, const tensor &rhs); + + friend tensor ReLU(const tensor &x); + friend tensor Flatten(const tensor &x); + friend tensor Input2d(const tensor &x); + friend tensor Linear(const tensor &w, const tensor &x, const tensor &b); + friend tensor MaxPool2d(const tensor &x, const int kernel_size, const int stride); + friend tensor Conv2D(const tensor &input_tensor, const tensor &kernel_tensor, const tensor &bias_tensor, int in_channels, int out_channels, int kernel_size); + +private: + int dim_; + std::vector<size_t> shape_; + std::vector<double> data_; +}; // class tensor + +#endif // TENSOR_H \ No newline at end of file
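Taken together, the files above expose a small C API (libeasynn.h) around program and evaluation. A minimal driver sketch, using only the declarations shown in libeasynn.h; the a + b program itself is an illustrative assumption, not part of this commit:

#include <cstddef>
#include <cstdio>
#include "libeasynn.h"

int main() {
    program *prog = create_program();

    // t0 = Input("a"), t1 = Input("b"), t2 = Add(t0, t1)
    append_expression(prog, 0, "a", "Input", nullptr, 0);
    append_expression(prog, 1, "b", "Input", nullptr, 0);
    int add_inputs[2] = {0, 1};
    append_expression(prog, 2, "", "Add", add_inputs, 2);

    evaluation *eval = build(prog);
    add_kwargs_double(eval, "a", 2.0);
    add_kwargs_double(eval, "b", 5.0);

    int dim;
    size_t *shape;
    double *data;
    if (execute(eval, &dim, &shape, &data) == 0)
        printf("result: %f (dim=%d)\n", data[0], dim);  // expect 7.000000
    return 0;
}

This mirrors how easynn_test is linked against libeasynn.so in the Makefile: the caller only ever sees opaque program/evaluation handles and plain C arrays, while expression construction, prototype lookup, and tensor storage stay inside the library.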