Automatic differentiation is based on the multivariate chain rule:
				\(f:\R^n \to \R\), \(g:\R^m \to \R^n\), \(h:\R^n \to \R^l\)
				\[
				d (h \circ g)(\bfx) = d h(g(\bfx)) dg(\bfx),
				\]
				\[
				\nabla (f \circ g)(\bfx)^T = \nabla f(g(\bfx))^T \, dg(\bfx).
				\]
				We have for \(i=1,\dots,m\)
				\begin{align}
				\frac{\partial (f \circ g)}{\partial x_i}(\bfx) =
				\nabla f(g(\bfx))^T
				\begin{bmatrix}
				\frac{\partial g_1}{\partial x_i} (\bfx) \\
				\vdots \\
				\frac{\partial g_n}{\partial x_i} (\bfx)
				\end{bmatrix}
				= \sum_{j=1}^n \frac{\partial f}{\partial x_j} (g(\bfx)) \frac{\partial g_j}{\partial x_i} (\bfx).
				\end{align}
				So we make a forward pass and calculate \(f(g(\bfx))\). If we know the gradient of \(f\) at \(g(\bfx)\), we can backpropagate it to \(\bfx\) via the formula above.
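				As a concrete check, take \(m = n = 2\), \(g(\bfx) = (x_1 x_2,\, x_1 + x_2)\) and \(f(y_1, y_2) = y_1 y_2\), so that \((f \circ g)(\bfx) = x_1^2 x_2 + x_1 x_2^2\). The formula gives
				\[
				\frac{\partial (f \circ g)}{\partial x_1}(\bfx)
				= \frac{\partial f}{\partial y_1}(g(\bfx)) \frac{\partial g_1}{\partial x_1}(\bfx) + \frac{\partial f}{\partial y_2}(g(\bfx)) \frac{\partial g_2}{\partial x_1}(\bfx)
				= (x_1 + x_2)\, x_2 + x_1 x_2 \cdot 1
				= 2 x_1 x_2 + x_2^2,
				\]
				which agrees with differentiating \(x_1^2 x_2 + x_1 x_2^2\) directly.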
			
				Example. Matrix multiplication.
				\(\bfA \in \R^{m \times n}\), \(\bfB \in \R^{n \times l},\)
				\(f(\bfA \bfB) = f(\bfC) \in \R\).
				Let \(\nabla_\bfC f(\bfC) = \nabla\) such that
				\[\frac{\partial f}{\partial c_{ij}}(\bfC) = \nabla_{ij}.\]
				By definition of matrix multiplication
				\[c_{ij} = \sum_{k=1}^n a_{ik}b_{kj}, \quad \frac{\partial c_{ij}}{\partial a_{ik}} = b_{kj}, \quad \frac{\partial c_{ij}}{\partial b_{kj}} = a_{ik}.\]
				From the chain rule it follows that
				\[\frac{\partial f}{\partial  a_{ik}}(\bfC) = \sum_{j=1}^l b_{kj}\nabla_{ij}, \quad \frac{\partial f}{\partial b_{kj}}(\bfC) = \sum_{i=1}^m a_{ik}\nabla_{ij}\]
				which can be summarised as
				\[\frac{df}{d\bfA} =  \nabla\bfB^T, \quad \frac{df}{d\bfB} = \bfA^T \nabla. \]
			
# Forward: C = A*B. Backward: accumulate the gradients derived above,
# df/dA = ∇Bᵀ and df/dB = Aᵀ∇, into the parents.
function *(self::DMat, other::DMat)
    res = DMat(self.s*other.s, prev=[self, other], op="*")
    res.backward = function bw(∇)
        self.∇ .+= ∇ * adjoint(other.s)    # df/dA = ∇Bᵀ
        other.∇ .+= adjoint(self.s) * ∇    # df/dB = Aᵀ∇
    end
    return demote(res)
end
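			As a quick sanity check of \(\frac{df}{d\bfA} = \nabla\bfB^T\) one can compare against finite differences on plain arrays. A small sketch, independent of the DMat type; taking \(f(\bfC)\) to be the sum of all entries is just for illustration:
# Numerical check of df/dA = ∇Bᵀ for f(C) = sum(C), so ∇ is a matrix of ones.
A, B = randn(3, 4), randn(4, 2)
∇ = ones(3, 2)
analytic = ∇ * B'                   # claimed gradient of f with respect to A
numeric = similar(A)
ε = 1e-6
for i in 1:3, j in 1:4
    Ap = copy(A); Ap[i, j] += ε
    numeric[i, j] = (sum(Ap * B) - sum(A * B)) / ε
end
maximum(abs.(numeric .- analytic))  # ≈ 0 up to finite-difference error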
			Example. Convolution. The forward pass is a direct loop over all output positions; the corresponding backward pass follows below.
function convolve_loop(W::AbstractArray, b::AbstractArray, stride::Tuple{Int,Int}, pad::Tuple{Int,Int}, A::AbstractArray)
	kx, ky, kd1, kd2 = size(W)   # kernel width, kernel height, input channels, output channels
	inx, iny, = size(A)
	m, n = size_after_conv((inx, iny), stride, pad, (kx, ky))
	output = Array{eltype(W), 3}(undef, m, n, kd2)
	@inbounds for i in 1:m, j in 1:n, k in 1:kd2
		# top-left corner of the receptive field in input coordinates
		x = 1+(i-1)*stride[1]-pad[1]
		y = 1+(j-1)*stride[2]-pad[2]
		acc = zero(eltype(W))
		for (i´,x´) in enumerate(x:x+kx-1), (j´,y´) in enumerate(y:y+ky-1), l in 1:kd1
			# reads outside the input correspond to zero padding
			a = zero(eltype(W))
			if (1 ≤ x´ && x´ ≤ inx) && (1 ≤ y´ && y´ ≤ iny)
				a = A[x´,y´,l]
			end
			acc += W[i´,j´,l,k] * a
		end
		output[i,j,k] = b[k] + acc
	end
	return output
end
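			The helper size_after_conv computes the output size; its definition is not shown here. A minimal sketch of what it could look like, assuming standard convolution arithmetic (the original definition may differ):
# Assumed helper: out = floor((in + 2*pad - kernel) / stride) + 1 per spatial dimension.
function size_after_conv(insize::Tuple{Int,Int}, stride::Tuple{Int,Int}, pad::Tuple{Int,Int}, ksize::Tuple{Int,Int})
	m = div(insize[1] + 2*pad[1] - ksize[1], stride[1]) + 1
	n = div(insize[2] + 2*pad[2] - ksize[2], stride[2]) + 1
	return (m, n)
end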
			
function ∇convolve_loop(W::AbstractArray, b::AbstractArray, stride::Tuple{Int,Int}, pad::Tuple{Int,Int}, A::AbstractArray, ∇::AbstractArray)
	∇W = zeros(eltype(W), size(W)); ∇b = zeros(eltype(b), size(b)); ∇A = zeros(eltype(A), size(A))
	kx, ky, kd1, kd2 = size(W)
	inx, iny, = size(A)
	m, n = size_after_conv((inx, iny), stride, pad, (kx, ky))
	@assert (m, n, kd2) == size(∇)
	# mirror the forward loop: each output element distributes its upstream gradient
	# to the weights and input elements that produced it
	@inbounds for i in 1:m, j in 1:n, k in 1:kd2
		x = 1+(i-1)*stride[1]-pad[1]
		y = 1+(j-1)*stride[2]-pad[2]
		for (i´,x´) in enumerate(x:x+kx-1), (j´,y´) in enumerate(y:y+ky-1), l in 1:kd1
			if (1 ≤ x´ && x´ ≤ inx) && (1 ≤ y´ && y´ ≤ iny)
				∇W[i´,j´,l,k] += A[x´,y´,l] * ∇[i,j,k]   # ∂c/∂w = a
				∇A[x´,y´,l] += W[i´,j´,l,k] * ∇[i,j,k]   # ∂c/∂a = w
			end
		end
		∇b[k] += ∇[i,j,k]   # the bias enters additively, so it receives the raw upstream gradient
	end
	return ∇W, ∇b, ∇A
end
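A quick way to sanity-check the backward pass is a finite-difference comparison on a small random problem; the sizes below and the choice of summing the output as the loss are only for illustration:
# Spot check: compare ∇W[1,1,1,1] from ∇convolve_loop with a finite difference,
# using f(output) = sum(output) so the upstream gradient is all ones.
W, b, A = randn(3, 3, 2, 4), randn(4), randn(8, 8, 2)
stride, pad = (1, 1), (1, 1)
out = convolve_loop(W, b, stride, pad, A)
∇W, ∇b, ∇A = ∇convolve_loop(W, b, stride, pad, A, ones(size(out)))
ε = 1e-6
Wp = copy(W); Wp[1, 1, 1, 1] += ε
fd = (sum(convolve_loop(Wp, b, stride, pad, A)) - sum(out)) / ε
abs(fd - ∇W[1, 1, 1, 1])   # small, up to finite-difference error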
Implementation.
I used custom structs which keep track of the parent nodes that created them. By overloading operators we compute the forward pass and at the same time build up the computation graph. Since \(\text{id}\circ f = f\) and \(\nabla_f f = 1\), the gradient at the root node is known, and we propagate it back through the graph node by node.
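The snippets in this post use node types such as DVal, DMat and DTensor whose definitions are omitted. Judging from the fields they use (s, ∇, prev, op and backward), the scalar type could look roughly like this; an assumed sketch, not necessarily the original definition:
# Assumed shape of the scalar node type: s holds the value, ∇ the accumulated
# gradient, prev the parent nodes, op a label for debugging, and backward the
# closure that pushes a gradient on to the parents.
abstract type DType end

mutable struct DVal <: DType
    s::Float64
    ∇::Float64
    prev::Vector{DType}
    op::String
    backward::Function
end

DVal(s::Float64; prev=DType[], op="") = DVal(s, 0.0, prev, op, ∇ -> nothing)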
function backward(d::DVal)
    # topologically order the graph: every node is placed after all of its children
    topo = DType[]
    visited = Set{DType}()
    function build_topo(v)
        if !(v in visited)
            push!(visited, v)
            for child in v.prev
                build_topo(child)
            end
            push!(topo,v)
        end
    end
    build_topo(d)
    d.∇ = 1
    # go one variable at a time and apply the chain rule to get its gradient
    for v in reverse(topo)
        v.backward(v.∇)
    end
end
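Assuming DVal has operator overloads analogous to the DMat method above (they are not shown here), usage would look like this:
# Hypothetical usage, assuming * and + are overloaded for DVal:
a = DVal(2.0)
b = DVal(3.0)
c = a * b + a    # forward pass builds the graph, c.s == 8.0
backward(c)      # seeds c.∇ = 1 and walks the graph in reverse topological order
a.∇, b.∇         # (4.0, 2.0): ∂c/∂a = b + 1, ∂c/∂b = a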
With this method a wide range of operations can be differentiated, for example an arbitrary retrieval of elements. The gradient is backpropagated by simply adding it to the corresponding indices.
function getindex(self::DTensor, I...)
    res = DTensor(getindex(self.s, I...), prev=[self], op="[$I]")
    res.backward = function bw(∇)
        # accumulate the upstream gradient into the selected entries of self.∇
        S = getindex(self.∇, I...)
        setindex!(self.∇, S .+ ∇, I...)
    end
    return res
end
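For instance, assuming the DTensor constructor initialises ∇ to zeros of the same shape (an assumption on my part):
# Hypothetical usage of the overloaded getindex and its backward closure.
t = DTensor(collect(reshape(1.0:9.0, 3, 3)))
s = t[1:2, 1]          # forward: picks two entries, records t as the parent
s.backward(ones(2))    # backward: adds ones into t.∇ at the same indices
t.∇                    # zero everywhere except t.∇[1,1] == t.∇[2,1] == 1.0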
MNIST
With a lot of operations implemented I was able to create and differentiate the following deep neural network:
m = Model(
    # First convolution, operating upon a 28x28 image
    Conv((3, 3), 1=>16, relu),
    MaxPool((2,2)),
    # Second convolution, operating upon a 13x13 image
    Conv((3, 3), 16=>32, relu),
    MaxPool((2,2)),
    # Third convolution, operating upon a 5x5 image
    Conv((3, 3), 32=>32, relu),
    MaxPool((2,2)),
    # Flatten the (1, 1, 32, N) tensor into a (32, N) matrix
    flatten,
    Dense(32, 10)
)
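The spatial sizes in the comments follow from the usual arithmetic, assuming the convolutions use stride 1 without padding and the pooling layers halve each dimension (rounding down):
\[
28 - 3 + 1 = 26,\quad \lfloor 26/2 \rfloor = 13,\quad 13 - 3 + 1 = 11,\quad \lfloor 11/2 \rfloor = 5,\quad 5 - 3 + 1 = 3,\quad \lfloor 3/2 \rfloor = 1,
\]
so the tensor entering flatten has shape \((1, 1, 32, N)\).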
After 35 epochs with batch size 128 and the ADAM optimiser, an accuracy of 98.84% was reached. Below can be seen all correctly labelled images (left) and all wrongly labelled images (right).
				Code available here.