Automatic differentiation is based on the multivariate chain rule. Let \(f:\R^n \to \R\), \(g:\R^m \to \R^n\), and \(h:\R^n \to \R^l\). Then
\[
d (h \circ g)(\bfx) = d h(g(\bfx)) dg(\bfx),
\]
\[
\nabla (f \circ g)(\bfx)^T = \nabla f(g(\bfx))^T \, dg(\bfx).
\]
For \(i=1,\dots,m\) this gives
\begin{align}
\frac{\partial (f \circ g)}{\partial x_i}(\bfx) =
\nabla f(g(\bfx))^T
\begin{bmatrix}
\frac{\partial g_1}{\partial x_i} (\bfx) \\
\vdots \\
\frac{\partial g_n}{\partial x_i} (\bfx)
\end{bmatrix}
= \sum_{j=1}^n \frac{\partial f}{\partial x_j} (g(\bfx)) \frac{\partial g_j}{\partial x_i} (\bfx).
\end{align}
So we make a forward pass and calculate \(f(g(\bfx))\). If we know the gradient of \(f\) at \(g(\bfx)\), we can backpropagate it to \(\bfx\) via the formula above.
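As a small standalone check of this formula (the functions here are example choices, not from the implementation below), one can compute \(\nabla (f \circ g)(\bfx) = dg(\bfx)^T \nabla f(g(\bfx))\) directly:
g(x)  = [x[1]*x[2], x[1] + x[2]]
∇f(y) = [2*y[1], 1.0]                # gradient of f(y) = y₁² + y₂
dg(x) = [x[2] x[1]; 1.0 1.0]         # Jacobian of g
x = [3.0, 2.0]
dg(x)' * ∇f(g(x))                    # == [25.0, 37.0], matches ∂(f∘g)/∂xᵢ by hand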
Example. Matrix multiplication.
Let \(\bfA \in \R^{m \times n}\) and \(\bfB \in \R^{n \times l}\), and let \(f(\bfC) \in \R\) be a scalar function of the product \(\bfC = \bfA\bfB\).
Let \(\nabla_\bfC f(\bfC) = \nabla\) such that
\[\frac{\partial f}{\partial c_{ij}}(\bfC) = \nabla_{ij}.\]
By definition of matrix multiplication
\[c_{ij} = \sum_{k=1}^n a_{ik}b_{kj}, \quad \frac{\partial c_{ij}}{\partial a_{ik}} = b_{kj}, \quad \frac{\partial c_{ij}}{\partial b_{kj}} = a_{ik}.\]
From the chain rule it follows that
\[\frac{\partial f}{\partial a_{ik}}(\bfC) = \sum_{j=1}^l b_{kj}\nabla_{ij}, \quad \frac{\partial f}{\partial b_{kj}}(\bfC) = \sum_{i=1}^m a_{ik}\nabla_{ij}\]
which can be summarised as
\[\frac{df}{d\bfA} = \nabla\bfB^T, \quad \frac{df}{d\bfB} = \bfA^T \nabla. \]
function *(self::DMat, other::DMat)
    res = DMat(self.s * other.s, prev=[self, other], op="*")
    res.backward = function bw(∇)
        # df/dA = ∇Bᵀ and df/dB = Aᵀ∇, accumulated into the parents
        self.∇ .+= ∇ * adjoint(other.s)
        other.∇ .+= adjoint(self.s) * ∇
    end
    return demote(res)
end
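A quick standalone sanity check of these formulas with plain arrays (a sketch, independent of the DMat machinery):
using LinearAlgebra
A, B = randn(3, 4), randn(4, 2)
∇ = randn(3, 2)                           # pretend upstream gradient ∂f/∂C
f(C) = dot(∇, C)                          # for this f, ∂f/∂A = ∇Bᵀ exactly
ε = 1e-6; E = zeros(3, 4); E[2, 3] = ε
fd = (f((A .+ E) * B) - f(A * B)) / ε     # finite difference in a₂₃
isapprox(fd, (∇ * B')[2, 3]; atol=1e-4)   # true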
Example. Convolution code.
function convolve_loop(W::AbstractArray, b::AbstractArray, stride::Tuple{Int,Int}, pad::Tuple{Int,Int}, A::AbstractArray)
    kx, ky, kd1, kd2 = size(W)
    inx, iny, = size(A)
    m, n = size_after_conv((inx, iny), stride, pad, (kx, ky))
    output = Array{eltype(W), 3}(undef, m, n, kd2)
    @inbounds for i in 1:m, j in 1:n, k in 1:kd2
        # top-left corner of the receptive field in input coordinates
        x = 1 + (i-1)*stride[1] - pad[1]
        y = 1 + (j-1)*stride[2] - pad[2]
        acc = 0.0
        for (i´,x´) in enumerate(x:x+kx-1), (j´,y´) in enumerate(y:y+ky-1), l in 1:kd1
            # positions outside the input act as zero padding
            a = 0.0
            if (1 ≤ x´ ≤ inx) && (1 ≤ y´ ≤ iny)
                a = A[x´,y´,l]
            end
            acc += W[i´,j´,l,k] * a
        end
        output[i,j,k] = b[k] + acc
    end
    return output
end
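Both the forward and the backward loop rely on a helper size_after_conv, which is not shown here; a minimal sketch consistent with the indexing above would be the standard output-size formula:
# Assumed helper: out = floor((in + 2*pad - k) / stride) + 1 per dimension.
function size_after_conv(insize::Tuple{Int,Int}, stride::Tuple{Int,Int}, pad::Tuple{Int,Int}, k::Tuple{Int,Int})
    m = div(insize[1] + 2*pad[1] - k[1], stride[1]) + 1
    n = div(insize[2] + 2*pad[2] - k[2], stride[2]) + 1
    return (m, n)
end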
function ∇convolve_loop(W::AbstractArray, b::AbstractArray, stride::Tuple{Int,Int}, pad::Tuple{Int,Int}, A::AbstractArray, ∇::AbstractArray)
    ∇W = zeros(size(W)); ∇b = zeros(size(b)); ∇A = zeros(size(A))
    kx, ky, kd1, kd2 = size(W)
    inx, iny, = size(A)
    m, n = size_after_conv((inx, iny), stride, pad, (kx, ky))
    @assert (m, n, kd2) == size(∇)
    @inbounds for i in 1:m, j in 1:n, k in 1:kd2
        x = 1 + (i-1)*stride[1] - pad[1]
        y = 1 + (j-1)*stride[2] - pad[2]
        for (i´,x´) in enumerate(x:x+kx-1), (j´,y´) in enumerate(y:y+ky-1), l in 1:kd1
            # padded positions contribute nothing to the gradient
            if (1 ≤ x´ ≤ inx) && (1 ≤ y´ ≤ iny)
                ∇W[i´,j´,l,k] += A[x´,y´,l] * ∇[i,j,k]
                ∇A[x´,y´,l] += W[i´,j´,l,k] * ∇[i,j,k]
            end
        end
        ∇b[k] += ∇[i,j,k]
    end
    return ∇W, ∇b, ∇A
end
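To hook the convolution into the autodiff framework, one would wrap both loops in a differentiable op, mirroring the DMat multiplication above. A hypothetical sketch (this glue code is assumed, not taken from the implementation):
function convolve(W::DTensor, b::DTensor, stride, pad, A::DTensor)
    res = DTensor(convolve_loop(W.s, b.s, stride, pad, A.s), prev=[W, b, A], op="conv")
    res.backward = function bw(∇)
        ∇W, ∇b, ∇A = ∇convolve_loop(W.s, b.s, stride, pad, A.s, ∇)
        W.∇ .+= ∇W; b.∇ .+= ∇b; A.∇ .+= ∇A
    end
    return res
end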
Implementation.
I used custom structs which keep track of the parent nodes that created them. By overloading operators we compute the forward pass and simultaneously build the computation graph. Since \(\text{id}\circ f = f\) and \(\nabla_f f = 1\), the gradient at the root node is known, and we propagate it back layer by layer.
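A minimal sketch of such a struct, with field names taken from the snippets in this post (the actual definitions may differ):
abstract type DType end      # assumed common supertype of DVal, DMat, DTensor

mutable struct DVal <: DType
    s::Float64               # value from the forward pass
    ∇::Float64               # accumulated gradient, starts at zero
    prev::Vector{DType}      # parent nodes that created this one
    op::String               # label of the producing operation
    backward::Function       # closure propagating ∇ to the parents
end
DVal(s; prev=DType[], op="") = DVal(s, 0.0, prev, op, ∇ -> nothing)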
function backward(d::DVal)
    # topologically sort the nodes so each appears after all of its inputs
    topo = DType[]
    visited = Set{DType}()
    function build_topo(v)
        if !(v in visited)
            push!(visited, v)
            for child in v.prev
                build_topo(child)
            end
            push!(topo, v)
        end
    end
    build_topo(d)
    d.∇ = 1    # seed: the gradient of the output with respect to itself
    # go one variable at a time and apply the chain rule to get its gradient
    for v in reverse(topo)
        v.backward(v.∇)
    end
end
With this method a wide range of operations can be differentiated, for example an arbitrary retrieval of elements. The gradient is backpropagated by simply adding it at the correct indices.
function getindex(self::DTensor, I...)
    res = DTensor(getindex(self.s, I...), prev=[self], op="[$I]")
    res.backward = function bw(∇)
        # accumulate the incoming gradient at the selected indices
        S = getindex(self.∇, I...)
        setindex!(self.∇, S .+ ∇, I...)
    end
    return res
end
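For example, assuming a DTensor constructor that wraps a plain array and zero-initialises ∇, the gradient of a slice lands only on the selected entries:
t = DTensor([1.0 2.0; 3.0 4.0])
y = t[1, :]              # DTensor holding the first row
y.backward(ones(2))      # seed the slice's gradient by hand
t.∇                      # 1.0 at [1,1] and [1,2], zero elsewhere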
MNIST
With a lot of operations implemented, I was able to create and differentiate the following deep neural network:
m = Model(
    # First convolution, operating upon a 28x28 image
    Conv((3, 3), 1=>16, relu),
    MaxPool((2, 2)),
    # Second convolution, operating upon a 13x13 image
    Conv((3, 3), 16=>32, relu),
    MaxPool((2, 2)),
    # Third convolution, operating upon a 5x5 image
    Conv((3, 3), 32=>32, relu),
    MaxPool((2, 2)),
    # Reshape the 3d tensor into a 2d one using flatten; at this point it should be (1, 1, 32, N)
    flatten,
    Dense(32, 10)
)
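Forward pass, backward pass, and parameter update then fit together roughly as follows (a hypothetical sketch: batches, crossentropy, params, and adam_update! are assumed helper names, not shown in this post):
for (X, Y) in batches(train_X, train_Y, 128)
    out = m(DTensor(X))           # forward pass builds the graph
    loss = crossentropy(out, Y)   # scalar DVal at the root
    backward(loss)                # reverse pass fills every .∇ field
    adam_update!(params(m))       # step, then reset the gradients
end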
After 35 epochs with batch size 128 and the ADAM optimiser, an accuracy of 98.84% was reached. Below, all correctly labelled images (left) and all wrongly labelled images (right) can be seen.
Code available here.