Automatic differentiation is based on the multivariate chain rule. Let \(f:\R^n \to \R\), \(g:\R^m \to \R^n\), and \(h:\R^n \to \R^l\). Then
\[
d (h \circ g)(\bfx) = d h(g(\bfx)) dg(\bfx),
\]
\[
\nabla (f \circ g)(\bfx)^T = \nabla f(g(\bfx))^T \, dg(\bfx).
\]
For \(i=1,\dots,m\) this gives
\begin{align}
\frac{\partial (f \circ g)}{\partial x_i}(\bfx) =
\nabla f(g(\bfx))^T
\begin{bmatrix}
\frac{\partial g_1}{\partial x_i} (\bfx) \\
\vdots \\
\frac{\partial g_n}{\partial x_i} (\bfx)
\end{bmatrix}
= \sum_{j=1}^n \frac{\partial f}{\partial x_j} (g(\bfx)) \frac{\partial g_j}{\partial x_i} (\bfx).
\end{align}
So we make a forward pass and calculate \(f(g(\bfx))\). If we know the gradient of \(f\) at \(g(\bfx)\), we can backpropagate it to \(\bfx\) via the formula above.
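As a small standalone check of this formula (the functions here are example choices, not from the implementation below), one can compute \(\nabla (f \circ g)(\bfx) = dg(\bfx)^T \nabla f(g(\bfx))\) directly:
g(x)  = [x[1]*x[2], x[1] + x[2]]
∇f(y) = [2*y[1], 1.0]                # gradient of f(y) = y₁² + y₂
dg(x) = [x[2] x[1]; 1.0 1.0]         # Jacobian of g
x = [3.0, 2.0]
dg(x)' * ∇f(g(x))                    # == [25.0, 37.0], matches ∂(f∘g)/∂xᵢ by hand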
Example. Matrix multiplication.
Let \(\bfA \in \R^{m \times n}\) and \(\bfB \in \R^{n \times l}\), and let \(f(\bfC) \in \R\) be a scalar function of the product \(\bfC = \bfA\bfB\).
Let \(\nabla_\bfC f(\bfC) = \nabla\) such that
\[\frac{\partial f}{\partial c_{ij}}(\bfC) = \nabla_{ij}.\]
By definition of matrix multiplication
\[c_{ij} = \sum_{k=1}^n a_{ik}b_{kj}, \quad \frac{\partial c_{ij}}{\partial a_{ik}} = b_{kj}, \quad \frac{\partial c_{ij}}{\partial b_{kj}} = a_{ik}.\]
From the chain rule it follows that
\[\frac{\partial f}{\partial a_{ik}}(\bfC) = \sum_{j=1}^l b_{kj}\nabla_{ij}, \quad \frac{\partial f}{\partial b_{kj}}(\bfC) = \sum_{i=1}^m a_{ik}\nabla_{ij}\]
which can be summarised as
\[\frac{df}{d\bfA} = \nabla\bfB^T, \quad \frac{df}{d\bfB} = \bfA^T \nabla. \]
function *(self::DMat, other::DMat)
    res = DMat(self.s * other.s, prev=[self, other], op="*")
    res.backward = function bw(∇)
        # df/dA = ∇Bᵀ and df/dB = Aᵀ∇, accumulated into the parents
        self.∇ .+= ∇ * adjoint(other.s)
        other.∇ .+= adjoint(self.s) * ∇
    end
    return demote(res)
end
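A quick standalone sanity check of these formulas with plain arrays (a sketch, independent of the DMat machinery):
using LinearAlgebra
A, B = randn(3, 4), randn(4, 2)
∇ = randn(3, 2)                           # pretend upstream gradient ∂f/∂C
f(C) = dot(∇, C)                          # for this f, ∂f/∂A = ∇Bᵀ exactly
ε = 1e-6; E = zeros(3, 4); E[2, 3] = ε
fd = (f((A .+ E) * B) - f(A * B)) / ε     # finite difference in a₂₃
isapprox(fd, (∇ * B')[2, 3]; atol=1e-4)   # true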
Example. Convolution code.
function convolve_loop(W::AbstractArray, b::AbstractArray, stride::Tuple{Int,Int}, pad::Tuple{Int,Int}, A::AbstractArray)
    kx, ky, kd1, kd2 = size(W)
    inx, iny, = size(A)
    m, n = size_after_conv((inx, iny), stride, pad, (kx, ky))
    output = Array{eltype(W), 3}(undef, m, n, kd2)
    @inbounds for i in 1:m, j in 1:n, k in 1:kd2
        # top-left corner of the receptive field in input coordinates
        x = 1 + (i-1)*stride[1] - pad[1]
        y = 1 + (j-1)*stride[2] - pad[2]
        acc = 0.0
        for (i´,x´) in enumerate(x:x+kx-1), (j´,y´) in enumerate(y:y+ky-1), l in 1:kd1
            # positions outside the input act as zero padding
            a = 0.0
            if (1 ≤ x´ ≤ inx) && (1 ≤ y´ ≤ iny)
                a = A[x´,y´,l]
            end
            acc += W[i´,j´,l,k] * a
        end
        output[i,j,k] = b[k] + acc
    end
    return output
end
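Both the forward and the backward loop rely on a helper size_after_conv, which is not shown here; a minimal sketch consistent with the indexing above would be the standard output-size formula:
# Assumed helper: out = floor((in + 2*pad - k) / stride) + 1 per dimension.
function size_after_conv(insize::Tuple{Int,Int}, stride::Tuple{Int,Int}, pad::Tuple{Int,Int}, k::Tuple{Int,Int})
    m = div(insize[1] + 2*pad[1] - k[1], stride[1]) + 1
    n = div(insize[2] + 2*pad[2] - k[2], stride[2]) + 1
    return (m, n)
end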
function ∇convolve_loop(W::AbstractArray, b::AbstractArray, stride::Tuple{Int,Int}, pad::Tuple{Int,Int}, A::AbstractArray, ∇::AbstractArray)
    ∇W = zeros(size(W)); ∇b = zeros(size(b)); ∇A = zeros(size(A))
    kx, ky, kd1, kd2 = size(W)
    inx, iny, = size(A)
    m, n = size_after_conv((inx, iny), stride, pad, (kx, ky))
    @assert (m, n, kd2) == size(∇)
    @inbounds for i in 1:m, j in 1:n, k in 1:kd2
        x = 1 + (i-1)*stride[1] - pad[1]
        y = 1 + (j-1)*stride[2] - pad[2]
        for (i´,x´) in enumerate(x:x+kx-1), (j´,y´) in enumerate(y:y+ky-1), l in 1:kd1
            # padded positions contribute nothing to the gradient
            if (1 ≤ x´ ≤ inx) && (1 ≤ y´ ≤ iny)
                ∇W[i´,j´,l,k] += A[x´,y´,l] * ∇[i,j,k]
                ∇A[x´,y´,l] += W[i´,j´,l,k] * ∇[i,j,k]
            end
        end
        ∇b[k] += ∇[i,j,k]
    end
    return ∇W, ∇b, ∇A
end
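To hook the convolution into the autodiff framework, one would wrap both loops in a differentiable op, mirroring the DMat multiplication above. A hypothetical sketch (this glue code is assumed, not taken from the implementation):
function convolve(W::DTensor, b::DTensor, stride, pad, A::DTensor)
    res = DTensor(convolve_loop(W.s, b.s, stride, pad, A.s), prev=[W, b, A], op="conv")
    res.backward = function bw(∇)
        ∇W, ∇b, ∇A = ∇convolve_loop(W.s, b.s, stride, pad, A.s, ∇)
        W.∇ .+= ∇W; b.∇ .+= ∇b; A.∇ .+= ∇A
    end
    return res
end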
Implementation.
I used custom structs which keep track of the parent nodes that created them. By overloading operators we compute the forward pass and simultaneously build the computation graph. Since \(\text{id}\circ f = f\) and \(\nabla_f f = 1\), the gradient at the root node is known, and we propagate it back layer by layer.
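A minimal sketch of such a struct, with field names taken from the snippets in this post (the actual definitions may differ):
abstract type DType end      # assumed common supertype of DVal, DMat, DTensor

mutable struct DVal <: DType
    s::Float64               # value from the forward pass
    ∇::Float64               # accumulated gradient, starts at zero
    prev::Vector{DType}      # parent nodes that created this one
    op::String               # label of the producing operation
    backward::Function       # closure propagating ∇ to the parents
end
DVal(s; prev=DType[], op="") = DVal(s, 0.0, prev, op, ∇ -> nothing)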
function backward(d::DVal)
    # topologically sort the nodes so each appears after all of its inputs
    topo = DType[]
    visited = Set{DType}()
    function build_topo(v)
        if !(v in visited)
            push!(visited, v)
            for child in v.prev
                build_topo(child)
            end
            push!(topo, v)
        end
    end
    build_topo(d)
    d.∇ = 1    # seed: the gradient of the output with respect to itself
    # go one variable at a time and apply the chain rule to get its gradient
    for v in reverse(topo)
        v.backward(v.∇)
    end
end
With this method a wide range of operations can be differentiated, for example an arbitrary retrieval of elements. The gradient is backpropagated by simply adding it at the correct indices.
function getindex(self::DTensor, I...)
    res = DTensor(getindex(self.s, I...), prev=[self], op="[$I]")
    res.backward = function bw(∇)
        # accumulate the incoming gradient at the selected indices
        S = getindex(self.∇, I...)
        setindex!(self.∇, S .+ ∇, I...)
    end
    return res
end
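For example, assuming a DTensor constructor that wraps a plain array and zero-initialises ∇, the gradient of a slice lands only on the selected entries:
t = DTensor([1.0 2.0; 3.0 4.0])
y = t[1, :]              # DTensor holding the first row
y.backward(ones(2))      # seed the slice's gradient by hand
t.∇                      # 1.0 at [1,1] and [1,2], zero elsewhere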
MNIST
With a lot of operations implemented, I was able to create and differentiate the following deep neural network:
m = Model(
    # First convolution, operating upon a 28x28 image
    Conv((3, 3), 1=>16, relu),
    MaxPool((2, 2)),
    # Second convolution, operating upon a 13x13 image
    Conv((3, 3), 16=>32, relu),
    MaxPool((2, 2)),
    # Third convolution, operating upon a 5x5 image
    Conv((3, 3), 32=>32, relu),
    MaxPool((2, 2)),
    # Reshape the 3d tensor into a 2d one using flatten; at this point it should be (1, 1, 32, N)
    flatten,
    Dense(32, 10)
)
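Forward pass, backward pass, and parameter update then fit together roughly as follows (a hypothetical sketch: batches, crossentropy, params, and adam_update! are assumed helper names, not shown in this post):
for (X, Y) in batches(train_X, train_Y, 128)
    out = m(DTensor(X))           # forward pass builds the graph
    loss = crossentropy(out, Y)   # scalar DVal at the root
    backward(loss)                # reverse pass fills every .∇ field
    adam_update!(params(m))       # step, then reset the gradients
end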
After 35 epochs with batch size 128 and the ADAM optimiser, an accuracy of 98.84% was reached. Below, all correctly labelled images (left) and all wrongly labelled images (right) can be seen.
Code available here.