TF 2.0 is out! Get hands-on practice at TF World, Oct 28-31. Use code TF20 for 20% off select passes. Register now

Custom differentiation

View on TensorFlow.org Run in Google Colab View source on GitHub

This tutorial will show you how to define your own custom derivatives, perform derivative surgery, and implement your own gradient checkpointing API in just 5 lines of Swift.

Declaring custom derivatives

You can define custom derivatives for any Swift function that has differentiable parameters and results. By doing that, you can even import a C function and make it differentiable.

import Glibc

/// Computes e raised to the power `x`, printing a message each time it runs
/// (so we can observe when the original computation is re-evaluated).
func sillyExp(_ x: Float) -> Float {
    let eulerNumber = Float(M_E)
    print("Taking 𝑒(\(eulerNumber)) to the power of \(x)!")
    return pow(eulerNumber, x)
}

/// Custom derivative for `sillyExp`: since d/dx eˣ = eˣ, the pullback simply
/// scales the incoming cotangent by the function's own value.
@differentiating(sillyExp)
func sillyDerivative(_ x: Float) -> (value: Float, pullback: (Float) -> Float) {
    let result = sillyExp(x)
    return (value: result, pullback: { cotangent in cotangent * result })
}

// Evaluate the function and its gradient at x = 3; because d/dx eˣ = eˣ,
// both calls print the same value (20.085535), and each triggers `sillyExp`'s print.
print("exp(3) =", sillyExp(3))
print("𝛁exp(3) =", gradient(of: sillyExp)(3))
Taking 𝑒(2.7182817) to the power of 3.0!
exp(3) = 20.085535
Taking 𝑒(2.7182817) to the power of 3.0!
𝛁exp(3) = 20.085535

Stop derivatives from propagating

Commonly known as "stop gradient" in machine learning use cases, the function withoutDerivative(at:) stops derivatives from propagating.

Plus, withoutDerivative(at:) can sometimes help the Swift compiler identify what not to differentiate, producing more efficient derivatives. When it is detectable that the derivative of a function will always be zero, the Swift compiler will produce a warning. Explicitly using withoutDerivative(at:) silences that warning.

// Differentiate sin(sin(sin(x))) + cos(cos(cos(y))) at (2, 3), with the
// second term wrapped in `withoutDerivative(at:)` so its derivative is zero.
let x: Float = 2.0
let y: Float = 3.0
gradient(at: x, y) { a, b in
    sin(sin(sin(a))) + withoutDerivative(at: cos(cos(cos(b))))
}
▿ 2 elements
  - .0 : -0.18009877
  - .1 : 0.0

Derivative surgery

The withGradient(_:) method makes arbitrary operations (including mutation) run on the gradient at a value during the enclosing function’s backpropagation.

Use this to debug or make experimental tweaks to backpropagation.

It works anywhere

All differentiation APIs provided by the standard library are defined generically over all types that conform to the Differentiable protocol: Float, Double, Float80, SIMD vectors, and even your own types!

Read technical document Differentiable Types for more insights on the Differentiable protocol.

var x: Float = 30
x.gradient { x -> Float in
    // Observe the partial derivative flowing into the result of `sin(x)`.
    let sinTerm = sin(x).withGradient { print("∂+/∂sin = \($0)") }
    // Overwrite the partial derivative with respect to `x` with `0.5`.
    let logTerm = log(x.withGradient { (dx: inout Float) in
        print("∂log/∂x = \(dx), but rewritten to 0.5")
        dx = 0.5
    })
    return sinTerm + logTerm
}
∂log/∂x = 0.033333335, but rewritten to 0.5
∂+/∂sin = 1.0

0.65425146

Use it in a neural network module

Just like how we used it in a simple Float function, we can use it in any numerical application, like the following neural network built using the Swift for TensorFlow Deep Learning Library.

import TensorFlow

/// A two-layer perceptron (2 → 10 → 1, ReLU) that prints the gradient
/// flowing back into its first layer's activations during backpropagation.
struct MLP: Layer {
    var layer1 = Dense<Float>(inputSize: 2, outputSize: 10, activation: relu)
    var layer2 = Dense<Float>(inputSize: 10, outputSize: 1, activation: relu)

    @differentiable
    func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        // Surface the partial derivative w.r.t. the first layer's output.
        let hidden = layer1(input).withGradient { print("∂L/∂layer1 =", $0) }
        return layer2(hidden)
    }
}

// Train the MLP on the XOR truth table for 10 SGD steps, printing the loss
// and the gradient w.r.t. the prediction at every step.
var classifier = MLP()
let optimizer = SGD(for: classifier, learningRate: 0.02)

let x: Tensor<Float> = [[0, 0], [0, 1], [1, 0], [1, 1]]
let y: Tensor<Float> = [0, 1, 1, 0]

for _ in 0..<10 {
    let grads = classifier.gradient { classifier -> Tensor<Float> in
        // Peek at the loss gradient w.r.t. the model's prediction.
        let prediction = classifier(x).withGradient { print("∂L/∂ŷ =", $0) }
        let loss = (prediction - y).squared().mean()
        print("Loss: \(loss)")
        return loss
    }
    optimizer.update(&classifier.allDifferentiableVariables, along: grads)
}
Loss: 0.36787152
∂L/∂ŷ = [[      -0.25],
 [-0.21979737],
 [0.029429317],
 [ 0.07870537]]
∂L/∂layer1 = [[         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [  0.15134498,   0.08074134,  -0.12006457,  -0.16216259,  -0.06618697,    0.0525441,
  -0.110061675, -0.025550669,  0.088662766, -0.028133495],
 [-0.020264026, -0.010810695,    0.0160758,  0.021712426,  0.008861968, -0.007035284,
   0.014736481, 0.0034210542, -0.011871318, 0.0037668764],
 [ -0.05419384, -0.028911978,    0.0429929,   0.05806742,   0.02370033, -0.018815069,
    0.03941105,  0.009149222, -0.031748496,  0.010074084]]
Loss: 0.36323816
∂L/∂ŷ = [[-0.24787295],
 [-0.21167284],
 [0.028472602],
 [ 0.07861963]]
∂L/∂layer1 = [[  0.17034283,   0.09112822,  -0.13597326,  -0.18259563,  -0.07435731,   0.05882091,
    -0.1241397, -0.028814355,  0.099882215,  -0.03209903],
 [  0.14546545,  0.077819586, -0.116115324,   -0.1559288, -0.063497946,   0.05023053,
   -0.10600996,  -0.02460622,   0.08529512, -0.027411193],
 [-0.019566894, -0.010467692,   0.01561894,  0.020974344,  0.008541255, -0.006756624,
   0.014259645, 0.0033098394, -0.011473243, 0.0036871426],
 [ -0.05402885, -0.028903788,  0.043127608,  0.057915155,  0.023584437, -0.018656647,
   0.039374273,  0.009139255, -0.031680353,  0.010181078]]
Loss: 0.35611892
∂L/∂ŷ = [[-0.24196446],
 [ -0.2000233],
 [0.029946685],
 [ 0.08164552]]
∂L/∂layer1 = [[   0.16597457,     0.0890296,   -0.13326885,    -0.1779743,  -0.072311446,    0.05701217,
    -0.12120601,  -0.028127514,    0.09740552,  -0.031685367],
 [    0.1372052,   0.073597565,  -0.110168554,   -0.14712493,  -0.059777264,   0.047129903,
    -0.10019663,  -0.023252001,   0.080521636,   -0.02619315],
 [ -0.020541811,  -0.011018732,   0.016493995,   0.022026956,   0.008949612, -0.0070561003,
    0.015001038,  0.0034811965,  -0.012055376,   0.003921533],
 [ -0.056004427,  -0.030041058,   0.044968605,   0.060053464,   0.024399888,  -0.019237487,
    0.040898267,   0.009491003,  -0.032867324,   0.010691521]]
Loss: 0.34967244
∂L/∂ŷ = [[-0.23636964],
 [-0.18907148],
 [0.031095803],
 [0.084183514]]
∂L/∂layer1 = [[  0.16185817,   0.08705014,  -0.13067713,  -0.17358741,  -0.07036185,   0.05532043,
   -0.11843857, -0.027477136,  0.095068045, -0.031280197],
 [  0.12946995,   0.06963119, -0.104528315,  -0.13885212, -0.056282267,  0.044250675,
  -0.094738714, -0.021978892,  0.076044686,  -0.02502095],
 [-0.021293385, -0.011451953,   0.01719134,  0.022836434,  0.009256511, -0.007277725,
   0.015581284, 0.0036147772, -0.012506754,  0.004115092],
 [-0.057646107, -0.031003078,  0.046540923,    0.0618235,  0.025059512, -0.019702481,
   0.042182133,  0.009786036,  -0.03385867,  0.011140504]]
Loss: 0.3438152
∂L/∂ŷ = [[-0.23106515],
 [-0.17876765],
 [ 0.03195831],
 [ 0.08628389]]
∂L/∂layer1 = [[   0.15797314,    0.08517992,   -0.12819137,   -0.16941962,   -0.06850335,   0.053735502,
    -0.11582381,  -0.026860509,   0.092858635,  -0.030883618],
 [   0.12221871,    0.06590095,   -0.09917752,   -0.13107449,  -0.052998833,   0.041573424,
   -0.089609146,   -0.02078111,    0.07184173,  -0.023893658],
 [ -0.021849053,   -0.01178112,   0.017729975,     0.0234322,   0.009474607, -0.0074320855,
    0.016019436,  0.0037150409,  -0.012843153,   0.004271472],
 [ -0.058990017,  -0.031807717,   0.047868963,    0.06326434,   0.025580386,  -0.020065805,
      0.0432507,   0.010030198,  -0.034675088,   0.011532499]]
Loss: 0.3384761
∂L/∂ŷ = [[ -0.2260297],
 [ -0.1690661],
 [0.032568455],
 [ 0.08799219]]
∂L/∂layer1 = [[     0.154301,   0.083410025,   -0.12580524,   -0.16545667,     -0.066731,   0.052248247,
   -0.113349475,  -0.026275154,    0.09076726,   -0.03049564],
 [   0.11541434,   0.062389184,   -0.09410003,  -0.123758584,   -0.04991357,    0.03908074,
   -0.084783345,  -0.019653337,    0.06789225,  -0.022810185],
 [  -0.02223312,  -0.012018491,   0.018127186,   0.023840532,   0.009615221, -0.0075284117,
    0.016332444,  0.0037859678,  -0.013078588,   0.004394095],
 [  -0.06006858,   -0.03247109,    0.04897533,   0.064411424,   0.025978032,  -0.020339971,
    0.044126365,   0.010228782,  -0.035335224,   0.011871795]]
Loss: 0.3335942
∂L/∂ŷ = [[-0.22124359],
 [-0.15992486],
 [0.032957107],
 [ 0.08934954]]
∂L/∂layer1 = [[    0.1508249,    0.08173239,   -0.12351285,   -0.16168518,   -0.06503999,    0.05085041,
    -0.11100442,  -0.025718786,     0.0887847,  -0.030116184],
 [   0.10902306,   0.059079867,   -0.08928067,   -0.11687335,  -0.047013845,   0.036756974,
    -0.08023901,  -0.018590702,   0.064177595,  -0.021769337],
 [ -0.022467328,  -0.012175102,   0.018398844,   0.024085108,   0.009688552, -0.0075748293,
     0.01653555,  0.0038311472,  -0.013225635,  0.0044861967],
 [  -0.06091085,   -0.03300774,    0.04988084,    0.06529679,   0.026266493,  -0.020536011,
    0.044829294,  0.0103865685,  -0.035855826,   0.012162464]]
Loss: 0.32911736
∂L/∂ŷ = [[ -0.2166889],
 [-0.15130532],
 [0.033151716],
 [0.090393335]]
∂L/∂layer1 = [[  0.14752965,   0.08013974,  -0.12130868,  -0.15809274,   -0.0634258,  0.049534593,
   -0.10877859,  -0.02518932,  0.086902656, -0.029745128],
 [  0.10301414,  0.055958424,  -0.08470507, -0.110389926, -0.044287737,  0.034588054,
    -0.0759558, -0.017588709,    0.0606807, -0.020769851],
 [-0.022570888, -0.012260757,  0.018559285,   0.02418696,  0.009703654, -0.007578407,
   0.016642278,   0.00385377, -0.013295431,  0.004550773],
 [-0.061543055,  -0.03343087,  0.050604787,   0.06594952,   0.02645853,  -0.02066371,
    0.04537777,  0.010507906,  -0.03625207,  0.012408394]]
Loss: 0.32500064
∂L/∂ŷ = [[-0.21234906],
 [-0.14317185],
 [ 0.03317684],
 [ 0.09115714]]
∂L/∂layer1 = [[  0.14440133,  0.078625455, -0.119187534,   -0.1546677, -0.061884068,   0.04829408,
   -0.10666279, -0.024684828,   0.08511352, -0.029382294],
 [  0.09735953,  0.053011548,  -0.08035967,  -0.10428141,  -0.04172402,  0.032561257,
   -0.07191512, -0.016643222,  0.057385985, -0.019810388],
 [ -0.02256087, -0.012284228,  0.018621536,  0.024164861,  0.009668599, -0.007545335,
   0.016664704, 0.0038566904, -0.013297904, 0.0045906096],
 [-0.061988555,  -0.03375231,  0.051164787,    0.0663957,  0.026565574, -0.020731667,
    0.04578817,  0.010596695, -0.036537506,  0.012613222]]
Loss: 0.32120517
∂L/∂ŷ = [[-0.20820889],
 [-0.13549167],
 [ 0.03305462],
 [ 0.09167144]]
∂L/∂layer1 = [[  0.14142731,   0.07718353,  -0.11714458,  -0.15139927, -0.060410682,  0.047122817,
   -0.10464867, -0.024203548,   0.08341037, -0.029027475],
 [  0.09203365,  0.050227083,  -0.07623169,  -0.09852289, -0.039312176,  0.030665113,
   -0.06809999, -0.015750429,  0.054279193, -0.018889593],
 [-0.022452578, -0.012253427,  0.018597523,  0.024035696,  0.009590619, -0.007481077,
   0.016613709, 0.0038424828, -0.013241981,  0.004608315],
 [ -0.06226845,  -0.03398282,  0.051577106,  0.066658966,  0.026597971,  -0.02074751,
   0.046075333,  0.010656481,  -0.03672441,  0.012780388]]

Recomputing activations during backpropagation to save memory (checkpointing)

Checkpointing is a traditional technique in reverse-mode automatic differentiation for saving memory. Rather than saving large intermediate values in the original computation for computing derivatives, the intermediate values are instead recomputed as needed during backpropagation.

This technique has been realized in modern deep learning libraries as well. In Swift, API withRecomputationInPullbacks(_:) enables you to control what to recompute during backpropagation, and it is available on all Differentiable types.

But today, let us learn how to define our own gradient checkpointing APIs from scratch, in just a few lines of code.

Our gradient checkpointing API

We can define our own gradient checkpointing API, makeRecomputedInGradient(_:), in terms of standard library function differentiableFunction(from:), which is a shorthand for creating a differentiable function directly from a derivative function (also called a "vector-Jacobian product (VJP) function").

As we have seen before, the derivative function returns a tuple of the original function's result and a pullback closure. We return original(x) in value:, and call pullback(at:in:) on original to evaluate the original function again and get a pullback.

/// Given a differentiable function, returns the same differentiable function except when
/// derivatives of this function are being computed. In that case, values in the original function
/// needed for computing the derivatives will be recomputed, instead of being captured by the
/// differential or pullback.
///
/// - Parameter original: The body of the differentiable function.
/// - Returns: The same differentiable function whose derivatives, when computed, will recompute
///   some values from the original function.
func makeRecomputedInGradient<T: Differentiable, U: Differentiable>(
    _ original: @escaping @differentiable (T) -> U
) -> @differentiable (T) -> U {
    return differentiableFunction { input in
        // Produce the value by calling `original`, and build the pullback by re-running
        // `original` under `pullback(at:in:)` — recomputation instead of capture.
        (value: original(input),
         pullback: { cotangent in pullback(at: input, in: original)(cotangent) })
    }
}

Verify it works

let input: Float = 10.0
print("Running original computation...")

// Checkpointed squaring: prints once per evaluation, so we can see the
// recomputation happen during backpropagation.
let square = makeRecomputedInGradient { (x: Float) -> Float in
    print("  Computing square...")
    return x * x
}

// Differentiate `f(x) = (cos(x))^2`.
let (output, backprop) = input.valueWithPullback { value -> Float in
    square(cos(value))
}
print("Running backpropagation...")
let grad = backprop(1)
print("Gradient = \(grad)")
Running original computation...
  Computing square...
Running backpropagation...
  Computing square...
Gradient = -0.9129453

Extend it to neural network modules

In this example, we define a simple convolutional neural network.

/// A small convolutional network: conv → max-pool → flatten → dense.
struct Model: Layer {
    var conv = Conv2D<Float>(filterShape: (5, 5, 3, 6))
    var maxPool = MaxPool2D<Float>(poolSize: (2, 2), strides: (2, 2))
    var flatten = Flatten<Float>()
    var dense = Dense<Float>(inputSize: 36 * 6, outputSize: 10)

    @differentiable
    func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        // `Layer` requires `callAsFunction(_:)` (the SE-0253 name); the old name
        // `call(_:)` would not satisfy the protocol requirement, and the second
        // version of this struct later in the tutorial already uses `callAsFunction`.
        return input.sequenced(through: conv, maxPool, flatten, dense)
    }
}

We want to make activations in the convolution layer (conv) be recomputed during backpropagation. However, using makeRecomputedInGradient(_:) could make the resulting code look cumbersome, especially when we want to apply layers sequentially using sequenced(through:_:_:_:_:).

input.sequenced(through: conv, maxPool, flatten, dense)

So, why don't we define a special layer type that wraps a layer and makes its activations be recomputed during backpropagation? Let's do it.

First, we define a makeRecomputedInGradient(_:) function that takes a binary function.

// Same as the previous `makeRecomputedInGradient(_:)`, except it's for binary functions.
func makeRecomputedInGradient<T: Differentiable, U: Differentiable, V: Differentiable>(
    _ original: @escaping @differentiable (T, U) -> V
) -> @differentiable (T, U) -> V {
    return differentiableFunction { x, y in
        (value: original(x, y), pullback: { v in pullback(at: x, y, in: original)(v) })
    }
}

Then, we define a generic layer ActivationDiscarding<Wrapped>.

import TensorFlow

/// A layer wrapper that makes the underlying layer's activations be discarded during
/// application and recomputed during backpropagation.
struct ActivationDiscarding<Wrapped: Layer>: Layer
    where Wrapped.AllDifferentiableVariables == Wrapped.TangentVector {
    /// The wrapped layer.
    var wrapped: Wrapped

    @differentiable
    func callAsFunction(_ input: Wrapped.Input) -> Wrapped.Output {
        // Route the layer application through the checkpointed binary function so
        // the activations are recomputed — not stored — when the pullback runs.
        let recomputedApplication = makeRecomputedInGradient { (layer: Wrapped, input: Input) -> Wrapped.Output in
            print("    Applying \(Wrapped.self) layer...")
            return layer(input)
        }
        return recomputedApplication(wrapped, input)
    }
}

Finally, we can add a method on all layers that returns the same layer except its activations are discarded during application and recomputed during backpropagation.

extension Layer where AllDifferentiableVariables == TangentVector {
    /// Returns the same layer, except its activations are discarded during
    /// application and recomputed during backpropagation.
    func discardingActivations() -> ActivationDiscarding<Self> {
        ActivationDiscarding(wrapped: self)
    }
}

Back in the model, all we have to change is to wrap the convolution layer into the activation-recomputing layer.

// Wrap the convolution layer so its activations are recomputed in the pullback.
var conv = Conv2D<Float>(filterShape: (5, 5, 3, 6)).discardingActivations()

Now, simply use it in the model!

/// The same convolutional model as before, with the convolution layer wrapped
/// so that its activations are recomputed during backpropagation.
struct Model: Layer {
    var conv = Conv2D<Float>(filterShape: (5, 5, 3, 6)).discardingActivations()
    var maxPool = MaxPool2D<Float>(poolSize: (2, 2), strides: (2, 2))
    var flatten = Flatten<Float>()
    var dense = Dense<Float>(inputSize: 36 * 6, outputSize: 10)

    @differentiable
    func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        return input.sequenced(through: conv, maxPool, flatten, dense)
    }
}

When we run a training loop, we can see that the convolution layer's activations are computed twice: once during layer application, and once during backpropagation.

// Use random training data.
let x = Tensor<Float>(randomNormal: [10, 16, 16, 3])
let y = Tensor<Int32>(rangeFrom: 0, to: 10, stride: 1)

var model = Model()
let opt = SGD(for: model)

for i in 1...5 {
    print("Starting training step \(i)")
    print("  Running original computation...")
    // Forward pass: the conv layer's activations are computed, then discarded.
    let (logits, backprop) = model.appliedForBackpropagation(to: x)
    let (loss, lossGradient) = logits.valueWithGradient { logits in
        softmaxCrossEntropy(logits: logits, labels: y)
    }
    print("  Loss: \(loss)")
    print("  Running backpropagation...")
    // Backward pass: the conv layer re-applies itself to recompute activations.
    let (modelGradient, _) = backprop(lossGradient)

    opt.update(&model.allDifferentiableVariables, along: modelGradient)
}
Starting training step 1
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 3.1912904
  Running backpropagation...
    Applying Conv2D<Float> layer...
Starting training step 2
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 2.8539681
  Running backpropagation...
    Applying Conv2D<Float> layer...
Starting training step 3
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 2.5659466
  Running backpropagation...
    Applying Conv2D<Float> layer...
Starting training step 4
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 2.3152032
  Running backpropagation...
    Applying Conv2D<Float> layer...
Starting training step 5
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 2.097056
  Running backpropagation...
    Applying Conv2D<Float> layer...

Just like that, it is super easy to define generic differentiable programming libraries for different domains.