# 高级自动微分

## 设置

```python
import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['figure.figsize'] = (8, 6)
```

## 控制梯度记录

```python
x = tf.Variable(2.0)
y = tf.Variable(3.0)

with tf.GradientTape() as t:
  x_sq = x * x
  with t.stop_recording():
    # Ops executed here are not recorded, so `y` gets no gradient.
    y_sq = y * y
  z = x_sq + y_sq

grad = t.gradient(z, {'x': x, 'y': y})

print('dz/dx:', grad['x'])  # 2*x => 4
print('dz/dy:', grad['y'])
```
```dz/dx: tf.Tensor(4.0, shape=(), dtype=float32)
dz/dy: None
```

```python
x = tf.Variable(2.0)
y = tf.Variable(3.0)
reset = True

with tf.GradientTape() as t:
  y_sq = y * y
  if reset:
    # Throw out all the tape recorded so far.
    t.reset()
  z = x * x + y_sq

grad = t.gradient(z, {'x': x, 'y': y})

print('dz/dx:', grad['x'])  # 2*x => 4
print('dz/dy:', grad['y'])
```
```dz/dx: tf.Tensor(4.0, shape=(), dtype=float32)
dz/dy: None
```

## 停止梯度

```python
x = tf.Variable(2.0)
y = tf.Variable(3.0)

with tf.GradientTape() as t:
  y_sq = y**2
  # `tf.stop_gradient` blocks backpropagation through `y_sq`,
  # without needing access to the tape itself.
  z = x**2 + tf.stop_gradient(y_sq)

grad = t.gradient(z, {'x': x, 'y': y})

print('dz/dx:', grad['x'])  # 2*x => 4
print('dz/dy:', grad['y'])
```
```dz/dx: tf.Tensor(4.0, shape=(), dtype=float32)
dz/dy: None
```

## 自定义梯度

```python
# Establish an identity operation, but clip during the gradient pass.
@tf.custom_gradient
def clip_gradients(y):
  def backward(dy):
    return tf.clip_by_norm(dy, 0.5)
  return y, backward

v = tf.Variable(2.0)
with tf.GradientTape() as t:
  output = clip_gradients(v * v)
print(t.gradient(output, v))  # calls "backward", which clips 4 to 2
```
```tf.Tensor(2.0, shape=(), dtype=float32)
```

## 多个条带

```python
x0 = tf.constant(0.0)
x1 = tf.constant(0.0)

with tf.GradientTape() as tape0, tf.GradientTape() as tape1:
  tape0.watch(x0)
  tape1.watch(x1)

  y0 = tf.math.sin(x0)
  y1 = tf.nn.sigmoid(x1)

  y = y0 + y1

  ys = tf.reduce_sum(y)
```
```python
tape0.gradient(ys, x0).numpy()   # cos(x) => 1.0
```
```1.0
```
```python
tape1.gradient(ys, x1).numpy()   # sigmoid(x1)*(1-sigmoid(x1)) => 0.25
```
```0.25
```

### 高阶梯度

`GradientTape` 上下文管理器内的运算会被记录下来，以供自动微分。如果在该上下文中计算梯度，梯度计算也会被记录。因此，完全相同的 API 也适用于高阶梯度。例如：

```python
x = tf.Variable(1.0)  # Create a Tensorflow variable initialized to 1.0

with tf.GradientTape() as t2:
  with tf.GradientTape() as t1:
    y = x * x * x

  # Compute the gradient inside the outer `t2` context manager
  # which means the gradient computation is differentiable as well.
  dy_dx = t1.gradient(y, x)
d2y_dx2 = t2.gradient(dy_dx, x)

print('dy_dx:', dy_dx.numpy())  # 3 * x**2 => 3.0
print('d2y_dx2:', d2y_dx2.numpy())  # 6 * x => 6.0
```
```dy_dx: 3.0
d2y_dx2: 6.0
```

#### 示例：输入梯度正则化

1. 使用内条带计算输出相对于输入的梯度。
2. 计算该输入梯度的幅度。
3. 计算该幅度相对于模型的梯度。
```python
x = tf.random.normal([7, 5])

layer = tf.keras.layers.Dense(10, activation=tf.nn.relu)
```
```python
with tf.GradientTape() as t2:
  # The inner tape only takes the gradient with respect to the input,
  # not the variables.
  with tf.GradientTape(watch_accessed_variables=False) as t1:
    t1.watch(x)
    y = layer(x)
    out = tf.reduce_sum(layer(x)**2)
  # 1. Calculate the input gradient.
  g1 = t1.gradient(out, x)
  # 2. Calculate the magnitude of the input gradient.
  g1_mag = tf.norm(g1)

# 3. Calculate the gradient of the magnitude with respect to the model.
dg1_mag = t2.gradient(g1_mag, layer.trainable_variables)
```
```python
[var.shape for var in dg1_mag]
```
```[TensorShape([5, 10]), TensorShape([10])]
```

## 雅可比矩阵

`GradientTape.jacobian` 方法让您能够有效计算雅可比矩阵。

• 类似于 `gradient`：`sources` 参数可以是张量或张量的容器。
• 不同于 `gradient`：`target` 张量必须是单个张量。

### 标量源

```python
x = tf.linspace(-10.0, 10.0, 200+1)
delta = tf.Variable(0.0)

with tf.GradientTape() as tape:
  y = tf.nn.sigmoid(x+delta)

dy_dx = tape.jacobian(y, delta)
```

```python
print(y.shape)
print(dy_dx.shape)
```
```(201,)
(201,)
```
```python
plt.plot(x.numpy(), y, label='y')
plt.plot(x.numpy(), dy_dx, label='dy/dx')
plt.legend()
_ = plt.xlabel('x')
```

### 张量源

```python
x = tf.random.normal([7, 5])
layer = tf.keras.layers.Dense(10, activation=tf.nn.relu)

# `persistent=True` so the tape can be queried more than once below.
with tf.GradientTape(persistent=True) as tape:
  y = layer(x)

y.shape
```
```TensorShape([7, 10])
```

```python
layer.kernel.shape
```
```TensorShape([5, 10])
```

```python
j = tape.jacobian(y, layer.kernel)
j.shape
```
```TensorShape([7, 10, 5, 10])
```

```python
g = tape.gradient(y, layer.kernel)
print('g.shape:', g.shape)

# Summing the Jacobian over the target axes recovers the gradient.
j_sum = tf.reduce_sum(j, axis=[0, 1])
delta = tf.reduce_max(abs(g - j_sum)).numpy()
assert delta < 1e-3
print('delta:', delta)
```
```g.shape: (5, 10)
delta: 2.3841858e-07
```

#### 示例：黑塞矩阵

```python
x = tf.random.normal([7, 5])
layer1 = tf.keras.layers.Dense(8, activation=tf.nn.relu)
layer2 = tf.keras.layers.Dense(6, activation=tf.nn.relu)

with tf.GradientTape() as t2:
  with tf.GradientTape() as t1:
    x = layer1(x)
    x = layer2(x)
    loss = tf.reduce_mean(x**2)

  # Compute the gradient inside the outer `t2` context manager
  # so the gradient computation is itself differentiable.
  g = t1.gradient(loss, layer1.kernel)

h = t2.jacobian(g, layer1.kernel)
```
```python
print(f'layer.kernel.shape: {layer1.kernel.shape}')
print(f'h.shape: {h.shape}')
```
```layer.kernel.shape: (5, 8)
h.shape: (5, 8, 5, 8)
```

```python
n_params = tf.reduce_prod(layer1.kernel.shape)

g_vec = tf.reshape(g, [n_params, 1])
h_mat = tf.reshape(h, [n_params, n_params])
```

```python
def imshow_zero_center(image, **kwargs):
  """Show `image` with a diverging colormap centered on zero."""
  lim = tf.reduce_max(abs(image))
  plt.imshow(image, vmin=-lim, vmax=lim, cmap='seismic', **kwargs)
  plt.colorbar()
```
```python
imshow_zero_center(h_mat)
```

```python
eps = 1e-3
eye_eps = tf.eye(h_mat.shape[0])*eps
```

```python
# X(k+1) = X(k) - (∇²f(X(k)))^-1 @ ∇f(X(k))
# h_mat = ∇²f(X(k))
# g_vec = ∇f(X(k))
update = tf.linalg.solve(h_mat + eye_eps, g_vec)

# Reshape the update and apply it to the variable.
_ = layer1.kernel.assign_sub(tf.reshape(update, layer1.kernel.shape))
```

### 批量雅可比矩阵

```python
x = tf.random.normal([7, 5])

layer1 = tf.keras.layers.Dense(8, activation=tf.nn.elu)
layer2 = tf.keras.layers.Dense(6, activation=tf.nn.elu)

with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
  tape.watch(x)
  y = layer1(x)
  y = layer2(y)

y.shape
```
```TensorShape([7, 6])
```

`y` 相对 `x` 的完整雅可比矩阵的形状为 `(batch, ins, batch, outs)`，即使您只想要 `(batch, ins, outs)`。

```python
j = tape.jacobian(y, x)
j.shape
```
```TensorShape([7, 6, 7, 5])
```

```python
imshow_zero_center(j[:, 0, :, 0])
_ = plt.title('A (batch, batch) slice')
```

```python
def plot_as_patches(j):
  """Render a (batch, out, batch, in) Jacobian as a grid of patches."""
  # Reorder axes so the diagonals will each form a contiguous patch.
  j = tf.transpose(j, [1, 0, 3, 2])
  # Pad in between each patch.
  lim = tf.reduce_max(abs(j))
  j = tf.pad(j, [[0, 0], [1, 1], [0, 0], [1, 1]],
             constant_values=-lim)
  # Reshape to form a single image.
  s = j.shape
  j = tf.reshape(j, [s[0]*s[1], s[2]*s[3]])
  imshow_zero_center(j, extent=[-0.5, s[2]-0.5, s[0]-0.5, -0.5])

plot_as_patches(j)
_ = plt.title('All (batch, batch) slices are diagonal')
```

```python
j_sum = tf.reduce_sum(j, axis=2)
print(j_sum.shape)
j_select = tf.einsum('bxby->bxy', j)
print(j_select.shape)
```
```(7, 6, 5)
(7, 6, 5)
```

```python
jb = tape.batch_jacobian(y, x)
jb.shape
```
```WARNING:tensorflow:5 out of the last 5 calls to <function pfor.<locals>.f at 0x7f49b409ae18> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
TensorShape([7, 6, 5])
```
```python
error = tf.reduce_max(abs(jb - j_sum))
assert error < 1e-3
print(error.numpy())
```
```0.0
```

```python
x = tf.random.normal([7, 5])

layer1 = tf.keras.layers.Dense(8, activation=tf.nn.elu)
bn = tf.keras.layers.BatchNormalization()
layer2 = tf.keras.layers.Dense(6, activation=tf.nn.elu)

with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
  tape.watch(x)
  y = layer1(x)
  # Batch normalization couples the examples, so the per-example
  # (batch, batch) Jacobian slices are no longer diagonal.
  y = bn(y, training=True)
  y = layer2(y)

j = tape.jacobian(y, x)
print(f'j.shape: {j.shape}')
```
```WARNING:tensorflow:6 out of the last 6 calls to <function pfor.<locals>.f at 0x7f49b4232ea0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
j.shape: (7, 6, 7, 5)
```
```python
plot_as_patches(j)

_ = plt.title('These slices are not diagonal')
_ = plt.xlabel("Don't use `batch_jacobian`")
```

```python
jb = tape.batch_jacobian(y, x)
print(f'jb.shape: {jb.shape}')
```
```WARNING:tensorflow:7 out of the last 7 calls to <function pfor.<locals>.f at 0x7f49b41c8ae8> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
jb.shape: (7, 6, 5)
```
[{ "type": "thumb-down", "id": "missingTheInformationINeed", "label":"没有我需要的信息" },{ "type": "thumb-down", "id": "tooComplicatedTooManySteps", "label":"太复杂/步骤太多" },{ "type": "thumb-down", "id": "outOfDate", "label":"内容需要更新" },{ "type": "thumb-down", "id": "translationIssue", "label":"翻译问题" },{ "type": "thumb-down", "id": "samplesCodeIssue", "label":"Samples / code issue" },{ "type": "thumb-down", "id": "otherDown", "label":"其他" }]
[{ "type": "thumb-up", "id": "easyToUnderstand", "label":"易于理解" },{ "type": "thumb-up", "id": "solvedMyProblem", "label":"解决了我的问题" },{ "type": "thumb-up", "id": "otherUp", "label":"其他" }]