diff --git a/CMakeLists.txt b/CMakeLists.txt index c4e6e2d92..6b02b28d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,7 +118,7 @@ if(ENABLE_PYTHON_SUPPORT) GIT_REPOSITORY git://github.com/pvieito/PythonKit GIT_TAG - master + 6a05a15 CMAKE_ARGS -D BUILD_SHARED_LIBS=YES -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} @@ -189,7 +189,7 @@ if(NOT X10_FOUND AND NOT USE_BUNDLED_X10) COMMAND rm -rf /bazel-bin # ${CMAKE_COMMAND} -E rm -Rrf /bazel-bin COMMAND - bazel build ${VISIBILITY_FLAGS} -c opt --define framework_shared_object=false //tensorflow/compiler/tf2xla/xla_tensor:x10 --nocheck_visibility + bazel build ${VISIBILITY_FLAGS} -c opt --define framework_shared_object=false //tensorflow:tensorflow //tensorflow/compiler/tf2xla/xla_tensor:x10 --nocheck_visibility COMMAND bazel shutdown INSTALL_COMMAND diff --git a/Documentation/X10/SUMMARY.md b/Documentation/X10/SUMMARY.md index 1beaa6aa0..2d00e42fc 100644 --- a/Documentation/X10/SUMMARY.md +++ b/Documentation/X10/SUMMARY.md @@ -22,7 +22,7 @@ public struct MyModel: Layer { public var dense3 = Dense(inputSize: 4, outputSize: 4) public var flatten = Flatten() - @differentiable + @differentiable(reverse) public func callAsFunction(_ input: Tensor) -> Tensor { let layer1 = dense1(input) let layer2 = layer1.reshaped(to: [1, 4]) diff --git a/README.md b/README.md index 8c795d224..1f9ec86fe 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ struct Model: Layer { var layer2 = Dense(inputSize: hiddenSize, outputSize: hiddenSize, activation: relu) var layer3 = Dense(inputSize: hiddenSize, outputSize: 3, activation: identity) - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Tensor) -> Tensor { return input.sequenced(through: layer1, layer2, layer3) } diff --git a/Sources/TensorFlow/BackwardsCompatibility.swift b/Sources/TensorFlow/BackwardsCompatibility.swift index 3752a22b3..3aaf26adb 100644 --- a/Sources/TensorFlow/BackwardsCompatibility.swift +++ b/Sources/TensorFlow/BackwardsCompatibility.swift @@ -23,8 +23,8 @@ import _Differentiation /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func l1Loss( predicted: Tensor, expected: Tensor @@ -37,8 +37,8 @@ public func l1Loss( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func l2Loss( predicted: Tensor, expected: Tensor @@ -51,8 +51,8 @@ public func l2Loss( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func hingeLoss( predicted: Tensor, expected: Tensor @@ -65,8 +65,8 @@ public func hingeLoss( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. 
-@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func squaredHingeLoss( predicted: Tensor, expected: Tensor @@ -79,8 +79,8 @@ public func squaredHingeLoss( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func categoricalHingeLoss( predicted: Tensor, expected: Tensor @@ -94,8 +94,8 @@ public func categoricalHingeLoss( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func logCoshLoss( predicted: Tensor, expected: Tensor @@ -108,8 +108,8 @@ public func logCoshLoss( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func poissonLoss( predicted: Tensor, expected: Tensor @@ -123,8 +123,8 @@ public func poissonLoss( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func kullbackLeiblerDivergence( predicted: Tensor, expected: Tensor @@ -137,7 +137,7 @@ public func kullbackLeiblerDivergence( /// - Parameters: /// - logits: One-hot encoded outputs from a neural network. /// - labels: Indices (zero-indexed) of the correct outputs. -@differentiable(wrt: logits) +@differentiable(reverse, wrt: logits) public func softmaxCrossEntropy( logits: Tensor, probabilities: Tensor @@ -149,8 +149,8 @@ public func softmaxCrossEntropy( /// - Parameters: /// - logits: The unscaled output of a neural network. /// - labels: Integer values that correspond to the correct output. 
-@differentiable(wrt: logits) -@differentiable(wrt: (logits, labels)) +@differentiable(reverse, wrt: logits) +@differentiable(reverse, wrt: (logits, labels)) public func sigmoidCrossEntropy( logits: Tensor, labels: Tensor diff --git a/Sources/TensorFlow/Core/DifferentialOperators.swift b/Sources/TensorFlow/Core/DifferentialOperators.swift index d3030892c..cfd6ce810 100644 --- a/Sources/TensorFlow/Core/DifferentialOperators.swift +++ b/Sources/TensorFlow/Core/DifferentialOperators.swift @@ -23,10 +23,10 @@ import _Differentiation @inlinable public func valueWithGradient( at x: T, - in f: @differentiable (T) -> Tensor + in f: @differentiable(reverse) (T) -> Tensor ) -> (value: Tensor, gradient: T.TangentVector) where T: Differentiable, R: TensorFlowFloatingPoint { - let (y, pullback) = valueWithPullback(at: x, in: f) + let (y, pullback) = valueWithPullback(at: x, of: f) precondition( y.rank == 0, """ @@ -40,10 +40,10 @@ where T: Differentiable, R: TensorFlowFloatingPoint { public func valueWithGradient( at x: T, _ y: U, - in f: @differentiable (T, U) -> Tensor + in f: @differentiable(reverse) (T, U) -> Tensor ) -> (value: Tensor, gradient: (T.TangentVector, U.TangentVector)) where T: Differentiable, U: Differentiable, R: TensorFlowFloatingPoint { - let (y, pullback) = valueWithPullback(at: x, y, in: f) + let (y, pullback) = valueWithPullback(at: x, y, of: f) precondition( y.rank == 0, """ @@ -58,10 +58,10 @@ public func valueWithGradient( at x: T, _ y: U, _ z: V, - in f: @differentiable (T, U, V) -> Tensor + in f: @differentiable(reverse) (T, U, V) -> Tensor ) -> (value: Tensor, gradient: (T.TangentVector, U.TangentVector, V.TangentVector)) where T: Differentiable, U: Differentiable, V: Differentiable, R: TensorFlowFloatingPoint { - let (y, pullback) = valueWithPullback(at: x, y, z, in: f) + let (y, pullback) = valueWithPullback(at: x, y, z, of: f) precondition(y.rank == 0) return (y, pullbackOfOneLikeY(y: y, pullback: pullback)) } @@ -70,7 +70,7 @@ where T: Differentiable, U: Differentiable, V: Differentiable, R: TensorFlowFloa @inlinable public func valueWithGradient( - of f: @escaping @differentiable (T) -> Tensor + of f: @escaping @differentiable(reverse) (T) -> Tensor ) -> (T) -> (value: Tensor, gradient: T.TangentVector) where T: Differentiable, R: TensorFlowFloatingPoint { return { x in valueWithGradient(at: x, in: f) } @@ -78,7 +78,7 @@ where T: Differentiable, R: TensorFlowFloatingPoint { @inlinable public func valueWithGradient( - of f: @escaping @differentiable (T, U) -> Tensor + of f: @escaping @differentiable(reverse) (T, U) -> Tensor ) -> (T, U) -> (value: Tensor, gradient: (T.TangentVector, U.TangentVector)) where T: Differentiable, U: Differentiable, R: TensorFlowFloatingPoint { return { x, y in valueWithGradient(at: x, y, in: f) } @@ -86,7 +86,7 @@ where T: Differentiable, U: Differentiable, R: TensorFlowFloatingPoint { @inlinable public func valueWithGradient( - of f: @escaping @differentiable (T, U, V) -> Tensor + of f: @escaping @differentiable(reverse) (T, U, V) -> Tensor ) -> (T, U, V) -> ( value: Tensor, gradient: (T.TangentVector, U.TangentVector, V.TangentVector) @@ -100,7 +100,7 @@ where T: Differentiable, U: Differentiable, V: Differentiable, R: TensorFlowFloa @inlinable public func gradient( at x: T, - in f: @differentiable (T) -> Tensor + in f: @differentiable(reverse) (T) -> Tensor ) -> T.TangentVector where T: Differentiable, R: TensorFlowFloatingPoint { return valueWithGradient(at: x, in: f).1 } @@ -109,7 +109,7 @@ public func gradient( public func gradient( 
at x: T, _ y: U, - in f: @differentiable (T, U) -> Tensor + in f: @differentiable(reverse) (T, U) -> Tensor ) -> (T.TangentVector, U.TangentVector) where T: Differentiable, U: Differentiable, R: TensorFlowFloatingPoint { return valueWithGradient(at: x, y, in: f).1 @@ -120,7 +120,7 @@ public func gradient( at x: T, _ y: U, _ z: V, - in f: @differentiable (T, U, V) -> Tensor + in f: @differentiable(reverse) (T, U, V) -> Tensor ) -> (T.TangentVector, U.TangentVector, V.TangentVector) where T: Differentiable, U: Differentiable, V: Differentiable, R: TensorFlowFloatingPoint { return valueWithGradient(at: x, y, z, in: f).1 @@ -130,14 +130,14 @@ where T: Differentiable, U: Differentiable, V: Differentiable, R: TensorFlowFloa @inlinable public func gradient( - of f: @escaping @differentiable (T) -> Tensor + of f: @escaping @differentiable(reverse) (T) -> Tensor ) -> (T) -> T.TangentVector where T: Differentiable, R: TensorFlowFloatingPoint { return { x in gradient(at: x, in: f) } } @inlinable public func gradient( - of f: @escaping @differentiable (T, U) -> Tensor + of f: @escaping @differentiable(reverse) (T, U) -> Tensor ) -> (T, U) -> (T.TangentVector, U.TangentVector) where T: Differentiable, U: Differentiable, R: TensorFlowFloatingPoint { return { x, y in gradient(at: x, y, in: f) } @@ -145,7 +145,7 @@ where T: Differentiable, U: Differentiable, R: TensorFlowFloatingPoint { @inlinable public func gradient( - of f: @escaping @differentiable (T, U, V) -> Tensor + of f: @escaping @differentiable(reverse) (T, U, V) -> Tensor ) -> (T, U, V) -> (T.TangentVector, U.TangentVector, V.TangentVector) where T: Differentiable, U: Differentiable, V: Differentiable, R: TensorFlowFloatingPoint { return { x, y, z in gradient(at: x, y, z, in: f) } diff --git a/Sources/TensorFlow/Core/MixedPrecision.swift b/Sources/TensorFlow/Core/MixedPrecision.swift index 97d5d98ff..ea58503f8 100644 --- a/Sources/TensorFlow/Core/MixedPrecision.swift +++ b/Sources/TensorFlow/Core/MixedPrecision.swift @@ -153,7 +153,7 @@ extension Tensor { /// Promotes a scalar to a tensor with the same device and precision as the given tensor. // TODO (SR-12968): Mark `tensor` with `@noDerivative` and remove custom vjp below. - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public init(_ value: Scalar, deviceAndPrecisionLike tensor: Tensor) { let device = tensor.device let tmp = Tensor(value, on: device) diff --git a/Sources/TensorFlow/Core/Tensor.swift b/Sources/TensorFlow/Core/Tensor.swift index b376aa6de..497f02afa 100644 --- a/Sources/TensorFlow/Core/Tensor.swift +++ b/Sources/TensorFlow/Core/Tensor.swift @@ -40,12 +40,12 @@ public struct Tensor { @usableFromInline internal var _isScalarZero = false - /// An internal workaround for SR-13263: debug info generation crash. - @usableFromInline - class SR13263Workaround {} + // /// An internal workaround for SR-13263: debug info generation crash. + // @usableFromInline + // class SR13263Workaround {} - /// An internal workaround for SR-13263: debug info generation crash. - internal var _sr13263Workaround: SR13263Workaround? + // /// An internal workaround for SR-13263: debug info generation crash. + // internal var _sr13263Workaround: SR13263Workaround? @inlinable public init(handle: TensorHandle) { @@ -132,7 +132,7 @@ extension Tensor { /// Reshape to scalar. /// - Precondition: The tensor has exactly one scalar. 
@inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func scalarized() -> Scalar { precondition( shape.contiguousSize == 1, @@ -174,7 +174,7 @@ extension Tensor { return handle.makeHostCopy() } - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public var scalars: [Scalar] { if handle.backend == .XLA { let (storage, _) = xlaTensor.fetchTensorValues(Scalar.self) @@ -203,7 +203,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { extension Tensor { /// Creates a 0-D tensor from a scalar value. - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public init(_ value: Scalar, on device: Device = .default) { switch device.backend { case .XLA: @@ -227,7 +227,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { extension Tensor { /// Creates a 1D tensor from scalars. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public init(_ scalars: [Scalar], on device: Device = .default) { self.init(shape: [scalars.count], scalars: scalars, on: device) } @@ -247,7 +247,7 @@ extension Tensor { /// - scalars: The scalar contents of the tensor. /// - Precondition: The product of the dimensions of the shape must equal the number of scalars. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public init(shape: TensorShape, scalars: [Scalar], on device: Device = .default) { precondition( shape.contiguousSize == scalars.count, @@ -628,7 +628,7 @@ extension Tensor: AdditiveArithmetic where Scalar: Numeric { /// Adds two tensors and produces their sum. /// - Note: `+` supports broadcasting. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func + (lhs: Tensor, rhs: Tensor) -> Tensor { if lhs._isScalarZero { return rhs @@ -641,7 +641,7 @@ extension Tensor: AdditiveArithmetic where Scalar: Numeric { /// Subtracts one tensor from another and produces their difference. /// - Note: `-` supports broadcasting. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func - (lhs: Tensor, rhs: Tensor) -> Tensor { if rhs._isScalarZero { return lhs @@ -745,7 +745,7 @@ public protocol TensorProtocol { public protocol DifferentiableTensorProtocol: TensorProtocol & Differentiable & EuclideanDifferentiable where Scalar: TensorFlowFloatingPoint { - @differentiable(wrt: self) + @differentiable(reverse, wrt: self) func annotate(_ annotation: String) -> Self } @@ -773,7 +773,7 @@ where Scalar: TensorFlowFloatingPoint { /// /// - Parameter annotation: The annotation to be added. /// - Returns: The annotated tensor. - @differentiable(wrt: self) + @differentiable(reverse, wrt: self) public func annotate(_ annotation: String) -> Tensor { switch handle.backend { case .XLA: diff --git a/Sources/TensorFlow/Initializers.swift b/Sources/TensorFlow/Initializers.swift index 33a703d0d..e4ffea51c 100644 --- a/Sources/TensorFlow/Initializers.swift +++ b/Sources/TensorFlow/Initializers.swift @@ -36,7 +36,7 @@ extension Tensor { /// - repeatedValue: The scalar value to repeat. /// - shape: The dimensions of the tensor. 
@inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public init( repeating repeatedValue: Scalar, shape: TensorShape, on device: Device = .default @@ -49,7 +49,7 @@ extension Tensor { /// Creates a tensor by broadcasting the given scalar to a given rank with /// all dimensions being 1. @inlinable - // @differentiable(where Scalar: TensorFlowFloatingPoint) + // @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public init(broadcasting scalar: Scalar, rank: Int, on device: Device = .default) { self = Tensor(scalar, on: device).reshaped(to: TensorShape(repeating: 1, count: rank)) } @@ -93,7 +93,7 @@ extension Tensor where Scalar: Numeric { /// Perform an element-wise conversion from another `Tensor`. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint, OtherScalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint, OtherScalar: TensorFlowFloatingPoint) public init(_ other: Tensor) { self = _Raw.cast(other) } @@ -116,7 +116,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { extension Tensor { /// Creates a tensor from an array of tensors (which may themselves be scalars). @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public init(_ elements: [Tensor]) { self = _Raw.pack(elements) } @@ -150,7 +150,7 @@ extension Tensor { /// /// - Returns: The stacked tensor. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public init(stacking tensors: [Tensor], alongAxis axis: Int = 0) { self = _Raw.pack(tensors, axis: Int64(axis)) } @@ -188,7 +188,7 @@ extension Tensor { /// /// - Returns: The concatenated tensor. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public init(concatenating tensors: [Tensor], alongAxis axis: Int = 0) { precondition(tensors.count > 0) self = _Raw.concatV2(tensors, axis: Tensor(Int32(axis), on: tensors.first!.device)) diff --git a/Sources/TensorFlow/Layer.swift b/Sources/TensorFlow/Layer.swift index 44aa6fdca..884696709 100644 --- a/Sources/TensorFlow/Layer.swift +++ b/Sources/TensorFlow/Layer.swift @@ -32,14 +32,14 @@ where /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable(wrt: self) + @differentiable(reverse, wrt: self) func callAsFunction(_ input: Input) -> Output /// Returns the output obtained from applying the layer to the given input. /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable(wrt: self) + @differentiable(reverse, wrt: self) func forward(_ input: Input) -> Output } @@ -48,7 +48,7 @@ extension Module { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable(wrt: self) + @differentiable(reverse, wrt: self) public func forward(_ input: Input) -> Output { return callAsFunction(input) } @@ -60,7 +60,7 @@ extension Module where Input: TensorProtocol, Output: DifferentiableTensorProtoc /// /// - Parameter input: The input to the layer. /// - Returns: The annotated output. 
- @differentiable(wrt: self) + @differentiable(reverse, wrt: self) public func callAsFunction(_ input: Input) -> Output { let activation = forward(input) return annotated(activation) @@ -72,7 +72,7 @@ extension Module where Input: TensorProtocol, Output: DifferentiableTensorProtoc /// /// - Parameter output: The output to the layer. /// - Returns: The annotated output. - @differentiable + @differentiable(reverse) public func annotated(_ output: Output) -> Output { let annotated = output.annotate("type=\(Self.self)") return annotated @@ -153,31 +153,19 @@ public protocol Layer: Module where Input: Differentiable { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Input) -> Output - - @differentiable - func forward(_ input: Input) -> Output } -extension Layer { - // Workaround for SR-13455: autodiff undefined symbol linker error. - @differentiable(wrt: self) - @differentiable - public func forward(_ input: Input) -> Output { - return callAsFunction(input) - } -} -extension Layer where Input: DifferentiableTensorProtocol, Output: DifferentiableTensorProtocol { - // Workaround for SR-13455: autodiff undefined symbol linker error. - @differentiable(wrt: self) - @differentiable - public func callAsFunction(_ input: Input) -> Output { - let activation = forward(input) - return annotated(activation) - } -} +// extension Layer where Input: DifferentiableTensorProtocol, Output: DifferentiableTensorProtocol { +// // Workaround for SR-13455: autodiff undefined symbol linker error. +// @differentiable(reverse, wrt: self) +// public func callAsFunction(_ input: Input) -> Output { +// let activation = callAsFunction(input) +// return annotated(activation) +// } +// } /// An empty struct representing empty `TangentVector`s for parameterless layers. public struct EmptyTangentVector: EuclideanDifferentiable, VectorProtocol, ElementaryFunctions, @@ -200,12 +188,11 @@ public struct EmptyTangentVector: EuclideanDifferentiable, VectorProtocol, Eleme /// /// The `TangentVector` of parameterless layers is always `EmptyTangentVector`. public protocol ParameterlessLayer: Layer where TangentVector == EmptyTangentVector { - @differentiable - func callAsFunction(_ input: Input) -> Output + @differentiable(reverse) func callAsFunction(_ input: Input) -> Output } extension ParameterlessLayer { - public mutating func move(along direction: EmptyTangentVector) {} + public mutating func move(by direction: EmptyTangentVector) {} public var differentiableVectorView: EmptyTangentVector { EmptyTangentVector() } } @@ -269,7 +256,7 @@ extension Differentiable { /// - l1: The first layer. /// - l2: The second layer. /// - Returns: The final layer's output after sequential application. - @differentiable + @differentiable(reverse) public func sequenced(through l1: L1, _ l2: L2) -> L2.Output where L1.Input == Self, L1.Output == L2.Input { let o1 = l1(self) @@ -284,7 +271,7 @@ extension Differentiable { /// - l2: The second layer. /// - l3: The third layer. /// - Returns: The final layer's output after sequential application. - @differentiable + @differentiable(reverse) public func sequenced(through l1: L1, _ l2: L2, _ l3: L3) -> L3.Output where L1.Input == Self, L1.Output == L2.Input, L2.Output == L3.Input { @@ -302,7 +289,7 @@ extension Differentiable { /// - l3: The third layer. /// - l4: The fourth layer. /// - Returns: The final layer's output after sequential application. 
- @differentiable + @differentiable(reverse) public func sequenced( through l1: L1, _ l2: L2, _ l3: L3, _ l4: L4 ) -> L4.Output @@ -326,7 +313,7 @@ extension Differentiable { /// - l4: The third layer. /// - l5: The fifth layer. /// - Returns: The final layer's output after sequential application. - @differentiable + @differentiable(reverse) public func sequenced( through l1: L1, _ l2: L2, _ l3: L3, _ l4: L4, _ l5: L5 ) -> L5.Output @@ -352,7 +339,7 @@ extension Differentiable { /// - l5: The fifth layer. /// - l6: The sixth layer. /// - Returns: The final layer's output after sequential application. - @differentiable + @differentiable(reverse) public func sequenced( through l1: L1, _ l2: L2, _ l3: L3, _ l4: L4, _ l5: L5, _ l6: L6 ) -> L6.Output diff --git a/Sources/TensorFlow/Layers/Convolutional.swift b/Sources/TensorFlow/Layers/Convolutional.swift index 024533572..3a5cc1331 100644 --- a/Sources/TensorFlow/Layers/Convolutional.swift +++ b/Sources/TensorFlow/Layers/Convolutional.swift @@ -36,7 +36,7 @@ public struct Conv1D: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. - public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates a `Conv1D` layer with the specified filter, bias, activation function, stride, /// dilation and padding. @@ -79,8 +79,8 @@ public struct Conv1D: Layer { /// - Returns: The output of shape [batch size, output width, output channel count]. /// /// - Note: Padding size equals zero when using `.valid`. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let conv = conv1D( input, filter: filter, @@ -149,7 +149,7 @@ public struct Conv2D: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. - public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates a `Conv2D` layer with the specified filter, bias, activation function, strides, /// dilations and padding. @@ -201,8 +201,8 @@ public struct Conv2D: Layer { /// [batch count, output height, output width, output channel count]. /// /// - Note: Padding size equals zero when using `.valid`. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let conv = conv2D( input, filter: filter, @@ -273,7 +273,7 @@ public struct Conv3D: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. - public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates a `Conv3D` layer with the specified filter, bias, activation function, strides, and /// padding. @@ -332,8 +332,8 @@ public struct Conv3D: Layer { /// [batch count, output depth, output height, output width, output channel count]. /// /// - Note: Padding size equals zero when using `.valid`. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let conv = conv3D( input, filter: filter, @@ -405,7 +405,7 @@ public struct TransposedConv1D: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. 
- public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates a `TransposedConv1D` layer with the specified filter, bias, /// activation function, strides, and padding. @@ -436,8 +436,8 @@ public struct TransposedConv1D: Layer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let batchSize = input.shape[0] let w = (input.shape[1] - (1 * paddingIndex)) * stride + (filter.shape[0] * paddingIndex) let c = filter.shape[2] @@ -506,7 +506,7 @@ public struct TransposedConv2D: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. - public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates a `TransposedConv2D` layer with the specified filter, bias, /// activation function, strides, and padding. @@ -538,8 +538,8 @@ public struct TransposedConv2D: Layer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let batchSize = input.shape[0] let h = (input.shape[1] - (1 * paddingIndex)) * strides.0 + (filter.shape[0] * paddingIndex) let w = (input.shape[2] - (1 * paddingIndex)) * strides.1 + (filter.shape[1] * paddingIndex) @@ -610,7 +610,7 @@ public struct TransposedConv3D: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. - public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates a `TransposedConv3D` layer with the specified filter, bias, /// activation function, strides, and padding. @@ -641,8 +641,8 @@ public struct TransposedConv3D: Layer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let batchSize = input.shape[0] let w = (input.shape[1] - (1 * paddingIndex)) * strides.0 + (filter.shape[0] * paddingIndex) let h = (input.shape[2] - (1 * paddingIndex)) * strides.1 + (filter.shape[1] * paddingIndex) @@ -714,7 +714,7 @@ public struct DepthwiseConv2D: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. - public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates a `DepthwiseConv2D` layer with the specified filter, bias, activation function, /// strides, and padding. @@ -749,8 +749,8 @@ public struct DepthwiseConv2D: Layer { /// [batch count, input height, input width, input channel count] /// - Returns: The output of shape, /// [batch count, output height, output width, input channel count * channel multiplier] - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let conv = depthwiseConv2D( input, filter: filter, @@ -823,8 +823,8 @@ public struct ZeroPadding1D: ParameterlessLayer /// /// - Parameter input: The input to the layer. /// - Returns: The output. 
- @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { input.padded(forSizes: [(0, 0), padding, (0, 0)]) } } @@ -857,8 +857,8 @@ public struct ZeroPadding2D: ParameterlessLayer /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { input.padded(forSizes: [(0, 0), padding.0, padding.1, (0, 0)]) } } @@ -891,8 +891,8 @@ public struct ZeroPadding3D: ParameterlessLayer /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { input.padded(forSizes: [(0, 0), padding.0, padding.1, padding.2, (0, 0)]) } } @@ -922,7 +922,7 @@ public struct SeparableConv1D: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. - public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates a `SeparableConv1D` layer with the specified depthwise and pointwise filter, /// bias, activation function, strides, and padding. @@ -960,8 +960,8 @@ public struct SeparableConv1D: Layer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let depthwise = depthwiseConv2D( input.expandingShape(at: 1), filter: depthwiseFilter.expandingShape(at: 1), @@ -1044,7 +1044,7 @@ public struct SeparableConv2D: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. - public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates a `SeparableConv2D` layer with the specified depthwise and pointwise filter, /// bias, activation function, strides, and padding. @@ -1082,8 +1082,8 @@ public struct SeparableConv2D: Layer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let depthwise = depthwiseConv2D( input, filter: depthwiseFilter, diff --git a/Sources/TensorFlow/Layers/Core.swift b/Sources/TensorFlow/Layers/Core.swift index 6bedff653..43552ffed 100644 --- a/Sources/TensorFlow/Layers/Core.swift +++ b/Sources/TensorFlow/Layers/Core.swift @@ -28,8 +28,8 @@ public struct Flatten: ParameterlessLayer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let batchSize = input.shape[0] let remaining = input.shape[1..: ParameterlessLayer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. 
- @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { return input.reshaped(toShape: shape) } } @@ -75,7 +75,7 @@ public struct Reshape: ParameterlessLayer { /// A layer that encloses a custom differentiable function. public struct Function: ParameterlessLayer { public typealias TangentVector = EmptyTangentVector - public typealias Body = @differentiable (Input) -> Output + public typealias Body = @differentiable(reverse) (Input) -> Output @noDerivative public let body: Body @@ -83,7 +83,7 @@ public struct Function: Parameter self.body = body } - @differentiable + @differentiable(reverse) public func callAsFunction(_ input: Input) -> Output { body(input) } diff --git a/Sources/TensorFlow/Layers/Dense.swift b/Sources/TensorFlow/Layers/Dense.swift index c91bbde86..0d50f28c7 100644 --- a/Sources/TensorFlow/Layers/Dense.swift +++ b/Sources/TensorFlow/Layers/Dense.swift @@ -38,14 +38,14 @@ public struct Dense: Layer { @noDerivative private let useBias: Bool /// The element-wise activation function type. - public typealias Activation = @differentiable (Tensor) -> Tensor + public typealias Activation = @differentiable(reverse) (Tensor) -> Tensor /// Creates an instance from the given weight, optional bias, and activation function. /// /// - Note: currently, `weight` is the only differentiability parameter. `bias` can be made a /// differentiability parameter after `Optional` conditionally conforms to `Differentiable`: /// TF-499. - @differentiable(wrt: weight) + @differentiable(reverse, wrt: weight) public init( weight: Tensor, bias: Tensor? = nil, @@ -77,8 +77,8 @@ public struct Dense: Layer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { if batched { let hidden = matmul(input.expandingShape(at: 1), weight).squeezingShape(at: 1) return activation(useBias ? hidden + bias : hidden) diff --git a/Sources/TensorFlow/Layers/Dropout.swift b/Sources/TensorFlow/Layers/Dropout.swift index a4d5f488f..b6047e17d 100644 --- a/Sources/TensorFlow/Layers/Dropout.swift +++ b/Sources/TensorFlow/Layers/Dropout.swift @@ -20,7 +20,7 @@ import _Differentiation extension Tensor where Scalar: TensorFlowFloatingPoint { /// Computes dropout given a probability. - @differentiable(wrt: self where Scalar: Differentiable) + @differentiable(reverse, wrt: self where Scalar: Differentiable) fileprivate func droppingOut(probability: Double) -> Tensor { let noise = Tensor(randomUniform: shape, on: device) let keepMask = noise .>= Scalar(probability) @@ -54,8 +54,8 @@ public struct Dropout: ParameterlessLayer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. 
- @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { switch Context.local.learningPhase { case .training: return input.droppingOut(probability: probability) @@ -81,8 +81,8 @@ public struct GaussianNoise: ParameterlessLayer } /// Returns a tensor obtained by adding noise to `input` - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { switch Context.local.learningPhase { case .training: let noise = Tensor( @@ -118,8 +118,8 @@ public struct GaussianDropout: ParameterlessLay } /// Applies multiplicative 1-centered Gaussian noise to the input during training only. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { switch Context.local.learningPhase { case .training: let noise = Tensor( @@ -158,8 +158,8 @@ public struct AlphaDropout: ParameterlessLayer } /// Adds noise to `input` during training, and is a no-op during inference. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { switch Context.local.learningPhase { case .training: let alpha = 1.6732632423543772848170429916717 diff --git a/Sources/TensorFlow/Layers/Embedding.swift b/Sources/TensorFlow/Layers/Embedding.swift index 5c68f7c5c..95e6fdac3 100644 --- a/Sources/TensorFlow/Layers/Embedding.swift +++ b/Sources/TensorFlow/Layers/Embedding.swift @@ -54,8 +54,8 @@ public struct Embedding: Module { /// - Parameter /// - input: The indices that will be mapped to their vector representations. /// - Returns: The tensor created by replacing input indices with their vector representations. - @differentiable(wrt: self) - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse, wrt: self) + public func callAsFunction(_ input: Tensor) -> Tensor { embeddings.gathering(atIndices: input) } } diff --git a/Sources/TensorFlow/Layers/Morphological.swift b/Sources/TensorFlow/Layers/Morphological.swift index e0e06cce7..106781cab 100644 --- a/Sources/TensorFlow/Layers/Morphological.swift +++ b/Sources/TensorFlow/Layers/Morphological.swift @@ -69,8 +69,8 @@ public struct `Dilation2D`: Layer { /// [batch count, output height, output width, output channel count]. /// /// - Note: Padding size equals zero when using `.valid`. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let dilated = dilation2D( input, filter: filter, @@ -139,8 +139,8 @@ public struct `Erosion2D`: Layer { /// [batch count, output height, output width, output channel count]. /// /// - Note: Padding size equals zero when using `.valid`. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let eroded = erosion2D( input, filter: filter, diff --git a/Sources/TensorFlow/Layers/Normalization.swift b/Sources/TensorFlow/Layers/Normalization.swift index 79af7667f..e46e0ed22 100644 --- a/Sources/TensorFlow/Layers/Normalization.swift +++ b/Sources/TensorFlow/Layers/Normalization.swift @@ -23,7 +23,7 @@ import _Differentiation /// - offset: The tensor to be added to normalized tensor. /// - scale: The tensor to be applied to normalized tensor. 
/// - varianceEpsilon: The small number to avoid dividing by 0. -@differentiable(wrt: (input, mean, variance, offset, scale)) +@differentiable(reverse, wrt: (input, mean, variance, offset, scale)) private func normalize( _ input: Tensor, mean: Tensor, @@ -98,20 +98,26 @@ public struct BatchNorm: Layer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { - let positiveAxis = (input.rank + axis) % input.rank + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { + let inputRank = input.rank + let positiveAxis = (inputRank + axis) % inputRank precondition( input.shape[positiveAxis] == offset.shape[0], "The number of features of the input and the offset doesn't match.") - var offset = self.offset - var scale = self.scale - if positiveAxis != input.rank - 1 { - var broadcastShape = TensorShape([Int](repeating: 1, count: input.rank)) - broadcastShape[positiveAxis] = input.shape[positiveAxis] - offset = offset.reshaped(to: broadcastShape) - scale = scale.reshaped(to: broadcastShape) - } +// var (offset, scale) = {x in (x.offset, x.scale) }(self) +// if positiveAxis != input.rank - 1 { +// var broadcastShape = TensorShape([Int](repeating: 1, count: input.rank)) +// broadcastShape[positiveAxis] = input.shape[positiveAxis] +// offset = offset.reshaped(to: broadcastShape) +// scale = scale.reshaped(to: broadcastShape) +// } + let offsetOriginal = self.offset + let scaleOriginal = self.scale + let (offset, scale) = Self._sr13263workaround(offset: offsetOriginal, + scale: scaleOriginal, + input: input, + positiveAxis: positiveAxis) switch Context.local.learningPhase { case .training: return doTraining(input, offset: offset, scale: scale, axis: positiveAxis) @@ -119,6 +125,23 @@ public struct BatchNorm: Layer { return doInference(input, offset: offset, scale: scale) } } + + @inline(never) + @differentiable(reverse) // if the function is `public` or `internal`, the compiler crashes + private static func _sr13263workaround( + offset: Tensor, + scale: Tensor, + input: Tensor, + positiveAxis: Int + ) -> (Tensor, Tensor) { + if positiveAxis != input.rank - 1 { + var broadcastShape = TensorShape([Int](repeating: 1, count: input.rank)) + broadcastShape[positiveAxis] = input.shape[positiveAxis] + return (offset.reshaped(to: broadcastShape), scale.reshaped(to: broadcastShape)) + } else { + return (offset, scale) + } + } private func doTraining( _ input: Tensor, offset: Tensor, scale: Tensor, axis: Int @@ -240,8 +263,8 @@ public struct LayerNorm: Layer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { // Note: `withoutDerivative(at:)` is currently needed in the following to prevent the resulting // tensor for `epsilon` from being scalarized on the backwards pass, breaking X10 traces. let epsilon = withoutDerivative(at: input) { Tensor(self.epsilon, deviceAndPrecisionLike: $0) } @@ -341,8 +364,8 @@ public struct GroupNorm: Layer { /// - Returns: The output. /// - Precondition: The axis cannot be batch axis. /// - Precondition: The numbers of features of the input and the offset must be same. 
- @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let positiveAxis = (input.rank + axis) % input.rank precondition(positiveAxis != 0, "The axis cannot be batch axis.") precondition( @@ -447,8 +470,8 @@ public struct InstanceNorm: Layer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { delegate(input) } } diff --git a/Sources/TensorFlow/Layers/Pooling.swift b/Sources/TensorFlow/Layers/Pooling.swift index 6a5c66c06..563534c10 100644 --- a/Sources/TensorFlow/Layers/Pooling.swift +++ b/Sources/TensorFlow/Layers/Pooling.swift @@ -44,8 +44,8 @@ public struct MaxPool1D: ParameterlessLayer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { maxPool2D( input.expandingShape(at: 1), filterSize: (1, 1, poolSize, 1), @@ -85,8 +85,8 @@ public struct MaxPool2D: ParameterlessLayer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { maxPool2D(input, filterSize: poolSize, strides: strides, padding: padding) } } @@ -142,8 +142,8 @@ public struct MaxPool3D: ParameterlessLayer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { maxPool3D(input, filterSize: poolSize, strides: strides, padding: padding) } } @@ -204,8 +204,8 @@ public struct AvgPool1D: ParameterlessLayer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { avgPool2D( input.expandingShape(at: 1), filterSize: (1, 1, poolSize, 1), @@ -245,8 +245,8 @@ public struct AvgPool2D: ParameterlessLayer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { avgPool2D(input, filterSize: poolSize, strides: strides, padding: padding) } } @@ -302,8 +302,8 @@ public struct AvgPool3D: ParameterlessLayer { /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { avgPool3D(input, filterSize: poolSize, strides: strides, padding: padding) } } @@ -346,8 +346,8 @@ public struct GlobalAvgPool1D: ParameterlessLay /// /// - Parameter input: The input to the layer. /// - Returns: The output. 
- @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { precondition(input.rank == 3, "The rank of the input must be 3.") return input.mean(squeezingAxes: 1) } @@ -365,8 +365,8 @@ public struct GlobalAvgPool2D: ParameterlessLay /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { precondition(input.rank == 4, "The rank of the input must be 4.") return input.mean(squeezingAxes: [1, 2]) } @@ -384,8 +384,8 @@ public struct GlobalAvgPool3D: ParameterlessLay /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { precondition(input.rank == 5, "The rank of the input must be 5.") return input.mean(squeezingAxes: [1, 2, 3]) } @@ -406,8 +406,8 @@ public struct GlobalMaxPool1D: ParameterlessLay /// - context: The contextual information for the layer application, e.g. the current learning /// phase. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { precondition(input.rank == 3, "The rank of the input must be 3.") return input.max(squeezingAxes: 1) } @@ -425,8 +425,8 @@ public struct GlobalMaxPool2D: ParameterlessLay /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { precondition(input.rank == 4, "The rank of the input must be 4.") return input.max(squeezingAxes: [1, 2]) } @@ -444,8 +444,8 @@ public struct GlobalMaxPool3D: ParameterlessLay /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { precondition(input.rank == 5, "The rank of the input must be 5.") return input.max(squeezingAxes: [1, 2, 3]) } @@ -494,8 +494,8 @@ public struct FractionalMaxPool2D: Parameterles /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { fractionalMaxPool2D( input, poolingRatio: poolingRatio, diff --git a/Sources/TensorFlow/Layers/Recurrent.swift b/Sources/TensorFlow/Layers/Recurrent.swift index 1fd0c30bf..9a07c4120 100644 --- a/Sources/TensorFlow/Layers/Recurrent.swift +++ b/Sources/TensorFlow/Layers/Recurrent.swift @@ -24,7 +24,7 @@ public struct RNNCellInput: Differ /// The previous state. public var state: State - @differentiable + @differentiable(reverse) public init(input: Input, state: State) { self.input = input self.state = state @@ -41,7 +41,7 @@ public struct RNNCellOutput: Diff /// The current state. public var state: State - @differentiable + @differentiable(reverse) public init(output: Output, state: State) { self.output = output self.state = state @@ -76,7 +76,7 @@ extension RecurrentLayerCell { /// - timeStepInput: The input at the current time step. 
/// - previousState: The previous state of the recurrent layer cell. /// - Returns: The output. - @differentiable + @differentiable(reverse) public func callAsFunction( input: TimeStepInput, state: State @@ -84,7 +84,7 @@ extension RecurrentLayerCell { self(RNNCellInput(input: input, state: state)) } - @differentiable + @differentiable(reverse) public func call(input: TimeStepInput, state: State) -> RNNCellOutput { self(RNNCellInput(input: input, state: state)) } @@ -122,7 +122,7 @@ public struct BasicRNNCell: RecurrentLayerCell /// /// - Parameter input: The input to the layer. /// - Returns: The hidden state. - @differentiable + @differentiable(reverse) public func callAsFunction(_ input: Input) -> Output { let concatenatedInput = input.input.concatenated(with: input.state, alongAxis: 1) let newState = tanh(matmul(concatenatedInput, weight) + bias) @@ -202,14 +202,14 @@ public struct LSTMCell: RecurrentLayerCell { public var cell: Tensor public var hidden: Tensor - @differentiable + @differentiable(reverse) public init(cell: Tensor, hidden: Tensor) { self.cell = cell self.hidden = hidden } /// Concatenates two values. - @differentiable + @differentiable(reverse) public static func concatenate(_ lhs: Self, _ rhs: Self) -> Self { // TODO(TF-1005): Remove workaround for differenting concatenated. let concatCell = lhs.cell.concatenated(with: rhs.cell, alongAxis: -1) @@ -224,25 +224,25 @@ public struct LSTMCell: RecurrentLayerCell { } /// Adds two values and produces their sum. - @differentiable + @differentiable(reverse) public static func sum(_ lhs: Self, _ rhs: Self) -> Self { Self(cell: lhs.cell + rhs.cell, hidden: lhs.hidden + rhs.hidden) } /// Averages two values. - @differentiable + @differentiable(reverse) public static func average(_ lhs: Self, _ rhs: Self) -> Self { Self(cell: (lhs.cell + rhs.cell) / 2, hidden: (lhs.hidden + rhs.hidden) / 2) } /// Multiplies two values. - @differentiable + @differentiable(reverse) public static func multiply(_ lhs: Self, _ rhs: Self) -> Self { Self(cell: lhs.cell * rhs.cell, hidden: lhs.hidden * rhs.hidden) } /// Stack two values. - @differentiable + @differentiable(reverse) public static func stack(_ lhs: Self, _ rhs: Self) -> Self { // TODO(TF-1005): Remove workaround for differenting stacking. let stackCell = Tensor(stacking: [lhs.cell, rhs.cell]) @@ -269,7 +269,7 @@ public struct LSTMCell: RecurrentLayerCell { /// /// - Parameter input: The input to the layer. /// - Returns: The hidden state. - @differentiable + @differentiable(reverse) public func callAsFunction(_ input: Input) -> Output { let gateInput = input.input.concatenated(with: input.state.hidden, alongAxis: 1) @@ -344,7 +344,7 @@ public struct GRUCell: RecurrentLayerCell { /// /// - Parameter input: The input to the layer. /// - Returns: The hidden state. 
- @differentiable + @differentiable(reverse) public func callAsFunction(_ input: Input) -> Output { let updateGate = sigmoid( (matmul(input.input, updateKernel) + updateBias) @@ -377,7 +377,7 @@ public struct RecurrentLayer: Layer { self.cell = cell() } - @differentiable(wrt: (self, inputs, initialState)) + @differentiable(reverse, wrt: (self, inputs, initialState)) public func callAsFunction( _ inputs: [Cell.TimeStepInput], initialState: Cell.State @@ -393,7 +393,7 @@ public struct RecurrentLayer: Layer { return timeStepOutputs } - @differentiable(wrt: (self, inputs, initialState)) + @differentiable(reverse, wrt: (self, inputs, initialState)) public func call( _ inputs: [Cell.TimeStepInput], initialState: Cell.State @@ -445,13 +445,13 @@ public struct RecurrentLayer: Layer { ) } - @differentiable + @differentiable(reverse) public func callAsFunction(_ inputs: [Cell.TimeStepInput]) -> [Cell.TimeStepOutput] { let initialState = withoutDerivative(at: cell.zeroState(for: inputs[0])) return self(inputs, initialState: initialState) } - @differentiable(wrt: (self, inputs, initialState)) + @differentiable(reverse, wrt: (self, inputs, initialState)) public func lastOutput( from inputs: [Cell.TimeStepInput], initialState: Cell.State @@ -460,7 +460,7 @@ public struct RecurrentLayer: Layer { return self(inputs, initialState: initialState)[withoutDerivative(at: inputs.count - 1)] } - @differentiable(wrt: (self, inputs)) + @differentiable(reverse, wrt: (self, inputs)) public func lastOutput(from inputs: [Cell.TimeStepInput]) -> Cell.TimeStepOutput { precondition(!inputs.isEmpty, "'inputs' must be non-empty.") let initialState = withoutDerivative(at: cell.zeroState(for: inputs[0])) @@ -473,32 +473,32 @@ public struct RecurrentLayer: Layer { /// Used by `BidirectionalRecurrentLayer` as a generic requirement for merge functions. public protocol Mergeable: Differentiable, AdditiveArithmetic { /// Concatenates two values. - @differentiable + @differentiable(reverse) static func concatenate(_ lhs: Self, _ rhs: Self) -> Self /// Adds two values and produces their sum. /// /// - Note: renaming `sum` to `+` results in a compiler crash when conforming `Tensor` to /// `Mergeable` (SR-13229). - @differentiable + @differentiable(reverse) static func sum(_ lhs: Self, _ rhs: Self) -> Self /// Averages two values. - @differentiable + @differentiable(reverse) static func average(_ lhs: Self, _ rhs: Self) -> Self /// Multiplies two values. - @differentiable + @differentiable(reverse) static func multiply(_ lhs: Self, _ rhs: Self) -> Self /// Stack two values. - @differentiable + @differentiable(reverse) static func stack(_ lhs: Self, _ rhs: Self) -> Self } extension Tensor: Mergeable where Scalar: TensorFlowFloatingPoint { /// Concatenates two tensors along last axis. - @differentiable + @differentiable(reverse) public static func concatenate(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { // TODO(TF-1005): Remove workaround for differenting concatenated. let concat = lhs.concatenated(with: rhs, alongAxis: -1) @@ -508,25 +508,25 @@ extension Tensor: Mergeable where Scalar: TensorFlowFloatingPoint { } /// Adds two values and produces their sum. - @differentiable + @differentiable(reverse) public static func sum(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { lhs + rhs } /// Averages two values. - @differentiable + @differentiable(reverse) public static func average(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { (lhs + rhs) / 2 } /// Multiplies two values. 
- @differentiable + @differentiable(reverse) public static func multiply(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { lhs * rhs } /// Stack two values. - @differentiable + @differentiable(reverse) public static func stack(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { // TODO(TF-1005): Remove workaround for differenting stacking. let stack = Tensor(stacking: [lhs, rhs]) @@ -537,7 +537,7 @@ extension Tensor: Mergeable where Scalar: TensorFlowFloatingPoint { } /// Concatenates two values. -@differentiable +@differentiable(reverse) public func concatenate( _ first: T, _ second: T @@ -546,7 +546,7 @@ public func concatenate( } /// Adds two values and produces their sum. -@differentiable +@differentiable(reverse) public func sum( _ first: T, _ second: T @@ -555,7 +555,7 @@ public func sum( } /// Averages two values. -@differentiable +@differentiable(reverse) public func average( _ first: T, _ second: T @@ -564,7 +564,7 @@ public func average( } /// Multiplies two values. -@differentiable +@differentiable(reverse) public func multiply( _ first: T, _ second: T @@ -573,7 +573,7 @@ public func multiply( } /// Stack two values. -@differentiable +@differentiable(reverse) public func stack( _ first: T, _ second: T @@ -585,7 +585,7 @@ public struct BidirectionalRecurrentLayer: Layer where Cell.TimeStepOutput: Mergeable { public typealias Input = [Cell.TimeStepInput] public typealias Output = [Cell.TimeStepOutput] - public typealias MergeFunction = @differentiable (Cell.TimeStepOutput, Cell.TimeStepOutput) -> Cell.TimeStepOutput + public typealias MergeFunction = @differentiable(reverse) (Cell.TimeStepOutput, Cell.TimeStepOutput) -> Cell.TimeStepOutput /// A wrapper around a `@differentiable` merge function. /// @@ -615,7 +615,7 @@ where Cell.TimeStepOutput: Mergeable { _mergeFunction = .init(mergeFunction) } - @differentiable + @differentiable(reverse) public func callAsFunction( _ inputs: Input, initialForwardLayerState: Cell.State, @@ -629,7 +629,7 @@ where Cell.TimeStepOutput: Mergeable { backwardOutputs.differentiableReversed(), mergeFunction: mergeFunction) } - @differentiable + @differentiable(reverse) public func callAsFunction(_ inputs: Input) -> Output { precondition(!inputs.isEmpty, "'inputs' must be non-empty.") let initialForwardLayerState = withoutDerivative( @@ -643,7 +643,7 @@ where Cell.TimeStepOutput: Mergeable { ) } - @differentiable + @differentiable(reverse) public func lastOutput( from inputs: Input, initialForwardLayerState: Cell.State, @@ -657,7 +657,7 @@ where Cell.TimeStepOutput: Mergeable { )[withoutDerivative(at: inputs.count - 1)] } - @differentiable + @differentiable(reverse) public func lastOutput(from inputs: Input) -> Cell.TimeStepOutput { precondition(!inputs.isEmpty, "'inputs' must be non-empty.") return self(inputs)[withoutDerivative(at: inputs.count - 1)] @@ -695,7 +695,7 @@ fileprivate extension Array where Element: Differentiable { /// /// This has a custom derivative, which works around the SR-13945 segfault that you would /// encounter if you tried to implement this at the callsite using a for loop. - @differentiable + @differentiable(reverse) func differentiableReversed() -> Self { .init(self.reversed()) } @@ -711,19 +711,19 @@ fileprivate extension Array where Element: Differentiable { /// /// This has a custom derivative, which works around the SR-13945 segfault that you would /// encounter if you tried to implement this at the callsite using a for loop. 
- @differentiable + @differentiable(reverse) func differentiableMerging( - _ other: Self, mergeFunction: @differentiable (Element, Element) -> Element + _ other: Self, mergeFunction: @differentiable(reverse) (Element, Element) -> Element ) -> Self { zip(self, other).map { mergeFunction($0.0, $0.1) } } @derivative(of: differentiableMerging) func vjpDifferentiableMerging( - _ other: Self, mergeFunction: @differentiable (Element, Element) -> Element + _ other: Self, mergeFunction: @differentiable(reverse) (Element, Element) -> Element ) -> (value: Self, pullback: (TangentVector) -> (TangentVector, TangentVector)) { let valuesWithPullbacks = zip(self, other).map { - valueWithPullback(at: $0.0, $0.1, in: mergeFunction) + valueWithPullback(at: $0.0, $0.1, of: mergeFunction) } let pullbacks = valuesWithPullbacks.map { $0.pullback } return ( diff --git a/Sources/TensorFlow/Layers/Sequential.swift b/Sources/TensorFlow/Layers/Sequential.swift index 3631ec405..951c2c088 100644 --- a/Sources/TensorFlow/Layers/Sequential.swift +++ b/Sources/TensorFlow/Layers/Sequential.swift @@ -47,8 +47,7 @@ import _Differentiation /// ```` public struct Sequential: Module where - Layer1.Output == Layer2.Input, - Layer1.TangentVector.VectorSpaceScalar == Layer2.TangentVector.VectorSpaceScalar + Layer1.Output == Layer2.Input { public var layer1: Layer1 public var layer2: Layer2 @@ -58,7 +57,7 @@ where self.layer2 = layer2 } - @differentiable(wrt: self) + @differentiable(reverse, wrt: self) public func callAsFunction(_ input: Layer1.Input) -> Layer2.Output { layer2(layer1(input)) } @@ -69,7 +68,7 @@ where } extension Sequential: Layer where Layer1: Layer { - @differentiable + @differentiable(reverse) public func callAsFunction(_ input: Layer1.Input) -> Layer2.Output { layer2(layer1(input)) } @@ -78,42 +77,28 @@ extension Sequential: Layer where Layer1: Layer { /// A layer that sequentially composes 3 layers. public typealias Sequential3 = Sequential> where - L1.Output == L2.Input, L2.Output == L3.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar + L1.Output == L2.Input, L2.Output == L3.Input /// A layer that sequentially composes 4 layers. public typealias Sequential4 = Sequential< L1, Sequential> > where - L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar + L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input /// A layer that sequentially composes 5 layers. public typealias Sequential5 = Sequential< L1, Sequential>> > where - L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input, L4.Output == L5.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar + L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input, L4.Output == L5.Input /// A layer that sequentially composes 6 layers. 
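// A minimal sketch of the custom-derivative pattern used by differentiableMerging
// above: a @differentiable(reverse) function paired with a VJP registered via
// @derivative(of:). The function below is illustrative, not part of the patch.
import _Differentiation

@differentiable(reverse)
func triple(_ x: Float) -> Float { 3 * x }

@derivative(of: triple)
func vjpTriple(_ x: Float) -> (value: Float, pullback: (Float) -> Float) {
  (value: 3 * x, pullback: { v in 3 * v })
}

let slope = gradient(at: Float(2), of: triple)  // 3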
public typealias Sequential6 = Sequential>>>> where L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input, L4.Output == L5.Input, - L5.Output == L6.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar + L5.Output == L6.Input /// A layer that sequentially composes 7 layers. public typealias Sequential7< @@ -123,13 +108,7 @@ public typealias Sequential7< > where L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input, L4.Output == L5.Input, - L5.Output == L6.Input, L6.Output == L7.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar, - L6.TangentVector.VectorSpaceScalar == L7.TangentVector.VectorSpaceScalar + L5.Output == L6.Input, L6.Output == L7.Input /// A layer that sequentially composes 8 layers. public typealias Sequential8< @@ -140,14 +119,7 @@ public typealias Sequential8< > where L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input, L4.Output == L5.Input, - L5.Output == L6.Input, L6.Output == L7.Input, L7.Output == L8.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar, - L6.TangentVector.VectorSpaceScalar == L7.TangentVector.VectorSpaceScalar, - L7.TangentVector.VectorSpaceScalar == L8.TangentVector.VectorSpaceScalar + L5.Output == L6.Input, L6.Output == L7.Input, L7.Output == L8.Input /// A layer that sequentially composes 9 layers. public typealias Sequential9< @@ -163,15 +135,7 @@ public typealias Sequential9< > where L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input, L4.Output == L5.Input, - L5.Output == L6.Input, L6.Output == L7.Input, L7.Output == L8.Input, L8.Output == L9.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar, - L6.TangentVector.VectorSpaceScalar == L7.TangentVector.VectorSpaceScalar, - L7.TangentVector.VectorSpaceScalar == L8.TangentVector.VectorSpaceScalar, - L8.TangentVector.VectorSpaceScalar == L9.TangentVector.VectorSpaceScalar + L5.Output == L6.Input, L6.Output == L7.Input, L7.Output == L8.Input, L8.Output == L9.Input /// A layer that sequentially composes 10 layers. 
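// A minimal sketch, assuming this branch plus Sequential's result-builder
// initializer and a two-layer LayerBuilder overload alongside the 3...10-layer
// overloads below: with the VectorSpaceScalar constraints removed, composition
// only requires each Output to match the next Input.
import _Differentiation
import TensorFlow

let model = Sequential {
  Dense<Float>(inputSize: 4, outputSize: 8, activation: relu)
  Dense<Float>(inputSize: 8, outputSize: 2)
}

let input = Tensor<Float>(randomNormal: [1, 4])
let (output, modelPullback) = valueWithPullback(at: model) { m in m(input) }
// `modelPullback` maps an output cotangent to a TangentVector over both Dense layers.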
public typealias Sequential10< @@ -192,16 +156,7 @@ public typealias Sequential10< where L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input, L4.Output == L5.Input, L5.Output == L6.Input, L6.Output == L7.Input, L7.Output == L8.Input, L8.Output == L9.Input, - L9.Output == L10.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar, - L6.TangentVector.VectorSpaceScalar == L7.TangentVector.VectorSpaceScalar, - L7.TangentVector.VectorSpaceScalar == L8.TangentVector.VectorSpaceScalar, - L8.TangentVector.VectorSpaceScalar == L9.TangentVector.VectorSpaceScalar, - L9.TangentVector.VectorSpaceScalar == L10.TangentVector.VectorSpaceScalar + L9.Output == L10.Input @resultBuilder public struct LayerBuilder { @@ -218,9 +173,7 @@ public struct LayerBuilder { -> Sequential> where L1.Output == L2.Input, - L2.Output == L3.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar + L2.Output == L3.Input { Sequential(l1, Sequential(l2, l3)) } @@ -235,10 +188,7 @@ public struct LayerBuilder { where L1.Output == L2.Input, L2.Output == L3.Input, - L3.Output == L4.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar + L3.Output == L4.Input { Sequential(l1, Sequential(l2, Sequential(l3, l4))) } @@ -255,11 +205,7 @@ public struct LayerBuilder { L1.Output == L2.Input, L2.Output == L3.Input, L3.Output == L4.Input, - L4.Output == L5.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar + L4.Output == L5.Input { Sequential(l1, Sequential(l2, Sequential(l3, Sequential(l4, l5)))) } @@ -278,12 +224,7 @@ public struct LayerBuilder { L2.Output == L3.Input, L3.Output == L4.Input, L4.Output == L5.Input, - L5.Output == L6.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar + L5.Output == L6.Input { Sequential(l1, Sequential(l2, Sequential(l3, Sequential(l4, Sequential(l5, l6))))) } @@ -306,13 +247,7 @@ public struct LayerBuilder { L3.Output == L4.Input, L4.Output == L5.Input, L5.Output == L6.Input, - L6.Output == L7.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar, - L6.TangentVector.VectorSpaceScalar 
== L7.TangentVector.VectorSpaceScalar + L6.Output == L7.Input { Sequential( l1, Sequential(l2, Sequential(l3, Sequential(l4, Sequential(l5, Sequential(l6, l7)))))) @@ -341,14 +276,7 @@ public struct LayerBuilder { L4.Output == L5.Input, L5.Output == L6.Input, L6.Output == L7.Input, - L7.Output == L8.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar, - L6.TangentVector.VectorSpaceScalar == L7.TangentVector.VectorSpaceScalar, - L7.TangentVector.VectorSpaceScalar == L8.TangentVector.VectorSpaceScalar + L7.Output == L8.Input { Sequential( l1, @@ -384,15 +312,7 @@ public struct LayerBuilder { L5.Output == L6.Input, L6.Output == L7.Input, L7.Output == L8.Input, - L8.Output == L9.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar, - L6.TangentVector.VectorSpaceScalar == L7.TangentVector.VectorSpaceScalar, - L7.TangentVector.VectorSpaceScalar == L8.TangentVector.VectorSpaceScalar, - L8.TangentVector.VectorSpaceScalar == L9.TangentVector.VectorSpaceScalar + L8.Output == L9.Input { Sequential( l1, @@ -438,16 +358,7 @@ public struct LayerBuilder { L6.Output == L7.Input, L7.Output == L8.Input, L8.Output == L9.Input, - L9.Output == L10.Input, - L1.TangentVector.VectorSpaceScalar == L2.TangentVector.VectorSpaceScalar, - L2.TangentVector.VectorSpaceScalar == L3.TangentVector.VectorSpaceScalar, - L3.TangentVector.VectorSpaceScalar == L4.TangentVector.VectorSpaceScalar, - L4.TangentVector.VectorSpaceScalar == L5.TangentVector.VectorSpaceScalar, - L5.TangentVector.VectorSpaceScalar == L6.TangentVector.VectorSpaceScalar, - L6.TangentVector.VectorSpaceScalar == L7.TangentVector.VectorSpaceScalar, - L7.TangentVector.VectorSpaceScalar == L8.TangentVector.VectorSpaceScalar, - L8.TangentVector.VectorSpaceScalar == L9.TangentVector.VectorSpaceScalar, - L9.TangentVector.VectorSpaceScalar == L10.TangentVector.VectorSpaceScalar + L9.Output == L10.Input { Sequential( l1, diff --git a/Sources/TensorFlow/Layers/Sequential.swift.gyb b/Sources/TensorFlow/Layers/Sequential.swift.gyb index b1bf45afe..cf38485a3 100644 --- a/Sources/TensorFlow/Layers/Sequential.swift.gyb +++ b/Sources/TensorFlow/Layers/Sequential.swift.gyb @@ -56,7 +56,7 @@ public struct Sequential: Module self.layer2 = layer2 } - @differentiable(wrt: self) + @differentiable(reverse, wrt: self) public func callAsFunction(_ input: Layer1.Input) -> Layer2.Output { layer2(layer1(input)) } @@ -67,7 +67,7 @@ public struct Sequential: Module } extension Sequential: Layer where Layer1: Layer { - @differentiable + @differentiable(reverse) public func callAsFunction(_ input: Layer1.Input) -> Layer2.Output { layer2(layer1(input)) } diff --git a/Sources/TensorFlow/Layers/Upsampling.swift b/Sources/TensorFlow/Layers/Upsampling.swift index 0fb4a2fed..8e4eb0d2f 100644 --- a/Sources/TensorFlow/Layers/Upsampling.swift +++ b/Sources/TensorFlow/Layers/Upsampling.swift @@ -32,8 +32,8 @@ public struct UpSampling1D: 
ParameterlessLayer /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let shape = input.shape let (batchSize, timesteps, channels) = (shape[0], shape[1], shape[2]) let scaleOnes = Tensor(ones: [1, 1, size, 1], on: input.device) @@ -60,8 +60,8 @@ public struct UpSampling2D: ParameterlessLayer /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { let device = input.device let shape = input.shape let (batchSize, height, width, channels) = (shape[0], shape[1], shape[2], shape[3]) @@ -88,7 +88,7 @@ public struct UpSampling3D: ParameterlessLayer /// Repeats the elements of a tensor along an axis, like `np.repeat`. /// Function adapted from `def repeat_elements`: /// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/backend.py - @differentiable + @differentiable(reverse) private func repeatingElements( _ input: Tensor, alongAxis axis: Int, count: Int ) -> Tensor { @@ -123,8 +123,8 @@ public struct UpSampling3D: ParameterlessLayer /// /// - Parameter input: The input to the layer. /// - Returns: The output. - @differentiable - public func forward(_ input: Tensor) -> Tensor { + @differentiable(reverse) + public func callAsFunction(_ input: Tensor) -> Tensor { var result = repeatingElements(input, alongAxis: 1, count: size) result = repeatingElements(result, alongAxis: 2, count: size) result = repeatingElements(result, alongAxis: 3, count: size) diff --git a/Sources/TensorFlow/Loss.swift b/Sources/TensorFlow/Loss.swift index 4477cff93..d5e7ed01c 100644 --- a/Sources/TensorFlow/Loss.swift +++ b/Sources/TensorFlow/Loss.swift @@ -21,12 +21,12 @@ import _Differentiation /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func l1Loss( predicted: Tensor, expected: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _sum + reduction: @differentiable(reverse) (Tensor) -> Tensor = _sum ) -> Tensor { reduction(abs(expected - predicted)) } @@ -38,12 +38,12 @@ public func l1Loss( /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func l2Loss( predicted: Tensor, expected: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _sum + reduction: @differentiable(reverse) (Tensor) -> Tensor = _sum ) -> Tensor { reduction((expected - predicted).squared()) } @@ -54,8 +54,8 @@ public func l2Loss( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. 
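// A minimal usage sketch for the Upsampling.swift hunks above, assuming this
// branch: with forward(_:) renamed back to callAsFunction(_:), the upsampling
// layers are applied by calling the layer value directly. Shapes are illustrative.
import TensorFlow

let upsample = UpSampling2D<Float>(size: 2)
let images = Tensor<Float>(ones: [1, 4, 4, 3])  // NHWC
let enlarged = upsample(images)                 // shape [1, 8, 8, 3]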
-@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func meanAbsoluteError( predicted: Tensor, expected: Tensor @@ -69,8 +69,8 @@ public func meanAbsoluteError( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func meanSquaredError( predicted: Tensor, expected: Tensor @@ -87,8 +87,8 @@ public func meanSquaredError( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func meanSquaredLogarithmicError( predicted: Tensor, expected: Tensor @@ -104,8 +104,8 @@ public func meanSquaredLogarithmicError( /// - Parameters: /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func meanAbsolutePercentageError( predicted: Tensor, expected: Tensor @@ -121,12 +121,12 @@ public func meanAbsolutePercentageError( /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func hingeLoss( predicted: Tensor, expected: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _mean + reduction: @differentiable(reverse) (Tensor) -> Tensor = _mean ) -> Tensor { let device = predicted.device return reduction(max(Tensor(0, on: device), Tensor(1, on: device) - expected * predicted)) @@ -140,12 +140,12 @@ public func hingeLoss( /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func squaredHingeLoss( predicted: Tensor, expected: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _mean + reduction: @differentiable(reverse) (Tensor) -> Tensor = _mean ) -> Tensor { reduction(hingeLoss(predicted: predicted, expected: expected).squared()) } @@ -159,12 +159,12 @@ public func squaredHingeLoss( /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. /// - reduction: Reduction to apply on the computed element-wise loss values. 
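// A minimal sketch, assuming this branch: the reduction parameter is now a
// @differentiable(reverse) closure, so a custom reduction can be passed inline and
// the loss remains differentiable with respect to `predicted`. Values illustrative.
import _Differentiation
import TensorFlow

let predicted = Tensor<Float>([0.8, 0.2, 0.4])
let expected = Tensor<Float>([1.0, 0.0, 1.0])

let (loss, lossPullback) = valueWithPullback(at: predicted) { p in
  l2Loss(predicted: p, expected: expected, reduction: { $0.mean() })
}
let dPredicted = lossPullback(Tensor<Float>(1))  // d(loss)/d(predicted)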
-@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func categoricalHingeLoss( predicted: Tensor, expected: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _mean + reduction: @differentiable(reverse) (Tensor) -> Tensor = _mean ) -> Tensor { let device = predicted.device let positive = (expected * predicted).sum(alongAxes: -1) @@ -180,12 +180,12 @@ public func categoricalHingeLoss( /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func logCoshLoss( predicted: Tensor, expected: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _mean + reduction: @differentiable(reverse) (Tensor) -> Tensor = _mean ) -> Tensor { let device = predicted.device let x = predicted - expected @@ -200,12 +200,12 @@ public func logCoshLoss( /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func poissonLoss( predicted: Tensor, expected: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _mean + reduction: @differentiable(reverse) (Tensor) -> Tensor = _mean ) -> Tensor { reduction(predicted - expected * log(predicted)) } @@ -217,12 +217,12 @@ public func poissonLoss( /// - predicted: Predicted outputs from a neural network. /// - expected: Expected values, i.e. targets, that correspond to the correct output. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func kullbackLeiblerDivergence( predicted: Tensor, expected: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _sum + reduction: @differentiable(reverse) (Tensor) -> Tensor = _sum ) -> Tensor { reduction(expected * log(expected / predicted)) } @@ -236,17 +236,17 @@ public func kullbackLeiblerDivergence( /// - logits: One-hot encoded outputs from a neural network. /// - labels: Indices (zero-indexed) of the correct outputs. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: logits) +@differentiable(reverse, wrt: logits) public func softmaxCrossEntropy( logits: Tensor, labels: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _mean + reduction: @differentiable(reverse) (Tensor) -> Tensor = _mean ) -> Tensor { reduction(softmaxCrossEntropyHelper(logits: logits, labels: labels)) } @inlinable -@differentiable(wrt: logits) +@differentiable(reverse, wrt: logits) func softmaxCrossEntropyHelper( logits: Tensor, labels: Tensor @@ -274,17 +274,17 @@ func _vjpSoftmaxCrossEntropyHelper( /// - probabilities: Probability values that correspond to the correct output. Each row must be a /// valid probability distribution. 
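// A minimal sketch, assuming this branch: the integer-label softmaxCrossEntropy
// overload stays differentiable only with respect to `logits` (the labels have no
// tangent), so the pullback maps a scalar loss cotangent to a logits-shaped gradient.
import _Differentiation
import TensorFlow

let logits = Tensor<Float>([[2.0, 1.0, 0.1]])
let labels = Tensor<Int32>([0])

let (ceLoss, cePullback) = valueWithPullback(at: logits) { l in
  softmaxCrossEntropy(logits: l, labels: labels)
}
let dLogits = cePullback(Tensor<Float>(1))  // same shape as `logits`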
/// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: logits) +@differentiable(reverse, wrt: logits) public func softmaxCrossEntropy( logits: Tensor, probabilities: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _mean + reduction: @differentiable(reverse) (Tensor) -> Tensor = _mean ) -> Tensor { reduction(softmaxCrossEntropyHelper(logits: logits, probabilities: probabilities)) } @inlinable -@differentiable(wrt: logits) +@differentiable(reverse, wrt: logits) func softmaxCrossEntropyHelper( logits: Tensor, probabilities: Tensor @@ -311,12 +311,12 @@ func _vjpSoftmaxCrossEntropyHelper( /// - logits: The unscaled output of a neural network. /// - labels: Integer values that correspond to the correct output. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: logits) -@differentiable(wrt: (logits, labels)) +@differentiable(reverse, wrt: logits) +@differentiable(reverse, wrt: (logits, labels)) public func sigmoidCrossEntropy( logits: Tensor, labels: Tensor, - reduction: @differentiable (Tensor) -> Tensor = _mean + reduction: @differentiable(reverse) (Tensor) -> Tensor = _mean ) -> Tensor { let device = logits.device // This numerically stable implementation is based on the TensorFlow Python API. @@ -339,13 +339,13 @@ public func sigmoidCrossEntropy( /// - delta: A floating point scalar representing the point where the Huber loss function changes /// from quadratic to linear. /// - reduction: Reduction to apply on the computed element-wise loss values. -@differentiable(wrt: predicted) -@differentiable(wrt: (predicted, expected)) +@differentiable(reverse, wrt: predicted) +@differentiable(reverse, wrt: (predicted, expected)) public func huberLoss( predicted: Tensor, expected: Tensor, delta: Scalar, - reduction: @differentiable (Tensor) -> Tensor = _sum + reduction: @differentiable(reverse) (Tensor) -> Tensor = _sum ) -> Tensor { let error = expected - predicted let absError = abs(error) @@ -357,7 +357,7 @@ public func huberLoss( /// Workaround for TF-1030 so that we can use sum as a default argument for reductions. /// `Tensor.sum()` is the preferred way to do this. // TODO(TF-1030): Remove this and replace with `{ $0.sum() }`. -@differentiable +@differentiable(reverse) public func _sum( _ value: Tensor ) -> Tensor { @@ -367,7 +367,7 @@ public func _sum( /// Workaround for TF-1030 so that we can use mean as a default argument for reductions. /// `Tensor.mean()` is the preferred way to do this. // TODO(TF-1030): Remove this and replace with `{ $0.mean() }`. -@differentiable +@differentiable(reverse) public func _mean( _ value: Tensor ) -> Tensor { diff --git a/Sources/TensorFlow/Operators/Basic.swift b/Sources/TensorFlow/Operators/Basic.swift index 1a131c2aa..d2ea259ac 100644 --- a/Sources/TensorFlow/Operators/Basic.swift +++ b/Sources/TensorFlow/Operators/Basic.swift @@ -18,7 +18,7 @@ infix operator .!=: ComparisonPrecedence /// Returns a tensor with the same shape and scalars as the specified tensor. @inlinable -@differentiable(where Scalar: TensorFlowFloatingPoint) +@differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func identity(_ x: Tensor) -> Tensor { x } @@ -59,7 +59,7 @@ extension Tensor { /// /// - Returns: Array containing the unstacked tensors. 
@inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func unstacked(alongAxis axis: Int = 0) -> [Tensor] { ensureValid(axis: axis) let posAxis = axis < 0 ? axis + rank : axis @@ -89,7 +89,7 @@ extension Tensor { /// /// - Returns: An array containing the tensors part. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func split(count: Int, alongAxis axis: Int = 0) -> [Tensor] { ensureValid(axis: axis) let canonicalAxis = axis < 0 ? axis + rank : axis @@ -123,7 +123,7 @@ extension Tensor { /// /// - Returns: Array containing the tensors parts. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func split(sizes: Tensor, alongAxis axis: Int = 0) -> [Tensor] { ensureValid(axis: axis) precondition( @@ -137,7 +137,7 @@ extension Tensor { } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func split(sizes: [Int], alongAxis axis: Int = 0) -> [Tensor] { ensureValid(axis: axis) let canonicalAxis = axis < 0 ? axis + rank : axis @@ -161,7 +161,7 @@ extension Tensor { /// - Precondition: The shape of `multiples` must be `[tensor.rank]`. /// - Precondition: All scalars in `multiples` must be non-negative. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func tiled(multiples: [Int]) -> Tensor { precondition( multiples.allSatisfy { $0 >= 0 }, @@ -179,7 +179,7 @@ extension Tensor { /// - Precondition: The expected `rank` of multiples must be `1`. /// - Precondition: The shape of `multiples` must be `[tensor.rank]`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func tiled(multiples: Tensor) -> Tensor { precondition(multiples.rank == 1, "The expected rank of multiples must be 1.") precondition( @@ -191,7 +191,7 @@ extension Tensor { /// Reshape to the shape of the specified `Tensor`. /// - Precondition: The number of scalars matches the new shape. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func reshaped(like other: Tensor) -> Tensor { reshaped(toShape: other.shapeTensor) } @@ -199,7 +199,7 @@ extension Tensor { /// Reshape to the specified shape. /// - Precondition: The number of scalars matches the new shape. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func reshaped(to newShape: TensorShape) -> Tensor { _Raw.reshape(self, shape: newShape.dimensions.map(Int64.init)) } @@ -207,14 +207,14 @@ extension Tensor { /// Reshape to the specified `Tensor` representing a shape. /// - Precondition: The number of scalars matches the new shape. 
@inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func reshaped(toShape newShape: Tensor) -> Tensor { return _Raw.reshape(self, shape: newShape) } /// Return a copy of the tensor collapsed into a 1-D `Tensor`, in row-major order. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func flattened() -> Tensor { reshaped(to: [-1]) } @@ -222,7 +222,7 @@ extension Tensor { /// Returns a shape-expanded `Tensor`, with a dimension of 1 inserted at the specified shape /// indices. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func expandingShape(at axes: Int...) -> Tensor { expandingShape(at: axes) } @@ -230,7 +230,7 @@ extension Tensor { /// Returns a shape-expanded `Tensor`, with a dimension of 1 inserted at the /// specified shape indices. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func expandingShape(at axes: [Int]) -> Tensor { var resultShape = self.shape.dimensions.map { Int64($0) } for i in axes { @@ -243,7 +243,7 @@ extension Tensor { /// Returns a rank-lifted `Tensor` with a leading dimension of 1. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func rankLifted() -> Tensor { expandingShape(at: 0) } @@ -251,7 +251,7 @@ extension Tensor { /// Removes the specified dimensions of size 1 from the shape of a tensor. If no dimensions are /// specified, then all dimensions of size 1 will be removed. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func squeezingShape(at axes: Int...) -> Tensor { squeezingShape(at: axes) } @@ -259,7 +259,7 @@ extension Tensor { /// Removes the specified dimensions of size 1 from the shape of a tensor. If no dimensions are /// specified, then all dimensions of size 1 will be removed. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func squeezingShape(at axes: [Int]) -> Tensor { _Raw.squeeze(self, squeezeDims: axes.map(Int32.init)) } @@ -375,7 +375,7 @@ infix operator ++: AdditionPrecedence extension Tensor { /// Returns a transposed tensor, with dimensions permuted in the specified order. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func transposed(permutation: Tensor) -> Tensor { _Raw.transpose(self, perm: permutation) } @@ -383,14 +383,14 @@ extension Tensor { /// Returns a transposed tensor, with dimensions permuted in the specified order. @available(*, deprecated, renamed: "transposed(permutation:)") @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func transposed(withPermutations permutations: Tensor) -> Tensor { transposed(permutation: permutations) } /// Returns a transposed tensor, with dimensions permuted in the specified order. 
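// A minimal sketch, assuming this branch: shape transformations such as
// reshaped(to:) and transposed() are differentiable with respect to `self`, so the
// pullback simply routes the cotangent back through the inverse transformation.
import _Differentiation
import TensorFlow

let matrix = Tensor<Float>([[1, 2], [3, 4]])
let (flatT, shapePullback) = valueWithPullback(at: matrix) { m in
  m.transposed().reshaped(to: [4])
}
let dMatrix = shapePullback(Tensor<Float>([1, 10, 100, 1000]))  // shape [2, 2] again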
@inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func transposed(permutation: [Int]) -> Tensor { _Raw.transpose(self, perm: permutation) } @@ -398,14 +398,14 @@ extension Tensor { /// Returns a transposed tensor, with dimensions permuted in the specified order. @available(*, deprecated, renamed: "transposed(permutation:)") @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func transposed(withPermutations permutations: [Int]) -> Tensor { transposed(permutation: permutations) } /// Returns a transposed tensor, with dimensions permuted in the specified order. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func transposed(permutation: Int...) -> Tensor { transposed(permutation: permutation) } @@ -413,14 +413,14 @@ extension Tensor { /// Returns a transposed tensor, with dimensions permuted in the specified order. @available(*, deprecated, renamed: "transposed(permutation:)") @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func transposed(withPermutations permutations: Int...) -> Tensor { transposed(permutation: permutations) } /// Returns a transposed tensor, with dimensions permuted in reverse order. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func transposed() -> Tensor { return transposed(permutation: Array(stride(from: Int(rank - 1), to: -1, by: -1))) } @@ -429,7 +429,7 @@ extension Tensor { /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) return _Raw.reverseV2(self, axis: axes) @@ -439,7 +439,7 @@ extension Tensor { /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { precondition( axes.count == Set(axes.map { $0 < 0 ? $0 + rank : $0 }).count, @@ -452,7 +452,7 @@ extension Tensor { /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { reversed(inAxes: axes) } @@ -462,7 +462,7 @@ extension Tensor { /// specified axis. /// - Precondition: The axis must be in the range `-rank.. Tensor { return Tensor(concatenating: [self, other], alongAxis: axis) } @@ -473,7 +473,7 @@ extension Tensor { /// and may be controversial. The existence/naming of `++` will be discussed /// during a later API design phase. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func ++ (lhs: Tensor, rhs: Tensor) -> Tensor { return lhs.concatenated(with: rhs) } @@ -524,7 +524,7 @@ extension Tensor { /// /// - Returns: The gathered tensor. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func gathering( atIndices indices: Tensor, alongAxis axis: Int = 0 @@ -552,7 +552,7 @@ extension Tensor { /// /// - Returns: The gathered tensor. 
@inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func batchGathering( atIndices indices: Tensor, alongAxis axis: Int = 1, @@ -674,7 +674,7 @@ extension Tensor { /// - Returns: `(self.rank - K + 1)`-dimensional tensor populated by entries in this tensor /// corresponding to `true` values in `mask`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func gathering(where mask: Tensor, alongAxis axis: Int = 0) -> Tensor { precondition(mask.rank != 0, "The boolean mask cannot be a scalar.") let posAxis = withoutDerivative(at: self.rank) { r in axis < 0 ? axis + r : axis } @@ -913,13 +913,13 @@ infix operator .= extension Tensor { @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func broadcasted(toShape shape: Tensor) -> Tensor { return _Raw.broadcastTo(self, shape: shape) } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func broadcasted(to shape: TensorShape) -> Tensor { return broadcasted(toShape: Tensor({ shape.dimensions.map(Int32.init) }(), on: device)) } @@ -927,7 +927,7 @@ extension Tensor { /// Broadcast to the same shape as the specified `Tensor`. /// - Precondition: The specified shape must be compatible for broadcasting. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func broadcasted(like other: Tensor) -> Tensor { return broadcasted(toShape: other.shapeTensor) } @@ -940,7 +940,7 @@ extension Tensor { extension Tensor where Scalar: Numeric { @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func unbroadcasted(toShape otherShape: Tensor) -> Tensor { // TODO: Simplify this once differentiating control flow is supported. return unbroadcasted( @@ -951,13 +951,13 @@ extension Tensor where Scalar: Numeric { } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func unbroadcasted(like other: Tensor) -> Tensor { return unbroadcasted(toShape: other.shapeTensor) } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func unbroadcasted(to shape: TensorShape) -> Tensor { let dimensions = self.shape.dimensions var otherDimensions = shape.dimensions @@ -1031,7 +1031,7 @@ extension Tensor where Scalar: Numeric { /// Returns a tensor padded with constant according to the specified padding sizes. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func padded(forSizes sizes: [(before: Int, after: Int)], with value: Scalar = 0) -> Tensor { @@ -1040,7 +1040,7 @@ extension Tensor where Scalar: Numeric { /// Returns a padded tensor according to the specified padding sizes and mode. 
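// A minimal sketch, assuming this branch: padded(forSizes:with:) is differentiable
// with respect to `self`; its pullback slices the padded positions back off.
import _Differentiation
import TensorFlow

let v = Tensor<Float>([1, 2, 3])
let (paddedV, padPullback) = valueWithPullback(at: v) { t in
  t.padded(forSizes: [(before: 1, after: 2)], with: 0)
}
// paddedV == [0, 1, 2, 3, 0, 0]
let dV = padPullback(Tensor<Float>([9, 1, 2, 3, 9, 9]))  // dV == [1, 2, 3]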
@inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func padded(forSizes sizes: [(before: Int, after: Int)], mode: PaddingMode) -> Tensor { let paddings = Tensor( shape: [sizes.count, 2], @@ -1102,7 +1102,7 @@ extension Tensor { /// - Parameter lowerBounds: The lower bounds at each dimension. /// - Parameter upperBounds: The upper bounds at each dimension. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func slice(lowerBounds: [Int], upperBounds: [Int]) -> Tensor { // TODO: Precondition `lowerBounds.count == upperBounds.count`, // preferably in graph. @@ -1113,13 +1113,13 @@ extension Tensor { } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func slice(lowerBounds: Tensor, sizes: Tensor) -> Tensor { return _Raw.slice(self, begin: lowerBounds, size: sizes) } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func slice(lowerBounds: [Int], sizes: [Int]) -> Tensor { return _Raw.slice(self, begin: lowerBounds, size: sizes) } @@ -1297,7 +1297,7 @@ extension Tensor { } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + // @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) internal subscript(_ indexPath: IndexPath) -> Tensor { get { let device = self.device @@ -1323,7 +1323,7 @@ extension Tensor { } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + // @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public subscript(_ ranges: TensorRangeExpression...) 
-> Tensor { get { return self[{ IndexPath({ ranges.map { $0.tensorRange } }()) }()] @@ -1334,27 +1334,27 @@ extension Tensor { } } -extension Tensor where Scalar: TensorFlowFloatingPoint { - @usableFromInline - @derivative(of: subscript) - internal func _vjpSubscript( - _ indexPath: IndexPath - ) -> (value: Tensor, pullback: (Tensor) -> Tensor) { - return ( - self[indexPath], - { [shape = shapeTensor] v in - _Raw.stridedSliceGrad( - shape: shape, begin: Tensor(indexPath.begin, on: device), - end: Tensor(indexPath.end, on: device), - strides: Tensor(indexPath.strides, on: device), dy: v, - beginMask: indexPath.beginMask, - endMask: indexPath.endMask, ellipsisMask: indexPath.ellipsisMask, - newAxisMask: indexPath.newAxisMask, - shrinkAxisMask: indexPath.squeezeAxisMask) - } - ) - } -} +// extension Tensor { +// @usableFromInline +// @derivative(of: subscript) +// internal func _vjpSubscript( +// _ indexPath: IndexPath +// ) -> (value: Tensor, pullback: (Tensor) -> Tensor) { +// return ( +// self[indexPath], +// { [shape = shapeTensor] v in +// _Raw.stridedSliceGrad( +// shape: shape, begin: Tensor(indexPath.begin, on: device), +// end: Tensor(indexPath.end, on: device), +// strides: Tensor(indexPath.strides, on: device), dy: v, +// beginMask: indexPath.beginMask, +// endMask: indexPath.endMask, ellipsisMask: indexPath.ellipsisMask, +// newAxisMask: indexPath.newAxisMask, +// shrinkAxisMask: indexPath.squeezeAxisMask) +// } +// ) +// } +// } extension Tensor.IndexPath { @inlinable diff --git a/Sources/TensorFlow/Operators/Image.swift b/Sources/TensorFlow/Operators/Image.swift index d3cfdf4cf..5562acf9f 100644 --- a/Sources/TensorFlow/Operators/Image.swift +++ b/Sources/TensorFlow/Operators/Image.swift @@ -41,7 +41,7 @@ public enum ResizeMethod { /// - antialias: Iff `true`, use an anti-aliasing filter when downsampling an image. /// - Precondition: The images must have rank `3` or `4`. /// - Precondition: The size must be positive. -@differentiable(wrt: images) +@differentiable(reverse, wrt: images) public func resize( images: Tensor, size: (newHeight: Int, newWidth: Int), @@ -168,7 +168,7 @@ public func resizeArea( } @usableFromInline -@differentiable(wrt: images) +@differentiable(reverse, wrt: images) func scaleAndTranslate( images: Tensor, size: Tensor, @@ -218,7 +218,7 @@ func _vjpScaleAndTranslate( } @usableFromInline -@differentiable(wrt: images where Scalar: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: images where Scalar: TensorFlowFloatingPoint) func resizeNearestNeighbor( images: Tensor, size: Tensor, @@ -261,7 +261,7 @@ func _vjpResizeNearestNeighbor( } @usableFromInline -@differentiable(wrt: images where Scalar: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: images where Scalar: TensorFlowFloatingPoint) func resizeBilinear( images: Tensor, size: Tensor, @@ -304,7 +304,7 @@ func _vjpResizeBilinear( } @usableFromInline -@differentiable(wrt: images where Scalar: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: images where Scalar: TensorFlowFloatingPoint) func resizeBicubic( images: Tensor, size: Tensor, @@ -356,7 +356,7 @@ func _vjpResizeBicubic( /// - rates: The dilation rates for each dimension of the input. /// - Precondition: `input` must have rank `4`. /// - Precondition: `filter` must have rank `3`. -@differentiable(wrt: (input, filter)) +@differentiable(reverse, wrt: (input, filter)) public func dilation2D( _ input: Tensor, filter: Tensor, @@ -401,7 +401,7 @@ func _vjpDilation2D( } /// TensorFlow builtin dilation2d gradient helper for the input. 
-@differentiable(wrt: (x, filter)) +@differentiable(reverse, wrt: (x, filter)) @usableFromInline func dilation2DBackpropInput( _ x: Tensor, @@ -451,7 +451,7 @@ func _vjpDilation2DBackpropInput( } /// TensorFlow builtin dilation2d gradient helper for the input. -@differentiable(wrt: (x, input)) +@differentiable(reverse, wrt: (x, input)) @usableFromInline func dilation2DBackpropFilter( _ x: Tensor, @@ -510,7 +510,7 @@ func _vjpDilation2DBackpropFilter( /// - rates: The dilation rates for each dimension of the input. /// - Precondition: `input` must have rank `4`. /// - Precondition: `filter` must have rank 3. -@differentiable(wrt: (input, filter)) +@differentiable(reverse, wrt: (input, filter)) public func erosion2D( _ input: Tensor, filter: Tensor, diff --git a/Sources/TensorFlow/Operators/LinearAlgebra.swift b/Sources/TensorFlow/Operators/LinearAlgebra.swift index 1ad3c4f6d..7417fb2d9 100644 --- a/Sources/TensorFlow/Operators/LinearAlgebra.swift +++ b/Sources/TensorFlow/Operators/LinearAlgebra.swift @@ -32,7 +32,7 @@ extension Tensor where Scalar: TensorFlowNumeric { /// // [1, 2, 3, 4] /// ``` @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func diagonalPart() -> Tensor { precondition(rank >= 2, "The tensor must have at least rank 2.") return _Raw.matrixDiagPart(self) @@ -53,7 +53,7 @@ extension Tensor where Scalar: TensorFlowNumeric { /// // [0, 0, 0, 4]] /// ``` @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func diagonal() -> Tensor { _Raw.matrixDiag(diagonal: self) } @@ -70,7 +70,7 @@ extension Tensor where Scalar: TensorFlowNumeric { } @available(*, deprecated, renamed: "bandPart(subdiagonalCount:superdiagonalCount:)") - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func bandPart(_ subdiagonalCount: Int, _ superdiagonalCount: Int) -> Tensor { return bandPart(subdiagonalCount: subdiagonalCount, superdiagonalCount: superdiagonalCount) } @@ -105,7 +105,7 @@ extension Tensor where Scalar: TensorFlowNumeric { /// - superdiagonalCount: The number of superdiagonals to keep. If negative, keep entire upper /// triangle. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func bandPart(subdiagonalCount: Int, superdiagonalCount: Int) -> Tensor { precondition(rank >= 2, "The tensor must have at least rank 2.") let lower = Tensor(Int32(subdiagonalCount), on: self.device) @@ -178,7 +178,7 @@ public func eye( /// - Parameter matrix: A tensor of shape `[..., M, N]`. /// - Precondition: `matrix` must be a tensor with shape `[..., M, N]`. @inlinable -@differentiable(wrt: matrix where T: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: matrix where T: TensorFlowFloatingPoint) public func trace(_ matrix: Tensor) -> Tensor { precondition(matrix.rank >= 2, "The tensor must have at least rank 2.") return matrix.diagonalPart().sum(squeezingAxes: -1) @@ -214,7 +214,7 @@ func slogdet(_ matrix: Tensor) -> ( /// - Parameter matrix: A tensor of shape `[..., M, N]`. /// - Returns: The natural logarithm of the determinant of `matrix`. 
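// A minimal sketch, assuming this branch: trace is differentiable with respect to
// `matrix`, and since trace(A) sums the diagonal, the gradient of a single matrix's
// trace is the identity matrix.
import _Differentiation
import TensorFlow

let squareMatrix = Tensor<Float>([[1, 2], [3, 4]])
let (tr, tracePullback) = valueWithPullback(at: squareMatrix) { m in trace(m) }
let dSquare = tracePullback(Tensor<Float>(1))  // [[1, 0], [0, 1]]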
@inlinable -@differentiable(wrt: matrix where T: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: matrix where T: TensorFlowFloatingPoint) func logdet(_ matrix: Tensor) -> Tensor { return 2.0 * log(cholesky(matrix).diagonalPart()).sum(squeezingAxes: -1) } @@ -235,7 +235,7 @@ func logdet(_ matrix: Tensor) -> Tensor { /// /// - Parameter input: A tensor of shape `[..., M, M]`. @inlinable -@differentiable +@differentiable(reverse) public func cholesky(_ x: Tensor) -> Tensor { _Raw.cholesky(x) } @@ -326,7 +326,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Precondition: `matrix` must be a tensor with shape `[..., M, M]`. /// - Precondition: `rhs` must be a tensor with shape `[..., M, K]`. @inlinable -@differentiable +@differentiable(reverse) public func triangularSolve( matrix: Tensor, rhs: Tensor, diff --git a/Sources/TensorFlow/Operators/Math.swift b/Sources/TensorFlow/Operators/Math.swift index 438b368bd..e2f72bd48 100644 --- a/Sources/TensorFlow/Operators/Math.swift +++ b/Sources/TensorFlow/Operators/Math.swift @@ -49,7 +49,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { /// /// For real types, if `x` is negative the result is `.nan`. For complex /// types there is a branch cut on the negative real axis. - @differentiable + @differentiable(reverse) public static func sqrt(_ x: Self) -> Self { _Raw.sqrt(x) } @@ -64,7 +64,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The cosine of `x`, interpreted as an angle in radians. - @differentiable + @differentiable(reverse) public static func cos(_ x: Self) -> Self { _Raw.cos(x) } @@ -78,7 +78,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The sine of `x`, interpreted as an angle in radians. - @differentiable + @differentiable(reverse) public static func sin(_ x: Self) -> Self { _Raw.sin(x) } @@ -92,7 +92,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The tangent of `x`, interpreted as an angle in radians. - @differentiable + @differentiable(reverse) public static func tan(_ x: Self) -> Self { _Raw.tan(x) } @@ -107,7 +107,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The inverse cosine of `x` in radians. - @differentiable + @differentiable(reverse) public static func acos(_ x: Self) -> Self { _Raw.acos(x) } @@ -121,7 +121,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The inverse sine of `x` in radians. - @differentiable + @differentiable(reverse) public static func asin(_ x: Self) -> Self { _Raw.asin(x) } @@ -135,7 +135,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The inverse tangent of `x` in radians. - @differentiable + @differentiable(reverse) public static func atan(_ x: Self) -> Self { _Raw.atan(x) } @@ -149,7 +149,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The hyperbolic cosine of `x`. - @differentiable + @differentiable(reverse) public static func cosh(_ x: Self) -> Self { _Raw.cosh(x) } @@ -163,7 +163,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The hyperbolic sine of `x`. - @differentiable + @differentiable(reverse) public static func sinh(_ x: Self) -> Self { _Raw.sinh(x) } @@ -177,7 +177,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The hyperbolic tangent of `x`. 
- @differentiable + @differentiable(reverse) public static func tanh(_ x: Self) -> Self { _Raw.tanh(x) } @@ -192,7 +192,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The inverse hyperbolic cosine of `x`. - @differentiable + @differentiable(reverse) public static func acosh(_ x: Self) -> Self { _Raw.acosh(x) } @@ -206,7 +206,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The inverse hyperbolic sine of `x`. - @differentiable + @differentiable(reverse) public static func asinh(_ x: Self) -> Self { _Raw.asinh(x) } @@ -220,7 +220,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The inverse hyperbolic tangent of `x`. - @differentiable + @differentiable(reverse) public static func atanh(_ x: Self) -> Self { _Raw.atanh(x) } @@ -234,7 +234,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The exponential function applied to `x`, or `e**x`. - @differentiable + @differentiable(reverse) public static func exp(_ x: Self) -> Self { _Raw.exp(x) } @@ -249,25 +249,25 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// Two raised to to power `x`. - @differentiable + @differentiable(reverse) public static func exp2(_ x: Self) -> Self { pow(Tensor(2, on: x.device), x) } /// Ten raised to to power `x`. - @differentiable + @differentiable(reverse) public static func exp10(_ x: Self) -> Self { pow(Tensor(10, on: x.device), x) } /// `exp(x) - 1` evaluated so as to preserve accuracy close to zero. - @differentiable + @differentiable(reverse) public static func expm1(_ x: Self) -> Self { _Raw.expm1(x) } #if TENSORFLOW_USE_STANDARD_TOOLCHAIN - @differentiable + @differentiable(reverse) public static func expMinusOne(_ x: Self) -> Self { return expm1(x) } @@ -283,7 +283,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The natural logarithm of `x`. - @differentiable + @differentiable(reverse) public static func log(_ x: Self) -> Self { _Raw.log(x) } @@ -297,25 +297,25 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { } /// The base-two logarithm of `x`. - @differentiable + @differentiable(reverse) public static func log2(_ x: Self) -> Self { log(x) / Scalar.log(2) } /// The base-ten logarithm of `x`. - @differentiable + @differentiable(reverse) public static func log10(_ x: Self) -> Self { log(x) / Scalar.log(10) } /// `log(1 + x)` evaluated so as to preserve accuracy close to zero. - @differentiable + @differentiable(reverse) public static func log1p(_ x: Self) -> Self { _Raw.log1p(x) } #if TENSORFLOW_USE_STANDARD_TOOLCHAIN - @differentiable + @differentiable(reverse) public static func log(onePlus x: Self) -> Self { return log1p(x) } @@ -334,7 +334,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { /// For real types, if `x` is negative the result is NaN, even if `y` has /// an integral value. For complex types, there is a branch cut on the /// negative real axis. - @differentiable + @differentiable(reverse) public static func pow(_ x: Self, _ y: Self) -> Self { _Raw.pow(x, y) } @@ -364,7 +364,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { /// `x` raised to the `n`th power. /// /// The product of `n` copies of `x`. 
- @differentiable + @differentiable(reverse) public static func pow(_ x: Self, _ n: Int) -> Self { pow(x, Tensor(Scalar(n), on: x.device)) } @@ -373,7 +373,7 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { /// /// For real types, if `x` is negative and `n` is even, the result is NaN. /// For complex types, there is a branch cut along the negative real axis. - @differentiable + @differentiable(reverse) public static func root(_ x: Self, _ n: Int) -> Self { sign(x) * pow(abs(x), Tensor(Scalar(1) / Scalar(n), on: x.device)) } @@ -386,17 +386,17 @@ extension Tensor: ElementaryFunctions where Scalar: TensorFlowFloatingPoint { extension Tensor: VectorProtocol where Scalar: TensorFlowFloatingPoint { public typealias VectorSpaceScalar = Float - // @differentiable(where Scalar: TensorFlowFloatingPoint) + // @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func scaled(by scale: Float) -> Self { Scalar(scale) * self } - // @differentiable(where Scalar: TensorFlowFloatingPoint) + // @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func adding(_ scalar: Float) -> Self { self + Scalar(scalar) } - // @differentiable(where Scalar: TensorFlowFloatingPoint) + // @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func subtracting(_ scalar: Float) -> Self { self - Scalar(scalar) } @@ -452,28 +452,28 @@ public extension VectorProtocol where VectorSpaceScalar: SignedNumeric { extension Tensor where Scalar: Numeric { /// Adds the scalar to every scalar of the tensor and produces the sum. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func + (lhs: Scalar, rhs: Tensor) -> Tensor { return Tensor(lhs, deviceAndPrecisionLike: rhs) + rhs } /// Adds the scalar to every scalar of the tensor and produces the sum. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func + (lhs: Tensor, rhs: Scalar) -> Tensor { return lhs + Tensor(rhs, deviceAndPrecisionLike: lhs) } /// Subtracts the scalar from every scalar of the tensor and produces the difference. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func - (lhs: Scalar, rhs: Tensor) -> Tensor { return Tensor(lhs, deviceAndPrecisionLike: rhs) - rhs } /// Subtracts the scalar from every scalar of the tensor and produces the difference @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func - (lhs: Tensor, rhs: Scalar) -> Tensor { return lhs - Tensor(rhs, deviceAndPrecisionLike: lhs) } @@ -510,21 +510,21 @@ extension Tensor where Scalar: Numeric { /// Returns the tensor produced by multiplying the two tensors. /// - Note: `*` supports broadcasting. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func * (lhs: Tensor, rhs: Tensor) -> Tensor { return _Raw.mul(lhs, rhs) } /// Returns the tensor by multiplying it with every scalar of the tensor. 
@inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func * (lhs: Scalar, rhs: Tensor) -> Tensor { return Tensor(lhs, deviceAndPrecisionLike: rhs) * rhs } /// Multiplies the scalar with every scalar of the tensor and produces the product. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func * (lhs: Tensor, rhs: Scalar) -> Tensor { return lhs * Tensor(rhs, deviceAndPrecisionLike: lhs) } @@ -546,21 +546,21 @@ extension Tensor where Scalar: Numeric { /// Returns the quotient of dividing the first tensor by the second. /// - Note: `/` supports broadcasting. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func / (lhs: Tensor, rhs: Tensor) -> Tensor { return _Raw.div(lhs, rhs) } /// Returns the quotient of dividing the scalar by the tensor, broadcasting the scalar. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func / (lhs: Scalar, rhs: Tensor) -> Tensor { return Tensor(lhs, deviceAndPrecisionLike: rhs) / rhs } /// Returns the quotient of dividing the tensor by the scalar, broadcasting the scalar. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func / (lhs: Tensor, rhs: Scalar) -> Tensor { return lhs / Tensor(rhs, deviceAndPrecisionLike: lhs) } @@ -822,28 +822,28 @@ extension Tensor where Scalar == Bool { extension Tensor where Scalar: TensorFlowNumeric { /// Returns `max(min(self, max), min)`. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func clipped(min: Tensor, max: Tensor) -> Tensor { _Raw.clipByValue(t: self, clipValueMin: min, clipValueMax: max) } /// Returns `max(min(self, max), min)`. @inlinable - @differentiable(wrt: (self, min) where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: (self, min) where Scalar: TensorFlowFloatingPoint) public func clipped(min: Tensor, max: Scalar) -> Tensor { clipped(min: min, max: Tensor(max, deviceAndPrecisionLike: self)) } /// Returns `max(min(self, max), min)`. @inlinable - @differentiable(wrt: (self, max) where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: (self, max) where Scalar: TensorFlowFloatingPoint) public func clipped(min: Scalar, max: Tensor) -> Tensor { clipped(min: Tensor(min, deviceAndPrecisionLike: self), max: max) } /// Returns `max(min(self, max), min)`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func clipped(min: Scalar, max: Scalar) -> Tensor { clipped( min: Tensor(min, deviceAndPrecisionLike: self), @@ -965,7 +965,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { extension Tensor where Scalar: SignedNumeric { /// Returns the negation of the specified tensor element-wise. 
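The scalar–tensor operator overloads above stay differentiable under the `reverse` constraint; a small sketch (an assumed example, not from this patch) showing the gradient flowing through a broadcast scalar:

import TensorFlow

// Gradient of (2 * w + 1).sum() with respect to w is 2 everywhere.
let w = Tensor<Float>([1.0, 2.0, 3.0])
let grad = gradient(at: w) { w in (2.0 * w + 1.0).sum() }
print(grad)  // [2.0, 2.0, 2.0]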
@inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static prefix func - (rhs: Tensor) -> Tensor { return _Raw.neg(rhs) } @@ -981,7 +981,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// Returns the absolute value of the specified tensor element-wise. @inlinable -@differentiable(where T: TensorFlowFloatingPoint) +@differentiable(reverse where T: TensorFlowFloatingPoint) public func abs(_ x: Tensor) -> Tensor { _Raw.abs(x) } @@ -997,28 +997,28 @@ internal func _vjpAbs( /// Returns the natural logarithm of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func log(_ x: Tensor) -> Tensor { Tensor.log(x) } /// Returns the base-two logarithm of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func log2(_ x: Tensor) -> Tensor { log(x) / T.log(2) } /// Returns the base-ten logarithm of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func log10(_ x: Tensor) -> Tensor { log(x) / T.log(10) } /// Returns the logarithm of `1 + x` element-wise. @inlinable -@differentiable +@differentiable(reverse) public func log1p(_ x: Tensor) -> Tensor { Tensor.log1p(x) } @@ -1028,7 +1028,7 @@ public func log1p(_ x: Tensor) -> Tensor { /// - Note: The approach is shown in Equation 7 of: /// https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf. @inlinable -@differentiable +@differentiable(reverse) public func log1mexp(_ x: Tensor) -> Tensor { let isTooSmall = withoutDerivative(at: x) { x in -x .< T(log(2.0)) } // This `replacing` will ultimately be a no-op because we will not select this code-path @@ -1040,84 +1040,84 @@ public func log1mexp(_ x: Tensor) -> Tensor { /// Returns the sine of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func sin(_ x: Tensor) -> Tensor { Tensor.sin(x) } /// Returns the cosine of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func cos(_ x: Tensor) -> Tensor { Tensor.cos(x) } /// Returns the tangent of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func tan(_ x: Tensor) -> Tensor { Tensor.tan(x) } /// Returns the hyperbolic sine of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func sinh(_ x: Tensor) -> Tensor { Tensor.sinh(x) } /// Returns the hyperbolic cosine of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func cosh(_ x: Tensor) -> Tensor { Tensor.cosh(x) } /// Returns the hyperbolic tangent of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func tanh(_ x: Tensor) -> Tensor { Tensor.tanh(x) } /// Returns the inverse cosine of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func acos(_ x: Tensor) -> Tensor { Tensor.acos(x) } /// Returns the inverse sine of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func asin(_ x: Tensor) -> Tensor { Tensor.asin(x) } /// Returns the inverse tangent of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func atan(_ x: Tensor) -> Tensor { Tensor.atan(x) } /// Returns the inverse hyperbolic cosine of the specified tensor element-wise. 
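Like the other unary functions in this block, `abs` keeps its derivative registration; a minimal sketch (not from the patch) of the resulting gradient, which is the element-wise sign:

import TensorFlow

// The gradient of abs(x).sum() is sign(x).
let x = Tensor<Float>([-3.0, 2.0])
let g = gradient(at: x) { x in abs(x).sum() }
print(g)  // [-1.0, 1.0]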
@inlinable -@differentiable +@differentiable(reverse) public func acosh(_ x: Tensor) -> Tensor { Tensor.acosh(x) } /// Returns the inverse hyperbolic sine of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func asinh(_ x: Tensor) -> Tensor { Tensor.asinh(x) } /// Returns the inverse hyperbolic tangent of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func atanh(_ x: Tensor) -> Tensor { Tensor.atanh(x) } @@ -1125,7 +1125,7 @@ public func atanh(_ x: Tensor) -> Tensor { /// Returns the square of the tensor. extension Tensor where Scalar: Numeric { @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func squared() -> Tensor { _Raw.square(self) } @@ -1141,14 +1141,14 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// Returns the square root of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func sqrt(_ x: Tensor) -> Tensor { Tensor.sqrt(x) } /// Returns the inverse square root of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func rsqrt(_ x: Tensor) -> Tensor { _Raw.rsqrt(x) } @@ -1164,35 +1164,35 @@ internal func _vjpRsqrt( /// Returns the exponential of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func exp(_ x: Tensor) -> Tensor { Tensor.exp(x) } /// Returns two raised to the power of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func exp2(_ x: Tensor) -> Tensor { Tensor.exp2(x) } /// Returns ten raised to the power of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func exp10(_ x: Tensor) -> Tensor { Tensor.exp10(x) } /// Returns the exponential of `x - 1` element-wise. @inlinable -@differentiable +@differentiable(reverse) public func expm1(_ x: Tensor) -> Tensor { Tensor.expm1(x) } /// Returns the values of the specified tensor rounded to the nearest integer, element-wise. @inlinable -@differentiable +@differentiable(reverse) public func round(_ x: Tensor) -> Tensor { _Raw.round(x) } @@ -1207,7 +1207,7 @@ internal func _vjpRound( /// Returns the ceiling of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func ceil(_ x: Tensor) -> Tensor { _Raw.ceil(x) } @@ -1222,7 +1222,7 @@ internal func _vjpCeil( /// Returns the floor of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func floor(_ x: Tensor) -> Tensor { _Raw.floor(x) } @@ -1238,7 +1238,7 @@ internal func _vjpFloor( /// Returns an indication of the sign of the specified tensor element-wise. /// Specifically, computes `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`. @inlinable -@differentiable(where T: TensorFlowFloatingPoint) +@differentiable(reverse where T: TensorFlowFloatingPoint) public func sign(_ x: Tensor) -> Tensor { _Raw.sign(x) } @@ -1254,7 +1254,7 @@ internal func _vjpSign( /// Returns the sigmoid of the specified tensor element-wise. /// Specifically, computes `1 / (1 + exp(-x))`. @inlinable -@differentiable +@differentiable(reverse) public func sigmoid(_ x: Tensor) -> Tensor { _Raw.sigmoid(x) } @@ -1271,7 +1271,7 @@ internal func _vjpSigmoid( /// Returns the log-sigmoid of the specified tensor element-wise. Specifically, /// `log(1 / (1 + exp(-x)))`. 
For numerical stability, we use `-softplus(-x)`. @inlinable -@differentiable +@differentiable(reverse) public func logSigmoid(_ x: Tensor) -> Tensor { -softplus(-x) } @@ -1279,7 +1279,7 @@ public func logSigmoid(_ x: Tensor) -> Tensor /// Returns the softplus of the specified tensor element-wise. /// Specifically, computes `log(exp(features) + 1)`. @inlinable -@differentiable +@differentiable(reverse) public func softplus(_ features: Tensor) -> Tensor { _Raw.softplus(features: features) } @@ -1295,7 +1295,7 @@ internal func _vjpSoftplus( /// Returns the softsign of the specified tensor element-wise. /// Specifically, computes `features/ (abs(features) + 1)`. @inlinable -@differentiable +@differentiable(reverse) public func softsign(_ features: Tensor) -> Tensor { _Raw.softsign(features: features) } @@ -1311,7 +1311,7 @@ internal func _vjpSoftsign( /// Returns the softmax of the specified tensor along the last axis. /// Specifically, computes `exp(x) / exp(x).sum(alongAxes: -1)`. @inlinable -@differentiable +@differentiable(reverse) public func softmax(_ x: Tensor) -> Tensor { _Raw.softmax(logits: x) } @@ -1319,7 +1319,7 @@ public func softmax(_ x: Tensor) -> Tensor { /// Returns the softmax of the specified tensor along the specified axis. /// Specifically, computes `exp(x) / exp(x).sum(alongAxes: axis)`. @inlinable -@differentiable +@differentiable(reverse) public func softmax(_ x: Tensor, alongAxis axis: Int) -> Tensor { let xExp = exp(x) return xExp / xExp.sum(alongAxes: Tensor(Int32(axis), on: xExp.device)) @@ -1342,7 +1342,7 @@ func _vjpSoftmax( /// Returns the log-softmax of the specified tensor element-wise. @inlinable -@differentiable +@differentiable(reverse) public func logSoftmax(_ x: Tensor) -> Tensor { _Raw.logSoftmax(logits: x) } @@ -1361,7 +1361,7 @@ func _vjpLogSoftmax( /// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) /// ](http://arxiv.org/abs/1511.07289) @inlinable -@differentiable +@differentiable(reverse) public func elu(_ x: Tensor) -> Tensor { _Raw.elu(features: x) } @@ -1382,7 +1382,7 @@ func _vjpElu( /// /// See [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415). @inlinable -@differentiable +@differentiable(reverse) public func gelu(_ x: Tensor) -> Tensor { // Use withoutDerivative to prevent device mismatch in pullback. let xWithoutDerivative = withoutDerivative(at: x) @@ -1403,7 +1403,7 @@ public func gelu(_ x: Tensor) -> Tensor { /// Returns a tensor by applying the ReLU activation function to the specified tensor element-wise. /// Specifically, computes `max(0, x)`. @inlinable -@differentiable +@differentiable(reverse) public func relu(_ x: Tensor) -> Tensor { _Raw.relu(features: x) } @@ -1418,7 +1418,7 @@ func _vjpRelu( /// Returns a tensor by applying the ReLU6 activation function, namely `min(max(0, x), 6)`. @inlinable -@differentiable +@differentiable(reverse) public func relu6(_ x: Tensor) -> Tensor { _Raw.relu6(features: x) } @@ -1435,7 +1435,7 @@ func _vjpRelu6( /// to the specified tensor element-wise. /// Specifically, computes `max(x, x * alpha)`. @inlinable -@differentiable(wrt: x) +@differentiable(reverse, wrt: x) public func leakyRelu( _ x: Tensor, alpha: Double = 0.2 @@ -1464,7 +1464,7 @@ func _vjpLeakyRelu( /// Please refer to [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) for more /// information. 
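The activation functions in this file are all plain `@differentiable(reverse)` now; a standalone sketch (illustrative only, with invented values) of pulling a cotangent back through `relu`, which also shows the renamed `of:` label on `pullback`:

import TensorFlow

// relu passes gradient through only where the input is positive.
func f(_ x: Tensor<Float>) -> Tensor<Float> { relu(x) }
let pb = pullback(at: Tensor<Float>([-1.0, 0.5, 2.0]), of: f)
print(pb(Tensor<Float>([1.0, 1.0, 1.0])))  // [0.0, 1.0, 1.0]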
@inlinable -@differentiable +@differentiable(reverse) public func selu(_ x: Tensor) -> Tensor { _Raw.selu(features: x) } @@ -1489,7 +1489,7 @@ func _vjpSelu( /// Source: "Searching for Activation Functions" (Ramachandran et al. 2017) /// https://arxiv.org/abs/1710.05941 @inlinable -@differentiable +@differentiable(reverse) public func swish(_ x: Tensor) -> Tensor { x * sigmoid(x) } @@ -1519,7 +1519,7 @@ func _vjpSwish( /// Source: "Searching for MobileNetV3" (Howard et al. 2019) /// https://arxiv.org/abs/1905.02244 @inlinable -@differentiable +@differentiable(reverse) public func hardSigmoid(_ x: Tensor) -> Tensor { relu6(x + 3) / 6.0 } @@ -1530,7 +1530,7 @@ public func hardSigmoid(_ x: Tensor) -> Tensor /// Source: "Searching for MobileNetV3" (Howard et al. 2019) /// https://arxiv.org/abs/1905.02244 @inlinable -@differentiable +@differentiable(reverse) public func hardSwish(_ x: Tensor) -> Tensor { x * hardSigmoid(x) } @@ -1541,7 +1541,7 @@ public func hardSwish(_ x: Tensor) -> Tensor { /// Source: "Mish: A Self Regularized Non-Monotonic Neural Activation Function" /// https://arxiv.org/abs/1908.08681 @inlinable -@differentiable +@differentiable(reverse) public func mish(_ x: Tensor) -> Tensor { x * tanh(softplus(x)) } @@ -1563,35 +1563,35 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// Returns the power of the first tensor to the second tensor. @inlinable -@differentiable +@differentiable(reverse) public func pow(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { Tensor.pow(lhs, rhs) } /// Returns the power of the scalar to the tensor, broadcasting the scalar. @inlinable -@differentiable(wrt: rhs where T: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: rhs where T: TensorFlowFloatingPoint) public func pow(_ lhs: T, _ rhs: Tensor) -> Tensor { pow(Tensor(lhs, deviceAndPrecisionLike: rhs), rhs) } /// Returns the power of the tensor to the scalar, broadcasting the scalar. @inlinable -@differentiable(wrt: lhs where T: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: lhs where T: TensorFlowFloatingPoint) public func pow(_ lhs: Tensor, _ rhs: T) -> Tensor { pow(lhs, Tensor(rhs, deviceAndPrecisionLike: lhs)) } /// Returns the power of the tensor to the scalar, broadcasting the scalar. @inlinable -@differentiable +@differentiable(reverse) public func pow(_ x: Tensor, _ n: Int) -> Tensor { pow(x, Tensor(T(n), deviceAndPrecisionLike: x)) } /// Returns the element-wise `n`th root of the tensor. @inlinable -@differentiable +@differentiable(reverse) public func root(_ x: Tensor, _ n: Int) -> Tensor { Tensor.root(x, n) } @@ -1599,7 +1599,7 @@ public func root(_ x: Tensor, _ n: Int) -> Tensor /// Returns the squared difference between `x` and `y`. /// - Returns: `(x - y) ^ 2`. @inlinable -@differentiable(where T: TensorFlowFloatingPoint) +@differentiable(reverse where T: TensorFlowFloatingPoint) public func squaredDifference(_ x: Tensor, _ y: Tensor) -> Tensor { _Raw.squaredDifference(x, y) } @@ -1622,7 +1622,7 @@ internal func _vjpSquaredDifference( /// Returns the element-wise maximum of two tensors. /// - Note: `max` supports broadcasting. @inlinable -@differentiable(where T: TensorFlowFloatingPoint) +@differentiable(reverse where T: TensorFlowFloatingPoint) public func max(_ lhs: Tensor, _ rhs: Tensor) -> Tensor where T: Numeric & Comparable { _Raw.maximum(lhs, rhs) } @@ -1644,14 +1644,14 @@ internal func _vjpMax( /// Returns the element-wise maximum of the scalar and the tensor, broadcasting the scalar. 
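Compositions of these activations differentiate end to end; a quick illustrative sketch (not part of the patch) using `valueWithGradient` with `mish`:

import TensorFlow

// Loss value and gradient come back together from one reverse-mode pass.
let x = Tensor<Float>([0.5, -0.5])
let (value, grad) = valueWithGradient(at: x) { x in mish(x).sum() }
print(value, grad)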
@inlinable -@differentiable(wrt: rhs where T: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: rhs where T: TensorFlowFloatingPoint) public func max(_ lhs: T, _ rhs: Tensor) -> Tensor where T: Numeric & Comparable { max(Tensor(lhs, deviceAndPrecisionLike: rhs), rhs) } /// Returns the element-wise maximum of the scalar and the tensor, broadcasting the scalar. @inlinable -@differentiable(wrt: lhs where T: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: lhs where T: TensorFlowFloatingPoint) public func max(_ lhs: Tensor, _ rhs: T) -> Tensor where T: Numeric & Comparable { max(lhs, Tensor(rhs, deviceAndPrecisionLike: lhs)) } @@ -1659,7 +1659,7 @@ public func max(_ lhs: Tensor, _ rhs: T) -> Tensor where T: Numeric & C /// Returns the element-wise minimum of two tensors. /// - Note: `min` supports broadcasting. @inlinable -@differentiable(where T: TensorFlowFloatingPoint) +@differentiable(reverse where T: TensorFlowFloatingPoint) public func min(_ lhs: Tensor, _ rhs: Tensor) -> Tensor where T: Numeric & Comparable { _Raw.minimum(lhs, rhs) } @@ -1681,14 +1681,14 @@ internal func _vjpMin( /// Returns the element-wise minimum of the scalar and the tensor, broadcasting the scalar. @inlinable -@differentiable(wrt: rhs where T: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: rhs where T: TensorFlowFloatingPoint) public func min(_ lhs: T, _ rhs: Tensor) -> Tensor where T: Numeric & Comparable { min(Tensor(lhs, deviceAndPrecisionLike: rhs), rhs) } /// Returns the element-wise minimum of the scalar and the tensor, broadcasting the scalar. @inlinable -@differentiable(wrt: lhs where T: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: lhs where T: TensorFlowFloatingPoint) public func min(_ lhs: Tensor, _ rhs: T) -> Tensor where T: Numeric & Comparable { min(lhs, Tensor(rhs, deviceAndPrecisionLike: lhs)) } @@ -1715,7 +1715,7 @@ internal func _vjpMinMaxHelper( } /// Returns the cosine similarity between `x` and `y`. -@differentiable +@differentiable(reverse) public func cosineSimilarity( _ x: Tensor, _ y: Tensor @@ -1725,7 +1725,7 @@ public func cosineSimilarity( /// Returns the cosine distance between `x` and `y`. Cosine distance is defined as /// `1 - cosineSimilarity(x, y)`. -@differentiable +@differentiable(reverse) public func cosineDistance( _ x: Tensor, _ y: Tensor @@ -1747,7 +1747,7 @@ extension Tensor { /// must be either have the same shape as `self` or be a 1-D `Tensor` such /// that `mask.scalarCount == self.shape[0]`. @inlinable - @differentiable(wrt: (self, other) where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: (self, other) where Scalar: TensorFlowFloatingPoint) public func replacing(with other: Tensor, where mask: Tensor) -> Tensor { precondition(self.shape == other.shape, "`self` and `other` must have the same shape.") return _Raw.select(condition: mask, t: other, e: self) @@ -1843,7 +1843,7 @@ extension Tensor where Scalar: Numeric & Comparable { // NOTE: This overload is necessary, otherwise `min()` would refer to the variadic method // `min(squeezingAxes:)` with zero indices. 
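The scalar overloads of `max`/`min` above differentiate only with respect to the tensor operand; a small sketch (illustrative, not from this patch) where `max(x, 0)` behaves like `relu` under differentiation:

import TensorFlow

// Gradient is 1 where x wins the comparison and 0 where the scalar branch wins.
let x = Tensor<Float>([-2.0, 3.0])
let g = gradient(at: x) { x in max(x, 0).sum() }
print(g)  // [0.0, 1.0]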
@inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func min() -> Tensor { let axes = Tensor(rangeFrom: 0, to: Int32(rank), stride: 1, on: device) return min(squeezingAxes: axes) @@ -1852,7 +1852,7 @@ extension Tensor where Scalar: Numeric & Comparable { // NOTE: This overload is necessary, otherwise `max()` would refer to the variadic method // `max(squeezingAxes:)` with zero indices. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func max() -> Tensor { let axes = Tensor(rangeFrom: 0, to: Int32(rank), stride: 1, on: device) return max(squeezingAxes: axes) @@ -1862,7 +1862,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) return _Raw.max(self, reductionIndices: axes, keepDims: false) @@ -1872,7 +1872,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int32.init) return max(squeezingAxes: Tensor(axes, on: device)) @@ -1882,7 +1882,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { max(squeezingAxes: axes) } @@ -1891,7 +1891,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) return _Raw.min(self, reductionIndices: axes, keepDims: false) @@ -1901,7 +1901,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int32.init) return min(squeezingAxes: Tensor(axes, on: device)) @@ -1911,7 +1911,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { min(squeezingAxes: axes) } @@ -1941,7 +1941,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) return _Raw.min(self, reductionIndices: axes, keepDims: true) @@ -1952,7 +1952,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int32.init) return min(alongAxes: Tensor(axes, on: device)) @@ -1963,7 +1963,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { min(alongAxes: axes) } @@ -1973,7 +1973,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) 
-> Tensor { ensureValid(axes: axes) return _Raw.max(self, reductionIndices: axes, keepDims: true) @@ -1984,7 +1984,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int32.init) return max(alongAxes: Tensor(axes, on: device)) @@ -1995,7 +1995,7 @@ extension Tensor where Scalar: Numeric & Comparable { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { max(alongAxes: axes) } @@ -2112,7 +2112,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank...rank`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func sum(squeezingAxes axes: Tensor) -> Tensor { ensureValid(axes: axes) return _Raw.sum(self, reductionIndices: axes.scalars.map { Int64($0) }, keepDims: false) @@ -2122,7 +2122,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank...rank`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func sum(squeezingAxes axes: [Int]) -> Tensor { let axes = axes.map(Int64.init) return _Raw.sum(self, reductionIndices: axes, keepDims: false) @@ -2132,13 +2132,13 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank...rank`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func sum(squeezingAxes axes: Int...) -> Tensor { sum(squeezingAxes: axes) } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func sum() -> Tensor { flattened().sum(squeezingAxes: 0) } @@ -2147,7 +2147,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) return _Raw.sum(self, reductionIndices: axes, keepDims: true) @@ -2157,7 +2157,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int64.init) return _Raw.sum(self, reductionIndices: axes, keepDims: true) @@ -2167,7 +2167,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { sum(alongAxes: axes) } @@ -2179,7 +2179,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank...rank`. 
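Reduction gradients broadcast back to the shape of the receiver; a minimal standalone sketch (values invented for illustration):

import TensorFlow

// Summing away axis 1 and then reducing to a scalar gives a gradient of ones.
let m: Tensor<Float> = [[1.0, 2.0], [3.0, 4.0]]
let g = gradient(at: m) { m in m.sum(squeezingAxes: 1).sum() }
print(g)  // [[1.0, 1.0], [1.0, 1.0]]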
@inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func product(squeezingAxes axes: Tensor) -> Tensor { ensureValid(axes: axes) return _Raw.prod(self, reductionIndices: axes, keepDims: false) @@ -2190,7 +2190,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank...rank`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func product(squeezingAxes axes: [Int]) -> Tensor { let axes = axes.map(Int32.init) return product(squeezingAxes: Tensor(axes, on: device)) @@ -2201,13 +2201,13 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank...rank`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func product(squeezingAxes axes: Int...) -> Tensor { product(squeezingAxes: axes) } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func product() -> Tensor { flattened().product(squeezingAxes: 0) } @@ -2247,7 +2247,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank...rank`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func mean(squeezingAxes axes: Tensor) -> Tensor { ensureValid(axes: axes) return _Raw.mean(self, reductionIndices: axes, keepDims: false) @@ -2257,7 +2257,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank...rank`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func mean(squeezingAxes axes: [Int]) -> Tensor { let axes = axes.map(Int64.init) return _Raw.mean(self, reductionIndices: axes, keepDims: false) @@ -2267,13 +2267,13 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank...rank`. @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func mean(squeezingAxes axes: Int...) -> Tensor { mean(squeezingAxes: axes) } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func mean() -> Tensor { flattened().mean(squeezingAxes: [0]) } @@ -2283,7 +2283,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) return _Raw.mean(self, reductionIndices: axes, keepDims: true) @@ -2294,7 +2294,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. 
/// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int64.init) return _Raw.mean(self, reductionIndices: axes, keepDims: true) @@ -2305,7 +2305,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { mean(alongAxes: axes) } @@ -2317,7 +2317,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) let squaredDiff = squaredDifference(self, mean(alongAxes: axes)) @@ -2329,7 +2329,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int32.init) return variance(squeezingAxes: Tensor(axes, on: device)) @@ -2340,13 +2340,13 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { variance(squeezingAxes: axes) } @inlinable - @differentiable(wrt: self where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse, wrt: self where Scalar: TensorFlowFloatingPoint) public func variance() -> Tensor { let mean = self.mean() let squaredDiff = squaredDifference(self, mean) @@ -2358,7 +2358,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) let squaredDiff = squaredDifference(self, mean(alongAxes: axes)) @@ -2370,7 +2370,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int32.init) return variance(alongAxes: Tensor(axes, on: device)) @@ -2381,7 +2381,7 @@ extension Tensor where Scalar: Numeric { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { variance(alongAxes: axes) } @@ -2412,7 +2412,7 @@ extension Tensor where Scalar: Numeric { /// - Returns: Result of the cumulative sum operation. /// - Precondition: `axis` must be in the range `-rank.., exclusive: Bool = false, @@ -2487,7 +2487,7 @@ extension Tensor where Scalar: Numeric { /// - Returns: Result of the cumulative product operation. /// - Precondition: `axis` must be in the range `-rank.., exclusive: Bool = false, @@ -2733,7 +2733,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) return Tensor.sqrt(variance(squeezingAxes: axes)) @@ -2745,7 +2745,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { ensureValid(axes: axes) return Tensor.sqrt(variance(squeezingAxes: axes)) @@ -2757,7 +2757,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. 
Tensor { standardDeviation(squeezingAxes: axes) } @@ -2767,7 +2767,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { // Reduce along all dimensions. standardDeviation(squeezingAxes: Array(0..) -> Tensor { ensureValid(axes: axes) return Tensor.sqrt(variance(alongAxes: axes)) @@ -2791,7 +2791,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int32.init) return standardDeviation(alongAxes: Tensor(axes, on: device)) @@ -2803,7 +2803,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { ensureValid(axes: axes) return Tensor.sqrt(variance(alongAxes: axes)) @@ -2818,7 +2818,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Tensor { ensureValid(axes: axes) let rawMax = max(alongAxes: axes) @@ -2841,7 +2841,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int32.init) return logSumExp(squeezingAxes: Tensor(axes, on: device)) @@ -2856,7 +2856,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { logSumExp(squeezingAxes: axes) } @@ -2867,7 +2867,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// avoids overflows caused by computing the `exp` of large inputs and underflows caused by /// computing the `log` of small inputs. @inlinable - @differentiable(wrt: self) + @differentiable(reverse, wrt: self) public func logSumExp() -> Tensor { logSumExp(squeezingAxes: Array(0..) -> Tensor { ensureValid(axes: axes) let rawMax = max(alongAxes: axes) @@ -2905,7 +2905,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { let axes = axes.map(Int32.init) return logSumExp(alongAxes: Tensor(axes, on: device)) @@ -2921,7 +2921,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Tensor { logSumExp(alongAxes: axes) } @@ -2933,7 +2933,7 @@ public struct Moments: Differentiable { public var mean: Tensor public var variance: Tensor - @differentiable + @differentiable(reverse) public init(mean: Tensor, variance: Tensor) { self.mean = mean self.variance = variance @@ -2948,7 +2948,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Precondition: `axes` must have rank `1`. /// - Precondition: Each value in `axes` must be in the range `-rank..) -> Moments { ensureValid(axes: axes) let mean = self.mean(alongAxes: axes) @@ -2966,7 +2966,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. 
Moments { ensureValid(axes: axes) let mean = self.mean(squeezingAxes: axes) @@ -2980,14 +2980,14 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Moments { moments(squeezingAxes: axes) } /// Returns the mean and variance of this tensor's elements. @inlinable - @differentiable(wrt: self) + @differentiable(reverse, wrt: self) public func moments() -> Moments { moments(squeezingAxes: Array(0..) -> Moments { ensureValid(axes: axes) let mean = self.mean(alongAxes: axes) @@ -3013,7 +3013,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Moments { ensureValid(axes: axes) let mean = self.mean(alongAxes: axes) @@ -3027,7 +3027,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - Parameter axes: The dimensions to reduce. /// - Precondition: Each value in `axes` must be in the range `-rank.. Moments { moments(alongAxes: axes) } @@ -3039,7 +3039,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// Performs matrix multiplication with another tensor and produces the result. @inlinable -@differentiable(where Scalar: TensorFlowFloatingPoint) +@differentiable(reverse where Scalar: TensorFlowFloatingPoint) public func matmul( _ lhs: Tensor, transposed transposeLhs: Bool = false, @@ -3105,7 +3105,7 @@ infix operator •: MultiplicationPrecedence extension Tensor where Scalar: Numeric { /// Performs matrix multiplication between two tensors and produces the result. @inlinable - @differentiable(where Scalar: TensorFlowFloatingPoint) + @differentiable(reverse where Scalar: TensorFlowFloatingPoint) public static func • (lhs: Tensor, rhs: Tensor) -> Tensor { matmul(lhs, rhs) } diff --git a/Sources/TensorFlow/Operators/NN.swift b/Sources/TensorFlow/Operators/NN.swift index caa472a0a..81fb6250b 100644 --- a/Sources/TensorFlow/Operators/NN.swift +++ b/Sources/TensorFlow/Operators/NN.swift @@ -30,7 +30,7 @@ extension Tensor where Scalar: TensorFlowFloatingPoint { /// - scale: The scale, also known as gamma. /// - epsilon: A small value added to the denominator for numerical stability. @inlinable - @differentiable(wrt: (self, offset, scale)) + @differentiable(reverse, wrt: (self, offset, scale)) public func batchNormalized( alongAxis axis: Int, offset: Tensor = Tensor(0), @@ -84,7 +84,7 @@ extension Padding { /// - dilation: The dilation factor. /// - Precondition: `input` must have rank `3`. /// - Precondition: `filter` must have rank 3. -@differentiable(wrt: (input, filter)) +@differentiable(reverse, wrt: (input, filter)) public func conv1D( _ input: Tensor, filter: Tensor, @@ -113,7 +113,7 @@ public func conv1D( /// - dilations: The dilation factor for each dimension of the input. /// - Precondition: `input` must have rank `4`. /// - Precondition: `filter` must have rank 4. -@differentiable(wrt: (input, filter)) +@differentiable(reverse, wrt: (input, filter)) public func conv2D( _ input: Tensor, filter: Tensor, @@ -169,7 +169,7 @@ func _vjpConv2D( /// - dilations: The dilation factor for each dimension of the input. /// - Precondition: `input` must have rank `4`. /// - Precondition: `filter` must have rank 4. 
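`Moments` is now built through a `(reverse)`-differentiable initializer; a short sketch (not from the patch, values invented) of the bundled mean and variance it carries:

import TensorFlow

// moments() reduces over all axes and returns both statistics at once.
let x = Tensor<Float>([1.0, 2.0, 3.0, 4.0])
let stats = x.moments()
print(stats.mean)      // 2.5
print(stats.variance)  // 1.25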
-@differentiable(wrt: (input, filter)) +@differentiable(reverse, wrt: (input, filter)) public func transposedConv2D( _ input: Tensor, shape: [Int64], @@ -186,7 +186,7 @@ public func transposedConv2D( } /// TensorFlow builtin conv2d gradient helper for the input. -@differentiable(wrt: (x, filter)) +@differentiable(reverse, wrt: (x, filter)) @usableFromInline func conv2DBackpropInput( _ x: Tensor, @@ -233,7 +233,7 @@ func _vjpConv2DBackpropInput( } /// TensorFlow builtin conv2d gradient helper for the filter. -@differentiable(wrt: (x, input)) +@differentiable(reverse, wrt: (x, input)) @usableFromInline func conv2DBackpropFilter( _ x: Tensor, @@ -289,7 +289,7 @@ func _vjpConv2DBackpropFilter( /// - dilations: The dilation factor for each dimension of the input. /// - Precondition: `input` must have rank `5`. /// - Precondition: `filter` must have rank 5. -@differentiable(wrt: (input, filter)) +@differentiable(reverse, wrt: (input, filter)) public func conv3D( _ input: Tensor, filter: Tensor, @@ -342,7 +342,7 @@ func _vjpConv3D( } /// TensorFlow builtin conv3d gradient helper for the input. -@differentiable(wrt: (x, filter)) +@differentiable(reverse, wrt: (x, filter)) @usableFromInline func conv3DBackpropInput( _ x: Tensor, @@ -395,7 +395,7 @@ func _vjpConv3DBackpropInput( } /// TensorFlow builtin conv3d gradient helper for the filter. -@differentiable(wrt: (x, input)) +@differentiable(reverse, wrt: (x, input)) @usableFromInline func conv3DBackpropFilter( _ x: Tensor, @@ -456,7 +456,7 @@ func _vjpConv3DBackpropFilter( /// - padding: The padding for the operation. /// - Precondition: `input` must have rank 4. /// - Precondition: `filter` must have rank 4. -@differentiable(wrt: (input, filter)) +@differentiable(reverse, wrt: (input, filter)) public func depthwiseConv2D( _ input: Tensor, filter: Tensor, @@ -502,7 +502,7 @@ func _vjpDepthwiseConv2D( } /// TensorFlow builtin depthwiseConv2D gradient helper for the input. -@differentiable(wrt: (x, filter)) +@differentiable(reverse, wrt: (x, filter)) @usableFromInline func depthwiseConv2dBackpropInput( _ x: Tensor, @@ -549,7 +549,7 @@ func _vjpDepthwiseConv2dBackpropInput( } /// TensorFlow builtin depthwiseConv2D gradient helper for the filter. -@differentiable(wrt: (x, input)) +@differentiable(reverse, wrt: (x, input)) @usableFromInline func depthwiseConv2dBackpropFilter( _ x: Tensor, @@ -602,7 +602,7 @@ func _vjpDepthwiseConv2dBackpropFilter( /// - filterSize: The dimensions of the pooling kernel. /// - strides: The strides of the sliding filter for each dimension of the input. /// - padding: The padding for the operation. -@differentiable(wrt: input) +@differentiable(reverse, wrt: input) public func maxPool2D( _ input: Tensor, filterSize: (Int, Int, Int, Int), @@ -662,7 +662,7 @@ func _vjpMaxPool2D( /// - filterSize: The dimensions of the pooling kernel. /// - strides: The strides of the sliding filter for each dimension of the input. /// - padding: The padding for the operation. -@differentiable(wrt: input) +@differentiable(reverse, wrt: input) public func maxPool3D( _ input: Tensor, filterSize: (Int, Int, Int, Int, Int), @@ -723,7 +723,7 @@ func _vjpMaxPool3D( /// - filterSize: The dimensions of the pooling kernel. /// - strides: The strides of the sliding filter for each dimension of the input. /// - padding: The padding for the operation. 
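The pooling entry points keep their `wrt: input` registration; a small shape-level sketch (the sizes here are invented) of `maxPool2D` on an NHWC input:

import TensorFlow

// 2x2 max pooling with stride 2 halves the spatial dimensions.
let input = Tensor<Float>(ones: [1, 4, 4, 1])
let pooled = maxPool2D(input, filterSize: (1, 2, 2, 1), strides: (1, 2, 2, 1), padding: .valid)
print(pooled.shape)  // [1, 2, 2, 1]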
-@differentiable(wrt: input) +@differentiable(reverse, wrt: input) public func avgPool2D( _ input: Tensor, filterSize: (Int, Int, Int, Int), @@ -780,7 +780,7 @@ func _vjpAvgPool2D( /// - filterSize: The dimensions of the pooling kernel. /// - strides: The strides of the sliding filter for each dimension of the input. /// - padding: The padding for the operation. -@differentiable(wrt: input) +@differentiable(reverse, wrt: input) public func avgPool3D( _ input: Tensor, filterSize: (Int, Int, Int, Int, Int), @@ -849,7 +849,7 @@ func _vjpAvgPool3D( /// - seed: An optional `Int64`. Defaults to `0`. If set to be non-zero, the random number /// generator is seeded by the given seed. /// - seed2: An optional `Int64`. Defaults to `0`. A second seed to avoid seed collision. -@differentiable(wrt: input) +@differentiable(reverse, wrt: input) public func fractionalMaxPool2D( _ input: Tensor, poolingRatio: (Double, Double, Double, Double), @@ -972,7 +972,7 @@ func _vjpFractionalMaxPool( /// /// - Precondition: `input.rank == 4 && b >= 2`. /// - Precondition: The number of the features must be divisible by square of `b`. -@differentiable(wrt: input where Scalar: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: input where Scalar: TensorFlowFloatingPoint) public func depthToSpace(_ input: Tensor, blockSize b: Int) -> Tensor { precondition(input.rank == 4, "The input must have rank 4.") precondition(b >= 2, "The size must be greater than 1.") @@ -1047,7 +1047,7 @@ func _vjpDepthToSpace( /// - Precondition: `input.rank == 4 && b >= 2`. /// - Precondition: The height of the input must be divisible by `b`. /// - Precondition: The width of the input must be divisible by `b`. -@differentiable(wrt: input where Scalar: TensorFlowFloatingPoint) +@differentiable(reverse, wrt: input where Scalar: TensorFlowFloatingPoint) public func spaceToDepth(_ input: Tensor, blockSize b: Int) -> Tensor { precondition(input.rank == 4, "The input must have rank 4.") precondition(b >= 2, "The block size must be greater than 1.") diff --git a/Sources/TensorFlow/Optimizers/MomentumBased.swift b/Sources/TensorFlow/Optimizers/MomentumBased.swift index fd0e3d2c7..c64c34748 100644 --- a/Sources/TensorFlow/Optimizers/MomentumBased.swift +++ b/Sources/TensorFlow/Optimizers/MomentumBased.swift @@ -80,7 +80,7 @@ where let learningRate = self.learningRate * 1 / (1 + decay * Float(step)) alpha = alpha.scaled(by: rho) + (direction .* direction).scaled(by: 1 - rho) let denominator = Model.TangentVector.sqrt(alpha).adding(epsilon) - model.move(along: (direction ./ denominator).scaled(by: -learningRate)) + model.move(by: (direction ./ denominator).scaled(by: -learningRate)) } public required init(copying other: RMSProp, to device: Device) { @@ -145,7 +145,7 @@ where public func update(_ model: inout Model, along direction: Model.TangentVector) { accumulator = accumulator + (direction .* direction) let denominator = Model.TangentVector.sqrt(accumulator).adding(epsilon) - model.move(along: (direction ./ denominator).scaled(by: -learningRate)) + model.move(by: (direction ./ denominator).scaled(by: -learningRate)) } public required init(copying other: AdaGrad, to device: Device) { @@ -221,7 +221,7 @@ where averageSquared.scaled(by: rho) + (direction .* direction).scaled(by: 1 - rho) var stepSize = direction .* Model.TangentVector.sqrt(accumulatedDelta.adding(epsilon)) stepSize ./= Model.TangentVector.sqrt(averageSquared.adding(epsilon)) - model.move(along: stepSize.scaled(by: -learningRate)) + model.move(by: stepSize.scaled(by: -learningRate)) 
accumulatedDelta = accumulatedDelta.scaled(by: rho) + (stepSize .* stepSize).scaled(by: 1 - rho) } @@ -379,7 +379,7 @@ where secondMoments = secondMoments.scaled(by: beta2) + (direction .* direction).scaled(by: 1 - beta2) let denominator = Model.TangentVector.sqrt(secondMoments).adding(epsilon) - model.move(along: (firstMoments ./ denominator).scaled(by: -stepSize)) + model.move(by: (firstMoments ./ denominator).scaled(by: -stepSize)) } public required init(copying other: Adam, to device: Device) { @@ -464,7 +464,7 @@ where } let denominator = infinityNorm.adding(epsilon) - model.move(along: (firstMoments ./ denominator).scaled(by: -stepSize)) + model.move(by: (firstMoments ./ denominator).scaled(by: -stepSize)) } public required init(copying other: AdaMax, to device: Device) { @@ -556,7 +556,7 @@ where } let denominator = Model.TangentVector.sqrt(secondMomentsMax).adding(epsilon) - model.move(along: (firstMoments ./ denominator).scaled(by: -stepSize)) + model.move(by: (firstMoments ./ denominator).scaled(by: -stepSize)) } public required init(copying other: AMSGrad, to device: Device) { @@ -643,11 +643,11 @@ where (N_sma_t - 4) * (N_sma_t - 2) * N_sma_inf / ((N_sma_inf - 4) * (N_sma_inf - 2) * (N_sma_t))) * learningRate / (1 - beta1Power) model.move( - along: (firstMoments ./ secondMoments_h).scaled(by: -stepSize * sqrtf(1 - beta2Power))) + by: (firstMoments ./ secondMoments_h).scaled(by: -stepSize * sqrtf(1 - beta2Power))) } else { // Update with un-adapted momentum. let stepSize = learningRate / (1 - beta1Power) - model.move(along: firstMoments.scaled(by: -stepSize)) + model.move(by: firstMoments.scaled(by: -stepSize)) } } diff --git a/Sources/TensorFlow/Optimizers/SGD.swift b/Sources/TensorFlow/Optimizers/SGD.swift index 11dcf1eb5..617d06b89 100644 --- a/Sources/TensorFlow/Optimizers/SGD.swift +++ b/Sources/TensorFlow/Optimizers/SGD.swift @@ -86,9 +86,9 @@ where let learningRate = self.learningRate * 1 / (1 + decay * Float(step)) velocity = velocity.scaled(by: momentum) - direction.scaled(by: learningRate) if nesterov { - model.move(along: velocity.scaled(by: momentum) - direction.scaled(by: learningRate)) + model.move(by: velocity.scaled(by: momentum) - direction.scaled(by: learningRate)) } else { - model.move(along: velocity) + model.move(by: velocity) } } diff --git a/Sources/TensorFlow/StdlibExtensions.swift b/Sources/TensorFlow/StdlibExtensions.swift index f8bb846ee..7f368ab54 100644 --- a/Sources/TensorFlow/StdlibExtensions.swift +++ b/Sources/TensorFlow/StdlibExtensions.swift @@ -201,7 +201,7 @@ where Element: Differentiable & ElementaryFunctions { /// For real types, if `x` is negative the result is NaN, even if `y` has /// an integral value. For complex types, there is a branch cut on the /// negative real axis. - public static func pow(_ x: Self, _ y: Self) -> Self { .init(zip(x, y).map(Element.pow)) } + // public static func pow(_ x: Self, _ y: Self) -> Self { .init(zip(x, y).map({ (x,y) -> Element in Element.pow(x,y)})) } /// `x` raised to the `n`th power. 
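All of the optimizers above now step the model with `move(by:)`; an end-to-end sketch (hypothetical layer and data, not from this patch) of one SGD update driving that call through `update(_:along:)`:

import TensorFlow

// One gradient-descent step on a single-weight dense layer.
var dense = Dense<Float>(weight: [[0.5]], bias: [0.0], activation: identity)
let optimizer = SGD(for: dense, learningRate: 0.1)
let x: Tensor<Float> = [[1.0]]
let y: Tensor<Float> = [[2.0]]
let grad = gradient(at: dense) { dense in meanSquaredError(predicted: dense(x), expected: y) }
optimizer.update(&dense, along: grad)  // steps the parameters via move(by:) internally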
/// @@ -234,6 +234,17 @@ where Element: Differentiable { set { base[position] = newValue } } + @inlinable + public subscript(bounds: Range.Index>) -> Self.SubSequence { + _read { yield base[bounds] } + set { base[bounds] = newValue } + } + + @inlinable + public mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, Self.Element == C.Element { + fatalError("withUnsafeBufferPointer unimplemented because TensorBuffer is abstract") + } + @inlinable public var startIndex: Index { base.startIndex } @@ -288,20 +299,56 @@ where Element: Differentiable & PointwiseMultiplicative { public var reciprocal: Self { .init(map { $0.reciprocal }) } - public static func .* (lhs: Self, rhs: Self) -> Self { - precondition(lhs.count == rhs.count, "Count mismatch: \(lhs.count) and \(rhs.count)") - return .init(zip(lhs, rhs).map(.*)) - } - - public static func .*= (lhs: inout Self, rhs: Self) { - precondition(lhs.count == rhs.count, "Count mismatch: \(lhs.count) and \(rhs.count)") - for (i, x) in zip(lhs.indices, rhs) { - lhs[i] .*= x - } - } + // public static func .* (lhs: Self, rhs: Self) -> Self { + // precondition(lhs.count == rhs.count, "Count mismatch: \(lhs.count) and \(rhs.count)") + // return .init(zip(lhs, rhs).map(.*)) + // } + + // public static func .*= (lhs: inout Self, rhs: Self) { + // precondition(lhs.count == rhs.count, "Count mismatch: \(lhs.count) and \(rhs.count)") + // for (i, x) in zip(lhs.indices, rhs) { + // lhs[i] .*= x + // } + // } } extension Collection { /// Returns the `n`th position in `self`. func index(atOffset n: Int) -> Index { index(startIndex, offsetBy: n) } } + +/// Applies the given closure `body` to `x`. When used in a context where `x` is +/// being differentiated with respect to, this function will not produce any +/// derivative at `x`. +// FIXME: Support throws-rethrows. +@inlinable +@inline(__always) +@_semantics("autodiff.nonvarying") +public func withoutDerivative(at x: T, in body: (T) -> R) -> R { + body(x) +} + +public extension Differentiable { + /// Applies the given closure to the derivative of `self`. + /// + /// Returns `self` like an identity function. When the return value is used in + /// a context where it is differentiated with respect to, applies the given + /// closure to the derivative of the return value. 
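The `withoutDerivative(at:in:)` helper added here evaluates its body as a constant with respect to differentiation; a small sketch (invented values) of how that changes a gradient:

import TensorFlow

// The mean is treated as a constant, so only the outer multiplication is differentiated.
let x = Tensor<Float>([1.0, 2.0])
let g = gradient(at: x) { x -> Tensor<Float> in
  let scale = withoutDerivative(at: x) { $0.mean() }
  return (scale * x).sum()
}
print(g)  // [1.5, 1.5]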
+ @inlinable + @differentiable(reverse, wrt: self) + func withDerivative(_ body: @escaping (inout TangentVector) -> Void) -> Self { + return self + } + + @inlinable + @derivative(of: withDerivative) + internal func _vjpWithDerivative( + _ body: @escaping (inout TangentVector) -> Void + ) -> (value: Self, pullback: (TangentVector) -> TangentVector) { + return (self, { grad in + var grad = grad + body(&grad) + return grad + }) + } +} diff --git a/Sources/third_party/Experimental/Complex.swift b/Sources/third_party/Experimental/Complex.swift index 373deaba9..f95cd8f2f 100644 --- a/Sources/third_party/Experimental/Complex.swift +++ b/Sources/third_party/Experimental/Complex.swift @@ -49,7 +49,7 @@ struct Complex { var real: T var imaginary: T - @differentiable(where T: Differentiable, T == T.TangentVector) + @differentiable(reverse where T: Differentiable, T == T.TangentVector) init(real: T = 0, imaginary: T = 0) { self.real = real self.imaginary = imaginary @@ -119,7 +119,7 @@ extension Complex: AdditiveArithmetic { lhs.imaginary += rhs.imaginary } - @differentiable(where T: Differentiable) + @differentiable(reverse where T: Differentiable) static func - (lhs: Complex, rhs: Complex) -> Complex { var temp = lhs temp -= rhs @@ -157,7 +157,7 @@ extension Complex: Numeric { ) } - @differentiable(where T: Differentiable) + @differentiable(reverse where T: Differentiable) static func * (lhs: Complex, rhs: Complex) -> Complex { var a = lhs.real var b = lhs.imaginary @@ -206,7 +206,7 @@ extension Complex: Numeric { } extension Complex: SignedNumeric { - @differentiable(where T: Differentiable) + @differentiable(reverse where T: Differentiable) static prefix func - (operand: Complex) -> Complex { return Complex(real: -operand.real, imaginary: -operand.imaginary) } @@ -218,7 +218,7 @@ extension Complex: SignedNumeric { } extension Complex { - @differentiable(where T: Differentiable) + @differentiable(reverse where T: Differentiable) static func / (lhs: Complex, rhs: Complex) -> Complex { var a = lhs.real var b = lhs.imaginary @@ -262,7 +262,7 @@ extension Complex { } extension Complex { - @differentiable(where T: Differentiable) + @differentiable(reverse where T: Differentiable) func complexConjugate() -> Complex { return Complex(real: real, imaginary: -imaginary) } @@ -273,28 +273,28 @@ func abs(_ z: Complex) -> Complex { } extension Complex { - @differentiable(where T: Differentiable, T == T.TangentVector) + @differentiable(reverse where T: Differentiable, T == T.TangentVector) func adding(real: T) -> Complex { var c = self c.real += real return c } - @differentiable(where T: Differentiable, T == T.TangentVector) + @differentiable(reverse where T: Differentiable, T == T.TangentVector) func subtracting(real: T) -> Complex { var c = self c.real -= real return c } - @differentiable(where T: Differentiable, T == T.TangentVector) + @differentiable(reverse where T: Differentiable, T == T.TangentVector) func adding(imaginary: T) -> Complex { var c = self c.imaginary += imaginary return c } - @differentiable(where T: Differentiable, T == T.TangentVector) + @differentiable(reverse where T: Differentiable, T == T.TangentVector) func subtracting(imaginary: T) -> Complex { var c = self c.imaginary -= imaginary diff --git a/Sources/x10/swift_bindings/optimizers/Optimizer.swift b/Sources/x10/swift_bindings/optimizers/Optimizer.swift index 81d10ed61..ebd998ebf 100644 --- a/Sources/x10/swift_bindings/optimizers/Optimizer.swift +++ b/Sources/x10/swift_bindings/optimizers/Optimizer.swift @@ -229,7 +229,7 @@ where for 
cb in paramGroup.callbacks { cb(&state, &optimizerState) } step = state.step ?? Tensor(zerosLike: step) } - model.move(along: step) + model.move(by: step) } /// Copies the optimizer to the specified device. diff --git a/Sources/x10/swift_bindings/training_loop.swift b/Sources/x10/swift_bindings/training_loop.swift index 565eca214..bef75a979 100644 --- a/Sources/x10/swift_bindings/training_loop.swift +++ b/Sources/x10/swift_bindings/training_loop.swift @@ -159,7 +159,7 @@ struct Statistics { } } -@differentiable +@differentiable(reverse) public func _defaultLossFunction(_ ŷ: Tensor, _ y: Tensor) -> Tensor { softmaxCrossEntropy(logits: ŷ, labels: y) } @@ -190,7 +190,7 @@ where public func run( train: Dataset, test: Dataset, crossReplicaSumDevices: [Device]? = nil, scheduleLearningRate: (Opt) -> Void = { _ in }, - lossFunction: @differentiable (Tensor, @noDerivative Tensor) -> Tensor = + lossFunction: @differentiable(reverse) (Tensor, @noDerivative Tensor) -> Tensor = _defaultLossFunction ) -> () -> (train: HostStatistics, test: HostStatistics) diff --git a/Tests/AnnotationTests/TFEagerTests.swift b/Tests/AnnotationTests/TFEagerTests.swift index ff3a4c705..093b5db09 100644 --- a/Tests/AnnotationTests/TFEagerTests.swift +++ b/Tests/AnnotationTests/TFEagerTests.swift @@ -24,7 +24,7 @@ final class AnnotationTFEagerTests: XCTestCase { public var dense3 = Dense(inputSize: 4, outputSize: 4) public var flatten = Flatten() - @differentiable + @differentiable(reverse) public func callAsFunction(_ input: Tensor) -> Tensor { let layer1 = dense1(input) let layer2 = layer1.reshaped(to: [1, 4]) diff --git a/Tests/AnnotationTests/XLATests.swift b/Tests/AnnotationTests/XLATests.swift index df3e2447e..386717df1 100644 --- a/Tests/AnnotationTests/XLATests.swift +++ b/Tests/AnnotationTests/XLATests.swift @@ -24,7 +24,7 @@ final class AnnotationXLATests: XCTestCase { public var dense3 = Dense(inputSize: 4, outputSize: 4) public var flatten = Flatten() - @differentiable + @differentiable(reverse) public func callAsFunction(_ input: Tensor) -> Tensor { let layer1 = dense1(input) let layer2 = layer1.reshaped(to: [1, 4]) diff --git a/Tests/TensorFlowTests/FreezableTests.swift b/Tests/TensorFlowTests/FreezableTests.swift index 464b62a5b..d8bf7dd40 100644 --- a/Tests/TensorFlowTests/FreezableTests.swift +++ b/Tests/TensorFlowTests/FreezableTests.swift @@ -31,7 +31,7 @@ final class FreezableTests: XCTestCase { self.bias = bias } - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Tensor) -> Tensor { return input * weight + bias } @@ -40,19 +40,19 @@ final class FreezableTests: XCTestCase { var dense = FreezableDense(weight: Tensor(2), bias: Tensor(3)) let grad = FreezableDense.TangentVector(weight: Tensor(4), bias: Tensor(1)) - dense.move(along: grad) + dense.move(by: grad) XCTAssertEqual(Tensor(6), dense.weight) XCTAssertEqual(Tensor(4), dense.bias) // Freeze `dense.weight`: its value cannot be updated. dense.$weight.freeze() - dense.move(along: grad) + dense.move(by: grad) XCTAssertEqual(Tensor(6), dense.weight) XCTAssertEqual(Tensor(5), dense.bias) // Unfreeze `dense.weight`: its value can be updated again. 
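The same `(reverse)` spelling now also appears in function-type position, as in the `lossFunction` parameter above; a hedged sketch (illustrative only, not the library's own loss) of declaring and differentiating through such a closure value:

import TensorFlow

// A loss stored as a @differentiable(reverse) function value.
let loss: @differentiable(reverse) (Tensor<Float>, Tensor<Float>) -> Tensor<Float> = { ŷ, y in
  ((ŷ - y) * (ŷ - y)).mean()
}
let g = gradient(at: Tensor<Float>([1.0])) { ŷ in loss(ŷ, Tensor<Float>([0.0])) }
print(g)  // [2.0]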
dense.$weight.unfreeze() - dense.move(along: grad) + dense.move(by: grad) XCTAssertEqual(Tensor(10), dense.weight) XCTAssertEqual(Tensor(6), dense.bias) } diff --git a/Tests/TensorFlowTests/Helpers.swift b/Tests/TensorFlowTests/Helpers.swift index 41d8e6a50..70d1f92fe 100644 --- a/Tests/TensorFlowTests/Helpers.swift +++ b/Tests/TensorFlowTests/Helpers.swift @@ -59,7 +59,7 @@ extension Float: PointwiseMultiplicative { struct Multiply: Layer { var coefficient: Float - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Float) -> Float { return coefficient * input } diff --git a/Tests/TensorFlowTests/LayerTests.swift b/Tests/TensorFlowTests/LayerTests.swift index 00032bc01..512446014 100644 --- a/Tests/TensorFlowTests/LayerTests.swift +++ b/Tests/TensorFlowTests/LayerTests.swift @@ -19,7 +19,7 @@ import XCTest fileprivate struct Sigmoid: ParameterlessLayer { typealias TangentVector = EmptyTangentVector - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Tensor) -> Tensor { sigmoid(input) } diff --git a/Tests/TensorFlowTests/OptimizerTests.swift b/Tests/TensorFlowTests/OptimizerTests.swift index f4569abb8..56a5010ae 100644 --- a/Tests/TensorFlowTests/OptimizerTests.swift +++ b/Tests/TensorFlowTests/OptimizerTests.swift @@ -21,7 +21,7 @@ class OptimizerTests: XCTestCase { struct Model: Layer { var dense = Dense(weight: [[0.8]], bias: [0.8], activation: identity) - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Tensor) -> Tensor { dense(input) } diff --git a/Tests/TensorFlowTests/SequencedTests.swift b/Tests/TensorFlowTests/SequencedTests.swift index 011f7c819..e82a36a15 100644 --- a/Tests/TensorFlowTests/SequencedTests.swift +++ b/Tests/TensorFlowTests/SequencedTests.swift @@ -26,7 +26,7 @@ struct Model2: Layer { var multiply2: Multiply = Multiply(coefficient: 2) // ###sourceLocation(file: "/usr/local/google/home/marcrasi/git/swift-apis/Tests/TensorFlowTests/SequencedTests.swift.gyb", line: 25) - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Float) -> Float { input.sequenced( through: multiply1, multiply2 @@ -43,7 +43,7 @@ struct Model3: Layer { var multiply3: Multiply = Multiply(coefficient: 3) // ###sourceLocation(file: "/usr/local/google/home/marcrasi/git/swift-apis/Tests/TensorFlowTests/SequencedTests.swift.gyb", line: 25) - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Float) -> Float { input.sequenced( through: multiply1, multiply2, multiply3 @@ -62,7 +62,7 @@ struct Model4: Layer { var multiply4: Multiply = Multiply(coefficient: 4) // ###sourceLocation(file: "/usr/local/google/home/marcrasi/git/swift-apis/Tests/TensorFlowTests/SequencedTests.swift.gyb", line: 25) - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Float) -> Float { input.sequenced( through: multiply1, multiply2, multiply3, multiply4 @@ -83,7 +83,7 @@ struct Model5: Layer { var multiply5: Multiply = Multiply(coefficient: 5) // ###sourceLocation(file: "/usr/local/google/home/marcrasi/git/swift-apis/Tests/TensorFlowTests/SequencedTests.swift.gyb", line: 25) - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Float) -> Float { input.sequenced( through: multiply1, multiply2, multiply3, multiply4, multiply5 @@ -106,7 +106,7 @@ struct Model6: Layer { var multiply6: Multiply = Multiply(coefficient: 6) // ###sourceLocation(file: "/usr/local/google/home/marcrasi/git/swift-apis/Tests/TensorFlowTests/SequencedTests.swift.gyb", line: 25) - @differentiable + 
@differentiable(reverse) func callAsFunction(_ input: Float) -> Float { input.sequenced( through: multiply1, multiply2, multiply3, multiply4, multiply5, multiply6 diff --git a/Tests/TensorFlowTests/SequencedTests.swift.gyb b/Tests/TensorFlowTests/SequencedTests.swift.gyb index e21aec19c..956d742a4 100644 --- a/Tests/TensorFlowTests/SequencedTests.swift.gyb +++ b/Tests/TensorFlowTests/SequencedTests.swift.gyb @@ -23,7 +23,7 @@ struct Model${count}: Layer { var multiply${i}: Multiply = Multiply(coefficient: ${i}) % end - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Float) -> Float { input.sequenced( through: ${', '.join(['multiply%d' % i for i in range(1, count + 1)])} diff --git a/Tests/TensorFlowTests/TensorAutoDiffTests.swift b/Tests/TensorFlowTests/TensorAutoDiffTests.swift index ac6ae9200..e9507495c 100644 --- a/Tests/TensorFlowTests/TensorAutoDiffTests.swift +++ b/Tests/TensorFlowTests/TensorAutoDiffTests.swift @@ -16,7 +16,7 @@ import XCTest @testable import TensorFlow -let cube: @differentiable (Tensor) -> Tensor = { ($0 * $0 * $0) } +let cube: @differentiable(reverse) (Tensor) -> Tensor = { ($0 * $0 * $0) } final class TensorAutoDiffTests: XCTestCase { func testSimpleGrad() { @@ -207,7 +207,7 @@ final class TensorAutoDiffTests: XCTestCase { XCTAssertTrue( (Tensor(1), Tensor(1)) == gradient(at: Tensor(0), Tensor(0), in: f)) - XCTAssertTrue(([1], [1]) == pullback(at: [1], [10], in: f)([1])) + XCTAssertTrue(([1], [1]) == pullback(at: [1], [10], of: f)([1])) } func testSubtract() { @@ -215,7 +215,7 @@ final class TensorAutoDiffTests: XCTestCase { XCTAssertTrue( (Tensor(1), Tensor(-1)) == gradient(at: Tensor(0), Tensor(0), in: f)) - XCTAssertTrue(([1], [-1]) == pullback(at: [1], [10], in: f)([1])) + XCTAssertTrue(([1], [-1]) == pullback(at: [1], [10], of: f)([1])) } func testMultiply() { @@ -226,21 +226,21 @@ final class TensorAutoDiffTests: XCTestCase { func testDivide() { func f(a: Tensor, b: Tensor) -> Tensor { a / b } - XCTAssertTrue(([0.1], [-0.01]) == pullback(at: [1], [10], in: f)([1])) + XCTAssertTrue(([0.1], [-0.01]) == pullback(at: [1], [10], of: f)([1])) } func testMatmul() { func f(a: Tensor, b: Tensor) -> Tensor { matmul(a, b) } let v = Tensor(ones: [1, 1]) - XCTAssertTrue(([[0]], [[0]]) == pullback(at: [[0]], [[0]], in: f)(v)) - XCTAssertTrue(([[10]], [[1]]) == pullback(at: [[1]], [[10]], in: f)(v)) + XCTAssertTrue(([[0]], [[0]]) == pullback(at: [[0]], [[0]], of: f)(v)) + XCTAssertTrue(([[10]], [[1]]) == pullback(at: [[1]], [[10]], of: f)(v)) } func testDot() { func f(a: Tensor, b: Tensor) -> Tensor { a • b } let v = Tensor(ones: [1, 1]) - XCTAssertTrue(([[0]], [[0]]) == pullback(at: [[0]], [[0]], in: f)(v)) - XCTAssertTrue(([[10]], [[1]]) == pullback(at: [[1]], [[10]], in: f)(v)) + XCTAssertTrue(([[0]], [[0]]) == pullback(at: [[0]], [[0]], of: f)(v)) + XCTAssertTrue(([[10]], [[1]]) == pullback(at: [[1]], [[10]], of: f)(v)) } func testNegate() { @@ -509,15 +509,15 @@ final class TensorAutoDiffTests: XCTestCase { func testExpandingShape() { func f1(a: Tensor) -> Tensor { a.expandingShape(at: 0).squared() } func f2(a: Tensor) -> Tensor { a.squared().expandingShape(at: 0) } - XCTAssertEqual(pullback(at: [3, 5], in: f1)([[1, 1]]), [6, 10]) - XCTAssertEqual(pullback(at: [3, 5], in: f2)([[1, 1]]), [6, 10]) + XCTAssertEqual(pullback(at: [3, 5], of: f1)([[1, 1]]), [6, 10]) + XCTAssertEqual(pullback(at: [3, 5], of: f2)([[1, 1]]), [6, 10]) } func testSqueezingShape() { func f1(a: Tensor) -> Tensor { a.squeezingShape(at: 0).squared() } func f2(a: Tensor) -> 
Tensor { a.squared().squeezingShape(at: 0) } - XCTAssertEqual(pullback(at: [[3, 5]], in: f1)([1, 1]), [[6, 10]]) - XCTAssertEqual(pullback(at: [[3, 5]], in: f2)([1, 1]), [[6, 10]]) + XCTAssertEqual(pullback(at: [[3, 5]], of: f1)([1, 1]), [[6, 10]]) + XCTAssertEqual(pullback(at: [[3, 5]], of: f2)([1, 1]), [[6, 10]]) } func testTiled() { @@ -536,8 +536,8 @@ final class TensorAutoDiffTests: XCTestCase { func f2(a: Tensor) -> Tensor { a.squared().reshaped(toShape: Tensor([2, 1])) } - XCTAssertEqual(pullback(at: [[3, 5]], in: f1)([[1], [1]]), [[6, 10]]) - XCTAssertEqual(pullback(at: [[3, 5]], in: f2)([[1], [1]]), [[6, 10]]) + XCTAssertEqual(pullback(at: [[3, 5]], of: f1)([[1], [1]]), [[6, 10]]) + XCTAssertEqual(pullback(at: [[3, 5]], of: f2)([[1], [1]]), [[6, 10]]) } func testReshaped() { @@ -645,13 +645,13 @@ final class TensorAutoDiffTests: XCTestCase { } func testSideEffects() { - let foo: @differentiable (Tensor) -> Tensor = { x in + let foo: @differentiable(reverse) (Tensor) -> Tensor = { x in var a = x a = a + x a = a + x return a + x } - XCTAssertEqual(Tensor([4, 4]), pullback(at: Tensor([4, 5]), in: foo)([1, 1])) + XCTAssertEqual(Tensor([4, 4]), pullback(at: Tensor([4, 5]), of: foo)([1, 1])) func bar(x: Tensor) -> Tensor { var a = x diff --git a/Tests/TensorFlowTests/TrivialModelTests.swift b/Tests/TensorFlowTests/TrivialModelTests.swift index 1f093cfe2..2288cad1b 100644 --- a/Tests/TensorFlowTests/TrivialModelTests.swift +++ b/Tests/TensorFlowTests/TrivialModelTests.swift @@ -32,7 +32,7 @@ final class TrivialModelTests: XCTestCase { activation: relu, weightInitializer: glorotUniform(seed: (0xffeffe, 0xfffe))) } - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Tensor) -> Tensor { let h1 = l1(input) return l2(h1) diff --git a/Tests/x10/TensorVisitorPlanTest.swift b/Tests/x10/TensorVisitorPlanTest.swift index bc5946e71..4c5132c65 100644 --- a/Tests/x10/TensorVisitorPlanTest.swift +++ b/Tests/x10/TensorVisitorPlanTest.swift @@ -20,7 +20,7 @@ struct Classifier: Layer { var layers = [Dense(inputSize: 784, outputSize: 30, activation: relu)] var final_layer = Dense(inputSize: 30, outputSize: 10) - @differentiable + @differentiable(reverse) func callAsFunction(_ input: Tensor) -> Tensor { return final_layer(layers.differentiableReduce(input) { last, layer in layer(last) }) } diff --git a/Tests/x10/ops_test.swift b/Tests/x10/ops_test.swift index f1369efff..52c63ffaf 100644 --- a/Tests/x10/ops_test.swift +++ b/Tests/x10/ops_test.swift @@ -30,15 +30,15 @@ private func TF(_ range: TensorRange) -> TensorRange { } private func assertEqualUnaryOperationGradients( - _ xlaOp: @differentiable (Tensor) -> Tensor, - _ tensorFlowOp: @differentiable (Tensor) -> Tensor, + _ xlaOp: @differentiable(reverse) (Tensor) -> Tensor, + _ tensorFlowOp: @differentiable(reverse) (Tensor) -> Tensor, _ x: Tensor, _ outGrad: Tensor, relTolerance: Float = 1e-5, absTolerance: Float = 1e-7, file: StaticString = #file, line: UInt = #line ) { - var (actual, actualPullback) = valueWithPullback(at: x, in: xlaOp) + var (actual, actualPullback) = valueWithPullback(at: x, of: xlaOp) let useReducedPrecision = x.isReducedPrecision if useReducedPrecision { XCTAssert(outGrad.isReducedPrecision) @@ -46,7 +46,7 @@ private func assertEqualUnaryOperationGradients( actual = actual.toFullPrecision } XCTAssert(!actual.isReducedPrecision) - let (expected, expectedPullback) = valueWithPullback(at: TF(x), in: tensorFlowOp) + let (expected, expectedPullback) = valueWithPullback(at: TF(x), of: tensorFlowOp) XCTAssert( 
allClose( actual: TF(actual), expected: expected, relTolerance: relTolerance, absTolerance: absTolerance @@ -65,8 +65,8 @@ private func assertEqualUnaryOperationGradients( } private func assertEqualBinaryOperationGradients( - _ xlaOp: @differentiable (Tensor, Tensor) -> Tensor, - _ tensorFlowOp: @differentiable (Tensor, Tensor) -> Tensor, + _ xlaOp: @differentiable(reverse) (Tensor, Tensor) -> Tensor, + _ tensorFlowOp: @differentiable(reverse) (Tensor, Tensor) -> Tensor, _ x: Tensor, _ y: Tensor, _ outGrad: Tensor, @@ -74,7 +74,7 @@ private func assertEqualBinaryOperationGradients( absTolerance: Float = 1e-7, file: StaticString = #file, line: UInt = #line ) { - var (actual, actualPullback) = valueWithPullback(at: x, y, in: xlaOp) + var (actual, actualPullback) = valueWithPullback(at: x, y, of: xlaOp) let useReducedPrecision = x.isReducedPrecision if useReducedPrecision { XCTAssert(y.isReducedPrecision) @@ -83,7 +83,7 @@ private func assertEqualBinaryOperationGradients( actual = actual.toFullPrecision } XCTAssert(!actual.isReducedPrecision) - let (expected, expectedPullback) = valueWithPullback(at: TF(x), TF(y), in: tensorFlowOp) + let (expected, expectedPullback) = valueWithPullback(at: TF(x), TF(y), of: tensorFlowOp) XCTAssert( allClose( actual: TF(actual), expected: expected, relTolerance: relTolerance, absTolerance: absTolerance