# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Gradient checker for ops and graphs.

The gradient checker numerically verifies that an op/graph computes its
gradients correctly.
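
Example usage (an illustrative sketch, assuming a TF1-style graph and default
session; under TF2, eager execution would have to be disabled first):

  with tf.compat.v1.Session():
    x = tf.constant([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float64)
    y = tf.matmul(x, x)
    # Maximum elementwise difference between the theoretical and the
    # numerically estimated Jacobians; a small value suggests the gradient
    # of tf.matmul is implemented correctly.
    err = tf.compat.v1.test.compute_gradient_error(x, [2, 2], y, [2, 2])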
"""
import numpy as np

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import indexed_slices
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradients
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export


def _product(t):
  if isinstance(t, int):
    return t
  else:
    y = 1
    for x in t:
      y *= x
    return y


def _extra_feeds(extra_feed_dict, new_feeds):
  if not extra_feed_dict:
    return new_feeds
  r = {}
  r.update(extra_feed_dict)
  r.update(new_feeds)
  return r


def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx,
                                  extra_feed_dict):
  """Computes the theoretical Jacobian for dy/dx.

  Computes the theoretical Jacobian using the ops generated by
  compute_gradient().

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x
    dy: the tensor "dy".
    dy_shape: the dimensions of dy as a tuple or an array of ints.
    dx: Tensor or IndexedSlices representing dx
    extra_feed_dict: dict that allows fixing specified tensor values
      during the jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "dy_size" columns where "x_size" is the number of elements in x and
    "dy_size" is the number of elements in dy.

  Raises:
    ValueError: If `dy` is empty but the gradient is nonzero.
  """
  # Complex vectors are treated as vectors of twice as many reals.
  if x.dtype.is_complex:
    x_shape = tuple(x_shape) + (2,)
  dy_factor = 2 if dy.dtype.is_complex else 1
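  # For example, a complex64 x of shape (3,) is viewed as 6 float32 entries
  # (real and imaginary parts interleaved), contributing 6 Jacobian rows; a
  # complex dy similarly doubles the number of columns.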

  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
  dy_size = _product(dy_shape) * dy_factor

  # Allocate 2-D Jacobian, with x dimensions smashed into the first
  # dimension and y dimensions smashed into the second.
  jacobian = np.zeros((x_size, dy_size),
                      dtype=x.dtype.real_dtype.as_numpy_dtype)

  # For each entry of dy, we set it to 1 and everything else to 0, then
  # compute the backprop -- this gives us one column of the Jacobian matrix.
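  # In other words, jacobian[i, col] holds d y[col] / d x[i], with both x and
  # dy indexed in flattened (row-major) order.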
  dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype)
  dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype)
  sess = ops.get_default_session()
  for col in range(dy_size):
    dy_data_flat[col] = 1
    if isinstance(dx, indexed_slices.IndexedSlices):
      backprop_indices, backprop_values = sess.run(
          [dx.indices, dx.values],
          feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      for i, v in zip(backprop_indices, backprop_values):
        r_begin = i * x_val_size
        r_end = r_begin + x_val_size
        jacobian[r_begin:r_end, col] += v.flat
    else:
      assert isinstance(dx, ops.Tensor), "dx = " + str(dx)
      backprop = sess.run(
          dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      jacobian[:, col] = backprop.ravel().view(jacobian.dtype)
    dy_data_flat[col] = 0

  # If the output is empty, run the gradients at least once and make sure
  # they produce zeros.
  if not dy_size:
    backprop = sess.run(
        dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
    if backprop.shape != x_data.shape:
      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
                       (x_data.shape, backprop.shape))
    if np.any(backprop):
      raise ValueError("Empty tensor with nonzero gradients")

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
  return jacobian


def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
                              extra_feed_dict):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input
    extra_feed_dict: dict that allows fixing specified tensor values
      during the jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in x and
    "y_size" is the number of elements in y.
  """
  # bfloat16 doesn't have enough bits to represent high-precision numbers such
  # as delta. Convert to float32 here. Since the numeric Jacobian is expected
  # to be the ground truth to compare against, it shouldn't lose any
  # information.
  if x.dtype == dtypes.bfloat16:
    x = math_ops.cast(x, dtypes.float32)  # TODO(wangpeng): Now that the new x
            # is an output of the old x, isn't feeding to the new x a mistake?
  if y.dtype == dtypes.bfloat16:
    y = math_ops.cast(y, dtypes.float32)
  if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
    x_data = x_data.astype(np.float32)

  # To compute the jacobian, we treat x and y as one-dimensional vectors
  x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
  y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
  x_dtype = x.dtype.real_dtype.as_numpy_dtype
  y_dtype = y.dtype.real_dtype.as_numpy_dtype

  # Make sure we have the right types
  x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
  scale = np.asarray(2 * delta, dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
  # For each entry of x, we slightly perturb it by adding and subtracting a
  # delta, then compute the difference between the outputs. This gives us one
  # row of the Jacobian matrix.
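  # This is the central-difference approximation
  #   dy/dx ~= (y(x + delta) - y(x - delta)) / (2 * delta),
  # whose truncation error is O(delta**2), versus O(delta) for a one-sided
  # difference.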
  for row in range(x_size):
    x_pos = x_data.copy()
    x_neg = x_data.copy()
    x_pos.ravel().view(x_dtype)[row] += delta
    y_pos = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_pos}))
    x_neg.ravel().view(x_dtype)[row] -= delta
    y_neg = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_neg}))
    diff = (y_pos - y_neg) / scale
    jacobian[row, :] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian


def _compute_dx_and_dy(x, y, y_shape):
  """Returns a node to compute gradient of y wrt x."""
  # We make up a dy so that we can compute the gradients. We don't really use
  # the value of dy -- we will always feed it. We need to add an identity node
  # so that we can always feed it properly. Otherwise, for the Add operation,
  # dx is the same as dy and we cannot fetch the tensor that we are feeding.
  with x.graph.as_default():
    dy_orig = constant_op.constant(1.0, shape=y_shape, dtype=y.dtype)
    dy = array_ops.identity(dy_orig)
  # We compute the gradients for y wrt. x
  grads = gradients.gradients(y, x, dy)
  assert len(grads) == 1
  return grads[0], dy_orig


def _compute_gradient(x,
                      x_shape,
                      dx,
                      y,
                      y_shape,
                      dy,
                      x_init_value=None,
                      delta=1e-3,
                      extra_feed_dict=None):
  """Computes the theoretical and numerical jacobian."""
  t = dtypes.as_dtype(x.dtype)
  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
                   dtypes.float64, dtypes.complex64, dtypes.complex128]
  assert t.base_dtype in allowed_types, "Don't support type %s for x" % t.name
  t2 = dtypes.as_dtype(y.dtype)
  assert t2.base_dtype in allowed_types, "Don't support type %s for y" % t2.name

  if x_init_value is not None:
    i_shape = list(x_init_value.shape)
    assert(list(x_shape) == i_shape), "x_shape = %s, init_data shape = %s" % (
        x_shape, i_shape)
    x_data = x_init_value
  else:
    x_data = np.random.random_sample(x_shape).astype(t.as_numpy_dtype)
    if t.is_complex:
      x_data.imag = np.random.random_sample(x_shape)

  jacob_t = _compute_theoretical_jacobian(
      x, x_shape, x_data, dy, y_shape, dx, extra_feed_dict=extra_feed_dict)
  jacob_n = _compute_numeric_jacobian(
      x, x_shape, x_data, y, y_shape, delta, extra_feed_dict=extra_feed_dict)
  return jacob_t, jacob_n


def _compute_gradient_list(x,
                           x_shape,
                           y,
                           y_shape,
                           x_init_value=None,
                           delta=1e-3,
                           init_targets=None,
                           extra_feed_dict=None):
  """Compute gradients for a list of x values."""
  assert isinstance(x, list)
  dx, dy = zip(*[_compute_dx_and_dy(xi, y, y_shape) for xi in x])

  if init_targets is not None:
    assert isinstance(init_targets, (list, tuple))
    for init in init_targets:
      init.run()
  if x_init_value is None:
    x_init_value = [None] * len(x)
  # pylint: disable=g-complex-comprehension
  ret = [_compute_gradient(xi, x_shapei, dxi, y, y_shape, dyi, x_init_valuei,
                           delta, extra_feed_dict=extra_feed_dict)
         for xi, x_shapei, dxi, dyi, x_init_valuei in zip(x, x_shape, dx, dy,
                                                          x_init_value)]
  return ret


@tf_export(v1=["test.compute_gradient"])
@deprecation.deprecated(
    date=None,
    instructions="Use tf.test.compute_gradient in 2.0, which has better "
    "support for functions. Note that the two versions have different usage, "
    "so code change is needed.")
def compute_gradient(x,
                     x_shape,
                     y,
                     y_shape,
                     x_init_value=None,
                     delta=1e-3,
                     init_targets=None,
                     extra_feed_dict=None):
  """Computes and returns the theoretical and numerical Jacobian.

  If `x` or `y` is complex, the Jacobian will still be real but the
  corresponding Jacobian dimension(s) will be twice as large.  This is required
  even if both input and output are complex since TensorFlow graphs are not
  necessarily holomorphic, and may have gradients not expressible as complex
  numbers.  For example, if `x` is complex with shape `[m]` and `y` is complex
  with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with

      J[:m, :n] = d(Re y)/d(Re x)
      J[:m, n:] = d(Im y)/d(Re x)
      J[m:, :n] = d(Re y)/d(Im x)
      J[m:, n:] = d(Im y)/d(Im x)

  Args:
    x: a tensor or list of tensors
    x_shape: the dimensions of x as a tuple or an array of ints. If x is a
      list, then this is the list of shapes.
    y: a tensor
    y_shape: the dimensions of y as a tuple or an array of ints.
    x_init_value: (optional) a numpy array of the same shape as "x"
      representing the initial value of x. If x is a list, this should be a
      list of numpy arrays. If this is None, the function will pick a random
      tensor as the initial value.
    delta: (optional) the amount of perturbation.
    init_targets: list of targets to run to initialize model params.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    Two 2-d numpy arrays representing the theoretical and numerical
    Jacobian for dy/dx. Each has "x_size" rows and "y_size" columns
    where "x_size" is the number of elements in x and "y_size" is the
    number of elements in y. If x is a list, returns a list of two numpy arrays.
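
  Example usage (an illustrative sketch; assumes a TF1-style graph and
  default session, and uses `tf.square` purely as a stand-in for the op
  under test):

    with tf.compat.v1.Session():
      x = tf.constant([1.0, 2.0], dtype=tf.float64)
      y = tf.square(x)
      jacob_t, jacob_n = tf.compat.v1.test.compute_gradient(x, [2], y, [2])
      # For y = x**2, both 2x2 Jacobians are approximately diag(2 * x),
      # i.e. [[2, 0], [0, 4]].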
  """
  # TODO(mrry): remove argument `init_targets`
  if extra_feed_dict is None:
    extra_feed_dict = {}

  if isinstance(x, list):
    return _compute_gradient_list(x, x_shape, y, y_shape, x_init_value, delta,
                                  init_targets, extra_feed_dict=extra_feed_dict)
  else:
    if init_targets is not None:
      assert isinstance(init_targets, (list, tuple))
      for init in init_targets:
        init.run()
    dx, dy = _compute_dx_and_dy(x, y, y_shape)
    ret = _compute_gradient(x, x_shape, dx, y, y_shape, dy, x_init_value, delta,
                            extra_feed_dict=extra_feed_dict)
    return ret


def _compute_error(grad):
  if isinstance(grad, tuple):
    grad = [grad]
  error = 0
  for j_t, j_n in grad:
    if j_t.size or j_n.size:  # Handle zero size tensors correctly
      error = np.maximum(error, np.fabs(j_t - j_n).max())
  return error


@tf_export(v1=["test.compute_gradient_error"])
@deprecation.deprecated(
    date=None,
    instructions="Use tf.test.compute_gradient in 2.0, which has better "
    "support for functions. Note that the two versions have different usage, "
    "so code change is needed.")
def compute_gradient_error(x,
                           x_shape,
                           y,
                           y_shape,
                           x_init_value=None,
                           delta=1e-3,
                           init_targets=None,
                           extra_feed_dict=None):
  """Computes the gradient error.

  Computes the maximum error for dy/dx between the computed Jacobian and the
  numerically estimated Jacobian.

  This function will modify the tensors passed in, since it adds more
  operations and hence changes the consumers of the input tensors'
  operations.

  This function adds operations to the current graph. To compute the error
  using a particular device, such as a GPU, use the standard methods for
  setting a device (e.g. using with sess.graph.device() or setting a device
  function in the session constructor).

  Args:
    x: a tensor or list of tensors
    x_shape: the dimensions of x as a tuple or an array of ints. If x is a
      list, then this is the list of shapes.
    y: a tensor
    y_shape: the dimensions of y as a tuple or an array of ints.
    x_init_value: (optional) a numpy array of the same shape as "x"
      representing the initial value of x. If x is a list, this should be a
      list of numpy arrays. If this is None, the function will pick a random
      tensor as the initial value.
    delta: (optional) the amount of perturbation.
    init_targets: list of targets to run to initialize model params.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    The maximum error between the two Jacobians.
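
  Example usage (an illustrative sketch; assumes a TF1-style graph and
  default session, with `tf.sigmoid` standing in for the op under test):

    with tf.compat.v1.Session():
      x = tf.constant([1.0, 2.0], dtype=tf.float64)
      y = tf.sigmoid(x)
      err = tf.compat.v1.test.compute_gradient_error(x, [2], y, [2])
      # A small error (e.g. err < 1e-4) indicates that the registered
      # gradient agrees with the central-difference estimate.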
  """
  grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta,
                          init_targets, extra_feed_dict=extra_feed_dict)
  return _compute_error(grad)
393