1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15 16"""Gradient checker for any ops, graphs. 17 18The gradient checker verifies numerically that an op/graph properly 19computes the gradients 20""" 21import numpy as np 22 23from tensorflow.python.framework import constant_op 24from tensorflow.python.framework import dtypes 25from tensorflow.python.framework import indexed_slices 26from tensorflow.python.framework import ops 27from tensorflow.python.ops import array_ops 28from tensorflow.python.ops import gradients 29from tensorflow.python.ops import math_ops 30from tensorflow.python.platform import tf_logging as logging 31from tensorflow.python.util import deprecation 32from tensorflow.python.util.tf_export import tf_export 33 34 35def _product(t): 36 if isinstance(t, int): 37 return t 38 else: 39 y = 1 40 for x in t: 41 y *= x 42 return y 43 44 45def _extra_feeds(extra_feed_dict, new_feeds): 46 if not extra_feed_dict: 47 return new_feeds 48 r = {} 49 r.update(extra_feed_dict) 50 r.update(new_feeds) 51 return r 52 53 54def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx, 55 extra_feed_dict): 56 """Computes the theoretical Jacobian for dy/dx. 57 58 Computes the theoretical Jacobian using the ops generated by 59 compute_gradient(). 60 61 Args: 62 x: the tensor "x". 63 x_shape: the dimensions of x as a tuple or an array of ints. 64 x_data: a numpy parray as the input data for x 65 dy: the tensor "dy". 66 dy_shape: the dimensions of dy as a tuple or an array of ints. 67 dx: Tensor or IndexedSlices representing dx 68 extra_feed_dict: dict that allows fixing specified tensor values 69 during the jacobian calculation. 70 71 Returns: 72 A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows 73 and "dy_size" columns where "x_size" is the number of elements in x and 74 "dy_size" is the number of elements in dy. 75 76 Raises: 77 ValueError: If `dy` is empty but the gradient is nonzero. 78 """ 79 # Complex vectors are treated as vectors of twice as many reals. 80 if x.dtype.is_complex: 81 x_shape = tuple(x_shape) + (2,) 82 dy_factor = 2 if dy.dtype.is_complex else 1 83 84 # To compute the jacobian, we treat x and y as one-dimensional vectors. 85 x_size = _product(x_shape) 86 x_val_size = _product(x_shape[1:]) # This is used for sparse gradients 87 dy_size = _product(dy_shape) * dy_factor 88 89 # Allocate 2-D Jacobian, with x dimensions smashed into the first 90 # dimension and y dimensions smashed into the second. 91 jacobian = np.zeros((x_size, dy_size), 92 dtype=x.dtype.real_dtype.as_numpy_dtype) 93 94 # For each of the entry of dy, we set this to be 1 and 95 # everything else to be 0 and compute the backprop -- this will give us one 96 # one column of the Jacobian matrix. 97 dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype) 98 dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype) 99 sess = ops.get_default_session() 100 for col in range(dy_size): 101 dy_data_flat[col] = 1 102 if isinstance(dx, indexed_slices.IndexedSlices): 103 backprop_indices, backprop_values = sess.run( 104 [dx.indices, dx.values], 105 feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data})) 106 for i, v in zip(backprop_indices, backprop_values): 107 r_begin = i * x_val_size 108 r_end = r_begin + x_val_size 109 jacobian[r_begin:r_end, col] += v.flat 110 else: 111 assert isinstance(dx, ops.Tensor), "dx = " + str(dx) 112 backprop = sess.run( 113 dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data})) 114 jacobian[:, col] = backprop.ravel().view(jacobian.dtype) 115 dy_data_flat[col] = 0 116 117 # If the output is empty, run the gradients at least once and make sure 118 # they produce zeros. 119 if not dy_size: 120 backprop = sess.run( 121 dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data})) 122 if backprop.shape != x_data.shape: 123 raise ValueError("Empty gradient has wrong shape: expected %s, got %s" % 124 (x_data.shape, backprop.shape)) 125 if np.any(backprop): 126 raise ValueError("Empty tensor with nonzero gradients") 127 128 logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian) 129 return jacobian 130 131 132def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta, 133 extra_feed_dict): 134 """Computes the numeric Jacobian for dy/dx. 135 136 Computes the numeric Jacobian by slightly perturbing the inputs and 137 measuring the differences on the output. 138 139 Args: 140 x: the tensor "x". 141 x_shape: the dimensions of x as a tuple or an array of ints. 142 x_data: a numpy array as the input data for x 143 y: the tensor "y". 144 y_shape: the dimensions of y as a tuple or an array of ints. 145 delta: the amount of perturbation we give to the input 146 extra_feed_dict: dict that allows fixing specified tensor values 147 during the jacobian calculation. 148 149 Returns: 150 A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows 151 and "y_size" columns where "x_size" is the number of elements in x and 152 "y_size" is the number of elements in y. 153 """ 154 # bfloat16 doesn't have enough bits to represent high precision numbers such 155 # as delta. Convert to float32 here. Since numeric_jacobian is expected to 156 # be the groundtruth to compare against, it shouldn't lose any information. 157 if x.dtype == dtypes.bfloat16: 158 x = math_ops.cast(x, dtypes.float32) # TODO(wangpeng): Now that the new x 159 # is an output of the old x, isn't feeding to the new x a mistake? 160 if y.dtype == dtypes.bfloat16: 161 y = math_ops.cast(y, dtypes.float32) 162 if x_data.dtype == dtypes.bfloat16.as_numpy_dtype: 163 x_data = x_data.astype(np.float32) 164 165 # To compute the jacobian, we treat x and y as one-dimensional vectors 166 x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1) 167 y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1) 168 x_dtype = x.dtype.real_dtype.as_numpy_dtype 169 y_dtype = y.dtype.real_dtype.as_numpy_dtype 170 171 # Make sure we have the right types 172 x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype) 173 scale = np.asarray(2 * delta, dtype=y_dtype)[()] 174 175 jacobian = np.zeros((x_size, y_size), dtype=x_dtype) 176 # For each of the entry of x, we slightly perturbs this by adding and 177 # subtracting a delta and then compute difference between the outputs. This 178 # will give us one row of the Jacobian matrix. 179 for row in range(x_size): 180 x_pos = x_data.copy() 181 x_neg = x_data.copy() 182 x_pos.ravel().view(x_dtype)[row] += delta 183 y_pos = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_pos})) 184 x_neg.ravel().view(x_dtype)[row] -= delta 185 y_neg = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_neg})) 186 diff = (y_pos - y_neg) / scale 187 jacobian[row, :] = diff.ravel().view(y_dtype) 188 189 logging.vlog(1, "Numeric Jacobian =\n%s", jacobian) 190 return jacobian 191 192 193def _compute_dx_and_dy(x, y, y_shape): 194 """Returns a node to compute gradient of y wrt x.""" 195 # We make up a dy so that we can compute the gradients. We don't really use 196 # the value of dy -- we will always feed it. We need to add an identity node 197 # so that we can always feed it properly. Otherwise, for the Add operation, 198 # dx is the same as dy and we cannot fetch the tensor that we are feeding. 199 with x.graph.as_default(): 200 dy_orig = constant_op.constant(1.0, shape=y_shape, dtype=y.dtype) 201 dy = array_ops.identity(dy_orig) 202 # We compute the gradients for y wrt. x 203 grads = gradients.gradients(y, x, dy) 204 assert len(grads) == 1 205 return grads[0], dy_orig 206 207 208def _compute_gradient(x, 209 x_shape, 210 dx, 211 y, 212 y_shape, 213 dy, 214 x_init_value=None, 215 delta=1e-3, 216 extra_feed_dict=None): 217 """Computes the theoretical and numerical jacobian.""" 218 t = dtypes.as_dtype(x.dtype) 219 allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32, 220 dtypes.float64, dtypes.complex64, dtypes.complex128] 221 assert t.base_dtype in allowed_types, "Don't support type %s for x" % t.name 222 t2 = dtypes.as_dtype(y.dtype) 223 assert t2.base_dtype in allowed_types, "Don't support type %s for y" % t2.name 224 225 if x_init_value is not None: 226 i_shape = list(x_init_value.shape) 227 assert(list(x_shape) == i_shape), "x_shape = %s, init_data shape = %s" % ( 228 x_shape, i_shape) 229 x_data = x_init_value 230 else: 231 x_data = np.random.random_sample(x_shape).astype(t.as_numpy_dtype) 232 if t.is_complex: 233 x_data.imag = np.random.random_sample(x_shape) 234 235 jacob_t = _compute_theoretical_jacobian( 236 x, x_shape, x_data, dy, y_shape, dx, extra_feed_dict=extra_feed_dict) 237 jacob_n = _compute_numeric_jacobian( 238 x, x_shape, x_data, y, y_shape, delta, extra_feed_dict=extra_feed_dict) 239 return jacob_t, jacob_n 240 241 242def _compute_gradient_list(x, 243 x_shape, 244 y, 245 y_shape, 246 x_init_value=None, 247 delta=1e-3, 248 init_targets=None, 249 extra_feed_dict=None): 250 """Compute gradients for a list of x values.""" 251 assert isinstance(x, list) 252 dx, dy = zip(*[_compute_dx_and_dy(xi, y, y_shape) for xi in x]) 253 254 if init_targets is not None: 255 assert isinstance(init_targets, (list, tuple)) 256 for init in init_targets: 257 init.run() 258 if x_init_value is None: 259 x_init_value = [None] * len(x) 260 # pylint: disable=g-complex-comprehension 261 ret = [_compute_gradient(xi, x_shapei, dxi, y, y_shape, dyi, x_init_valuei, 262 delta, extra_feed_dict=extra_feed_dict) 263 for xi, x_shapei, dxi, dyi, x_init_valuei in zip(x, x_shape, dx, dy, 264 x_init_value)] 265 return ret 266 267 268@tf_export(v1=["test.compute_gradient"]) 269@deprecation.deprecated( 270 date=None, 271 instructions="Use tf.test.compute_gradient in 2.0, which has better " 272 "support for functions. Note that the two versions have different usage, " 273 "so code change is needed.") 274def compute_gradient(x, 275 x_shape, 276 y, 277 y_shape, 278 x_init_value=None, 279 delta=1e-3, 280 init_targets=None, 281 extra_feed_dict=None): 282 """Computes and returns the theoretical and numerical Jacobian. 283 284 If `x` or `y` is complex, the Jacobian will still be real but the 285 corresponding Jacobian dimension(s) will be twice as large. This is required 286 even if both input and output is complex since TensorFlow graphs are not 287 necessarily holomorphic, and may have gradients not expressible as complex 288 numbers. For example, if `x` is complex with shape `[m]` and `y` is complex 289 with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with 290 291 J[:m, :n] = d(Re y)/d(Re x) 292 J[:m, n:] = d(Im y)/d(Re x) 293 J[m:, :n] = d(Re y)/d(Im x) 294 J[m:, n:] = d(Im y)/d(Im x) 295 296 Args: 297 x: a tensor or list of tensors 298 x_shape: the dimensions of x as a tuple or an array of ints. If x is a list, 299 then this is the list of shapes. 300 y: a tensor 301 y_shape: the dimensions of y as a tuple or an array of ints. 302 x_init_value: (optional) a numpy array of the same shape as "x" 303 representing the initial value of x. If x is a list, this should be a list 304 of numpy arrays. If this is none, the function will pick a random tensor 305 as the initial value. 306 delta: (optional) the amount of perturbation. 307 init_targets: list of targets to run to initialize model params. 308 extra_feed_dict: dict that allows fixing specified tensor values 309 during the Jacobian calculation. 310 311 Returns: 312 Two 2-d numpy arrays representing the theoretical and numerical 313 Jacobian for dy/dx. Each has "x_size" rows and "y_size" columns 314 where "x_size" is the number of elements in x and "y_size" is the 315 number of elements in y. If x is a list, returns a list of two numpy arrays. 316 """ 317 # TODO(mrry): remove argument `init_targets` 318 if extra_feed_dict is None: 319 extra_feed_dict = {} 320 321 if isinstance(x, list): 322 return _compute_gradient_list(x, x_shape, y, y_shape, x_init_value, delta, 323 init_targets, extra_feed_dict=extra_feed_dict) 324 else: 325 if init_targets is not None: 326 assert isinstance(init_targets, (list, tuple)) 327 for init in init_targets: 328 init.run() 329 dx, dy = _compute_dx_and_dy(x, y, y_shape) 330 ret = _compute_gradient(x, x_shape, dx, y, y_shape, dy, x_init_value, delta, 331 extra_feed_dict=extra_feed_dict) 332 return ret 333 334 335def _compute_error(grad): 336 if isinstance(grad, tuple): 337 grad = [grad] 338 error = 0 339 for j_t, j_n in grad: 340 if j_t.size or j_n.size: # Handle zero size tensors correctly 341 error = np.maximum(error, np.fabs(j_t - j_n).max()) 342 return error 343 344 345@tf_export(v1=["test.compute_gradient_error"]) 346@deprecation.deprecated( 347 date=None, 348 instructions="Use tf.test.compute_gradient in 2.0, which has better " 349 "support for functions. Note that the two versions have different usage, " 350 "so code change is needed.") 351def compute_gradient_error(x, 352 x_shape, 353 y, 354 y_shape, 355 x_init_value=None, 356 delta=1e-3, 357 init_targets=None, 358 extra_feed_dict=None): 359 """Computes the gradient error. 360 361 Computes the maximum error for dy/dx between the computed Jacobian and the 362 numerically estimated Jacobian. 363 364 This function will modify the tensors passed in as it adds more operations 365 and hence changing the consumers of the operations of the input tensors. 366 367 This function adds operations to the current session. To compute the error 368 using a particular device, such as a GPU, use the standard methods for 369 setting a device (e.g. using with sess.graph.device() or setting a device 370 function in the session constructor). 371 372 Args: 373 x: a tensor or list of tensors 374 x_shape: the dimensions of x as a tuple or an array of ints. If x is a list, 375 then this is the list of shapes. 376 y: a tensor 377 y_shape: the dimensions of y as a tuple or an array of ints. 378 x_init_value: (optional) a numpy array of the same shape as "x" 379 representing the initial value of x. If x is a list, this should be a list 380 of numpy arrays. If this is none, the function will pick a random tensor 381 as the initial value. 382 delta: (optional) the amount of perturbation. 383 init_targets: list of targets to run to initialize model params. 384 extra_feed_dict: dict that allows fixing specified tensor values 385 during the Jacobian calculation. 386 387 Returns: 388 The maximum error in between the two Jacobians. 389 """ 390 grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta, 391 init_targets, extra_feed_dict=extra_feed_dict) 392 return _compute_error(grad) 393