1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_H_
17 #define TENSORFLOW_CORE_KERNELS_CONV_2D_H_
18
19 #include "absl/strings/string_view.h"
20 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
21 #include "tensorflow/core/framework/tensor_types.h"
22 #include "tensorflow/core/kernels/eigen_backward_spatial_convolutions.h"
23 #include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
24 #include "tensorflow/core/util/tensor_format.h"
25
26 // Returns true if TF_CONV2D_USE_FP16_ACCUMULATE == 1, false otherwise.
Conv2dUseFp16Accumulate()27 static bool Conv2dUseFp16Accumulate() {
28 static bool use_fp16_accumulate = []() {
29 const char* env = std::getenv("TF_CONV2D_USE_FP16_ACCUMULATE");
30 return (env != nullptr) && (absl::string_view(env) == "1");
31 }();
32 return use_fp16_accumulate;
33 }
34
35 namespace tensorflow {
36 namespace functor {
37
38 template <typename Device, typename Input, typename Filter, typename Output,
39 typename OutputKernel>
40 void SpatialConvolutionFunc(const Device& d, Output output, Input input,
41 Filter filter, int row_stride, int col_stride,
42 int row_dilation, int col_dilation,
43 const Eigen::PaddingType& padding,
44 const OutputKernel& output_kernel,
45 int padding_top = 0, int padding_bottom = 0,
46 int padding_left = 0, int padding_right = 0) {
47 // Need to swap row/col, padding_top/padding_left, and
48 // padding_bottom/padding_right when calling Eigen. Eigen expects the tensor
49 // in NWHC format, but the tensor given is in NHWC.
50 output.device(d) = Eigen::SpatialConvolution(
51 input, filter, col_stride, row_stride, padding, col_dilation,
52 row_dilation, output_kernel, padding_left, padding_right, padding_top,
53 padding_bottom);
54 }
55
56 // TODO(ezhulenev): Non-templated `operator()` are required by explicit template
57 // instantiations for the GPU device. However they are almost certainly not used
58 // in any of the kernel implementation. Check if they can be removed.
59 template <typename Device, typename T,
60 typename OutputKernel = const Eigen::NoOpOutputKernel>
61 struct SpatialConvolution {
operatorSpatialConvolution62 void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
63 typename TTypes<T, 4>::ConstTensor input,
64 typename TTypes<T, 4>::ConstTensor filter, int row_stride,
65 int col_stride, int row_dilation, int col_dilation,
66 const Eigen::PaddingType& padding,
67 const OutputKernel& output_kernel = OutputKernel()) {
68 SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride,
69 row_dilation, col_dilation, padding, output_kernel);
70 }
71
72 template <typename Input, typename Filter, typename Output>
operatorSpatialConvolution73 void operator()(const Device& d, Output output, Input input, Filter filter,
74 int row_stride, int col_stride, int row_dilation,
75 int col_dilation, const Eigen::PaddingType& padding,
76 const OutputKernel& output_kernel = OutputKernel()) {
77 SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride,
78 row_dilation, col_dilation, padding, output_kernel);
79 }
80
operatorSpatialConvolution81 void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
82 typename TTypes<T, 4>::ConstTensor input,
83 typename TTypes<T, 4>::ConstTensor filter, int row_stride,
84 int col_stride, int row_dilation, int col_dilation,
85 int padding_top, int padding_bottom, int padding_left,
86 int padding_right,
87 const OutputKernel& output_kernel = OutputKernel()) {
88 SpatialConvolutionFunc(
89 d, output, input, filter, row_stride, col_stride, row_dilation,
90 col_dilation, Eigen::PaddingType::PADDING_VALID, output_kernel,
91 padding_top, padding_bottom, padding_left, padding_right);
92 }
93
94 template <typename Input, typename Filter, typename Output>
operatorSpatialConvolution95 void operator()(const Device& d, Output output, Input input, Filter filter,
96 int row_stride, int col_stride, int row_dilation,
97 int col_dilation, int padding_top, int padding_bottom,
98 int padding_left, int padding_right,
99 const OutputKernel& output_kernel = OutputKernel()) {
100 SpatialConvolutionFunc(
101 d, output, input, filter, row_stride, col_stride, row_dilation,
102 col_dilation, Eigen::PaddingType::PADDING_VALID, output_kernel,
103 padding_top, padding_bottom, padding_left, padding_right);
104 }
105 };
106
107 template <typename Device, typename OutputKernel>
108 struct SpatialConvolution<Device, Eigen::half, OutputKernel> {
109 void operator()(const Device& d,
110 typename TTypes<Eigen::half, 4>::Tensor output,
111 typename TTypes<Eigen::half, 4>::ConstTensor input,
112 typename TTypes<Eigen::half, 4>::ConstTensor filter,
113 int row_stride, int col_stride, int row_dilation,
114 int col_dilation, const Eigen::PaddingType& padding,
115 const OutputKernel& output_kernel = OutputKernel()) {
116 if (Conv2dUseFp16Accumulate()) {
117 output.device(d) = Eigen::SpatialConvolution(
118 input, filter, col_stride, row_stride, padding, col_dilation,
119 row_dilation, output_kernel);
120 } else {
121 output.device(d) =
122 Eigen::SpatialConvolution(input.cast<float>(), filter.cast<float>(),
123 col_stride, row_stride, padding,
124 col_dilation, row_dilation, output_kernel)
125 .template cast<Eigen::half>();
126 }
127 }
128
129 template <typename Input, typename Filter, typename Output>
130 void operator()(const Device& d, Output output, Input input, Filter filter,
131 int row_stride, int col_stride, int row_dilation,
132 int col_dilation, const Eigen::PaddingType& padding,
133 const OutputKernel& output_kernel = OutputKernel()) {
134 if (Conv2dUseFp16Accumulate()) {
135 output.device(d) = Eigen::SpatialConvolution(
136 input, filter, col_stride, row_stride, padding, col_dilation,
137 row_dilation, output_kernel);
138 } else {
139 output.device(d) =
140 Eigen::SpatialConvolution(input.template cast<float>(),
141 filter.template cast<float>(), col_stride,
142 row_stride, padding, col_dilation,
143 row_dilation, output_kernel)
144 .template cast<Eigen::half>();
145 }
146 }
147
148 void operator()(const Device& d,
149 typename TTypes<Eigen::half, 4>::Tensor output,
150 typename TTypes<Eigen::half, 4>::ConstTensor input,
151 typename TTypes<Eigen::half, 4>::ConstTensor filter,
152 int row_stride, int col_stride, int row_dilation,
153 int col_dilation, int padding_top, int padding_bottom,
154 int padding_left, int padding_right,
155 const OutputKernel& output_kernel = OutputKernel()) {
156 if (Conv2dUseFp16Accumulate()) {
157 output.device(d) = Eigen::SpatialConvolution(
158 input, filter, col_stride, row_stride,
159 Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation,
160 output_kernel, padding_left, padding_right, padding_top,
161 padding_bottom);
162 } else {
163 output.device(d) =
164 Eigen::SpatialConvolution(
165 input.cast<float>(), filter.cast<float>(), col_stride, row_stride,
166 Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation,
167 output_kernel, padding_left, padding_right, padding_top,
168 padding_bottom)
169 .template cast<Eigen::half>();
170 }
171 }
172
173 template <typename Input, typename Filter, typename Output>
174 void operator()(const Device& d, Output output, Input input, Filter filter,
175 int row_stride, int col_stride, int row_dilation,
176 int col_dilation, int padding_top, int padding_bottom,
177 int padding_left, int padding_right,
178 const OutputKernel& output_kernel = OutputKernel()) {
179 if (Conv2dUseFp16Accumulate()) {
180 output.device(d) = Eigen::SpatialConvolution(
181 input, filter, col_stride, row_stride,
182 Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation,
183 output_kernel, padding_left, padding_right, padding_top,
184 padding_bottom);
185 } else {
186 output.device(d) =
187 Eigen::SpatialConvolution(
188 input.template cast<float>(), filter.template cast<float>(),
189 col_stride, row_stride, Eigen::PaddingType::PADDING_VALID,
190 col_dilation, row_dilation, output_kernel, padding_left,
191 padding_right, padding_top, padding_bottom)
192 .template cast<Eigen::half>();
193 }
194 }
195 };
196
197 template <typename Device, typename T>
198 struct SpatialConvolutionBackwardInputFunc {
199 void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward,
200 typename TTypes<T, 4>::ConstTensor filter,
201 typename TTypes<T, 4>::ConstTensor output_backward,
202 Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
203 Eigen::DenseIndex col_dilation,
204 Eigen::DenseIndex row_dilation) {
205 input_backward.device(d) = Eigen::SpatialConvolutionBackwardInput(
206 filter, output_backward, input_backward.dimension(2),
207 input_backward.dimension(1), col_stride, row_stride, col_dilation,
208 row_dilation);
209 }
210 };
211
212 // GPU version requires all tensors to be indexable by int32.
213 template <typename T>
214 struct SpatialConvolutionBackwardInputFunc<Eigen::GpuDevice, T> {
215 void operator()(const Eigen::GpuDevice& d,
216 typename TTypes<T, 4>::Tensor input_backward,
217 typename TTypes<T, 4>::ConstTensor filter,
218 typename TTypes<T, 4>::ConstTensor output_backward,
219 Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
220 Eigen::DenseIndex col_dilation,
221 Eigen::DenseIndex row_dilation) {
222 To32Bit(input_backward).device(d) = Eigen::SpatialConvolutionBackwardInput(
223 To32Bit(filter), To32Bit(output_backward), input_backward.dimension(2),
224 input_backward.dimension(1), col_stride, row_stride, col_dilation,
225 row_dilation);
226 }
227 };
228
229 template <typename Device, typename T>
230 struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc {
231 void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward,
232 typename TTypes<T, 4>::ConstTensor filter,
233 typename TTypes<T, 4>::ConstTensor output_backward,
234 Eigen::DenseIndex padded_cols, Eigen::DenseIndex padded_rows,
235 Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
236 Eigen::DenseIndex col_dilation,
237 Eigen::DenseIndex row_dilation, Eigen::DenseIndex pad_left,
238 Eigen::DenseIndex pad_top) {
239 // We have to slice the result of a spatial convolution backward
240 // input, before assigning it to the `input_backward` to remove padding.
241 //
242 // TODO(ezhulenev): Pass explicit paddings to Eigen and do not materialize
243 // intermediate result in memory before slicing.
244 input_backward.device(d) =
245 Eigen::SpatialConvolutionBackwardInput(
246 filter, output_backward, padded_cols, padded_rows, col_stride,
247 row_stride, col_dilation, row_dilation)
248 .eval()
249 .slice(Eigen::DSizes<Eigen::DenseIndex, 4>{0, pad_left, pad_top, 0},
250 input_backward.dimensions());
251 }
252 };
253
254 // GPU version requires all tensors to be indexable by int32.
255 template <typename T>
256 struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc<Eigen::GpuDevice,
257 T> {
258 void operator()(const Eigen::GpuDevice& d,
259 typename TTypes<T, 4>::Tensor input_backward,
260 typename TTypes<T, 4>::ConstTensor filter,
261 typename TTypes<T, 4>::ConstTensor output_backward,
262 Eigen::DenseIndex padded_cols, Eigen::DenseIndex padded_rows,
263 Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
264 Eigen::DenseIndex col_dilation,
265 Eigen::DenseIndex row_dilation, Eigen::DenseIndex pad_left,
266 Eigen::DenseIndex pad_top) {
267 To32Bit(input_backward).device(d) =
268 Eigen::SpatialConvolutionBackwardInput(
269 To32Bit(filter), To32Bit(output_backward), padded_cols, padded_rows,
270 col_stride, row_stride, col_dilation, row_dilation)
271 .eval()
272 .slice(Eigen::DSizes<Eigen::DenseIndex, 4>{0, pad_left, pad_top, 0},
273 input_backward.dimensions());
274 }
275 };
276
277 // TODO(vrv): Figure out how to use the MatMulFunctor in matmul_op.h.
278 // My initial attempt to do this compiled but failed in the pytest
279 // due to a swigdeps error.
280 template <typename Device, typename T,
281 typename OutputKernel = const Eigen::NoOpOutputKernel>
282 struct MatMulConvFunctor {
283 // Computes on device "d": out = in0 * in1, where * is matrix
284 // multiplication.
285 void operator()(
286 const Device& d, typename TTypes<T, 2>::Tensor out,
287 typename TTypes<T, 2>::ConstTensor in0,
288 typename TTypes<T, 2>::ConstTensor in1,
289 const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
290 const OutputKernel& output_kernel = OutputKernel()) {
291 out.device(d) = in0.contract(in1, dim_pair, output_kernel);
292 }
293 };
294
295 // Shuffles a filter tensor from TensorFlow format HWIO to dst_filter_format.
296 //
297 // Note: Currently supports OIHW and OHWI destination formats.
298 template <typename Device, typename T, typename IndexType, int NDIMS>
299 struct TransformFilter {
300 void operator()(const Device& d, FilterTensorFormat dst_filter_format,
301 typename TTypes<T, NDIMS, IndexType>::ConstTensor in,
302 typename TTypes<T, NDIMS, IndexType>::Tensor out) {
303 // NOTE: Source filter format is always HWIO.
304 Eigen::DSizes<IndexType, NDIMS - 2> spatial_dims;
305 for (int i = 0; i < spatial_dims.rank(); ++i) {
306 spatial_dims[i] = in.dimension(i);
307 }
308
309 // Merge the spatial dimensions together to speed up the shuffle operation.
310 Eigen::DSizes<IndexType, 3> merged_dims;
311 merged_dims[0] = spatial_dims.TotalSize(); // product of spatial dims [H*W]
312 merged_dims[1] = in.dimension(NDIMS - 2); // input filters [I]
313 merged_dims[2] = in.dimension(NDIMS - 1); // output filters [O]
314
315 // Shuffle tensor with merged spatial dimensions.
316 Eigen::DSizes<IndexType, 3> shuffling_perm;
317 // Expand shuffled tensor into final dimensions.
318 Eigen::DSizes<IndexType, NDIMS> expanded_dims;
319
320 if (dst_filter_format == FORMAT_OIHW) {
321 shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 1, 0);
322
323 expanded_dims[0] = merged_dims[2]; // [O]
324 expanded_dims[1] = merged_dims[1]; // [I]
325 for (int i = 0; i < spatial_dims.rank(); ++i) {
326 expanded_dims[2 + i] = spatial_dims[i];
327 }
328
329 } else if (dst_filter_format == FORMAT_OHWI) {
330 shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 0, 1);
331
332 expanded_dims[0] = merged_dims[2]; // [O]
333 expanded_dims[NDIMS - 1] = merged_dims[1]; // [I]
334 for (int i = 0; i < spatial_dims.rank(); ++i) {
335 expanded_dims[1 + i] = spatial_dims[i];
336 }
337
338 } else {
339 DCHECK(false) << "Unsupported destination filter format: "
340 << ToString(dst_filter_format);
341 }
342
343 out.device(d) =
344 in.reshape(merged_dims).shuffle(shuffling_perm).reshape(expanded_dims);
345 }
346 };
347
348 // TODO This functor is not used anywhere and should be removed,
349 // but it defines some eigen templates that are referenced in other kernels.
350 template <typename Device, typename T, typename IndexType>
351 struct TransformDepth {
352 void operator()(const Device& d,
353 typename TTypes<T, 4, IndexType>::ConstTensor in,
354 const Eigen::DSizes<IndexType, 4>& shuffle,
355 typename TTypes<T, 4, IndexType>::Tensor out) {
356 Eigen::DSizes<IndexType, 3> merged_dims;
357 Eigen::DSizes<IndexType, 4> expanded_dims;
358 Eigen::DSizes<IndexType, 3> new_shuffle;
359
360 // Merge dimensions that won't be shuffled together to speed things up.
361 if (shuffle[1] == 2 && shuffle[2] == 3) {
362 merged_dims[0] = in.dimension(0);
363 merged_dims[1] = in.dimension(1);
364 merged_dims[2] = in.dimension(2) * in.dimension(3);
365 new_shuffle[0] = shuffle[0];
366 new_shuffle[1] = 2;
367 new_shuffle[2] = shuffle[3];
368 expanded_dims[0] = in.dimension(shuffle[0]);
369 expanded_dims[1] = in.dimension(2);
370 expanded_dims[2] = in.dimension(3);
371 expanded_dims[3] = in.dimension(shuffle[3]);
372 } else if (shuffle[0] == 2 && shuffle[1] == 3) {
373 merged_dims[0] = in.dimension(0);
374 merged_dims[1] = in.dimension(1);
375 merged_dims[2] = in.dimension(2) * in.dimension(3);
376 new_shuffle[0] = 2;
377 new_shuffle[1] = shuffle[2];
378 new_shuffle[2] = shuffle[3];
379 expanded_dims[0] = in.dimension(2);
380 expanded_dims[1] = in.dimension(3);
381 expanded_dims[2] = in.dimension(shuffle[2]);
382 expanded_dims[3] = in.dimension(shuffle[3]);
383 } else if (shuffle[0] == 0 && shuffle[1] == 3 && shuffle[2] == 1 &&
384 shuffle[3] == 2) {
385 merged_dims[0] = in.dimension(0);
386 merged_dims[1] = in.dimension(1) * in.dimension(2);
387 merged_dims[2] = in.dimension(3);
388 new_shuffle[0] = 0;
389 new_shuffle[1] = 2;
390 new_shuffle[2] = 1;
391 expanded_dims[0] = in.dimension(0);
392 expanded_dims[1] = in.dimension(3);
393 expanded_dims[2] = in.dimension(1);
394 expanded_dims[3] = in.dimension(2);
395 } else {
396 assert(false && "unexpected shuffle");
397 }
398
399 out.device(d) =
400 in.reshape(merged_dims).shuffle(new_shuffle).reshape(expanded_dims);
401 }
402 };
403
// Note on the use of a const reference for the "padding_value" argument
//
// In the ROCm TF build,
// ++ the call(s) to the functor are in the files (conv_*.cc) that are compiled
//    by the "CPU" compiler, while
// ++ the GPUDevice-specific template instantiations are in the files that are
//    compiled by the "GPU" compiler.
//
// For T == Eigen::half, the value of the "padding_value" argument (when it was
// passed by value) was getting corrupted, leading to regressions in the
// convolution unit tests.
//
// I do not understand the exact reason for this, but based on similar past
// issues, it is likely due to a combination of
// ++ an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for
//    Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler
//    (hipclang, which is based on the latest clang), AND
// ++ Eigen::half having the same size but different internals on the CPU and
//    GPU sides (unsigned short on CPU, union {unsigned short, _Float16} on
//    GPU).
//
// Changing the "padding_value" argument to be a const reference type seems to
// suppress the bug.
426 template <typename Device, typename T, typename IndexType, int NDIMS>
427 struct PadInput {
428 void operator()(const Device& d,
429 typename TTypes<T, NDIMS, IndexType>::ConstTensor in,
430 const std::array<int, NDIMS - 2>& padding_left,
431 const std::array<int, NDIMS - 2>& padding_right,
432 typename TTypes<T, NDIMS, IndexType>::Tensor out,
433 TensorFormat format, const T& padding_value) {
434 Eigen::array<Eigen::IndexPair<IndexType>, NDIMS> padding;
435 padding[GetTensorDimIndex<NDIMS - 2>(format, 'N')] = {0, 0};
436 for (int i = 0; i < NDIMS - 2; ++i) {
437 padding[GetTensorDimIndex<NDIMS - 2>(format, '0' + i)] = {
438 padding_left[i], padding_right[i]};
439 }
440 padding[GetTensorDimIndex<NDIMS - 2>(format, 'C')] = {0, 0};
441 out.device(d) = in.pad(padding, padding_value);
442 }
443 };
444
445 // Converts a tensor from:
446 // [batch, <spatial>, filters]
447 // to:
448 // [batch, filters, <spatial>]
449 template <typename Device, typename T, int NDIMS>
450 struct NHWCToNCHW {
451 void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
452 typename TTypes<T, NDIMS>::Tensor out);
453 };
454
455 // Converts a tensor from:
456 // [batch, filters, <spatial>]
457 // to:
458 // [batch, <spatial>, filters]
459 template <typename Device, typename T, int NDIMS>
460 struct NCHWToNHWC {
461 void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
462 typename TTypes<T, NDIMS>::Tensor out);
463 };
464
465 // Converts a tensor from:
466 // [dim0, dim1, dim2]
467 // to:
468 // [dim0, dim2, dim1]
469 template <typename Device, typename T, bool conjugate = false>
470 struct SwapDimension1And2InTensor3 {
471 void operator()(const Device& d, const T* in,
472 const gtl::ArraySlice<int64_t>& input_dims, T* out);
473 };
474
475 // Converts a tensor from:
476 // [dim0, dim1, dim2]
477 // to:
478 // [dim2, dim1, dim0]
479 template <typename Device, typename T, bool conjugate = false>
480 struct SwapDimension0And2InTensor3 {
481 void operator()(const Device& d, const T* in,
482 const gtl::ArraySlice<int64_t>& input_dims, T* out);
483 };
484
485 // Transforms back filter from OIHW or OHWI to HWOI format to reverse effect of
486 // TransformFilter above.
487 template <typename Device, typename T, int NDIMS>
488 struct ReverseTransformFilter {
489 void operator()(const Device& d, FilterTensorFormat src_filter_format,
490 typename TTypes<T, NDIMS>::ConstTensor in,
491 typename TTypes<T, NDIMS>::Tensor out);
492 };
493
494 } // namespace functor
495
496 template <class T>
497 class ConvAlgorithmMap;
498
499 template <>
500 class ConvAlgorithmMap<Eigen::ThreadPoolDevice> {};
501 } // namespace tensorflow
502
503 #endif // TENSORFLOW_CORE_KERNELS_CONV_2D_H_
504