/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_H_
#define TENSORFLOW_CORE_KERNELS_CONV_2D_H_

#include <cstdlib>

#include "absl/strings/string_view.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/eigen_backward_spatial_convolutions.h"
#include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
#include "tensorflow/core/util/tensor_format.h"

// Returns true if TF_CONV2D_USE_FP16_ACCUMULATE == 1, false otherwise.
static bool Conv2dUseFp16Accumulate() {
  static bool use_fp16_accumulate = []() {
    const char* env = std::getenv("TF_CONV2D_USE_FP16_ACCUMULATE");
    return (env != nullptr) && (absl::string_view(env) == "1");
  }();
  return use_fp16_accumulate;
}
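
// NOTE: The environment variable is read once and cached in a function-local
// static, so it must be set before the first conv2d kernel runs in the
// process; changing it later has no effect. When it is unset or not "1", the
// Eigen::half specialization below up-casts the inputs to float, accumulates
// in float, and casts the result back to Eigen::half.
//
// Illustrative sketch (hypothetical call site, not part of this header) for
// enabling fp16 accumulation programmatically before any kernels run:
//
//   setenv("TF_CONV2D_USE_FP16_ACCUMULATE", "1", /*overwrite=*/1);  // POSIX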

namespace tensorflow {
namespace functor {

template <typename Device, typename Input, typename Filter, typename Output,
          typename OutputKernel>
void SpatialConvolutionFunc(const Device& d, Output output, Input input,
                            Filter filter, int row_stride, int col_stride,
                            int row_dilation, int col_dilation,
                            const Eigen::PaddingType& padding,
                            const OutputKernel& output_kernel,
                            int padding_top = 0, int padding_bottom = 0,
                            int padding_left = 0, int padding_right = 0) {
  // Need to swap row/col, padding_top/padding_left, and
  // padding_bottom/padding_right when calling Eigen. Eigen expects the tensor
  // in NWHC format, but the tensor given is in NHWC.
  output.device(d) = Eigen::SpatialConvolution(
      input, filter, col_stride, row_stride, padding, col_dilation,
      row_dilation, output_kernel, padding_left, padding_right, padding_top,
      padding_bottom);
}

// TODO(ezhulenev): Non-templated `operator()` overloads are required by
// explicit template instantiations for the GPU device. However, they are
// almost certainly not used in any of the kernel implementations. Check if
// they can be removed.
template <typename Device, typename T,
          typename OutputKernel = const Eigen::NoOpOutputKernel>
struct SpatialConvolution {
  void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
                  typename TTypes<T, 4>::ConstTensor input,
                  typename TTypes<T, 4>::ConstTensor filter, int row_stride,
                  int col_stride, int row_dilation, int col_dilation,
                  const Eigen::PaddingType& padding,
                  const OutputKernel& output_kernel = OutputKernel()) {
    SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride,
                           row_dilation, col_dilation, padding, output_kernel);
  }

  template <typename Input, typename Filter, typename Output>
  void operator()(const Device& d, Output output, Input input, Filter filter,
                  int row_stride, int col_stride, int row_dilation,
                  int col_dilation, const Eigen::PaddingType& padding,
                  const OutputKernel& output_kernel = OutputKernel()) {
    SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride,
                           row_dilation, col_dilation, padding, output_kernel);
  }

  void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
                  typename TTypes<T, 4>::ConstTensor input,
                  typename TTypes<T, 4>::ConstTensor filter, int row_stride,
                  int col_stride, int row_dilation, int col_dilation,
                  int padding_top, int padding_bottom, int padding_left,
                  int padding_right,
                  const OutputKernel& output_kernel = OutputKernel()) {
    SpatialConvolutionFunc(
        d, output, input, filter, row_stride, col_stride, row_dilation,
        col_dilation, Eigen::PaddingType::PADDING_VALID, output_kernel,
        padding_top, padding_bottom, padding_left, padding_right);
  }

  template <typename Input, typename Filter, typename Output>
  void operator()(const Device& d, Output output, Input input, Filter filter,
                  int row_stride, int col_stride, int row_dilation,
                  int col_dilation, int padding_top, int padding_bottom,
                  int padding_left, int padding_right,
                  const OutputKernel& output_kernel = OutputKernel()) {
    SpatialConvolutionFunc(
        d, output, input, filter, row_stride, col_stride, row_dilation,
        col_dilation, Eigen::PaddingType::PADDING_VALID, output_kernel,
        padding_top, padding_bottom, padding_left, padding_right);
  }
};
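
// Example (illustrative usage sketch, not taken from a specific kernel):
// assuming `CPUDevice` is an alias for Eigen::ThreadPoolDevice, `device` is
// such a device, and `output`, `input` and `filter` are rank-4 NHWC / HWIO
// Tensors, the functor above might be invoked as
//
//   functor::SpatialConvolution<CPUDevice, float>()(
//       device, output.tensor<float, 4>(), input.tensor<float, 4>(),
//       filter.tensor<float, 4>(), /*row_stride=*/1, /*col_stride=*/1,
//       /*row_dilation=*/1, /*col_dilation=*/1, Eigen::PADDING_SAME);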

template <typename Device, typename OutputKernel>
struct SpatialConvolution<Device, Eigen::half, OutputKernel> {
  void operator()(const Device& d,
                  typename TTypes<Eigen::half, 4>::Tensor output,
                  typename TTypes<Eigen::half, 4>::ConstTensor input,
                  typename TTypes<Eigen::half, 4>::ConstTensor filter,
                  int row_stride, int col_stride, int row_dilation,
                  int col_dilation, const Eigen::PaddingType& padding,
                  const OutputKernel& output_kernel = OutputKernel()) {
    if (Conv2dUseFp16Accumulate()) {
      output.device(d) = Eigen::SpatialConvolution(
          input, filter, col_stride, row_stride, padding, col_dilation,
          row_dilation, output_kernel);
    } else {
      output.device(d) =
          Eigen::SpatialConvolution(input.cast<float>(), filter.cast<float>(),
                                    col_stride, row_stride, padding,
                                    col_dilation, row_dilation, output_kernel)
              .template cast<Eigen::half>();
    }
  }

  template <typename Input, typename Filter, typename Output>
  void operator()(const Device& d, Output output, Input input, Filter filter,
                  int row_stride, int col_stride, int row_dilation,
                  int col_dilation, const Eigen::PaddingType& padding,
                  const OutputKernel& output_kernel = OutputKernel()) {
    if (Conv2dUseFp16Accumulate()) {
      output.device(d) = Eigen::SpatialConvolution(
          input, filter, col_stride, row_stride, padding, col_dilation,
          row_dilation, output_kernel);
    } else {
      output.device(d) =
          Eigen::SpatialConvolution(input.template cast<float>(),
                                    filter.template cast<float>(), col_stride,
                                    row_stride, padding, col_dilation,
                                    row_dilation, output_kernel)
              .template cast<Eigen::half>();
    }
  }

  void operator()(const Device& d,
                  typename TTypes<Eigen::half, 4>::Tensor output,
                  typename TTypes<Eigen::half, 4>::ConstTensor input,
                  typename TTypes<Eigen::half, 4>::ConstTensor filter,
                  int row_stride, int col_stride, int row_dilation,
                  int col_dilation, int padding_top, int padding_bottom,
                  int padding_left, int padding_right,
                  const OutputKernel& output_kernel = OutputKernel()) {
    if (Conv2dUseFp16Accumulate()) {
      output.device(d) = Eigen::SpatialConvolution(
          input, filter, col_stride, row_stride,
          Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation,
          output_kernel, padding_left, padding_right, padding_top,
          padding_bottom);
    } else {
      output.device(d) =
          Eigen::SpatialConvolution(
              input.cast<float>(), filter.cast<float>(), col_stride, row_stride,
              Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation,
              output_kernel, padding_left, padding_right, padding_top,
              padding_bottom)
              .template cast<Eigen::half>();
    }
  }

  template <typename Input, typename Filter, typename Output>
  void operator()(const Device& d, Output output, Input input, Filter filter,
                  int row_stride, int col_stride, int row_dilation,
                  int col_dilation, int padding_top, int padding_bottom,
                  int padding_left, int padding_right,
                  const OutputKernel& output_kernel = OutputKernel()) {
    if (Conv2dUseFp16Accumulate()) {
      output.device(d) = Eigen::SpatialConvolution(
          input, filter, col_stride, row_stride,
          Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation,
          output_kernel, padding_left, padding_right, padding_top,
          padding_bottom);
    } else {
      output.device(d) =
          Eigen::SpatialConvolution(
              input.template cast<float>(), filter.template cast<float>(),
              col_stride, row_stride, Eigen::PaddingType::PADDING_VALID,
              col_dilation, row_dilation, output_kernel, padding_left,
              padding_right, padding_top, padding_bottom)
              .template cast<Eigen::half>();
    }
  }
};

template <typename Device, typename T>
struct SpatialConvolutionBackwardInputFunc {
  void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward,
                  typename TTypes<T, 4>::ConstTensor filter,
                  typename TTypes<T, 4>::ConstTensor output_backward,
                  Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
                  Eigen::DenseIndex col_dilation,
                  Eigen::DenseIndex row_dilation) {
    input_backward.device(d) = Eigen::SpatialConvolutionBackwardInput(
        filter, output_backward, input_backward.dimension(2),
        input_backward.dimension(1), col_stride, row_stride, col_dilation,
        row_dilation);
  }
};
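
// Note: as in SpatialConvolutionFunc above, the functor above forwards its
// col_*/row_* arguments to Eigen in swapped order and reads the spatial
// extents of `input_backward` as dimension(2) then dimension(1), because the
// tensors are NHWC while Eigen expects the transposed spatial order.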

// GPU version requires all tensors to be indexable by int32.
template <typename T>
struct SpatialConvolutionBackwardInputFunc<Eigen::GpuDevice, T> {
  void operator()(const Eigen::GpuDevice& d,
                  typename TTypes<T, 4>::Tensor input_backward,
                  typename TTypes<T, 4>::ConstTensor filter,
                  typename TTypes<T, 4>::ConstTensor output_backward,
                  Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
                  Eigen::DenseIndex col_dilation,
                  Eigen::DenseIndex row_dilation) {
    To32Bit(input_backward).device(d) = Eigen::SpatialConvolutionBackwardInput(
        To32Bit(filter), To32Bit(output_backward), input_backward.dimension(2),
        input_backward.dimension(1), col_stride, row_stride, col_dilation,
        row_dilation);
  }
};

template <typename Device, typename T>
struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc {
  void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward,
                  typename TTypes<T, 4>::ConstTensor filter,
                  typename TTypes<T, 4>::ConstTensor output_backward,
                  Eigen::DenseIndex padded_cols, Eigen::DenseIndex padded_rows,
                  Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
                  Eigen::DenseIndex col_dilation,
                  Eigen::DenseIndex row_dilation, Eigen::DenseIndex pad_left,
                  Eigen::DenseIndex pad_top) {
    // We have to slice the result of the spatial convolution backward input
    // before assigning it to `input_backward`, in order to remove the padding.
    //
    // TODO(ezhulenev): Pass explicit paddings to Eigen and do not materialize
    // intermediate result in memory before slicing.
    input_backward.device(d) =
        Eigen::SpatialConvolutionBackwardInput(
            filter, output_backward, padded_cols, padded_rows, col_stride,
            row_stride, col_dilation, row_dilation)
            .eval()
            .slice(Eigen::DSizes<Eigen::DenseIndex, 4>{0, pad_left, pad_top, 0},
                   input_backward.dimensions());
  }
};
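
// Worked example (illustrative numbers, not taken from a specific caller):
// for a 28x28 (rows x cols) input with pad_top = pad_bottom = 2 and
// pad_left = pad_right = 3, the caller would pass padded_rows = 32 and
// padded_cols = 34; the backward-input result is materialized at that padded
// size, and the 28x28 interior window offset by the paddings is then sliced
// out into `input_backward`, discarding the padded border.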

// GPU version requires all tensors to be indexable by int32.
template <typename T>
struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc<Eigen::GpuDevice,
                                                              T> {
  void operator()(const Eigen::GpuDevice& d,
                  typename TTypes<T, 4>::Tensor input_backward,
                  typename TTypes<T, 4>::ConstTensor filter,
                  typename TTypes<T, 4>::ConstTensor output_backward,
                  Eigen::DenseIndex padded_cols, Eigen::DenseIndex padded_rows,
                  Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
                  Eigen::DenseIndex col_dilation,
                  Eigen::DenseIndex row_dilation, Eigen::DenseIndex pad_left,
                  Eigen::DenseIndex pad_top) {
    To32Bit(input_backward).device(d) =
        Eigen::SpatialConvolutionBackwardInput(
            To32Bit(filter), To32Bit(output_backward), padded_cols, padded_rows,
            col_stride, row_stride, col_dilation, row_dilation)
            .eval()
            .slice(Eigen::DSizes<Eigen::DenseIndex, 4>{0, pad_left, pad_top, 0},
                   input_backward.dimensions());
  }
};

// TODO(vrv): Figure out how to use the MatMulFunctor in matmul_op.h.
// My initial attempt to do this compiled but failed in the pytest
// due to a swigdeps error.
template <typename Device, typename T,
          typename OutputKernel = const Eigen::NoOpOutputKernel>
struct MatMulConvFunctor {
  // Computes on device "d": out = in0 * in1, where * is matrix
  // multiplication.
  void operator()(
      const Device& d, typename TTypes<T, 2>::Tensor out,
      typename TTypes<T, 2>::ConstTensor in0,
      typename TTypes<T, 2>::ConstTensor in1,
      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
      const OutputKernel& output_kernel = OutputKernel()) {
    out.device(d) = in0.contract(in1, dim_pair, output_kernel);
  }
};
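
// Example (illustrative): a plain [M, K] x [K, N] matrix multiply contracts
// dimension 1 of `in0` with dimension 0 of `in1`:
//
//   Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
//   dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
//   MatMulConvFunctor<CPUDevice, float>()(device, out, in0, in1, dim_pair);
//
// (`CPUDevice` as an alias for Eigen::ThreadPoolDevice, and the `device`,
// `out`, `in0`, `in1` names, are assumptions of this sketch.)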

// Shuffles a filter tensor from TensorFlow format HWIO to dst_filter_format.
//
// Note: Currently supports OIHW and OHWI destination formats.
template <typename Device, typename T, typename IndexType, int NDIMS>
struct TransformFilter {
  void operator()(const Device& d, FilterTensorFormat dst_filter_format,
                  typename TTypes<T, NDIMS, IndexType>::ConstTensor in,
                  typename TTypes<T, NDIMS, IndexType>::Tensor out) {
    // NOTE: Source filter format is always HWIO.
    Eigen::DSizes<IndexType, NDIMS - 2> spatial_dims;
    for (int i = 0; i < spatial_dims.rank(); ++i) {
      spatial_dims[i] = in.dimension(i);
    }

    // Merge the spatial dimensions together to speed up the shuffle operation.
    Eigen::DSizes<IndexType, 3> merged_dims;
    merged_dims[0] = spatial_dims.TotalSize();  // product of spatial dims [H*W]
    merged_dims[1] = in.dimension(NDIMS - 2);   // input filters           [I]
    merged_dims[2] = in.dimension(NDIMS - 1);   // output filters          [O]

    // Shuffle tensor with merged spatial dimensions.
    Eigen::DSizes<IndexType, 3> shuffling_perm;
    // Expand shuffled tensor into final dimensions.
    Eigen::DSizes<IndexType, NDIMS> expanded_dims;

    if (dst_filter_format == FORMAT_OIHW) {
      shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 1, 0);

      expanded_dims[0] = merged_dims[2];  // [O]
      expanded_dims[1] = merged_dims[1];  // [I]
      for (int i = 0; i < spatial_dims.rank(); ++i) {
        expanded_dims[2 + i] = spatial_dims[i];
      }

    } else if (dst_filter_format == FORMAT_OHWI) {
      shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 0, 1);

      expanded_dims[0] = merged_dims[2];          // [O]
      expanded_dims[NDIMS - 1] = merged_dims[1];  // [I]
      for (int i = 0; i < spatial_dims.rank(); ++i) {
        expanded_dims[1 + i] = spatial_dims[i];
      }

    } else {
      DCHECK(false) << "Unsupported destination filter format: "
                    << ToString(dst_filter_format);
    }

    out.device(d) =
        in.reshape(merged_dims).shuffle(shuffling_perm).reshape(expanded_dims);
  }
};
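
// Example (illustrative shapes): a 2-D HWIO filter of shape [3, 3, 64, 128]
// ([H, W, I, O]) is reshaped to merged_dims = [9, 64, 128], shuffled, and
// expanded to [128, 64, 3, 3] for FORMAT_OIHW or to [128, 3, 3, 64] for
// FORMAT_OHWI.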

// TODO: This functor is not used anywhere and should be removed, but it
// defines some Eigen templates that are referenced by other kernels.
template <typename Device, typename T, typename IndexType>
struct TransformDepth {
  void operator()(const Device& d,
                  typename TTypes<T, 4, IndexType>::ConstTensor in,
                  const Eigen::DSizes<IndexType, 4>& shuffle,
                  typename TTypes<T, 4, IndexType>::Tensor out) {
    Eigen::DSizes<IndexType, 3> merged_dims;
    Eigen::DSizes<IndexType, 4> expanded_dims;
    Eigen::DSizes<IndexType, 3> new_shuffle;

    // Merge dimensions that won't be shuffled together to speed things up.
    if (shuffle[1] == 2 && shuffle[2] == 3) {
      merged_dims[0] = in.dimension(0);
      merged_dims[1] = in.dimension(1);
      merged_dims[2] = in.dimension(2) * in.dimension(3);
      new_shuffle[0] = shuffle[0];
      new_shuffle[1] = 2;
      new_shuffle[2] = shuffle[3];
      expanded_dims[0] = in.dimension(shuffle[0]);
      expanded_dims[1] = in.dimension(2);
      expanded_dims[2] = in.dimension(3);
      expanded_dims[3] = in.dimension(shuffle[3]);
    } else if (shuffle[0] == 2 && shuffle[1] == 3) {
      merged_dims[0] = in.dimension(0);
      merged_dims[1] = in.dimension(1);
      merged_dims[2] = in.dimension(2) * in.dimension(3);
      new_shuffle[0] = 2;
      new_shuffle[1] = shuffle[2];
      new_shuffle[2] = shuffle[3];
      expanded_dims[0] = in.dimension(2);
      expanded_dims[1] = in.dimension(3);
      expanded_dims[2] = in.dimension(shuffle[2]);
      expanded_dims[3] = in.dimension(shuffle[3]);
    } else if (shuffle[0] == 0 && shuffle[1] == 3 && shuffle[2] == 1 &&
               shuffle[3] == 2) {
      merged_dims[0] = in.dimension(0);
      merged_dims[1] = in.dimension(1) * in.dimension(2);
      merged_dims[2] = in.dimension(3);
      new_shuffle[0] = 0;
      new_shuffle[1] = 2;
      new_shuffle[2] = 1;
      expanded_dims[0] = in.dimension(0);
      expanded_dims[1] = in.dimension(3);
      expanded_dims[2] = in.dimension(1);
      expanded_dims[3] = in.dimension(2);
    } else {
      assert(false && "unexpected shuffle");
    }

    out.device(d) =
        in.reshape(merged_dims).shuffle(new_shuffle).reshape(expanded_dims);
  }
};

// Note on the use of const reference for the "padding_value" argument
//
// In the ROCm TF build,
// ++ the call(s) to the functor are in the files (conv_*.cc) that are compiled
//    by the "CPU" compiler, while
// ++ the GPUDevice specific template instantiations are in the files that are
//    compiled by the "GPU" compiler.
//
// For T == Eigen::half, the value of the "padding_value" argument (when it was
// pass-by-value) was getting corrupted, leading to regressions in the
// convolution unit tests.
//
// I do not understand the exact reason for this, but based on similar past
// issues, it is likely due to a combination of
// ++ an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for
//    Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler
//    (hipclang, which is based on the latest clang), AND
// ++ Eigen::half having the same size but different internals on the CPU and
//    GPU sides (unsigned short on CPU, union {unsigned short, _Float16} on
//    GPU).
//
// Changing the "padding_value" argument to be a const reference type seems to
// suppress the bug.
template <typename Device, typename T, typename IndexType, int NDIMS>
struct PadInput {
  void operator()(const Device& d,
                  typename TTypes<T, NDIMS, IndexType>::ConstTensor in,
                  const std::array<int, NDIMS - 2>& padding_left,
                  const std::array<int, NDIMS - 2>& padding_right,
                  typename TTypes<T, NDIMS, IndexType>::Tensor out,
                  TensorFormat format, const T& padding_value) {
    Eigen::array<Eigen::IndexPair<IndexType>, NDIMS> padding;
    padding[GetTensorDimIndex<NDIMS - 2>(format, 'N')] = {0, 0};
    for (int i = 0; i < NDIMS - 2; ++i) {
      padding[GetTensorDimIndex<NDIMS - 2>(format, '0' + i)] = {
          padding_left[i], padding_right[i]};
    }
    padding[GetTensorDimIndex<NDIMS - 2>(format, 'C')] = {0, 0};
    out.device(d) = in.pad(padding, padding_value);
  }
};
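
// Example (illustrative): for a 4-D NHWC input, GetTensorDimIndex maps
// 'N' -> 0, '0' -> 1, '1' -> 2 and 'C' -> 3, so the Eigen paddings are
//   {{0, 0},
//    {padding_left[0], padding_right[0]},   // spatial dim 0
//    {padding_left[1], padding_right[1]},   // spatial dim 1
//    {0, 0}},
// i.e. each spatial dimension i is padded by padding_left[i] elements before
// and padding_right[i] elements after, while the batch and channel dimensions
// are left untouched.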

// Converts a tensor from:
//   [batch, <spatial>, filters]
// to:
//   [batch, filters, <spatial>]
template <typename Device, typename T, int NDIMS>
struct NHWCToNCHW {
  void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
                  typename TTypes<T, NDIMS>::Tensor out);
};

// Converts a tensor from:
//   [batch, filters, <spatial>]
// to:
//   [batch, <spatial>, filters]
template <typename Device, typename T, int NDIMS>
struct NCHWToNHWC {
  void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
                  typename TTypes<T, NDIMS>::Tensor out);
};

// Converts a tensor from:
//   [dim0, dim1, dim2]
// to:
//   [dim0, dim2, dim1]
template <typename Device, typename T, bool conjugate = false>
struct SwapDimension1And2InTensor3 {
  void operator()(const Device& d, const T* in,
                  const gtl::ArraySlice<int64_t>& input_dims, T* out);
};

// Converts a tensor from:
//   [dim0, dim1, dim2]
// to:
//   [dim2, dim1, dim0]
template <typename Device, typename T, bool conjugate = false>
struct SwapDimension0And2InTensor3 {
  void operator()(const Device& d, const T* in,
                  const gtl::ArraySlice<int64_t>& input_dims, T* out);
};

// Transforms back a filter from OIHW or OHWI to HWIO format to reverse the
// effect of TransformFilter above.
template <typename Device, typename T, int NDIMS>
struct ReverseTransformFilter {
  void operator()(const Device& d, FilterTensorFormat src_filter_format,
                  typename TTypes<T, NDIMS>::ConstTensor in,
                  typename TTypes<T, NDIMS>::Tensor out);
};

}  // namespace functor

template <class T>
class ConvAlgorithmMap;

template <>
class ConvAlgorithmMap<Eigen::ThreadPoolDevice> {};
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_CONV_2D_H_