// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H

namespace Eigen {

/** \class TensorPadding
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor padding class.
  * At the moment only padding with a constant value is supported.
  *
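  * A minimal usage sketch (assuming a rank-2 float tensor `input`; the second
  * argument to pad() is the padding value and defaults to zero when omitted):
  * \code
  * Eigen::array<std::pair<int, int>, 2> paddings;
  * paddings[0] = std::make_pair(1, 2);  // pad 1 element before and 2 after dimension 0
  * paddings[1] = std::make_pair(0, 3);  // pad 3 elements after dimension 1
  * Eigen::Tensor<float, 2> padded = input.pad(paddings, -1.0f);
  * \endcode
  *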
  */
namespace internal {
template<typename PaddingDimensions, typename XprType>
struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename PaddingDimensions, typename XprType>
struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
{
  typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
};

template<typename PaddingDimensions, typename XprType>
struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
{
  typedef TensorPaddingOp<PaddingDimensions, XprType> type;
};

}  // end namespace internal



template<typename PaddingDimensions, typename XprType>
class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
{
  public:
  typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value)
      : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}

    EIGEN_DEVICE_FUNC
    const PaddingDimensions& padding() const { return m_padding_dims; }
    EIGEN_DEVICE_FUNC
    Scalar padding_value() const { return m_padding_value; }

    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename XprType::Nested>::type&
    expression() const { return m_xpr; }

  protected:
    typename XprType::Nested m_xpr;
    const PaddingDimensions m_padding_dims;
    const Scalar m_padding_value;
};


// Eval as rvalue
template<typename PaddingDimensions, typename ArgType, typename Device>
struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
{
  typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<PaddingDimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

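  // Note: BlockAccess requires raw access to the argument's data, because
  // block() copies directly from m_impl.data() into the materialized block.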
  enum {
    IsAligned         = true,
    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    Layout            = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess       = true,
    RawAccess         = false
  };

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                     Layout, Index>
      TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
  {
    // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
    // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
    // of 1 element first and then pad.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Compute dimensions
    m_dimensions = m_impl.dimensions();
    for (int i = 0; i < NumDims; ++i) {
      m_dimensions[i] += m_padding[i].first + m_padding[i].second;
    }
    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
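    // Note: m_outputStrides has NumDims+1 entries. For ColMajor the stride of
    // dimension i is stored at index i and the last entry holds the total output
    // size; for RowMajor the stride of dimension i is stored at index i+1 and
    // entry 0 holds the total output size. The extra entry lets the packet
    // routines read m_outputStrides[i+1] without special cases.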
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStrides[0] = 1;
      m_outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
      }
      m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
    } else {
      m_inputStrides[NumDims - 1] = 1;
      m_outputStrides[NumDims] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
        m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
      }
      m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
      EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif  // EIGEN_USE_THREADS

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(index < dimensions().TotalSize());
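    // Decompose the output index one dimension at a time (outermost first):
    // if the coordinate in any dimension falls into a left/right padding
    // region, return the padding value; otherwise shift it by the left padding
    // and accumulate the corresponding input index.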
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      EIGEN_UNROLL_LOOP
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStrides[i];
        if (isPaddingAtIndexForDim(idx, i)) {
          return m_paddingValue;
        }
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (isPaddingAtIndexForDim(index, 0)) {
        return m_paddingValue;
      }
      inputIndex += (index - m_padding[0].first);
    } else {
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStrides[i+1];
        if (isPaddingAtIndexForDim(idx, i)) {
          return m_paddingValue;
        }
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
        index -= idx * m_outputStrides[i+1];
      }
      if (isPaddingAtIndexForDim(index, NumDims-1)) {
        return m_paddingValue;
      }
      inputIndex += (index - m_padding[NumDims-1].first);
    }
    return m_impl.coeff(inputIndex);
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      return packetColMajor(index);
    }
    return packetRowMajor(index);
  }

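  // The cost model scales the argument's per-coefficient cost by the fraction
  // of output coefficients that actually come from the input (in/out per
  // dimension); padded coefficients only pay for the index computations.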
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    TensorOpCost cost = m_impl.costPerCoeff(vectorized);
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < NumDims; ++i)
        updateCostPerDimension(cost, i, i == 0);
    } else {
      EIGEN_UNROLL_LOOP
      for (int i = NumDims - 1; i >= 0; --i)
        updateCostPerDimension(cost, i, i == NumDims - 1);
    }
    return cost;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  internal::TensorBlockResourceRequirements getResourceRequirements() const {
    const size_t target_size = m_device.lastLevelCacheSize();
    return internal::TensorBlockResourceRequirements::merge(
        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
        m_impl.getResourceRequirements());
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
          bool /*root_of_expr_ast*/ = false) const {
    // If one of the dimensions is zero, return an empty block view.
    if (desc.size() == 0) {
      return TensorBlock(internal::TensorBlockKind::kView, NULL,
                           desc.dimensions());
    }

    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;

    Index offset = desc.offset();

    // Compute offsets in the output tensor corresponding to the desc.offset().
    DSizes<Index, NumDims> output_offsets;
    for (int i = NumDims - 1; i > 0; --i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      const int stride_dim = IsColMajor ? dim : dim + 1;
      output_offsets[dim] = offset / m_outputStrides[stride_dim];
      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
    }
    output_offsets[inner_dim_idx] = offset;

    // Offsets in the input corresponding to output offsets.
    DSizes<Index, NumDims> input_offsets = output_offsets;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
    }

    // Compute the offset in the input buffer. At this point it might be invalid
    // and point outside of the input buffer, because we don't check for negative
    // offsets; it will be corrected in the block iteration loop below.
    Index input_offset = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      input_offset += input_offsets[dim] * m_inputStrides[dim];
    }

    // Destination buffer and scratch buffer are both indexed from 0 and have
    // the same dimensions as the requested block (for the destination buffer
    // this property is guaranteed by `desc.destination()`).
    Index output_offset = 0;
    const DSizes<Index, NumDims> output_strides =
        internal::strides<Layout>(desc.dimensions());

    // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
    // dimensions, skipping the innermost dimension. In theory it should be
    // possible to squeeze matching innermost dimensions, however in practice
    // that did not show any improvements in benchmarks. Also in practice the
    // first outer dimension usually has padding, which prevents squeezing.

    // Initialize output block iterator state. Dimensions in this array are
    // always in innermost -> outermost order (col major layout).
    array<BlockIteratorState, NumDims - 1> it;
    for (int i = 0; i < NumDims - 1; ++i) {
      const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
      it[i].count = 0;
      it[i].size = desc.dimension(dim);

      it[i].input_stride = m_inputStrides[dim];
      it[i].input_span = it[i].input_stride * (it[i].size - 1);

      it[i].output_stride = output_strides[dim];
      it[i].output_span = it[i].output_stride * (it[i].size - 1);
    }

    const Index input_inner_dim_size =
        static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);

    // Total output size.
    const Index output_size = desc.size();

    // We will fill the inner dimension of this size in the output. It might be
    // larger than the inner dimension of the input, so we might have to pad
    // before/after copying values from the input inner dimension.
    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);

    // How many values to fill with padding BEFORE reading from the input inner
    // dimension.
    const Index output_inner_pad_before_size =
        input_offsets[inner_dim_idx] < 0
            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
                           output_inner_dim_size)
            : 0;

    // How many values we can actually copy from the input inner dimension.
    const Index output_inner_copy_size = numext::mini(
        // Want to copy from input.
        (output_inner_dim_size - output_inner_pad_before_size),
        // Can copy from input.
        numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
                                             output_inner_pad_before_size),
                     Index(0)));

    eigen_assert(output_inner_copy_size >= 0);

    // How many values to fill with padding AFTER reading from the input inner
    // dimension.
    const Index output_inner_pad_after_size =
        (output_inner_dim_size - output_inner_copy_size -
         output_inner_pad_before_size);

    // Sanity check, sum of all sizes must be equal to the output size.
    eigen_assert(output_inner_dim_size ==
                 (output_inner_pad_before_size + output_inner_copy_size +
                  output_inner_pad_after_size));

    // Keep track of current coordinates and padding in the output.
    DSizes<Index, NumDims> output_coord = output_offsets;
    DSizes<Index, NumDims> output_padded;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
    }

    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;

    // Prepare storage for the materialized padding result.
    const typename TensorBlock::Storage block_storage =
        TensorBlock::prepareStorage(desc, scratch);

    // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
    // single logical inner dimension.

    // When possible we squeeze writes for the innermost (non-padded) dimension
    // together with the first padded dimension. This allows us to reduce the
    // number of calls to LinCopy and to make better use of vector instructions.
    const bool squeeze_writes =
        NumDims > 1 &&
        // inner dimension is not padded
        (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
        // and equal to the block inner dimension
        (input_inner_dim_size == output_inner_dim_size);

    const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;

    // Maximum coordinate on a squeeze dimension that we can write to.
    const Index squeeze_max_coord =
        squeeze_writes ? numext::mini(
                             // max non-padded element in the input
                             static_cast<Index>(m_dimensions[squeeze_dim] -
                                                m_padding[squeeze_dim].second),
                             // max element in the output buffer
                             static_cast<Index>(output_offsets[squeeze_dim] +
                                                desc.dimension(squeeze_dim)))
                       : static_cast<Index>(0);

    // Iterate copying data from `m_impl.data()` to the output buffer.
    for (Index size = 0; size < output_size;) {
      // Detect if we are in the padded region (excluding the innermost dimension).
      bool is_padded = false;
      for (int j = 1; j < NumDims; ++j) {
        const int dim = IsColMajor ? j : NumDims - j - 1;
        is_padded = output_padded[dim];
        if (is_padded) break;
      }

      if (is_padded) {
        // Fill single innermost dimension with padding value.
        size += output_inner_dim_size;

        LinCopy::template Run<LinCopy::Kind::FillLinear>(
            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
            typename LinCopy::Src(0, 0, &m_paddingValue),
            output_inner_dim_size);


      } else if (squeeze_writes) {
        // Squeeze multiple reads from innermost dimensions.
        const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
        size += output_inner_dim_size * squeeze_num;

        // Copy `squeeze_num` inner dimensions from input to output.
        LinCopy::template Run<LinCopy::Kind::Linear>(
            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
            typename LinCopy::Src(input_offset, 1, m_impl.data()),
            output_inner_dim_size * squeeze_num);

        // Update iteration state for only `squeeze_num - 1` processed inner
        // dimensions, because we have another iteration state update at the end
        // of the loop that will update iteration state for the last inner
        // processed dimension.
        it[0].count += (squeeze_num - 1);
        input_offset += it[0].input_stride * (squeeze_num - 1);
        output_offset += it[0].output_stride * (squeeze_num - 1);
        output_coord[squeeze_dim] += (squeeze_num - 1);

      } else {
        // Single read from innermost dimension.
        size += output_inner_dim_size;

        {  // Fill with padding before copying from input inner dimension.
          const Index out = output_offset;

          LinCopy::template Run<LinCopy::Kind::FillLinear>(
              typename LinCopy::Dst(out, 1, block_storage.data()),
              typename LinCopy::Src(0, 0, &m_paddingValue),
              output_inner_pad_before_size);
        }

        {  // Copy data from input inner dimension.
          const Index out = output_offset + output_inner_pad_before_size;
          const Index in = input_offset + output_inner_pad_before_size;

          eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);

          LinCopy::template Run<LinCopy::Kind::Linear>(
              typename LinCopy::Dst(out, 1, block_storage.data()),
              typename LinCopy::Src(in, 1, m_impl.data()),
              output_inner_copy_size);
        }

        {  // Fill with padding after copying from input inner dimension.
          const Index out = output_offset + output_inner_pad_before_size +
                            output_inner_copy_size;

          LinCopy::template Run<LinCopy::Kind::FillLinear>(
              typename LinCopy::Dst(out, 1, block_storage.data()),
              typename LinCopy::Src(0, 0, &m_paddingValue),
              output_inner_pad_after_size);
        }
      }

      for (int j = 0; j < NumDims - 1; ++j) {
        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;

        if (++it[j].count < it[j].size) {
          input_offset += it[j].input_stride;
          output_offset += it[j].output_stride;
          output_coord[dim] += 1;
          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
          break;
        }
        it[j].count = 0;
        input_offset -= it[j].input_span;
        output_offset -= it[j].output_span;
        output_coord[dim] -= it[j].size - 1;
        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
      }
    }

    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }

#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0),
          size(0),
          input_stride(0),
          input_span(0),
          output_stride(0),
          output_span(0) {}

    Index count;
    Index size;
    Index input_stride;
    Index input_span;
    Index output_stride;
    Index output_span;
  };

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
      Index index, int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
    return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
            index < m_padding[dim_index].first) ||
        (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
         index >= m_dimensions[dim_index] - m_padding[dim_index].second);
#else
    return (index < m_padding[dim_index].first) ||
           (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
      int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
    return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
#else
    EIGEN_UNUSED_VARIABLE(dim_index);
    return false;
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
      int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
    return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
#else
    EIGEN_UNUSED_VARIABLE(dim_index);
    return false;
#endif
  }


  void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
    const double in = static_cast<double>(m_impl.dimensions()[i]);
    const double out = in + m_padding[i].first + m_padding[i].second;
    if (out == 0)
      return;
    const double reduction = in / out;
    cost *= reduction;
    if (first) {
      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
                    reduction * (1 * TensorOpCost::AddCost<Index>()));
    } else {
      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
                                 2 * TensorOpCost::MulCost<Index>() +
                    reduction * (2 * TensorOpCost::MulCost<Index>() +
                                 1 * TensorOpCost::DivCost<Index>()));
    }
  }

 protected:

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

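    // Fast paths: if the whole packet provably falls into a single padding
    // region, return a packet filled with the padding value; if it lies
    // entirely between the two padding regions in every dimension, load it
    // with a single unaligned packet read from the input. Any mixed case falls
    // back to the scalar gather in packetWithPossibleZero().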
    const Index initialIndex = index;
    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = NumDims - 1; i > 0; --i) {
      const Index firstIdx = index;
      const Index lastIdx = index + PacketSize - 1;
      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
      const Index lastPaddedRight = m_outputStrides[i+1];

      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
        // all the coefficients are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
        // all the coefficients are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
        // all the coefficients are between the 2 padding zones.
        const Index idx = index / m_outputStrides[i];
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      else {
        // Every other case
        return packetWithPossibleZero(initialIndex);
      }
    }

    const Index lastIdx = index + PacketSize - 1;
    const Index firstIdx = index;
    const Index lastPaddedLeft = m_padding[0].first;
    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
    const Index lastPaddedRight = m_outputStrides[1];

    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
      // all the coefficients are in the padding zone.
      return internal::pset1<PacketReturnType>(m_paddingValue);
    }
    else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
      // all the coefficients are in the padding zone.
      return internal::pset1<PacketReturnType>(m_paddingValue);
    }
    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
      // all the coefficients are between the 2 padding zones.
      inputIndex += (index - m_padding[0].first);
      return m_impl.template packet<Unaligned>(inputIndex);
    }
    // Every other case
    return packetWithPossibleZero(initialIndex);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index initialIndex = index;
    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < NumDims - 1; ++i) {
      const Index firstIdx = index;
      const Index lastIdx = index + PacketSize - 1;
      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
      const Index lastPaddedRight = m_outputStrides[i];

      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
        // all the coefficients are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
        // all the coefficients are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
        // all the coefficients are between the 2 padding zones.
        const Index idx = index / m_outputStrides[i+1];
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
        index -= idx * m_outputStrides[i+1];
      }
      else {
        // Every other case
        return packetWithPossibleZero(initialIndex);
      }
    }

    const Index lastIdx = index + PacketSize - 1;
    const Index firstIdx = index;
    const Index lastPaddedLeft = m_padding[NumDims-1].first;
    const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
    const Index lastPaddedRight = m_outputStrides[NumDims-1];

    if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
      // all the coefficients are in the padding zone.
      return internal::pset1<PacketReturnType>(m_paddingValue);
    }
    else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
      // all the coefficients are in the padding zone.
      return internal::pset1<PacketReturnType>(m_paddingValue);
    }
    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
      // all the coefficients are between the 2 padding zones.
      inputIndex += (index - m_padding[NumDims-1].first);
      return m_impl.template packet<Unaligned>(inputIndex);
    }
    // Every other case
    return packetWithPossibleZero(initialIndex);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
  {
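    // Scalar fallback: gather coefficients one by one through coeff(), which
    // substitutes the padding value where needed, into an aligned local
    // buffer, then load that buffer as a single packet.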
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  Dimensions m_dimensions;
  array<Index, NumDims+1> m_outputStrides;
  array<Index, NumDims> m_inputStrides;
  TensorEvaluator<ArgType, Device> m_impl;
  PaddingDimensions m_padding;

  Scalar m_paddingValue;

  const Device EIGEN_DEVICE_REF m_device;
};




} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H