// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H

namespace Eigen {

/** \class TensorPadding
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor padding class.
  * At the moment only padding with a constant value is supported.
  *
  */
namespace internal {
template<typename PaddingDimensions, typename XprType>
struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename PaddingDimensions, typename XprType>
struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
{
  typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
};

template<typename PaddingDimensions, typename XprType>
struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
{
  typedef TensorPaddingOp<PaddingDimensions, XprType> type;
};

} // end namespace internal


template<typename PaddingDimensions, typename XprType>
class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
{
  public:
    typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename XprType::CoeffReturnType CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value)
        : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}

    EIGEN_DEVICE_FUNC
    const PaddingDimensions& padding() const { return m_padding_dims; }
    EIGEN_DEVICE_FUNC
    Scalar padding_value() const { return m_padding_value; }

    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename XprType::Nested>::type&
    expression() const { return m_xpr; }

  protected:
    typename XprType::Nested m_xpr;
    const PaddingDimensions m_padding_dims;
    const Scalar m_padding_value;
};
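
// A minimal usage sketch (not part of the original header): padding is
// normally created through TensorBase::pad(), which builds a TensorPaddingOp.
// The tensor sizes and padding amounts below are arbitrary example values.
//
//   Eigen::Tensor<float, 2> input(3, 4);
//   input.setRandom();
//   Eigen::array<std::pair<int, int>, 2> paddings;
//   paddings[0] = std::make_pair(1, 2);   // 1 value before, 2 after dim 0
//   paddings[1] = std::make_pair(0, 3);   // 0 values before, 3 after dim 1
//   Eigen::Tensor<float, 2> padded = input.pad(paddings);          // pads with 0
//   Eigen::Tensor<float, 2> shifted = input.pad(paddings, -1.0f);  // custom value
//   // padded has dimensions (3 + 1 + 2) x (4 + 0 + 3) = 6 x 7.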

// Eval as rvalue
template<typename PaddingDimensions, typename ArgType, typename Device>
struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
{
  typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<PaddingDimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = true,
    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = true,
    RawAccess = false
  };

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                     Layout, Index>
      TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
  {
    // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
    // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
    // of 1 element first and then pad.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Compute dimensions
    m_dimensions = m_impl.dimensions();
    for (int i = 0; i < NumDims; ++i) {
      m_dimensions[i] += m_padding[i].first + m_padding[i].second;
    }
    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStrides[0] = 1;
      m_outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
      }
      m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
    } else {
      m_inputStrides[NumDims - 1] = 1;
      m_outputStrides[NumDims] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
        m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
      }
      m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
      EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif  // EIGEN_USE_THREADS

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
  }
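
  // Illustrative example (added for exposition, not in the original source) of
  // the index arithmetic used by coeff() below, assuming a column-major 2x3
  // input padded by (1,2) in dim 0 and (0,1) in dim 1:
  //   output dimensions: (2+1+2) x (3+0+1) = 5 x 4
  //   m_inputStrides  = {1, 2}
  //   m_outputStrides = {1, 5, 20}
  // Each output coordinate idx = index / m_outputStrides[i] is either inside a
  // padded border (return m_paddingValue) or is shifted by m_padding[i].first
  // and multiplied by the input stride to address the wrapped evaluator.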

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(index < dimensions().TotalSize());
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      EIGEN_UNROLL_LOOP
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStrides[i];
        if (isPaddingAtIndexForDim(idx, i)) {
          return m_paddingValue;
        }
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (isPaddingAtIndexForDim(index, 0)) {
        return m_paddingValue;
      }
      inputIndex += (index - m_padding[0].first);
    } else {
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStrides[i+1];
        if (isPaddingAtIndexForDim(idx, i)) {
          return m_paddingValue;
        }
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
        index -= idx * m_outputStrides[i+1];
      }
      if (isPaddingAtIndexForDim(index, NumDims-1)) {
        return m_paddingValue;
      }
      inputIndex += (index - m_padding[NumDims-1].first);
    }
    return m_impl.coeff(inputIndex);
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      return packetColMajor(index);
    }
    return packetRowMajor(index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    TensorOpCost cost = m_impl.costPerCoeff(vectorized);
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < NumDims; ++i)
        updateCostPerDimension(cost, i, i == 0);
    } else {
      EIGEN_UNROLL_LOOP
      for (int i = NumDims - 1; i >= 0; --i)
        updateCostPerDimension(cost, i, i == NumDims - 1);
    }
    return cost;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  internal::TensorBlockResourceRequirements getResourceRequirements() const {
    const size_t target_size = m_device.lastLevelCacheSize();
    return internal::TensorBlockResourceRequirements::merge(
        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
        m_impl.getResourceRequirements());
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
        bool /*root_of_expr_ast*/ = false) const {
    // If one of the dimensions is zero, return an empty block view.
    if (desc.size() == 0) {
      return TensorBlock(internal::TensorBlockKind::kView, NULL,
                         desc.dimensions());
    }

    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;

    Index offset = desc.offset();

    // Compute offsets in the output tensor corresponding to the desc.offset().
    DSizes<Index, NumDims> output_offsets;
    for (int i = NumDims - 1; i > 0; --i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      const int stride_dim = IsColMajor ? dim : dim + 1;
      output_offsets[dim] = offset / m_outputStrides[stride_dim];
      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
    }
    output_offsets[inner_dim_idx] = offset;

    // Offsets in the input corresponding to output offsets.
    DSizes<Index, NumDims> input_offsets = output_offsets;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
    }

    // Compute the offset in the input buffer. At this point it might be
    // illegal and point outside of the input buffer, because we don't check
    // for negative offsets; it will be corrected in the block iteration loop
    // below.
    Index input_offset = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      input_offset += input_offsets[dim] * m_inputStrides[dim];
    }

    // Destination buffer and scratch buffer both indexed from 0 and have the
    // same dimensions as the requested block (for destination buffer this
    // property is guaranteed by `desc.destination()`).
    Index output_offset = 0;
    const DSizes<Index, NumDims> output_strides =
        internal::strides<Layout>(desc.dimensions());

    // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
    // dimensions, skipping the innermost dimension. In theory it should be
    // possible to squeeze matching innermost dimensions, however in practice
    // that did not show any improvements in benchmarks. Also in practice the
    // first outer dimension usually has padding, which will prevent squeezing.

    // Initialize output block iterator state. Dimensions in this array are
    // always in innermost -> outermost order (col major layout).
    array<BlockIteratorState, NumDims - 1> it;
    for (int i = 0; i < NumDims - 1; ++i) {
      const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
      it[i].count = 0;
      it[i].size = desc.dimension(dim);

      it[i].input_stride = m_inputStrides[dim];
      it[i].input_span = it[i].input_stride * (it[i].size - 1);

      it[i].output_stride = output_strides[dim];
      it[i].output_span = it[i].output_stride * (it[i].size - 1);
    }

    const Index input_inner_dim_size =
        static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);

    // Total output size.
    const Index output_size = desc.size();

    // We will fill the inner dimension of this size in the output. It might be
    // larger than the inner dimension in the input, so we might have to pad
    // before/after we copy values from the input inner dimension.
    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);

    // How many values to fill with padding BEFORE reading from the input inner
    // dimension.
    const Index output_inner_pad_before_size =
        input_offsets[inner_dim_idx] < 0
            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
                           output_inner_dim_size)
            : 0;

    // How many values we can actually copy from the input inner dimension.
    const Index output_inner_copy_size = numext::mini(
        // Want to copy from input.
        (output_inner_dim_size - output_inner_pad_before_size),
        // Can copy from input.
        numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
                                             output_inner_pad_before_size),
                     Index(0)));

    eigen_assert(output_inner_copy_size >= 0);

    // How many values to fill with padding AFTER reading from the input inner
    // dimension.
    const Index output_inner_pad_after_size =
        (output_inner_dim_size - output_inner_copy_size -
         output_inner_pad_before_size);

    // Sanity check: the sum of all sizes must be equal to the output size.
    eigen_assert(output_inner_dim_size ==
                 (output_inner_pad_before_size + output_inner_copy_size +
                  output_inner_pad_after_size));

    // Keep track of current coordinates and padding in the output.
    DSizes<Index, NumDims> output_coord = output_offsets;
    DSizes<Index, NumDims> output_padded;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
    }

    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;

    // Prepare storage for the materialized padding result.
    const typename TensorBlock::Storage block_storage =
        TensorBlock::prepareStorage(desc, scratch);

    // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
    // single logical inner dimension.

    // When possible we squeeze writes for the innermost (only if non-padded)
    // dimension with the first padded dimension. This reduces the number of
    // calls to LinCopy and makes better use of vector instructions.
    const bool squeeze_writes =
        NumDims > 1 &&
        // inner dimension is not padded
        (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
        // and equal to the block inner dimension
        (input_inner_dim_size == output_inner_dim_size);

    const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;

    // Maximum coordinate on a squeeze dimension that we can write to.
    const Index squeeze_max_coord =
        squeeze_writes ? numext::mini(
                             // max non-padded element in the input
                             static_cast<Index>(m_dimensions[squeeze_dim] -
                                                m_padding[squeeze_dim].second),
                             // max element in the output buffer
                             static_cast<Index>(output_offsets[squeeze_dim] +
                                                desc.dimension(squeeze_dim)))
                       : static_cast<Index>(0);

    // Iterate copying data from `m_impl.data()` to the output buffer.
    for (Index size = 0; size < output_size;) {
      // Detect if we are in the padded region (exclude innermost dimension).
      bool is_padded = false;
      for (int j = 1; j < NumDims; ++j) {
        const int dim = IsColMajor ? j : NumDims - j - 1;
        is_padded = output_padded[dim];
        if (is_padded) break;
      }

      if (is_padded) {
        // Fill single innermost dimension with padding value.
        size += output_inner_dim_size;

        LinCopy::template Run<LinCopy::Kind::FillLinear>(
            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
            typename LinCopy::Src(0, 0, &m_paddingValue),
            output_inner_dim_size);

      } else if (squeeze_writes) {
        // Squeeze multiple reads from innermost dimensions.
        const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
        size += output_inner_dim_size * squeeze_num;

        // Copy `squeeze_num` inner dimensions from input to output.
        LinCopy::template Run<LinCopy::Kind::Linear>(
            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
            typename LinCopy::Src(input_offset, 1, m_impl.data()),
            output_inner_dim_size * squeeze_num);

        // Update iteration state for only `squeeze_num - 1` processed inner
        // dimensions, because we have another iteration state update at the
        // end of the loop that will update iteration state for the last inner
        // processed dimension.
        it[0].count += (squeeze_num - 1);
        input_offset += it[0].input_stride * (squeeze_num - 1);
        output_offset += it[0].output_stride * (squeeze_num - 1);
        output_coord[squeeze_dim] += (squeeze_num - 1);

      } else {
        // Single read from innermost dimension.
        size += output_inner_dim_size;

        { // Fill with padding before copying from input inner dimension.
          const Index out = output_offset;

          LinCopy::template Run<LinCopy::Kind::FillLinear>(
              typename LinCopy::Dst(out, 1, block_storage.data()),
              typename LinCopy::Src(0, 0, &m_paddingValue),
              output_inner_pad_before_size);
        }

        { // Copy data from input inner dimension.
          const Index out = output_offset + output_inner_pad_before_size;
          const Index in = input_offset + output_inner_pad_before_size;

          eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);

          LinCopy::template Run<LinCopy::Kind::Linear>(
              typename LinCopy::Dst(out, 1, block_storage.data()),
              typename LinCopy::Src(in, 1, m_impl.data()),
              output_inner_copy_size);
        }

        { // Fill with padding after copying from input inner dimension.
          const Index out = output_offset + output_inner_pad_before_size +
                            output_inner_copy_size;

          LinCopy::template Run<LinCopy::Kind::FillLinear>(
              typename LinCopy::Dst(out, 1, block_storage.data()),
              typename LinCopy::Src(0, 0, &m_paddingValue),
              output_inner_pad_after_size);
        }
      }

      for (int j = 0; j < NumDims - 1; ++j) {
        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;

        if (++it[j].count < it[j].size) {
          input_offset += it[j].input_stride;
          output_offset += it[j].output_stride;
          output_coord[dim] += 1;
          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
          break;
        }
        it[j].count = 0;
        input_offset -= it[j].input_span;
        output_offset -= it[j].output_span;
        output_coord[dim] -= it[j].size - 1;
        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
      }
    }

    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }

#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0),
          size(0),
          input_stride(0),
          input_span(0),
          output_stride(0),
          output_span(0) {}

    Index count;
    Index size;
    Index input_stride;
    Index input_span;
    Index output_stride;
    Index output_span;
  };

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
      Index index, int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
    return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
            index < m_padding[dim_index].first) ||
           (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
            index >= m_dimensions[dim_index] - m_padding[dim_index].second);
#else
    return (index < m_padding[dim_index].first) ||
           (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
      int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
    return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
#else
    EIGEN_UNUSED_VARIABLE(dim_index);
    return false;
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
      int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
    return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
#else
    EIGEN_UNUSED_VARIABLE(dim_index);
    return false;
#endif
  }

  void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
    const double in = static_cast<double>(m_impl.dimensions()[i]);
    const double out = in + m_padding[i].first + m_padding[i].second;
    if (out == 0)
      return;
    const double reduction = in / out;
    cost *= reduction;
    if (first) {
      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
                                 reduction * (1 * TensorOpCost::AddCost<Index>()));
    } else {
      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
                                 2 * TensorOpCost::MulCost<Index>() +
                                 reduction * (2 * TensorOpCost::MulCost<Index>() +
                                              1 * TensorOpCost::DivCost<Index>()));
    }
  }

 protected:
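
  // Note added for exposition (not in the original source): the vectorized
  // paths below load a whole packet at once. For each dimension they test
  // whether the packet lies entirely inside the left padding, entirely inside
  // the right padding, or entirely inside the interior; any packet that
  // straddles a boundary falls back to packetWithPossibleZero(), which gathers
  // the coefficients one by one through coeff().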
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index initialIndex = index;
    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = NumDims - 1; i > 0; --i) {
      const Index firstIdx = index;
      const Index lastIdx = index + PacketSize - 1;
      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
      const Index lastPaddedRight = m_outputStrides[i+1];

      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
        // all the coefficients are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
        // all the coefficients are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
        // all the coefficients are between the 2 padding zones.
        const Index idx = index / m_outputStrides[i];
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      else {
        // Every other case
        return packetWithPossibleZero(initialIndex);
      }
    }

    const Index lastIdx = index + PacketSize - 1;
    const Index firstIdx = index;
    const Index lastPaddedLeft = m_padding[0].first;
    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
    const Index lastPaddedRight = m_outputStrides[1];

    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
      // all the coefficients are in the padding zone.
      return internal::pset1<PacketReturnType>(m_paddingValue);
    }
    else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
      // all the coefficients are in the padding zone.
      return internal::pset1<PacketReturnType>(m_paddingValue);
    }
    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
      // all the coefficients are between the 2 padding zones.
      inputIndex += (index - m_padding[0].first);
      return m_impl.template packet<Unaligned>(inputIndex);
    }
    // Every other case
    return packetWithPossibleZero(initialIndex);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index initialIndex = index;
    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < NumDims - 1; ++i) {
      const Index firstIdx = index;
      const Index lastIdx = index + PacketSize - 1;
      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
      const Index lastPaddedRight = m_outputStrides[i];

      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
        // all the coefficients are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
        // all the coefficients are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
        // all the coefficients are between the 2 padding zones.
        const Index idx = index / m_outputStrides[i+1];
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
        index -= idx * m_outputStrides[i+1];
      }
      else {
        // Every other case
        return packetWithPossibleZero(initialIndex);
      }
    }

    const Index lastIdx = index + PacketSize - 1;
    const Index firstIdx = index;
    const Index lastPaddedLeft = m_padding[NumDims-1].first;
    const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
    const Index lastPaddedRight = m_outputStrides[NumDims-1];

    if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
      // all the coefficients are in the padding zone.
      return internal::pset1<PacketReturnType>(m_paddingValue);
    }
    else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
      // all the coefficients are in the padding zone.
      return internal::pset1<PacketReturnType>(m_paddingValue);
    }
    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
      // all the coefficients are between the 2 padding zones.
      inputIndex += (index - m_padding[NumDims-1].first);
      return m_impl.template packet<Unaligned>(inputIndex);
    }
    // Every other case
    return packetWithPossibleZero(initialIndex);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
  {
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  Dimensions m_dimensions;
  array<Index, NumDims+1> m_outputStrides;
  array<Index, NumDims> m_inputStrides;
  TensorEvaluator<ArgType, Device> m_impl;
  PaddingDimensions m_padding;

  Scalar m_paddingValue;

  const Device EIGEN_DEVICE_REF m_device;
};


} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H