// aten/src/ATen/native/cpu/CatKernel.cpp
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>

#include <ATen/Dispatch.h>
#include <ATen/native/cpu/CatKernel.h>
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <c10/util/irange.h>

namespace at::native {

namespace {

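// Per-input metadata captured once before the copy loops: the input's (const)
// data pointer and the number of elements it contributes to each outer slice
// of the result (the input's size along dim times the result's stride along dim).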
struct InputMeta {
  const void* data_ptr;
  int64_t inner_size;

  InputMeta(const Tensor& t, int64_t dim, int64_t inner)
    : data_ptr(t.const_data_ptr())
    , inner_size(t.sizes()[dim] * inner) {}
};

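// Serial (single-threaded) concatenation along `dim`: walk the outer slices of
// the result and, for each slice, copy every input's contiguous chunk with
// vectorized loads/stores, finishing the remainder with a scalar tail loop.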
template <typename scalar_t>
void cat_serial_kernel_impl(const Tensor& result, const MaterializedITensorListRef& tensors, int64_t dim) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
      dim >= 0 && dim < result.dim(), "dim out of range in cat_serial_kernel_impl");
  // Number of outer blocks in the result (the product of the sizes before
  // `dim` for a contiguous layout).
  int64_t outer = result.numel() / (result.sizes()[dim] * result.strides()[dim]);
  scalar_t* result_data = result.data_ptr<scalar_t>();
  int64_t ninputs = static_cast<int64_t>(tensors.size());
  std::vector<InputMeta> inputs;
  inputs.reserve(ninputs);
  for (const Tensor& tensor : tensors) {
    inputs.emplace_back(tensor, dim, result.strides()[dim]);
  }

  using Vec = vec::Vectorized<scalar_t>;
  scalar_t* result_ptr = result_data;
  for (const auto i : c10::irange(outer)) {
    for (const auto j : c10::irange(ninputs)) {
      int64_t local_inner = inputs[j].inner_size;
      const scalar_t* input_ptr = (const scalar_t*)(inputs[j].data_ptr) + i * local_inner;
      int64_t d = 0;
      // Vectorized main loop: copy whole Vec::size() chunks at a time.
      for (; d < local_inner - (local_inner % Vec::size()); d += Vec::size()) {
        Vec in_vec = Vec::loadu(input_ptr + d);
        in_vec.store(result_ptr + d);
      }
      // Scalar tail loop for the leftover elements.
      #if !defined(_MSC_VER) && !defined(COMPILING_FOR_MIN_SIZE)
      # pragma unroll
      #endif
      for (; d < local_inner; d++) {
        result_ptr[d] = input_ptr[d];
      }
      result_ptr += local_inner;
    }
  }
}

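// Dispatch on the result's scalar type: floating point types plus BFloat16 and
// Half are supported.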
void cat_serial_kernel(const Tensor& result, const MaterializedITensorListRef& tensors, int64_t dim) {
  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, result.scalar_type(), "cat_serial_kernel", [&]() {
    cat_serial_kernel_impl<scalar_t>(result, tensors, dim);
  });
}

} // anonymous namespace

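// Register this kernel as the implementation behind cat_serial_stub.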
REGISTER_DISPATCH(cat_serial_stub, &cat_serial_kernel);

} // at::native