// aten/src/ATen/native/cuda/Nonzero.cu
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/EmptyTensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <ATen/cuda/EmptyTensor.h>
#include <ATen/cuda/detail/KernelUtils.h>
#include <ATen/cuda/detail/OffsetCalculator.cuh> //for MAX_DIMS
#include <ATen/cuda/cub.cuh>

#include <limits> //for std::numeric_limits

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty_native.h>
#include <ATen/ops/nonzero_native.h>
#endif


namespace at::native {

namespace{
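// Unary predicate that is true iff an element is nonzero; used below with
// cub::TransformInputIterator to present the input as a sequence of boolean flags.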
template<typename T>
struct NonZeroOp
{
    __host__ __device__ __forceinline__ bool operator()(const T& a) const {
      return (a!=T(0));
    }
};

//TODO: actually support int64_t index_t
template<typename index_t>
struct TensorDims {
  index_t sizes[MAX_DIMS];
};

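// Expands the flat indices stored in the first n entries of inp into per-dimension
// coordinates, written in place into the same buffer (inp holds ndim * n int64_t
// values, one row of n coordinates per dimension).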
template <typename index_t>
__global__ void write_indices(
    int64_t* inp,
    TensorDims<index_t> dims,
    int ndim,
    index_t n) {
  auto index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < n) {
    index_t div = 1;
    int64_t idx_flat = inp[index];
#pragma unroll
    for (int dim = MAX_DIMS; dim >= 0; dim--) {
      if (dim > ndim - 1)
        continue;
      auto dim_size = dims.sizes[dim];
      inp[index + dim * n] = (idx_flat / div) % dim_size;
      div *= dim_size;
    }
  }
}

} //anonymous namespace

template<typename scalar_t>
void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){
  Tensor self_ = self.contiguous();
  int N = self_.numel();
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// compute number of nonzero elements
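// (CUB's usual two-pass convention: the first call, with a null temp-storage pointer,
// only computes temp_storage_bytes; the second call performs the reduction.)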
  size_t temp_storage_bytes=0;
  auto& allocator = *c10::cuda::CUDACachingAllocator::get();
  auto num_nonzeros = allocator.allocate(sizeof(int));
  cub::TransformInputIterator<bool, NonZeroOp<scalar_t>, const scalar_t*> itr(self_.const_data_ptr<scalar_t>(), NonZeroOp<scalar_t>());
  cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
  auto temp_storage = allocator.allocate(temp_storage_bytes);
  cub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
  int num_nonzeros_h;
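  // Read the count back on the host through a pinned buffer; memcpy_and_sync issues the
  // device-to-host copy on the current stream and synchronizes, so num_nonzeros_h is
  // valid from here on.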
  auto pinned_num_nonzeros_h = at::detail::empty_cpu(
            {1}, /* size */
            c10::CppTypeToScalarType<int>(), /* dtype */
            std::nullopt, /* layout */
            std::nullopt, /* device */
            true, /* pin_memory */
            std::nullopt /* memory format */
          );
  at::cuda::memcpy_and_sync((void *)pinned_num_nonzeros_h.const_data_ptr<int>(), num_nonzeros.get(), sizeof(int), cudaMemcpyDeviceToHost, stream);
  num_nonzeros_h = (int)*(pinned_num_nonzeros_h.const_data_ptr<int>());
  //expected output size is num_nonzeros x ndim
  //we are producing output with size {num_nonzeros, ndim} and strides {1, num_nonzeros} (that is, a transposed ndim x num_nonzeros output)
  //we are able to directly use a passed output with this size and these strides, and we can also (per contract)
  //resize a passed output that has incorrect sizes however we want.
  //However, an out with correct sizes but incorrect strides has to be filled by copying from the intermediate we produce.
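  //For example, with ndim = 2 and num_nonzeros = 3 the intermediate out_temp below is a
  //contiguous {2, 3} tensor, and out_temp.t() views it as the expected {3, 2} result without a copy.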
  bool need_to_copy = out.dim() == 2 && out.sizes()[0] == num_nonzeros_h && out.sizes()[1] == self.dim() && !out.t().is_contiguous();
  at::Tensor out_temp = need_to_copy ?
      Tensor(at::detail::empty_cuda({self.dim(), num_nonzeros_h}, out.options())) :
      out.resize_({self.dim(), num_nonzeros_h});
  //Scalars are expected to produce output of size (1,0), so we can't write to it
  if (self.dim() > 0) {
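    // DeviceSelect::Flagged copies the values of counting_itr (0, 1, ..., N-1) whose
    // corresponding flag in itr is true, i.e. it writes the flat indices of the nonzero
    // elements into the first num_nonzeros entries of out_temp.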
    cub::CountingInputIterator<int64_t> counting_itr(0);
    temp_storage_bytes = 0;
    cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr,
      out_temp.mutable_data_ptr<int64_t>(), (int*)num_nonzeros.get(), N, stream);
    temp_storage = allocator.allocate(temp_storage_bytes);
    cub::DeviceSelect::Flagged(temp_storage.get(), temp_storage_bytes, counting_itr, itr,
      out_temp.mutable_data_ptr<int64_t>(), (int*)num_nonzeros.get(), N, stream);
    if (num_nonzeros_h > 0 && self.dim() > 1){
        TensorDims<int> dims;
        for (int i=0; i<self.dim(); i++){
            dims.sizes[i] = self.sizes()[i];
        }
        const int nthreads = 256;
        const int nblocks = (num_nonzeros_h + nthreads -1)/nthreads;
        write_indices<<<nblocks, nthreads, 0, stream>>>(out_temp.mutable_data_ptr<int64_t>(),
        dims, self.dim(), num_nonzeros_h);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
    }
  }
  if (need_to_copy) {
    out.copy_(out_temp.t());
  } else {
    //transpose out so it is correct size
    Tensor out_ = out_temp.t();
    out.set_(out_);
  }
}
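
// Example: for a 2x2 input
//   [[0, 1],
//    [2, 0]]
// the reduction above counts two nonzero elements, DeviceSelect::Flagged writes their
// flat indices {1, 2} into out_temp, and write_indices expands them into per-dimension
// coordinates, giving the final result
//   [[0, 1],
//    [1, 0]]
// (one row of indices per nonzero element).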

Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out){
  TORCH_CHECK(self.numel() < std::numeric_limits<int>::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \
  See https://github.com/pytorch/pytorch/issues/51871");
  TORCH_CHECK(out.dtype() == at::kLong, "Expected object of scalar type ", at::kLong, " as out, but got ", out.dtype());
  TORCH_CHECK(self.device() == out.device(), "expected self and out to be on the same device, but got out on ",
  out.device(), " and self on ", self.device());
  TORCH_CHECK(self.dim() <= MAX_DIMS, "nonzero is not supported for tensor with more than ", MAX_DIMS, " dimensions");
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(at::ScalarType::ComplexHalf, at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half,
    self.scalar_type(), "nonzero_cuda",
    [&] {nonzero_cuda_out_impl<scalar_t>(self, out);});
  return out;
}

Tensor nonzero_cuda(const Tensor& self){
  Tensor out = at::detail::empty_cuda({0}, self.options().dtype(kLong));
  return at::native::nonzero_out_cuda(self, out);
}
} //namespace at::native