#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/EmptyTensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <ATen/cuda/EmptyTensor.h>
#include <ATen/cuda/detail/KernelUtils.h>
#include <ATen/cuda/detail/OffsetCalculator.cuh> //for MAX_DIMS
#include <ATen/cuda/cub.cuh>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty_native.h>
#include <ATen/ops/nonzero_native.h>
#endif


namespace at::native {

namespace {
template <typename T>
struct NonZeroOp {
  __host__ __device__ __forceinline__ bool operator()(const T& a) const {
    return (a != T(0));
  }
};
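// NonZeroOp is the element-wise "is nonzero" predicate used twice below: as the
// transform of a cub::TransformInputIterator when counting nonzeros with
// cub::DeviceReduce::Sum, and as the flag sequence for cub::DeviceSelect::Flagged
// when gathering the flat indices of the nonzero elements.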

//TODO: actually support int64_t index_t
template <typename index_t>
struct TensorDims {
  index_t sizes[MAX_DIMS];
};

template <typename index_t>
__global__ void write_indices(
    int64_t* inp,
    TensorDims<index_t> dims,
    int ndim,
    index_t n) {
  auto index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < n) {
    index_t div = 1;
    int64_t idx_flat = inp[index];
#pragma unroll
    for (int dim = MAX_DIMS; dim >= 0; dim--) {
      if (dim > ndim - 1)
        continue;
      auto dim_size = dims.sizes[dim];
      inp[index + dim * n] = (idx_flat / div) % dim_size;
      div *= dim_size;
    }
  }
}
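
// Worked example (illustrative): for an input of sizes {2, 3} (row-major), a
// stored flat index of 4 decomposes as
//   dim 1: (4 / 1) % 3 = 1, div -> 3
//   dim 0: (4 / 3) % 2 = 1, div -> 6
// so the element at flat position 4 gets coordinates (1, 1). Each thread expands
// one of the n selected flat indices in place, filling one column of the
// row-major {ndim, n} buffer.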

} //anonymous namespace

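// nonzero_cuda_out_impl computes nonzero in three device-side steps:
//   1. count the nonzero elements with cub::DeviceReduce::Sum over a boolean
//      view of the input,
//   2. select the flat indices of the nonzero elements with
//      cub::DeviceSelect::Flagged,
//   3. expand those flat indices into per-dimension coordinates with
//      write_indices.
// The count is copied back to the host (with a sync) because it determines the
// size of the output allocation.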
template <typename scalar_t>
void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
  Tensor self_ = self.contiguous();
  int N = self_.numel();
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // compute number of nonzero elements
  size_t temp_storage_bytes = 0;
  auto& allocator = *c10::cuda::CUDACachingAllocator::get();
  auto num_nonzeros = allocator.allocate(sizeof(int));
  cub::TransformInputIterator<bool, NonZeroOp<scalar_t>, const scalar_t*> itr(
      self_.const_data_ptr<scalar_t>(), NonZeroOp<scalar_t>());
  cub::DeviceReduce::Sum(
      nullptr, temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
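  // Standard CUB two-pass idiom: the call above passes a null temp-storage
  // pointer, so it only writes the required scratch size into
  // temp_storage_bytes; the call below with real storage performs the reduction.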
  auto temp_storage = allocator.allocate(temp_storage_bytes);
  cub::DeviceReduce::Sum(
      temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
  int num_nonzeros_h;
  auto pinned_num_nonzeros_h = at::detail::empty_cpu(
      {1}, /* size */
      c10::CppTypeToScalarType<int>(), /* dtype */
      std::nullopt, /* layout */
      std::nullopt, /* device */
      true, /* pin_memory */
      std::nullopt /* memory format */
  );
  at::cuda::memcpy_and_sync(
      (void*)pinned_num_nonzeros_h.const_data_ptr<int>(),
      num_nonzeros.get(),
      sizeof(int),
      cudaMemcpyDeviceToHost,
      stream);
  num_nonzeros_h = (int)*(pinned_num_nonzeros_h.const_data_ptr<int>());
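  // The count lives in device memory but is needed on the host to size the
  // output, so it is staged through a pinned host buffer; memcpy_and_sync also
  // synchronizes the stream, making num_nonzeros_h safe to read at this point.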
  // The expected output size is num_nonzeros x ndim. We produce output with
  // sizes {num_nonzeros, ndim} and strides {1, num_nonzeros}, i.e. a transposed
  // {ndim, num_nonzeros} buffer.
  // A passed output that already has this size/stride combination can be used
  // directly, and a passed output with incorrect sizes may (per the out=
  // contract) be resized however we like.
  // However, an out tensor with the correct sizes but different strides has to
  // be filled by copying from the intermediate we produce.
  bool need_to_copy = out.dim() == 2 && out.sizes()[0] == num_nonzeros_h &&
      out.sizes()[1] == self.dim() && !out.t().is_contiguous();
  at::Tensor out_temp = need_to_copy
      ? Tensor(at::detail::empty_cuda({self.dim(), num_nonzeros_h}, out.options()))
      : out.resize_({self.dim(), num_nonzeros_h});
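  // Illustration: for a 3-d input with k nonzero elements, out_temp is a
  // contiguous {3, k} buffer, and out_temp.t() is the {k, 3} view with strides
  // {1, k} that callers ultimately observe.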
  // Scalars are expected to produce output of size (1,0), so we can't write to it
  if (self.dim() > 0) {
    cub::CountingInputIterator<int64_t> counting_itr(0);
    temp_storage_bytes = 0;
    cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr,
        out_temp.mutable_data_ptr<int64_t>(), (int*)num_nonzeros.get(), N, stream);
    temp_storage = allocator.allocate(temp_storage_bytes);
    cub::DeviceSelect::Flagged(temp_storage.get(), temp_storage_bytes, counting_itr, itr,
        out_temp.mutable_data_ptr<int64_t>(), (int*)num_nonzeros.get(), N, stream);
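    // DeviceSelect::Flagged compacts the counting iterator through the NonZeroOp
    // flags: the flat indices of the nonzero elements land in the first
    // num_nonzeros slots of out_temp's buffer, and write_indices below expands
    // each of them into per-dimension coordinates in place.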
    if (num_nonzeros_h > 0 && self.dim() > 1) {
      TensorDims<int> dims;
      for (int i = 0; i < self.dim(); i++) {
        dims.sizes[i] = self.sizes()[i];
      }
      const int nthreads = 256;
      const int nblocks = (num_nonzeros_h + nthreads - 1) / nthreads;
      write_indices<<<nblocks, nthreads, 0, stream>>>(
          out_temp.mutable_data_ptr<int64_t>(), dims, self.dim(), num_nonzeros_h);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }
  }
  if (need_to_copy) {
    out.copy_(out_temp.t());
  } else {
    // transpose out_temp so `out` gets the expected {num_nonzeros, ndim} view
    Tensor out_ = out_temp.t();
    out.set_(out_);
  }
}

Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) {
  TORCH_CHECK(self.numel() < std::numeric_limits<int>::max(),
      "nonzero is not supported for tensors with more than INT_MAX elements, "
      "See https://github.com/pytorch/pytorch/issues/51871");
  TORCH_CHECK(out.dtype() == at::kLong,
      "Expected object of scalar type ", at::kLong, " as out, but got ", out.dtype());
  TORCH_CHECK(self.device() == out.device(),
      "expected self and out to be on the same device, but got out on ",
      out.device(), " and self on ", self.device());
  TORCH_CHECK(self.dim() <= MAX_DIMS,
      "nonzero is not supported for tensor with more than ", MAX_DIMS, " dimensions");
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
      at::ScalarType::ComplexHalf, at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half,
      self.scalar_type(), "nonzero_cuda",
      [&] { nonzero_cuda_out_impl<scalar_t>(self, out); });
  return out;
}

Tensor nonzero_cuda(const Tensor& self) {
  Tensor out = at::detail::empty_cuda({0}, self.options().dtype(kLong));
  return at::native::nonzero_out_cuda(self, out);
}
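
// Note: nonzero_cuda is the CUDA dispatch target for at::nonzero, and
// nonzero_out_cuda for its out= variant (see native_functions.yaml); the empty
// `out` allocated above is resized or retargeted by nonzero_cuda_out_impl.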
} //namespace at::native
141