/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/kernels/dense_update_functor.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/util/cuda_sparse.h"
#include "tensorflow/core/util/gpu_solvers.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/stream_executor/rocm/rocm_activation.h"
using ::perftools::gputools::rocm::ScopedActivateExecutorContext;
#endif

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// Op to convert SparseTensors to CSR SparseMatrices on the CPU.
// Takes a SparseTensor of rank 2 or (if batched) 3 as the input. The
// SparseTensor's indices must be present in the canonical, row-major ordering.
//
// Returns a (batched) CSR SparseMatrix with the same dense shape and non-zero
// values.
template <typename T>
class SparseTensorToCSRSparseMatrixCPUOp : public OpKernel {
 public:
  explicit SparseTensorToCSRSparseMatrixCPUOp(OpKernelConstruction* c)
      : OpKernel(c) {}

  void Compute(OpKernelContext* ctx) final {
    const Tensor& indices = ctx->input(0);
    const Tensor& values = ctx->input(1);
    const Tensor& dense_shape = ctx->input(2);
    const int rank = dense_shape.NumElements();
    OP_REQUIRES(
        ctx, TensorShapeUtils::IsVector(dense_shape.shape()),
        errors::InvalidArgument("dense_shape must be rank 1 but got rank ",
                                dense_shape.shape().dims()));
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(indices.shape()),
                errors::InvalidArgument("indices must be rank 2 but got rank ",
                                        indices.shape().dims()));
    OP_REQUIRES(ctx, rank == 2 || rank == 3,
                errors::InvalidArgument("SparseTensor must have rank 2 or 3; ",
                                        "but indices has rank: ", rank));
    auto dense_shape_vec = dense_shape.vec<int64_t>();
    const int64_t batch_size = (rank == 2) ? 1 : dense_shape_vec(0);
    const int64_t num_rows = dense_shape_vec((rank == 2) ? 0 : 1);
    const int64_t total_nnz = values.NumElements();

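    // Component layout of the (possibly batched) CSR output allocated below:
    //   batch_ptr   : [batch_size + 1]  cumulative nnz offsets per batch,
    //   csr_row_ptr : [(num_rows + 1) * batch_size]  row pointers stored
    //                 contiguously, one (num_rows + 1) block per batch,
    //   csr_col_ind : [total_nnz]  column indices,
    //   values      : the input SparseTensor's values, reused unchanged.
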
    // Allocate output Tensors.
    TensorShape batch_ptr_shape;
    OP_REQUIRES_OK(
        ctx, TensorShape::BuildTensorShape({batch_size + 1}, &batch_ptr_shape));
    Tensor batch_ptr(cpu_allocator(), DT_INT32, batch_ptr_shape);
    TensorShape csr_col_ind_shape;
    OP_REQUIRES_OK(
        ctx, TensorShape::BuildTensorShape({total_nnz}, &csr_col_ind_shape));
    Tensor csr_col_ind(cpu_allocator(), DT_INT32, csr_col_ind_shape);
    TensorShape csr_row_ind_shape;
    OP_REQUIRES_OK(ctx, TensorShape::BuildTensorShape(
                            {(num_rows + 1) * batch_size}, &csr_row_ind_shape));
    Tensor csr_row_ptr(cpu_allocator(), DT_INT32, csr_row_ind_shape);

    // Fill the row pointers with zeros.
    functor::SetZeroFunctor<CPUDevice, int32> set_zero;
    set_zero(ctx->eigen_device<CPUDevice>(), csr_row_ptr.flat<int32>());

    // Convert from COO to CSR format.
    functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
    OP_REQUIRES_OK(
        ctx,
        coo_to_csr(batch_size, num_rows, indices.template matrix<int64_t>(),
                   batch_ptr.vec<int32>(), csr_row_ptr.vec<int32>(),
                   csr_col_ind.vec<int32>()));

    // Create the CSRSparseMatrix object from its component Tensors and prepare
    // the Variant output Tensor.
    CSRSparseMatrix output_csr_matrix;
    OP_REQUIRES_OK(
        ctx, CSRSparseMatrix::CreateCSRSparseMatrix(
                 DataTypeToEnum<T>::value, dense_shape, batch_ptr, csr_row_ptr,
                 csr_col_ind, values, &output_csr_matrix));
    Tensor* output_csr_matrix_tensor;
    AllocatorAttributes cpu_alloc;
    cpu_alloc.set_on_host(true);
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output(0, TensorShape({}), &output_csr_matrix_tensor,
                                  cpu_alloc));
    output_csr_matrix_tensor->scalar<Variant>()() =
        std::move(output_csr_matrix);
  }
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T>
class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
 public:
  explicit SparseTensorToCSRSparseMatrixGPUOp(OpKernelConstruction* c)
      : AsyncOpKernel(c) {}

  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
    auto stream = c->op_device_context()->stream();
    const Device& d = c->eigen_device<Device>();

    const Tensor& indices_t = c->input(0);
    const Tensor& values_t = c->input(1);
    const Tensor& dense_shape_t = c->input(2);
    const int rank = dense_shape_t.NumElements();
    OP_REQUIRES_ASYNC(
        c, rank == 2 || rank == 3,
        errors::InvalidArgument("sparse tensor must have rank == 2 or 3; ",
                                "but indices has ", rank, " columns"),
        done);
    auto dense_shape = dense_shape_t.vec<int64_t>();
    const int64_t batch_size = (rank == 2) ? 1 : dense_shape(0);
    const int64_t rows = dense_shape((rank == 2) ? 0 : 1);
    const int64_t cols = dense_shape((rank == 2) ? 1 : 2);

    ScratchSpace<int32> nnz_per_batch_host(c, batch_size, /*on_host*/ true);

    Tensor nnz_per_batch_device_t;

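    // Count the nonzeros in each batch. For rank 2 there is a single batch
    // whose nnz is just the number of index rows and can be written directly
    // on the host; for rank 3 a GPU kernel computes the per-batch counts and
    // the result is copied back into nnz_per_batch_host.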
    if (rank == 2) {
      // Simple case.
      nnz_per_batch_host.mutable_data()[0] = indices_t.dim_size(0);
    } else {
      OP_REQUIRES_OK_ASYNC(c,
                           c->allocate_temp(DT_INT32, TensorShape({batch_size}),
                                            &nnz_per_batch_device_t),
                           done);
      auto nnz_per_batch_device = nnz_per_batch_device_t.vec<int32>();

      functor::CalculateNNZPerBatchMatrixFromIndices<Device>
          calculate_nnz_from_indices;
      auto indices = indices_t.matrix<int64_t>();
      OP_REQUIRES_OK_ASYNC(
          c, calculate_nnz_from_indices(c, indices, nnz_per_batch_device),
          done);

      perftools::gputools::DeviceMemoryBase nnz_per_batch_device_ptr(
          static_cast<void*>(nnz_per_batch_device.data()));

      OP_REQUIRES_ASYNC(
          c,
          stream
              ->ThenMemcpy(nnz_per_batch_host.mutable_data() /*host_dst*/,
                           nnz_per_batch_device_ptr /*gpu_src*/,
                           batch_size * sizeof(int32) /*size*/)
              .ok(),
          errors::Internal("SparseTensorToCSRSparseMatrixGPUOp: failed to "
                           "copy nnz_per_batch from device"),
          done);
    }

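    // The remainder of the conversion happens in convert_to_csr, which reads
    // the host-side nnz counts. Keep a reference on the device-side counts so
    // that the temporary is not reclaimed before the asynchronous copy above
    // completes; the callback unrefs it once the data is on the host.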
    TensorReference nnz_per_batch_device_ref(nnz_per_batch_device_t);
    auto convert_to_csr = [this, c, batch_size, nnz_per_batch_host,
                           nnz_per_batch_device_ref, stream, &d, &values_t,
                           &indices_t, &dense_shape_t, dense_shape, rows, cols,
                           rank, done]() {
      // The data has been copied out of the nnz_per_batch_device
      // tensor by the time we get here; we can unreference it.
      nnz_per_batch_device_ref.Unref();

      auto nnz_per_batch = nnz_per_batch_host.tensor().vec<int32>();

      // Ensure that within the callback, the proper GPU settings are
      // configured.
      ScopedActivateExecutorContext scoped_activation{stream->parent()};
      Tensor batch_ptr_t(cpu_allocator(), DT_INT32,
                         TensorShape({batch_size + 1}));

      auto batch_ptr = batch_ptr_t.vec<int32>();
      auto indices = indices_t.matrix<int64_t>();

      batch_ptr(0) = 0;
      for (int i = 0; i < batch_size; ++i) {
        batch_ptr(i + 1) = batch_ptr(i) + nnz_per_batch(i);
      }
      int total_nnz = batch_ptr(batch_size);
      OP_REQUIRES_ASYNC(
          c, total_nnz == values_t.NumElements(),
          errors::Internal("nnz returned by "
                           "CalculateNNZPerBatchMatrixFromIndices "
                           "!= len(values): ",
                           total_nnz, " vs. ", values_t.NumElements()),
          done);

      Tensor coo_col_ind_t;
      Tensor csr_row_ptr_t;
      Tensor csr_values_t = values_t;

      Tensor coo_row_ind_t;
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_row_ind_t),
          done);
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_col_ind_t),
          done);
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({batch_size * (rows + 1)}),
                           &csr_row_ptr_t),
          done);

      auto coo_row_ind = coo_row_ind_t.vec<int32>();
      auto coo_col_ind = coo_col_ind_t.vec<int32>();
      auto csr_row_ptr = csr_row_ptr_t.vec<int32>();

      // Convert SparseTensor rep to coo row ind, coo col ind.
      if (total_nnz > 0) {
        functor::SparseTensorToCOOSparseMatrix<Device> st_to_coo;
        st_to_coo(d, dense_shape, indices, coo_row_ind, coo_col_ind);
      }

      // Set all csr row pointers to zero, so that when iterating over
      // batches converting coo to csr, we do not have to perform an
      // unaligned SetZero for any nnz == 0 minibatches.  coo2csr has
      // a bug if you have empty coo rows.
      // TODO(ebrevdo): File bug w/ nvidia so coo2csr can handle
      // zero-element input coo rows.
      functor::SetZeroFunctor<Device, int32> set_zero;
      set_zero(d, csr_row_ptr_t.flat<int32>());

      functor::COOSparseMatrixToCSRSparseMatrix<Device> coo_to_csr;
      for (int i = 0; i < batch_size; ++i) {
        int nnz_i = batch_ptr(i + 1) - batch_ptr(i);
        if (nnz_i == 0) {
          // This is an empty minibatch; no call to coo2csr: it's
          // handled by the SetZero above.
        } else {
          // Convert coo to csr.
          auto coo_row_ind_i =
              TTypes<int32>::UnalignedVec(&coo_row_ind(batch_ptr(i)), nnz_i);
          auto csr_row_ptr_i = TTypes<int32>::UnalignedVec(
              &csr_row_ptr((rows + 1) * i), rows + 1);
          OP_REQUIRES_OK_ASYNC(
              c, coo_to_csr(c, rows, cols, coo_row_ind_i, csr_row_ptr_i), done);
        }
      }

      CSRSparseMatrix matrix;
      OP_REQUIRES_OK_ASYNC(
          c,
          CSRSparseMatrix::CreateCSRSparseMatrix(
              values_t.dtype(), dense_shape_t, batch_ptr_t, csr_row_ptr_t,
              coo_col_ind_t, csr_values_t, &matrix),
          done);
      Tensor* matrix_t;
      AllocatorAttributes cpu_alloc;
      cpu_alloc.set_on_host(true);
      OP_REQUIRES_OK_ASYNC(
          c, c->allocate_output(0, TensorShape({}), &matrix_t, cpu_alloc),
          done);
      matrix_t->scalar<Variant>()() = std::move(matrix);

      done();
    };

    if (rank == 2) {
      convert_to_csr();
    } else {
      // Launch the GPU kernel to count nnz entries, then call convert_to_csr.
      c->device()->tensorflow_accelerator_device_info()->event_mgr->ThenExecute(
          stream, convert_to_csr);
    }
  }
};

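// GPU specializations of the conversion functors used above. The first two
// are only declared here; their definitions and explicit instantiations live
// in a separate GPU translation unit (hence the extern template
// declarations). COOSparseMatrixToCSRSparseMatrix is defined inline on top
// of GpuSparse::Coo2csr.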
namespace functor {

template <>
Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
    OpKernelContext* c, TTypes<int64_t>::ConstMatrix indices,
    TTypes<int32>::Vec nnz_per_batch);
extern template struct CalculateNNZPerBatchMatrixFromIndices<GPUDevice>;

template <>
struct SparseTensorToCOOSparseMatrix<GPUDevice> {
  void operator()(const GPUDevice& d,
                  TTypes<int64_t>::ConstVec host_dense_shape,
                  TTypes<int64_t>::ConstMatrix indices,
                  TTypes<int>::Vec coo_row_ind, TTypes<int>::Vec coo_col_ind);
};
extern template struct SparseTensorToCOOSparseMatrix<GPUDevice>;

template <>
struct COOSparseMatrixToCSRSparseMatrix<GPUDevice> {
  Status operator()(OpKernelContext* c, const int rows, const int cols,
                    TTypes<int>::UnalignedVec coo_row_ind,
                    TTypes<int>::UnalignedVec csr_row_ptr) {
    GpuSparse cuda_sparse(c);
    TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
    return cuda_sparse.Coo2csr(coo_row_ind.data(),
                               /*nnz*/ coo_row_ind.size(),
                               /*m == rows of A*/ rows, csr_row_ptr.data());
  }
};
extern template struct COOSparseMatrixToCSRSparseMatrix<GPUDevice>;

}  // namespace functor

#define REGISTER_GPU(T)                                         \
  REGISTER_KERNEL_BUILDER(Name("SparseTensorToCSRSparseMatrix") \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<T>("T")           \
                              .HostMemory("dense_shape"),       \
                          SparseTensorToCSRSparseMatrixGPUOp<GPUDevice, T>);

REGISTER_GPU(float)
REGISTER_GPU(double)
REGISTER_GPU(complex64)
REGISTER_GPU(complex128)

#undef REGISTER_GPU

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_CPU(T)                                         \
  REGISTER_KERNEL_BUILDER(Name("SparseTensorToCSRSparseMatrix") \
                              .Device(DEVICE_CPU)               \
                              .TypeConstraint<T>("T"),          \
                          SparseTensorToCSRSparseMatrixCPUOp<T>);

REGISTER_CPU(float)
REGISTER_CPU(double)
REGISTER_CPU(complex64)
REGISTER_CPU(complex128)

#undef REGISTER_CPU

}  // namespace tensorflow