/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/kernels/dense_update_functor.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/util/cuda_sparse.h"
#include "tensorflow/core/util/gpu_solvers.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/stream_executor/rocm/rocm_activation.h"
using ::perftools::gputools::rocm::ScopedActivateExecutorContext;
#endif

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// Op to convert SparseTensors to CSR SparseMatrices on the CPU.
// Takes a SparseTensor of rank 2 or (if batched) 3 as the input. The
// SparseTensor's indices must be present in the canonical, row-major ordering.
//
// Returns a (batched) CSR SparseMatrix with the same dense shape and non-zero
// values.
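//
// Illustrative example (not part of this kernel): for a rank-2 input with
// dense_shape = [3, 4], indices = [[0, 1], [0, 3], [2, 0]], and values
// [a, b, c], the CSR components produced below are
//   batch_ptr   = [0, 3]        (single batch; cumulative nnz per batch)
//   csr_row_ptr = [0, 2, 2, 3]  (cumulative nnz per row)
//   csr_col_ind = [1, 3, 0]
// with the values tensor passed through unchanged.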
template <typename T>
class SparseTensorToCSRSparseMatrixCPUOp : public OpKernel {
 public:
  explicit SparseTensorToCSRSparseMatrixCPUOp(OpKernelConstruction* c)
      : OpKernel(c) {}

  void Compute(OpKernelContext* ctx) final {
    const Tensor& indices = ctx->input(0);
    const Tensor& values = ctx->input(1);
    const Tensor& dense_shape = ctx->input(2);
    const int rank = dense_shape.NumElements();
    OP_REQUIRES(
        ctx, TensorShapeUtils::IsVector(dense_shape.shape()),
        errors::InvalidArgument("dense_shape must be rank 1 but got rank ",
                                dense_shape.shape().dims()));
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(indices.shape()),
                errors::InvalidArgument("indices must be rank 2 but got rank ",
                                        indices.shape().dims()));
    OP_REQUIRES(ctx, rank == 2 || rank == 3,
                errors::InvalidArgument("SparseTensor must have rank 2 or 3; ",
                                        "but indices has rank: ", rank));
    auto dense_shape_vec = dense_shape.vec<int64_t>();
    const int64_t batch_size = (rank == 2) ? 1 : dense_shape_vec(0);
    const int64_t num_rows = dense_shape_vec((rank == 2) ? 0 : 1);
    const int64_t total_nnz = values.NumElements();

    // Allocate output Tensors.
    TensorShape batch_ptr_shape;
    OP_REQUIRES_OK(
        ctx, TensorShape::BuildTensorShape({batch_size + 1}, &batch_ptr_shape));
    Tensor batch_ptr(cpu_allocator(), DT_INT32, batch_ptr_shape);
    TensorShape csr_col_ind_shape;
    OP_REQUIRES_OK(
        ctx, TensorShape::BuildTensorShape({total_nnz}, &csr_col_ind_shape));
    Tensor csr_col_ind(cpu_allocator(), DT_INT32, csr_col_ind_shape);
    TensorShape csr_row_ind_shape;
    OP_REQUIRES_OK(ctx, TensorShape::BuildTensorShape(
                            {(num_rows + 1) * batch_size}, &csr_row_ind_shape));
    Tensor csr_row_ptr(cpu_allocator(), DT_INT32, csr_row_ind_shape);

    // Fill the row pointers with zeros.
    functor::SetZeroFunctor<CPUDevice, int32> set_zero;
    set_zero(ctx->eigen_device<CPUDevice>(), csr_row_ptr.flat<int32>());

    // Convert from COO to CSR format.
    functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
    OP_REQUIRES_OK(
        ctx,
        coo_to_csr(batch_size, num_rows, indices.template matrix<int64_t>(),
                   batch_ptr.vec<int32>(), csr_row_ptr.vec<int32>(),
                   csr_col_ind.vec<int32>()));

    // Create the CSRSparseMatrix object from its component Tensors and prepare
    // the Variant output Tensor.
    CSRSparseMatrix output_csr_matrix;
    OP_REQUIRES_OK(
        ctx, CSRSparseMatrix::CreateCSRSparseMatrix(
                 DataTypeToEnum<T>::value, dense_shape, batch_ptr, csr_row_ptr,
                 csr_col_ind, values, &output_csr_matrix));
    Tensor* output_csr_matrix_tensor;
    AllocatorAttributes cpu_alloc;
    cpu_alloc.set_on_host(true);
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output(0, TensorShape({}), &output_csr_matrix_tensor,
                                  cpu_alloc));
    output_csr_matrix_tensor->scalar<Variant>()() =
        std::move(output_csr_matrix);
  }
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

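// Op to convert SparseTensors to CSR SparseMatrices on the GPU.
//
// The kernel is asynchronous. For batched (rank-3) inputs it first launches a
// device kernel to count the nonzeros in each batch, copies those counts back
// to the host, and performs the remaining COO-to-CSR conversion in a callback
// that runs once the copy has completed. Rank-2 inputs take the number of
// nonzeros directly from the indices tensor and convert immediately.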
template <typename Device, typename T>
class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
 public:
  explicit SparseTensorToCSRSparseMatrixGPUOp(OpKernelConstruction* c)
      : AsyncOpKernel(c) {}

  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
    auto stream = c->op_device_context()->stream();
    const Device& d = c->eigen_device<Device>();

    const Tensor& indices_t = c->input(0);
    const Tensor& values_t = c->input(1);
    const Tensor& dense_shape_t = c->input(2);
    const int rank = dense_shape_t.NumElements();
    OP_REQUIRES_ASYNC(
        c, rank == 2 || rank == 3,
        errors::InvalidArgument("sparse tensor must have rank == 2 or 3; ",
                                "but indices has ", rank, " columns"),
        done);
    auto dense_shape = dense_shape_t.vec<int64_t>();
    const int64_t batch_size = (rank == 2) ? 1 : dense_shape(0);
    const int64_t rows = dense_shape((rank == 2) ? 0 : 1);
    const int64_t cols = dense_shape((rank == 2) ? 1 : 2);

    ScratchSpace<int32> nnz_per_batch_host(c, batch_size, /*on_host*/ true);

    Tensor nnz_per_batch_device_t;
    if (rank == 2) {
      // Simple case: the single batch's nnz is just the number of indices.
      nnz_per_batch_host.mutable_data()[0] = indices_t.dim_size(0);
    } else {
      OP_REQUIRES_OK_ASYNC(c,
                           c->allocate_temp(DT_INT32, TensorShape({batch_size}),
                                            &nnz_per_batch_device_t),
                           done);
      auto nnz_per_batch_device = nnz_per_batch_device_t.vec<int32>();

      functor::CalculateNNZPerBatchMatrixFromIndices<Device>
          calculate_nnz_from_indices;
      auto indices = indices_t.matrix<int64_t>();
      OP_REQUIRES_OK_ASYNC(
          c, calculate_nnz_from_indices(c, indices, nnz_per_batch_device),
          done);

      perftools::gputools::DeviceMemoryBase nnz_per_batch_device_ptr(
          static_cast<void*>(nnz_per_batch_device.data()));

      OP_REQUIRES_ASYNC(
          c,
          stream
              ->ThenMemcpy(nnz_per_batch_host.mutable_data() /*host_dst*/,
                           nnz_per_batch_device_ptr /*gpu_src*/,
                           batch_size * sizeof(int32) /*size*/)
              .ok(),
          errors::Internal("SparseTensorToCSRSparseMatrixGPUOp: failed to copy "
                           "nnz_per_batch from device"),
          done);
    }

    TensorReference nnz_per_batch_device_ref(nnz_per_batch_device_t);
    auto convert_to_csr = [this, c, batch_size, nnz_per_batch_host,
                           nnz_per_batch_device_ref, stream, &d, &values_t,
                           &indices_t, &dense_shape_t, dense_shape, rows, cols,
                           rank, done]() {
      // The data has been copied out of the nnz_per_batch_device
      // tensor by the time we get here; we can unreference it.
      nnz_per_batch_device_ref.Unref();

      auto nnz_per_batch = nnz_per_batch_host.tensor().vec<int32>();

      // Ensure that within the callback, the proper GPU settings are
      // configured.
      ScopedActivateExecutorContext scoped_activation{stream->parent()};
      Tensor batch_ptr_t(cpu_allocator(), DT_INT32,
                         TensorShape({batch_size + 1}));

      auto batch_ptr = batch_ptr_t.vec<int32>();
      auto indices = indices_t.matrix<int64_t>();

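      // Compute the batch pointer as a cumulative sum of the per-batch
      // nonzero counts. Illustrative example: nnz_per_batch = [2, 0, 3]
      // yields batch_ptr = [0, 2, 2, 5] and total_nnz = 5.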
      batch_ptr(0) = 0;
      for (int i = 0; i < batch_size; ++i) {
        batch_ptr(i + 1) = batch_ptr(i) + nnz_per_batch(i);
      }
      int total_nnz = batch_ptr(batch_size);
      OP_REQUIRES_ASYNC(
          c, total_nnz == values_t.NumElements(),
          errors::Internal("nnz returned by "
                           "CalculateNNZPerBatchMatrixFromIndices != "
                           "len(values): ",
                           total_nnz, " vs. ", values_t.NumElements()),
          done);

      Tensor coo_col_ind_t;
      Tensor csr_row_ptr_t;
      Tensor csr_values_t = values_t;

      Tensor coo_row_ind_t;
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_row_ind_t),
          done);
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_col_ind_t),
          done);
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({batch_size * (rows + 1)}),
                           &csr_row_ptr_t),
          done);

      auto coo_row_ind = coo_row_ind_t.vec<int32>();
      auto coo_col_ind = coo_col_ind_t.vec<int32>();
      auto csr_row_ptr = csr_row_ptr_t.vec<int32>();

      // Convert SparseTensor rep to coo row ind, coo col ind.
      if (total_nnz > 0) {
        functor::SparseTensorToCOOSparseMatrix<Device> st_to_coo;
        st_to_coo(d, dense_shape, indices, coo_row_ind, coo_col_ind);
      }

      // Set all csr row pointers to zero, so that when iterating over
      // batches converting coo to csr, we do not have to perform an
      // unaligned SetZero for any nnz == 0 minibatches.  coo2csr has
      // a bug if you have empty coo rows.
      // TODO(ebrevdo): File bug w/ nvidia so coo2csr can handle
      // zero-element input coo rows.
      functor::SetZeroFunctor<Device, int32> set_zero;
      set_zero(d, csr_row_ptr_t.flat<int32>());

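      // Convert each batch's COO row indices into a CSR row pointer array,
      // operating on unaligned slices of the flat buffers. Illustrative
      // example: with rows = 2 and a batch's coo_row_ind slice of [0, 0, 1],
      // coo2csr produces the csr_row_ptr slice [0, 2, 3].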
      functor::COOSparseMatrixToCSRSparseMatrix<Device> coo_to_csr;
      for (int i = 0; i < batch_size; ++i) {
        int nnz_i = batch_ptr(i + 1) - batch_ptr(i);
        if (nnz_i == 0) {
          // This is an empty minibatch; no call to coo2csr: it's
          // handled by the SetZero above.
        } else {
          // Convert coo to csr.
          auto coo_row_ind_i =
              TTypes<int32>::UnalignedVec(&coo_row_ind(batch_ptr(i)), nnz_i);
          auto csr_row_ptr_i = TTypes<int32>::UnalignedVec(
              &csr_row_ptr((rows + 1) * i), rows + 1);
          OP_REQUIRES_OK_ASYNC(
              c, coo_to_csr(c, rows, cols, coo_row_ind_i, csr_row_ptr_i), done);
        }
      }

      CSRSparseMatrix matrix;
      OP_REQUIRES_OK_ASYNC(
          c,
          CSRSparseMatrix::CreateCSRSparseMatrix(
              values_t.dtype(), dense_shape_t, batch_ptr_t, csr_row_ptr_t,
              coo_col_ind_t, csr_values_t, &matrix),
          done);
      Tensor* matrix_t;
      AllocatorAttributes cpu_alloc;
      cpu_alloc.set_on_host(true);
      OP_REQUIRES_OK_ASYNC(
          c, c->allocate_output(0, TensorShape({}), &matrix_t, cpu_alloc),
          done);
      matrix_t->scalar<Variant>()() = std::move(matrix);

      done();
    };

    if (rank == 2) {
      convert_to_csr();
    } else {
      // The nnz-counting kernel and the device-to-host copy were enqueued on
      // the stream above; run convert_to_csr once that work has completed.
      c->device()->tensorflow_accelerator_device_info()->event_mgr->ThenExecute(
          stream, convert_to_csr);
    }
  }
};

namespace functor {

template <>
Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
    OpKernelContext* c, TTypes<int64_t>::ConstMatrix indices,
    TTypes<int32>::Vec nnz_per_batch);
extern template struct CalculateNNZPerBatchMatrixFromIndices<GPUDevice>;

template <>
struct SparseTensorToCOOSparseMatrix<GPUDevice> {
  void operator()(const GPUDevice& d,
                  TTypes<int64_t>::ConstVec host_dense_shape,
                  TTypes<int64_t>::ConstMatrix indices,
                  TTypes<int>::Vec coo_row_ind, TTypes<int>::Vec coo_col_ind);
};
extern template struct SparseTensorToCOOSparseMatrix<GPUDevice>;

template <>
struct COOSparseMatrixToCSRSparseMatrix<GPUDevice> {
  Status operator()(OpKernelContext* c, const int rows, const int cols,
                    TTypes<int>::UnalignedVec coo_row_ind,
                    TTypes<int>::UnalignedVec csr_row_ptr) {
    GpuSparse cuda_sparse(c);
    TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
    return cuda_sparse.Coo2csr(coo_row_ind.data(),
                               /*nnz*/ coo_row_ind.size(),
                               /*m == rows of A*/ rows, csr_row_ptr.data());
  }
};
extern template struct COOSparseMatrixToCSRSparseMatrix<GPUDevice>;

}  // namespace functor

#define REGISTER_GPU(T)                                         \
  REGISTER_KERNEL_BUILDER(Name("SparseTensorToCSRSparseMatrix") \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<T>("T")           \
                              .HostMemory("dense_shape"),       \
                          SparseTensorToCSRSparseMatrixGPUOp<GPUDevice, T>);

REGISTER_GPU(float)
REGISTER_GPU(double)
REGISTER_GPU(complex64)
REGISTER_GPU(complex128)

#undef REGISTER_GPU

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_CPU(T)                                         \
  REGISTER_KERNEL_BUILDER(Name("SparseTensorToCSRSparseMatrix") \
                              .Device(DEVICE_CPU)               \
                              .TypeConstraint<T>("T"),          \
                          SparseTensorToCSRSparseMatrixCPUOp<T>);

REGISTER_CPU(float)
REGISTER_CPU(double)
REGISTER_CPU(complex64)
REGISTER_CPU(complex128)

#undef REGISTER_CPU

}  // namespace tensorflow