/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU

#include <algorithm>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gpu_prim.h"
#include "tensorflow/core/kernels/image/non_max_suppression_op.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"
#include "tensorflow/core/util/gpu_launch_config.h"

namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;

namespace {

// Decode d_bbox_deltas with respect to anchors into absolute coordinates,
// clipping if necessary.
// prenms_nboxes is the maximum number of boxes per image to decode.
// d_boxes_keep_flags is the mask for boxes to consider in NMS.
// min_size is the lower bound of the shortest edge for the boxes to consider.
// bbox_xform_clip is the upper bound of encoded width and height.
__global__ void GeneratePreNMSUprightBoxesKernel(
    const Gpu2DLaunchConfig config, const int* d_sorted_scores_keys,
    const float4* d_bbox_deltas, const float4* d_anchors, const int height,
    const int width, const int num_anchors, const float min_size,
    const float* d_img_info_vec,  // Input "image_info" to the op [N,5]
    const float bbox_xform_clip, float4* d_out_boxes,
    const int prenms_nboxes,  // leading dimension of out_boxes
    char* d_boxes_keep_flags) {
  // constants to calculate offsets into the input and output arrays.
  const int anchor_stride = height * width;              // Stride of Anchor
  const int height_stride = width * num_anchors;         // Stride of height
  const int image_stride = anchor_stride * num_anchors;  // Stride of image
  CUDA_AXIS_KERNEL_LOOP(image_index, config.virtual_thread_count.y, Y) {
    CUDA_AXIS_KERNEL_LOOP(ibox, config.virtual_thread_count.x, X) {
      // box_conv_index: index of the same box in the scores coming from the
      // conv layer, which have shape (height, width, num_anchors); the
      // num_images dimension was already removed, so
      // box_conv_index = h * width * num_anchors + w * num_anchors + a.
      const int box_conv_index =
          d_sorted_scores_keys[image_index * image_stride + ibox];

      // We want to decompose box_conv_index into (h,w,a)
      // such that box_conv_index = h*width*num_anchors + w*num_anchors + a
      // (avoiding modulos in the process)
      int remaining = box_conv_index;
      const int delta_height = height_stride;  // stride of height
      const int h = remaining / delta_height;
      remaining -= h * delta_height;
      const int delta_width = num_anchors;  // stride of width
      const int w = remaining / delta_width;
      remaining -= w * delta_width;
      // Loading the anchor a
      // float4 is a struct with float x,y,z,w
      const float4 anchor = d_anchors[box_conv_index];
      // x1,y1,x2,y2: coordinates of anchor a, shifted for position (h,w)
      float x1 = anchor.y;
      float x2 = anchor.w;
      float y1 = anchor.x;
      float y2 = anchor.z;
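      // Note: d_anchors stores each anchor as (y1, x1, y2, x2), so the float4
      // fields (x, y, z, w) map to (y1, x1, y2, x2); the swizzle above
      // recovers the x/y coordinates.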

      // TODO use fast math when possible

      // Deltas of shape (N,height,width,num_anchors x 4)
      int deltas_idx = box_conv_index + image_index * image_stride;
      float4 deltas = d_bbox_deltas[deltas_idx];
      float dx = deltas.y;
      float dy = deltas.x;
      float dw = deltas.w;
      float dh = deltas.z;
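      // Likewise, d_bbox_deltas stores each delta as (dy, dx, dh, dw),
      // matching the (y, x) ordering of the anchors.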
      // Upper bound on dw,dh
      dw = fmin(dw, bbox_xform_clip);
      dh = fmin(dh, bbox_xform_clip);

      // Applying the deltas
      float width = x2 - x1;
      const float ctr_x = x1 + 0.5f * width;
      const float pred_ctr_x = ctr_x + width * dx;  // TODO fuse madd
      const float pred_w = width * expf(dw);
      x1 = pred_ctr_x - 0.5f * pred_w;
      x2 = pred_ctr_x + 0.5f * pred_w;

      float height = y2 - y1;
      const float ctr_y = y1 + 0.5f * height;
      const float pred_ctr_y = ctr_y + height * dy;
      const float pred_h = height * expf(dh);
      y1 = pred_ctr_y - 0.5f * pred_h;
      y2 = pred_ctr_y + 0.5f * pred_h;

      // Clipping box to image
      const float img_height = d_img_info_vec[5 * image_index + 0];
      const float img_width = d_img_info_vec[5 * image_index + 1];
      const float min_size_scaled =
          min_size * d_img_info_vec[5 * image_index + 2];
      x1 = fmax(fmin(x1, img_width), 0.0f);
      y1 = fmax(fmin(y1, img_height), 0.0f);
      x2 = fmax(fmin(x2, img_width), 0.0f);
      y2 = fmax(fmin(y2, img_height), 0.0f);

      // Filter boxes
      // Removing boxes with one dim < min_size
      // (center of box is in image, because of previous step)
      width = x2 - x1;  // may have changed
      height = y2 - y1;
      bool keep_box = fmin(width, height) >= min_size_scaled;

      // Even if !keep_box, the box is not dropped here, because that would
      // change the relative order of the surviving elements; the actual
      // filtering is done later.
      // d_boxes_keep_flags size: (num_images,prenms_nboxes)
      // d_out_boxes size: (num_images,prenms_nboxes)
      const int out_index = image_index * prenms_nboxes + ibox;

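      // The decoded box is written as (x1, y1, x2, y2); WriteUprightBoxesOutput
      // later reorders it to TensorFlow's (y1, x1, y2, x2) ROI format.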
      d_boxes_keep_flags[out_index] = keep_box;
      d_out_boxes[out_index] = {x1, y1, x2, y2};
    }
  }
}

// Copy the selected boxes and scores to the output tensors.
//
__global__ void WriteUprightBoxesOutput(
    const GpuLaunchConfig nboxes, const float4* d_image_boxes,
    const float* d_image_scores, const int* d_image_boxes_keep_list,
    const int n_rois, float* d_image_out_rois, float* d_image_out_rois_probs) {
  CUDA_1D_KERNEL_LOOP(i, nboxes.virtual_thread_count) {
    if (i < n_rois) {  // copy rois to output
      const int ibox = d_image_boxes_keep_list[i];
      const float4 box = d_image_boxes[ibox];
      const float score = d_image_scores[ibox];
      // Scattered memory accesses are acceptable here; postnms_nboxes is
      // small anyway.
      d_image_out_rois_probs[i] = score;
      const int base_idx = 4 * i;
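      // Emit ROIs in TensorFlow's (y1, x1, y2, x2) order; box holds
      // (x1, y1, x2, y2) in its (x, y, z, w) fields.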
      d_image_out_rois[base_idx + 0] = box.y;
      d_image_out_rois[base_idx + 1] = box.x;
      d_image_out_rois[base_idx + 2] = box.w;
      d_image_out_rois[base_idx + 3] = box.z;
    } else {  // set trailing entries to 0
      d_image_out_rois_probs[i] = 0.;
      const int base_idx = 4 * i;
      d_image_out_rois[base_idx + 0] = 0.;
      d_image_out_rois[base_idx + 1] = 0.;
      d_image_out_rois[base_idx + 2] = 0.;
      d_image_out_rois[base_idx + 3] = 0.;
    }
  }
}

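// Zero-fills tensor t on GPU device d by launching the SetZero kernel.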
template <typename T>
Status ResetTensor(Tensor* t, const Eigen::GpuDevice& d) {
  GpuLaunchConfig zconfig = GetGpuLaunchConfig(t->NumElements(), d);
  return GpuLaunchKernel(SetZero<T>, zconfig.block_count,
                         zconfig.thread_per_block, 0, d.stream(),
                         zconfig.virtual_thread_count, (*t).flat<T>().data());
}
// Allocate the scratch tensors that are needed for the operation.
//

Status AllocateGenerationTempTensors(
    OpKernelContext* context, Tensor* d_conv_layer_indexes,
    Tensor* d_image_offset, Tensor* d_cub_temp_buffer,
    Tensor* d_sorted_conv_layer_indexes, Tensor* d_sorted_scores,
    Tensor* dev_boxes, Tensor* dev_boxes_keep_flags, int num_images,
    int conv_layer_nboxes, size_t cub_temp_storage_bytes,
    int num_boxes_to_generate, int box_dim) {
  auto d = context->eigen_gpu_device();
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_images, conv_layer_nboxes}),
      d_conv_layer_indexes));
  TF_RETURN_IF_ERROR(ResetTensor<int>(d_conv_layer_indexes, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_images + 1}), d_image_offset));
  TF_RETURN_IF_ERROR(ResetTensor<int>(d_image_offset, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({(int64)cub_temp_storage_bytes}),
      d_cub_temp_buffer));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_images, conv_layer_nboxes}),
      d_sorted_conv_layer_indexes));
  TF_RETURN_IF_ERROR(ResetTensor<int32>(d_sorted_conv_layer_indexes, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_images, conv_layer_nboxes}),
      d_sorted_scores));
  TF_RETURN_IF_ERROR(ResetTensor<float>(d_sorted_scores, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT,
      TensorShape({num_images, box_dim * num_boxes_to_generate}), dev_boxes));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_boxes, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({num_images, num_boxes_to_generate}),
      dev_boxes_keep_flags));
  TF_RETURN_IF_ERROR(ResetTensor<int8>(dev_boxes_keep_flags, d));
  return Status::OK();
}

// Allocate workspace for NMS operation
Status AllocatePreNMSTempTensors(
    OpKernelContext* context, Tensor* dev_image_prenms_boxes,
    Tensor* dev_image_prenms_scores, Tensor* dev_image_boxes_keep_list,
    Tensor* dev_postnms_rois, Tensor* dev_postnms_rois_probs,
    Tensor* dev_prenms_nboxes, int num_images, int num_boxes_to_generate,
    int box_dim, int post_nms_topn, int pre_nms_topn) {
  auto d = context->eigen_gpu_device();
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({box_dim * num_boxes_to_generate}),
      dev_image_prenms_boxes));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_image_prenms_boxes, d));

  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_boxes_to_generate}),
      dev_image_prenms_scores));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_image_prenms_scores, d));

  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes_to_generate}),
      dev_image_boxes_keep_list));
  TF_RETURN_IF_ERROR(ResetTensor<int32>(dev_image_boxes_keep_list, d));

  const int max_postnms_nboxes = std::min(num_boxes_to_generate, post_nms_topn);
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT,
      TensorShape({box_dim * num_images * max_postnms_nboxes}),
      dev_postnms_rois));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_postnms_rois, d));

  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_images * max_postnms_nboxes}),
      dev_postnms_rois_probs));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_postnms_rois_probs, d));

  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_images}), dev_prenms_nboxes));
  TF_RETURN_IF_ERROR(ResetTensor<int32>(dev_prenms_nboxes, d));

  return Status::OK();
}

// Initialize index and offset arrays.
// num_images is the batch size.
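// d_boxes_keys_iota is filled with 0..image_size-1 for every image (the values
// fed to the segmented sort), and d_image_offsets with the per-image segment
// offsets used by DeviceSegmentedRadixSort.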
__global__ void InitializeDataKernel(const Gpu2DLaunchConfig config,
                                     int* d_image_offsets,
                                     int* d_boxes_keys_iota) {
  const int image_size = config.virtual_thread_count.x;
  const int num_images = config.virtual_thread_count.y;
  CUDA_AXIS_KERNEL_LOOP(img_idx, config.virtual_thread_count.y, Y) {
    CUDA_AXIS_KERNEL_LOOP(box_idx, config.virtual_thread_count.x, X) {
      d_boxes_keys_iota[img_idx * image_size + box_idx] = box_idx;

      // One 1D line sets the 1D data
      if (box_idx == 0) {
        d_image_offsets[img_idx] = image_size * img_idx;
        // One thread sets the last+1 offset
        if (img_idx == 0) d_image_offsets[num_images] = image_size * num_images;
      }
    }
  }
}

}  // namespace

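// GPU implementation of the GenerateBoundingBoxProposals op: decodes anchor
// deltas into boxes, sorts them by score, and runs per-image NMS to produce
// the output ROIs and their scores.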
class GenerateBoundingBoxProposals : public tensorflow::OpKernel {
 public:
  explicit GenerateBoundingBoxProposals(
      tensorflow::OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("post_nms_topn", &post_nms_topn_));
    OP_REQUIRES(context, post_nms_topn_ > 0,
                errors::InvalidArgument("post_nms_topn can't be 0 or less"));
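    // Upper bound for the encoded dw/dh so expf() in the decode kernel stays
    // bounded; log(1000.0 / 16.0) is the usual Faster R-CNN default.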
    bbox_xform_clip_default_ = log(1000.0 / 16.);
  }

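  // Reads host-scalar input `input` into *value; returns InvalidArgument if
  // the input is not a scalar.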
  template <typename T>
  Status GetScalarValue(OpKernelContext* context, int input, T* value) {
    const Tensor& scalar_tensor = context->input(input);
    if (!TensorShapeUtils::IsScalar(scalar_tensor.shape())) {
      return errors::InvalidArgument("Expected a scalar in input ", input,
                                     " but got shape ",
                                     scalar_tensor.shape().DebugString());
    }
    *value = scalar_tensor.scalar<T>()();
    return Status::OK();
  }

  void Compute(tensorflow::OpKernelContext* context) override {
    VLOG(1) << "Starting Compute " << name();
    const auto scores = context->input(0);
    const auto bbox_deltas = context->input(1);
    const auto image_info = context->input(2);
    const auto anchors = context->input(3);
    const auto num_images = scores.dim_size(0);
    const auto num_anchors = scores.dim_size(3);
    const auto height = scores.dim_size(1);
    const auto width = scores.dim_size(2);
    const auto box_dim = anchors.dim_size(2) / num_anchors;
    OP_REQUIRES(context, box_dim == 4,
                errors::OutOfRange("Box dimensions need to be 4"));
    // TODO(skama): make sure that inputs are ok.
    const int image_stride = height * width;
    const int conv_layer_nboxes =
        image_stride *
        num_anchors;  // total number of boxes when decoded on anchors.
    // The following calls to CUB primitives do nothing
    // (because the first arg is nullptr)
    // except setting cub_*_temp_storage_bytes
    float nms_threshold;
    int pre_nms_topn;
    float min_size;
    OP_REQUIRES_OK(context, GetScalarValue(context, 4, &nms_threshold));
    if (nms_threshold < 0 || nms_threshold > 1.0) {
      context->SetStatus(errors::InvalidArgument(
          "nms_threshold should be between 0 and 1. Got ", nms_threshold));
      return;
    }
    OP_REQUIRES_OK(context, GetScalarValue(context, 5, &pre_nms_topn));
    if (pre_nms_topn <= 0) {
      context->SetStatus(errors::InvalidArgument(
          "pre_nms_topn should be greater than 0. Got ", pre_nms_topn));
      return;
    }
    OP_REQUIRES_OK(context, GetScalarValue(context, 6, &min_size));
    auto cuda_stream = GetGpuStream(context);
    size_t cub_sort_temp_storage_bytes = 0;
    float* flt_ptr = nullptr;
    int* int_ptr = nullptr;
    cudaError_t cuda_ret =
        gpuprim::DeviceSegmentedRadixSort::SortPairsDescending(
            nullptr, cub_sort_temp_storage_bytes, flt_ptr, flt_ptr, int_ptr,
            int_ptr, num_images * conv_layer_nboxes, num_images, int_ptr,
            int_ptr, 0, 8 * sizeof(float),  // sort all bits
            cuda_stream);
    TF_OP_REQUIRES_CUDA_SUCCESS(context, cuda_ret);
    // get the size of select temp buffer
    size_t cub_select_temp_storage_bytes = 0;
    char* char_ptr = nullptr;
    float4* f4_ptr = nullptr;
    TF_OP_REQUIRES_CUDA_SUCCESS(
        context, gpuprim::DeviceSelect::Flagged(
                     nullptr, cub_select_temp_storage_bytes, f4_ptr, char_ptr,
                     f4_ptr, int_ptr, image_stride * num_anchors, cuda_stream));
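    // A single scratch buffer sized for the larger of the two requests is
    // reused for both the segmented sort and the DeviceSelect calls below.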
    Tensor d_conv_layer_indexes;  // box indices on device
    Tensor d_image_offset;        // starting offsets of boxes for each image
    Tensor d_cub_temp_buffer;     // buffer for cub sorting
    Tensor d_sorted_conv_layer_indexes;  // output of cub sorting, indices of
                                         // the sorted boxes
    Tensor dev_sorted_scores;            // sorted scores, cub output
    Tensor dev_boxes;                    // boxes on device
    Tensor dev_boxes_keep_flags;  // bitmask for keeping the boxes or rejecting
                                  // from output
    const int nboxes_to_generate = std::min(conv_layer_nboxes, pre_nms_topn);
    size_t cub_temp_storage_bytes =
        std::max(cub_sort_temp_storage_bytes, cub_select_temp_storage_bytes);
    OP_REQUIRES_OK(
        context,
        AllocateGenerationTempTensors(
            context, &d_conv_layer_indexes, &d_image_offset, &d_cub_temp_buffer,
            &d_sorted_conv_layer_indexes, &dev_sorted_scores, &dev_boxes,
            &dev_boxes_keep_flags, num_images, conv_layer_nboxes,
            cub_temp_storage_bytes, nboxes_to_generate, box_dim));
    const GPUDevice& d = context->eigen_device<GPUDevice>();
    Gpu2DLaunchConfig conf2d =
        GetGpu2DLaunchConfig(conv_layer_nboxes, num_images, d);
    // create box indices and offsets for each image on device
    OP_REQUIRES_OK(
        context, GpuLaunchKernel(InitializeDataKernel, conf2d.block_count,
                                 conf2d.thread_per_block, 0, d.stream(), conf2d,
                                 d_image_offset.flat<int>().data(),
                                 d_conv_layer_indexes.flat<int>().data()));

    // sort boxes with their scores.
    // d_sorted_conv_layer_indexes will hold the original box indices,
    // reordered by descending score.
    TF_OP_REQUIRES_CUDA_SUCCESS(
        context,
        gpuprim::DeviceSegmentedRadixSort::SortPairsDescending(
            d_cub_temp_buffer.flat<int8>().data(), cub_temp_storage_bytes,
            scores.flat<float>().data(), dev_sorted_scores.flat<float>().data(),
            d_conv_layer_indexes.flat<int>().data(),
            d_sorted_conv_layer_indexes.flat<int>().data(),
            num_images * conv_layer_nboxes, num_images,
            d_image_offset.flat<int>().data(),
            d_image_offset.flat<int>().data() + 1, 0,
            8 * sizeof(float),  // sort all bits
            cuda_stream));
    // Keeping only the topN pre_nms
    conf2d = GetGpu2DLaunchConfig(nboxes_to_generate, num_images, d);

    // create boxes y1,x1,y2,x2 from box_deltas and anchors (decode the boxes),
    // and mark boxes that are smaller than min_size to be ignored.
    OP_REQUIRES_OK(
        context,
        GpuLaunchKernel(
            GeneratePreNMSUprightBoxesKernel, conf2d.block_count,
            conf2d.thread_per_block, 0, d.stream(), conf2d,
            d_sorted_conv_layer_indexes.flat<int>().data(),
            reinterpret_cast<const float4*>(bbox_deltas.flat<float>().data()),
            reinterpret_cast<const float4*>(anchors.flat<float>().data()),
            height, width, num_anchors, min_size,
            image_info.flat<float>().data(), bbox_xform_clip_default_,
            reinterpret_cast<float4*>(dev_boxes.flat<float>().data()),
            nboxes_to_generate,
            (char*)dev_boxes_keep_flags.flat<int8>().data()));
    const int nboxes_generated = nboxes_to_generate;
    const int roi_cols = box_dim;
    Tensor dev_image_prenms_boxes;
    Tensor dev_image_prenms_scores;
    Tensor dev_image_boxes_keep_list;
    Tensor dev_postnms_rois;
    Tensor dev_postnms_rois_probs;
    Tensor dev_prenms_nboxes;
    // Allocate workspaces needed for NMS
    OP_REQUIRES_OK(
        context, AllocatePreNMSTempTensors(
                     context, &dev_image_prenms_boxes, &dev_image_prenms_scores,
                     &dev_image_boxes_keep_list, &dev_postnms_rois,
                     &dev_postnms_rois_probs, &dev_prenms_nboxes, num_images,
                     nboxes_generated, box_dim, post_nms_topn_, pre_nms_topn));
    // get the pointers for temp storages
    int* d_prenms_nboxes = dev_prenms_nboxes.flat<int>().data();
    int h_prenms_nboxes = 0;
    char* d_cub_temp_storage = (char*)d_cub_temp_buffer.flat<int8>().data();
    float* d_image_prenms_boxes = dev_image_prenms_boxes.flat<float>().data();
    float* d_image_prenms_scores = dev_image_prenms_scores.flat<float>().data();
    int* d_image_boxes_keep_list = dev_image_boxes_keep_list.flat<int>().data();

    int nrois_in_output = 0;
    // get the pointers to boxes and scores
    char* d_boxes_keep_flags = (char*)dev_boxes_keep_flags.flat<int8>().data();
    float* d_boxes = dev_boxes.flat<float>().data();
    float* d_sorted_scores = dev_sorted_scores.flat<float>().data();

    // Create output tensors
    Tensor* output_rois = nullptr;
    Tensor* output_roi_probs = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(
                       0, TensorShape({num_images, post_nms_topn_, roi_cols}),
                       &output_rois));
    OP_REQUIRES_OK(context, context->allocate_output(
                                1, TensorShape({num_images, post_nms_topn_}),
                                &output_roi_probs));
    float* d_postnms_rois = (*output_rois).flat<float>().data();
    float* d_postnms_rois_probs = (*output_roi_probs).flat<float>().data();
    gpuEvent_t copy_done;
    gpuEventCreate(&copy_done);

    // Do per-image NMS
    for (int image_index = 0; image_index < num_images; ++image_index) {
      // reset output workspaces
      OP_REQUIRES_OK(context,
                     ResetTensor<int32>(&dev_image_boxes_keep_list, d));
      // Sub matrices for current image
      // boxes
      const float* d_image_boxes =
          &d_boxes[image_index * nboxes_generated * box_dim];
      // scores
      const float* d_image_sorted_scores =
          &d_sorted_scores[image_index * image_stride * num_anchors];
      // keep flags
      char* d_image_boxes_keep_flags =
          &d_boxes_keep_flags[image_index * nboxes_generated];

      // Output buffer for image
      float* d_image_postnms_rois =
          &d_postnms_rois[image_index * roi_cols * post_nms_topn_];
      float* d_image_postnms_rois_probs =
          &d_postnms_rois_probs[image_index * post_nms_topn_];

      // Compact the valid boxes (i.e. the ones with
      // d_boxes_keep_flags[ibox] == true) into the pre-NMS workspace buffers.
      TF_OP_REQUIRES_CUDA_SUCCESS(
          context, gpuprim::DeviceSelect::Flagged(
                       d_cub_temp_storage, cub_temp_storage_bytes,
                       reinterpret_cast<const float4*>(d_image_boxes),
                       d_image_boxes_keep_flags,
                       reinterpret_cast<float4*>(d_image_prenms_boxes),
                       d_prenms_nboxes, nboxes_generated, d.stream()));
      TF_OP_REQUIRES_CUDA_SUCCESS(
          context,
          gpuprim::DeviceSelect::Flagged(
              d_cub_temp_storage, cub_temp_storage_bytes, d_image_sorted_scores,
              d_image_boxes_keep_flags, d_image_prenms_scores, d_prenms_nboxes,
              nboxes_generated, d.stream()));
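      // The number of boxes that survived the keep-flag selection is needed on
      // the host to size the NMS call below, so copy it back and wait on an
      // event for the copy to complete.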
      d.memcpyDeviceToHost(&h_prenms_nboxes, d_prenms_nboxes, sizeof(int));
      TF_OP_REQUIRES_CUDA_SUCCESS(context,
                                  gpuEventRecord(copy_done, d.stream()));
      TF_OP_REQUIRES_CUDA_SUCCESS(context, gpuEventSynchronize(copy_done));
      // We know prenms_boxes <= topN_prenms, because nboxes_generated <=
      // topN_prenms. Calling NMS on the generated boxes.
      const int prenms_nboxes = h_prenms_nboxes;
      int nkeep;
      OP_REQUIRES_OK(context, NmsGpu(d_image_prenms_boxes, prenms_nboxes,
                                     nms_threshold, d_image_boxes_keep_list,
                                     &nkeep, context, post_nms_topn_));
      // All operations done after the previous sort have preserved the
      // relative order of the elements, so they are still sorted by score;
      // keeping the topN therefore amounts to truncating the array.
      const int postnms_nboxes = std::min(nkeep, post_nms_topn_);
      // Moving the out boxes to the output tensors,
      // adding the image_index dimension on the fly
      GpuLaunchConfig config = GetGpuLaunchConfig(post_nms_topn_, d);
      // make this a single kernel
      OP_REQUIRES_OK(
          context,
          GpuLaunchKernel(WriteUprightBoxesOutput, config.block_count,
                          config.thread_per_block, 0, d.stream(), config,
                          reinterpret_cast<const float4*>(d_image_prenms_boxes),
                          d_image_prenms_scores, d_image_boxes_keep_list,
                          postnms_nboxes, d_image_postnms_rois,
                          d_image_postnms_rois_probs));
      nrois_in_output += postnms_nboxes;
      TF_OP_REQUIRES_CUDA_SUCCESS(context, cudaGetLastError());
    }
  }

 private:
  int post_nms_topn_;
  float bbox_xform_clip_default_;
};

REGISTER_KERNEL_BUILDER(Name("GenerateBoundingBoxProposals")
                            .Device(tensorflow::DEVICE_GPU)
                            .HostMemory("nms_threshold")
                            .HostMemory("min_size")
                            .HostMemory("pre_nms_topn"),
                        tensorflow::GenerateBoundingBoxProposals);
}  // namespace tensorflow
#endif