/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU

#include <algorithm>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gpu_prim.h"
#include "tensorflow/core/kernels/image/non_max_suppression_op.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"
#include "tensorflow/core/util/gpu_launch_config.h"

namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;

namespace {

// Decode d_bbox_deltas with respect to anchors into absolute coordinates,
// clipping if necessary.
// prenms_nboxes is the maximum number of boxes per image to decode.
// d_boxes_keep_flags is the mask of boxes to consider in NMS.
// min_size is the lower bound of the shortest edge for a box to be considered.
// bbox_xform_clip is the upper bound of the encoded width and height.
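// Sketch of the decoding math implemented below, per coordinate axis:
//   ctr  = anchor_ctr  + anchor_size * delta_ctr
//   size = anchor_size * exp(min(delta_size, bbox_xform_clip))
//   box  = [ctr - 0.5 * size, ctr + 0.5 * size], then clipped to the image.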
__global__ void GeneratePreNMSUprightBoxesKernel(
    const Gpu2DLaunchConfig config, const int* d_sorted_scores_keys,
    const float4* d_bbox_deltas, const float4* d_anchors, const int height,
    const int width, const int num_anchors, const float min_size,
    const float* d_img_info_vec,  // Input "image_info" to the op [N,5]
    const float bbox_xform_clip, float4* d_out_boxes,
    const int prenms_nboxes,  // leading dimension of out_boxes
    char* d_boxes_keep_flags) {
  // Constants to calculate offsets into the input and output arrays.
  const int anchor_stride = height * width;              // Stride of Anchor
  const int height_stride = width * num_anchors;         // Stride of height
  const int image_stride = anchor_stride * num_anchors;  // Stride of image
  CUDA_AXIS_KERNEL_LOOP(image_index, config.virtual_thread_count.y, Y) {
    CUDA_AXIS_KERNEL_LOOP(ibox, config.virtual_thread_count.x, X) {
      // box_conv_index: index of the same box in the scores coming from the
      // conv layer, which have shape (height, width, num_anchors); the
      // num_images dimension was already removed.
      const int box_conv_index =
          d_sorted_scores_keys[image_index * image_stride + ibox];

      // We want to decompose box_conv_index into (h,w,a)
      // such that box_conv_index = h*width*num_anchors + w*num_anchors + a
      // (avoiding modulos in the process).
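      // Worked example (illustrative values): with width=3 and num_anchors=4,
      // box_conv_index=17 gives h = 17/12 = 1, remaining = 5, w = 5/4 = 1,
      // and a = 5 - 4 = 1.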
      int remaining = box_conv_index;
      const int delta_height = height_stride;  // stride of height
      const int h = remaining / delta_height;
      remaining -= h * delta_height;
      const int delta_width = num_anchors;  // stride of width
      const int w = remaining / delta_width;
      remaining -= w * delta_width;
      // Loading the anchor a
      // float4 is a struct with float x,y,z,w
      const float4 anchor = d_anchors[box_conv_index];
      // x1,y1,x2,y2: coordinates of anchor a, shifted for position (h,w)
      float x1 = anchor.y;
      float x2 = anchor.w;
      float y1 = anchor.x;
      float y2 = anchor.z;

      // TODO use fast math when possible

      // Deltas of shape (N,height,width,num_anchors x 4)
      int deltas_idx = box_conv_index + image_index * image_stride;
      float4 deltas = d_bbox_deltas[deltas_idx];
      float dx = deltas.y;
      float dy = deltas.x;
      float dw = deltas.w;
      float dh = deltas.z;
      // Upper bound on dw,dh
      dw = fmin(dw, bbox_xform_clip);
      dh = fmin(dh, bbox_xform_clip);
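      // Note: with the op's default clip of log(1000.0/16.0) (set in the op's
      // constructor), expf(dw) and expf(dh) are bounded by 1000/16, which
      // keeps the predicted size from blowing up.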

      // Applying the deltas
      float width = x2 - x1;
      const float ctr_x = x1 + 0.5f * width;
      const float pred_ctr_x = ctr_x + width * dx;  // TODO fuse madd
      const float pred_w = width * expf(dw);
      x1 = pred_ctr_x - 0.5f * pred_w;
      x2 = pred_ctr_x + 0.5f * pred_w;

      float height = y2 - y1;
      const float ctr_y = y1 + 0.5f * height;
      const float pred_ctr_y = ctr_y + height * dy;
      const float pred_h = height * expf(dh);
      y1 = pred_ctr_y - 0.5f * pred_h;
      y2 = pred_ctr_y + 0.5f * pred_h;

      // Clipping box to image
      const float img_height = d_img_info_vec[5 * image_index + 0];
      const float img_width = d_img_info_vec[5 * image_index + 1];
      const float min_size_scaled =
          min_size * d_img_info_vec[5 * image_index + 2];
      x1 = fmax(fmin(x1, img_width), 0.0f);
      y1 = fmax(fmin(y1, img_height), 0.0f);
      x2 = fmax(fmin(x2, img_width), 0.0f);
      y2 = fmax(fmin(y2, img_height), 0.0f);

      // Filter boxes
      // Removing boxes with one dim < min_size
      // (center of box is in image, because of previous step)
      width = x2 - x1;  // may have changed
      height = y2 - y1;
      bool keep_box = fmin(width, height) >= min_size_scaled;

      // We are not deleting the box right now even if !keep_box,
      // because we want to keep the relative order of the elements stable;
      // the actual filtering happens later.
      // d_boxes_keep_flags size: (num_images,prenms_nboxes)
      // d_out_boxes size: (num_images,prenms_nboxes)
      const int out_index = image_index * prenms_nboxes + ibox;

      d_boxes_keep_flags[out_index] = keep_box;
      d_out_boxes[out_index] = {x1, y1, x2, y2};
    }
  }
}

// Copy the selected boxes and scores to output tensors.
//
__global__ void WriteUprightBoxesOutput(
    const GpuLaunchConfig nboxes, const float4* d_image_boxes,
    const float* d_image_scores, const int* d_image_boxes_keep_list,
    const int n_rois, float* d_image_out_rois, float* d_image_out_rois_probs) {
  CUDA_1D_KERNEL_LOOP(i, nboxes.virtual_thread_count) {
    if (i < n_rois) {  // copy rois to output
      const int ibox = d_image_boxes_keep_list[i];
      const float4 box = d_image_boxes[ibox];
      const float score = d_image_scores[ibox];
      // Scattered memory accesses
      // postnms_nboxes is small anyway
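      // Boxes were stored above as {x1, y1, x2, y2}; the output rois are
      // written in (y1, x1, y2, x2) order, hence the component swizzle below.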
      d_image_out_rois_probs[i] = score;
      const int base_idx = 4 * i;
      d_image_out_rois[base_idx + 0] = box.y;
      d_image_out_rois[base_idx + 1] = box.x;
      d_image_out_rois[base_idx + 2] = box.w;
      d_image_out_rois[base_idx + 3] = box.z;
    } else {  // set trailing entries to 0
      d_image_out_rois_probs[i] = 0.;
      const int base_idx = 4 * i;
      d_image_out_rois[base_idx + 0] = 0.;
      d_image_out_rois[base_idx + 1] = 0.;
      d_image_out_rois[base_idx + 2] = 0.;
      d_image_out_rois[base_idx + 3] = 0.;
    }
  }
}

template <typename T>
Status ResetTensor(Tensor* t, const Eigen::GpuDevice& d) {
  GpuLaunchConfig zconfig = GetGpuLaunchConfig(t->NumElements(), d);
  return GpuLaunchKernel(SetZero<T>, zconfig.block_count,
                         zconfig.thread_per_block, 0, d.stream(),
                         zconfig.virtual_thread_count, (*t).flat<T>().data());
}

// Allocate the scratch spaces that are needed for the operation.
Status AllocateGenerationTempTensors(
    OpKernelContext* context, Tensor* d_conv_layer_indexes,
    Tensor* d_image_offset, Tensor* d_cub_temp_buffer,
    Tensor* d_sorted_conv_layer_indexes, Tensor* d_sorted_scores,
    Tensor* dev_boxes, Tensor* dev_boxes_keep_flags, int num_images,
    int conv_layer_nboxes, size_t cub_temp_storage_bytes,
    int num_boxes_to_generate, int box_dim) {
  auto d = context->eigen_gpu_device();
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_images, conv_layer_nboxes}),
      d_conv_layer_indexes));
  TF_RETURN_IF_ERROR(ResetTensor<int>(d_conv_layer_indexes, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_images + 1}), d_image_offset));
  TF_RETURN_IF_ERROR(ResetTensor<int>(d_image_offset, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({(int64)cub_temp_storage_bytes}),
      d_cub_temp_buffer));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_images, conv_layer_nboxes}),
      d_sorted_conv_layer_indexes));
  TF_RETURN_IF_ERROR(ResetTensor<int32>(d_sorted_conv_layer_indexes, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_images, conv_layer_nboxes}),
      d_sorted_scores));
  TF_RETURN_IF_ERROR(ResetTensor<float>(d_sorted_scores, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT,
      TensorShape({num_images, box_dim * num_boxes_to_generate}), dev_boxes));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_boxes, d));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({num_images, num_boxes_to_generate}),
      dev_boxes_keep_flags));
  TF_RETURN_IF_ERROR(ResetTensor<int8>(dev_boxes_keep_flags, d));
  return Status::OK();
}

// Allocate workspace for the NMS operation.
Status AllocatePreNMSTempTensors(
    OpKernelContext* context, Tensor* dev_image_prenms_boxes,
    Tensor* dev_image_prenms_scores, Tensor* dev_image_boxes_keep_list,
    Tensor* dev_postnms_rois, Tensor* dev_postnms_rois_probs,
    Tensor* dev_prenms_nboxes, int num_images, int num_boxes_to_generate,
    int box_dim, int post_nms_topn, int pre_nms_topn) {
  auto d = context->eigen_gpu_device();
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({box_dim * num_boxes_to_generate}),
      dev_image_prenms_boxes));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_image_prenms_boxes, d));

  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_boxes_to_generate}),
      dev_image_prenms_scores));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_image_prenms_scores, d));

  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes_to_generate}),
      dev_image_boxes_keep_list));
  TF_RETURN_IF_ERROR(ResetTensor<int32>(dev_image_boxes_keep_list, d));

  const int max_postnms_nboxes =
      std::min(num_boxes_to_generate, post_nms_topn);
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT,
      TensorShape({box_dim * num_images * max_postnms_nboxes}),
      dev_postnms_rois));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_postnms_rois, d));

  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_images * max_postnms_nboxes}),
      dev_postnms_rois_probs));
  TF_RETURN_IF_ERROR(ResetTensor<float>(dev_postnms_rois_probs, d));

  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_images}), dev_prenms_nboxes));
  TF_RETURN_IF_ERROR(ResetTensor<int32>(dev_prenms_nboxes, d));

  return Status::OK();
}

// Initialize index and offset arrays.
// num_images is the batch size.
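// After this kernel runs, d_boxes_keys_iota holds [0, 1, ..., image_size-1]
// for every image and d_image_offsets holds
// [0, image_size, 2*image_size, ..., num_images*image_size].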
__global__ void InitializeDataKernel(const Gpu2DLaunchConfig config,
                                     int* d_image_offsets,
                                     int* d_boxes_keys_iota) {
  const int image_size = config.virtual_thread_count.x;
  const int num_images = config.virtual_thread_count.y;
  CUDA_AXIS_KERNEL_LOOP(img_idx, config.virtual_thread_count.y, Y) {
    CUDA_AXIS_KERNEL_LOOP(box_idx, config.virtual_thread_count.x, X) {
      d_boxes_keys_iota[img_idx * image_size + box_idx] = box_idx;

      // One 1D line sets the 1D data
      if (box_idx == 0) {
        d_image_offsets[img_idx] = image_size * img_idx;
        // One thread sets the last+1 offset
        if (img_idx == 0) d_image_offsets[num_images] = image_size * num_images;
      }
    }
  }
}

}  // namespace

class GenerateBoundingBoxProposals : public tensorflow::OpKernel {
 public:
  explicit GenerateBoundingBoxProposals(
      tensorflow::OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("post_nms_topn", &post_nms_topn_));
    OP_REQUIRES(context, post_nms_topn_ > 0,
                errors::InvalidArgument("post_nms_topn can't be 0 or less"));
    bbox_xform_clip_default_ = log(1000.0 / 16.);
  }

  template <typename T>
  Status GetScalarValue(OpKernelContext* context, int input, T* value) {
    const Tensor& scalar_tensor = context->input(input);
    if (!TensorShapeUtils::IsScalar(scalar_tensor.shape())) {
      return errors::InvalidArgument("Expected a scalar in input ", input,
                                     ", but got shape ",
                                     scalar_tensor.shape().DebugString());
    }
    *value = scalar_tensor.scalar<T>()();
    return Status::OK();
  }


  void Compute(tensorflow::OpKernelContext* context) override {
    VLOG(1) << "Starting Compute " << name();
    const auto scores = context->input(0);
    const auto bbox_deltas = context->input(1);
    const auto image_info = context->input(2);
    const auto anchors = context->input(3);
    const auto num_images = scores.dim_size(0);
    const auto num_anchors = scores.dim_size(3);
    const auto height = scores.dim_size(1);
    const auto width = scores.dim_size(2);
    const auto box_dim = anchors.dim_size(2) / num_anchors;
    OP_REQUIRES(context, box_dim == 4,
                errors::OutOfRange("Box dimensions need to be 4"));
    // TODO(skama): make sure that inputs are ok.
    const int image_stride = height * width;
    const int conv_layer_nboxes =
        image_stride *
        num_anchors;  // total number of boxes when decoded on anchors.
    // The following calls to CUB primitives do nothing
    // (because the first arg is nullptr)
    // except setting cub_*_temp_storage_bytes.
    float nms_threshold;
    int pre_nms_topn;
    float min_size;
    OP_REQUIRES_OK(context, GetScalarValue(context, 4, &nms_threshold));
    if (nms_threshold < 0 || nms_threshold > 1.0) {
      context->SetStatus(errors::InvalidArgument(
          "nms_threshold should be between 0 and 1. Got ", nms_threshold));
      return;
    }
    OP_REQUIRES_OK(context, GetScalarValue(context, 5, &pre_nms_topn));
    if (pre_nms_topn <= 0) {
      context->SetStatus(errors::InvalidArgument(
          "pre_nms_topn should be greater than 0. Got ", pre_nms_topn));
      return;
    }
    OP_REQUIRES_OK(context, GetScalarValue(context, 6, &min_size));
    auto cuda_stream = GetGpuStream(context);
    size_t cub_sort_temp_storage_bytes = 0;
    float* flt_ptr = nullptr;
    int* int_ptr = nullptr;
    cudaError_t cuda_ret =
        gpuprim::DeviceSegmentedRadixSort::SortPairsDescending(
            nullptr, cub_sort_temp_storage_bytes, flt_ptr, flt_ptr, int_ptr,
            int_ptr, num_images * conv_layer_nboxes, num_images, int_ptr,
            int_ptr, 0, 8 * sizeof(float),  // sort all bits
            cuda_stream);
    TF_OP_REQUIRES_CUDA_SUCCESS(context, cuda_ret);
    // Get the size of the select temp buffer.
    size_t cub_select_temp_storage_bytes = 0;
    char* char_ptr = nullptr;
    float4* f4_ptr = nullptr;
    TF_OP_REQUIRES_CUDA_SUCCESS(
        context, gpuprim::DeviceSelect::Flagged(
                     nullptr, cub_select_temp_storage_bytes, f4_ptr, char_ptr,
                     f4_ptr, int_ptr, image_stride * num_anchors, cuda_stream));
    Tensor d_conv_layer_indexes;  // box indices on device
    Tensor d_image_offset;        // starting offsets of boxes for each image
    Tensor d_cub_temp_buffer;     // buffer for cub sorting
    Tensor d_sorted_conv_layer_indexes;  // output of cub sorting, indices of
                                         // the sorted boxes
    Tensor dev_sorted_scores;     // sorted scores, cub output
    Tensor dev_boxes;             // boxes on device
    Tensor dev_boxes_keep_flags;  // bitmask for keeping or rejecting boxes
                                  // from the output
    const int nboxes_to_generate = std::min(conv_layer_nboxes, pre_nms_topn);
    size_t cub_temp_storage_bytes =
        std::max(cub_sort_temp_storage_bytes, cub_select_temp_storage_bytes);
    OP_REQUIRES_OK(
        context,
        AllocateGenerationTempTensors(
            context, &d_conv_layer_indexes, &d_image_offset, &d_cub_temp_buffer,
            &d_sorted_conv_layer_indexes, &dev_sorted_scores, &dev_boxes,
            &dev_boxes_keep_flags, num_images, conv_layer_nboxes,
            cub_temp_storage_bytes, nboxes_to_generate, box_dim));
    const GPUDevice& d = context->eigen_device<GPUDevice>();
    Gpu2DLaunchConfig conf2d =
        GetGpu2DLaunchConfig(conv_layer_nboxes, num_images, d);
    // Create box indices and offsets for each image on device.
    OP_REQUIRES_OK(
        context, GpuLaunchKernel(InitializeDataKernel, conf2d.block_count,
                                 conf2d.thread_per_block, 0, d.stream(), conf2d,
                                 d_image_offset.flat<int>().data(),
                                 d_conv_layer_indexes.flat<int>().data()));

    // Sort boxes with their scores.
    // d_sorted_conv_layer_indexes will hold the original indices of the
    // sorted boxes.
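    // Each image forms one sort segment: segment i covers
    // [d_image_offset[i], d_image_offset[i+1]) in the flattened score array.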
    TF_OP_REQUIRES_CUDA_SUCCESS(
        context,
        gpuprim::DeviceSegmentedRadixSort::SortPairsDescending(
            d_cub_temp_buffer.flat<int8>().data(), cub_temp_storage_bytes,
            scores.flat<float>().data(), dev_sorted_scores.flat<float>().data(),
            d_conv_layer_indexes.flat<int>().data(),
            d_sorted_conv_layer_indexes.flat<int>().data(),
            num_images * conv_layer_nboxes, num_images,
            d_image_offset.flat<int>().data(),
            d_image_offset.flat<int>().data() + 1, 0,
            8 * sizeof(float),  // sort all bits
            cuda_stream));
    // Keep only the top pre_nms_topn boxes per image.
    conf2d = GetGpu2DLaunchConfig(nboxes_to_generate, num_images, d);

    // Decode the boxes from box_deltas and anchors, and mark the boxes which
    // are smaller than min_size to be ignored.
    OP_REQUIRES_OK(
        context,
        GpuLaunchKernel(
            GeneratePreNMSUprightBoxesKernel, conf2d.block_count,
            conf2d.thread_per_block, 0, d.stream(), conf2d,
            d_sorted_conv_layer_indexes.flat<int>().data(),
            reinterpret_cast<const float4*>(bbox_deltas.flat<float>().data()),
            reinterpret_cast<const float4*>(anchors.flat<float>().data()),
            height, width, num_anchors, min_size,
            image_info.flat<float>().data(), bbox_xform_clip_default_,
            reinterpret_cast<float4*>(dev_boxes.flat<float>().data()),
            nboxes_to_generate,
            (char*)dev_boxes_keep_flags.flat<int8>().data()));
    const int nboxes_generated = nboxes_to_generate;
    const int roi_cols = box_dim;
    Tensor dev_image_prenms_boxes;
    Tensor dev_image_prenms_scores;
    Tensor dev_image_boxes_keep_list;
    Tensor dev_postnms_rois;
    Tensor dev_postnms_rois_probs;
    Tensor dev_prenms_nboxes;
    // Allocate workspaces needed for NMS.
    OP_REQUIRES_OK(
        context, AllocatePreNMSTempTensors(
                     context, &dev_image_prenms_boxes, &dev_image_prenms_scores,
                     &dev_image_boxes_keep_list, &dev_postnms_rois,
                     &dev_postnms_rois_probs, &dev_prenms_nboxes, num_images,
                     nboxes_generated, box_dim, post_nms_topn_, pre_nms_topn));
    // Get the pointers for the temp storages.
    int* d_prenms_nboxes = dev_prenms_nboxes.flat<int>().data();
    int h_prenms_nboxes = 0;
    char* d_cub_temp_storage = (char*)d_cub_temp_buffer.flat<int8>().data();
    float* d_image_prenms_boxes = dev_image_prenms_boxes.flat<float>().data();
    float* d_image_prenms_scores = dev_image_prenms_scores.flat<float>().data();
    int* d_image_boxes_keep_list = dev_image_boxes_keep_list.flat<int>().data();

    int nrois_in_output = 0;
    // Get the pointers to boxes and scores.
    char* d_boxes_keep_flags = (char*)dev_boxes_keep_flags.flat<int8>().data();
    float* d_boxes = dev_boxes.flat<float>().data();
    float* d_sorted_scores = dev_sorted_scores.flat<float>().data();

    // Create output tensors.
    Tensor* output_rois = nullptr;
    Tensor* output_roi_probs = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(
                       0, TensorShape({num_images, post_nms_topn_, roi_cols}),
                       &output_rois));
    OP_REQUIRES_OK(context, context->allocate_output(
                                1, TensorShape({num_images, post_nms_topn_}),
                                &output_roi_probs));
    float* d_postnms_rois = (*output_rois).flat<float>().data();
    float* d_postnms_rois_probs = (*output_roi_probs).flat<float>().data();
    gpuEvent_t copy_done;
    gpuEventCreate(&copy_done);

    // Do per-image NMS.
    for (int image_index = 0; image_index < num_images; ++image_index) {
      // Reset output workspaces.
      OP_REQUIRES_OK(context,
                     ResetTensor<int32>(&dev_image_boxes_keep_list, d));
      // Sub matrices for current image
      // boxes
      const float* d_image_boxes =
          &d_boxes[image_index * nboxes_generated * box_dim];
      // scores
      const float* d_image_sorted_scores =
          &d_sorted_scores[image_index * image_stride * num_anchors];
      // keep flags
      char* d_image_boxes_keep_flags =
          &d_boxes_keep_flags[image_index * nboxes_generated];

      // Output buffer for image
      float* d_image_postnms_rois =
          &d_postnms_rois[image_index * roi_cols * post_nms_topn_];
      float* d_image_postnms_rois_probs =
          &d_postnms_rois_probs[image_index * post_nms_topn_];

      // Compact the valid boxes (i.e. the ones with
      // d_boxes_keep_flags[ibox] == true) into the pre-NMS buffers.
      TF_OP_REQUIRES_CUDA_SUCCESS(
          context, gpuprim::DeviceSelect::Flagged(
                       d_cub_temp_storage, cub_temp_storage_bytes,
                       reinterpret_cast<const float4*>(d_image_boxes),
                       d_image_boxes_keep_flags,
                       reinterpret_cast<float4*>(d_image_prenms_boxes),
                       d_prenms_nboxes, nboxes_generated, d.stream()));
      TF_OP_REQUIRES_CUDA_SUCCESS(
          context,
          gpuprim::DeviceSelect::Flagged(
              d_cub_temp_storage, cub_temp_storage_bytes, d_image_sorted_scores,
              d_image_boxes_keep_flags, d_image_prenms_scores, d_prenms_nboxes,
              nboxes_generated, d.stream()));
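      // d_prenms_nboxes now holds (on the device) the number of boxes that
      // passed the min_size filter; copy it back to the host, and use the
      // event below to wait for the copy before reading h_prenms_nboxes.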
      d.memcpyDeviceToHost(&h_prenms_nboxes, d_prenms_nboxes, sizeof(int));
      TF_OP_REQUIRES_CUDA_SUCCESS(context,
                                  gpuEventRecord(copy_done, d.stream()));
      TF_OP_REQUIRES_CUDA_SUCCESS(context, gpuEventSynchronize(copy_done));
      // We know prenms_boxes <= topN_prenms, because nboxes_generated <=
      // topN_prenms. Calling NMS on the generated boxes.
      const int prenms_nboxes = h_prenms_nboxes;
      int nkeep;
      OP_REQUIRES_OK(context, NmsGpu(d_image_prenms_boxes, prenms_nboxes,
                                     nms_threshold, d_image_boxes_keep_list,
                                     &nkeep, context, post_nms_topn_));
      // All operations done after the previous sort preserved the relative
      // order of the elements, so the elements are still sorted; keeping the
      // topN is therefore equivalent to truncating the array.
      const int postnms_nboxes = std::min(nkeep, post_nms_topn_);
      // Move the kept boxes to the output tensors,
      // adding the image_index dimension on the fly.
      GpuLaunchConfig config = GetGpuLaunchConfig(post_nms_topn_, d);
      // TODO: make this a single kernel.
      OP_REQUIRES_OK(
          context,
          GpuLaunchKernel(WriteUprightBoxesOutput, config.block_count,
                          config.thread_per_block, 0, d.stream(), config,
                          reinterpret_cast<const float4*>(d_image_prenms_boxes),
                          d_image_prenms_scores, d_image_boxes_keep_list,
                          postnms_nboxes, d_image_postnms_rois,
                          d_image_postnms_rois_probs));
      nrois_in_output += postnms_nboxes;
      TF_OP_REQUIRES_CUDA_SUCCESS(context, cudaGetLastError());
    }
  }

 private:
  int post_nms_topn_;
  float bbox_xform_clip_default_;
};

REGISTER_KERNEL_BUILDER(Name("GenerateBoundingBoxProposals")
                            .Device(tensorflow::DEVICE_GPU)
                            .HostMemory("nms_threshold")
                            .HostMemory("min_size")
                            .HostMemory("pre_nms_topn"),
                        tensorflow::GenerateBoundingBoxProposals);
}  // namespace tensorflow
#endif