#include <ATen/native/vulkan/impl/Packing.h>
#include <ATen/native/vulkan/ops/Common.h>
#include <ATen/native/vulkan/ops/Utils.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/cat.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/narrow.h>
#include <ATen/ops/zeros.h>
#endif

namespace at {
namespace native {
namespace vulkan {
namespace ops {

namespace utils {

using namespace api::utils;

/*
 * This function formats an input tensor in NCHW layout to NC4HW layout such
 * that the buffer of the formatted tensor can be directly copied into a GPU
 * texture. Conceptually, the formatting can be achieved via the following
 * steps:
 *
 * 1. Given that the src tensor has size {N,C,H,W}
 *
 * 2. Determine the amount of padding to add: determine how many channels must
 *    be added to each batch to align C to the next multiple of 4 (C_aligned)
 *
 * 3. Add the padding channels to each batch; the shape of the tensor is now
 *    {N, C_aligned, H, W}
 *
 * 4. Combine the batch and channel dims by reshaping to {NC_aligned, H, W},
 *    where NC_aligned = N * C_aligned is a multiple of four
 *
 * 5. Split the batch-channel dimension into groups of 4 by reshaping the tensor
 *    to size {NC_aligned/4, 4, H, W}
 *
 * 6. The groups of 4 channels (dim 1) should be contiguous. Therefore, permute
 *    the dims of the tensor in the order {0, 2, 3, 1}
 *
 * 7. Finally, return a contiguous version of the tensor. The final shape of the
 *    tensor would be {NC_aligned/4, H, W, 4}
 */
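/*
 * Worked example (shapes only): for a src tensor of size {1, 6, 2, 2},
 * C_aligned = 8 and NC4 = (1 * 8) / 4 = 2. Two zero channels are appended to
 * give {1, 8, 2, 2}, which is reshaped to {2, 4, 2, 2}, permuted to
 * {2, 2, 2, 4}, and returned as a contiguous tensor of size {2, 2, 2, 4}.
 */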
Tensor nchw_to_nc4hw(const Tensor& src) {
  uint32_t N = get_dim<Dim4D::Batch>(src.sizes());
  uint32_t C = get_dim<Dim4D::Channel>(src.sizes());
  uint32_t H = get_dim<Dim4D::Height>(src.sizes());
  uint32_t W = get_dim<Dim4D::Width>(src.sizes());

  uint32_t C_aligned = api::utils::align_up(C, 4u);
  uint32_t NC4 = (N * C_aligned) / 4;

  // Add padding to the tensor so that the channel dim is a multiple of 4
  Tensor padding = at::zeros({N, C_aligned - C, H, W}, src.options());
  Tensor src_padded = at::cat({src.reshape({N, C, H, W}), padding}, 1);
  // Reshape to split the channels into groups of 4, then permute so that each
  // group of 4 channels lands in the last dimension and is contiguous in memory
  Tensor src_NC4HW = src_padded.reshape({NC4, 4, H, W}).permute({0, 2, 3, 1});

  // Return a contiguous version of the tensor
  return src_NC4HW.contiguous();
}

/*
 * Creates a staging tensor into which texture data, which will be in NC4HW
 * format, can be copied directly. The shape of the staging tensor will be the
 * same as the tensor produced by a call to nchw_to_nc4hw().
 */
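/*
 * For example, for a vTensor with sizes {1, 6, 2, 2}, NC4 = 1 * div_up(6, 4)
 * = 2, so the staging tensor has size {2, 2, 2, 4}, matching the output of
 * nchw_to_nc4hw() above for the same sizes.
 */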
Tensor create_staging_tensor(const vTensor& v_in) {
  uint32_t N = get_dim<Dim4D::Batch>(v_in.sizes());
  uint32_t C = get_dim<Dim4D::Channel>(v_in.sizes());
  uint32_t H = get_dim<Dim4D::Height>(v_in.sizes());
  uint32_t W = get_dim<Dim4D::Width>(v_in.sizes());

  uint32_t NC4 = N * api::utils::div_up(C, 4u);

  // Note that the dtype corresponding with the texture format of the vTensor is
  // used instead of options().dtype(). This is to ensure the number of bytes in
  // the staging tensor matches the number of bytes in the image texture. Refer
  // to comments for api::vk_format()
  return at::empty(
      {NC4, H, W, 4},
      at::device(at::kCPU).dtype(convert_dtype(v_in.texture_dtype())));
}

/*
 * After copying texture data, which will be in NC4HW format, to a staging
 * tensor created by create_staging_tensor(), this function reformats the
 * tensor to NCHW format. It essentially reverses the transformations made by
 * nchw_to_nc4hw().
 *
 * Note that the sizes of the original tensor must be passed in to fully restore
 * the properties of the original tensor.
 */
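/*
 * Illustrative round trip: for a CPU tensor t with at most 4 dims,
 * nc4hw_to_nchw(nchw_to_nc4hw(t), t.sizes()) reproduces t, since the padding
 * channels added by nchw_to_nc4hw() are stripped off again here.
 */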
Tensor nc4hw_to_nchw(const Tensor& t_in, IntArrayRef sizes) {
  uint32_t N = get_dim<Dim4D::Batch>(sizes);
  uint32_t C = get_dim<Dim4D::Channel>(sizes);
  uint32_t H = get_dim<Dim4D::Height>(sizes);
  uint32_t W = get_dim<Dim4D::Width>(sizes);

  uint32_t C_aligned = api::utils::align_up(C, 4u);

  // Undo the permute step and channel grouping step
  Tensor t_in_padded = t_in.permute({0, 3, 1, 2}).reshape({N, C_aligned, H, W});
  // Remove the padding channels
  Tensor t_in_shaved =
      at::narrow(t_in_padded, /*dim=*/1, /*start=*/0, /*length=*/C);

  // Reshape to the original sizes and return a contiguous Tensor
  return t_in_shaved.reshape(sizes).contiguous();
}

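/*
 * Records a copy of the contents of src_buffer into the image texture backing
 * v_dst. The source buffer must contain the same number of bytes as the
 * destination texture.
 */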
void copy_buffer_to_vtensor(
    api::VulkanBuffer& src_buffer,
    vTensor& v_dst,
    api::PipelineBarrier& pipeline_barrier) {
  api::Context* const context = api::context();

  TORCH_CHECK(
      src_buffer.mem_size() == v_dst.gpu_nbytes(),
      "Vulkan copy_buffer_to_vtensor: source buffer and destination texture "
      "do not have the same number of bytes");

  context->submit_copy<api::VulkanBuffer, api::VulkanImage>(
      // pipeline barrier
      pipeline_barrier,
      // resources
      src_buffer,
      v_dst.image(
          pipeline_barrier,
          api::PipelineStage::TRANSFER,
          api::MemoryAccessType::WRITE),
      // copy details
      v_dst.extents(),
      {0u, 0u, 0u},
      {0u, 0u, 0u},
      // fence handle
      VK_NULL_HANDLE);
}

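/*
 * Records a copy of the full contents of the src staging buffer into the dst
 * staging buffer.
 */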
void copy_buffer_to_buffer(
    api::Context* const context,
    api::StorageBuffer& src,
    api::StorageBuffer& dst,
    VkFence fence_handle) {
  api::PipelineBarrier pipeline_barrier{};

  context->submit_copy<api::VulkanBuffer, api::VulkanBuffer>(
      // pipeline barrier
      pipeline_barrier,
      // resources
      src.buffer(),
      dst.buffer(),
      // copy details
      {static_cast<uint32_t>(src.buffer().mem_size()), 0u, 0u},
      {0u, 0u, 0u},
      {0u, 0u, 0u},
      // fence handle
      fence_handle);
}

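/*
 * Records a copy of the image texture backing v_src into dst_buffer. The
 * destination buffer must contain the same number of bytes as the source
 * texture.
 */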
void copy_vtensor_to_buffer(
    vTensor& v_src,
    api::VulkanBuffer& dst_buffer,
    api::PipelineBarrier& pipeline_barrier,
    const VkFence fence_handle) {
  api::Context* const context = api::context();

  TORCH_CHECK(
      v_src.gpu_nbytes() == dst_buffer.mem_size(),
      "Vulkan copy_vtensor_to_buffer: source texture and destination buffer "
      "do not have the same number of bytes");

  context->submit_copy<api::VulkanImage, api::VulkanBuffer>(
      // pipeline barrier
      pipeline_barrier,
      // resources
      v_src.image(
          pipeline_barrier,
          api::PipelineStage::TRANSFER,
          api::MemoryAccessType::READ),
      dst_buffer,
      // copy details
      v_src.extents(),
      {0u, 0u, 0u},
      {0u, 0u, 0u},
      // fence handle
      fence_handle);
}

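/*
 * Records a compute shader dispatch that packs NCHW data from buffer into the
 * storage (buffer or image texture) underlying v_self.
 */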
void pack_buffer_to_vtensor(
    api::VulkanBuffer& buffer,
    vTensor& v_self,
    api::PipelineBarrier& pipeline_barrier) {
  api::Context* const context = api::context();

  if (v_self.storage_type() == api::StorageType::BUFFER) {
    packing::record_nchw_to_buffer_op(
        context, buffer, v_self, pipeline_barrier, VK_NULL_HANDLE);
  } else {
    api::ShaderInfo compute_shader = packing::get_nchw_to_image_shader(v_self);
    packing::record_nchw_to_image_op(
        context,
        compute_shader,
        buffer,
        v_self,
        pipeline_barrier,
        VK_NULL_HANDLE);
  }
}

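/*
 * Convenience wrapper around pack_buffer_to_vtensor() that uses an empty
 * pipeline barrier.
 */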
void pack_staging_to_vtensor(api::VulkanBuffer& staging, vTensor& v_self) {
  api::PipelineBarrier pipeline_barrier{};
  pack_buffer_to_vtensor(staging, v_self, pipeline_barrier);
}

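/*
 * Records a compute shader dispatch that unpacks the contents of v_self into
 * staging in NCHW order, forwarding the return value of the underlying
 * packing op.
 */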
bool pack_vtensor_to_staging(
    vTensor& v_self,
    api::VulkanBuffer& staging,
    const VkFence fence_handle) {
  api::Context* const context = api::context();
  api::PipelineBarrier pipeline_barrier{};

  if (v_self.storage_type() == api::StorageType::BUFFER) {
    return packing::record_buffer_to_nchw_op(
        context, v_self, staging, pipeline_barrier, fence_handle);
  } else {
    api::ShaderInfo compute_shader = packing::get_image_to_nchw_shader(v_self);
    return packing::record_image_to_nchw_op(
        context,
        compute_shader,
        v_self,
        staging,
        pipeline_barrier,
        fence_handle);
  }
}

/*
 * Broadcasting Utils
 */

// Check whether two tensors are broadcastable; throws if they are not
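// e.g. sizes {1, 3, 4, 4} and {3, 1, 4} are broadcastable, while {2, 3} and
// {3, 2} are not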
void is_broadcastable(const Tensor& input1, const Tensor& input2) {
  TORCH_CHECK(
      input1.dim() <= 4 && input2.dim() <= 4,
      "Vulkan only supports tensors <= 4 dimensions");

  // check if the shapes of input tensors are broadcastable
  // see https://pytorch.org/docs/stable/notes/broadcasting.html
  // for broadcasting semantics
  const std::string broadcast_error_msg = "Tensors are not broadcastable!";

  if (get_dim<Dim4D::Batch>(input1) != get_dim<Dim4D::Batch>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Batch>(input1) == 1 ||
            get_dim<Dim4D::Batch>(input2) == 1,
        broadcast_error_msg);
  }
  if (get_dim<Dim4D::Channel>(input1) != get_dim<Dim4D::Channel>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Channel>(input1) == 1 ||
            get_dim<Dim4D::Channel>(input2) == 1,
        broadcast_error_msg);
  }
  if (get_dim<Dim4D::Height>(input1) != get_dim<Dim4D::Height>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Height>(input1) == 1 ||
            get_dim<Dim4D::Height>(input2) == 1,
        broadcast_error_msg);
  }
  if (get_dim<Dim4D::Width>(input1) != get_dim<Dim4D::Width>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Width>(input1) == 1 ||
            get_dim<Dim4D::Width>(input2) == 1,
        broadcast_error_msg);
  }
}

// compute the output shape by broadcasting the shapes of t1 and t2
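// e.g. broadcasting {3, 1, 5} with {4, 5} yields {3, 4, 5}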
std::vector<int64_t> broadcast_size(const Tensor& t1, const Tensor& t2) {
  int64_t t1_size = t1.dim();
  int64_t t2_size = t2.dim();

  std::vector<int64_t> out;
  if (t1_size > t2_size) {
    for (int64_t i = 0; i < t1_size; i++) {
      out.push_back(t1.sizes()[i]);
    }
  } else {
    for (int64_t i = 0; i < t2_size; i++) {
      out.push_back(t2.sizes()[i]);
    }
  }

  if (!out.empty()) {
    out[out.size() - 1] =
        std::max(get_dim<Dim4D::Width>(t1), get_dim<Dim4D::Width>(t2));
  }
  if (out.size() > 1) {
    out[out.size() - 2] =
        std::max(get_dim<Dim4D::Height>(t1), get_dim<Dim4D::Height>(t2));
  }
  if (out.size() > 2) {
    out[out.size() - 3] =
        std::max(get_dim<Dim4D::Channel>(t1), get_dim<Dim4D::Channel>(t2));
  }
  if (out.size() > 3) {
    out[out.size() - 4] =
        std::max(get_dim<Dim4D::Batch>(t1), get_dim<Dim4D::Batch>(t2));
  }

  return out;
}

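/*
 * Reads back the texel of the image texture backing a Vulkan tensor at the
 * given position and returns its four components as a vec4. Each component is
 * written to its own single element output tensor by the extract_texel shader
 * and then copied back to the CPU.
 */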
api::utils::vec4 extract_texel(const Tensor& input, const ivec3& pos) {
  api::Context* const context = api::context();

  TORCH_CHECK(input.is_vulkan());
  const vTensor& v_input = convert(input);

  api::PipelineBarrier pipeline_barrier{};

  std::vector<int64_t> output_size{1, 1, 1};

  // Each of the x, y, z and w components is written to its own single element
  // output tensor, and we read back entry (0, 0, 0) from each of them. Using
  // single element outputs keeps the result independent of how the output
  // tensors themselves are packed.
  api::ScalarType dtype = convert_dtype(input.scalar_type());
  vTensor v_outputs_x{context, output_size, dtype};
  vTensor v_outputs_y{context, output_size, dtype};
  vTensor v_outputs_z{context, output_size, dtype};
  vTensor v_outputs_w{context, output_size, dtype};

  const struct Block final {
    ivec3 pos;
  } block{
      pos,
  };

  api::UniformParamsBuffer params(context, block);

  context->submit_compute_job(
      VK_KERNEL(extract_texel),
      pipeline_barrier,
      {1, 1, 1},
      {1, 1, 1},
      VK_NULL_HANDLE,
      v_outputs_x.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_outputs_y.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_outputs_z.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_outputs_w.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_input.image(pipeline_barrier, api::PipelineStage::COMPUTE),
      params.buffer());

  vec4 rv = {
      convert(v_outputs_x).cpu().data_ptr<float>()[0],
      convert(v_outputs_y).cpu().data_ptr<float>()[0],
      convert(v_outputs_z).cpu().data_ptr<float>()[0],
      convert(v_outputs_w).cpu().data_ptr<float>()[0],
  };

  return rv;
}

} // namespace utils
} // namespace ops
} // namespace vulkan
} // namespace native
} // namespace at