#include <ATen/native/vulkan/impl/Packing.h>
#include <ATen/native/vulkan/ops/Common.h>
#include <ATen/native/vulkan/ops/Utils.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/cat.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/narrow.h>
#include <ATen/ops/zeros.h>
#endif

namespace at {
namespace native {
namespace vulkan {
namespace ops {

namespace utils {

using namespace api::utils;

/*
 * This function formats an input tensor in NCHW layout to NC4HW layout such
 * that the buffer of the formatted tensor can be directly copied into a GPU
 * texture. Conceptually, the formatting can be achieved via the following
 * steps:
 *
 * 1. Given that the src tensor has size {N,C,H,W}
 *
 * 2. Determine the amount of padding to add: how many channels are needed to
 *    align C to the next multiple of 4, denoted C_aligned
 *
 * 3. Pad the channel dim of the tensor so that each batch's channel count is a
 *    multiple of four; the shape of the tensor is now {N, C_aligned, H, W}
 *
 * 4. Combine the batch and channel dims by reshaping to {N*C_aligned, H, W};
 *    denote the combined size N*C_aligned as NC_aligned
 *
 * 5. Split the batch-channel dimension into groups of 4 by reshaping the tensor
 *    to size {NC_aligned/4, 4, H, W}
 *
 * 6. The groups of 4 channels (dim 1) should be contiguous. Therefore, permute
 *    the dims of the tensor in the order {0, 2, 3, 1}
 *
 * 7. Finally, return a contiguous version of the tensor. The final shape of the
 *    tensor would be {NC_aligned/4, H, W, 4}
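 *
 * For example, a src tensor of size {1, 3, 2, 2} is padded to {1, 4, 2, 2}
 * and returned as a contiguous tensor of size {1, 2, 2, 4}, where each group
 * of 4 (padded) channels is laid out contiguously per spatial location.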
 */
Tensor nchw_to_nc4hw(const Tensor& src) {
  uint32_t N = get_dim<Dim4D::Batch>(src.sizes());
  uint32_t C = get_dim<Dim4D::Channel>(src.sizes());
  uint32_t H = get_dim<Dim4D::Height>(src.sizes());
  uint32_t W = get_dim<Dim4D::Width>(src.sizes());

  uint32_t C_aligned = api::utils::align_up(C, 4u);
  uint32_t NC4 = (N * C_aligned) / 4;

  // Add padding to the tensor so that the channel dim is a multiple of 4
  Tensor padding = at::zeros({N, C_aligned - C, H, W}, src.options());
  Tensor src_padded = at::cat({src.reshape({N, C, H, W}), padding}, 1);
  // Reshape to split the channels into groups of 4, then permute so that each
  // group of 4 channels is contiguous
  Tensor src_NC4HW = src_padded.reshape({NC4, 4, H, W}).permute({0, 2, 3, 1});

  // Return a contiguous version of the tensor
  return src_NC4HW.contiguous();
}

/*
 * Creates a staging tensor into which texture data, which will be in NC4HW
 * format, can be copied directly. The shape of the staging tensor will be the
 * same as the tensor produced by a call to nchw_to_nc4hw().
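 *
 * For example, a vTensor of size {1, 5, 4, 4} produces a staging tensor of
 * size {2, 4, 4, 4}, since div_up(5, 4) = 2 texels are needed to hold 5
 * channels at each spatial location.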
 */
Tensor create_staging_tensor(const vTensor& v_in) {
  uint32_t N = get_dim<Dim4D::Batch>(v_in.sizes());
  uint32_t C = get_dim<Dim4D::Channel>(v_in.sizes());
  uint32_t H = get_dim<Dim4D::Height>(v_in.sizes());
  uint32_t W = get_dim<Dim4D::Width>(v_in.sizes());

  uint32_t NC4 = N * api::utils::div_up(C, 4u);

  // Note that the dtype corresponding with the texture format of the vTensor is
  // used instead of options().dtype(). This is to ensure the number of bytes in
  // the staging tensor matches the number of bytes in the image texture. Refer
  // to comments for api::vk_format()
  return at::empty(
      {NC4, H, W, 4},
      at::device(at::kCPU).dtype(convert_dtype(v_in.texture_dtype())));
}

/*
 * After copying texture data, which will be in NC4HW format, to a staging
 * tensor created by create_staging_tensor(), this function reformats the
 * tensor to NCHW format. It essentially reverses the transformations made by
 * nchw_to_nc4hw().
 *
 * Note that the sizes of the original tensor must be passed in to fully restore
 * the properties of the original tensor.
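 *
 * For example, a staging tensor of size {2, 4, 4, 4} holding the data of an
 * original tensor of size {1, 5, 4, 4} is permuted and reshaped back to
 * {1, 8, 4, 4}, narrowed to {1, 5, 4, 4}, and returned contiguous.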
 */
Tensor nc4hw_to_nchw(const Tensor& t_in, IntArrayRef sizes) {
  uint32_t N = get_dim<Dim4D::Batch>(sizes);
  uint32_t C = get_dim<Dim4D::Channel>(sizes);
  uint32_t H = get_dim<Dim4D::Height>(sizes);
  uint32_t W = get_dim<Dim4D::Width>(sizes);

  uint32_t C_aligned = api::utils::align_up(C, 4u);

  // Undo the permute step and channel grouping step
  Tensor t_in_padded = t_in.permute({0, 3, 1, 2}).reshape({N, C_aligned, H, W});
  // Remove the padding channels
  Tensor t_in_shaved =
      at::narrow(t_in_padded, /*dim=*/1, /*start=*/0, /*length=*/C);

  // Reshape to original sizing and dtype and return a contiguous Tensor
  return t_in_shaved.reshape(sizes).contiguous();
}

void copy_buffer_to_vtensor(
    api::VulkanBuffer& src_buffer,
    vTensor& v_dst,
    api::PipelineBarrier& pipeline_barrier) {
  api::Context* const context = api::context();

  TORCH_CHECK(
      src_buffer.mem_size() == v_dst.gpu_nbytes(),
      "Vulkan copy_buffer_to_vtensor: source buffer and destination texture "
      "do not have the same number of bytes");

  context->submit_copy<api::VulkanBuffer, api::VulkanImage>(
      // pipeline barrier
      pipeline_barrier,
      // resources
      src_buffer,
      v_dst.image(
          pipeline_barrier,
          api::PipelineStage::TRANSFER,
          api::MemoryAccessType::WRITE),
      // copy details
      v_dst.extents(),
      {0u, 0u, 0u},
      {0u, 0u, 0u},
      // fence handle
      VK_NULL_HANDLE);
}

void copy_buffer_to_buffer(
    api::Context* const context,
    api::StorageBuffer& src,
    api::StorageBuffer& dst,
    VkFence fence_handle) {
  api::PipelineBarrier pipeline_barrier{};

  context->submit_copy<api::VulkanBuffer, api::VulkanBuffer>(
      // pipeline barrier
      pipeline_barrier,
      // resources
      src.buffer(),
      dst.buffer(),
      // copy details
      {static_cast<uint32_t>(src.buffer().mem_size()), 0u, 0u},
      {0u, 0u, 0u},
      {0u, 0u, 0u},
      // fence handle
      fence_handle);
}

void copy_vtensor_to_buffer(
    vTensor& v_src,
    api::VulkanBuffer& dst_buffer,
    api::PipelineBarrier& pipeline_barrier,
    const VkFence fence_handle) {
  api::Context* const context = api::context();

  TORCH_CHECK(
      v_src.gpu_nbytes() == dst_buffer.mem_size(),
      "Vulkan copy_vtensor_to_buffer: source texture and destination buffer "
      "do not have the same number of bytes");

  context->submit_copy<api::VulkanImage, api::VulkanBuffer>(
      // pipeline barrier
      pipeline_barrier,
      // resources
      v_src.image(
          pipeline_barrier,
          api::PipelineStage::TRANSFER,
          api::MemoryAccessType::READ),
      dst_buffer,
      // copy details
      v_src.extents(),
      {0u, 0u, 0u},
      {0u, 0u, 0u},
      // fence handle
      fence_handle);
}

void pack_buffer_to_vtensor(
    api::VulkanBuffer& buffer,
    vTensor& v_self,
    api::PipelineBarrier& pipeline_barrier) {
  api::Context* const context = api::context();

  if (v_self.storage_type() == api::StorageType::BUFFER) {
    packing::record_nchw_to_buffer_op(
        context, buffer, v_self, pipeline_barrier, VK_NULL_HANDLE);
  } else {
    api::ShaderInfo compute_shader = packing::get_nchw_to_image_shader(v_self);
    packing::record_nchw_to_image_op(
        context,
        compute_shader,
        buffer,
        v_self,
        pipeline_barrier,
        VK_NULL_HANDLE);
  }
}

void pack_staging_to_vtensor(api::VulkanBuffer& staging, vTensor& v_self) {
  api::PipelineBarrier pipeline_barrier{};
  pack_buffer_to_vtensor(staging, v_self, pipeline_barrier);
}

bool pack_vtensor_to_staging(
    vTensor& v_self,
    api::VulkanBuffer& staging,
    const VkFence fence_handle) {
  api::Context* const context = api::context();
  api::PipelineBarrier pipeline_barrier{};

  if (v_self.storage_type() == api::StorageType::BUFFER) {
    return packing::record_buffer_to_nchw_op(
        context, v_self, staging, pipeline_barrier, fence_handle);
  } else {
    api::ShaderInfo compute_shader = packing::get_image_to_nchw_shader(v_self);
    return packing::record_image_to_nchw_op(
        context,
        compute_shader,
        v_self,
        staging,
        pipeline_barrier,
        fence_handle);
  }
}

/*
 * Broadcasting Utils
 */

// check that two tensors are broadcastable; throws if they are not
void is_broadcastable(const Tensor& input1, const Tensor& input2) {
  TORCH_CHECK(
      input1.dim() <= 4 && input2.dim() <= 4,
      "Vulkan only supports tensors <= 4 dimensions");

  // check if the shapes of input tensors are broadcastable
  // see https://pytorch.org/docs/stable/notes/broadcasting.html
  // for broadcasting semantics
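  // e.g. sizes {5, 1, 4, 1} and {3, 1, 1} are broadcastable, while sizes
  // {2, 3} and {4} are not, since the trailing dims 3 and 4 differ and
  // neither is 1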
  const std::string broadcast_error_msg = "Tensors are not broadcastable!";

  if (get_dim<Dim4D::Batch>(input1) != get_dim<Dim4D::Batch>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Batch>(input1) == 1 ||
            get_dim<Dim4D::Batch>(input2) == 1,
        broadcast_error_msg);
  }
  if (get_dim<Dim4D::Channel>(input1) != get_dim<Dim4D::Channel>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Channel>(input1) == 1 ||
            get_dim<Dim4D::Channel>(input2) == 1,
        broadcast_error_msg);
  }
  if (get_dim<Dim4D::Height>(input1) != get_dim<Dim4D::Height>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Height>(input1) == 1 ||
            get_dim<Dim4D::Height>(input2) == 1,
        broadcast_error_msg);
  }
  if (get_dim<Dim4D::Width>(input1) != get_dim<Dim4D::Width>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Width>(input1) == 1 ||
            get_dim<Dim4D::Width>(input2) == 1,
        broadcast_error_msg);
  }
}

// compute the output shape by broadcasting the shapes of t1 and t2
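// e.g. broadcasting sizes {2, 1, 5, 1} and {3, 1, 4} produces {2, 3, 5, 4}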
std::vector<int64_t> broadcast_size(const Tensor& t1, const Tensor& t2) {
  int64_t t1_size = t1.dim();
  int64_t t2_size = t2.dim();

  std::vector<int64_t> out;
  if (t1_size > t2_size) {
    for (int64_t i = 0; i < t1_size; i++) {
      out.push_back(t1.sizes()[i]);
    }
  } else {
    for (int64_t i = 0; i < t2_size; i++) {
      out.push_back(t2.sizes()[i]);
    }
  }

  if (!out.empty()) {
    out[out.size() - 1] =
        std::max(get_dim<Dim4D::Width>(t1), get_dim<Dim4D::Width>(t2));
  }
  if (out.size() > 1) {
    out[out.size() - 2] =
        std::max(get_dim<Dim4D::Height>(t1), get_dim<Dim4D::Height>(t2));
  }
  if (out.size() > 2) {
    out[out.size() - 3] =
        std::max(get_dim<Dim4D::Channel>(t1), get_dim<Dim4D::Channel>(t2));
  }
  if (out.size() > 3) {
    out[out.size() - 4] =
        std::max(get_dim<Dim4D::Batch>(t1), get_dim<Dim4D::Batch>(t2));
  }

  return out;
}

api::utils::vec4 extract_texel(const Tensor& input, const ivec3& pos) {
  api::Context* const context = api::context();

  TORCH_CHECK(input.is_vulkan());
  const vTensor& v_input = convert(input);

  api::PipelineBarrier pipeline_barrier{};

  std::vector<int64_t> output_size{1, 1, 1};

  // Write each component (x, y, z, w) of the texel at `pos` into its own
  // single-element output tensor. Only element (0, 0, 0) of each output is
  // read back, which keeps the result independent of how the output tensors
  // themselves are packed.
  api::ScalarType dtype = convert_dtype(input.scalar_type());
  vTensor v_outputs_x{context, output_size, dtype};
  vTensor v_outputs_y{context, output_size, dtype};
  vTensor v_outputs_z{context, output_size, dtype};
  vTensor v_outputs_w{context, output_size, dtype};

  const struct Block final {
    ivec3 pos;
  } block{
      pos,
  };

  api::UniformParamsBuffer params(context, block);

  context->submit_compute_job(
      VK_KERNEL(extract_texel),
      pipeline_barrier,
      {1, 1, 1},
      {1, 1, 1},
      VK_NULL_HANDLE,
      v_outputs_x.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_outputs_y.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_outputs_z.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_outputs_w.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_input.image(pipeline_barrier, api::PipelineStage::COMPUTE),
      params.buffer());

  vec4 rv = {
      convert(v_outputs_x).cpu().data_ptr<float>()[0],
      convert(v_outputs_y).cpu().data_ptr<float>()[0],
      convert(v_outputs_z).cpu().data_ptr<float>()[0],
      convert(v_outputs_w).cpu().data_ptr<float>()[0],
  };

  return rv;
}

} // namespace utils
} // namespace ops
} // namespace vulkan
} // namespace native
} // namespace at