xref: /aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/deconvolution.c (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #include <assert.h>
10 #include <math.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <stdint.h>
14 #include <string.h>
15 
16 #include <pytorch_qnnpack.h>
17 #include <qnnpack/indirection.h>
18 #include <qnnpack/log.h>
19 #include <qnnpack/math.h>
20 #include <qnnpack/operator.h>
21 #include <qnnpack/pack.h>
22 #include <qnnpack/params.h>
23 #include <qnnpack/requantization.h>
24 
/*
 * Computes the output extent of a deconvolution along a single axis:
 *
 *   stride * (input - 1) + adjustment + ((kernel - 1) * dilation + 1) - padding
 *
 * input_padding_dimension is subtracted as given, so the caller passes the
 * total padding for the axis. All arithmetic is done in size_t and therefore
 * wraps modulo SIZE_MAX + 1 rather than going negative.
 */
static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t input_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension) {
  /* Footprint of the kernel once dilation gaps are inserted between taps. */
  const size_t dilated_kernel_extent =
      dilation_dimension * (kernel_dimension - 1) + 1;
  /* Offset of the last input element after spreading inputs by the stride. */
  const size_t last_input_offset = (input_dimension - 1) * stride_dimension;

  const size_t unpadded_extent =
      last_input_offset + adjustment_dimension + dilated_kernel_extent;
  return unpadded_extent - input_padding_dimension;
}
37 
/*
 * Creates a quantized (quint8) 2D deconvolution operator.
 *
 * Validates the arguments, allocates the operator structure, packs the
 * weights and bias for every group into a freshly allocated buffer, builds a
 * buffer filled with input_zero_point, and records the geometry and
 * quantization parameters on the operator.
 *
 * Parameters:
 *   input_padding_height/width  stored on the operator unchanged.
 *   adjustment_height/width     stored on the operator unchanged.
 *   kernel_height/width         must be non-zero.
 *   stride_height/width         must be non-zero.
 *   dilation_height/width       must be non-zero.
 *   groups                      number of groups; weights are packed per group.
 *   group_input_channels        input channels per group.
 *   group_output_channels       output channels per group.
 *   input_zero_point            fill value of the zero buffer; also passed to
 *                               the packer when
 *                               PYTORCH_QNNPACK_RUNTIME_QUANTIZATION is off.
 *   kernel_zero_points          element [0] is always read (padding fill value
 *                               and operator kernel_zero_point); the pointer
 *                               is also forwarded to
 *                               pytorch_qnnp_compute_conv_quantization_params.
 *   kernel                      weights; group g starts at offset
 *                               g * group_output_channels * kernel_size *
 *                               group_input_channels.
 *   bias                        group g starts at g * group_output_channels.
 *   output_zero_point/min/max   forwarded to the quantization params helper.
 *   flags                       not read by this function.
 *   requantization_scales       groups * group_output_channels entries, each
 *                               of which must be positive and normal.
 *   deconvolution_out           receives the new operator on success; it is
 *                               not written on failure.
 *
 * Returns:
 *   pytorch_qnnp_status_success                on success.
 *   pytorch_qnnp_status_uninitialized          if QNNPACK is not initialized.
 *   pytorch_qnnp_status_invalid_parameter      for a zero kernel, stride or
 *                                              dilation dimension.
 *   pytorch_qnnp_status_unsupported_parameter  for an invalid requantization
 *                                              scale.
 *   pytorch_qnnp_status_out_of_memory          if any allocation fails.
 *
 * On every failure path the (possibly NULL, possibly partially built)
 * operator is handed to pytorch_qnnp_delete_operator.
 */
enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
    uint32_t input_padding_height,
    uint32_t input_padding_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    uint8_t input_zero_point,
    const uint8_t* kernel_zero_points,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    const float* requantization_scales,
    pytorch_qnnp_operator_t* deconvolution_out) {
  pytorch_qnnp_operator_t deconvolution = NULL;
  /* `status` always holds the code to return if the next check fails. */
  enum pytorch_qnnp_status status = pytorch_qnnp_status_uninitialized;

  if (!pytorch_qnnp_params.initialized) {
    pytorch_qnnp_log_error(
        "pytorch_qnnp_create_deconvolution2d_nhwc_q8 failed because QNNPACK is not properly initialized");
    goto error;
  }

  status = pytorch_qnnp_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    pytorch_qnnp_log_error(
        "failed to create deconvolution with %" PRIu32 "x%" PRIu32
        " kernel: kernel dimensions must be non-zero",
        kernel_width,
        kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    pytorch_qnnp_log_error(
        "failed to create deconvolution with %" PRIu32 "x%" PRIu32
        " stride: "
        "stride dimensions must be non-zero",
        stride_width,
        stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    pytorch_qnnp_log_error(
        "failed to create deconvolution with %" PRIu32 "x%" PRIu32
        " dilation: "
        "dilation dimensions must be non-zero",
        dilation_width,
        dilation_height);
    goto error;
  }

  status = pytorch_qnnp_status_unsupported_parameter;

  /*
   * The channel count is a size_t product, so the index must be size_t as
   * well: an int index would be compared after conversion to unsigned and
   * would overflow for counts above INT_MAX.
   */
  const size_t total_output_channels = groups * group_output_channels;
  for (size_t i = 0; i < total_output_channels; i++) {
    if (requantization_scales[i] <= 0.0f ||
        !isnormal(requantization_scales[i])) {
      pytorch_qnnp_log_error(
          "failed to create deconvolution operator with %.7g requantization scale for "
          "channel %zu scale must be finite and positive",
          requantization_scales[i], i);
      goto error;
    }
  }

  status = pytorch_qnnp_status_out_of_memory;

  /* calloc leaves every field zero/NULL, so a partially built operator can
   * be handed to pytorch_qnnp_delete_operator on the error path. */
  deconvolution = calloc(1, sizeof(struct pytorch_qnnp_operator));
  if (deconvolution == NULL) {
    pytorch_qnnp_log_error(
        "failed to allocate %zu bytes for pytorch_qnnp_operator structure",
        sizeof(struct pytorch_qnnp_operator));
    goto error;
  }

  const uint32_t nr = pytorch_qnnp_params.q8conv.nr;
  const uint32_t kr = pytorch_qnnp_params.q8conv.kr;

  /* Round channel counts up to multiples of nr / kr. The `& -x` mask is only
   * a round-up when x is a power of two.
   * NOTE(review): assumes nr and kr are powers of two - confirm. */
  const uint32_t n_stride = (group_output_channels + (nr - 1)) & -nr;
  const uint32_t k_stride = (group_input_channels + (kr - 1)) & -kr;
  const uint32_t kernel_size = kernel_height * kernel_width;
  /* Per padded output channel: kernel_size * k_stride weight bytes plus one
   * int32_t. */
  const size_t packed_group_weights_size =
      (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
  deconvolution->packed_weights = malloc(packed_group_weights_size * groups);
  if (deconvolution->packed_weights == NULL) {
    pytorch_qnnp_log_error(
        "failed to allocate %zu bytes for packed weights",
        packed_group_weights_size * groups);
    goto error;
  }
  /* Pre-fill with the first kernel zero point so that any bytes the packer
   * does not write hold that value. */
  memset(
      deconvolution->packed_weights,
      kernel_zero_points[0],
      packed_group_weights_size * groups);

  for (uint32_t group = 0; group < groups; group++) {
    pytorch_pack_q8deconv_w(
        group_output_channels,
        kernel_size,
        group_input_channels,
        nr,
        kr,
#if !PYTORCH_QNNPACK_RUNTIME_QUANTIZATION
        input_zero_point,
        kernel_zero_points[0],
#endif
        kernel +
            group * group_output_channels * kernel_size * group_input_channels,
        bias + group * group_output_channels,
#if PYTORCH_QNNPACK_RUNTIME_QUANTIZATION
        kernel_zero_points + group * group_output_channels,
#endif
        (void*)((uintptr_t)deconvolution->packed_weights + group * packed_group_weights_size));
  }

  /*
   * Buffer of k_stride bytes holding input_zero_point. With fewer than 8
   * input channels per group, 8 extra bytes are allocated in front and
   * zero_pointer is advanced past them.
   * NOTE(review): presumably this lets kernels read up to 8 bytes before
   * zero_pointer - confirm against the micro-kernels.
   */
  size_t zero_size = sizeof(uint8_t) * k_stride;
  size_t zero_offset = 0;
  if (group_input_channels < 8) {
    zero_size += 8;
    zero_offset = 8;
  }

  void* zero_buffer = malloc(zero_size);
  if (zero_buffer == NULL) {
    pytorch_qnnp_log_error(
        "failed to allocate %zu bytes for zero padding", zero_size);
    goto error;
  }
  memset(zero_buffer, input_zero_point, zero_size);
  deconvolution->zero_buffer = zero_buffer;
  deconvolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);

  deconvolution->input_padding_height = input_padding_height;
  deconvolution->input_padding_width = input_padding_width;
  deconvolution->adjustment_height = adjustment_height;
  deconvolution->adjustment_width = adjustment_width;

  deconvolution->kernel_height = kernel_height;
  deconvolution->kernel_width = kernel_width;
  deconvolution->stride_height = stride_height;
  deconvolution->stride_width = stride_width;
  deconvolution->dilation_height = dilation_height;
  deconvolution->dilation_width = dilation_width;
  deconvolution->groups = groups;
  deconvolution->group_input_channels = group_input_channels;
  deconvolution->group_output_channels = group_output_channels;

  deconvolution->kernel_zero_point = kernel_zero_points[0];

  deconvolution->conv_quantization_params =
      pytorch_qnnp_compute_conv_quantization_params(
          input_zero_point,
          kernel_zero_points,
          requantization_scales,
          output_zero_point,
          output_min,
          output_max);

  deconvolution->ukernel_type = pytorch_qnnp_ukernel_type_conv;
  deconvolution->format = pytorch_qnnp_format_quint8;
  deconvolution->transpose = true;

  *deconvolution_out = deconvolution;
  return pytorch_qnnp_status_success;

error:
  pytorch_qnnp_delete_operator(deconvolution);
  return status;
}
219 
/*
 * Binds input/output tensors and a batch shape to an existing deconvolution
 * operator, then (re)builds its indirection buffer.
 *
 * A batch_size of 0 only records the empty batch on the operator and
 * succeeds. Otherwise the input dimensions must be non-zero; the output
 * height and width are derived with compute_output_dimension (using twice
 * the stored padding) and written to the operator, and the indirection
 * buffer is resized with realloc before pytorch_qnnp_indirection_init_deconv2d
 * is called.
 *
 * The threadpool argument is not read by this function.
 *
 * Returns pytorch_qnnp_status_uninitialized if QNNPACK is not initialized,
 * pytorch_qnnp_status_invalid_parameter for a zero input dimension,
 * pytorch_qnnp_status_out_of_memory if the indirection buffer cannot be
 * resized (the previous buffer stays attached to the operator), and
 * pytorch_qnnp_status_success otherwise.
 */
enum pytorch_qnnp_status pytorch_qnnp_setup_deconvolution2d_nhwc_q8(
    pytorch_qnnp_operator_t deconvolution,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    size_t input_pixel_stride,
    uint8_t* output,
    size_t output_pixel_stride,
    pthreadpool_t threadpool) {
  if (!pytorch_qnnp_params.initialized) {
    pytorch_qnnp_log_error(
        "pytorch_qnnp_setup_deconvolution2d_nhwc_q8 failed because QNNPACK is not properly initialized");
    return pytorch_qnnp_status_uninitialized;
  }

  /* Empty batch: nothing to set up beyond recording it. */
  if (batch_size == 0) {
    deconvolution->batch_size = 0;
    return pytorch_qnnp_status_success;
  }

  if (input_height == 0 || input_width == 0) {
    pytorch_qnnp_log_error(
        "failed to setup deconvolution with %zux%zu input: input dimensions must be non-zero",
        input_width,
        input_height);
    return pytorch_qnnp_status_invalid_parameter;
  }

  /* Record the caller-supplied tensors and shape. */
  deconvolution->batch_size = batch_size;
  deconvolution->input_height = input_height;
  deconvolution->input_width = input_width;
  deconvolution->input = input;
  deconvolution->input_pixel_stride = input_pixel_stride;
  deconvolution->output = output;
  deconvolution->output_pixel_stride = output_pixel_stride;

  const size_t kernel_rows = deconvolution->kernel_height;
  const size_t kernel_cols = deconvolution->kernel_width;
  const size_t kernel_taps = kernel_rows * kernel_cols;

  /* Derive the output shape; padding is stored per side, hence the "* 2". */
  deconvolution->output_height = compute_output_dimension(
      input_height,
      deconvolution->input_padding_height * 2,
      deconvolution->adjustment_height,
      kernel_rows,
      deconvolution->dilation_height,
      deconvolution->stride_height);
  /* Read the values back from the operator so the locals hold exactly what
   * was stored there. */
  const size_t output_rows = deconvolution->output_height;

  deconvolution->output_width = compute_output_dimension(
      input_width,
      deconvolution->input_padding_width * 2,
      deconvolution->adjustment_width,
      kernel_cols,
      deconvolution->dilation_width,
      deconvolution->stride_width);
  const size_t output_cols = deconvolution->output_width;

  /* One pointer per (batch, group, tiled output pixel, kernel tap); the
   * pixel count is rounded up to a multiple of the mr tile size. */
  const size_t tile = pytorch_qnnp_params.q8conv.mr;
  const size_t output_pixels = output_rows * output_cols;
  const size_t tiled_pixels = round_up(output_pixels, tile);
  const size_t group_count = deconvolution->groups;
  const size_t indirection_bytes =
      sizeof(void*) * batch_size * group_count * tiled_pixels * kernel_taps;

  /* Resize through a temporary so the old buffer is not lost on failure. */
  const void** resized = (const void**)realloc(
      deconvolution->indirection_buffer, indirection_bytes);
  if (resized == NULL) {
    pytorch_qnnp_log_error(
        "failed to allocate %zu bytes for indirection buffer",
        indirection_bytes);
    return pytorch_qnnp_status_out_of_memory;
  }
  deconvolution->indirection_buffer = resized;

  pytorch_qnnp_indirection_init_deconv2d(deconvolution, tile, tiled_pixels);

  return pytorch_qnnp_status_success;
}
301