1 /*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 #include <assert.h>
10 #include <math.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <stdint.h>
14 #include <string.h>
15
16 #include <pytorch_qnnpack.h>
17 #include <qnnpack/indirection.h>
18 #include <qnnpack/log.h>
19 #include <qnnpack/math.h>
20 #include <qnnpack/operator.h>
21 #include <qnnpack/pack.h>
22 #include <qnnpack/params.h>
23 #include <qnnpack/requantization.h>
24
/*
 * Computes the output extent of a transposed convolution along one axis:
 *
 *   stride * (input - 1) + adjustment + dilated_kernel - padding
 *
 * where dilated_kernel = (kernel - 1) * dilation + 1.
 *
 * input_padding_dimension is subtracted once, so it is the total padding
 * for the axis (callers in this file pass twice the per-side padding).
 * All arithmetic is unsigned size_t arithmetic; no range checks are made.
 */
static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t input_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension) {
  /* Span covered by one kernel once dilation gaps are accounted for. */
  const size_t dilated_kernel_extent =
      dilation_dimension * (kernel_dimension - 1) + 1;
  /* Extent before the padding is removed. */
  const size_t unpadded_extent = (input_dimension - 1) * stride_dimension +
      adjustment_dimension + dilated_kernel_extent;
  return unpadded_extent - input_padding_dimension;
}
37
/*
 * Creates a quantized (quint8) 2D deconvolution operator.
 *
 * Validates the parameters, allocates the operator structure, packs the
 * weights and bias of every group into a single buffer, allocates a buffer
 * filled with input_zero_point, and records the geometry and quantization
 * parameters in the operator.
 *
 * Parameters:
 *   input_padding_height/width  padding values stored in the operator.
 *   adjustment_height/width     output-size adjustment stored in the operator.
 *   kernel_height/width         kernel size; both must be non-zero.
 *   stride_height/width         strides; both must be non-zero.
 *   dilation_height/width       dilations; both must be non-zero.
 *   groups                      number of groups.
 *   group_input_channels        input channels per group.
 *   group_output_channels       output channels per group.
 *   input_zero_point            zero point used to fill the zero buffer.
 *   kernel_zero_points          kernel zero points; element 0 is used as the
 *                               fill value for the packed-weights buffer and
 *                               stored as the operator's kernel_zero_point.
 *   kernel, bias                weights and bias, indexed per group.
 *   output_zero_point, output_min, output_max
 *                               forwarded to the quantization-params helper.
 *   flags                       not used by this function.
 *   requantization_scales       groups * group_output_channels entries are
 *                               read; each must be positive and normal.
 *   deconvolution_out           receives the new operator on success only.
 *
 * Returns:
 *   pytorch_qnnp_status_success               on success;
 *   pytorch_qnnp_status_uninitialized         if QNNPACK is not initialized;
 *   pytorch_qnnp_status_invalid_parameter     for a zero kernel, stride or
 *                                             dilation dimension;
 *   pytorch_qnnp_status_unsupported_parameter for a rejected scale;
 *   pytorch_qnnp_status_out_of_memory         if an allocation fails.
 *
 * On every failure path the partially built operator is handed to
 * pytorch_qnnp_delete_operator() (possibly as NULL).
 */
enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
    uint32_t input_padding_height,
    uint32_t input_padding_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    uint8_t input_zero_point,
    const uint8_t* kernel_zero_points,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    const float* requantization_scales,
    pytorch_qnnp_operator_t* deconvolution_out) {
  pytorch_qnnp_operator_t deconvolution = NULL;
  enum pytorch_qnnp_status status = pytorch_qnnp_status_uninitialized;

  if (!pytorch_qnnp_params.initialized) {
    pytorch_qnnp_log_error(
        "pytorch_qnnp_create_deconvolution2d_nhwc_q8 failed because QNNPACK is not properly initialized");
    goto error;
  }

  status = pytorch_qnnp_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    pytorch_qnnp_log_error(
        "failed to create deconvolution with %" PRIu32 "x%" PRIu32
        " kernel: kernel dimensions must be non-zero",
        kernel_width,
        kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    pytorch_qnnp_log_error(
        "failed to create deconvolution with %" PRIu32 "x%" PRIu32
        " stride: "
        "stride dimensions must be non-zero",
        stride_width,
        stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    pytorch_qnnp_log_error(
        "failed to create deconvolution with %" PRIu32 "x%" PRIu32
        " dilation: "
        "dilation dimensions must be non-zero",
        dilation_width,
        dilation_height);
    goto error;
  }

  status = pytorch_qnnp_status_unsupported_parameter;

  /*
   * The channel count is a size_t product, so the index must be size_t too:
   * a signed int index would be compared after conversion to unsigned and
   * would overflow for counts above INT_MAX.
   */
  const size_t output_channels = groups * group_output_channels;
  for (size_t i = 0; i < output_channels; i++) {
    if (requantization_scales[i] <= 0.0f ||
        !isnormal(requantization_scales[i])) {
      pytorch_qnnp_log_error(
          "failed to create deconvolution operator with %.7g requantization scale for "
          "channel %zu scale must be finite and positive",
          requantization_scales[i], i);
      goto error;
    }
  }

  status = pytorch_qnnp_status_out_of_memory;

  deconvolution = calloc(1, sizeof(struct pytorch_qnnp_operator));
  if (deconvolution == NULL) {
    pytorch_qnnp_log_error(
        "failed to allocate %zu bytes for pytorch_qnnp_operator structure",
        sizeof(struct pytorch_qnnp_operator));
    goto error;
  }

  const uint32_t nr = pytorch_qnnp_params.q8conv.nr;
  const uint32_t kr = pytorch_qnnp_params.q8conv.kr;

  /* Round the channel counts up to multiples of nr / kr (mask with -nr / -kr). */
  const uint32_t n_stride = (group_output_channels + (nr - 1)) & -nr;
  const uint32_t k_stride = (group_input_channels + (kr - 1)) & -kr;
  const uint32_t kernel_size = kernel_height * kernel_width;
  /* Per output channel: kernel_size * k_stride weight bytes plus one int32. */
  const size_t packed_group_weights_size =
      (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
  deconvolution->packed_weights = malloc(packed_group_weights_size * groups);
  if (deconvolution->packed_weights == NULL) {
    pytorch_qnnp_log_error(
        "failed to allocate %zu bytes for packed weights",
        packed_group_weights_size * groups);
    goto error;
  }
  /* Pre-fill so bytes the packer does not write hold the kernel zero point. */
  memset(
      deconvolution->packed_weights,
      kernel_zero_points[0],
      packed_group_weights_size * groups);

  for (uint32_t group = 0; group < groups; group++) {
    pytorch_pack_q8deconv_w(
        group_output_channels,
        kernel_size,
        group_input_channels,
        nr,
        kr,
#if !PYTORCH_QNNPACK_RUNTIME_QUANTIZATION
        input_zero_point,
        kernel_zero_points[0],
#endif
        kernel +
            group * group_output_channels * kernel_size * group_input_channels,
        bias + group * group_output_channels,
#if PYTORCH_QNNPACK_RUNTIME_QUANTIZATION
        kernel_zero_points + group * group_output_channels,
#endif
        (void*)((uintptr_t)deconvolution->packed_weights + group * packed_group_weights_size));
  }

  /*
   * Buffer of input_zero_point bytes. For fewer than 8 input channels it is
   * 8 bytes larger and zero_pointer is placed 8 bytes into it.
   * NOTE(review): presumably so consumers may read up to 8 bytes before
   * zero_pointer for narrow inputs - confirm against the micro-kernels.
   */
  size_t zero_size = sizeof(uint8_t) * k_stride;
  size_t zero_offset = 0;
  if (group_input_channels < 8) {
    zero_size += 8;
    zero_offset = 8;
  }

  void* zero_buffer = malloc(zero_size);
  if (zero_buffer == NULL) {
    pytorch_qnnp_log_error(
        "failed to allocate %zu bytes for zero padding", zero_size);
    goto error;
  }
  memset(zero_buffer, input_zero_point, zero_size);
  deconvolution->zero_buffer = zero_buffer;
  deconvolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);

  deconvolution->input_padding_height = input_padding_height;
  deconvolution->input_padding_width = input_padding_width;
  deconvolution->adjustment_height = adjustment_height;
  deconvolution->adjustment_width = adjustment_width;

  deconvolution->kernel_height = kernel_height;
  deconvolution->kernel_width = kernel_width;
  deconvolution->stride_height = stride_height;
  deconvolution->stride_width = stride_width;
  deconvolution->dilation_height = dilation_height;
  deconvolution->dilation_width = dilation_width;
  deconvolution->groups = groups;
  deconvolution->group_input_channels = group_input_channels;
  deconvolution->group_output_channels = group_output_channels;

  deconvolution->kernel_zero_point = kernel_zero_points[0];

  deconvolution->conv_quantization_params =
      pytorch_qnnp_compute_conv_quantization_params(
          input_zero_point,
          kernel_zero_points,
          requantization_scales,
          output_zero_point,
          output_min,
          output_max);

  deconvolution->ukernel_type = pytorch_qnnp_ukernel_type_conv;
  deconvolution->format = pytorch_qnnp_format_quint8;
  deconvolution->transpose = true;

  *deconvolution_out = deconvolution;
  return pytorch_qnnp_status_success;

error:
  /* status holds the code of the validation/allocation stage that failed. */
  pytorch_qnnp_delete_operator(deconvolution);
  return status;
}
219
/*
 * Binds input/output pointers and the input shape to a deconvolution
 * operator, derives the output height/width, (re)allocates the indirection
 * buffer and initializes it via pytorch_qnnp_indirection_init_deconv2d().
 *
 * A batch_size of 0 only records the batch size and succeeds immediately.
 * The threadpool argument is not used by this function.
 *
 * Returns:
 *   pytorch_qnnp_status_success           on success;
 *   pytorch_qnnp_status_uninitialized     if QNNPACK is not initialized;
 *   pytorch_qnnp_status_invalid_parameter if input_width or input_height is 0;
 *   pytorch_qnnp_status_out_of_memory     if the indirection buffer cannot be
 *                                         (re)allocated; the operator keeps
 *                                         its previous buffer pointer.
 */
enum pytorch_qnnp_status pytorch_qnnp_setup_deconvolution2d_nhwc_q8(
    pytorch_qnnp_operator_t deconvolution,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    size_t input_pixel_stride,
    uint8_t* output,
    size_t output_pixel_stride,
    pthreadpool_t threadpool) {
  if (!pytorch_qnnp_params.initialized) {
    pytorch_qnnp_log_error(
        "pytorch_qnnp_setup_deconvolution2d_nhwc_q8 failed because QNNPACK is not properly initialized");
    return pytorch_qnnp_status_uninitialized;
  }

  /* Empty batch: nothing to set up beyond recording the size. */
  if (batch_size == 0) {
    deconvolution->batch_size = 0;
    return pytorch_qnnp_status_success;
  }

  if (input_height == 0 || input_width == 0) {
    pytorch_qnnp_log_error(
        "failed to setup deconvolution with %zux%zu input: input dimensions must be non-zero",
        input_width,
        input_height);
    return pytorch_qnnp_status_invalid_parameter;
  }

  /* Tensor bindings. */
  deconvolution->input = input;
  deconvolution->input_pixel_stride = input_pixel_stride;
  deconvolution->output = output;
  deconvolution->output_pixel_stride = output_pixel_stride;

  /* Shape of this invocation. */
  deconvolution->batch_size = batch_size;
  deconvolution->input_height = input_height;
  deconvolution->input_width = input_width;

  const size_t kernel_rows = deconvolution->kernel_height;
  const size_t kernel_cols = deconvolution->kernel_width;

  /*
   * Store each output extent in the operator first and read it back, so the
   * local always equals the value the operator actually holds.
   */
  deconvolution->output_height = compute_output_dimension(
      input_height,
      deconvolution->input_padding_height * 2,
      deconvolution->adjustment_height,
      kernel_rows,
      deconvolution->dilation_height,
      deconvolution->stride_height);
  const size_t out_rows = deconvolution->output_height;

  deconvolution->output_width = compute_output_dimension(
      input_width,
      deconvolution->input_padding_width * 2,
      deconvolution->adjustment_width,
      kernel_cols,
      deconvolution->dilation_width,
      deconvolution->stride_width);
  const size_t out_cols = deconvolution->output_width;

  /* Output pixels are processed in tiles of q8conv.mr; round the count up. */
  const size_t tile = pytorch_qnnp_params.q8conv.mr;
  const size_t padded_pixels = round_up(out_rows * out_cols, tile);

  /* One pointer per (batch, group, tiled output pixel, kernel element). */
  const size_t group_count = deconvolution->groups;
  const size_t kernel_elements = kernel_rows * kernel_cols;
  const size_t indirection_bytes = sizeof(void*) * batch_size * group_count *
      padded_pixels * kernel_elements;

  /* Keep the old pointer in the operator until realloc is known to succeed. */
  const void** resized = (const void**)realloc(
      deconvolution->indirection_buffer, indirection_bytes);
  if (resized == NULL) {
    pytorch_qnnp_log_error(
        "failed to allocate %zu bytes for indirection buffer",
        indirection_bytes);
    return pytorch_qnnp_status_out_of_memory;
  }
  deconvolution->indirection_buffer = resized;

  pytorch_qnnp_indirection_init_deconv2d(deconvolution, tile, padded_pixels);

  return pytorch_qnnp_status_success;
}
301