xref: /aosp_15_r20/external/XNNPACK/test/packing.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2022 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <numeric>
8 
9 #include <xnnpack/pack.h>
10 #include <xnnpack/aligned-allocator.h>
11 
12 #include <gtest/gtest.h>
13 #include <fp16.h>
14 
// QU8 depthwise packing, GHW kernel layout: a 3x1 kernel whose size equals the
// primary tile, with all channels fitting into a single block of cr channels.
TEST(PACK_QU8_DWCONV_GHW_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  std::vector<uint8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  // Every (rounded-up) channel holds one int32 bias plus primary_tile weights.
  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qu8_packing_params params = {};
  params.input_zero_point = 127;
  params.kernel_zero_point = 127;
  xnn_pack_qu8_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  // Constant term of the hand-computed biases listed below.
  const int32_t bias_offset = h * w * params.input_zero_point * params.kernel_zero_point;
  ASSERT_EQ(bias_offset, 48387);

  const std::vector<uint8_t> expected = {
    // Biases come first, each as 4 little-endian bytes.
    // channel 0: 48387 + 0 - (2 + 3 + 4) * 127 = 47244 = 0xB88C
    0x8C, 0xB8, 0, 0,
    // channel 1: 48387 + 1 - (5 + 6 + 7) * 127 = 46102 = 0xB416
    0x16, 0xB4, 0, 0,
    // Weights follow: one kernel tap per row, channels interleaved.
    2, 5,
    3, 6,
    4, 7,
  };
  ASSERT_EQ(expected, packed_weights);
}
60 
// QU8 depthwise packing, GHW kernel layout: a 3x1 kernel matching the primary
// tile, with 5 channels split into blocks of cr == 2 (last block is partial).
TEST(PACK_QU8_DWCONV_GHW_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // bias = [0, 1, 2, 3, 4]
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  // kernel, one row per channel:
  //    5,  6,  7,
  //    8,  9, 10,
  //   11, 12, 13,
  //   14, 15, 16,
  //   17, 18, 19
  std::vector<uint8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qu8_packing_params params = {};
  params.input_zero_point = 127;
  params.kernel_zero_point = 127;
  xnn_pack_qu8_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  // Constant term of the hand-computed biases listed below.
  const int32_t bias_offset = h * w * params.input_zero_point * params.kernel_zero_point;
  ASSERT_EQ(bias_offset, 48387);

  const std::vector<uint8_t> expected = {
    // Block 0 (channels 0 and 1): cr biases, then taps with channels interleaved.
    // 48387 + 0 - (5 + 6 + 7) * 127 = 46101 = 0xB415
    0x15, 0xB4, 0, 0,
    // 48387 + 1 - (8 + 9 + 10) * 127 = 44959 = 0xAF9F
    0x9F, 0xAF, 0, 0,
    5, 8, 6, 9, 7, 10,
    // Block 1 (channels 2 and 3).
    // 48387 + 2 - (11 + 12 + 13) * 127 = 43817 = 0xAB29
    0x29, 0xAB, 0, 0,
    // 48387 + 3 - (14 + 15 + 16) * 127 = 42675 = 0xA6B3
    0xB3, 0xA6, 0, 0,
    11, 14, 12, 15, 13, 16,
    // Block 2 (channel 4 only): the unused second lane is expected to be zero.
    // 48387 + 4 - (17 + 18 + 19) * 127 = 41533 = 0xA23D
    0x3D, 0xA2, 0, 0,
    0, 0, 0, 0,
    17, 0, 18, 0, 19, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
122 
// QU8 depthwise packing, GHW kernel layout: a 2x2 kernel packed into a primary
// tile of 9, so the tail of the tile is expected to be zero padding.
TEST(PACK_QU8_DWCONV_GHW_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  // kernel, one 2x2 filter per channel:
  //   channel 0: 2, 3,    channel 1: 6, 7,
  //              4, 5                8, 9
  std::vector<uint8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qu8_packing_params params = {};
  params.input_zero_point = 127;
  params.kernel_zero_point = 127;
  xnn_pack_qu8_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  // Constant term of the hand-computed biases listed below.
  const int32_t bias_offset = h * w * params.input_zero_point * params.kernel_zero_point;
  ASSERT_EQ(bias_offset, 64516);

  const std::vector<uint8_t> expected = {
    // cr == 2 biases first, each as 4 little-endian bytes.
    // 64516 + 0 - (2 + 3 + 4 + 5) * 127 = 62738 = 0xF512
    0x12, 0xF5, 0, 0,
    // 64516 + 1 - (6 + 7 + 8 + 9) * 127 = 60707 = 0xED23
    0x23, 0xED, 0, 0,
    // Weights with channels interleaved, starting at the top-left tap...
    2, 6,
    // ...then walking down each column before moving to the next one.
    4, 8, 3, 7, 5, 9,
    // (primary_tile - h * w) * cr == 10 zero bytes pad out the tile.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
173 
TEST(PACK_QU8_DWCONV_GHW_W,primary_tile_gt_kernel_size_channels_gt_cr)174 TEST(PACK_QU8_DWCONV_GHW_W, primary_tile_gt_kernel_size_channels_gt_cr) {
175   size_t primary_tile = 9;
176   size_t h = 2;
177   size_t w = 2;
178   size_t c = 5;
179   size_t cr = 2;
180 
181   std::vector<int32_t> b(c);
182   std::iota(b.begin(), b.end(), 0);  // b = [0, 1, 2, 3, 4]
183   std::vector<uint8_t> k(c * h * w);  // k = [
184                                       //   5, 6,
185                                       //   7, 8,
186                                       //   9, 10,
187                                       //   11, 12,
188                                       //   13, 14,
189                                       //   15, 16,
190                                       //   17, 18,
191                                       //   19, 20,
192                                       //   21, 22,
193                                       //   23, 24]
194   std::iota(k.begin(), k.end(), b.size());
195   std::vector<uint8_t> packed_weights(((primary_tile + sizeof(int32_t)/sizeof(uint8_t)) * round_up_po2(c, cr)));
196 
197   xnn_qu8_packing_params params = {
198     .input_zero_point = 127,
199     .kernel_zero_point = 127,
200   };
201   xnn_pack_qu8_dwconv_ghw_w(
202       primary_tile,
203       h,
204       w,
205       c,
206       cr,
207       k.data(),
208       b.data(),
209       packed_weights.data(),
210       0,
211       &params);
212 
213   const int32_t bias_offset = h * w * params.input_zero_point * params.kernel_zero_point;
214   ASSERT_EQ(bias_offset, 64516);
215   std::vector<uint8_t> expected = {
216     // bias first (cr == 2 of them)
217     // 64516 + 0 - (5 + 6 + 7 + 8) * 127 = 61,214 = 0xEF1E
218     0x1E, 0xEF, 0, 0,
219     // 64516 + 1 - (9 + 10 + 11 + 12) * 127 = 59,183 = 0xE72F
220     0x2F, 0xE7, 0, 0,
221     // then weights, channels first
222     5, 9,
223     // go down the columns first
224     7, 11,
225     6, 10,
226     8, 12,
227     // followed by 10 zeros to make up the difference with primary_tile
228     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
229     // bias first (cr == 2 of them)
230     // 64516 + 2 - (13 + 14 + 15 + 16) * 127 = 57,152 = 0xDF40
231     0x40, 0xDF, 0, 0,
232     // 64516 + 3 - (17 + 18 + 19 + 20) * 127 = 55,121 = 0xD751
233     0x51, 0xD7, 0, 0,
234     // then weights, channels first
235     13, 17, 15, 19, 14, 18, 16, 20,
236     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
237     // bias
238     // 64516 + 4 - (21 + 22 + 23 + 24) * 127 = 53,090 = 0xCF62
239     0x62, 0xCF, 0, 0,
240     0, 0, 0, 0,
241     // weights
242     21, 0, 23, 0, 22, 0, 24, 0,
243     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
244   };
245   ASSERT_EQ(expected, packed_weights);
246 }
247 
// QU8 depthwise packing, HWG kernel layout: a 3x1 kernel whose size equals the
// primary tile, with all channels fitting into a single block of cr channels.
TEST(PACK_QU8_DWCONV_HWG_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  std::vector<uint8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  // Every (rounded-up) channel holds one int32 bias plus primary_tile weights.
  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qu8_packing_params params = {};
  params.input_zero_point = 127;
  params.kernel_zero_point = 127;
  xnn_pack_qu8_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  // Constant term of the hand-computed biases listed below.
  const int32_t bias_offset = h * w * params.input_zero_point * params.kernel_zero_point;
  ASSERT_EQ(bias_offset, 48387);

  const std::vector<uint8_t> expected = {
    // Biases come first, each as 4 little-endian bytes. With channels innermost
    // in the source kernel, channel 0 owns 2, 4, 6 and channel 1 owns 3, 5, 7.
    // 48387 + 0 - (2 + 4 + 6) * 127 = 46863 = 0xB70F
    0x0F, 0xB7, 0, 0,
    // 48387 + 1 - (3 + 5 + 7) * 127 = 46483 = 0xB593
    0x93, 0xB5, 0, 0,
    // Weights follow: one kernel tap per row, channels interleaved.
    2, 3,
    4, 5,
    6, 7,
  };
  ASSERT_EQ(expected, packed_weights);
}
293 
// QU8 depthwise packing, HWG kernel layout: a 3x1 kernel matching the primary
// tile, with 5 channels split into blocks of cr == 2 (last block is partial).
TEST(PACK_QU8_DWCONV_HWG_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // bias = [0, 1, 2, 3, 4]
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  // kernel, one row per tap with the 5 channels innermost:
  //    5,  6,  7,  8,  9,
  //   10, 11, 12, 13, 14,
  //   15, 16, 17, 18, 19
  std::vector<uint8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qu8_packing_params params = {};
  params.input_zero_point = 127;
  params.kernel_zero_point = 127;
  xnn_pack_qu8_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  // Constant term of the hand-computed biases listed below.
  const int32_t bias_offset = h * w * params.input_zero_point * params.kernel_zero_point;
  ASSERT_EQ(bias_offset, 48387);

  const std::vector<uint8_t> expected = {
    // Block 0 (channels 0 and 1): cr biases, then taps with channels interleaved.
    // 48387 + 0 - (5 + 10 + 15) * 127 = 44577 = 0xAE21
    0x21, 0xAE, 0, 0,
    // 48387 + 1 - (6 + 11 + 16) * 127 = 44197 = 0xACA5
    0xA5, 0xAC, 0, 0,
    5, 6, 10, 11, 15, 16,
    // Block 1 (channels 2 and 3).
    // 48387 + 2 - (7 + 12 + 17) * 127 = 43817 = 0xAB29
    0x29, 0xAB, 0, 0,
    // 48387 + 3 - (8 + 13 + 18) * 127 = 43437 = 0xA9AD
    0xAD, 0xA9, 0, 0,
    7, 8, 12, 13, 17, 18,
    // Block 2 (channel 4 only): the unused second lane is expected to be zero.
    // 48387 + 4 - (9 + 14 + 19) * 127 = 43057 = 0xA831
    0x31, 0xA8, 0, 0,
    0, 0, 0, 0,
    9, 0, 14, 0, 19, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
353 
// QU8 depthwise packing, HWG kernel layout: a 2x2 kernel packed into a primary
// tile of 9, so the tail of the tile is expected to be zero padding.
TEST(PACK_QU8_DWCONV_HWG_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  // kernel, one row per tap with the 2 channels innermost:
  //   2, 3,
  //   4, 5,
  //   6, 7,
  //   8, 9
  std::vector<uint8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qu8_packing_params params = {};
  params.input_zero_point = 127;
  params.kernel_zero_point = 127;
  xnn_pack_qu8_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  // Constant term of the hand-computed biases listed below.
  const int32_t bias_offset = h * w * params.input_zero_point * params.kernel_zero_point;
  ASSERT_EQ(bias_offset, 64516);

  const std::vector<uint8_t> expected = {
    // cr == 2 biases first, each as 4 little-endian bytes.
    // 64516 + 0 - (2 + 4 + 6 + 8) * 127 = 61976 = 0xF218
    0x18, 0xF2, 0, 0,
    // 64516 + 1 - (3 + 5 + 7 + 9) * 127 = 61469 = 0xF01D
    0x1D, 0xF0, 0, 0,
    // Weights with channels interleaved, starting at the top-left tap...
    2, 3,
    // ...then walking down each column before moving to the next one.
    6, 7, 4, 5, 8, 9,
    // (primary_tile - h * w) * cr == 10 zero bytes pad out the tile.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
404 
TEST(PACK_QU8_DWCONV_HWG_W,primary_tile_gt_kernel_size_channels_gt_cr)405 TEST(PACK_QU8_DWCONV_HWG_W, primary_tile_gt_kernel_size_channels_gt_cr) {
406   size_t primary_tile = 9;
407   size_t h = 2;
408   size_t w = 2;
409   size_t c = 5;
410   size_t cr = 2;
411 
412   std::vector<int32_t> b(c);
413   std::iota(b.begin(), b.end(), 0);  // b = [0, 1, 2, 3, 4]
414   std::vector<uint8_t> k(c * h * w);  // k = [
415                                       //   5, 6, 7, 8, 9,
416                                       //   10, 11, 12, 13, 14,
417                                       //   15, 16, 17, 18, 19,
418                                       //   20, 21, 22, 23, 24]
419   std::iota(k.begin(), k.end(), b.size());
420   std::vector<uint8_t> packed_weights(((primary_tile + sizeof(int32_t)/sizeof(uint8_t)) * round_up_po2(c, cr)));
421 
422   xnn_qu8_packing_params params = {
423     .input_zero_point = 127,
424     .kernel_zero_point = 127,
425   };
426   xnn_pack_qu8_dwconv_hwg_w(
427       primary_tile,
428       h,
429       w,
430       c,
431       cr,
432       k.data(),
433       b.data(),
434       packed_weights.data(),
435       0,
436       &params);
437 
438   const int32_t bias_offset = h * w * params.input_zero_point * params.kernel_zero_point;
439   ASSERT_EQ(bias_offset, 64516);
440   std::vector<uint8_t> expected = {
441     // bias first (cr == 2 of them)
442     // 64516 + 0 - (5 + 10 + 15 + 20) * 127 = 58166 = 0xE336
443     0x36, 0xE3, 0, 0,
444     // 64516 + 1 - (6 + 11 + 16 + 21) * 127 = 57659 = 0xE13B
445     0x3B, 0xE1, 0, 0,
446     // then weights, channels first
447     5, 6,
448     // go down the columns first
449     15, 16,
450     10, 11,
451     20, 21,
452     // followed by 10 zeros to make up the difference with primary_tile
453     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
454     // bias first (cr == 2 of them)
455     // 64516 + 2 - (7 + 12 + 17 + 22) * 127 = 57152 = 0xDF40
456     0x40, 0xDF, 0, 0,
457     // 64516 + 3 - (8 + 13 + 18 + 23) * 127 = 56645 = 0xDD45
458     0x45, 0xDD, 0, 0,
459     // then weights, channels first
460     7, 8, 17, 18, 12, 13, 22, 23,
461     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
462     // bias
463     // 64516 + 4 - (9 + 14 + 19 + 24) * 127 = 56138 = 0xDB4A
464     0x4A, 0xDB, 0, 0,
465     0, 0, 0, 0,
466     // weights
467     9, 0, 19, 0, 14, 0, 24, 0,
468     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
469   };
470   ASSERT_EQ(expected, packed_weights);
471 }
472 
// QS8 depthwise packing, GHW kernel layout: a 3x1 kernel whose size equals the
// primary tile, with all channels fitting into a single block of cr channels.
TEST(PACK_QS8_DWCONV_GHW_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  std::vector<int8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  // Every (rounded-up) channel holds one int32 bias plus primary_tile weights.
  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qs8_packing_params params = {};
  params.input_zero_point = 127;
  xnn_pack_qs8_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  const std::vector<uint8_t> expected = {
    // Biases come first, each as 4 little-endian two's-complement bytes.
    // channel 0: 0 - (2 + 3 + 4) * 127 = -1143 = 0xFFFFFB89
    0x89, 0xFB, 0xFF, 0xFF,
    // channel 1: 1 - (5 + 6 + 7) * 127 = -2285 = 0xFFFFF713
    0x13, 0xF7, 0xFF, 0xFF,
    // Weights follow: one kernel tap per row, channels interleaved.
    2, 5,
    3, 6,
    4, 7,
  };
  ASSERT_EQ(expected, packed_weights);
}
515 
// QS8 depthwise packing, GHW kernel layout: a 3x1 kernel matching the primary
// tile, with 5 channels split into blocks of cr == 2 (last block is partial).
TEST(PACK_QS8_DWCONV_GHW_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // bias = [0, 1, 2, 3, 4]
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  // kernel, one row per channel:
  //    5,  6,  7,
  //    8,  9, 10,
  //   11, 12, 13,
  //   14, 15, 16,
  //   17, 18, 19
  std::vector<int8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qs8_packing_params params = {};
  params.input_zero_point = 127;
  xnn_pack_qs8_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  const std::vector<uint8_t> expected = {
    // Block 0 (channels 0 and 1): cr biases, then taps with channels interleaved.
    // 0 - (5 + 6 + 7) * 127 = -2286 = 0xFFFFF712
    0x12, 0xF7, 0xFF, 0xFF,
    // 1 - (8 + 9 + 10) * 127 = -3428 = 0xFFFFF29C
    0x9C, 0xF2, 0xFF, 0xFF,
    5, 8, 6, 9, 7, 10,
    // Block 1 (channels 2 and 3).
    // 2 - (11 + 12 + 13) * 127 = -4570 = 0xFFFFEE26
    0x26, 0xEE, 0xFF, 0xFF,
    // 3 - (14 + 15 + 16) * 127 = -5712 = 0xFFFFE9B0
    0xB0, 0xE9, 0xFF, 0xFF,
    11, 14, 12, 15, 13, 16,
    // Block 2 (channel 4 only): the unused second lane is expected to be zero.
    // 4 - (17 + 18 + 19) * 127 = -6854 = 0xFFFFE53A
    0x3A, 0xE5, 0xFF, 0xFF,
    0, 0, 0, 0,
    17, 0, 18, 0, 19, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
574 
// QS8 depthwise packing, GHW kernel layout: a 2x2 kernel packed into a primary
// tile of 9, so the tail of the tile is expected to be zero padding.
TEST(PACK_QS8_DWCONV_GHW_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  // kernel, one 2x2 filter per channel:
  //   channel 0: 2, 3,    channel 1: 6, 7,
  //              4, 5                8, 9
  std::vector<int8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qs8_packing_params params = {};
  params.input_zero_point = 127;
  xnn_pack_qs8_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  const std::vector<uint8_t> expected = {
    // cr == 2 biases first, each as 4 little-endian two's-complement bytes.
    // 0 - (2 + 3 + 4 + 5) * 127 = -1778 = 0xFFFFF90E
    0x0E, 0xF9, 0xFF, 0xFF,
    // 1 - (6 + 7 + 8 + 9) * 127 = -3809 = 0xFFFFF11F
    0x1F, 0xF1, 0xFF, 0xFF,
    // Weights with channels interleaved, starting at the top-left tap...
    2, 6,
    // ...then walking down each column before moving to the next one.
    4, 8, 3, 7, 5, 9,
    // (primary_tile - h * w) * cr == 10 zero bytes pad out the tile.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
622 
TEST(PACK_QS8_DWCONV_GHW_W,primary_tile_gt_kernel_size_channels_gt_cr)623 TEST(PACK_QS8_DWCONV_GHW_W, primary_tile_gt_kernel_size_channels_gt_cr) {
624   size_t primary_tile = 9;
625   size_t h = 2;
626   size_t w = 2;
627   size_t c = 5;
628   size_t cr = 2;
629 
630   std::vector<int32_t> b(c);
631   std::iota(b.begin(), b.end(), 0);  // b = [0, 1, 2, 3, 4]
632   std::vector<int8_t> k(c * h * w);  // k = [
633                                       //   5, 6,
634                                       //   7, 8,
635                                       //   9, 10,
636                                       //   11, 12,
637                                       //   13, 14,
638                                       //   15, 16,
639                                       //   17, 18,
640                                       //   19, 20,
641                                       //   21, 22,
642                                       //   23, 24]
643   std::iota(k.begin(), k.end(), b.size());
644   std::vector<uint8_t> packed_weights(((primary_tile + sizeof(int32_t)/sizeof(uint8_t)) * round_up_po2(c, cr)));
645 
646   xnn_qs8_packing_params params = {
647     .input_zero_point = 127,
648   };
649   xnn_pack_qs8_dwconv_ghw_w(
650       primary_tile,
651       h,
652       w,
653       c,
654       cr,
655       k.data(),
656       b.data(),
657       packed_weights.data(),
658       0,
659       &params);
660 
661   std::vector<uint8_t> expected = {
662     // bias first (cr == 2 of them)
663     // 0 - (5 + 6 + 7 + 8) * 127 = -3302 = 0xFFFFF31A
664     0x1A, 0xF3, 0xFF, 0xFF,
665     // 1 - (9 + 10 + 11 + 12) * 127 = -5333 = 0xFFFFEB2B
666     0x2B, 0xEB, 0xFF, 0xFF,
667     // then weights, channels first
668     5, 9,
669     // go down the columns first
670     7, 11,
671     6, 10,
672     8, 12,
673     // followed by 10 zeros to make up the difference with primary_tile
674     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
675     // bias first (cr == 2 of them)
676     // 2 - (13 + 14 + 15 + 16) * 127 = -7364 = 0xFFFFE33C
677     0x3C, 0xE3, 0xFF, 0xFF,
678     // 3 - (17 + 18 + 19 + 20) * 127 = -9395 = 0xFFFFDB4D
679     0x4D, 0xDB, 0xFF, 0xFF,
680     // then weights, channels first
681     13, 17, 15, 19, 14, 18, 16, 20,
682     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
683     // bias
684     // 4 - (21 + 22 + 23 + 24) * 127 = -11426 = 0xFFFFD35E
685     0x5E, 0xD3, 0xFF, 0xFF,
686     0, 0, 0, 0,
687     // weights
688     21, 0, 23, 0, 22, 0, 24, 0,
689     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
690   };
691   ASSERT_EQ(expected, packed_weights);
692 }
693 
// QS8 depthwise packing, HWG kernel layout: a 3x1 kernel whose size equals the
// primary tile, with all channels fitting into a single block of cr channels.
TEST(PACK_QS8_DWCONV_HWG_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  std::vector<int8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  // Every (rounded-up) channel holds one int32 bias plus primary_tile weights.
  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qs8_packing_params params = {};
  params.input_zero_point = 127;
  xnn_pack_qs8_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  const std::vector<uint8_t> expected = {
    // Biases come first, each as 4 little-endian two's-complement bytes. With
    // channels innermost, channel 0 owns 2, 4, 6 and channel 1 owns 3, 5, 7.
    // 0 - (2 + 4 + 6) * 127 = -1524 = 0xFFFFFA0C
    0x0C, 0xFA, 0xFF, 0xFF,
    // 1 - (3 + 5 + 7) * 127 = -1904 = 0xFFFFF890
    0x90, 0xF8, 0xFF, 0xFF,
    // Weights follow: one kernel tap per row, channels interleaved.
    2, 3,
    4, 5,
    6, 7,
  };
  ASSERT_EQ(expected, packed_weights);
}
736 
// QS8 depthwise packing, HWG kernel layout: a 3x1 kernel matching the primary
// tile, with 5 channels split into blocks of cr == 2 (last block is partial).
TEST(PACK_QS8_DWCONV_HWG_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // bias = [0, 1, 2, 3, 4]
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  // kernel, one row per tap with the 5 channels innermost:
  //    5,  6,  7,  8,  9,
  //   10, 11, 12, 13, 14,
  //   15, 16, 17, 18, 19
  std::vector<int8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qs8_packing_params params = {};
  params.input_zero_point = 127;
  xnn_pack_qs8_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  const std::vector<uint8_t> expected = {
    // Block 0 (channels 0 and 1): cr biases, then taps with channels interleaved.
    // 0 - (5 + 10 + 15) * 127 = -3810 = 0xFFFFF11E
    0x1E, 0xF1, 0xFF, 0xFF,
    // 1 - (6 + 11 + 16) * 127 = -4190 = 0xFFFFEFA2
    0xA2, 0xEF, 0xFF, 0xFF,
    5, 6, 10, 11, 15, 16,
    // Block 1 (channels 2 and 3).
    // 2 - (7 + 12 + 17) * 127 = -4570 = 0xFFFFEE26
    0x26, 0xEE, 0xFF, 0xFF,
    // 3 - (8 + 13 + 18) * 127 = -4950 = 0xFFFFECAA
    0xAA, 0xEC, 0xFF, 0xFF,
    7, 8, 12, 13, 17, 18,
    // Block 2 (channel 4 only): the unused second lane is expected to be zero.
    // 4 - (9 + 14 + 19) * 127 = -5330 = 0xFFFFEB2E
    0x2E, 0xEB, 0xFF, 0xFF,
    0, 0, 0, 0,
    9, 0, 14, 0, 19, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
793 
// QS8 depthwise packing, HWG kernel layout: a 2x2 kernel packed into a primary
// tile of 9, so the tail of the tile is expected to be zero padding.
TEST(PACK_QS8_DWCONV_HWG_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]
  std::vector<int32_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  // kernel, one row per tap with the 2 channels innermost:
  //   2, 3,
  //   4, 5,
  //   6, 7,
  //   8, 9
  std::vector<int8_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  const size_t packed_size =
      (primary_tile + sizeof(int32_t) / sizeof(uint8_t)) * round_up_po2(c, cr);
  std::vector<uint8_t> packed_weights(packed_size);

  xnn_qs8_packing_params params = {};
  params.input_zero_point = 127;
  xnn_pack_qs8_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, &params);

  const std::vector<uint8_t> expected = {
    // cr == 2 biases first, each as 4 little-endian two's-complement bytes.
    // 0 - (2 + 4 + 6 + 8) * 127 = -2540 = 0xFFFFF614
    0x14, 0xF6, 0xFF, 0xFF,
    // 1 - (3 + 5 + 7 + 9) * 127 = -3047 = 0xFFFFF419
    0x19, 0xF4, 0xFF, 0xFF,
    // Weights with channels interleaved, starting at the top-left tap...
    2, 3,
    // ...then walking down each column before moving to the next one.
    6, 7, 4, 5, 8, 9,
    // (primary_tile - h * w) * cr == 10 zero bytes pad out the tile.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
841 
TEST(PACK_QS8_DWCONV_HWG_W,primary_tile_gt_kernel_size_channels_gt_cr)842 TEST(PACK_QS8_DWCONV_HWG_W, primary_tile_gt_kernel_size_channels_gt_cr) {
843   size_t primary_tile = 9;
844   size_t h = 2;
845   size_t w = 2;
846   size_t c = 5;
847   size_t cr = 2;
848 
849   std::vector<int32_t> b(c);
850   std::iota(b.begin(), b.end(), 0);  // b = [0, 1, 2, 3, 4]
851   std::vector<int8_t> k(c * h * w);  // k = [
852                                       //   5, 6, 7, 8, 9,
853                                       //   10, 11, 12, 13, 14,
854                                       //   15, 16, 17, 18, 19,
855                                       //   20, 21, 22, 23, 24]
856   std::iota(k.begin(), k.end(), b.size());
857   std::vector<uint8_t> packed_weights(((primary_tile + sizeof(int32_t)/sizeof(uint8_t)) * round_up_po2(c, cr)));
858 
859   xnn_qs8_packing_params params = {
860     .input_zero_point = 127,
861   };
862   xnn_pack_qs8_dwconv_hwg_w(
863       primary_tile,
864       h,
865       w,
866       c,
867       cr,
868       k.data(),
869       b.data(),
870       packed_weights.data(),
871       0,
872       &params);
873 
874   std::vector<uint8_t> expected = {
875     // bias first (cr == 2 of them)
876     // 0 - (5 + 10 + 15 + 20) * 127 = -6350 = 0xFFFFE732
877     0x32, 0xE7, 0xFF, 0xFF,
878     // 1 - (6 + 11 + 16 + 21) * 127 = -6857 = 0xFFFFE537
879     0x37, 0xE5, 0xFF, 0xFF,
880     // then weights, channels first
881     5, 6,
882     // go down the columns first
883     15, 16,
884     10, 11,
885     20, 21,
886     // followed by 10 zeros to make up the difference with primary_tile
887     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
888     // bias first (cr == 2 of them)
889     // 2 - (7 + 12 + 17 + 22) * 127 = -7364 = 0xFFFFE33C
890     0x3C, 0xE3, 0xFF, 0xFF,
891     // 3 - (8 + 13 + 18 + 23) * 127 = -7871 = 0xFFFFE141
892     0x41, 0xE1, 0xFF, 0xFF,
893     // then weights, channels first
894     7, 8, 17, 18, 12, 13, 22, 23,
895     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
896     // bias
897     // 4 - (9 + 14 + 19 + 24) * 127 = -8378 = 0xFFFFDF46
898     0x46, 0xDF, 0xFF, 0xFF,
899     0, 0, 0, 0,
900     // weights
901     9, 0, 19, 0, 14, 0, 24, 0,
902     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
903   };
904   ASSERT_EQ(expected, packed_weights);
905 }
906 
// F16 depthwise packing, GHW kernel layout: a 3x1 kernel whose size equals the
// primary tile, with all channels fitting into a single block of cr channels.
// Values are raw 16-bit patterns, so the expected output is a pure reordering.
TEST(PACK_F16_DWCONV_GHW_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<uint16_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  std::vector<uint16_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  // Every (rounded-up) channel holds one bias element plus primary_tile weights.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f16_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<uint16_t> expected = {
    // Biases for both channels come first.
    0, 1,
    // Weights follow: one kernel tap per row, channels interleaved.
    2, 5,
    3, 6,
    4, 7,
  };
  ASSERT_EQ(expected, packed_weights);
}
942 
// F16 depthwise packing, GHW kernel layout: a 3x1 kernel matching the primary
// tile, with 5 channels split into blocks of cr == 2 (last block is partial).
// Values are raw 16-bit patterns, so the expected output is a pure reordering.
TEST(PACK_F16_DWCONV_GHW_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // bias = [0, 1, 2, 3, 4]
  std::vector<uint16_t> bias(c);
  std::iota(bias.begin(), bias.end(), 0);
  // kernel, one row per channel:
  //    5,  6,  7,
  //    8,  9, 10,
  //   11, 12, 13,
  //   14, 15, 16,
  //   17, 18, 19
  std::vector<uint16_t> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), bias.size());

  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f16_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<uint16_t> expected = {
    // Block 0 (channels 0 and 1): cr biases, then taps with channels interleaved.
    0, 1,
    5, 8, 6, 9, 7, 10,
    // Block 1 (channels 2 and 3).
    2, 3,
    11, 14, 12, 15, 13, 16,
    // Block 2 (channel 4 only): the unused second lane is expected to be zero.
    4, 0,
    17, 0, 18, 0, 19, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
990 
// GHW f16 packing where the primary tile (9) is larger than the 2x2 kernel
// and the channel count equals the channel tile (c == cr == 2).
TEST(PACK_F16_DWCONV_GHW_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence:
  //   channel 0:  2, 3,
  //               4, 5,
  //   channel 1:  6, 7,
  //               8, 9
  std::vector<uint16_t> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<uint16_t> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f16_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<uint16_t> expected = {
    // the cr biases lead the block
    0, 1,
    // first tap, both channels
    2, 6,
    // remaining taps, walking down each kernel column before moving right
    4, 8, 3, 7, 5, 9,
    // (primary_tile - h * w) * cr == 10 zeros of padding
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
1032 
// GHW f16 packing where the primary tile (9) is larger than the 2x2 kernel
// and there are more channels (5) than the channel tile (2).
TEST(PACK_F16_DWCONV_GHW_W, primary_tile_gt_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 5;
  const size_t cr = 2;

  // Bias is [0, 1, 2, 3, 4]; the kernel continues the sequence, a 2x2 patch
  // per channel:
  //   channel 0:  5, 6,     channel 1:  9, 10,    channel 2:  13, 14,
  //               7, 8,                 11, 12,               15, 16,
  //   channel 3:  17, 18,   channel 4:  21, 22,
  //               19, 20,               23, 24
  std::vector<uint16_t> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<uint16_t> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f16_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<uint16_t> expected = {
    // block 0 (channels 0 and 1): biases
    0, 1,
    // first tap, both channels
    5, 9,
    // remaining taps, walking down each kernel column before moving right
    7, 11,
    6, 10,
    8, 12,
    // (primary_tile - h * w) * cr == 10 zeros of padding
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // block 1 (channels 2 and 3): biases
    2, 3,
    // weights, same tap order as block 0
    13, 17, 15, 19, 14, 18, 16, 20,
    // padding
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // block 2 (channel 4 only): bias, unused lane is zero
    4, 0,
    // weights, unused lane is zero
    21, 0, 23, 0, 22, 0, 24, 0,
    // padding
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
1092 
// HWG f16 packing where the primary tile equals the kernel size (3x1) and the
// channel count equals the channel tile (c == cr == 2).
TEST(PACK_F16_DWCONV_HWG_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<uint16_t> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<uint16_t> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f16_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<uint16_t> expected = {
    // the cr biases lead the block
    0, 1,
    // one row per kernel tap, the two channels side by side
    2, 3,
    4, 5,
    6, 7,
  };
  ASSERT_EQ(expected, packed_weights);
}
1128 
// HWG f16 packing where the primary tile equals the kernel size (3x1) and
// there are more channels (5) than the channel tile (2).
TEST(PACK_F16_DWCONV_HWG_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // Bias is [0, 1, 2, 3, 4]; the kernel continues the sequence, one row of
  // five channels per tap:
  //   5, 6, 7, 8, 9,
  //   10, 11, 12, 13, 14,
  //   15, 16, 17, 18, 19
  std::vector<uint16_t> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<uint16_t> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f16_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<uint16_t> expected = {
    // block 0 (channels 0 and 1): biases, then taps with channels interleaved
    0, 1,
    5, 6, 10, 11, 15, 16,
    // block 1 (channels 2 and 3)
    2, 3,
    7, 8, 12, 13, 17, 18,
    // block 2 (channel 4 only): the unused second lane is zero
    4, 0,
    9, 0, 14, 0, 19, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
1174 
// HWG f16 packing where the primary tile (9) is larger than the 2x2 kernel
// and the channel count equals the channel tile (c == cr == 2).
TEST(PACK_F16_DWCONV_HWG_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence, one row of two
  // channels per tap:
  //   2, 3,
  //   4, 5,
  //   6, 7,
  //   8, 9
  std::vector<uint16_t> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<uint16_t> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f16_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<uint16_t> expected = {
    // the cr biases lead the block
    0, 1,
    // first tap, both channels
    2, 3,
    // remaining taps, walking down each kernel column before moving right
    6, 7, 4, 5, 8, 9,
    // (primary_tile - h * w) * cr == 10 zeros of padding
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
1216 
// HWG f16 packing where the primary tile (9) is larger than the 2x2 kernel
// and there are more channels (5) than the channel tile (2).
TEST(PACK_F16_DWCONV_HWG_W, primary_tile_gt_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 5;
  const size_t cr = 2;

  // Bias is [0, 1, 2, 3, 4]; the kernel continues the sequence, one row of
  // five channels per tap:
  //   5, 6, 7, 8, 9,
  //   10, 11, 12, 13, 14,
  //   15, 16, 17, 18, 19,
  //   20, 21, 22, 23, 24
  std::vector<uint16_t> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<uint16_t> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f16_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<uint16_t> expected = {
    // block 0 (channels 0 and 1): biases
    0, 1,
    // first tap, both channels
    5, 6,
    // remaining taps, walking down each kernel column before moving right
    15, 16,
    10, 11,
    20, 21,
    // (primary_tile - h * w) * cr == 10 zeros of padding
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // block 1 (channels 2 and 3): biases
    2, 3,
    // weights, same tap order as block 0
    7, 8, 17, 18, 12, 13, 22, 23,
    // padding
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // block 2 (channel 4 only): bias, unused lane is zero
    4, 0,
    // weights, unused lane is zero
    9, 0, 19, 0, 14, 0, 24, 0,
    // padding
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  ASSERT_EQ(expected, packed_weights);
}
1270 
// GHW f32 packing where the primary tile equals the kernel size (3x1) and the
// channel count equals the channel tile (c == cr == 2).
TEST(PACK_F32_DWCONV_GHW_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<float> packed_weights(packed_size);

  xnn_pack_f32_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected = {
    // the cr biases lead the block
    0.0f, 1.0f,
    // one row per kernel tap, the two channels side by side
    2.0f, 5.0f,
    3.0f, 6.0f,
    4.0f, 7.0f,
  };
  ASSERT_EQ(expected, packed_weights);
}
1306 
// GHW f32 packing where the primary tile equals the kernel size (3x1) and
// there are more channels (5) than the channel tile (2).
TEST(PACK_F32_DWCONV_GHW_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // Bias is [0, 1, 2, 3, 4]; the kernel continues the sequence, one row of
  // three taps per channel:
  //   5, 6, 7,
  //   8, 9, 10,
  //   11, 12, 13,
  //   14, 15, 16,
  //   17, 18, 19
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<float> packed_weights(packed_size);

  xnn_pack_f32_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected = {
    // block 0 (channels 0 and 1): biases, then taps with channels interleaved
    0.0f, 1.0f,
    5.0f, 8.0f, 6.0f, 9.0f, 7.0f, 10.0f,
    // block 1 (channels 2 and 3)
    2.0f, 3.0f,
    11.0f, 14.0f, 12.0f, 15.0f, 13.0f, 16.0f,
    // block 2 (channel 4 only): the unused second lane is zero
    4.0f, 0.0f,
    17.0f, 0.0f, 18.0f, 0.0f, 19.0f, 0.0f,
  };
  ASSERT_EQ(expected, packed_weights);
}
1354 
// GHW f32 packing where the primary tile (9) is larger than the 2x2 kernel
// and the channel count equals the channel tile (c == cr == 2).
TEST(PACK_F32_DWCONV_GHW_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence:
  //   channel 0:  2, 3,
  //               4, 5,
  //   channel 1:  6, 7,
  //               8, 9
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<float> packed_weights(packed_size);

  xnn_pack_f32_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected = {
    // the cr biases lead the block
    0.0f, 1.0f,
    // first tap, both channels
    2.0f, 6.0f,
    // remaining taps, walking down each kernel column before moving right
    4.0f, 8.0f, 3.0f, 7.0f, 5.0f, 9.0f,
    // (primary_tile - h * w) * cr == 10 zeros of padding
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
  };
  ASSERT_EQ(expected, packed_weights);
}
1396 
// GHW f32 packing where the primary tile (9) is larger than the 2x2 kernel
// and there are more channels (5) than the channel tile (2).
TEST(PACK_F32_DWCONV_GHW_W, primary_tile_gt_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 5;
  const size_t cr = 2;

  // Bias is [0, 1, 2, 3, 4]; the kernel continues the sequence, a 2x2 patch
  // per channel:
  //   channel 0:  5, 6,     channel 1:  9, 10,    channel 2:  13, 14,
  //               7, 8,                 11, 12,               15, 16,
  //   channel 3:  17, 18,   channel 4:  21, 22,
  //               19, 20,               23, 24
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<float> packed_weights(packed_size);

  xnn_pack_f32_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected = {
    // block 0 (channels 0 and 1): biases
    0.0f, 1.0f,
    // first tap, both channels
    5.0f, 9.0f,
    // remaining taps, walking down each kernel column before moving right
    7.0f, 11.0f,
    6.0f, 10.0f,
    8.0f, 12.0f,
    // (primary_tile - h * w) * cr == 10 zeros of padding
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
    // block 1 (channels 2 and 3): biases
    2.0f, 3.0f,
    // weights, same tap order as block 0
    13.0f, 17.0f, 15.0f, 19.0f, 14.0f, 18.0f, 16.0f, 20.0f,
    // padding
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
    // block 2 (channel 4 only): bias, unused lane is zero
    4.0f, 0.0f,
    // weights, unused lane is zero
    21.0f, 0.0f, 23.0f, 0.0f, 22.0f, 0.0f, 24.0f, 0.0f,
    // padding
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
  };
  ASSERT_EQ(expected, packed_weights);
}
1456 
// HWG f32 packing where the primary tile equals the kernel size (3x1) and the
// channel count equals the channel tile (c == cr == 2).
TEST(PACK_F32_DWCONV_HWG_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<float> packed_weights(packed_size);

  xnn_pack_f32_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected = {
    // the cr biases lead the block
    0.0f, 1.0f,
    // one row per kernel tap, the two channels side by side
    2.0f, 3.0f,
    4.0f, 5.0f,
    6.0f, 7.0f,
  };
  ASSERT_EQ(expected, packed_weights);
}
1492 
// HWG f32 packing where the primary tile equals the kernel size (3x1) and
// there are more channels (5) than the channel tile (2).
TEST(PACK_F32_DWCONV_HWG_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // Bias is [0, 1, 2, 3, 4]; the kernel continues the sequence, one row of
  // five channels per tap:
  //   5, 6, 7, 8, 9,
  //   10, 11, 12, 13, 14,
  //   15, 16, 17, 18, 19
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<float> packed_weights(packed_size);

  xnn_pack_f32_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected = {
    // block 0 (channels 0 and 1): biases, then taps with channels interleaved
    0.0f, 1.0f,
    5.0f, 6.0f, 10.0f, 11.0f, 15.0f, 16.0f,
    // block 1 (channels 2 and 3)
    2.0f, 3.0f,
    7.0f, 8.0f, 12.0f, 13.0f, 17.0f, 18.0f,
    // block 2 (channel 4 only): the unused second lane is zero
    4.0f, 0.0f,
    9.0f, 0.0f, 14.0f, 0.0f, 19.0f, 0.0f,
  };
  ASSERT_EQ(expected, packed_weights);
}
1538 
// HWG f32 packing where the primary tile (9) is larger than the 2x2 kernel
// and the channel count equals the channel tile (c == cr == 2).
TEST(PACK_F32_DWCONV_HWG_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence, one row of two
  // channels per tap:
  //   2, 3,
  //   4, 5,
  //   6, 7,
  //   8, 9
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<float> packed_weights(packed_size);

  xnn_pack_f32_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected = {
    // the cr biases lead the block
    0.0f, 1.0f,
    // first tap, both channels
    2.0f, 3.0f,
    // remaining taps, walking down each kernel column before moving right
    6.0f, 7.0f, 4.0f, 5.0f, 8.0f, 9.0f,
    // (primary_tile - h * w) * cr == 10 zeros of padding
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
  };
  ASSERT_EQ(expected, packed_weights);
}
1580 
// HWG f32 packing where the primary tile (9) is larger than the 2x2 kernel
// and there are more channels (5) than the channel tile (2).
TEST(PACK_F32_DWCONV_HWG_W, primary_tile_gt_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 5;
  const size_t cr = 2;

  // Bias is [0, 1, 2, 3, 4]; the kernel continues the sequence, one row of
  // five channels per tap:
  //   5, 6, 7, 8, 9,
  //   10, 11, 12, 13, 14,
  //   15, 16, 17, 18, 19,
  //   20, 21, 22, 23, 24
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<float> packed_weights(packed_size);

  xnn_pack_f32_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected = {
    // block 0 (channels 0 and 1): biases
    0.0f, 1.0f,
    // first tap, both channels
    5.0f, 6.0f,
    // remaining taps, walking down each kernel column before moving right
    15.0f, 16.0f,
    10.0f, 11.0f,
    20.0f, 21.0f,
    // (primary_tile - h * w) * cr == 10 zeros of padding
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
    // block 1 (channels 2 and 3): biases
    2.0f, 3.0f,
    // weights, same tap order as block 0
    7.0f, 8.0f, 17.0f, 18.0f, 12.0f, 13.0f, 22.0f, 23.0f,
    // padding
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
    // block 2 (channel 4 only): bias, unused lane is zero
    4.0f, 0.0f,
    // weights, unused lane is zero
    9.0f, 0.0f, 19.0f, 0.0f, 14.0f, 0.0f, 24.0f, 0.0f,
    // padding
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
  };
  ASSERT_EQ(expected, packed_weights);
}
1634 
// GHW f32-to-f16 packing where the primary tile equals the kernel size (3x1)
// and the channel count equals the channel tile (c == cr == 2).
TEST(PACK_F32_TO_F16_DWCONV_GHW_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f32_to_f16_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected_float = {
    // the cr biases lead the block
    0.0f, 1.0f,
    // one row per kernel tap, the two channels side by side
    2.0f, 5.0f,
    3.0f, 6.0f,
    4.0f, 7.0f,
  };
  // The packed buffer holds uint16_t values, so convert the float reference
  // with fp16_ieee_from_fp32_value before comparing.
  std::vector<uint16_t> expected;
  expected.reserve(expected_float.size());
  for (const float value : expected_float) {
    expected.push_back(fp16_ieee_from_fp32_value(value));
  }
  ASSERT_EQ(expected, packed_weights);
}
1673 
// GHW f32-to-f16 packing where the primary tile equals the kernel size (3x1)
// and there are more channels (5) than the channel tile (2).
TEST(PACK_F32_TO_F16_DWCONV_GHW_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // Bias is [0, 1, 2, 3, 4]; the kernel continues the sequence, one row of
  // three taps per channel:
  //   5, 6, 7,
  //   8, 9, 10,
  //   11, 12, 13,
  //   14, 15, 16,
  //   17, 18, 19
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f32_to_f16_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected_float = {
    // block 0 (channels 0 and 1): biases, then taps with channels interleaved
    0.0f, 1.0f,
    5.0f, 8.0f, 6.0f, 9.0f, 7.0f, 10.0f,
    // block 1 (channels 2 and 3)
    2.0f, 3.0f,
    11.0f, 14.0f, 12.0f, 15.0f, 13.0f, 16.0f,
    // block 2 (channel 4 only): the unused second lane is zero
    4.0f, 0.0f,
    17.0f, 0.0f, 18.0f, 0.0f, 19.0f, 0.0f,
  };
  // The packed buffer holds uint16_t values, so convert the float reference
  // with fp16_ieee_from_fp32_value before comparing.
  std::vector<uint16_t> expected;
  expected.reserve(expected_float.size());
  for (const float value : expected_float) {
    expected.push_back(fp16_ieee_from_fp32_value(value));
  }
  ASSERT_EQ(expected, packed_weights);
}
1724 
// GHW f32-to-f16 packing where the primary tile (9) is larger than the 2x2
// kernel and the channel count equals the channel tile (c == cr == 2).
TEST(PACK_F32_TO_F16_DWCONV_GHW_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence:
  //   channel 0:  2, 3,
  //               4, 5,
  //   channel 1:  6, 7,
  //               8, 9
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f32_to_f16_dwconv_ghw_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected_float = {
    // the cr biases lead the block
    0.0f, 1.0f,
    // first tap, both channels
    2.0f, 6.0f,
    // remaining taps, walking down each kernel column before moving right
    4.0f, 8.0f, 3.0f, 7.0f, 5.0f, 9.0f,
    // (primary_tile - h * w) * cr == 10 zeros of padding
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
  };
  // The packed buffer holds uint16_t values, so convert the float reference
  // with fp16_ieee_from_fp32_value before comparing.
  std::vector<uint16_t> expected;
  expected.reserve(expected_float.size());
  for (const float value : expected_float) {
    expected.push_back(fp16_ieee_from_fp32_value(value));
  }
  ASSERT_EQ(expected, packed_weights);
}
1769 
TEST(PACK_F32_TO_F16_DWCONV_GHW_W,primary_tile_gt_kernel_size_channels_gt_cr)1770 TEST(PACK_F32_TO_F16_DWCONV_GHW_W, primary_tile_gt_kernel_size_channels_gt_cr) {
1771   size_t primary_tile = 9;
1772   size_t h = 2;
1773   size_t w = 2;
1774   size_t c = 5;
1775   size_t cr = 2;
1776 
1777   std::vector<float> b(c);
1778   std::iota(b.begin(), b.end(), 0);  // b = [0, 1, 2, 3, 4]
1779   std::vector<float> k(c * h * w);  // k = [
1780                                       //   5, 6,
1781                                       //   7, 8,
1782                                       //   9, 10,
1783                                       //   11, 12,
1784                                       //   13, 14,
1785                                       //   15, 16,
1786                                       //   17, 18,
1787                                       //   19, 20,
1788                                       //   21, 22,
1789                                       //   23, 24]
1790   std::iota(k.begin(), k.end(), b.size());
1791   std::vector<uint16_t> packed_weights(((primary_tile + 1) * round_up_po2(c, cr)));
1792 
1793   xnn_pack_f32_to_f16_dwconv_ghw_w(
1794       primary_tile,
1795       h,
1796       w,
1797       c,
1798       cr,
1799       k.data(),
1800       b.data(),
1801       packed_weights.data(),
1802       0,
1803       nullptr);
1804 
1805   std::vector<float> expected_float = {
1806     // bias first (cr == 2 of them)
1807     0.0f, 1.0f,
1808     // then weights, channels first
1809     5.0f, 9.0f,
1810     // go down the columns first
1811     7.0f, 11.0f,
1812     6.0f, 10.0f,
1813     8.0f, 12.0f,
1814     // followed by 10 zeros to make up the difference with primary_tile
1815     0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
1816     // bias first (cr == 2 of them)
1817     2.0f, 3.0f,
1818     // then weights, channels first
1819     13.0f, 17.0f, 15.0f, 19.0f, 14.0f, 18.0f, 16.0f, 20.0f,
1820     0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
1821     // bias
1822     4.0f, 0.0f,
1823     // weights
1824     21.0f, 0.0f, 23.0f, 0.0f, 22.0f, 0.0f, 24.0f, 0.0f,
1825     0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
1826   };
1827   std::vector<uint16_t> expected(expected_float.size());
1828   std::transform(expected_float.begin(), expected_float.end(), expected.begin(),
1829                  [](float f) { return fp16_ieee_from_fp32_value(f); });
1830   ASSERT_EQ(expected, packed_weights);
1831 }
1832 
// HWG f32-to-f16 packing where the primary tile equals the kernel size (3x1)
// and the channel count equals the channel tile (c == cr == 2).
TEST(PACK_F32_TO_F16_DWCONV_HWG_W, primary_tile_eq_kernel_size) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 2;
  const size_t cr = 2;

  // Bias is [0, 1]; the kernel continues the sequence: [2, 3, 4, 5, 6, 7].
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f32_to_f16_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected_float = {
    // the cr biases lead the block
    0.0f, 1.0f,
    // one row per kernel tap, the two channels side by side
    2.0f, 3.0f,
    4.0f, 5.0f,
    6.0f, 7.0f,
  };
  // The packed buffer holds uint16_t values, so convert the float reference
  // with fp16_ieee_from_fp32_value before comparing.
  std::vector<uint16_t> expected;
  expected.reserve(expected_float.size());
  for (const float value : expected_float) {
    expected.push_back(fp16_ieee_from_fp32_value(value));
  }
  ASSERT_EQ(expected, packed_weights);
}
1871 
// HWG f32-to-f16 packing where the primary tile equals the kernel size (3x1)
// and there are more channels (5) than the channel tile (2).
TEST(PACK_F32_TO_F16_DWCONV_HWG_W, primary_tile_eq_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 3;
  const size_t h = 3;
  const size_t w = 1;
  const size_t c = 5;
  const size_t cr = 2;

  // Bias is [0, 1, 2, 3, 4]; the kernel continues the sequence, one row of
  // five channels per tap:
  //   5, 6, 7, 8, 9,
  //   10, 11, 12, 13, 14,
  //   15, 16, 17, 18, 19
  std::vector<float> b(c);
  std::iota(b.begin(), b.end(), 0);
  std::vector<float> k(c * h * w);
  std::iota(k.begin(), k.end(), b.size());

  // (primary_tile + 1) elements for each of round_up_po2(c, cr) channels.
  const size_t packed_size = (primary_tile + 1) * round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights(packed_size);

  xnn_pack_f32_to_f16_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      k.data(), b.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected_float = {
    // block 0 (channels 0 and 1): biases, then taps with channels interleaved
    0.0f, 1.0f,
    5.0f, 6.0f, 10.0f, 11.0f, 15.0f, 16.0f,
    // block 1 (channels 2 and 3)
    2.0f, 3.0f,
    7.0f, 8.0f, 12.0f, 13.0f, 17.0f, 18.0f,
    // block 2 (channel 4 only): the unused second lane is zero
    4.0f, 0.0f,
    9.0f, 0.0f, 14.0f, 0.0f, 19.0f, 0.0f,
  };
  // The packed buffer holds uint16_t values, so convert the float reference
  // with fp16_ieee_from_fp32_value before comparing.
  std::vector<uint16_t> expected;
  expected.reserve(expected_float.size());
  for (const float value : expected_float) {
    expected.push_back(fp16_ieee_from_fp32_value(value));
  }
  ASSERT_EQ(expected, packed_weights);
}
1920 
// Packs a 2x2 kernel over 2 channels (one cr == 2 block) with a primary tile
// of 9, which is larger than the kernel size (h * w == 4). The remaining
// (9 - 4) * 2 elements of the block are expected to be zero.
TEST(PACK_F32_TO_F16_DWCONV_HWG_W, primary_tile_gt_kernel_size) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 2;
  const size_t cr = 2;

  // bias = [0, 1]
  std::vector<float> bias(c);
  std::iota(bias.begin(), bias.end(), 0.0f);
  // kernel = [
  //   2, 3,
  //   4, 5,
  //   6, 7,
  //   8, 9]
  std::vector<float> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), static_cast<float>(bias.size()));

  // One bias element plus primary_tile weight elements per padded channel.
  const auto padded_channels = round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights((primary_tile + 1) * padded_channels);

  xnn_pack_f32_to_f16_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected_float = {
    // bias for both channels
    0.0f, 1.0f,
    // weights for the first kernel position, both channels
    2.0f, 3.0f,
    // remaining positions, walking down each column before moving right
    6.0f, 7.0f, 4.0f, 5.0f, 8.0f, 9.0f,
    // 10 zeros filling the gap between the kernel size and primary_tile
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
  };
  // Convert the reference values to half precision for comparison.
  std::vector<uint16_t> expected;
  expected.reserve(expected_float.size());
  for (const float value : expected_float) {
    expected.push_back(fp16_ieee_from_fp32_value(value));
  }
  ASSERT_EQ(expected, packed_weights);
}
1965 
// Packs a 2x2 kernel over 5 channels with cr == 2 (three cr-sized blocks, the
// last one half populated) and a primary tile of 9, which exceeds the kernel
// size (h * w == 4). Every block is expected to end with 10 zero elements.
TEST(PACK_F32_TO_F16_DWCONV_HWG_W, primary_tile_gt_kernel_size_channels_gt_cr) {
  const size_t primary_tile = 9;
  const size_t h = 2;
  const size_t w = 2;
  const size_t c = 5;
  const size_t cr = 2;

  // bias = [0, 1, 2, 3, 4]
  std::vector<float> bias(c);
  std::iota(bias.begin(), bias.end(), 0.0f);
  // kernel = [
  //    5,  6,  7,  8,  9,
  //   10, 11, 12, 13, 14,
  //   15, 16, 17, 18, 19,
  //   20, 21, 22, 23, 24]
  std::vector<float> kernel(c * h * w);
  std::iota(kernel.begin(), kernel.end(), static_cast<float>(bias.size()));

  // One bias element plus primary_tile weight elements per padded channel.
  const auto padded_channels = round_up_po2(c, cr);
  std::vector<uint16_t> packed_weights((primary_tile + 1) * padded_channels);

  xnn_pack_f32_to_f16_dwconv_hwg_w(
      primary_tile, h, w, c, cr,
      kernel.data(), bias.data(), packed_weights.data(),
      0, nullptr);

  const std::vector<float> expected_float = {
    // block 0: bias for channels 0-1
    0.0f, 1.0f,
    // weights for the first kernel position
    5.0f, 6.0f,
    // remaining positions, walking down each column before moving right
    15.0f, 16.0f,
    10.0f, 11.0f,
    20.0f, 21.0f,
    // 10 zeros filling the gap between the kernel size and primary_tile
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
    // block 1: bias for channels 2-3
    2.0f, 3.0f,
    // weights in the same column-first order, then the zero fill
    7.0f, 8.0f, 17.0f, 18.0f, 12.0f, 13.0f, 22.0f, 23.0f,
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
    // block 2: bias for channel 4; the second lane is expected to be zero
    4.0f, 0.0f,
    // weights for channel 4 with a zero in the unused lane, then the zero fill
    9.0f, 0.0f, 19.0f, 0.0f, 14.0f, 0.0f, 24.0f, 0.0f,
    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
  };
  // Convert the reference values to half precision for comparison.
  std::vector<uint16_t> expected;
  expected.reserve(expected_float.size());
  for (const float value : expected_float) {
    expected.push_back(fp16_ieee_from_fp32_value(value));
  }
  ASSERT_EQ(expected, packed_weights);
}
2022