xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/nhwc/direct_convolution.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1*c217d954SCole FaustR"(
2*c217d954SCole Faust
3*c217d954SCole Faust
4*c217d954SCole Faust
5*c217d954SCole Faust
6*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
7*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
8*c217d954SCole Faust
9*c217d954SCole Faust
10*c217d954SCole Faust
11*c217d954SCole Faust
/** Store the rows of a block that is held in consecutively named variables.
 *
 * STORE_ROW_n emits n stores, one per row. Row i reads the variable BASENAME##i
 * and writes it to the address PTR + i * STRIDE_Y + Z##i, where the suffix i is
 * spelled 0-9 for the first ten rows and A-F for rows 10-15.
 * Each macro first expands the macro for n-1 rows and then appends its own row.
 *
 * @param N0        Forwarded to VSTORE (defined outside this excerpt) to pick the store routine.
 * @param DATA_TYPE Element type used for the destination pointer cast.
 * @param BASENAME  Prefix of the row variables.
 * @param PTR       Base address; row and Z offsets are added to it directly.
 * @param STRIDE_Y  Multiplied by the row index to form the row offset.
 * @param Z         Prefix of the per-row extra offsets (Z##0, Z##1, ...).
 *
 * Every expansion already ends with a semicolon, which is what lets the macros be
 * chained back to back without a separator.
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
90*c217d954SCole Faust
91*c217d954SCole Faust
92*c217d954SCole Faust
/** Convert and store the first rows (up to nine) of a block held in numbered variables.
 *
 * CONVERT_STORE_ROW_n emits n stores. Row i passes BASENAME##i through
 * CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) and hands the result to VSTORE(N0),
 * targeting the address PTR + i * STRIDE_Y + Z##i. CONVERT_SAT, VEC_DATA_TYPE and
 * VSTORE are defined outside this excerpt.
 * Each macro first expands the macro for n-1 rows and then appends its own row.
 *
 * @param N0        Forwarded to VSTORE and VEC_DATA_TYPE.
 * @param DATA_TYPE Element type used for the conversion target and the pointer cast.
 * @param BASENAME  Prefix of the row variables.
 * @param PTR       Base address; row and Z offsets are added to it directly.
 * @param STRIDE_Y  Multiplied by the row index to form the row offset.
 * @param Z         Prefix of the per-row extra offsets (Z##0, Z##1, ...).
 *
 * Every expansion already ends with a semicolon so the macros can be chained.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
136*c217d954SCole Faust
/** Convert and store ten rows (BASENAME##0 ... BASENAME##9) of a block.
 *
 * Expands CONVERT_STORE_ROW_9 for rows 0-8 and then appends row 9: BASENAME##9 is
 * passed through CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) and stored with
 * VSTORE(N0) at PTR + 9 * STRIDE_Y + Z##9.
 *
 * The second parameter must be spelled DATA_TYPE because the replacement text refers
 * to it by that name. With any other spelling the caller's type argument is ignored
 * and DATA_TYPE resolves to whatever global macro (if any) happens to be defined.
 *
 * @param N0        Forwarded to VSTORE and VEC_DATA_TYPE.
 * @param DATA_TYPE Element type used for the conversion target and the pointer cast.
 * @param BASENAME  Prefix of the row variables.
 * @param PTR       Base address; row and Z offsets are added to it directly.
 * @param STRIDE_Y  Multiplied by the row index to form the row offset.
 * @param Z         Prefix of the per-row extra offsets.
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
141*c217d954SCole Faust
/** Convert and store rows 10-15 (suffixes A-F) on top of the ten-row macro.
 *
 * CONVERT_STORE_ROW_n (n = 11..16) first expands the macro for n-1 rows and then
 * appends one more store: the row variable is passed through
 * CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) and stored with VSTORE(N0) at
 * PTR + i * STRIDE_Y + Z##i, where the row index i = n-1 is written in decimal in
 * the address and as a hexadecimal letter in the pasted names.
 *
 * @param N0        Forwarded to VSTORE and VEC_DATA_TYPE.
 * @param DATA_TYPE Element type used for the conversion target and the pointer cast.
 * @param BASENAME  Prefix of the row variables.
 * @param PTR       Base address; row and Z offsets are added to it directly.
 * @param STRIDE_Y  Multiplied by the row index to form the row offset.
 * @param Z         Prefix of the per-row extra offsets.
 *
 * Every expansion already ends with a semicolon so the macros can be chained.
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
171*c217d954SCole Faust
172*c217d954SCole Faust
173*c217d954SCole Faust
174*c217d954SCole Faust
/** Store an M0-row block by dispatching to STORE_ROW_<M0>.
 *
 * STORE_BLOCK_STR pastes M0 onto the STORE_ROW_ prefix as written. STORE_BLOCK adds
 * one level of indirection so that an M0 argument that is itself a macro is expanded
 * to its value before the paste happens. All remaining arguments are forwarded
 * unchanged to the selected STORE_ROW_n macro.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
177*c217d954SCole Faust
178*c217d954SCole Faust
179*c217d954SCole Faust
/** Convert and store an M0-row block by dispatching to CONVERT_STORE_ROW_<M0>.
 *
 * CONVERT_STORE_BLOCK_STR pastes M0 onto the CONVERT_STORE_ROW_ prefix as written.
 * CONVERT_STORE_BLOCK adds one level of indirection so that an M0 argument that is
 * itself a macro is expanded to its value before the paste happens. All remaining
 * arguments are forwarded unchanged.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
182*c217d954SCole Faust
183*c217d954SCole Faust
184*c217d954SCole Faust
/** Store the rows of a block using the partial-store routine.
 *
 * STORE_ROW_PARTIAL_n emits n stores, one per row. Row i hands BASENAME##i to
 * VSTORE_PARTIAL(N0, STORE_N0) (defined outside this excerpt) with the destination
 * PTR + i * STRIDE_Y + Z##i. The suffix i is spelled 0-9 for the first ten rows and
 * A-F for rows 10-15. Each macro first expands the macro for n-1 rows and then
 * appends its own row.
 *
 * @param N0        First argument forwarded to VSTORE_PARTIAL.
 * @param STORE_N0  Second argument forwarded to VSTORE_PARTIAL.
 * @param DATA_TYPE Element type used for the destination pointer cast.
 * @param BASENAME  Prefix of the row variables.
 * @param PTR       Base address; row and Z offsets are added to it directly.
 * @param STRIDE_Y  Multiplied by the row index to form the row offset.
 * @param Z         Prefix of the per-row extra offsets (Z##0, Z##1, ...).
 *
 * Every expansion already ends with a semicolon so the macros can be chained.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
263*c217d954SCole Faust
264*c217d954SCole Faust
265*c217d954SCole Faust
/** Store a block through the partial-store row macros.
 *
 * STORE_BLOCK_PARTIAL_STR pastes STORE_M0 onto the STORE_ROW_PARTIAL_ prefix as
 * written. STORE_BLOCK_PARTIAL adds one level of indirection so that a STORE_M0
 * argument that is itself a macro is expanded to its value before the paste.
 *
 * Note the argument order change: callers pass (STORE_M0, STORE_N0, N0, ...), while
 * the row macro receives (N0, STORE_N0, ...).
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
268*c217d954SCole Faust
/** Store a block that may be partial along rows, columns, or both.
 *
 * The two run-time conditions select one of four STORE_BLOCK_PARTIAL calls:
 *   - neither condition set:    M0 rows,               N0 columns
 *   - only PARTIAL_COND_Y set:  PARTIAL_STORE_M0 rows, N0 columns
 *   - only PARTIAL_COND_X set:  M0 rows,               PARTIAL_STORE_N0 columns
 *   - both set:                 PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 columns
 * N0 is always passed as the third STORE_BLOCK_PARTIAL argument.
 *
 * The expansion is a single if/else statement; it is not wrapped in a do/while,
 * so it should not be used as the unbraced body of an outer if that has an else.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                       \
    {                                                                                                           \
        if(!(PARTIAL_COND_Y))                                                                                   \
        {                                                                                                       \
            STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                             \
        }                                                                                                       \
        else                                                                                                    \
        {                                                                                                       \
            STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
        }                                                                                                       \
    }                                                                                                           \
    else                                                                                                        \
    {                                                                                                           \
        if(!(PARTIAL_COND_Y))                                                                                   \
        {                                                                                                       \
            STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
        }                                                                                                       \
        else                                                                                                    \
        {                                                                                                       \
            STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
        }                                                                                                       \
    }
286*c217d954SCole Faust
/** Store a block that may be partial along the column (x) direction only.
 *
 * When PARTIAL_COND_X is true the block is stored with PARTIAL_STORE_N0 columns,
 * otherwise with the full N0 columns. M0 rows are stored in both cases, and N0 is
 * always passed as the third STORE_BLOCK_PARTIAL argument.
 *
 * The expansion is a bare if/else statement (no do/while wrapper).
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(PARTIAL_COND_X)                                                                        \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }                                                                                         \
    else                                                                                      \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
    }
296*c217d954SCole Faust
/** Store a block that may be partial along the row (y) direction only.
 *
 * When PARTIAL_COND_Y is true the block is stored with PARTIAL_STORE_M0 rows,
 * otherwise with the full M0 rows. N0 columns are stored in both cases (N0 is passed
 * as both the second and third STORE_BLOCK_PARTIAL argument).
 *
 * The expansion is a bare if/else statement (no do/while wrapper).
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(PARTIAL_COND_Y)                                                                        \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }                                                                                         \
    else                                                                                      \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
    }
306*c217d954SCole Faust
307*c217d954SCole Faust
/* STORE_BLOCK_BOUNDARY_AWARE is only defined when both PARTIAL_STORE_M0 and
 * PARTIAL_STORE_N0 are defined at preprocessing time. Their values pick which store
 * helper the macro forwards to; arguments the chosen helper does not need are
 * accepted and dropped so that all four variants share one signature. */
#if defined(PARTIAL_STORE_N0) && defined(PARTIAL_STORE_M0)

#if PARTIAL_STORE_N0 == 0 && PARTIAL_STORE_M0 == 0

/* Neither dimension has a partial remainder: plain full-block store. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_N0 == 0 && PARTIAL_STORE_M0 > 0

/* Only the row count can be partial. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_N0 > 0 && PARTIAL_STORE_M0 == 0

/* Only the column count can be partial. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

/* Every remaining combination: both dimensions are checked at run time. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif /* PARTIAL_STORE_M0 / PARTIAL_STORE_N0 value selection */

#endif /* defined(PARTIAL_STORE_N0) && defined(PARTIAL_STORE_M0) */
334*c217d954SCole Faust
335*c217d954SCole Faust
/** Compute the first row handled for block index y when blocks are M0 rows tall.
 *
 * When PARTIAL_STORE_M0 is defined at preprocessing time the result is
 *   max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * computed in int and cast to uint; otherwise it is simply y * M0 cast to uint
 * and the third argument is ignored.
 *
 * Every argument is parenthesized in the expansion so that expression arguments
 * (for example `a + b` as y) are multiplied and subtracted as a whole instead of
 * being split by operator precedence.
 *
 * @param y                Block index along the row direction.
 * @param M0               Number of rows per block.
 * @param PARTIAL_STORE_M0 Row count of the partial block (used only in the first variant).
 */
#if defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
344*c217d954SCole Faust
345*c217d954SCole Faust
346*c217d954SCole Faust
/** Store a single vector, choosing between a full and a leftover-sized store.
 *
 * Thin wrapper over STORE_BLOCK_PARTIAL_IN_X with one row, a stride of 0 and a Z
 * prefix of 0. Because the row macros paste a `0` suffix onto both names, the value
 * stored is the variable named basename##0 and the Z offset becomes the literal 00.
 *
 * @param basename  Prefix of the vector variable (the variable itself is basename##0).
 * @param data_type Element type used for the destination pointer cast.
 * @param ptr       Destination address.
 * @param vec_size  Full vector width.
 * @param leftover  Width used when cond is true.
 * @param cond      Selects the leftover-sized store when true.
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
349*c217d954SCole Faust
350*c217d954SCole Faust
/* Optional OpenCL extensions. Each pragma is emitted only when the corresponding
 * ARM_COMPUTE_* switch is defined AND the macro named after the extension is
 * defined as well; otherwise nothing is emitted. */

/* cl_khr_fp16 */
#if defined(cl_khr_fp16) && defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

/* cl_arm_integer_dot_product_int8 */
#if defined(cl_arm_integer_dot_product_int8) && defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

/* cl_arm_integer_dot_product_accumulate_int8 */
#if defined(cl_arm_integer_dot_product_accumulate_int8) && defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

/* cl_arm_printf (debug builds) */
#if defined(cl_arm_printf) && defined(ARM_COMPUTE_DEBUG_ENABLED)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif
366*c217d954SCole Faust
/* Numeric identifiers for GPU architecture families. They are kept as plain
 * object-like macros so they remain usable inside #if expressions. */
#define GPU_ARCH_MIDGARD  0x100
#define GPU_ARCH_BIFROST  0x200
#define GPU_ARCH_VALHALL  0x300
370*c217d954SCole Faust
371*c217d954SCole Faust
/** Paste two tokens together. The arguments are pasted exactly as written; they are
 * not macro-expanded first. */
#define CONCAT(lhs, rhs) lhs##rhs


/** Expand to the argument itself (forces one extra round of macro expansion). */
#define EXPAND(token) token


/** Limit a value to a range by evaluating min(max(value, lower), upper).
 * If lower is greater than upper the result is upper. */
#define CLAMP(value, lower, upper) min(max(value, lower), upper)
379*c217d954SCole Faust
380*c217d954SCole Faust
/* REVn(vec): select the n components of vec in reverse order through a component
 * selector (.s10, .s210, ...). REV1 returns its argument unchanged. */
#define REV1(vec)  ((vec))
#define REV2(vec)  ((vec).s10)
#define REV3(vec)  ((vec).s210)
#define REV4(vec)  ((vec).s3210)
#define REV8(vec)  ((vec).s76543210)
#define REV16(vec) ((vec).sFEDCBA9876543210)
387*c217d954SCole Faust
388*c217d954SCole Faust
389*c217d954SCole Faust
/** Reverse the components of a vector of the given size.
 *
 * REVERSE_STR pastes s onto the REV prefix as written and applies the resulting
 * macro to x. REVERSE adds one level of indirection so that a size argument that is
 * itself a macro is expanded to its value before the paste.
 */
#define REVERSE_STR(x, s) \
    REV##s((x))
#define REVERSE(x, s) \
    REVERSE_STR(x, s)
392*c217d954SCole Faust
393*c217d954SCole Faust
394*c217d954SCole Faust
/* ROT<size>_<n>(vec): rotate the components of a <size>-wide vector by n positions
 * using a component selector, so that result component i takes source component
 * (i - n) modulo <size>. A rotation by 0 or by the full size returns the argument
 * unchanged. */

/* size 1 */
#define ROT1_0(vec) ((vec))
#define ROT1_1(vec) ((vec))

/* size 2 */
#define ROT2_0(vec) ((vec))
#define ROT2_1(vec) ((vec).s10)
#define ROT2_2(vec) ((vec))

/* size 3 */
#define ROT3_0(vec) ((vec))
#define ROT3_1(vec) ((vec).s201)
#define ROT3_2(vec) ((vec).s120)
#define ROT3_3(vec) ((vec))

/* size 4 */
#define ROT4_0(vec) ((vec))
#define ROT4_1(vec) ((vec).s3012)
#define ROT4_2(vec) ((vec).s2301)
#define ROT4_3(vec) ((vec).s1230)
#define ROT4_4(vec) ((vec))

/* size 8 */
#define ROT8_0(vec) ((vec))
#define ROT8_1(vec) ((vec).s70123456)
#define ROT8_2(vec) ((vec).s67012345)
#define ROT8_3(vec) ((vec).s56701234)
#define ROT8_4(vec) ((vec).s45670123)
#define ROT8_5(vec) ((vec).s34567012)
#define ROT8_6(vec) ((vec).s23456701)
#define ROT8_7(vec) ((vec).s12345670)
#define ROT8_8(vec) ((vec))
422*c217d954SCole Faust
423*c217d954SCole Faust#define ROT16_0(x) ((x))
424*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE)
425*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD)
426*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC)
427*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB)
428*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A)
429*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789)
430*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678)
431*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567)
432*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456)
433*c217d954SCole Faust#define ROT16_10(x) ((x).s6789ABCDEF012345)
434*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234)
435*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123)
436*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012)
437*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01)
438*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0)
439*c217d954SCole Faust#define ROT16_16(x) ((x))
440*c217d954SCole Faust
441*c217d954SCole Faust
442*c217d954SCole Faust
443*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
444*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
445*c217d954SCole Faust
446*c217d954SCole Faust
447*c217d954SCole Faust
/* V_OFFS<size>(type): a vector literal of <type><size> holding the
 * consecutive values 0, 1, ..., size - 1. */
#define V_OFFS1(type)  (type##1)(0)
#define V_OFFS2(type)  (type##2)(0, 1)
#define V_OFFS3(type)  (type##3)(0, 1, 2)
#define V_OFFS4(type)  (type##4)(0, 1, 2, 3)
#define V_OFFS8(type)  (type##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(type) (type##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(type, size): dispatch to V_OFFS<size>(type). `size` is
 * macro-expanded by the outer macro before being pasted by VEC_OFFS_STR. */
#define VEC_OFFS_STR(type, size) V_OFFS##size(type)
#define VEC_OFFS(type, size) VEC_OFFS_STR(type, size)
459*c217d954SCole Faust
460*c217d954SCole Faust
/* VLOAD(width): name of the vload<width> function. The _STR helper does the
 * pasting; the outer macro expands `width` first. */
#define VLOAD_STR(width) vload##width
#define VLOAD(width) VLOAD_STR(width)

/* VLOAD_PARTIAL(width, count): name of the vload_partial_<width>_<count>
 * macro, i.e. "load `count` elements into a vector of `width` elements".
 * Both arguments are macro-expanded before pasting. */
#define VLOAD_PARTIAL_STR(width, count) vload_partial_##width##_##count
#define VLOAD_PARTIAL(width, count) VLOAD_PARTIAL_STR(width, count)
467*c217d954SCole Faust
/* Placeholder used for invalid (width, count) combinations: accepts the same
 * three arguments as a partial load and expands to an empty block. */
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }

/* Dispatch table for a 1-element destination: vload_partial_1_<count>.
 * Only count == 1 performs a load; every other count is a no-op.
 *
 * vload_partial_1_1 is a three-argument wrapper rather than a plain alias of
 * vload1: the dispatcher invokes every entry as (DATA, OFFSET, PTR), whereas
 * vload1 is a two-argument macro (OFFSET, PTR) that yields the loaded value.
 * DATA is a scalar here, so it is assigned directly instead of through .s0. */
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1(DATA, OFFSET, PTR) DATA = vload1(OFFSET, PTR);
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD
490*c217d954SCole Faust
/* Dispatch tables vload_partial_<width>_<count>, selected through
 * VLOAD_PARTIAL(width, count). For 1 <= count <= width the entry forwards to
 * vload_partial_<count>; count == 0 and count > width map to NO_LOAD, which
 * expands to an empty block. */

/* Destination width 2 */
491*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
492*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
493*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
494*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
495*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
496*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
497*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
498*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
499*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
500*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
501*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
502*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
503*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
504*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
505*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
506*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
507*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
508*c217d954SCole Faust
/* Destination width 3 */
509*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
510*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
511*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
512*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
513*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
514*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
515*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
516*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
517*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
518*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
519*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
520*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
521*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
522*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
523*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
524*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
525*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
526*c217d954SCole Faust
/* Destination width 4 */
527*c217d954SCole Faust#define vload_partial_4_0 NO_LOAD
528*c217d954SCole Faust#define vload_partial_4_1 vload_partial_1
529*c217d954SCole Faust#define vload_partial_4_2 vload_partial_2
530*c217d954SCole Faust#define vload_partial_4_3 vload_partial_3
531*c217d954SCole Faust#define vload_partial_4_4 vload_partial_4
532*c217d954SCole Faust#define vload_partial_4_5 NO_LOAD
533*c217d954SCole Faust#define vload_partial_4_6 NO_LOAD
534*c217d954SCole Faust#define vload_partial_4_7 NO_LOAD
535*c217d954SCole Faust#define vload_partial_4_8 NO_LOAD
536*c217d954SCole Faust#define vload_partial_4_9 NO_LOAD
537*c217d954SCole Faust#define vload_partial_4_10 NO_LOAD
538*c217d954SCole Faust#define vload_partial_4_11 NO_LOAD
539*c217d954SCole Faust#define vload_partial_4_12 NO_LOAD
540*c217d954SCole Faust#define vload_partial_4_13 NO_LOAD
541*c217d954SCole Faust#define vload_partial_4_14 NO_LOAD
542*c217d954SCole Faust#define vload_partial_4_15 NO_LOAD
543*c217d954SCole Faust#define vload_partial_4_16 NO_LOAD
544*c217d954SCole Faust
/* Destination width 8 */
545*c217d954SCole Faust#define vload_partial_8_0 NO_LOAD
546*c217d954SCole Faust#define vload_partial_8_1 vload_partial_1
547*c217d954SCole Faust#define vload_partial_8_2 vload_partial_2
548*c217d954SCole Faust#define vload_partial_8_3 vload_partial_3
549*c217d954SCole Faust#define vload_partial_8_4 vload_partial_4
550*c217d954SCole Faust#define vload_partial_8_5 vload_partial_5
551*c217d954SCole Faust#define vload_partial_8_6 vload_partial_6
552*c217d954SCole Faust#define vload_partial_8_7 vload_partial_7
553*c217d954SCole Faust#define vload_partial_8_8 vload_partial_8
554*c217d954SCole Faust#define vload_partial_8_9 NO_LOAD
555*c217d954SCole Faust#define vload_partial_8_10 NO_LOAD
556*c217d954SCole Faust#define vload_partial_8_11 NO_LOAD
557*c217d954SCole Faust#define vload_partial_8_12 NO_LOAD
558*c217d954SCole Faust#define vload_partial_8_13 NO_LOAD
559*c217d954SCole Faust#define vload_partial_8_14 NO_LOAD
560*c217d954SCole Faust#define vload_partial_8_15 NO_LOAD
561*c217d954SCole Faust#define vload_partial_8_16 NO_LOAD
562*c217d954SCole Faust
/* Destination width 16: every count from 1 to 16 is valid. */
563*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
564*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
565*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
566*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
567*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
568*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
569*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
570*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
571*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
572*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
573*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
574*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
575*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
576*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
577*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
578*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
579*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
580*c217d954SCole Faust
581*c217d954SCole Faust
/* vload_partial_<count>(DST, IDX, SRC): fill the first <count> components of
 * the vector DST from SRC and leave the remaining components untouched.
 * Counts that are not a native vector width are assembled from a 4- or
 * 8-wide load followed by a smaller one starting at SRC + 4 / SRC + 8.
 *
 * Every expansion already ends with ';' and the composite forms expand to
 * more than one statement.
 * NOTE(review): vloadN scales its index argument by N while the tail load is
 * addressed relative to SRC, so the composite forms only line up when IDX is
 * 0 - callers presumably always pass 0; confirm before using another index. */
#define vload_partial_1(DST, IDX, SRC) DST.s0 = vload1(IDX, SRC);

#define vload_partial_2(DST, IDX, SRC) DST.s01 = vload2(IDX, SRC);

#define vload_partial_3(DST, IDX, SRC) DST.s012 = vload3(IDX, SRC);

#define vload_partial_4(DST, IDX, SRC) DST.s0123 = vload4(IDX, SRC);

/* 5 = 4 + 1 */
#define vload_partial_5(DST, IDX, SRC)    \
    vload_partial_4(DST.s0123, IDX, SRC); \
    DST.s4 = vload1(IDX, SRC + 4);

/* 6 = 4 + 2 */
#define vload_partial_6(DST, IDX, SRC)    \
    vload_partial_4(DST.s0123, IDX, SRC); \
    vload_partial_2(DST.s45, IDX, SRC + 4);

/* 7 = 4 + 3 */
#define vload_partial_7(DST, IDX, SRC)    \
    vload_partial_4(DST.s0123, IDX, SRC); \
    vload_partial_3(DST.s456, IDX, SRC + 4);

#define vload_partial_8(DST, IDX, SRC) DST.s01234567 = vload8(IDX, SRC);

/* 9 = 8 + 1 */
#define vload_partial_9(DST, IDX, SRC)        \
    vload_partial_8(DST.s01234567, IDX, SRC); \
    DST.s8 = vload1(IDX, SRC + 8);

/* 10 = 8 + 2 */
#define vload_partial_10(DST, IDX, SRC)       \
    vload_partial_8(DST.s01234567, IDX, SRC); \
    vload_partial_2(DST.s89, IDX, SRC + 8);

/* 11 = 8 + 3 */
#define vload_partial_11(DST, IDX, SRC)       \
    vload_partial_8(DST.s01234567, IDX, SRC); \
    vload_partial_3(DST.s89A, IDX, SRC + 8);

/* 12 = 8 + 4 */
#define vload_partial_12(DST, IDX, SRC)       \
    vload_partial_8(DST.s01234567, IDX, SRC); \
    vload_partial_4(DST.s89AB, IDX, SRC + 8);

/* 13..15 = 8 + (5..7): the upper half is handed over as a full 8-wide
 * swizzle because the 5/6/7 helpers address sub-components of their target. */
#define vload_partial_13(DST, IDX, SRC)       \
    vload_partial_8(DST.s01234567, IDX, SRC); \
    vload_partial_5(DST.s89ABCDEF, IDX, SRC + 8);

#define vload_partial_14(DST, IDX, SRC)       \
    vload_partial_8(DST.s01234567, IDX, SRC); \
    vload_partial_6(DST.s89ABCDEF, IDX, SRC + 8);

#define vload_partial_15(DST, IDX, SRC)       \
    vload_partial_8(DST.s01234567, IDX, SRC); \
    vload_partial_7(DST.s89ABCDEF, IDX, SRC + 8);

/* Full 16-wide load. */
#define vload_partial_16(DST, IDX, SRC) DST = vload16(IDX, SRC);
639*c217d954SCole Faust
640*c217d954SCole Faust
641*c217d954SCole Faust
/* Number of pixels needed to hold a vector of 4, 8 or 16 elements
 * (vector size divided by four).
 * NOTE(review): presumably because one image pixel carries four channels. */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(width): look up PIXEL_UNIT<width>.
 * `width` may itself be a macro; it is expanded before pasting. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(width) PIXEL_UNIT##width
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(width) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(width)
649*c217d954SCole Faust
650*c217d954SCole Faust
/* read_image2d_<type>x<units>(image, col, row): read <units> horizontally
 * adjacent pixels starting at (col, row) and concatenate them into one
 * vector of 4 * units elements. Note that every expansion already carries a
 * trailing ';'. */
#define read_image2d_floatx1(image, col, row) \
    (float4)(read_imagef(image, (int2)(col, row)));
#define read_image2d_floatx2(image, col, row)         \
    (float8)(read_imagef(image, (int2)(col, row)),    \
             read_imagef(image, (int2)(col + 1, row)));
#define read_image2d_floatx4(image, col, row)           \
    (float16)(read_imagef(image, (int2)(col, row)),     \
              read_imagef(image, (int2)(col + 1, row)), \
              read_imagef(image, (int2)(col + 2, row)), \
              read_imagef(image, (int2)(col + 3, row)));

/* Half-precision readers exist only when fp16 support is requested and available. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(image, col, row) \
    (half4)(read_imageh(image, (int2)(col, row)));
#define read_image2d_halfx2(image, col, row)         \
    (half8)(read_imageh(image, (int2)(col, row)),    \
            read_imageh(image, (int2)(col + 1, row)));
#define read_image2d_halfx4(image, col, row)           \
    (half16)(read_imageh(image, (int2)(col, row)),     \
             read_imageh(image, (int2)(col + 1, row)), \
             read_imageh(image, (int2)(col + 2, row)), \
             read_imageh(image, (int2)(col + 3, row)));
#endif

/* write_image2d_<type>x<units>(image, col, row, px): write `px` as <units>
 * horizontally adjacent pixels starting at (col, row); each pixel receives
 * four consecutive components of `px`. The expansion ends with ';'. */
#define write_image2d_floatx1(image, col, row, px) \
    (write_imagef(image, (int2)(col, row), px));
#define write_image2d_floatx2(image, col, row, px)    \
    (write_imagef(image, (int2)(col, row), px.s0123), \
     write_imagef(image, (int2)(col + 1, row), px.s4567));
#define write_image2d_floatx4(image, col, row, px)        \
    (write_imagef(image, (int2)(col, row), px.s0123),     \
     write_imagef(image, (int2)(col + 1, row), px.s4567), \
     write_imagef(image, (int2)(col + 2, row), px.s89AB), \
     write_imagef(image, (int2)(col + 3, row), px.sCDEF));

/* Half-precision writers, guarded like the readers. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(image, col, row, px) \
    (write_imageh(image, (int2)(col, row), px));
#define write_image2d_halfx2(image, col, row, px)     \
    (write_imageh(image, (int2)(col, row), px.s0123), \
     write_imageh(image, (int2)(col + 1, row), px.s4567));
#define write_image2d_halfx4(image, col, row, px)         \
    (write_imageh(image, (int2)(col, row), px.s0123),     \
     write_imageh(image, (int2)(col + 1, row), px.s4567), \
     write_imageh(image, (int2)(col + 2, row), px.s89AB), \
     write_imageh(image, (int2)(col + 3, row), px.sCDEF));
#endif
670*c217d954SCole Faust
671*c217d954SCole Faust
/* READ_IMAGE2D(type, units, image, col, row): dispatch to
 * read_image2d_<type>x<units>(image, col, row). `type` and `units` are
 * macro-expanded by the outer macro before READ_IMAGE2D_STR pastes them. */
#define READ_IMAGE2D_STR(type, units, image, col, row) read_image2d_##type##x##units(image, col, row)
#define READ_IMAGE2D(type, units, image, col, row) READ_IMAGE2D_STR(type, units, image, col, row)

/* WRITE_IMAGE2D(type, units, image, col, row, px): dispatch to
 * write_image2d_<type>x<units>(image, col, row, px). */
#define WRITE_IMAGE2D_STR(type, units, image, col, row, px) write_image2d_##type##x##units(image, col, row, px)
#define WRITE_IMAGE2D(type, units, image, col, row, px) WRITE_IMAGE2D_STR(type, units, image, col, row, px)

/* VSTORE(width): name of the vstore<width> function; `width` is expanded first. */
#define VSTORE_STR(width) vstore##width
#define VSTORE(width) VSTORE_STR(width)
681*c217d954SCole Faust
/* "Vectors" of one element are simply the scalar type. These aliases let
 * code that pastes a width onto a type name work when the width is 1. */
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

/* Width-1 counterparts of vloadN / vstoreN: access the element at
 * PTR + OFFSET.
 *
 * The arguments and the whole expansion are parenthesised so that operands
 * with low-precedence operators (for example a conditional expression passed
 * as PTR or DATA) are evaluated as the caller wrote them, and so that the
 * result can be embedded in a larger expression. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
696*c217d954SCole Faust
697*c217d954SCole Faust
/* VSTORE_PARTIAL(width, count): name of the vstore_partial_<width>_<count>
 * macro, i.e. "store the first `count` elements of a `width`-element vector".
 * Both arguments are macro-expanded before being pasted. */
#define VSTORE_PARTIAL_STR(width, count) vstore_partial_##width##_##count
#define VSTORE_PARTIAL(width, count) VSTORE_PARTIAL_STR(width, count)

/* Placeholder for invalid (width, count) pairs: takes the same three
 * arguments as a partial store and expands to an empty block. */
#define NO_STORE(value, index, dest) {}
704*c217d954SCole Faust
705*c217d954SCole Faust
/* Dispatch tables vstore_partial_<width>_<count>, selected through
 * VSTORE_PARTIAL(width, count). For 1 <= count <= width the entry forwards
 * to the matching store macro; count == 0 and count > width map to NO_STORE,
 * which expands to an empty block. */

/* Source width 1: the only valid count forwards straight to vstore1. */
706*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
707*c217d954SCole Faust#define vstore_partial_1_1 vstore1
708*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
709*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
710*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
711*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
712*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
713*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
714*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
715*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
716*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
717*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
718*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
719*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
720*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
721*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
722*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
723*c217d954SCole Faust
/* Source width 2 */
724*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
725*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
726*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
727*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
728*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
729*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
730*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
731*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
732*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
733*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
734*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
735*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
736*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
737*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
738*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
739*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
740*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
741*c217d954SCole Faust
/* Source width 3 */
742*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
743*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
744*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
745*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
746*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
747*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
748*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
749*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
750*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
751*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
752*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
753*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
754*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
755*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
756*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
757*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
758*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
759*c217d954SCole Faust
/* Source width 4 */
760*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
761*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
762*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
763*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
764*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
765*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
766*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
767*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
768*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
769*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
770*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
771*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
772*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
773*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
774*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
775*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
776*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
777*c217d954SCole Faust
/* Source width 8 */
778*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
779*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
780*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
781*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
782*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
783*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
784*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
785*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
786*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
787*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
788*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
789*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
790*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
791*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
792*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
793*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
794*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
795*c217d954SCole Faust
/* Source width 16: every count from 1 to 16 is valid. */
796*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
797*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
798*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
799*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
800*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
801*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
802*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
803*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
804*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
805*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
806*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
807*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
808*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
809*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
810*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
811*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
812*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
813*c217d954SCole Faust
814*c217d954SCole Faust
/* vstore_partial_<count>(VAL, IDX, DST): write the first <count> components
 * of the vector VAL to DST. Counts that are not a native vector width are
 * split into a 4- or 8-wide store followed by a smaller one that starts at
 * DST + 4 / DST + 8.
 *
 * Every expansion already ends with ';' and the composite forms expand to
 * more than one statement.
 * NOTE(review): vstoreN scales its index argument by N while the tail store
 * is addressed relative to DST, so the composite forms only line up when IDX
 * is 0 - callers presumably always pass 0; confirm before using another index. */
#define vstore_partial_1(VAL, IDX, DST) vstore1(VAL.s0, IDX, DST);

#define vstore_partial_2(VAL, IDX, DST) vstore2(VAL.s01, IDX, DST);

#define vstore_partial_3(VAL, IDX, DST) vstore3(VAL.s012, IDX, DST);

#define vstore_partial_4(VAL, IDX, DST) vstore4(VAL.s0123, IDX, DST);

/* 5 = 4 + 1 */
#define vstore_partial_5(VAL, IDX, DST)    \
    vstore_partial_4(VAL.s0123, IDX, DST); \
    vstore1(VAL.s4, IDX, DST + 4);

/* 6 = 4 + 2 */
#define vstore_partial_6(VAL, IDX, DST)    \
    vstore_partial_4(VAL.s0123, IDX, DST); \
    vstore_partial_2(VAL.s45, IDX, DST + 4);

/* 7 = 4 + 3 */
#define vstore_partial_7(VAL, IDX, DST)    \
    vstore_partial_4(VAL.s0123, IDX, DST); \
    vstore_partial_3(VAL.s456, IDX, DST + 4);

#define vstore_partial_8(VAL, IDX, DST) vstore8(VAL.s01234567, IDX, DST);

/* 9 = 8 + 1 */
#define vstore_partial_9(VAL, IDX, DST)        \
    vstore_partial_8(VAL.s01234567, IDX, DST); \
    vstore1(VAL.s8, IDX, DST + 8);

/* 10 = 8 + 2 */
#define vstore_partial_10(VAL, IDX, DST)       \
    vstore_partial_8(VAL.s01234567, IDX, DST); \
    vstore_partial_2(VAL.s89, IDX, DST + 8);

/* 11 = 8 + 3 */
#define vstore_partial_11(VAL, IDX, DST)       \
    vstore_partial_8(VAL.s01234567, IDX, DST); \
    vstore_partial_3(VAL.s89a, IDX, DST + 8);

/* 12 = 8 + 4 */
#define vstore_partial_12(VAL, IDX, DST)       \
    vstore_partial_8(VAL.s01234567, IDX, DST); \
    vstore_partial_4(VAL.s89ab, IDX, DST + 8);

/* 13..15 = 8 + (5..7): the upper half is handed over as a full 8-wide
 * swizzle because the 5/6/7 helpers address sub-components of their source. */
#define vstore_partial_13(VAL, IDX, DST)       \
    vstore_partial_8(VAL.s01234567, IDX, DST); \
    vstore_partial_5(VAL.s89abcdef, IDX, DST + 8);

#define vstore_partial_14(VAL, IDX, DST)       \
    vstore_partial_8(VAL.s01234567, IDX, DST); \
    vstore_partial_6(VAL.s89abcdef, IDX, DST + 8);

#define vstore_partial_15(VAL, IDX, DST)       \
    vstore_partial_8(VAL.s01234567, IDX, DST); \
    vstore_partial_7(VAL.s89abcdef, IDX, DST + 8);

/* Full 16-wide store. */
#define vstore_partial_16(VAL, IDX, DST) vstore16(VAL, IDX, DST);
872*c217d954SCole Faust
873*c217d954SCole Faust
874*c217d954SCole Faust
875*c217d954SCole Faust
876*c217d954SCole Faust
/* Saturating conversions to floating-point destinations are mapped onto the
 * plain conversions of the same destination type and width, so that
 * convert_<type>_sat can be spelled uniformly for every destination type. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
/* Scalar half maps to convert_half, matching convert_half1_sat and the
 * vector variants below (it previously pointed at convert_float). */
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* Width-1 spellings of the plain conversions resolve to the scalar form. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/* Width-1 spellings of the saturating conversions resolve to the scalar
 * saturating form. The convert_uchar<N>_sat entries are defined as
 * themselves: a macro is not re-expanded inside its own replacement, so they
 * leave the name unchanged while still making it a defined macro. */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
918*c217d954SCole Faust
/* VEC_DATA_TYPE(type, width): the vector type name <type><width>.
 * Both arguments are macro-expanded before being pasted together. */
#define VEC_DATA_TYPE_STR(base, width) base##width
#define VEC_DATA_TYPE(base, width) VEC_DATA_TYPE_STR(base, width)

/* CONVERT(value, dst_type): convert_<dst_type>(value). */
#define CONVERT_STR(value, dst_type) (convert_##dst_type((value)))
#define CONVERT(value, dst_type) CONVERT_STR(value, dst_type)

/* CONVERT_SAT(value, dst_type): convert_<dst_type>_sat(value). */
#define CONVERT_SAT_STR(value, dst_type) (convert_##dst_type##_sat((value)))
#define CONVERT_SAT(value, dst_type) CONVERT_SAT_STR(value, dst_type)

/* CONVERT_SAT_ROUND(value, dst_type, mode): convert_<dst_type>_sat_<mode>(value),
 * where `mode` is the rounding-mode suffix. */
#define CONVERT_SAT_ROUND_STR(value, dst_type, mode) (convert_##dst_type##_sat_##mode((value)))
#define CONVERT_SAT_ROUND(value, dst_type, mode) CONVERT_SAT_ROUND_STR(value, dst_type, mode)
930*c217d954SCole Faust
/* Per-element-type table mapping a data type to an integer vector type of the given size.
 * Integer types map to themselves; half maps to short and float maps to int.
 * In this file the result is used to cast the condition argument of select (see elu_op).
 */
931*c217d954SCole Faust#define select_vec_dt_uchar(size) uchar##size
932*c217d954SCole Faust#define select_vec_dt_char(size) char##size
933*c217d954SCole Faust#define select_vec_dt_ushort(size) ushort##size
934*c217d954SCole Faust#define select_vec_dt_short(size) short##size
935*c217d954SCole Faust#define select_vec_dt_half(size) short##size
936*c217d954SCole Faust#define select_vec_dt_uint(size) uint##size
937*c217d954SCole Faust#define select_vec_dt_int(size) int##size
938*c217d954SCole Faust#define select_vec_dt_float(size) int##size
939*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size
940*c217d954SCole Faust#define select_vec_dt_long(size) long##size
941*c217d954SCole Faust
/* SELECT_VEC_DATA_TYPE(type, size) dispatches to the select_vec_dt_<type> entry above.
 * SELECT_DATA_TYPE(type) is the same lookup with size fixed to 1.
 */
942*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
943*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
944*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
945*c217d954SCole Faust
/* Per-element-type table mapping a data type to a signed integer vector type of the given size.
 * Unsigned types map to their signed counterpart (uchar to char, ushort to short, uint to int,
 * ulong to long); half maps to short and float maps to int.
 */
946*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size
947*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size
948*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size
949*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size
950*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size
951*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size
952*c217d954SCole Faust#define signed_int_vec_dt_int(size) int##size
953*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size
954*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size
955*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size
956*c217d954SCole Faust
/* SIGNED_INT_VEC_DATA_TYPE(type, size) dispatches to the signed_int_vec_dt_<type> entry above.
 * SIGNED_INT_DATA_TYPE(type) is the same lookup with size fixed to 1.
 */
957*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
958*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
959*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
960*c217d954SCole Faust
/*
 * Horizontal sum of the components of a vector.
 *
 * SUM_REDUCE(x, size) adds the `size` components of x (size is 1, 2, 3, 4, 8 or 16)
 * by recursively splitting x through its component selectors (.s0, .s01, .s0123, ...).
 * x is expanded several times, so it must be free of side effects.
 *
 * The sum_reduce_<n> helpers deliberately stay unparenthesized: nesting them yields one
 * flat left-to-right chain of additions, and adding parentheses there would change the
 * order in which floating-point components are summed. The parentheses are applied once,
 * around the complete expansion, so the result can be used safely inside a larger
 * expression such as 2 * SUM_REDUCE(v, 2).
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* The _STR level lets `size` be macro-expanded before it is pasted onto sum_reduce_. */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
970*c217d954SCole Faust
/*
 * Horizontal product of the components of a vector.
 *
 * PROD_REDUCE(x, size) multiplies the `size` components of x (size is 1, 2, 3, 4, 8 or 16)
 * by recursively splitting x through its component selectors (.s0, .s01, .s0123, ...).
 * x is expanded several times, so it must be free of side effects.
 *
 * The prod_reduce_<n> helpers deliberately stay unparenthesized so that the nested
 * expansion remains one flat left-to-right chain of multiplications (the floating-point
 * evaluation order is unchanged). The parentheses are applied once, around the complete
 * expansion, so the result can be used safely inside a larger expression such as
 * 100 / PROD_REDUCE(v, 2).
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* The _STR level lets `size` be macro-expanded before it is pasted onto prod_reduce_. */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
980*c217d954SCole Faust
/* Horizontal maximum of the components of a vector.
 * MAX_REDUCE(x, size) combines the components of x with nested max calls, splitting x
 * through its component selectors (.s0, .s01, .s0123, ...); size is 1, 2, 3, 4, 8 or 16.
 * x is expanded several times, so it should be free of side effects.
 */
981*c217d954SCole Faust#define max_reduce_1(x) (x)
982*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1))
983*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
984*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
985*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
986*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
987*c217d954SCole Faust
/* The _STR level lets size be macro-expanded before it is pasted onto max_reduce_. */
988*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
989*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
990*c217d954SCole Faust
/* Expands to the comma-separated declarations describing a 1D buffer called name:
 * name_ptr, name_stride_x, name_step_x and name_offset_first_element_in_bytes.
 * There is no trailing comma; presumably meant for use inside a kernel parameter list.
 */
991*c217d954SCole Faust#define VECTOR_DECLARATION(name)     \
992*c217d954SCole Faust    __global uchar *name##_ptr,      \
993*c217d954SCole Faust    uint        name##_stride_x, \
994*c217d954SCole Faust    uint        name##_step_x,   \
995*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
996*c217d954SCole Faust
/* Expands to the comma-separated declarations describing a 2D buffer called name:
 * pointer, stride/step pairs for x and y, and the offset of the first element in bytes.
 */
997*c217d954SCole Faust#define IMAGE_DECLARATION(name)      \
998*c217d954SCole Faust    __global uchar *name##_ptr,      \
999*c217d954SCole Faust    uint        name##_stride_x, \
1000*c217d954SCole Faust    uint        name##_step_x,   \
1001*c217d954SCole Faust    uint        name##_stride_y, \
1002*c217d954SCole Faust    uint        name##_step_y,   \
1003*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
1004*c217d954SCole Faust
/* Expands to the comma-separated declarations describing a 3D buffer called name:
 * pointer, stride/step pairs for x, y and z, and the offset of the first element in bytes.
 */
1005*c217d954SCole Faust#define TENSOR3D_DECLARATION(name)   \
1006*c217d954SCole Faust    __global uchar *name##_ptr,      \
1007*c217d954SCole Faust    uint        name##_stride_x, \
1008*c217d954SCole Faust    uint        name##_step_x,   \
1009*c217d954SCole Faust    uint        name##_stride_y, \
1010*c217d954SCole Faust    uint        name##_step_y,   \
1011*c217d954SCole Faust    uint        name##_stride_z, \
1012*c217d954SCole Faust    uint        name##_step_z,   \
1013*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
1014*c217d954SCole Faust
/* Expands to the comma-separated declarations describing a 4D buffer called name:
 * pointer, stride/step pairs for x, y, z and w, and the offset of the first element in bytes.
 */
1015*c217d954SCole Faust#define TENSOR4D_DECLARATION(name)   \
1016*c217d954SCole Faust    __global uchar *name##_ptr,      \
1017*c217d954SCole Faust    uint        name##_stride_x, \
1018*c217d954SCole Faust    uint        name##_step_x,   \
1019*c217d954SCole Faust    uint        name##_stride_y, \
1020*c217d954SCole Faust    uint        name##_step_y,   \
1021*c217d954SCole Faust    uint        name##_stride_z, \
1022*c217d954SCole Faust    uint        name##_step_z,   \
1023*c217d954SCole Faust    uint        name##_stride_w, \
1024*c217d954SCole Faust    uint        name##_step_w,   \
1025*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
1026*c217d954SCole Faust
/* Expands to the comma-separated declarations describing a 5D buffer called name:
 * pointer, stride/step pairs for x, y, z, w and v, and the offset of the first element in bytes.
 */
1027*c217d954SCole Faust#define TENSOR5D_DECLARATION(name)   \
1028*c217d954SCole Faust    __global uchar *name##_ptr,      \
1029*c217d954SCole Faust    uint        name##_stride_x, \
1030*c217d954SCole Faust    uint        name##_step_x,   \
1031*c217d954SCole Faust    uint        name##_stride_y, \
1032*c217d954SCole Faust    uint        name##_step_y,   \
1033*c217d954SCole Faust    uint        name##_stride_z, \
1034*c217d954SCole Faust    uint        name##_step_z,   \
1035*c217d954SCole Faust    uint        name##_stride_w, \
1036*c217d954SCole Faust    uint        name##_step_w,   \
1037*c217d954SCole Faust    uint        name##_stride_v, \
1038*c217d954SCole Faust    uint        name##_step_v,   \
1039*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
1040*c217d954SCole Faust
/* Build a Vector for the current work-item from the name_* variables
 * (name_ptr, name_offset_first_element_in_bytes, name_stride_x, name_step_x).
 */
1041*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \
1042*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1043*c217d954SCole Faust
/* Same as CONVERT_TO_VECTOR_STRUCT but passes 0 as the step, so the pointer is only
 * advanced by the first-element offset.
 */
1044*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1045*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1046*c217d954SCole Faust
/* Build an Image for the current work-item from the name_* variables. */
1047*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \
1048*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1049*c217d954SCole Faust
/* Same as CONVERT_TO_IMAGE_STRUCT but passes 0 for both steps. */
1050*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1051*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1052*c217d954SCole Faust
/*
 * Build an Image for the current work-item from the name_* variables of a 3D tensor.
 * All three steps (x, y, z) are forwarded to update_image_from_tensor3D_workitem_ptr.
 * (This macro used to be defined twice with identical text; one definition is enough.)
 */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/*
 * Variant that passes 0 for the x and y steps but still forwards name_step_z, so only
 * the z coordinate of the work-item moves the pointer.
 */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1061*c217d954SCole Faust
/* Build a Tensor3D for the current work-item from the name_* variables. */
1062*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
1063*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1064*c217d954SCole Faust                                 name##_stride_z, name##_step_z)
1065*c217d954SCole Faust
/* Same as CONVERT_TO_TENSOR3D_STRUCT but passes 0 for all three steps. */
1066*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
1067*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
1068*c217d954SCole Faust
/* Build a Tensor4D for the current work-item; mod_size is forwarded unchanged and is
 * used by update_tensor4D_workitem_ptr to split get_global_id(2) into z and w parts.
 */
1069*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
1070*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1071*c217d954SCole Faust                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
1072*c217d954SCole Faust
/* Same as CONVERT_TO_TENSOR4D_STRUCT but passes 0 for all four steps. */
1073*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
1074*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
1075*c217d954SCole Faust
/* Build a Tensor3D whose ptr is left exactly as passed in (see tensor3D_ptr_no_update). */
1076*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
1077*c217d954SCole Faust    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1078*c217d954SCole Faust                           name##_stride_z, name##_step_z)
1079*c217d954SCole Faust
1080*c217d954SCole Faust
/* Descriptor of a 1D buffer: base pointer, offset of the first element in bytes and
 * the stride along x (multiplied by an index and added to the uchar pointer in vector_offset).
 */
1081*c217d954SCole Fausttypedef struct Vector
1082*c217d954SCole Faust{
1083*c217d954SCole Faust    __global uchar *ptr;
1084*c217d954SCole Faust    int             offset_first_element_in_bytes;
1085*c217d954SCole Faust    int             stride_x;
1086*c217d954SCole Faust} Vector;
1087*c217d954SCole Faust
1088*c217d954SCole Faust
/* Descriptor of a 2D buffer: base pointer, offset of the first element in bytes and
 * the strides along x and y.
 */
1089*c217d954SCole Fausttypedef struct Image
1090*c217d954SCole Faust{
1091*c217d954SCole Faust    __global uchar *ptr;
1092*c217d954SCole Faust    int             offset_first_element_in_bytes;
1093*c217d954SCole Faust    int             stride_x;
1094*c217d954SCole Faust    int             stride_y;
1095*c217d954SCole Faust} Image;
1096*c217d954SCole Faust
1097*c217d954SCole Faust
/* Descriptor of a 3D buffer: base pointer, offset of the first element in bytes and
 * the strides along x, y and z.
 */
1098*c217d954SCole Fausttypedef struct Tensor3D
1099*c217d954SCole Faust{
1100*c217d954SCole Faust    __global uchar *ptr;
1101*c217d954SCole Faust    int             offset_first_element_in_bytes;
1102*c217d954SCole Faust    int             stride_x;
1103*c217d954SCole Faust    int             stride_y;
1104*c217d954SCole Faust    int             stride_z;
1105*c217d954SCole Faust} Tensor3D;
1106*c217d954SCole Faust
1107*c217d954SCole Faust
/* Descriptor of a 4D buffer: base pointer, offset of the first element in bytes and
 * the strides along x, y, z and w.
 */
1108*c217d954SCole Fausttypedef struct Tensor4D
1109*c217d954SCole Faust{
1110*c217d954SCole Faust    __global uchar *ptr;
1111*c217d954SCole Faust    int             offset_first_element_in_bytes;
1112*c217d954SCole Faust    int             stride_x;
1113*c217d954SCole Faust    int             stride_y;
1114*c217d954SCole Faust    int             stride_z;
1115*c217d954SCole Faust    int             stride_w;
1116*c217d954SCole Faust} Tensor4D;
1117*c217d954SCole Faust
1118*c217d954SCole Faust
/* Wrap a 1D buffer in a Vector and move its pointer to the current work-item.
 *
 * ptr                            base pointer of the buffer
 * offset_first_element_in_bytes  offset added to ptr
 * stride_x                       stored in the returned struct
 * step_x                         multiplied by get_global_id(0) and added to ptr
 *
 * Returns the Vector by value. The uint arguments are stored into int fields.
 */
1119*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1120*c217d954SCole Faust{
1121*c217d954SCole Faust    Vector vector =
1122*c217d954SCole Faust    {
1123*c217d954SCole Faust        .ptr                           = ptr,
1124*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1125*c217d954SCole Faust        .stride_x                      = stride_x,
1126*c217d954SCole Faust    };
1127*c217d954SCole Faust    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1128*c217d954SCole Faust    return vector;
1129*c217d954SCole Faust}
1130*c217d954SCole Faust
1131*c217d954SCole Faust
/* Wrap a 2D buffer in an Image and move its pointer to the current work-item.
 *
 * The returned ptr is ptr + offset_first_element_in_bytes
 *                       + get_global_id(0) * step_x + get_global_id(1) * step_y.
 * stride_x and stride_y are stored in the struct unchanged.
 */
1132*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1133*c217d954SCole Faust{
1134*c217d954SCole Faust    Image img =
1135*c217d954SCole Faust    {
1136*c217d954SCole Faust        .ptr                           = ptr,
1137*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1138*c217d954SCole Faust        .stride_x                      = stride_x,
1139*c217d954SCole Faust        .stride_y                      = stride_y
1140*c217d954SCole Faust    };
1141*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1142*c217d954SCole Faust    return img;
1143*c217d954SCole Faust}
1144*c217d954SCole Faust
1145*c217d954SCole Faust
/* Wrap a 3D buffer in an Image (2D view) and move its pointer to the current work-item.
 *
 * The returned ptr is ptr + offset_first_element_in_bytes + get_global_id(0) * step_x
 *                       + get_global_id(1) * step_y + get_global_id(2) * step_z.
 * Only stride_x and stride_y are stored; stride_z is accepted but not used.
 */
1146*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1147*c217d954SCole Faust{
1148*c217d954SCole Faust    Image img =
1149*c217d954SCole Faust    {
1150*c217d954SCole Faust        .ptr                           = ptr,
1151*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1152*c217d954SCole Faust        .stride_x                      = stride_x,
1153*c217d954SCole Faust        .stride_y                      = stride_y
1154*c217d954SCole Faust    };
1155*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1156*c217d954SCole Faust    return img;
1157*c217d954SCole Faust}
1158*c217d954SCole Faust
1159*c217d954SCole Faust
/* Wrap a 3D buffer in a Tensor3D and move its pointer to the current work-item.
 *
 * The returned ptr is ptr + offset_first_element_in_bytes + get_global_id(0) * step_x
 *                       + get_global_id(1) * step_y + get_global_id(2) * step_z.
 * stride_x, stride_y and stride_z are stored in the struct unchanged.
 */
1160*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1161*c217d954SCole Faust{
1162*c217d954SCole Faust    Tensor3D tensor =
1163*c217d954SCole Faust    {
1164*c217d954SCole Faust        .ptr                           = ptr,
1165*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1166*c217d954SCole Faust        .stride_x                      = stride_x,
1167*c217d954SCole Faust        .stride_y                      = stride_y,
1168*c217d954SCole Faust        .stride_z                      = stride_z
1169*c217d954SCole Faust    };
1170*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1171*c217d954SCole Faust    return tensor;
1172*c217d954SCole Faust}
1173*c217d954SCole Faust
1174*c217d954SCole Faust
/* Wrap a 3D buffer in a Tensor3D without moving the pointer.
 *
 * ptr, the first-element offset and the three strides are copied into the struct as is.
 * step_x, step_y and step_z are accepted but not used, which keeps the signature the same
 * as update_tensor3D_workitem_ptr.
 */
1175*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1176*c217d954SCole Faust{
1177*c217d954SCole Faust    Tensor3D tensor =
1178*c217d954SCole Faust    {
1179*c217d954SCole Faust        .ptr                           = ptr,
1180*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1181*c217d954SCole Faust        .stride_x                      = stride_x,
1182*c217d954SCole Faust        .stride_y                      = stride_y,
1183*c217d954SCole Faust        .stride_z                      = stride_z
1184*c217d954SCole Faust    };
1185*c217d954SCole Faust    return tensor;
1186*c217d954SCole Faust}
1187*c217d954SCole Faust
/* Wrap a 4D buffer in a Tensor4D and move its pointer to the current work-item.
 *
 * get_global_id(2) carries two coordinates: (id % mod_size) is scaled by step_z and
 * (id / mod_size) is scaled by step_w. The x and y steps use get_global_id(0) and
 * get_global_id(1) as in the 3D variant.
 * NOTE(review): mod_size is used as a divisor, so callers must pass a non-zero value.
 */
1188*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1189*c217d954SCole Faust                                             uint step_w,
1190*c217d954SCole Faust                                             uint mod_size)
1191*c217d954SCole Faust{
1192*c217d954SCole Faust    Tensor4D tensor =
1193*c217d954SCole Faust    {
1194*c217d954SCole Faust        .ptr                           = ptr,
1195*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1196*c217d954SCole Faust        .stride_x                      = stride_x,
1197*c217d954SCole Faust        .stride_y                      = stride_y,
1198*c217d954SCole Faust        .stride_z                      = stride_z,
1199*c217d954SCole Faust        .stride_w                      = stride_w
1200*c217d954SCole Faust    };
1201*c217d954SCole Faust
1202*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1203*c217d954SCole Faust    return tensor;
1204*c217d954SCole Faust}
1205*c217d954SCole Faust
1206*c217d954SCole Faust
/* Return a read-only pointer x elements away from vec->ptr, i.e. vec->ptr + x * vec->stride_x. */
1207*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x)
1208*c217d954SCole Faust{
1209*c217d954SCole Faust    return vec->ptr + x * vec->stride_x;
1210*c217d954SCole Faust}
1211*c217d954SCole Faust
1212*c217d954SCole Faust
/* Return the pointer at relative position (x, y) in an image:
 * img->ptr + x * img->stride_x + y * img->stride_y.
 * Unlike the vector/tensor offset helpers, the returned pointer is not const-qualified.
 */
1213*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y)
1214*c217d954SCole Faust{
1215*c217d954SCole Faust    return img->ptr + x * img->stride_x + y * img->stride_y;
1216*c217d954SCole Faust}
1217*c217d954SCole Faust
1218*c217d954SCole Faust
/* Return a read-only pointer at relative position (x, y, z) in a 3D tensor:
 * tensor->ptr + x * stride_x + y * stride_y + z * stride_z.
 */
1219*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1220*c217d954SCole Faust{
1221*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1222*c217d954SCole Faust}
1223*c217d954SCole Faust
1224*c217d954SCole Faust
/* Return a read-only pointer at relative position (x, y, z, w) in a 4D tensor:
 * tensor->ptr + x * stride_x + y * stride_y + z * stride_z + w * stride_w.
 */
1225*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1226*c217d954SCole Faust{
1227*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1228*c217d954SCole Faust}
1229*c217d954SCole Faust
1230*c217d954SCole Faust
/* Convert a linear element index into a pointer inside a 3D tensor.
 *
 * index is decomposed as z = index / (width * height), y = remainder / width,
 * x = remainder % width, and the result is
 * tensor->ptr + x * stride_x + y * stride_y + z * stride_z + offset_first_element_in_bytes.
 *
 * depth is accepted but not used. width and width * height are used as divisors, so
 * both must be non-zero.
 * NOTE(review): this helper adds offset_first_element_in_bytes itself, whereas the
 * update_*_workitem_ptr helpers already fold that offset into ptr; presumably it is meant
 * for tensors built with tensor3D_ptr_no_update - confirm against callers.
 */
1231*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1232*c217d954SCole Faust{
1233*c217d954SCole Faust    uint num_elements = width * height;
1234*c217d954SCole Faust
1235*c217d954SCole Faust    const uint z = index / num_elements;
1236*c217d954SCole Faust
1237*c217d954SCole Faust    index %= num_elements;
1238*c217d954SCole Faust
1239*c217d954SCole Faust    const uint y = index / width;
1240*c217d954SCole Faust
1241*c217d954SCole Faust    index %= width;
1242*c217d954SCole Faust
1243*c217d954SCole Faust    const uint x = index;
1244*c217d954SCole Faust
1245*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1246*c217d954SCole Faust}
1247*c217d954SCole Faust
1248*c217d954SCole Faust#endif
1249*c217d954SCole Faust
/* MLA(a, b, c): multiply-accumulate, b * c + a.
 * When GPU_ARCH equals GPU_ARCH_BIFROST it is written as fma(c, b, a); otherwise as a
 * plain multiply followed by an add.
 * NOTE(review): identifiers that are not defined as macros evaluate to 0 inside #if, so
 * if neither GPU_ARCH nor GPU_ARCH_BIFROST is defined the test is 0 == 0 and the fma
 * branch is selected - confirm that the build options define both.
 */
1250*c217d954SCole Faust#if GPU_ARCH == GPU_ARCH_BIFROST
1251*c217d954SCole Faust#define MLA(a, b, c) (fma(c, b, a))
1252*c217d954SCole Faust#else
1253*c217d954SCole Faust#define MLA(a, b, c) ((b) * (c) + (a))
1254*c217d954SCole Faust#endif
1255*c217d954SCole Faust
1256*c217d954SCole Faust
/*
 * Activation functions, one macro per operation, all sharing the same parameter list:
 *
 *   DATA_TYPE  type the numeric constants and A_VAL / B_VAL are cast to
 *   VEC_SIZE   vector width (only elu_op uses it, to build the select mask type)
 *   x          input expression
 *   A_VAL      first activation parameter (ignored by operations that do not need it)
 *   B_VAL      second activation parameter (ignored by operations that do not need it)
 *
 * Every use of x, A_VAL and B_VAL is parenthesized so that compound arguments such as
 * `a + b` or `-c` keep their meaning after expansion. Most operations expand x more than
 * once, so x must be free of side effects.
 */

/* x * clamp(x + 3, 0, 6) * 0.166666667 */
#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * ((min(max(((x) + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))

/* 1 / (1 + exp(-x)) */
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-(x))))

/* A_VAL * tanh(B_VAL * x) */
#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)(A_VAL) * tanh((DATA_TYPE)(B_VAL) * (x)))

/* max(0, x) */
#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, (x)))

/* min(A_VAL, max(0, x)) */
#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)0.0, (x))))

/* min(max(x, B_VAL), A_VAL) */
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max((x), (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))

/* min(x, 0) * A_VAL + max(x, 0) */
#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min((x), (DATA_TYPE)0.0) * (DATA_TYPE)(A_VAL)) + max((x), (DATA_TYPE)0.0))

/* log(1 + exp(x)) */
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))

/* x where x >= 0, otherwise A_VAL * (exp(x) - 1) */
#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)(A_VAL) * (exp(x) - (DATA_TYPE)1.0)), (x), (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal((x), (DATA_TYPE)0.0)))

/* fabs(x) */
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))

/* x * x */
#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (x))

/* sqrt(x) */
#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))

/* A_VAL * x + B_VAL, evaluated through MLA */
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)(B_VAL), (DATA_TYPE)(A_VAL), (x)))

/* x * 0.5 * (1 + erf(x / 1.41421356237)) */
#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf((x) / (DATA_TYPE)1.41421356237)))

/* x unchanged */
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)

/* ACT_OP pastes `op` onto `_op` to pick one of the macros above. */
#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

/* Public entry point: the extra level lets `op` be macro-expanded before pasting. */
#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1304*c217d954SCole Faust
1305*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
1306*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
1307*c217d954SCole Faust
1308*c217d954SCole Faust
1309*c217d954SCole Faust
1310*c217d954SCole Faust
/* STORE_ROW_n (n = 1..16): expands to n store statements, one per row.
 * Statement i invokes VSTORE(N0) (defined outside this excerpt) with the arguments
 *   (BASENAME##i, 0, (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i))
 * Each STORE_ROW_n is STORE_ROW_(n-1) followed by one more store. Row suffixes run
 * 0-9 and then A-F for rows 10-15. Every statement already ends with a semicolon.
 * PTR and STRIDE_Y are not parenthesized in the expansion, so pass simple expressions.
 * NOTE(review): this section sits behind a second #ifndef ARM_COMPUTE_HELPER_H; that guard
 * is already defined near the top of the file, so presumably the preprocessor skips this
 * copy - confirm.
 */
1311*c217d954SCole Faust#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1312*c217d954SCole Faust    VSTORE(N0)                                                 \
1313*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1314*c217d954SCole Faust
1315*c217d954SCole Faust#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1316*c217d954SCole Faust    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1317*c217d954SCole Faust    VSTORE(N0)                                                 \
1318*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1319*c217d954SCole Faust
1320*c217d954SCole Faust#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1321*c217d954SCole Faust    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1322*c217d954SCole Faust    VSTORE(N0)                                                 \
1323*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1324*c217d954SCole Faust
1325*c217d954SCole Faust#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1326*c217d954SCole Faust    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1327*c217d954SCole Faust    VSTORE(N0)                                                 \
1328*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1329*c217d954SCole Faust
1330*c217d954SCole Faust#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1331*c217d954SCole Faust    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1332*c217d954SCole Faust    VSTORE(N0)                                                 \
1333*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1334*c217d954SCole Faust
1335*c217d954SCole Faust#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1336*c217d954SCole Faust    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1337*c217d954SCole Faust    VSTORE(N0)                                                 \
1338*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1339*c217d954SCole Faust
1340*c217d954SCole Faust#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1341*c217d954SCole Faust    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1342*c217d954SCole Faust    VSTORE(N0)                                                 \
1343*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1344*c217d954SCole Faust
1345*c217d954SCole Faust#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1346*c217d954SCole Faust    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1347*c217d954SCole Faust    VSTORE(N0)                                                 \
1348*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1349*c217d954SCole Faust
1350*c217d954SCole Faust#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1351*c217d954SCole Faust    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1352*c217d954SCole Faust    VSTORE(N0)                                                 \
1353*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1354*c217d954SCole Faust
1355*c217d954SCole Faust#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1356*c217d954SCole Faust    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1357*c217d954SCole Faust    VSTORE(N0)                                                  \
1358*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1359*c217d954SCole Faust
1360*c217d954SCole Faust#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1361*c217d954SCole Faust    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1362*c217d954SCole Faust    VSTORE(N0)                                                  \
1363*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1364*c217d954SCole Faust
1365*c217d954SCole Faust#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1366*c217d954SCole Faust    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1367*c217d954SCole Faust    VSTORE(N0)                                                  \
1368*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1369*c217d954SCole Faust
1370*c217d954SCole Faust#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1371*c217d954SCole Faust    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1372*c217d954SCole Faust    VSTORE(N0)                                                  \
1373*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1374*c217d954SCole Faust
1375*c217d954SCole Faust#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1376*c217d954SCole Faust    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1377*c217d954SCole Faust    VSTORE(N0)                                                  \
1378*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1379*c217d954SCole Faust
1380*c217d954SCole Faust#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1381*c217d954SCole Faust    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1382*c217d954SCole Faust    VSTORE(N0)                                                  \
1383*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1384*c217d954SCole Faust
1385*c217d954SCole Faust#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1386*c217d954SCole Faust    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1387*c217d954SCole Faust    VSTORE(N0)                                                  \
1388*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1389*c217d954SCole Faust
1390*c217d954SCole Faust
1391*c217d954SCole Faust
/* CONVERT_STORE_ROW_n: same row-by-row layout as STORE_ROW_n, but each row value is first
 * passed through CONVERT_SAT(BASENAME##i, VEC_DATA_TYPE(DATA_TYPE, N0)), i.e. converted
 * with saturation to a vector of N0 elements of DATA_TYPE, before VSTORE(N0) writes it to
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 * Each CONVERT_STORE_ROW_n is CONVERT_STORE_ROW_(n-1) followed by one more store.
 */
1392*c217d954SCole Faust#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1393*c217d954SCole Faust    VSTORE(N0)                                                         \
1394*c217d954SCole Faust    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1395*c217d954SCole Faust
1396*c217d954SCole Faust#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1397*c217d954SCole Faust    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1398*c217d954SCole Faust    VSTORE(N0)                                                         \
1399*c217d954SCole Faust    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1400*c217d954SCole Faust
1401*c217d954SCole Faust#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1402*c217d954SCole Faust    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1403*c217d954SCole Faust    VSTORE(N0)                                                         \
1404*c217d954SCole Faust    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1405*c217d954SCole Faust
1406*c217d954SCole Faust#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1407*c217d954SCole Faust    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1408*c217d954SCole Faust    VSTORE(N0)                                                         \
1409*c217d954SCole Faust    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1410*c217d954SCole Faust
1411*c217d954SCole Faust#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1412*c217d954SCole Faust    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1413*c217d954SCole Faust    VSTORE(N0)                                                         \
1414*c217d954SCole Faust    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1415*c217d954SCole Faust
1416*c217d954SCole Faust#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1417*c217d954SCole Faust    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1418*c217d954SCole Faust    VSTORE(N0)                                                         \
1419*c217d954SCole Faust    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1420*c217d954SCole Faust
1421*c217d954SCole Faust#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1422*c217d954SCole Faust    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1423*c217d954SCole Faust    VSTORE(N0)                                                         \
1424*c217d954SCole Faust    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1425*c217d954SCole Faust
1426*c217d954SCole Faust#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1427*c217d954SCole Faust    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1428*c217d954SCole Faust    VSTORE(N0)                                                         \
1429*c217d954SCole Faust    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1430*c217d954SCole Faust
1431*c217d954SCole Faust#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1432*c217d954SCole Faust    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1433*c217d954SCole Faust    VSTORE(N0)                                                         \
1434*c217d954SCole Faust    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1435*c217d954SCole Faust
1436*c217d954SCole Faust#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
1437*c217d954SCole Faust    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1438*c217d954SCole Faust    VSTORE(N0)                                                     \
1439*c217d954SCole Faust    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1440*c217d954SCole Faust
1441*c217d954SCole Faust#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1442*c217d954SCole Faust    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1443*c217d954SCole Faust    VSTORE(N0)                                                          \
1444*c217d954SCole Faust    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1445*c217d954SCole Faust
1446*c217d954SCole Faust#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1447*c217d954SCole Faust    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1448*c217d954SCole Faust    VSTORE(N0)                                                          \
1449*c217d954SCole Faust    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1450*c217d954SCole Faust
1451*c217d954SCole Faust#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1452*c217d954SCole Faust    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1453*c217d954SCole Faust    VSTORE(N0)                                                          \
1454*c217d954SCole Faust    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1455*c217d954SCole Faust
1456*c217d954SCole Faust#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1457*c217d954SCole Faust    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1458*c217d954SCole Faust    VSTORE(N0)                                                          \
1459*c217d954SCole Faust    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1460*c217d954SCole Faust
1461*c217d954SCole Faust#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1462*c217d954SCole Faust    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1463*c217d954SCole Faust    VSTORE(N0)                                                          \
1464*c217d954SCole Faust    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1465*c217d954SCole Faust
1466*c217d954SCole Faust#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1467*c217d954SCole Faust    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1468*c217d954SCole Faust    VSTORE(N0)                                                          \
1469*c217d954SCole Faust    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1470*c217d954SCole Faust
1471*c217d954SCole Faust
1472*c217d954SCole Faust
1473*c217d954SCole Faust
/* STORE_BLOCK(M0, ...) forwards to the STORE_ROW_<M0> macro with the remaining
 * arguments unchanged. The extra _STR level exists so that M0 is macro-expanded
 * before it is pasted onto STORE_ROW_; calling STORE_BLOCK_STR directly pastes
 * the argument exactly as written. */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1476*c217d954SCole Faust
1477*c217d954SCole Faust
1478*c217d954SCole Faust
/* CONVERT_STORE_BLOCK(M0, ...) forwards to the CONVERT_STORE_ROW_<M0> macro
 * with the remaining arguments unchanged. The _STR level lets M0 be
 * macro-expanded before it is pasted onto CONVERT_STORE_ROW_. */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1481*c217d954SCole Faust
1482*c217d954SCole Faust
1483*c217d954SCole Faust
/* STORE_ROW_PARTIAL_n(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16
 *
 * Emits n store statements, one per row i = 0..n-1. Row i hands BASENAME##i to
 * VSTORE_PARTIAL(N0, STORE_N0) with element offset 0 and destination
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i)
 *
 * Row suffixes are single hex digits (0-9, then A-F for rows 10-15). Each
 * macro expands its predecessor first and then appends its own row.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1562*c217d954SCole Faust
1563*c217d954SCole Faust
1564*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, ...) forwards to
 * STORE_ROW_PARTIAL_<STORE_M0>. Note the argument reordering: the row macro
 * receives N0 first and STORE_N0 second. The _STR level lets STORE_M0 be
 * macro-expanded before it is pasted. */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1567*c217d954SCole Faust
/* Run-time choice between four STORE_BLOCK_PARTIAL shapes:
 *
 *   PARTIAL_COND_Y  PARTIAL_COND_X   rows stored        columns stored
 *   false           false            M0                 N0
 *   true            false            PARTIAL_STORE_M0   N0
 *   false           true             M0                 PARTIAL_STORE_N0
 *   true            true             PARTIAL_STORE_M0   PARTIAL_STORE_N0
 *
 * The expansion is a bare if / else-if chain (no enclosing braces or
 * do-while), and the condition arguments appear in more than one test.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1585*c217d954SCole Faust
/* Run-time choice on PARTIAL_COND_X only: when it is false all N0 columns of
 * the M0 rows are stored, otherwise only PARTIAL_STORE_N0 columns. The row
 * count is always M0. Expands to a bare if / else statement. */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1595*c217d954SCole Faust
/* Run-time choice on PARTIAL_COND_Y only: when it is false all M0 rows are
 * stored, otherwise only PARTIAL_STORE_M0 rows. The column count is always
 * N0. Expands to a bare if / else statement. */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1605*c217d954SCole Faust
1606*c217d954SCole Faust
/* STORE_BLOCK_BOUNDARY_AWARE is only defined when both PARTIAL_STORE_M0 and
 * PARTIAL_STORE_N0 are defined at preprocessing time. Their values pick, at
 * compile time, which store macro backs it:
 *
 *   M0 == 0 and N0 == 0  ->  STORE_BLOCK (the partial arguments are ignored)
 *   M0 >  0 and N0 == 0  ->  STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0 and N0 >  0  ->  STORE_BLOCK_PARTIAL_IN_X
 *   anything else        ->  STORE_BLOCK_PARTIAL_IN_X_AND_Y
 *
 * The macro's own parameters are also called PARTIAL_STORE_M0 and
 * PARTIAL_STORE_N0; inside the replacement lists those names refer to the
 * macro arguments.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

/* No partial dimension: plain block store. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

/* Only the row count can be partial. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

/* Only the column count can be partial. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

/* Both dimensions can be partial. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
1633*c217d954SCole Faust
1634*c217d954SCole Faust
/* COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
 *
 * When PARTIAL_STORE_M0 is defined at preprocessing time, yields
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * as a uint: the start row y * M0 moved back by (M0 - PARTIAL_STORE_M0) % M0
 * and clamped so it never goes below 0. Otherwise it is simply y * M0 as a
 * uint and the third argument is unused.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (for example a sum passed as y) are evaluated as a unit.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
1643*c217d954SCole Faust
1644*c217d954SCole Faust
1645*c217d954SCole Faust
/* Single-row convenience wrapper around STORE_BLOCK_PARTIAL_IN_X: stores one
 * row (M0 = 1) of width vec_size with STRIDE_Y = 0 and Z = 0, writing only
 * leftover elements when cond is true. Note that the argument order differs
 * from the wrapped macro (basename and data_type are swapped). */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1648*c217d954SCole Faust
1649*c217d954SCole Faust
/* Optional OpenCL extensions. Each pragma is emitted only when both the
 * ARM_COMPUTE_* switch and the matching extension macro are defined. */

/* cl_khr_fp16 */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

/* cl_arm_integer_dot_product_int8 */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

/* cl_arm_integer_dot_product_accumulate_int8 */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

/* cl_arm_printf */
#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif
1665*c217d954SCole Faust
/* Numeric identifiers for the GPU architecture families named below. */
#define GPU_ARCH_MIDGARD  0x100
#define GPU_ARCH_BIFROST  0x200
#define GPU_ARCH_VALHALL  0x300
1669*c217d954SCole Faust
1670*c217d954SCole Faust
/* Pastes a and b into a single token. The arguments are pasted as written,
 * without prior macro expansion. */
#define CONCAT(a, b) a##b

/* Identity macro: yields its argument after macro expansion. */
#define EXPAND(x) x

/* Limits x to [min_val, max_val]: the lower bound is applied first via max,
 * then the upper bound via min. */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1678*c217d954SCole Faust
1679*c217d954SCole Faust
/* REVs(x): component selection that lists the s elements of x in reverse
 * order. REV1 returns x unchanged. Defined for s = 1, 2, 3, 4, 8, 16. */
#define REV1(x)  ((x))
#define REV2(x)  ((x).s10)
#define REV3(x)  ((x).s210)
#define REV4(x)  ((x).s3210)
#define REV8(x)  ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

/* REVERSE(x, s) dispatches to REVs. The _STR level lets s be macro-expanded
 * before it is pasted onto REV. */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
1691*c217d954SCole Faust
1692*c217d954SCole Faust
1693*c217d954SCole Faust
/* ROTs_n(x): component selection on an s-element value x in which element i
 * of the result is element (i - n) mod s of x. n == 0 and n == s both return
 * x unchanged. Defined for s = 1, 2, 3, 4, 8, 16 and n = 0..s. */

/* s = 1 */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

/* s = 2 */
#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

/* s = 3 */
#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

/* s = 4 */
#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

/* s = 8 */
#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

/* s = 16 */
#define ROT16_0(x)  ((x))
#define ROT16_1(x)  ((x).sF0123456789ABCDE)
#define ROT16_2(x)  ((x).sEF0123456789ABCD)
#define ROT16_3(x)  ((x).sDEF0123456789ABC)
#define ROT16_4(x)  ((x).sCDEF0123456789AB)
#define ROT16_5(x)  ((x).sBCDEF0123456789A)
#define ROT16_6(x)  ((x).sABCDEF0123456789)
#define ROT16_7(x)  ((x).s9ABCDEF012345678)
#define ROT16_8(x)  ((x).s89ABCDEF01234567)
#define ROT16_9(x)  ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/* ROTATE(x, s, n) dispatches to ROTs_n. The _STR level lets s and n be
 * macro-expanded before they are pasted. */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1744*c217d954SCole Faust
1745*c217d954SCole Faust
1746*c217d954SCole Faust
/* V_OFFSn(dt): a value of type dt<n> built from the literals 0, 1, ..., n-1.
 * Defined for n = 1, 2, 3, 4, 8, 16. */
#define V_OFFS1(dt)  (dt##1)(0)
#define V_OFFS2(dt)  (dt##2)(0, 1)
#define V_OFFS3(dt)  (dt##3)(0, 1, 2)
#define V_OFFS4(dt)  (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt)  (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(dt, s) dispatches to V_OFFSs. The _STR level lets s be
 * macro-expanded before it is pasted onto V_OFFS. */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1758*c217d954SCole Faust
1759*c217d954SCole Faust
/* VLOAD(size) yields the identifier vload<size>. The _STR level lets size be
 * macro-expanded before it is pasted onto vload. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)
1762*c217d954SCole Faust
1763*c217d954SCole Faust
/* VLOAD_PARTIAL(size, load_size) yields the identifier
 * vload_partial_<size>_<load_size>. The _STR level lets both arguments be
 * macro-expanded before they are pasted. */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
1766*c217d954SCole Faust
/* Placeholder with the same three-argument shape as the load macros: it
 * discards data, offs and ptr and expands to an empty compound statement. */
#define NO_LOAD(data, offs, ptr) \
    { \
    }
1770*c217d954SCole Faust
1771*c217d954SCole Faust
1772*c217d954SCole Faust#define vload_partial_1_0 NO_LOAD
1773*c217d954SCole Faust#define vload_partial_1_1 vload1
1774*c217d954SCole Faust#define vload_partial_1_2 NO_LOAD
1775*c217d954SCole Faust#define vload_partial_1_3 NO_LOAD
1776*c217d954SCole Faust#define vload_partial_1_4 NO_LOAD
1777*c217d954SCole Faust#define vload_partial_1_5 NO_LOAD
1778*c217d954SCole Faust#define vload_partial_1_6 NO_LOAD
1779*c217d954SCole Faust#define vload_partial_1_7 NO_LOAD
1780*c217d954SCole Faust#define vload_partial_1_8 NO_LOAD
1781*c217d954SCole Faust#define vload_partial_1_9 NO_LOAD
1782*c217d954SCole Faust#define vload_partial_1_10 NO_LOAD
1783*c217d954SCole Faust#define vload_partial_1_11 NO_LOAD
1784*c217d954SCole Faust#define vload_partial_1_12 NO_LOAD
1785*c217d954SCole Faust#define vload_partial_1_13 NO_LOAD
1786*c217d954SCole Faust#define vload_partial_1_14 NO_LOAD
1787*c217d954SCole Faust#define vload_partial_1_15 NO_LOAD
1788*c217d954SCole Faust#define vload_partial_1_16 NO_LOAD
1789*c217d954SCole Faust
1790*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
1791*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
1792*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
1793*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
1794*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
1795*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
1796*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
1797*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
1798*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
1799*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
1800*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
1801*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
1802*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
1803*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
1804*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
1805*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
1806*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
1807*c217d954SCole Faust
1808*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
1809*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
1810*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
1811*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
1812*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
1813*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
1814*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
1815*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
1816*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
1817*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
1818*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
1819*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
1820*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
1821*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
1822*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
1823*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
1824*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
1825*c217d954SCole Faust
/* Dispatch table vload_partial_4_<n>: n = 1..4 selects the vload_partial_<n>
 * loader defined further down in this file; n = 0 and n = 5..16 select NO_LOAD.
 * NOTE(review): NO_LOAD is defined outside this chunk. */
1826*c217d954SCole Faust#define vload_partial_4_0 NO_LOAD
1827*c217d954SCole Faust#define vload_partial_4_1 vload_partial_1
1828*c217d954SCole Faust#define vload_partial_4_2 vload_partial_2
1829*c217d954SCole Faust#define vload_partial_4_3 vload_partial_3
1830*c217d954SCole Faust#define vload_partial_4_4 vload_partial_4
1831*c217d954SCole Faust#define vload_partial_4_5 NO_LOAD
1832*c217d954SCole Faust#define vload_partial_4_6 NO_LOAD
1833*c217d954SCole Faust#define vload_partial_4_7 NO_LOAD
1834*c217d954SCole Faust#define vload_partial_4_8 NO_LOAD
1835*c217d954SCole Faust#define vload_partial_4_9 NO_LOAD
1836*c217d954SCole Faust#define vload_partial_4_10 NO_LOAD
1837*c217d954SCole Faust#define vload_partial_4_11 NO_LOAD
1838*c217d954SCole Faust#define vload_partial_4_12 NO_LOAD
1839*c217d954SCole Faust#define vload_partial_4_13 NO_LOAD
1840*c217d954SCole Faust#define vload_partial_4_14 NO_LOAD
1841*c217d954SCole Faust#define vload_partial_4_15 NO_LOAD
1842*c217d954SCole Faust#define vload_partial_4_16 NO_LOAD
1843*c217d954SCole Faust
/* Dispatch table vload_partial_8_<n>: n = 1..8 selects the vload_partial_<n>
 * loader defined further down in this file; n = 0 and n = 9..16 select NO_LOAD.
 * NOTE(review): NO_LOAD is defined outside this chunk. */
1844*c217d954SCole Faust#define vload_partial_8_0 NO_LOAD
1845*c217d954SCole Faust#define vload_partial_8_1 vload_partial_1
1846*c217d954SCole Faust#define vload_partial_8_2 vload_partial_2
1847*c217d954SCole Faust#define vload_partial_8_3 vload_partial_3
1848*c217d954SCole Faust#define vload_partial_8_4 vload_partial_4
1849*c217d954SCole Faust#define vload_partial_8_5 vload_partial_5
1850*c217d954SCole Faust#define vload_partial_8_6 vload_partial_6
1851*c217d954SCole Faust#define vload_partial_8_7 vload_partial_7
1852*c217d954SCole Faust#define vload_partial_8_8 vload_partial_8
1853*c217d954SCole Faust#define vload_partial_8_9 NO_LOAD
1854*c217d954SCole Faust#define vload_partial_8_10 NO_LOAD
1855*c217d954SCole Faust#define vload_partial_8_11 NO_LOAD
1856*c217d954SCole Faust#define vload_partial_8_12 NO_LOAD
1857*c217d954SCole Faust#define vload_partial_8_13 NO_LOAD
1858*c217d954SCole Faust#define vload_partial_8_14 NO_LOAD
1859*c217d954SCole Faust#define vload_partial_8_15 NO_LOAD
1860*c217d954SCole Faust#define vload_partial_8_16 NO_LOAD
1861*c217d954SCole Faust
/* Dispatch table vload_partial_16_<n>: every n = 1..16 selects the matching
 * vload_partial_<n> loader defined further down in this file; only n = 0
 * selects NO_LOAD.
 * NOTE(review): NO_LOAD is defined outside this chunk. */
1862*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
1863*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
1864*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
1865*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
1866*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
1867*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
1868*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
1869*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
1870*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
1871*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
1872*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
1873*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
1874*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
1875*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
1876*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
1877*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
1878*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
1879*c217d954SCole Faust
1880*c217d954SCole Faust
/* vload_partial_<k>(DATA, OFFSET, PTR): assign the first k lanes of DATA from
 * PTR. k = 2, 3, 4, 8 and 16 use a single vload<k>; k = 1 uses this file's
 * vload1 helper macro; every other k is stitched together from a 4- or 8-wide
 * head plus a tail loaded from PTR + 4 or PTR + 8.
 * Each expansion carries its own trailing ';' and the stitched sizes expand to
 * several statements, so these are statement macros, not expressions.
 * NOTE(review): the tail reuses OFFSET unchanged while the OpenCL vload<n>
 * built-ins scale their offset by n, so head and tail look contiguous only
 * for OFFSET == 0 -- confirm that callers always pass 0. */
1881*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
1882*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
1883*c217d954SCole Faust
1884*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
1885*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
1886*c217d954SCole Faust
1887*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
1888*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
1889*c217d954SCole Faust
1890*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
1891*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
1892*c217d954SCole Faust
/* 5..7 lanes: 4-lane head, then 1/2/3 lanes starting at PTR + 4. */
1893*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
1894*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1895*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
1896*c217d954SCole Faust
1897*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
1898*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1899*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
1900*c217d954SCole Faust
1901*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
1902*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1903*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
1904*c217d954SCole Faust
1905*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
1906*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
1907*c217d954SCole Faust
/* 9..15 lanes: 8-lane head, then the remaining lanes starting at PTR + 8. */
1908*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
1909*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1910*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
1911*c217d954SCole Faust
1912*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
1913*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1914*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
1915*c217d954SCole Faust
1916*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
1917*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1918*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
1919*c217d954SCole Faust
1920*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
1921*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1922*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
1923*c217d954SCole Faust
/* 13..15 hand the whole upper half (s89ABCDEF) to the 5/6/7-lane loader, which
 * then writes only the first 5/6/7 lanes of that half. */
1924*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
1925*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1926*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
1927*c217d954SCole Faust
1928*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
1929*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1930*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
1931*c217d954SCole Faust
1932*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
1933*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1934*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
1935*c217d954SCole Faust
1936*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
1937*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
1938*c217d954SCole Faust
1939*c217d954SCole Faust
1940*c217d954SCole Faust
/* PIXEL_UNIT<N> is N / 4 for the vector sizes 4, 8 and 16. The resulting 1, 2
 * and 4 match the x1 / x2 / x4 suffixes of the read_image2d_* and
 * write_image2d_* helpers in this file, which handle 4, 8 and 16 elements. */
1941*c217d954SCole Faust#define PIXEL_UNIT4 1
1942*c217d954SCole Faust#define PIXEL_UNIT8 2
1943*c217d954SCole Faust#define PIXEL_UNIT16 4
1944*c217d954SCole Faust
1945*c217d954SCole Faust
/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) resolves to PIXEL_UNIT<vec_size>.
 * The extra _STR level lets a macro argument be expanded before it is pasted. */
1946*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
1947*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1948*c217d954SCole Faust
1949*c217d954SCole Faust
/* read_image2d_<type>x<n>(img, x_coord, y_coord): issue n image reads at
 * (x_coord + i, y_coord) for i = 0..n-1 and concatenate the 4-element results
 * into a 4-, 8- or 16-element vector. The float variants use read_imagef.
 * The expansion ends in ';' and x_coord is pasted unparenthesized in front of
 * the '+ i' terms, so pass simple expressions and treat the macro as the tail
 * of a statement.
 * NOTE(review): the trailing ';' is left untouched because callers outside
 * this chunk may rely on it. */
1950*c217d954SCole Faust#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
1951*c217d954SCole Faust#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
1952*c217d954SCole Faust#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
1953*c217d954SCole Faust
/* Half-precision variants use read_imageh and exist only when both
 * ARM_COMPUTE_OPENCL_FP16_ENABLED and cl_khr_fp16 are defined. */
1954*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1955*c217d954SCole Faust#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
1956*c217d954SCole Faust#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
1957*c217d954SCole Faust#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
1958*c217d954SCole Faust#endif
1959*c217d954SCole Faust
/* write_image2d_<type>x<n>(img, x_coord, y_coord, values): issue n image writes
 * at (x_coord + i, y_coord) for i = 0..n-1. For n = 2 and n = 4, values is
 * split into consecutive 4-lane groups (s0123, s4567, s89AB, sCDEF) and the
 * writes are chained with the comma operator; for n = 1, values is written
 * as-is. The float variants use write_imagef.
 * The expansion ends in ';' and the arguments are pasted unparenthesized. */
1960*c217d954SCole Faust#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
1961*c217d954SCole Faust#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
1962*c217d954SCole Faust#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1963*c217d954SCole Faust
/* Half-precision variants use write_imageh and exist only when both
 * ARM_COMPUTE_OPENCL_FP16_ENABLED and cl_khr_fp16 are defined. */
1964*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1965*c217d954SCole Faust#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
1966*c217d954SCole Faust#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
1967*c217d954SCole Faust#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1968*c217d954SCole Faust#endif
1969*c217d954SCole Faust
1970*c217d954SCole Faust
/* READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) resolves to
 * read_image2d_<data_type>x<n0>(img, x_coord, y_coord). The _STR level lets
 * data_type and n0 be macros that are expanded before token pasting. */
1971*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
1972*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
1973*c217d954SCole Faust
1974*c217d954SCole Faust
/* WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) resolves to
 * write_image2d_<data_type>x<n0>(img, x_coord, y_coord, values). */
1975*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
1976*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1977*c217d954SCole Faust
/* VSTORE(size) resolves to the name vstore<size>; the call arguments are
 * supplied by the user right after it. The _STR level lets size be a macro
 * that is expanded before pasting. For size 1 the name is this file's vstore1
 * helper macro. */
1978*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
1979*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
1980*c217d954SCole Faust
/* Map each <type>1 name to the scalar <type>, so that names built by pasting a
 * size onto a type (as VEC_DATA_TYPE in this file does) also work for size 1. */
1981*c217d954SCole Faust#define float1 float
1982*c217d954SCole Faust#define half1 half
1983*c217d954SCole Faust#define char1 char
1984*c217d954SCole Faust#define uchar1 uchar
1985*c217d954SCole Faust#define short1 short
1986*c217d954SCole Faust#define ushort1 ushort
1987*c217d954SCole Faust#define int1 int
1988*c217d954SCole Faust#define uint1 uint
1989*c217d954SCole Faust#define long1 long
1990*c217d954SCole Faust#define ulong1 ulong
1991*c217d954SCole Faust#define double1 double
1992*c217d954SCole Faust
/* Scalar stand-ins for the vload<n>/vstore<n> built-ins so that size 1 can go
 * through the same name-pasting dispatch as the vector sizes.
 *   vload1(OFFSET, PTR)        reads  PTR[OFFSET]
 *   vstore1(DATA, OFFSET, PTR) writes DATA to PTR[OFFSET]
 * Every argument and the whole expansion are parenthesized so that compound
 * arguments (e.g. a conditional pointer) and surrounding operators cannot
 * re-associate the expression. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
1995*c217d954SCole Faust
1996*c217d954SCole Faust
/* VSTORE_PARTIAL(size, store_size) resolves to the name
 * vstore_partial_<size>_<store_size>, which the tables below map either to a
 * vstore_partial_<n> macro or to NO_STORE. The _STR level lets both arguments
 * be macros that are expanded before pasting. */
1997*c217d954SCole Faust#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
1998*c217d954SCole Faust#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
1999*c217d954SCole Faust
/* NO_STORE swallows its three arguments and expands to an empty block. */
2000*c217d954SCole Faust#define NO_STORE(data, offs, ptr) \
2001*c217d954SCole Faust    {                             \
2002*c217d954SCole Faust    }
2003*c217d954SCole Faust
2004*c217d954SCole Faust
/* Dispatch table vstore_partial_1_<n>: only n = 1 stores anything, and it maps
 * straight to vstore1 (not vstore_partial_1, which selects lane .s0 of its
 * DATA argument). Every other n maps to NO_STORE. */
2005*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
2006*c217d954SCole Faust#define vstore_partial_1_1 vstore1
2007*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
2008*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
2009*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
2010*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
2011*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
2012*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
2013*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
2014*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
2015*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
2016*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
2017*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
2018*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
2019*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
2020*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
2021*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
2022*c217d954SCole Faust
/* Dispatch table vstore_partial_2_<n>: n = 1..2 selects vstore_partial_<n>;
 * n = 0 and n = 3..16 select NO_STORE. */
2023*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
2024*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
2025*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
2026*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
2027*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
2028*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
2029*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
2030*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
2031*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
2032*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
2033*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
2034*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
2035*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
2036*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
2037*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
2038*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
2039*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
2040*c217d954SCole Faust
/* Dispatch table vstore_partial_3_<n>: n = 1..3 selects vstore_partial_<n>;
 * n = 0 and n = 4..16 select NO_STORE. */
2041*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
2042*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
2043*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
2044*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
2045*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
2046*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
2047*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
2048*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
2049*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
2050*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
2051*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
2052*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
2053*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
2054*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
2055*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
2056*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
2057*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
2058*c217d954SCole Faust
/* Dispatch table vstore_partial_4_<n>: n = 1..4 selects vstore_partial_<n>;
 * n = 0 and n = 5..16 select NO_STORE. */
2059*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
2060*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
2061*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
2062*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
2063*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
2064*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
2065*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
2066*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
2067*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
2068*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
2069*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
2070*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
2071*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
2072*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
2073*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
2074*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
2075*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
2076*c217d954SCole Faust
/* Dispatch table vstore_partial_8_<n>: n = 1..8 selects vstore_partial_<n>;
 * n = 0 and n = 9..16 select NO_STORE. */
2077*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
2078*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
2079*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
2080*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
2081*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
2082*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
2083*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
2084*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
2085*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
2086*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
2087*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
2088*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
2089*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
2090*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
2091*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
2092*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
2093*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
2094*c217d954SCole Faust
/* Dispatch table vstore_partial_16_<n>: every n = 1..16 selects the matching
 * vstore_partial_<n>; only n = 0 selects NO_STORE. */
2095*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
2096*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
2097*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
2098*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
2099*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
2100*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
2101*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
2102*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
2103*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
2104*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
2105*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
2106*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
2107*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
2108*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
2109*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
2110*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
2111*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
2112*c217d954SCole Faust
2113*c217d954SCole Faust
/* vstore_partial_<k>(DATA, OFFSET, PTR): store the first k lanes of DATA to
 * PTR. k = 2, 3, 4, 8 and 16 use a single vstore<k>; k = 1 uses this file's
 * vstore1 helper macro; every other k is stitched together from a 4- or 8-wide
 * head plus a tail stored at PTR + 4 or PTR + 8.
 * Each expansion carries its own trailing ';' and the stitched sizes expand to
 * several statements, so these are statement macros, not expressions.
 * NOTE(review): the tail reuses OFFSET unchanged while the OpenCL vstore<n>
 * built-ins scale their offset by n, so head and tail look contiguous only
 * for OFFSET == 0 -- confirm that callers always pass 0. */
2114*c217d954SCole Faust#define vstore_partial_1(DATA, OFFSET, PTR) \
2115*c217d954SCole Faust    vstore1(DATA.s0, OFFSET, PTR);
2116*c217d954SCole Faust
2117*c217d954SCole Faust#define vstore_partial_2(DATA, OFFSET, PTR) \
2118*c217d954SCole Faust    vstore2(DATA.s01, OFFSET, PTR);
2119*c217d954SCole Faust
2120*c217d954SCole Faust#define vstore_partial_3(DATA, OFFSET, PTR) \
2121*c217d954SCole Faust    vstore3(DATA.s012, OFFSET, PTR);
2122*c217d954SCole Faust
2123*c217d954SCole Faust#define vstore_partial_4(DATA, OFFSET, PTR) \
2124*c217d954SCole Faust    vstore4(DATA.s0123, OFFSET, PTR);
2125*c217d954SCole Faust
/* 5..7 lanes: 4-lane head, then 1/2/3 lanes starting at PTR + 4. */
2126*c217d954SCole Faust#define vstore_partial_5(DATA, OFFSET, PTR)    \
2127*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2128*c217d954SCole Faust    vstore1(DATA.s4, OFFSET, PTR + 4);
2129*c217d954SCole Faust
2130*c217d954SCole Faust#define vstore_partial_6(DATA, OFFSET, PTR)    \
2131*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2132*c217d954SCole Faust    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
2133*c217d954SCole Faust
2134*c217d954SCole Faust#define vstore_partial_7(DATA, OFFSET, PTR)    \
2135*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2136*c217d954SCole Faust    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
2137*c217d954SCole Faust
2138*c217d954SCole Faust#define vstore_partial_8(DATA, OFFSET, PTR) \
2139*c217d954SCole Faust    vstore8(DATA.s01234567, OFFSET, PTR);
2140*c217d954SCole Faust
/* 9..15 lanes: 8-lane head, then the remaining lanes starting at PTR + 8.
 * The upper-half swizzles are spelled in lower case here (s89a, s89ab,
 * s89abcdef) while the load macros use upper case.
 * NOTE(review): OpenCL C accepts either case for the hexadecimal lane
 * indices, so this looks purely cosmetic -- confirm against the spec. */
2141*c217d954SCole Faust#define vstore_partial_9(DATA, OFFSET, PTR)        \
2142*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2143*c217d954SCole Faust    vstore1(DATA.s8, OFFSET, PTR + 8);
2144*c217d954SCole Faust
2145*c217d954SCole Faust#define vstore_partial_10(DATA, OFFSET, PTR)       \
2146*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2147*c217d954SCole Faust    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
2148*c217d954SCole Faust
2149*c217d954SCole Faust#define vstore_partial_11(DATA, OFFSET, PTR)       \
2150*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2151*c217d954SCole Faust    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
2152*c217d954SCole Faust
2153*c217d954SCole Faust#define vstore_partial_12(DATA, OFFSET, PTR)       \
2154*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2155*c217d954SCole Faust    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
2156*c217d954SCole Faust
/* 13..15 hand the whole upper half (s89abcdef) to the 5/6/7-lane store, which
 * then writes only the first 5/6/7 lanes of that half. */
2157*c217d954SCole Faust#define vstore_partial_13(DATA, OFFSET, PTR)       \
2158*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2159*c217d954SCole Faust    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
2160*c217d954SCole Faust
2161*c217d954SCole Faust#define vstore_partial_14(DATA, OFFSET, PTR)       \
2162*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2163*c217d954SCole Faust    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
2164*c217d954SCole Faust
2165*c217d954SCole Faust#define vstore_partial_15(DATA, OFFSET, PTR)       \
2166*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2167*c217d954SCole Faust    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
2168*c217d954SCole Faust
2169*c217d954SCole Faust#define vstore_partial_16(DATA, OFFSET, PTR) \
2170*c217d954SCole Faust    vstore16(DATA, OFFSET, PTR);
2171*c217d954SCole Faust
2172*c217d954SCole Faust
2173*c217d954SCole Faust
2174*c217d954SCole Faust
2175*c217d954SCole Faust
/* OpenCL C has no saturating (_sat) conversions to floating-point destination
 * types. These aliases route convert_<float|half>[N]_sat to the plain
 * conversion so that CONVERT_SAT() can be used uniformly for integer and
 * floating-point target types. The <type>1 spellings map to the scalar
 * conversion.
 * convert_half_sat maps to convert_half, in line with every other half alias
 * here (it used to be aliased to convert_float, which yields a float). */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
2190*c217d954SCole Faust
/* Map each convert_<type>1 name to the scalar convert_<type>, so that a
 * conversion name built by pasting a <type><size> token (as CONVERT in this
 * file does) also resolves for size 1. */
2191*c217d954SCole Faust#define convert_float1 convert_float
2192*c217d954SCole Faust#define convert_half1 convert_half
2193*c217d954SCole Faust#define convert_char1 convert_char
2194*c217d954SCole Faust#define convert_uchar1 convert_uchar
2195*c217d954SCole Faust#define convert_short1 convert_short
2196*c217d954SCole Faust#define convert_ushort1 convert_ushort
2197*c217d954SCole Faust#define convert_int1 convert_int
2198*c217d954SCole Faust#define convert_uint1 convert_uint
2199*c217d954SCole Faust#define convert_long1 convert_long
2200*c217d954SCole Faust#define convert_ulong1 convert_ulong
2201*c217d954SCole Faust#define convert_double1 convert_double
2202*c217d954SCole Faust
/* Map each convert_<type>1_sat name to the scalar convert_<type>_sat.
 * The convert_uchar2_sat .. convert_uchar16_sat entries expand to their own
 * name; the preprocessor does not re-expand a macro inside its own expansion,
 * so they leave the identifier unchanged.
 * NOTE(review): convert_double1_sat resolves to convert_double_sat, and no
 * alias for that name is visible in this chunk (unlike convert_float_sat and
 * convert_half_sat) -- confirm whether CONVERT_SAT with double is expected to
 * work. */
2203*c217d954SCole Faust#define convert_char1_sat convert_char_sat
2204*c217d954SCole Faust#define convert_uchar1_sat convert_uchar_sat
2205*c217d954SCole Faust#define convert_uchar2_sat convert_uchar2_sat
2206*c217d954SCole Faust#define convert_uchar3_sat convert_uchar3_sat
2207*c217d954SCole Faust#define convert_uchar4_sat convert_uchar4_sat
2208*c217d954SCole Faust#define convert_uchar8_sat convert_uchar8_sat
2209*c217d954SCole Faust#define convert_uchar16_sat convert_uchar16_sat
2210*c217d954SCole Faust#define convert_short1_sat convert_short_sat
2211*c217d954SCole Faust#define convert_ushort1_sat convert_ushort_sat
2212*c217d954SCole Faust#define convert_int1_sat convert_int_sat
2213*c217d954SCole Faust#define convert_uint1_sat convert_uint_sat
2214*c217d954SCole Faust#define convert_long1_sat convert_long_sat
2215*c217d954SCole Faust#define convert_ulong1_sat convert_ulong_sat
2216*c217d954SCole Faust#define convert_double1_sat convert_double_sat
2217*c217d954SCole Faust
/* VEC_DATA_TYPE(type, size) resolves to the token <type><size>, e.g. a vector
 * type name. The _STR level lets both arguments be macros that are expanded
 * before pasting. */
2218*c217d954SCole Faust#define VEC_DATA_TYPE_STR(type, size) type##size
2219*c217d954SCole Faust#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
2220*c217d954SCole Faust
/* CONVERT(x, type) expands to (convert_<type>((x))). */
2221*c217d954SCole Faust#define CONVERT_STR(x, type) (convert_##type((x)))
2222*c217d954SCole Faust#define CONVERT(x, type) CONVERT_STR(x, type)
2223*c217d954SCole Faust
/* CONVERT_SAT(x, type) expands to (convert_<type>_sat((x))). */
2224*c217d954SCole Faust#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
2225*c217d954SCole Faust#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
2226*c217d954SCole Faust
/* CONVERT_SAT_ROUND(x, type, round) expands to
 * (convert_<type>_sat_<round>((x))), where round is the rounding-mode suffix
 * token appended to the name. */
2227*c217d954SCole Faust#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
2228*c217d954SCole Faust#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
2229*c217d954SCole Faust
/* select_vec_dt_<type>(size): integer types map to themselves with the size
 * appended; half maps to short<size> and float maps to int<size>.
 * NOTE(review): presumably the mask/condition type to pair with a vector of
 * <type> in select()-style operations -- confirm against the callers. */
2230*c217d954SCole Faust#define select_vec_dt_uchar(size) uchar##size
2231*c217d954SCole Faust#define select_vec_dt_char(size) char##size
2232*c217d954SCole Faust#define select_vec_dt_ushort(size) ushort##size
2233*c217d954SCole Faust#define select_vec_dt_short(size) short##size
2234*c217d954SCole Faust#define select_vec_dt_half(size) short##size
2235*c217d954SCole Faust#define select_vec_dt_uint(size) uint##size
2236*c217d954SCole Faust#define select_vec_dt_int(size) int##size
2237*c217d954SCole Faust#define select_vec_dt_float(size) int##size
2238*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size
2239*c217d954SCole Faust#define select_vec_dt_long(size) long##size
2240*c217d954SCole Faust
/* SELECT_VEC_DATA_TYPE(type, size) dispatches to select_vec_dt_<type>(size);
 * SELECT_DATA_TYPE(type) is the size-1 form, whose <type>1 result resolves
 * through the scalar aliases (int1, short1, ...) defined in this file. */
2241*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
2242*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
2243*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
2244*c217d954SCole Faust
/* signed_int_vec_dt_<type>(size): map an element type to a signed integer
 * vector type name. Unsigned types map to their signed counterpart
 * (uchar -> char, ushort -> short, uint -> int, ulong -> long), signed types
 * map to themselves, half maps to short and float maps to int. */
2245*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size
2246*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size
2247*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size
2248*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size
2249*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size
2250*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size
2251*c217d954SCole Faust#define signed_int_vec_dt_int(size) int##size
2252*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size
2253*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size
2254*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size
2255*c217d954SCole Faust
/* SIGNED_INT_VEC_DATA_TYPE(type, size) dispatches to
 * signed_int_vec_dt_<type>(size); SIGNED_INT_DATA_TYPE(type) is the size-1
 * form, whose <type>1 result resolves through the scalar aliases defined in
 * this file. */
2256*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
2257*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
2258*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2259*c217d954SCole Faust
/* sum_reduce_<n>(x): building blocks that expand to the flat sum of the n
 * lanes of x, e.g. sum_reduce_4(x) is s0 + s1 + s2 + s3 evaluated left to
 * right. They are deliberately left as a bare '+' chain so that the
 * association (and hence floating-point rounding) stays exactly as it always
 * was; use SUM_REDUCE() rather than these directly.
 * x is pasted once per lane, so it must be free of side effects. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* SUM_REDUCE(x, size): sum of the first `size` lanes of x (size is one of
 * 1, 2, 3, 4, 8, 16 and may itself be a macro). The whole expansion is wrapped
 * in parentheses so the result behaves as a single operand: without them,
 * k * SUM_REDUCE(v, 2) would compute k * v.s0 + v.s1. */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2269*c217d954SCole Faust
/* prod_reduce_<n>(x): building blocks that expand to the flat product of the
 * n lanes of x, e.g. prod_reduce_4(x) is s0 * s1 * s2 * s3 evaluated left to
 * right. They are deliberately left as a bare '*' chain so that the
 * association (and hence floating-point rounding) stays exactly as it always
 * was; use PROD_REDUCE() rather than these directly.
 * x is pasted once per lane, so it must be free of side effects. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* PROD_REDUCE(x, size): product of the first `size` lanes of x (size is one
 * of 1, 2, 3, 4, 8, 16 and may itself be a macro). The whole expansion is
 * wrapped in parentheses so the result behaves as a single operand: without
 * them, k / PROD_REDUCE(v, 2) would compute k / v.s0 * v.s1. */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2279*c217d954SCole Faust
/* max_reduce_<n>(x): maximum over the n lanes of x, built as nested max()
 * calls over the two halves (n = 3 combines the first two lanes with the
 * third). Each expansion is a single max(...) call or a parenthesized x, so it
 * is safe to embed in larger expressions. x is pasted once per lane, so it
 * must be free of side effects. */
2280*c217d954SCole Faust#define max_reduce_1(x) (x)
2281*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1))
2282*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
2283*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
2284*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
2285*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
2286*c217d954SCole Faust
/* MAX_REDUCE(x, size) dispatches to max_reduce_<size>(x); the _STR level lets
 * size be a macro that is expanded before pasting. */
2287*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
2288*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
2289*c217d954SCole Faust
/* VECTOR_DECLARATION(name): expands to a comma-separated parameter list for a
 * 1D buffer called <name>: a __global uchar pointer <name>_ptr followed by
 * uint <name>_stride_x, <name>_step_x and
 * <name>_offset_first_element_in_bytes. */
2290*c217d954SCole Faust#define VECTOR_DECLARATION(name)     \
2291*c217d954SCole Faust    __global uchar *name##_ptr,      \
2292*c217d954SCole Faust    uint        name##_stride_x, \
2293*c217d954SCole Faust    uint        name##_step_x,   \
2294*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2295*c217d954SCole Faust
/* IMAGE_DECLARATION(name): expands to a comma-separated parameter list for a
 * 2D buffer called <name>: a __global uchar pointer <name>_ptr, a uint
 * stride/step pair for each of x and y, and
 * <name>_offset_first_element_in_bytes. */
2296*c217d954SCole Faust#define IMAGE_DECLARATION(name)      \
2297*c217d954SCole Faust    __global uchar *name##_ptr,      \
2298*c217d954SCole Faust    uint        name##_stride_x, \
2299*c217d954SCole Faust    uint        name##_step_x,   \
2300*c217d954SCole Faust    uint        name##_stride_y, \
2301*c217d954SCole Faust    uint        name##_step_y,   \
2302*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2303*c217d954SCole Faust
/* TENSOR3D_DECLARATION(name): expands to a comma-separated parameter list for
 * a 3D buffer called <name>: a __global uchar pointer <name>_ptr, a uint
 * stride/step pair for each of x, y and z, and
 * <name>_offset_first_element_in_bytes. */
2304*c217d954SCole Faust#define TENSOR3D_DECLARATION(name)   \
2305*c217d954SCole Faust    __global uchar *name##_ptr,      \
2306*c217d954SCole Faust    uint        name##_stride_x, \
2307*c217d954SCole Faust    uint        name##_step_x,   \
2308*c217d954SCole Faust    uint        name##_stride_y, \
2309*c217d954SCole Faust    uint        name##_step_y,   \
2310*c217d954SCole Faust    uint        name##_stride_z, \
2311*c217d954SCole Faust    uint        name##_step_z,   \
2312*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2313*c217d954SCole Faust
/* TENSOR4D_DECLARATION(name): expands to a comma-separated parameter list for
 * a 4D buffer called <name>: a __global uchar pointer <name>_ptr, a uint
 * stride/step pair for each of x, y, z and w, and
 * <name>_offset_first_element_in_bytes. */
2314*c217d954SCole Faust#define TENSOR4D_DECLARATION(name)   \
2315*c217d954SCole Faust    __global uchar *name##_ptr,      \
2316*c217d954SCole Faust    uint        name##_stride_x, \
2317*c217d954SCole Faust    uint        name##_step_x,   \
2318*c217d954SCole Faust    uint        name##_stride_y, \
2319*c217d954SCole Faust    uint        name##_step_y,   \
2320*c217d954SCole Faust    uint        name##_stride_z, \
2321*c217d954SCole Faust    uint        name##_step_z,   \
2322*c217d954SCole Faust    uint        name##_stride_w, \
2323*c217d954SCole Faust    uint        name##_step_w,   \
2324*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2325*c217d954SCole Faust
/* TENSOR5D_DECLARATION(name): expands to a comma-separated parameter list for
 * a 5D buffer called <name>: a __global uchar pointer <name>_ptr, a uint
 * stride/step pair for each of x, y, z, w and v, and
 * <name>_offset_first_element_in_bytes. */
2326*c217d954SCole Faust#define TENSOR5D_DECLARATION(name)   \
2327*c217d954SCole Faust    __global uchar *name##_ptr,      \
2328*c217d954SCole Faust    uint        name##_stride_x, \
2329*c217d954SCole Faust    uint        name##_step_x,   \
2330*c217d954SCole Faust    uint        name##_stride_y, \
2331*c217d954SCole Faust    uint        name##_step_y,   \
2332*c217d954SCole Faust    uint        name##_stride_z, \
2333*c217d954SCole Faust    uint        name##_step_z,   \
2334*c217d954SCole Faust    uint        name##_stride_w, \
2335*c217d954SCole Faust    uint        name##_step_w,   \
2336*c217d954SCole Faust    uint        name##_stride_v, \
2337*c217d954SCole Faust    uint        name##_step_v,   \
2338*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2339*c217d954SCole Faust
/* CONVERT_TO_VECTOR_STRUCT(name): forward the <name>_* parameters produced by
 * VECTOR_DECLARATION(name) to update_vector_workitem_ptr() (defined outside
 * this chunk) in the order ptr, offset, stride_x, step_x. */
2340*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \
2341*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
2342*c217d954SCole Faust
/* Same call, but with a literal 0 in place of <name>_step_x. */
2343*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
2344*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
2345*c217d954SCole Faust
2346*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \
2347*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
2348*c217d954SCole Faust
2349*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
2350*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2351*c217d954SCole Faust
/** Calls update_image_from_tensor3D_workitem_ptr() with the pointer, first-element
 *  offset and the x/y/z stride and step values of the argument group called @p name.
 *
 *  This macro used to be defined twice with identical text; only one definition is kept. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/** Same as CONVERT_TENSOR3D_TO_IMAGE_STRUCT but passes 0 as the x and y steps.
 *
 *  Note that the z step (name_step_z) is still forwarded unchanged. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2360*c217d954SCole Faust
/** Calls update_tensor3D_workitem_ptr() with the pointer, first-element offset and
 *  the x/y/z stride and step values of the argument group called @p name. */
#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x, \
                                 name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

/** Same as CONVERT_TO_TENSOR3D_STRUCT but passes 0 for the x, y and z steps. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0, \
                                 name##_stride_y, 0, \
                                 name##_stride_z, 0)

/** Calls update_tensor4D_workitem_ptr() with the pointer, first-element offset and
 *  the x/y/z/w stride and step values of @p name, followed by @p mod_size. */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x, \
                                 name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, \
                                 name##_stride_w, name##_step_w, \
                                 mod_size)

/** Same as CONVERT_TO_TENSOR4D_STRUCT but passes 0 for the x, y, z and w steps. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0, \
                                 name##_stride_y, 0, \
                                 name##_stride_z, 0, \
                                 name##_stride_w, 0, \
                                 mod_size)

/** Calls tensor3D_ptr_no_update() with the pointer, first-element offset and
 *  the x/y/z stride and step values of the argument group called @p name. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, \
                           name##_stride_x, name##_step_x, \
                           name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
2378*c217d954SCole Faust
2379*c217d954SCole Faust
/** Pointer plus addressing information for a one-dimensional buffer. */
typedef struct Vector
{
    __global uchar *ptr;                           /* Byte pointer into __global memory */
    int             offset_first_element_in_bytes; /* Offset of the first element, in bytes as the name states */
    int             stride_x;                      /* Multiplier applied to the x index by vector_offset() */
} Vector;
2386*c217d954SCole Faust
2387*c217d954SCole Faust
/** Pointer plus addressing information for a two-dimensional buffer. */
typedef struct Image
{
    __global uchar *ptr;                           /* Byte pointer into __global memory */
    int             offset_first_element_in_bytes; /* Offset of the first element, in bytes as the name states */
    int             stride_x;                      /* Multiplier applied to the x index by offset() */
    int             stride_y;                      /* Multiplier applied to the y index by offset() */
} Image;
2395*c217d954SCole Faust
2396*c217d954SCole Faust
/** Pointer plus addressing information for a three-dimensional buffer. */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /* Byte pointer into __global memory */
    int             offset_first_element_in_bytes; /* Offset of the first element, in bytes as the name states */
    int             stride_x;                      /* Multiplier applied to the x index by tensor3D_offset() */
    int             stride_y;                      /* Multiplier applied to the y index by tensor3D_offset() */
    int             stride_z;                      /* Multiplier applied to the z index by tensor3D_offset() */
} Tensor3D;
2405*c217d954SCole Faust
2406*c217d954SCole Faust
/** Pointer plus addressing information for a four-dimensional buffer. */
typedef struct Tensor4D
{
    __global uchar *ptr;                           /* Byte pointer into __global memory */
    int             offset_first_element_in_bytes; /* Offset of the first element, in bytes as the name states */
    int             stride_x;                      /* Multiplier applied to the x index by tensor4D_offset() */
    int             stride_y;                      /* Multiplier applied to the y index by tensor4D_offset() */
    int             stride_z;                      /* Multiplier applied to the z index by tensor4D_offset() */
    int             stride_w;                      /* Multiplier applied to the w index by tensor4D_offset() */
} Tensor4D;
2416*c217d954SCole Faust
2417*c217d954SCole Faust
/** Creates a Vector whose pointer is positioned for the calling work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset added to @p ptr
 * @param[in] stride_x                      Value stored in the stride_x field
 * @param[in] step_x                        Amount multiplied by get_global_id(0) and added to the pointer
 *
 * @return Vector holding the advanced pointer plus the offset and stride it was given
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;

    /* The offset is read back from the struct, i.e. as the stored int value. */
    result.ptr += result.offset_first_element_in_bytes
                  + get_global_id(0) * step_x;
    return result;
}
2429*c217d954SCole Faust
2430*c217d954SCole Faust
/** Creates an Image whose pointer is positioned for the calling work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset added to @p ptr
 * @param[in] stride_x                      Value stored in the stride_x field
 * @param[in] step_x                        Amount multiplied by get_global_id(0) and added to the pointer
 * @param[in] stride_y                      Value stored in the stride_y field
 * @param[in] step_y                        Amount multiplied by get_global_id(1) and added to the pointer
 *
 * @return Image holding the advanced pointer plus the offset and strides it was given
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;

    /* The offset is read back from the struct, i.e. as the stored int value. */
    result.ptr += result.offset_first_element_in_bytes
                  + get_global_id(0) * step_x
                  + get_global_id(1) * step_y;
    return result;
}
2443*c217d954SCole Faust
2444*c217d954SCole Faust
/** Creates an Image from 3D tensor arguments, positioned for the calling work-item.
 *
 * The pointer is advanced along all three global ids, but only the x and y strides
 * are kept in the returned Image.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset added to @p ptr
 * @param[in] stride_x                      Value stored in the stride_x field
 * @param[in] step_x                        Amount multiplied by get_global_id(0) and added to the pointer
 * @param[in] stride_y                      Value stored in the stride_y field
 * @param[in] step_y                        Amount multiplied by get_global_id(1) and added to the pointer
 * @param[in] stride_z                      Accepted but not used by this function
 * @param[in] step_z                        Amount multiplied by get_global_id(2) and added to the pointer
 *
 * @return Image holding the advanced pointer plus the offset and x/y strides it was given
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;

    /* The offset is read back from the struct, i.e. as the stored int value. */
    result.ptr += result.offset_first_element_in_bytes
                  + get_global_id(0) * step_x
                  + get_global_id(1) * step_y
                  + get_global_id(2) * step_z;
    return result;
}
2457*c217d954SCole Faust
2458*c217d954SCole Faust
/** Creates a Tensor3D whose pointer is positioned for the calling work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset added to @p ptr
 * @param[in] stride_x                      Value stored in the stride_x field
 * @param[in] step_x                        Amount multiplied by get_global_id(0) and added to the pointer
 * @param[in] stride_y                      Value stored in the stride_y field
 * @param[in] step_y                        Amount multiplied by get_global_id(1) and added to the pointer
 * @param[in] stride_z                      Value stored in the stride_z field
 * @param[in] step_z                        Amount multiplied by get_global_id(2) and added to the pointer
 *
 * @return Tensor3D holding the advanced pointer plus the offset and strides it was given
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;
    result.stride_z                      = stride_z;

    /* The offset is read back from the struct, i.e. as the stored int value. */
    result.ptr += result.offset_first_element_in_bytes
                  + get_global_id(0) * step_x
                  + get_global_id(1) * step_y
                  + get_global_id(2) * step_z;
    return result;
}
2472*c217d954SCole Faust
2473*c217d954SCole Faust
/** Packs the given arguments into a Tensor3D without moving the pointer.
 *
 * @param[in] ptr                           Pointer stored unchanged in the result
 * @param[in] offset_first_element_in_bytes Value stored in the offset field (not added to the pointer)
 * @param[in] stride_x                      Value stored in the stride_x field
 * @param[in] step_x                        Accepted but not used by this function
 * @param[in] stride_y                      Value stored in the stride_y field
 * @param[in] step_y                        Accepted but not used by this function
 * @param[in] stride_z                      Value stored in the stride_z field
 * @param[in] step_z                        Accepted but not used by this function
 *
 * @return Tensor3D holding the pointer, offset and strides exactly as passed in
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;
    result.stride_z                      = stride_z;
    return result;
}
2486*c217d954SCole Faust
/** Creates a Tensor4D whose pointer is positioned for the calling work-item.
 *
 * Global id 2 addresses two dimensions at once: its remainder modulo @p mod_size is
 * scaled by @p step_z and its quotient by @p step_w.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset added to @p ptr
 * @param[in] stride_x                      Value stored in the stride_x field
 * @param[in] step_x                        Amount multiplied by get_global_id(0) and added to the pointer
 * @param[in] stride_y                      Value stored in the stride_y field
 * @param[in] step_y                        Amount multiplied by get_global_id(1) and added to the pointer
 * @param[in] stride_z                      Value stored in the stride_z field
 * @param[in] step_z                        Amount multiplied by (get_global_id(2) % mod_size)
 * @param[in] stride_w                      Value stored in the stride_w field
 * @param[in] step_w                        Amount multiplied by (get_global_id(2) / mod_size)
 * @param[in] mod_size                      Divisor used to split global id 2; it is used as a divisor without any check
 *
 * @return Tensor4D holding the advanced pointer plus the offset and strides it was given
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;
    result.stride_z                      = stride_z;
    result.stride_w                      = stride_w;

    const size_t gid_zw = get_global_id(2);

    /* The offset is read back from the struct, i.e. as the stored int value. */
    result.ptr += result.offset_first_element_in_bytes
                  + get_global_id(0) * step_x
                  + get_global_id(1) * step_y
                  + (gid_zw % mod_size) * step_z
                  + (gid_zw / mod_size) * step_w;
    return result;
}
2504*c217d954SCole Faust
2505*c217d954SCole Faust
/** Returns the address reached by moving @p x strides away from the Vector pointer.
 *
 * @param[in] vec Vector whose ptr and stride_x are read
 * @param[in] x   Index multiplied by vec->stride_x
 *
 * @return vec->ptr displaced by x * vec->stride_x
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    const int displacement = x * vec->stride_x;
    return vec->ptr + displacement;
}
2510*c217d954SCole Faust
2511*c217d954SCole Faust
/** Returns the address of position (x, y) relative to the Image pointer.
 *
 * @param[in] img Image whose ptr, stride_x and stride_y are read
 * @param[in] x   Index multiplied by img->stride_x
 * @param[in] y   Index multiplied by img->stride_y
 *
 * @return img->ptr displaced by x * stride_x and then by y * stride_y
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    __global uchar *along_x = img->ptr + x * img->stride_x;
    return along_x + y * img->stride_y;
}
2516*c217d954SCole Faust
2517*c217d954SCole Faust
/** Returns the address of position (x, y, z) relative to the Tensor3D pointer.
 *
 * @param[in] tensor Tensor3D whose ptr and strides are read
 * @param[in] x      Index multiplied by tensor->stride_x
 * @param[in] y      Index multiplied by tensor->stride_y
 * @param[in] z      Index multiplied by tensor->stride_z
 *
 * @return tensor->ptr displaced by the three index * stride products, applied in x, y, z order
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    __global const uchar *addr = tensor->ptr + x * tensor->stride_x;
    addr                       = addr + y * tensor->stride_y;
    return addr + z * tensor->stride_z;
}
2522*c217d954SCole Faust
2523*c217d954SCole Faust
/** Returns the address of position (x, y, z, w) relative to the Tensor4D pointer.
 *
 * @param[in] tensor Tensor4D whose ptr and strides are read
 * @param[in] x      Index multiplied by tensor->stride_x
 * @param[in] y      Index multiplied by tensor->stride_y
 * @param[in] z      Index multiplied by tensor->stride_z
 * @param[in] w      Index multiplied by tensor->stride_w
 *
 * @return tensor->ptr displaced by the four index * stride products, applied in x, y, z, w order
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    __global const uchar *addr = tensor->ptr + x * tensor->stride_x;
    addr                       = addr + y * tensor->stride_y;
    addr                       = addr + z * tensor->stride_z;
    return addr + w * tensor->stride_w;
}
2528*c217d954SCole Faust
2529*c217d954SCole Faust
/** Turns a linear element index into an address inside a 3D tensor.
 *
 * The index is decomposed as z = index / (width * height), then
 * y = remainder / width and x = remainder % width. Unlike tensor3D_offset(),
 * this function also adds tensor->offset_first_element_in_bytes to the result.
 *
 * @param[in] tensor Tensor3D whose ptr, strides and first-element offset are read
 * @param[in] width  Extent used for the x decomposition; used as a divisor without any check
 * @param[in] height Extent used together with @p width to size one z plane
 * @param[in] depth  Accepted but not used by this function
 * @param[in] index  Linear index to decompose
 *
 * @return tensor->ptr displaced by x, y, z times their strides plus the first-element offset
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_size = width * height;

    const uint z        = index / plane_size;
    const uint in_plane = index % plane_size;
    const uint y        = in_plane / width;
    const uint x        = in_plane % width;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}
2546*c217d954SCole Faust
2547*c217d954SCole Faust#endif
2548*c217d954SCole Faust
2549*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
2550*c217d954SCole Faust#define ARM_COMPUTE_HELPERS_ASYMM_H
2551*c217d954SCole Faust
2552*c217d954SCole Faust
2553*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
2554*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
2555*c217d954SCole Faust
2556*c217d954SCole Faust
2557*c217d954SCole Faust
2558*c217d954SCole Faust
/** STORE_ROW_n (n = 1..16): stores rows 0 .. n-1 of a block, one VSTORE(N0) per row.
 *
 * Row i takes its data from the variable BASENAME##i and writes it, with offset 0, to
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i). Row suffixes are the hexadecimal
 * digits 0-9 and A-F. Every macro first expands its predecessor and then appends its
 * own row; each store statement already ends with a semicolon.
 *
 * N0        Vector width handed to VSTORE
 * DATA_TYPE Element type used for the destination pointer cast
 * BASENAME  Prefix of the per-row source variables
 * PTR       Base address of the destination
 * STRIDE_Y  Multiplied by the row number and added to PTR
 * Z         Prefix of the per-row extra offset variables
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2637*c217d954SCole Faust
2638*c217d954SCole Faust
2639*c217d954SCole Faust
/** CONVERT_STORE_ROW_n (n = 1..16): converts and stores rows 0 .. n-1 of a block.
 *
 * Row i passes the variable BASENAME##i through CONVERT_SAT with target type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and stores the result with VSTORE(N0), offset 0, at
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i). Row suffixes are the hexadecimal
 * digits 0-9 and A-F. Every macro first expands its predecessor and then appends its
 * own row; each store statement already ends with a semicolon.
 *
 * N0        Vector width handed to VSTORE and VEC_DATA_TYPE
 * DATA_TYPE Element type used for the conversion target and the pointer cast
 * BASENAME  Prefix of the per-row source variables
 * PTR       Base address of the destination
 * STRIDE_Y  Multiplied by the row number and added to PTR
 * Z         Prefix of the per-row extra offset variables
 *
 * Fix: CONVERT_STORE_ROW_10 used to name its second parameter DATA while its body
 * referred to DATA_TYPE, so the caller's type was dropped for rows 0-9 and an unrelated
 * global DATA_TYPE definition (if any) was picked up instead. The parameter is now
 * called DATA_TYPE like in every other row macro.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2718*c217d954SCole Faust
2719*c217d954SCole Faust
2720*c217d954SCole Faust
2721*c217d954SCole Faust
/** Pastes M0 onto STORE_ROW_ and forwards the remaining arguments.
 *  M0 is pasted as written, so use STORE_BLOCK when M0 is itself a macro. */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Selects the STORE_ROW_<M0> macro. The extra level of indirection lets M0 be
 *  macro-expanded before it is pasted by STORE_BLOCK_STR. */
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2724*c217d954SCole Faust
2725*c217d954SCole Faust
2726*c217d954SCole Faust
/** Pastes M0 onto CONVERT_STORE_ROW_ and forwards the remaining arguments.
 *  M0 is pasted as written, so use CONVERT_STORE_BLOCK when M0 is itself a macro. */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Selects the CONVERT_STORE_ROW_<M0> macro. The extra level of indirection lets M0
 *  be macro-expanded before it is pasted by CONVERT_STORE_BLOCK_STR. */
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2729*c217d954SCole Faust
2730*c217d954SCole Faust
2731*c217d954SCole Faust
/** STORE_ROW_PARTIAL_n (n = 1..13 in this group): stores rows 0 .. n-1 of a block,
 *  one VSTORE_PARTIAL(N0, STORE_N0) per row.
 *
 * Row i takes its data from the variable BASENAME##i and writes it, with offset 0, to
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i). Row suffixes are the hexadecimal
 * digits 0-9 and A-C. Every macro first expands its predecessor and then appends its
 * own row; each store statement already ends with a semicolon.
 *
 * N0        First argument handed to VSTORE_PARTIAL
 * STORE_N0  Second argument handed to VSTORE_PARTIAL
 * DATA_TYPE Element type used for the destination pointer cast
 * BASENAME  Prefix of the per-row source variables
 * PTR       Base address of the destination
 * STRIDE_Y  Multiplied by the row number and added to PTR
 * Z         Prefix of the per-row extra offset variables
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
2795*c217d954SCole Faust
2796*c217d954SCole Faust#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
2797*c217d954SCole Faust    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
2798*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
2799*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
2800*c217d954SCole Faust
2801*c217d954SCole Faust#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
2802*c217d954SCole Faust    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
2803*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
2804*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
2805*c217d954SCole Faust
2806*c217d954SCole Faust#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
2807*c217d954SCole Faust    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
2808*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
2809*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2810*c217d954SCole Faust
2811*c217d954SCole Faust
2812*c217d954SCole Faust
/**
 * STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, ...): dispatch to
 * STORE_ROW_PARTIAL_<STORE_M0>(N0, STORE_N0, ...).
 *
 * The call goes through STORE_BLOCK_PARTIAL_STR so that a STORE_M0 argument
 * which is itself a macro is expanded to its value before it is pasted onto
 * the STORE_ROW_PARTIAL_ prefix. Note the argument order changes: the row
 * macros take N0 first and STORE_N0 second.
 */
2813*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2814*c217d954SCole Faust#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2815*c217d954SCole Faust
/**
 * STORE_BLOCK_PARTIAL_IN_X_AND_Y: pick one of four STORE_BLOCK_PARTIAL calls
 * at run time from the two boundary conditions.
 *
 *   PARTIAL_COND_Y  PARTIAL_COND_X  rows stored        elements per row
 *   false           false           M0                 N0
 *   true            false           PARTIAL_STORE_M0   N0
 *   false           true            M0                 PARTIAL_STORE_N0
 *   true            true            PARTIAL_STORE_M0   PARTIAL_STORE_N0
 *
 * Both conditions are evaluated more than once, so they should be free of
 * side effects.
 * NOTE(review): the expansion is a bare if / else-if / else chain (not wrapped
 * in do { } while(0)); placing it directly under an unbraced `if` that has its
 * own `else` would change which `if` that `else` binds to.
 */
2816*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2817*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
2818*c217d954SCole Faust    {                                                                                                                                                     \
2819*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
2820*c217d954SCole Faust    }                                                                                                                                                     \
2821*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
2822*c217d954SCole Faust    {                                                                                                                                                     \
2823*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
2824*c217d954SCole Faust    }                                                                                                                                                     \
2825*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
2826*c217d954SCole Faust    {                                                                                                                                                     \
2827*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
2828*c217d954SCole Faust    }                                                                                                                                                     \
2829*c217d954SCole Faust    else                                                                                                                                                  \
2830*c217d954SCole Faust    {                                                                                                                                                     \
2831*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
2832*c217d954SCole Faust    }
2833*c217d954SCole Faust
/**
 * STORE_BLOCK_PARTIAL_IN_X: always store M0 rows; store N0 elements per row
 * when PARTIAL_COND_X is false and PARTIAL_STORE_N0 elements when it is true.
 * Expands to a bare if / else statement pair.
 */
2834*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
2835*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                         \
2836*c217d954SCole Faust    {                                                                                                             \
2837*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
2838*c217d954SCole Faust    }                                                                                                             \
2839*c217d954SCole Faust    else                                                                                                          \
2840*c217d954SCole Faust    {                                                                                                             \
2841*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
2842*c217d954SCole Faust    }
2843*c217d954SCole Faust
/**
 * STORE_BLOCK_PARTIAL_IN_Y: always store N0 elements per row; store M0 rows
 * when PARTIAL_COND_Y is false and PARTIAL_STORE_M0 rows when it is true.
 * Expands to a bare if / else statement pair.
 */
2844*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
2845*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                         \
2846*c217d954SCole Faust    {                                                                                                             \
2847*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
2848*c217d954SCole Faust    }                                                                                                             \
2849*c217d954SCole Faust    else                                                                                                          \
2850*c217d954SCole Faust    {                                                                                                             \
2851*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
2852*c217d954SCole Faust    }
2853*c217d954SCole Faust
2854*c217d954SCole Faust
/**
 * STORE_BLOCK_BOUNDARY_AWARE: compile-time selection of the store strategy.
 *
 * Defined only when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined.
 * The preprocessor compares their values and maps the macro to:
 *   M0 == 0 and N0 == 0  -> STORE_BLOCK (defined outside this section), no run-time check
 *   M0  > 0 and N0 == 0  -> STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0 and N0  > 0  -> STORE_BLOCK_PARTIAL_IN_X
 *   anything else        -> STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * (M0 / N0 above stand for PARTIAL_STORE_M0 / PARTIAL_STORE_N0.)
 *
 * All four variants share one parameter list so call sites do not change;
 * arguments a variant does not need are simply dropped.
 */
2855*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
2856*c217d954SCole Faust
2857*c217d954SCole Faust
2858*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2859*c217d954SCole Faust
2860*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2861*c217d954SCole Faust    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2862*c217d954SCole Faust
2863*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
2864*c217d954SCole Faust
2865*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2866*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
2867*c217d954SCole Faust
2868*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
2869*c217d954SCole Faust
2870*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2871*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
2872*c217d954SCole Faust
2873*c217d954SCole Faust#else
2874*c217d954SCole Faust
2875*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2876*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2877*c217d954SCole Faust
2878*c217d954SCole Faust#endif
2879*c217d954SCole Faust
2880*c217d954SCole Faust#endif
2881*c217d954SCole Faust
2882*c217d954SCole Faust
/**
 * COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0): first row of the y-th block
 * of M0 rows, as a uint.
 *
 * When the build defines PARTIAL_STORE_M0, the start row is y * M0 shifted
 * back by (M0 - PARTIAL_STORE_M0) % M0 rows and clamped at 0 with max(), so
 * block 0 still starts at row 0. Without that build define the result is
 * simply y * M0 and the third argument is ignored.
 *
 * Every argument is parenthesised in the expansion so that expressions such
 * as `a + b` can be passed for y or M0 without operator-precedence surprises.
 * y and M0 are evaluated more than once in the first variant.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
2891*c217d954SCole Faust
2892*c217d954SCole Faust
2893*c217d954SCole Faust
/**
 * STORE_VECTOR_SELECT: single-row form of STORE_BLOCK_PARTIAL_IN_X.
 *
 * Forwards with M0 = 1, STRIDE_Y = 0 and Z = 0: vec_size elements are stored
 * when cond is false, leftover elements when cond is true.
 * NOTE(review): the one-row store macro is outside this section; presumably,
 * like the visible rows, it pastes a 0 suffix, so the variable stored is
 * basename##0 and the literal Z becomes the token 00 - confirm.
 */
2894*c217d954SCole Faust#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
2895*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
2896*c217d954SCole Faust
2897*c217d954SCole Faust
/**
 * Optional OpenCL extensions.
 *
 * Each pragma is emitted only when two symbols are defined: an
 * ARM_COMPUTE_* switch (not defined in this section, so it must come from the
 * build options) and the macro named after the extension itself.
 */
2898*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
2899*c217d954SCole Faust#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2900*c217d954SCole Faust#endif
2901*c217d954SCole Faust
2902*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
2903*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
2904*c217d954SCole Faust#endif
2905*c217d954SCole Faust
2906*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
2907*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
2908*c217d954SCole Faust#endif
2909*c217d954SCole Faust
2910*c217d954SCole Faust#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
2911*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_printf : enable
2912*c217d954SCole Faust#endif
2913*c217d954SCole Faust
/**
 * Numeric tags for three GPU architecture names. They are not referenced
 * anywhere else in this section; presumably they are compared against a
 * build-time architecture define - confirm against the kernels that use them.
 */
2914*c217d954SCole Faust#define GPU_ARCH_MIDGARD 0x100
2915*c217d954SCole Faust#define GPU_ARCH_BIFROST 0x200
2916*c217d954SCole Faust#define GPU_ARCH_VALHALL 0x300
2917*c217d954SCole Faust
2918*c217d954SCole Faust
/**
 * CONCAT(a, b): paste the two tokens a and b into one. Because ## is applied
 * directly, macro arguments are pasted as written, not as their expansions.
 */
2919*c217d954SCole Faust#define CONCAT(a, b) a##b
2920*c217d954SCole Faust
2921*c217d954SCole Faust
/** EXPAND(x): yields x unchanged after one round of macro expansion. */
2922*c217d954SCole Faust#define EXPAND(x) x
2923*c217d954SCole Faust
2924*c217d954SCole Faust
/**
 * CLAMP(x, min_val, max_val): min(max(x, min_val), max_val), i.e. x limited
 * to the range [min_val, max_val]. min and max are not defined in this
 * section (presumably the OpenCL built-ins).
 */
2925*c217d954SCole Faust#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
2926*c217d954SCole Faust
2927*c217d954SCole Faust
/**
 * REVs(x): the s-component vector x with its components in reverse order,
 * written as a swizzle (REV1 is the identity). Defined for s = 1, 2, 3, 4, 8, 16.
 */
2928*c217d954SCole Faust#define REV1(x) ((x))
2929*c217d954SCole Faust#define REV2(x) ((x).s10)
2930*c217d954SCole Faust#define REV3(x) ((x).s210)
2931*c217d954SCole Faust#define REV4(x) ((x).s3210)
2932*c217d954SCole Faust#define REV8(x) ((x).s76543210)
2933*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210)
2934*c217d954SCole Faust
2935*c217d954SCole Faust
2936*c217d954SCole Faust
/**
 * REVERSE(x, s): reverse the s-component vector x. The extra REVERSE_STR
 * level lets s be a macro that is expanded before it is pasted onto REV.
 */
2937*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x))
2938*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s)
2939*c217d954SCole Faust
2940*c217d954SCole Faust
2941*c217d954SCole Faust
/**
 * ROTs_n(x): the s-component vector x rotated by n positions, written as a
 * swizzle. Component i of the result is component (i - n) mod s of x, so
 * elements move towards higher indices and wrap around. n = 0 and n = s are
 * both the identity. Defined for s = 1, 2, 3, 4, 8, 16 and n = 0..s.
 */
2942*c217d954SCole Faust#define ROT1_0(x) ((x))
2943*c217d954SCole Faust#define ROT1_1(x) ((x))
2944*c217d954SCole Faust
2945*c217d954SCole Faust#define ROT2_0(x) ((x))
2946*c217d954SCole Faust#define ROT2_1(x) ((x).s10)
2947*c217d954SCole Faust#define ROT2_2(x) ((x))
2948*c217d954SCole Faust
2949*c217d954SCole Faust#define ROT3_0(x) ((x))
2950*c217d954SCole Faust#define ROT3_1(x) ((x).s201)
2951*c217d954SCole Faust#define ROT3_2(x) ((x).s120)
2952*c217d954SCole Faust#define ROT3_3(x) ((x))
2953*c217d954SCole Faust
2954*c217d954SCole Faust#define ROT4_0(x) ((x))
2955*c217d954SCole Faust#define ROT4_1(x) ((x).s3012)
2956*c217d954SCole Faust#define ROT4_2(x) ((x).s2301)
2957*c217d954SCole Faust#define ROT4_3(x) ((x).s1230)
2958*c217d954SCole Faust#define ROT4_4(x) ((x))
2959*c217d954SCole Faust
2960*c217d954SCole Faust#define ROT8_0(x) ((x))
2961*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456)
2962*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345)
2963*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234)
2964*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123)
2965*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012)
2966*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701)
2967*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670)
2968*c217d954SCole Faust#define ROT8_8(x) ((x))
2969*c217d954SCole Faust
2970*c217d954SCole Faust#define ROT16_0(x) ((x))
2971*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE)
2972*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD)
2973*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC)
2974*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB)
2975*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A)
2976*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789)
2977*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678)
2978*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567)
2979*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456)
2980*c217d954SCole Faust#define ROT16_10(x) ((x).s6789ABCDEF012345)
2981*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234)
2982*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123)
2983*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012)
2984*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01)
2985*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0)
2986*c217d954SCole Faust#define ROT16_16(x) ((x))
2987*c217d954SCole Faust
2988*c217d954SCole Faust
2989*c217d954SCole Faust
/**
 * ROTATE(x, s, n): rotate the s-component vector x by n positions by
 * dispatching to ROT<s>_<n>. The extra ROTATE_STR level lets s and n be
 * macros that are expanded before pasting.
 */
2990*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
2991*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
2992*c217d954SCole Faust
2993*c217d954SCole Faust
2994*c217d954SCole Faust
/**
 * V_OFFSs(dt): a vector literal of type dt##s holding 0, 1, ..., s-1.
 * For s = 1 the type name is dt##1 (e.g. int1), which only exists through
 * the scalar alias macros defined elsewhere in this file.
 */
2995*c217d954SCole Faust#define V_OFFS1(dt) (dt##1)(0)
2996*c217d954SCole Faust#define V_OFFS2(dt) (dt##2)(0, 1)
2997*c217d954SCole Faust#define V_OFFS3(dt) (dt##3)(0, 1, 2)
2998*c217d954SCole Faust#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
2999*c217d954SCole Faust#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
3000*c217d954SCole Faust#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
3001*c217d954SCole Faust
3002*c217d954SCole Faust
3003*c217d954SCole Faust
/**
 * VEC_OFFS(dt, s): dispatch to V_OFFS<s>(dt); the extra VEC_OFFS_STR level
 * lets s be a macro that is expanded before pasting.
 */
3004*c217d954SCole Faust#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
3005*c217d954SCole Faust#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
3006*c217d954SCole Faust
3007*c217d954SCole Faust
/**
 * VLOAD(size): expands to the name vload<size>; the call arguments follow the
 * macro at the use site. The _STR level lets size be a macro.
 */
3008*c217d954SCole Faust#define VLOAD_STR(size) vload##size
3009*c217d954SCole Faust#define VLOAD(size) VLOAD_STR(size)
3010*c217d954SCole Faust
3011*c217d954SCole Faust
/**
 * VLOAD_PARTIAL(size, load_size): expands to the name
 * vload_partial_<size>_<load_size>, which the dispatch tables below map to a
 * loader taking (DATA, OFFSET, PTR) or to NO_LOAD.
 */
3012*c217d954SCole Faust#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
3013*c217d954SCole Faust#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
3014*c217d954SCole Faust
/** NO_LOAD(data, offs, ptr): ignores its arguments and expands to an empty block. */
3015*c217d954SCole Faust#define NO_LOAD(data, offs, ptr) \
3016*c217d954SCole Faust    {                            \
3017*c217d954SCole Faust    }
3018*c217d954SCole Faust
3019*c217d954SCole Faust
/**
 * Dispatch table for VLOAD_PARTIAL(1, load_size), i.e. a scalar destination.
 *
 * Only load_size == 1 loads anything; every other load_size maps to NO_LOAD.
 *
 * vload_partial_1_1 is a three-argument macro (DATA, OFFSET, PTR) like every
 * other entry reached through VLOAD_PARTIAL. It cannot simply alias vload1,
 * because vload1 takes two arguments (OFFSET, PTR) and returns the value
 * instead of assigning it. The trailing ';' matches the other partial loaders.
 */
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1(DATA, OFFSET, PTR) DATA = vload1(OFFSET, PTR);
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD
3037*c217d954SCole Faust
/**
 * Dispatch tables for VLOAD_PARTIAL(size, load_size) with size = 2, 3, 4, 8, 16.
 *
 * For each size, load_size values 1..size map to vload_partial_<load_size>
 * (DATA, OFFSET, PTR); load_size 0 and every load_size larger than size map
 * to NO_LOAD, so such requests expand to an empty block.
 */
3038*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
3039*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
3040*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
3041*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
3042*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
3043*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
3044*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
3045*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
3046*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
3047*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
3048*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
3049*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
3050*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
3051*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
3052*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
3053*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
3054*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
3055*c217d954SCole Faust
3056*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
3057*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
3058*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
3059*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
3060*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
3061*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
3062*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
3063*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
3064*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
3065*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
3066*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
3067*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
3068*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
3069*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
3070*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
3071*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
3072*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
3073*c217d954SCole Faust
3074*c217d954SCole Faust#define vload_partial_4_0 NO_LOAD
3075*c217d954SCole Faust#define vload_partial_4_1 vload_partial_1
3076*c217d954SCole Faust#define vload_partial_4_2 vload_partial_2
3077*c217d954SCole Faust#define vload_partial_4_3 vload_partial_3
3078*c217d954SCole Faust#define vload_partial_4_4 vload_partial_4
3079*c217d954SCole Faust#define vload_partial_4_5 NO_LOAD
3080*c217d954SCole Faust#define vload_partial_4_6 NO_LOAD
3081*c217d954SCole Faust#define vload_partial_4_7 NO_LOAD
3082*c217d954SCole Faust#define vload_partial_4_8 NO_LOAD
3083*c217d954SCole Faust#define vload_partial_4_9 NO_LOAD
3084*c217d954SCole Faust#define vload_partial_4_10 NO_LOAD
3085*c217d954SCole Faust#define vload_partial_4_11 NO_LOAD
3086*c217d954SCole Faust#define vload_partial_4_12 NO_LOAD
3087*c217d954SCole Faust#define vload_partial_4_13 NO_LOAD
3088*c217d954SCole Faust#define vload_partial_4_14 NO_LOAD
3089*c217d954SCole Faust#define vload_partial_4_15 NO_LOAD
3090*c217d954SCole Faust#define vload_partial_4_16 NO_LOAD
3091*c217d954SCole Faust
3092*c217d954SCole Faust#define vload_partial_8_0 NO_LOAD
3093*c217d954SCole Faust#define vload_partial_8_1 vload_partial_1
3094*c217d954SCole Faust#define vload_partial_8_2 vload_partial_2
3095*c217d954SCole Faust#define vload_partial_8_3 vload_partial_3
3096*c217d954SCole Faust#define vload_partial_8_4 vload_partial_4
3097*c217d954SCole Faust#define vload_partial_8_5 vload_partial_5
3098*c217d954SCole Faust#define vload_partial_8_6 vload_partial_6
3099*c217d954SCole Faust#define vload_partial_8_7 vload_partial_7
3100*c217d954SCole Faust#define vload_partial_8_8 vload_partial_8
3101*c217d954SCole Faust#define vload_partial_8_9 NO_LOAD
3102*c217d954SCole Faust#define vload_partial_8_10 NO_LOAD
3103*c217d954SCole Faust#define vload_partial_8_11 NO_LOAD
3104*c217d954SCole Faust#define vload_partial_8_12 NO_LOAD
3105*c217d954SCole Faust#define vload_partial_8_13 NO_LOAD
3106*c217d954SCole Faust#define vload_partial_8_14 NO_LOAD
3107*c217d954SCole Faust#define vload_partial_8_15 NO_LOAD
3108*c217d954SCole Faust#define vload_partial_8_16 NO_LOAD
3109*c217d954SCole Faust
3110*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
3111*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
3112*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
3113*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
3114*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
3115*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
3116*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
3117*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
3118*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
3119*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
3120*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
3121*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
3122*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
3123*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
3124*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
3125*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
3126*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
3127*c217d954SCole Faust
3128*c217d954SCole Faust
/**
 * vload_partial_n(DATA, OFFSET, PTR), n = 1..16: fill the lowest n components
 * of the vector variable DATA from PTR; higher components are left untouched.
 *
 * n = 1, 2, 3, 4, 8 assign one swizzle (.s0, .s01, .s012, .s0123, .s01234567)
 * from a single vload<n>, and n = 16 assigns DATA as a whole. The remaining
 * sizes are built from two steps: the largest of 4 or 8 that fits, then the
 * remainder loaded from PTR + 4 or PTR + 8 into the next components. For
 * n = 13..15 the second step is applied to the upper half DATA.s89ABCDEF, so
 * its own .s0123 / .s4 ... swizzles address components 8 and up.
 *
 * DATA must be an assignable name; every expansion ends with ';'.
 * NOTE(review): OFFSET is passed unchanged to both steps. OpenCL vload<n>
 * scales its offset by n while vload1 here adds it unscaled, so the two steps
 * only line up when OFFSET is 0 - confirm all callers pass 0.
 */
3129*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
3130*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
3131*c217d954SCole Faust
3132*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
3133*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
3134*c217d954SCole Faust
3135*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
3136*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
3137*c217d954SCole Faust
3138*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
3139*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
3140*c217d954SCole Faust
3141*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
3142*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
3143*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
3144*c217d954SCole Faust
3145*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
3146*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
3147*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
3148*c217d954SCole Faust
3149*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
3150*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
3151*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
3152*c217d954SCole Faust
3153*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
3154*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
3155*c217d954SCole Faust
3156*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
3157*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
3158*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
3159*c217d954SCole Faust
3160*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
3161*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
3162*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
3163*c217d954SCole Faust
3164*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
3165*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
3166*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
3167*c217d954SCole Faust
3168*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
3169*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
3170*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
3171*c217d954SCole Faust
3172*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
3173*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
3174*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
3175*c217d954SCole Faust
3176*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
3177*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
3178*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
3179*c217d954SCole Faust
3180*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
3181*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
3182*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
3183*c217d954SCole Faust
3184*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
3185*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
3186*c217d954SCole Faust
3187*c217d954SCole Faust
3188*c217d954SCole Faust
/**
 * PIXEL_UNIT<vec_size>: number of 4-component image pixels that make up a
 * vector of vec_size elements (4 -> 1, 8 -> 2, 16 -> 4). These are the x1 /
 * x2 / x4 suffixes used by the read_image2d_* and write_image2d_* macros.
 */
3189*c217d954SCole Faust#define PIXEL_UNIT4 1
3190*c217d954SCole Faust#define PIXEL_UNIT8 2
3191*c217d954SCole Faust#define PIXEL_UNIT16 4
3192*c217d954SCole Faust
3193*c217d954SCole Faust
/**
 * CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): look up PIXEL_UNIT<vec_size>.
 * The _STR level lets vec_size be a macro. Only 4, 8 and 16 are defined.
 */
3194*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
3195*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
3196*c217d954SCole Faust
3197*c217d954SCole Faust
/**
 * read_image2d_<type>x<k>(img, x_coord, y_coord): read k horizontally adjacent
 * pixels, at x_coord .. x_coord + k - 1 on row y_coord, and concatenate them
 * into one vector of 4 * k components (float4 / float8 / float16, or the half
 * equivalents). float variants use read_imagef, half variants read_imageh.
 *
 * The half variants exist only when ARM_COMPUTE_OPENCL_FP16_ENABLED and
 * cl_khr_fp16 are both defined.
 * Each expansion already ends with ';', so these macros form a statement tail
 * and cannot be nested inside a larger expression. x_coord is inserted
 * unparenthesised next to `+ 1` etc.
 */
3198*c217d954SCole Faust#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
3199*c217d954SCole Faust#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
3200*c217d954SCole Faust#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
3201*c217d954SCole Faust
3202*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
3203*c217d954SCole Faust#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
3204*c217d954SCole Faust#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
3205*c217d954SCole Faust#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
3206*c217d954SCole Faust#endif
3207*c217d954SCole Faust
/**
 * write_image2d_<type>x<k>(img, x_coord, y_coord, values): write k horizontally
 * adjacent pixels, at x_coord .. x_coord + k - 1 on row y_coord. For k = 1
 * `values` is written as is; for k = 2 and 4 it is split into the 4-component
 * groups .s0123, .s4567, .s89AB, .sCDEF, one per pixel, and the writes are
 * chained with the comma operator. float variants use write_imagef, half
 * variants write_imageh.
 *
 * The half variants exist only when ARM_COMPUTE_OPENCL_FP16_ENABLED and
 * cl_khr_fp16 are both defined. Each expansion already ends with ';'.
 * `values` is used up to four times, so it should be a plain variable.
 */
3208*c217d954SCole Faust#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
3209*c217d954SCole Faust#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
3210*c217d954SCole Faust#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
3211*c217d954SCole Faust
3212*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
3213*c217d954SCole Faust#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
3214*c217d954SCole Faust#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
3215*c217d954SCole Faust#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
3216*c217d954SCole Faust#endif
3217*c217d954SCole Faust
3218*c217d954SCole Faust
/**
 * READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): dispatch to
 * read_image2d_<data_type>x<n0>. The _STR level lets data_type and n0 be
 * macros that are expanded before pasting. n0 is the pixel count (1, 2 or 4),
 * not the vector width.
 */
3219*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
3220*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
3221*c217d954SCole Faust
3222*c217d954SCole Faust
/**
 * WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): dispatch to
 * write_image2d_<data_type>x<n0>, with the same two-level expansion.
 */
3223*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
3224*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
3225*c217d954SCole Faust
/**
 * VSTORE(size): expands to the name vstore<size>; the call arguments follow
 * the macro at the use site. The _STR level lets size be a macro.
 */
3226*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
3227*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
3228*c217d954SCole Faust
/**
 * One-component "vector" type names. Macros that build a type by pasting a
 * width onto a base type (for example dt##1 in V_OFFS1) produce names such as
 * float1; these aliases map them back to the plain scalar types.
 */
3229*c217d954SCole Faust#define float1 float
3230*c217d954SCole Faust#define half1 half
3231*c217d954SCole Faust#define char1 char
3232*c217d954SCole Faust#define uchar1 uchar
3233*c217d954SCole Faust#define short1 short
3234*c217d954SCole Faust#define ushort1 ushort
3235*c217d954SCole Faust#define int1 int
3236*c217d954SCole Faust#define uint1 uint
3237*c217d954SCole Faust#define long1 long
3238*c217d954SCole Faust#define ulong1 ulong
3239*c217d954SCole Faust#define double1 double
3240*c217d954SCole Faust
/* Scalar counterparts of the vloadN / vstoreN built-ins for a width of 1:
 *   vload1(OFFSET, PTR)        reads  PTR[OFFSET]
 *   vstore1(DATA, OFFSET, PTR) writes PTR[OFFSET] = DATA
 * Each argument and the whole expansion are parenthesised so that
 * expression arguments (for example a conditional pointer) and surrounding
 * operators cannot regroup the address computation. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
3243*c217d954SCole Faust
3244*c217d954SCole Faust
/* VSTORE_PARTIAL(width, count) yields the identifier
 * vstore_partial_<width>_<count>; the extra level lets macros be passed
 * for either argument and expanded before pasting. */
#define VSTORE_PARTIAL_STR(width, count) vstore_partial_##width##_##count
#define VSTORE_PARTIAL(width, count) VSTORE_PARTIAL_STR(width, count)
3247*c217d954SCole Faust
/* Store that does nothing: expands to an empty block and never evaluates
 * its arguments. */
#define NO_STORE(data, offs, ptr) {}
3251*c217d954SCole Faust
3252*c217d954SCole Faust
/* Partial-store dispatch for a 1-wide vector: only a count of 1 stores. */
#define vstore_partial_1_0  NO_STORE
#define vstore_partial_1_1  vstore1
#define vstore_partial_1_2  NO_STORE
#define vstore_partial_1_3  NO_STORE
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
3270*c217d954SCole Faust
/* Partial-store dispatch for a 2-wide vector: counts 1..2 store, the rest
 * are no-ops. */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
3288*c217d954SCole Faust
/* Partial-store dispatch for a 3-wide vector: counts 1..3 store, the rest
 * are no-ops. */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
3306*c217d954SCole Faust
/* Partial-store dispatch for a 4-wide vector: counts 1..4 store, the rest
 * are no-ops. */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
3324*c217d954SCole Faust
/* Partial-store dispatch for an 8-wide vector: counts 1..8 store, the rest
 * are no-ops. */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
3342*c217d954SCole Faust
/* Partial-store dispatch for a 16-wide vector: every count 1..16 stores,
 * a count of 0 is a no-op. */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
3360*c217d954SCole Faust
3361*c217d954SCole Faust
/* vstore_partial_<k>(vec, offs, dst): stores the first k components of vec.
 *
 * Counts 1, 2, 3, 4, 8 and 16 map onto a single store. Every other count is
 * split into a 4- or 8-wide head followed by a tail written at dst + 4 or
 * dst + 8. Every expansion already ends with ';'.
 *
 * NOTE(review): head and tail receive the same offs argument; the pieces
 * look contiguous only when offs is 0 - confirm callers always pass 0. */
#define vstore_partial_1(vec, offs, dst) vstore1(vec.s0, offs, dst);

#define vstore_partial_2(vec, offs, dst) vstore2(vec.s01, offs, dst);

#define vstore_partial_3(vec, offs, dst) vstore3(vec.s012, offs, dst);

#define vstore_partial_4(vec, offs, dst) vstore4(vec.s0123, offs, dst);

/* 5 = 4 + 1 */
#define vstore_partial_5(vec, offs, dst)    \
    vstore_partial_4(vec.s0123, offs, dst); \
    vstore1(vec.s4, offs, dst + 4);

/* 6 = 4 + 2 */
#define vstore_partial_6(vec, offs, dst)    \
    vstore_partial_4(vec.s0123, offs, dst); \
    vstore_partial_2(vec.s45, offs, dst + 4);

/* 7 = 4 + 3 */
#define vstore_partial_7(vec, offs, dst)    \
    vstore_partial_4(vec.s0123, offs, dst); \
    vstore_partial_3(vec.s456, offs, dst + 4);

#define vstore_partial_8(vec, offs, dst) vstore8(vec.s01234567, offs, dst);

/* 9 = 8 + 1 */
#define vstore_partial_9(vec, offs, dst)        \
    vstore_partial_8(vec.s01234567, offs, dst); \
    vstore1(vec.s8, offs, dst + 8);

/* 10 = 8 + 2 */
#define vstore_partial_10(vec, offs, dst)       \
    vstore_partial_8(vec.s01234567, offs, dst); \
    vstore_partial_2(vec.s89, offs, dst + 8);

/* 11 = 8 + 3 */
#define vstore_partial_11(vec, offs, dst)       \
    vstore_partial_8(vec.s01234567, offs, dst); \
    vstore_partial_3(vec.s89a, offs, dst + 8);

/* 12 = 8 + 4 */
#define vstore_partial_12(vec, offs, dst)       \
    vstore_partial_8(vec.s01234567, offs, dst); \
    vstore_partial_4(vec.s89ab, offs, dst + 8);

/* 13 = 8 + 5; the tail helper picks its components from the upper half */
#define vstore_partial_13(vec, offs, dst)       \
    vstore_partial_8(vec.s01234567, offs, dst); \
    vstore_partial_5(vec.s89abcdef, offs, dst + 8);

/* 14 = 8 + 6 */
#define vstore_partial_14(vec, offs, dst)       \
    vstore_partial_8(vec.s01234567, offs, dst); \
    vstore_partial_6(vec.s89abcdef, offs, dst + 8);

/* 15 = 8 + 7 */
#define vstore_partial_15(vec, offs, dst)       \
    vstore_partial_8(vec.s01234567, offs, dst); \
    vstore_partial_7(vec.s89abcdef, offs, dst + 8);

#define vstore_partial_16(vec, offs, dst) vstore16(vec, offs, dst);
3419*c217d954SCole Faust
3420*c217d954SCole Faust
3421*c217d954SCole Faust
3422*c217d954SCole Faust
3423*c217d954SCole Faust
/* "_sat" spellings for floating-point destinations. They resolve to the
 * plain conversions, and each alias must resolve to the conversion for its
 * own destination type: the float aliases to convert_float*, the half
 * aliases to convert_half*. */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16
3438*c217d954SCole Faust
/* Width-1 spellings of the scalar conversion functions, so that a
 * conversion name built from a width-1 type name still resolves. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double
3450*c217d954SCole Faust
/* Width-1 spellings of the saturating conversions.
 * The convert_ucharN_sat entries for N >= 2 expand to themselves and are
 * kept only so that the names are defined as macros.
 * double is a floating-point destination, for which OpenCL C provides no
 * "_sat" conversion, so its alias resolves to the plain convert_double
 * (the same treatment the float and half aliases receive). */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
#define convert_double1_sat convert_double
3465*c217d954SCole Faust
/* VEC_DATA_TYPE(elem, width) yields the type name <elem><width>; the extra
 * level lets macros be passed for either argument and expanded before
 * pasting. */
#define VEC_DATA_TYPE_STR(elem, width) elem##width
#define VEC_DATA_TYPE(elem, width) VEC_DATA_TYPE_STR(elem, width)
3468*c217d954SCole Faust
/* CONVERT(value, dst_type) expands to (convert_<dst_type>((value))); the
 * extra level lets dst_type be a macro that is expanded before pasting. */
#define CONVERT_STR(value, dst_type) (convert_##dst_type((value)))
#define CONVERT(value, dst_type) CONVERT_STR(value, dst_type)
3471*c217d954SCole Faust
/* CONVERT_SAT(value, dst_type) expands to (convert_<dst_type>_sat((value)));
 * the extra level lets dst_type be a macro that is expanded before pasting. */
#define CONVERT_SAT_STR(value, dst_type) (convert_##dst_type##_sat((value)))
#define CONVERT_SAT(value, dst_type) CONVERT_SAT_STR(value, dst_type)
3474*c217d954SCole Faust
/* CONVERT_SAT_ROUND(value, dst_type, mode) expands to
 * (convert_<dst_type>_sat_<mode>((value))); the extra level lets dst_type
 * and mode be macros that are expanded before pasting. */
#define CONVERT_SAT_ROUND_STR(value, dst_type, mode) (convert_##dst_type##_sat_##mode((value)))
#define CONVERT_SAT_ROUND(value, dst_type, mode) CONVERT_SAT_ROUND_STR(value, dst_type, mode)
3477*c217d954SCole Faust
/* Per-element-type lookup of an integer vector type name. Integer types
 * map to themselves; half maps to short and float maps to int. */
#define select_vec_dt_uchar(n)  uchar##n
#define select_vec_dt_char(n)   char##n
#define select_vec_dt_ushort(n) ushort##n
#define select_vec_dt_short(n)  short##n
#define select_vec_dt_half(n)   short##n
#define select_vec_dt_uint(n)   uint##n
#define select_vec_dt_int(n)    int##n
#define select_vec_dt_float(n)  int##n
#define select_vec_dt_ulong(n)  ulong##n
#define select_vec_dt_long(n)   long##n

/* SELECT_VEC_DATA_TYPE(elem, n) expands its arguments and then looks the
 * pair up in the table above; SELECT_DATA_TYPE(elem) is the width-1 lookup. */
#define SELECT_VEC_DATA_TYPE_STR(elem, n) select_vec_dt_##elem(n)
#define SELECT_VEC_DATA_TYPE(elem, n) SELECT_VEC_DATA_TYPE_STR(elem, n)
#define SELECT_DATA_TYPE(elem) SELECT_VEC_DATA_TYPE_STR(elem, 1)
3492*c217d954SCole Faust
/* Per-element-type lookup of a signed integer vector type name: unsigned
 * types map to their signed form, half maps to short and float maps to
 * int. */
#define signed_int_vec_dt_uchar(n)  char##n
#define signed_int_vec_dt_char(n)   char##n
#define signed_int_vec_dt_ushort(n) short##n
#define signed_int_vec_dt_short(n)  short##n
#define signed_int_vec_dt_half(n)   short##n
#define signed_int_vec_dt_uint(n)   int##n
#define signed_int_vec_dt_int(n)    int##n
#define signed_int_vec_dt_float(n)  int##n
#define signed_int_vec_dt_ulong(n)  long##n
#define signed_int_vec_dt_long(n)   long##n

/* SIGNED_INT_VEC_DATA_TYPE(elem, n) expands its arguments and then looks the
 * pair up in the table above; SIGNED_INT_DATA_TYPE(elem) is the width-1
 * lookup. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(elem, n) signed_int_vec_dt_##elem(n)
#define SIGNED_INT_VEC_DATA_TYPE(elem, n) SIGNED_INT_VEC_DATA_TYPE_STR(elem, n)
#define SIGNED_INT_DATA_TYPE(elem) SIGNED_INT_VEC_DATA_TYPE_STR(elem, 1)
3507*c217d954SCole Faust
/* Horizontal sum of a vector's components.
 *
 * The per-width helpers are left unparenthesised on purpose: nested helpers
 * then flatten into a single left-to-right chain of additions, which fixes
 * the order in which components are accumulated. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* SUM_REDUCE(x, size) is the entry point. The complete sum is wrapped in
 * parentheses here so that it behaves as one operand inside a larger
 * expression such as 2 * SUM_REDUCE(v, 2). */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
3517*c217d954SCole Faust
/* Horizontal product of a vector's components.
 *
 * The per-width helpers are left unparenthesised on purpose: nested helpers
 * then flatten into a single left-to-right chain of multiplications, which
 * fixes the order in which components are accumulated. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* PROD_REDUCE(x, size) is the entry point. The complete product is wrapped
 * in parentheses here so that it behaves as one operand inside a larger
 * expression such as a / PROD_REDUCE(v, 2). */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
3527*c217d954SCole Faust
/* Horizontal maximum of a vector's components, built as a tree of max()
 * calls over successively narrower halves. */
#define max_reduce_1(vec) (vec)
#define max_reduce_2(vec) max(((vec).s0), ((vec).s1))
#define max_reduce_3(vec) max(max_reduce_2((vec).s01), ((vec).s2))
#define max_reduce_4(vec) max(max_reduce_2((vec).s01), max_reduce_2((vec).s23))
#define max_reduce_8(vec) max(max_reduce_4((vec).s0123), max_reduce_4((vec).s4567))
#define max_reduce_16(vec) max(max_reduce_8((vec).s01234567), max_reduce_8((vec).s89ABCDEF))

/* MAX_REDUCE(vec, width) is the entry point; the extra level lets width be
 * a macro that is expanded before pasting. */
#define MAX_REDUCE_STR(vec, width) max_reduce_##width(vec)
#define MAX_REDUCE(vec, width) MAX_REDUCE_STR(vec, width)
3537*c217d954SCole Faust
/* Expands to the parameter list describing a 1D buffer argument named
 * <arg>: pointer, x stride, x step and offset of the first element. */
#define VECTOR_DECLARATION(arg) \
    __global uchar *arg##_ptr,  \
    uint arg##_stride_x,        \
    uint arg##_step_x,          \
    uint arg##_offset_first_element_in_bytes
3543*c217d954SCole Faust
/* Expands to the parameter list describing a 2D buffer argument named
 * <arg>: pointer, stride/step pairs for x and y, and offset of the first
 * element. */
#define IMAGE_DECLARATION(arg) \
    __global uchar *arg##_ptr, \
    uint arg##_stride_x,       \
    uint arg##_step_x,         \
    uint arg##_stride_y,       \
    uint arg##_step_y,         \
    uint arg##_offset_first_element_in_bytes
3551*c217d954SCole Faust
/* Expands to the parameter list describing a 3D buffer argument named
 * <arg>: pointer, stride/step pairs for x, y and z, and offset of the first
 * element. */
#define TENSOR3D_DECLARATION(arg) \
    __global uchar *arg##_ptr,    \
    uint arg##_stride_x,          \
    uint arg##_step_x,            \
    uint arg##_stride_y,          \
    uint arg##_step_y,            \
    uint arg##_stride_z,          \
    uint arg##_step_z,            \
    uint arg##_offset_first_element_in_bytes
3561*c217d954SCole Faust
/* Expands to the parameter list describing a 4D buffer argument named
 * <arg>: pointer, stride/step pairs for x, y, z and w, and offset of the
 * first element. */
#define TENSOR4D_DECLARATION(arg) \
    __global uchar *arg##_ptr,    \
    uint arg##_stride_x,          \
    uint arg##_step_x,            \
    uint arg##_stride_y,          \
    uint arg##_step_y,            \
    uint arg##_stride_z,          \
    uint arg##_step_z,            \
    uint arg##_stride_w,          \
    uint arg##_step_w,            \
    uint arg##_offset_first_element_in_bytes
3573*c217d954SCole Faust
/* Expands to the parameter list describing a 5D buffer argument named
 * <arg>: pointer, stride/step pairs for x, y, z, w and v, and offset of the
 * first element. */
#define TENSOR5D_DECLARATION(arg) \
    __global uchar *arg##_ptr,    \
    uint arg##_stride_x,          \
    uint arg##_step_x,            \
    uint arg##_stride_y,          \
    uint arg##_step_y,            \
    uint arg##_stride_z,          \
    uint arg##_step_z,            \
    uint arg##_stride_w,          \
    uint arg##_step_w,            \
    uint arg##_stride_v,          \
    uint arg##_step_v,            \
    uint arg##_offset_first_element_in_bytes
3587*c217d954SCole Faust
/* Forwards the <arg>_* kernel parameters to update_vector_workitem_ptr. */
#define CONVERT_TO_VECTOR_STRUCT(arg)                                  \
    update_vector_workitem_ptr(arg##_ptr,                              \
                               arg##_offset_first_element_in_bytes,    \
                               arg##_stride_x,                         \
                               arg##_step_x)

/* Same call, but with a step of 0 passed in place of <arg>_step_x. */
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(arg)                          \
    update_vector_workitem_ptr(arg##_ptr,                              \
                               arg##_offset_first_element_in_bytes,    \
                               arg##_stride_x,                         \
                               0)
3593*c217d954SCole Faust
/* Forwards the <arg>_* kernel parameters to update_image_workitem_ptr. */
#define CONVERT_TO_IMAGE_STRUCT(arg)                                  \
    update_image_workitem_ptr(arg##_ptr,                              \
                              arg##_offset_first_element_in_bytes,    \
                              arg##_stride_x, arg##_step_x,           \
                              arg##_stride_y, arg##_step_y)

/* Same call, but with steps of 0 passed in place of <arg>_step_x/_step_y. */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(arg)                          \
    update_image_workitem_ptr(arg##_ptr,                              \
                              arg##_offset_first_element_in_bytes,    \
                              arg##_stride_x, 0,                      \
                              arg##_stride_y, 0)
3599*c217d954SCole Faust
/* Forwards the <name>_* kernel parameters of a 3D tensor to
 * update_image_from_tensor3D_workitem_ptr. Defined once; an identical
 * second definition of this macro was redundant and has been dropped. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Same call with the x and y steps replaced by 0; the z step is still
 * forwarded as <name>_step_z. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
3608*c217d954SCole Faust
/* Forwards the <arg>_* kernel parameters to update_tensor3D_workitem_ptr. */
#define CONVERT_TO_TENSOR3D_STRUCT(arg)                                  \
    update_tensor3D_workitem_ptr(arg##_ptr,                              \
                                 arg##_offset_first_element_in_bytes,    \
                                 arg##_stride_x, arg##_step_x,           \
                                 arg##_stride_y, arg##_step_y,           \
                                 arg##_stride_z, arg##_step_z)

/* Same call, but with steps of 0 passed for x, y and z. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(arg)                          \
    update_tensor3D_workitem_ptr(arg##_ptr,                              \
                                 arg##_offset_first_element_in_bytes,    \
                                 arg##_stride_x, 0,                      \
                                 arg##_stride_y, 0,                      \
                                 arg##_stride_z, 0)
3615*c217d954SCole Faust
/* Forwards the <arg>_* kernel parameters and mod_size to
 * update_tensor4D_workitem_ptr. */
#define CONVERT_TO_TENSOR4D_STRUCT(arg, mod_size)                        \
    update_tensor4D_workitem_ptr(arg##_ptr,                              \
                                 arg##_offset_first_element_in_bytes,    \
                                 arg##_stride_x, arg##_step_x,           \
                                 arg##_stride_y, arg##_step_y,           \
                                 arg##_stride_z, arg##_step_z,           \
                                 arg##_stride_w, arg##_step_w,           \
                                 mod_size)

/* Same call, but with steps of 0 passed for x, y, z and w. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(arg, mod_size)                \
    update_tensor4D_workitem_ptr(arg##_ptr,                              \
                                 arg##_offset_first_element_in_bytes,    \
                                 arg##_stride_x, 0,                      \
                                 arg##_stride_y, 0,                      \
                                 arg##_stride_z, 0,                      \
                                 arg##_stride_w, 0,                      \
                                 mod_size)
3622*c217d954SCole Faust
/* Forwards the <arg>_* kernel parameters to tensor3D_ptr_no_update. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(arg)              \
    tensor3D_ptr_no_update(arg##_ptr,                              \
                           arg##_offset_first_element_in_bytes,    \
                           arg##_stride_x, arg##_step_x,           \
                           arg##_stride_y, arg##_step_y,           \
                           arg##_stride_z, arg##_step_z)
3626*c217d954SCole Faust
3627*c217d954SCole Faust
/* View of a 1D buffer: a byte pointer plus the values needed to address
 * elements along x. */
struct Vector
{
    __global uchar *ptr;                           /* pointer into the buffer */
    int             offset_first_element_in_bytes; /* offset of the first element, in bytes */
    int             stride_x;                      /* multiplier applied to the x index */
};
typedef struct Vector Vector;
3634*c217d954SCole Faust
3635*c217d954SCole Faust
/* View of a 2D buffer: a byte pointer plus the values needed to address
 * elements along x and y. */
struct Image
{
    __global uchar *ptr;                           /* pointer into the buffer */
    int             offset_first_element_in_bytes; /* offset of the first element, in bytes */
    int             stride_x;                      /* multiplier applied to the x index */
    int             stride_y;                      /* multiplier applied to the y index */
};
typedef struct Image Image;
3643*c217d954SCole Faust
3644*c217d954SCole Faust
/* View of a 3D buffer: a byte pointer plus the values needed to address
 * elements along x, y and z. */
struct Tensor3D
{
    __global uchar *ptr;                           /* pointer into the buffer */
    int             offset_first_element_in_bytes; /* offset of the first element, in bytes */
    int             stride_x;                      /* multiplier applied to the x index */
    int             stride_y;                      /* multiplier applied to the y index */
    int             stride_z;                      /* multiplier applied to the z index */
};
typedef struct Tensor3D Tensor3D;
3653*c217d954SCole Faust
3654*c217d954SCole Faust
/* View of a 4D buffer: a byte pointer plus the values needed to address
 * elements along x, y, z and w. */
struct Tensor4D
{
    __global uchar *ptr;                           /* pointer into the buffer */
    int             offset_first_element_in_bytes; /* offset of the first element, in bytes */
    int             stride_x;                      /* multiplier applied to the x index */
    int             stride_y;                      /* multiplier applied to the y index */
    int             stride_z;                      /* multiplier applied to the z index */
    int             stride_w;                      /* multiplier applied to the w index */
};
typedef struct Tensor4D Tensor4D;
3664*c217d954SCole Faust
3665*c217d954SCole Faust
/* Builds the Vector view used by the calling work-item.
 *
 * The returned pointer is ptr advanced by the first-element offset plus
 * get_global_id(0) * step_x bytes. */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;

    /* The offset is read back from the struct so it takes part in the sum
     * with the struct field's type. */
    view.ptr += view.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return view;
}
3677*c217d954SCole Faust
3678*c217d954SCole Faust
/* Builds the Image view used by the calling work-item.
 *
 * The returned pointer is ptr advanced by the first-element offset plus
 * get_global_id(0) * step_x + get_global_id(1) * step_y bytes. */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;

    /* The offset is read back from the struct so it takes part in the sum
     * with the struct field's type. */
    view.ptr += view.offset_first_element_in_bytes
                + get_global_id(0) * step_x
                + get_global_id(1) * step_y;
    return view;
}
3691*c217d954SCole Faust
3692*c217d954SCole Faust
/* Builds an Image view onto a 3D tensor for the calling work-item.
 *
 * Only the x and y strides are kept in the view; stride_z is not stored.
 * The returned pointer is ptr advanced by the first-element offset plus
 * get_global_id(0) * step_x + get_global_id(1) * step_y
 * + get_global_id(2) * step_z bytes. */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;

    /* The offset is read back from the struct so it takes part in the sum
     * with the struct field's type. */
    view.ptr += view.offset_first_element_in_bytes
                + get_global_id(0) * step_x
                + get_global_id(1) * step_y
                + get_global_id(2) * step_z;
    return view;
}
3705*c217d954SCole Faust
3706*c217d954SCole Faust
/* Builds the Tensor3D view used by the calling work-item.
 *
 * The returned pointer is ptr advanced by the first-element offset plus
 * get_global_id(0) * step_x + get_global_id(1) * step_y
 * + get_global_id(2) * step_z bytes. */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;
    view.stride_z                      = stride_z;

    /* The offset is read back from the struct so it takes part in the sum
     * with the struct field's type. */
    view.ptr += view.offset_first_element_in_bytes
                + get_global_id(0) * step_x
                + get_global_id(1) * step_y
                + get_global_id(2) * step_z;
    return view;
}
3720*c217d954SCole Faust
3721*c217d954SCole Faust
/* Builds a Tensor3D view whose pointer is left exactly at ptr.
 *
 * The step_x, step_y and step_z parameters are accepted but not used. */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;
    view.stride_z                      = stride_z;
    return view;
}
3734*c217d954SCole Faust
/* Builds the Tensor4D view used by the calling work-item.
 *
 * Global id 2 is split by mod_size: the remainder (id % mod_size) is
 * multiplied by step_z and the quotient (id / mod_size) by step_w. The
 * returned pointer is ptr advanced by the first-element offset plus
 * get_global_id(0) * step_x + get_global_id(1) * step_y + those two terms. */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;
    view.stride_z                      = stride_z;
    view.stride_w                      = stride_w;

    /* The offset is read back from the struct so it takes part in the sum
     * with the struct field's type. */
    view.ptr += view.offset_first_element_in_bytes
                + get_global_id(0) * step_x
                + get_global_id(1) * step_y
                + (get_global_id(2) % mod_size) * step_z
                + (get_global_id(2) / mod_size) * step_w;
    return view;
}
3752*c217d954SCole Faust
3753*c217d954SCole Faust
3754*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x)
3755*c217d954SCole Faust{
3756*c217d954SCole Faust    return vec->ptr + x * vec->stride_x;
3757*c217d954SCole Faust}
3758*c217d954SCole Faust
3759*c217d954SCole Faust
3760*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y)
3761*c217d954SCole Faust{
3762*c217d954SCole Faust    return img->ptr + x * img->stride_x + y * img->stride_y;
3763*c217d954SCole Faust}
3764*c217d954SCole Faust
3765*c217d954SCole Faust
3766*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
3767*c217d954SCole Faust{
3768*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
3769*c217d954SCole Faust}
3770*c217d954SCole Faust
3771*c217d954SCole Faust
3772*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
3773*c217d954SCole Faust{
3774*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
3775*c217d954SCole Faust}
3776*c217d954SCole Faust
3777*c217d954SCole Faust
3778*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
3779*c217d954SCole Faust{
3780*c217d954SCole Faust    uint num_elements = width * height;
3781*c217d954SCole Faust
3782*c217d954SCole Faust    const uint z = index / num_elements;
3783*c217d954SCole Faust
3784*c217d954SCole Faust    index %= num_elements;
3785*c217d954SCole Faust
3786*c217d954SCole Faust    const uint y = index / width;
3787*c217d954SCole Faust
3788*c217d954SCole Faust    index %= width;
3789*c217d954SCole Faust
3790*c217d954SCole Faust    const uint x = index;
3791*c217d954SCole Faust
3792*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
3793*c217d954SCole Faust}
3794*c217d954SCole Faust
3795*c217d954SCole Faust#endif
3796*c217d954SCole Faust
3797*c217d954SCole Faust
/** Convert a value with the "_rte" rounding-mode suffix of the OpenCL convert built-ins.
 *
 * CONVERT_DOWN_RTE(value, dst_type) expands to (convert_<dst_type>_rte((value))).
 * The extra _STR level lets macro arguments be expanded before they are pasted
 * into the built-in's name.
 */
#define CONVERT_DOWN_RTE_STR(value, dst_type) (convert_##dst_type##_rte((value)))
#define CONVERT_DOWN_RTE(value, dst_type) CONVERT_DOWN_RTE_STR(value, dst_type)
3800*c217d954SCole Faust
3801*c217d954SCole Faust
3802*c217d954SCole Faustinline uchar quantize_qasymm8(float input, float offset, float scale)
3803*c217d954SCole Faust{
3804*c217d954SCole Faust    float out_f32 = input / scale + offset;
3805*c217d954SCole Faust    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
3806*c217d954SCole Faust    return res_u8;
3807*c217d954SCole Faust}
3808*c217d954SCole Faust
3809*c217d954SCole Faust
3810*c217d954SCole Faustinline float dequantize_qasymm8(uchar input, float offset, float scale)
3811*c217d954SCole Faust{
3812*c217d954SCole Faust    return ((float)input - offset) * scale;
3813*c217d954SCole Faust}
3814*c217d954SCole Faust
3815*c217d954SCole Faust
/** Dequantize one signed 8-bit value to float.
 *
 * @param[in] input  Quantized value
 * @param[in] offset Value subtracted from input before scaling
 * @param[in] scale  Factor applied after the subtraction
 *
 * @return ((float)input - offset) * scale
 */
inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{
    const float centered = (float)input - offset;
    return centered * scale;
}
3820*c217d954SCole Faust
3821*c217d954SCole Faust
/** Generate quantize_<type><size>(input, offset, scale).
 *
 * The generated function computes input / scale + offset lane by lane (scale and
 * offset are broadcast to the float vector type), converts the result to an int
 * vector through CONVERT_DOWN_RTE and narrows it to the requested element type
 * through CONVERT_SAT (defined elsewhere in the file).
 *
 * @param type Element type of the quantized result
 * @param size Vector width suffix passed to VEC_DATA_TYPE
 */
#define QUANTIZE_IMPL(type, size) \
    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
    { \
        const VEC_DATA_TYPE(float, size) scaled = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
        return CONVERT_SAT(CONVERT_DOWN_RTE(scaled, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
    }
3831*c217d954SCole Faust
3832*c217d954SCole Faust
/** Generate dequantize_<type><size>(input, offset, scale).
 *
 * The generated function converts the input vector to a float vector through
 * CONVERT (defined elsewhere in the file), subtracts offset and multiplies by scale.
 *
 * @param type Element type of the quantized input
 * @param size Vector width suffix passed to VEC_DATA_TYPE
 */
#define DEQUANTIZE_IMPL(type, size) \
    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
    { \
        const VEC_DATA_TYPE(float, size) as_float = CONVERT(input, VEC_DATA_TYPE(float, size)); \
        return (as_float - offset) * scale; \
    }
3838*c217d954SCole Faust
3839*c217d954SCole Faust
/** Generate asymm_rounding_divide_by_POW2_<size>(x, exponent).
 *
 * The generated function divides each lane of x by 2^exponent with rounding to
 * nearest. It takes the arithmetic right shift and adds one when the discarded
 * low bits exceed the half-way point. The half-way point is raised by one for
 * negative lanes, so exact ties move away from zero for either sign.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE
 */
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
    { \
        const VEC_DATA_TYPE(int, size) k_zero = (VEC_DATA_TYPE(int, size))0; \
        const VEC_DATA_TYPE(int, size) k_one  = (VEC_DATA_TYPE(int, size))1; \
        const VEC_DATA_TYPE(int, size) low_bits_mask = (k_one << exponent) - k_one; \
        const VEC_DATA_TYPE(int, size) negative_bias = select(k_zero, k_one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
        const VEC_DATA_TYPE(int, size) half_point    = (low_bits_mask >> 1) + negative_bias; \
        const VEC_DATA_TYPE(int, size) discarded     = x & low_bits_mask; \
        const VEC_DATA_TYPE(int, size) round_up      = select(k_zero, k_one, (SELECT_VEC_DATA_TYPE(int, size))(discarded > half_point)); \
        return (x >> exponent) + round_up; \
    }
3853*c217d954SCole Faust
3854*c217d954SCole Faust
/** Generate asymm_mult<size>(a, b): fixed-point product of two int vectors.
 *
 * Both operands are widened to long and multiplied. A rounding nudge is added to
 * the 64-bit product (1 << 30 when the product is >= 0, 1 - (1 << 30) otherwise)
 * and the sum is divided by 1ll << 31 before being narrowed back to int, i.e. the
 * result is approximately (a * b) / 2^31.
 * Where a == b == INT_MIN the `overflow` mask is set and INT_MAX is returned
 * instead of the computed value.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE and pasted onto the
 *             convert_long / convert_int built-ins
 */
3855*c217d954SCole Faust#define ASYMM_MULT_IMPL(size)                                                                                \
3856*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
3857*c217d954SCole Faust    {                                                                                                        \
3858*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                             \
3859*c217d954SCole Faust        overflow = a == b && a == INT_MIN;                                                                   \
3860*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                            \
3861*c217d954SCole Faust        a_64 = convert_long##size(a);                                                                        \
3862*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                            \
3863*c217d954SCole Faust        b_64 = convert_long##size(b);                                                                        \
3864*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                            \
3865*c217d954SCole Faust        ab_64 = a_64 * b_64;                                                                                 \
3866*c217d954SCole Faust                                                                                      \
3867*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                            \
3868*c217d954SCole Faust        mask1 = 1 << 30;                                                                                     \
3869*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                            \
3870*c217d954SCole Faust        mask2 = 1 - (1 << 30);                                                                               \
3871*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                            \
3872*c217d954SCole Faust        is_positive_or_zero = ab_64 >= 0;                                                                    \
3873*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                            \
3874*c217d954SCole Faust        nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero));               \
3875*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                            \
3876*c217d954SCole Faust        mask = 1ll << 31;                                                                                    \
3877*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                             \
3878*c217d954SCole Faust        ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask);                                            \
3879*c217d954SCole Faust        return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow));                   \
3880*c217d954SCole Faust    }
3881*c217d954SCole Faust
3882*c217d954SCole Faust
/** Generate asymm_exp_on_interval_between_negative_one_quarter_and_0_excl<size>(a).
 *
 * Evaluation order inside the generated function:
 *  1. x = a + (1 << 28)  (k_fractional_bits - 3 == 28);
 *  2. x2, x3 and x4 are formed with ASYMM_MULT;
 *  3. the polynomial tail ((x4 / 4 + x3) * constant_1_over_3 + x2) / 2 is built
 *     with the rounding divide-by-power-of-two helper;
 *  4. the result is constant_term + ASYMM_MULT(constant_term, x + tail).
 *
 * NOTE(review): 715827883 is approximately 2^31 / 3 and 1895147668 / 2^31 is
 * approximately 0.8825 (close to exp(-1/8)), so this looks like a 4th-order
 * Taylor expansion of exp around -1/8 in a 31-fractional-bit format - confirm
 * against the reference fixed-point implementation this was derived from.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE and the ASYMM_* helpers
 */
3883*c217d954SCole Faust#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
3884*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
3885*c217d954SCole Faust    {                                                                                                                               \
3886*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
3887*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
3888*c217d954SCole Faust        const int k_fractional_bits = 31;                                                                                           \
3889*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                                    \
3890*c217d954SCole Faust        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
3891*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                                    \
3892*c217d954SCole Faust        x2 = ASYMM_MULT(x, x, size);                                                                                                \
3893*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                                    \
3894*c217d954SCole Faust        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
3895*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                                    \
3896*c217d954SCole Faust        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
3897*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                                    \
3898*c217d954SCole Faust        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
3899*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                                    \
3900*c217d954SCole Faust        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
3901*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                                    \
3902*c217d954SCole Faust        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
3903*c217d954SCole Faust        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
3904*c217d954SCole Faust    }
3905*c217d954SCole Faust
3906*c217d954SCole Faust
/** Generate asymm_select_using_mask<size>(if_mask, then_val, else_val).
 *
 * The generated function is a bitwise blend: bits set in if_mask are taken from
 * then_val and bits cleared in if_mask are taken from else_val.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE
 */
#define ASYMM_SELECT_USING_MASK_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
    { \
        const VEC_DATA_TYPE(int, size) from_then = if_mask & then_val; \
        const VEC_DATA_TYPE(int, size) from_else = ~if_mask & else_val; \
        return from_then ^ from_else; \
    }
3912*c217d954SCole Faust
3913*c217d954SCole Faust
/** Generate asymm_mask_if_zero<size>(a).
 *
 * The generated function returns ~0 (all bits set) where a == 0 and 0 elsewhere.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE
 */
#define ASYMM_MASK_IF_ZERO_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) no_bits    = 0; \
        const VEC_DATA_TYPE(int, size) every_bit  = ~0; \
        return select(no_bits, every_bit, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \
    }
3921*c217d954SCole Faust
3922*c217d954SCole Faust
/** Generate asymm_mask_if_non_zero<size>(a).
 *
 * The generated function returns ~0 (all bits set) where a != 0 and 0 elsewhere.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE
 */
#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) no_bits    = 0; \
        const VEC_DATA_TYPE(int, size) every_bit  = ~0; \
        return select(no_bits, every_bit, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \
    }
3930*c217d954SCole Faust
/** Generate exp_barrel_shifter<size>(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder).
 *
 * When k_integer_bits <= exponent the generated function returns result unchanged.
 * Otherwise it tests bit (k_fractional_bits + exponent) of remainder: lanes where
 * that bit is set receive ASYMM_MULT(result, fp_multiplier), the other lanes keep
 * result.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE and the ASYMM_* helpers
 */
#define EXP_BARREL_SHIFTER_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
    { \
        if(k_integer_bits <= exponent) \
        { \
            return result; \
        } \
        \
        const int tested_bit = k_fractional_bits + exponent; \
        return ASYMM_SELECT_USING_MASK( \
                   ASYMM_MASK_IF_NON_ZERO(remainder & (1 << tested_bit), size), \
                   ASYMM_MULT(result, fp_multiplier, size), result, size); \
    }
3944*c217d954SCole Faust
3945*c217d954SCole Faust
/** Generate asymm_exp_on_negative_values<size>(a, k_integer_bits).
 *
 * With k_fractional_bits = 31 - k_integer_bits, the generated function:
 *  1. reduces a to (a & (one_quarter - 1)) - one_quarter, where
 *     one_quarter = 1 << (k_fractional_bits - 2), and shifts that value left by
 *     k_integer_bits before passing it to the interval exp helper;
 *  2. computes remainder = reduced value - a, then applies EXP_BARREL_SHIFTER for
 *     exponents -2 through +4 in that order, each with its own multiplier
 *     constant (each step updates `result`, so the order must be kept);
 *  3. when k_integer_bits > 5, forces the result to 0 on lanes where
 *     a < -(1 << (k_fractional_bits + 5));
 *  4. returns INT_MAX on lanes where a == 0, and the computed result elsewhere.
 *
 * NOTE(review): the multipliers look like exp(-2^exponent) with 31 fractional
 * bits (e.g. 1672461947 / 2^31 is approximately exp(-1/4)) - confirm against the
 * reference fixed-point implementation.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE and the helper macros
 */
3946*c217d954SCole Faust#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
3947*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
3948*c217d954SCole Faust    {                                                                                                                         \
3949*c217d954SCole Faust        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
3950*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                              \
3951*c217d954SCole Faust        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
3952*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                              \
3953*c217d954SCole Faust        mask = k_one_quarter - 1;                                                                                             \
3954*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                              \
3955*c217d954SCole Faust        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
3956*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                              \
3957*c217d954SCole Faust        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
3958*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                              \
3959*c217d954SCole Faust        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
3960*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                              \
3961*c217d954SCole Faust        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
3962*c217d954SCole Faust        \
3963*c217d954SCole Faust        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
3964*c217d954SCole Faust        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
3965*c217d954SCole Faust        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
3966*c217d954SCole Faust        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
3967*c217d954SCole Faust        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
3968*c217d954SCole Faust        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
3969*c217d954SCole Faust        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
3970*c217d954SCole Faust        \
3971*c217d954SCole Faust        if(k_integer_bits > 5)                                                                                                \
3972*c217d954SCole Faust        {                                                                                                                     \
3973*c217d954SCole Faust            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
3974*c217d954SCole Faust            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
3975*c217d954SCole Faust        }                                                                                                                     \
3976*c217d954SCole Faust        \
3977*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
3978*c217d954SCole Faust        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
3979*c217d954SCole Faust    }
3980*c217d954SCole Faust
3981*c217d954SCole Faust
/** Generate asymm_saturating_rounding_mult_by_pow2<size>(x, exponent).
 *
 * For a negative exponent the generated function delegates to
 * ASYMM_ROUNDING_DIVIDE_BY_POW2 with -exponent. Otherwise it returns
 * x << exponent, except that lanes with x > threshold are replaced by INT_MAX
 * and lanes with x < -threshold by INT_MIN, where
 * threshold = (1 << (31 - exponent)) - 1. The negative mask is applied last.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE and the ASYMM_* helpers
 */
3982*c217d954SCole Faust#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
3983*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
3984*c217d954SCole Faust    {                                                                                                                      \
3985*c217d954SCole Faust        if(exponent < 0)                                                                                                   \
3986*c217d954SCole Faust        {                                                                                                                  \
3987*c217d954SCole Faust            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
3988*c217d954SCole Faust        }                                                                                                                  \
3989*c217d954SCole Faust        \
3990*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
3991*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
3992*c217d954SCole Faust        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
3993*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                           \
3994*c217d954SCole Faust        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
3995*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                           \
3996*c217d954SCole Faust        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
3997*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                           \
3998*c217d954SCole Faust        result = x << exponent;                                                                                            \
3999*c217d954SCole Faust        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
4000*c217d954SCole Faust        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
4001*c217d954SCole Faust        return result;                                                                                                     \
4002*c217d954SCole Faust    }
4003*c217d954SCole Faust
4004*c217d954SCole Faust
/** Generate asymm_rounding_half_sum<size>(a, b).
 *
 * The generated function returns (a + b) / 2 computed in 64-bit arithmetic. Before
 * the truncating division, +1 is added to non-negative sums and -1 to negative
 * sums, so an odd sum rounds away from zero.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE and pasted onto the
 *             convert_long / convert_int built-ins
 */
#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    { \
        const VEC_DATA_TYPE(long, size) plus_one = 1; \
        const VEC_DATA_TYPE(long, size) neg_one  = -1; \
        const VEC_DATA_TYPE(long, size) wide_sum = convert_long##size(a) + convert_long##size(b); \
        const VEC_DATA_TYPE(long, size) rounding = select(neg_one, plus_one, (SELECT_VEC_DATA_TYPE(long, size))(wide_sum >= 0)); \
        return convert_int##size((wide_sum + rounding) / 2); \
    }
4020*c217d954SCole Faust
4021*c217d954SCole Faust
/** Generate asymm_one_over_one_plus_x_for_x_in_0_1<size>(a).
 *
 * Steps performed by the generated function:
 *  1. half_denominator = rounding half sum of a and INT_MAX;
 *  2. initial estimate x = Q2_48_over_17 + half_denominator * Q2_neg_32_over_17;
 *  3. three refinement passes of x = x + 4 * (x * (Q2_one - half_denominator * x)),
 *     where the products use ASYMM_MULT and the factor 4 is a saturating
 *     multiply by 2^2;
 *  4. the result is x multiplied by 2 with saturation.
 *
 * NOTE(review): 1515870810 / 2^29 is approximately 48/17 and -1010580540 / 2^29 is
 * approximately -32/17, and the update in step 3 has the form of a
 * Newton-Raphson reciprocal iteration; the Q0/Q2 naming suggests 0 and 2 integer
 * bits respectively - confirm against the reference fixed-point implementation.
 *
 * @param size Vector width suffix passed to VEC_DATA_TYPE and the ASYMM_* helpers
 */
4022*c217d954SCole Faust#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
4023*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
4024*c217d954SCole Faust    {                                                                                                        \
4025*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
4026*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
4027*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                             \
4028*c217d954SCole Faust        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
4029*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
4030*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
4031*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                             \
4032*c217d954SCole Faust        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
4033*c217d954SCole Faust        for(int i = 0; i < 3; i++)                                                                           \
4034*c217d954SCole Faust        {                                                                                                    \
4035*c217d954SCole Faust            VEC_DATA_TYPE(int, size)                                                                         \
4036*c217d954SCole Faust            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
4037*c217d954SCole Faust            VEC_DATA_TYPE(int, size)                                                                         \
4038*c217d954SCole Faust            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
4039*c217d954SCole Faust            VEC_DATA_TYPE(int, size)                                                                         \
4040*c217d954SCole Faust            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
4041*c217d954SCole Faust            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
4042*c217d954SCole Faust        }                                                                                                    \
4043*c217d954SCole Faust        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
4044*c217d954SCole Faust    }
4045*c217d954SCole Faust
4046*c217d954SCole Faust
/** Define asymm_rescale<size>(value, src_integer_bits, dst_integer_bits).
 *
 * The generated function forwards value to ASYMM_SATURATING_ROUNDING_MULT_BY_POW2
 * with exponent (src_integer_bits - dst_integer_bits). Going by that helper's name
 * this is a saturating, rounding multiplication by 2^exponent; the helper itself is
 * defined elsewhere in this file.
 *
 * @param[in] size Width suffix of the generated function and of its
 *                 VEC_DATA_TYPE(int, size) argument and return type.
 */
#define ASYMM_RESCALE_IMPL(size)                                                                         \
    inline VEC_DATA_TYPE(int, size)                                                                      \
    asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits)      \
    {                                                                                                    \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, src_integer_bits - dst_integer_bits, size); \
    }
4053*c217d954SCole Faust
/** Type- and width-dispatching front ends.
 *
 * QUANTIZE(input, offset, scale, type, size) resolves to
 * quantize_<type><size>(input, offset, scale), and DEQUANTIZE(...) resolves to
 * dequantize_<type><size>(input, offset, scale).
 *
 * The *_STR level does the token pasting. Calling through the outer macro lets
 * type and size be macros themselves: they are expanded before they reach the
 * ## operator in the *_STR macro.
 */
4054*c217d954SCole Faust#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
4055*c217d954SCole Faust#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
4056*c217d954SCole Faust#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
4057*c217d954SCole Faust#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
4058*c217d954SCole Faust
/** Width-dispatching front ends for two fixed-point helpers.
 *
 * ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) resolves to
 * asymm_rounding_divide_by_POW2_<size>(x, exponent); note the upper-case POW2 and
 * the underscore before the width in the generated name.
 * ASYMM_MULT(a, b, size) resolves to asymm_mult<size>(a, b).
 *
 * Both go through a *_STR level so that a macro passed as size is expanded
 * before it is pasted.
 */
4059*c217d954SCole Faust#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
4060*c217d954SCole Faust#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
4061*c217d954SCole Faust#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
4062*c217d954SCole Faust#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
/** Scale x up by a power of two, then multiply it by quantized_multiplier via ASYMM_MULT.
 *
 * The power of two is (1 << -left_shift), built as a VEC_DATA_TYPE(int, size)
 * value, so left_shift is expected to be zero or negative at the call site.
 *
 * Both x and left_shift are parenthesized in the expansion so that expression
 * arguments (for example a + b, or s - 1) keep their grouping.
 *
 * @param[in] x                    Value to scale.
 * @param[in] quantized_multiplier Second operand handed to ASYMM_MULT.
 * @param[in] left_shift           Negated shift amount (the expansion shifts by -left_shift).
 * @param[in] size                 Vector width selecting the VEC_DATA_TYPE and the ASYMM_MULT variant.
 */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
    ASYMM_MULT((x) * ((VEC_DATA_TYPE(int, size))(1) << (-(left_shift))), quantized_multiplier, size)
/** ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE: multiply x by quantized_multiplier
 * with ASYMM_MULT, then pass the product and right_shift to ASYMM_ROUNDING_DIVIDE_BY_POW2.
 */
4065*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
4066*c217d954SCole Faust    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
/** Width-dispatching front ends: each macro below appends size to a function name
 * and forwards its remaining arguments unchanged.
 *
 * NOTE(review): only ASYMM_EXP_ON_NEGATIVE_VALUES, ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1
 * and ASYMM_RESCALE have a *_STR level. The others paste size directly, so their size
 * argument is not macro-expanded first; a macro name passed as size at a call site
 * would be pasted verbatim. Confirm that all callers pass an already-substituted width.
 */
4067*c217d954SCole Faust#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
4068*c217d954SCole Faust#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
4069*c217d954SCole Faust#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
4070*c217d954SCole Faust#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
4071*c217d954SCole Faust#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
4072*c217d954SCole Faust#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
4073*c217d954SCole Faust#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
4074*c217d954SCole Faust#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
4075*c217d954SCole Faust#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
4076*c217d954SCole Faust#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
4077*c217d954SCole Faust#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
4078*c217d954SCole Faust#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
4079*c217d954SCole Faust#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
4080*c217d954SCole Faust
/** Define multiply_by_quantized_multiplier<size>(input, qmul, shift).
 *
 * The sign of shift selects the direction of the power-of-two scaling:
 *  - shift > 0:  input is multiplied by (1 << shift) before ASYMM_MULT, and the
 *                final ASYMM_ROUNDING_DIVIDE_BY_POW2 receives an exponent of 0.
 *  - shift <= 0: input goes to ASYMM_MULT unscaled (multiplied by 1 << 0), and the
 *                product is passed to ASYMM_ROUNDING_DIVIDE_BY_POW2 with exponent -shift.
 *
 * @param[in] size Width suffix of the generated function and of its
 *                 VEC_DATA_TYPE(int, size) argument and return type.
 */
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)                                                                             \
    inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
    {                                                                                                                           \
        int up_shift   = 0;                                                                                                     \
        int down_shift = 0;                                                                                                     \
        if(shift > 0)                                                                                                           \
        {                                                                                                                       \
            up_shift = shift;                                                                                                   \
        }                                                                                                                       \
        else                                                                                                                    \
        {                                                                                                                       \
            down_shift = -shift;                                                                                                \
        }                                                                                                                       \
        const VEC_DATA_TYPE(int, size) scaled = input * (1 << up_shift);                                                        \
        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(scaled, qmul, size), down_shift, size);                                 \
    }
/** Width-dispatching front end: resolves to multiply_by_quantized_multiplier<size>(input, qmul, shift). */
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
4089*c217d954SCole Faust
/** Instantiations of QUANTIZE_IMPL (defined earlier in this file) per element type
 * and vector width. Presumably each line defines the quantize_<type><size> function
 * that QUANTIZE() resolves to.
 *
 * NOTE(review): the type set is not uniform across widths. Widths 1, 2, 3 and 8
 * list uchar, char, uint, int; width 16 additionally lists ushort and short; width 4
 * lists uchar, ushort, short, int and has no char or uint entry. QUANTIZE(..., char, 4)
 * and QUANTIZE(..., uint, 4) therefore name functions that are not instantiated in
 * this list. Confirm whether that is intentional.
 */
4090*c217d954SCole FaustQUANTIZE_IMPL(uchar, 1)
4091*c217d954SCole FaustQUANTIZE_IMPL(char, 1)
4092*c217d954SCole FaustQUANTIZE_IMPL(uint, 1)
4093*c217d954SCole FaustQUANTIZE_IMPL(int, 1)
4094*c217d954SCole FaustQUANTIZE_IMPL(uchar, 2)
4095*c217d954SCole FaustQUANTIZE_IMPL(char, 2)
4096*c217d954SCole FaustQUANTIZE_IMPL(uint, 2)
4097*c217d954SCole FaustQUANTIZE_IMPL(int, 2)
4098*c217d954SCole FaustQUANTIZE_IMPL(uchar, 3)
4099*c217d954SCole FaustQUANTIZE_IMPL(char, 3)
4100*c217d954SCole FaustQUANTIZE_IMPL(uint, 3)
4101*c217d954SCole FaustQUANTIZE_IMPL(int, 3)
4102*c217d954SCole FaustQUANTIZE_IMPL(uchar, 4)
4103*c217d954SCole FaustQUANTIZE_IMPL(ushort, 4)
4104*c217d954SCole FaustQUANTIZE_IMPL(short, 4)
4105*c217d954SCole FaustQUANTIZE_IMPL(int, 4)
4106*c217d954SCole FaustQUANTIZE_IMPL(uchar, 8)
4107*c217d954SCole FaustQUANTIZE_IMPL(char, 8)
4108*c217d954SCole FaustQUANTIZE_IMPL(uint, 8)
4109*c217d954SCole FaustQUANTIZE_IMPL(int, 8)
4110*c217d954SCole FaustQUANTIZE_IMPL(uchar, 16)
4111*c217d954SCole FaustQUANTIZE_IMPL(char, 16)
4112*c217d954SCole FaustQUANTIZE_IMPL(ushort, 16)
4113*c217d954SCole FaustQUANTIZE_IMPL(short, 16)
4114*c217d954SCole FaustQUANTIZE_IMPL(uint, 16)
4115*c217d954SCole FaustQUANTIZE_IMPL(int, 16)
4116*c217d954SCole Faust
/** Instantiations of DEQUANTIZE_IMPL (defined earlier in this file) per element type
 * and vector width. Presumably each line defines the dequantize_<type><size> function
 * that DEQUANTIZE() resolves to.
 *
 * NOTE(review): as with the QUANTIZE_IMPL list, width 4 lists uchar, ushort, short,
 * int and has no char or uint entry, while widths 1, 2, 3 and 8 list uchar, char,
 * uint, int and width 16 lists all six types. Confirm whether that is intentional.
 */
4117*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 1)
4118*c217d954SCole FaustDEQUANTIZE_IMPL(char, 1)
4119*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 1)
4120*c217d954SCole FaustDEQUANTIZE_IMPL(int, 1)
4121*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 2)
4122*c217d954SCole FaustDEQUANTIZE_IMPL(char, 2)
4123*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 2)
4124*c217d954SCole FaustDEQUANTIZE_IMPL(int, 2)
4125*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 3)
4126*c217d954SCole FaustDEQUANTIZE_IMPL(char, 3)
4127*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 3)
4128*c217d954SCole FaustDEQUANTIZE_IMPL(int, 3)
4129*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 4)
4130*c217d954SCole FaustDEQUANTIZE_IMPL(ushort, 4)
4131*c217d954SCole FaustDEQUANTIZE_IMPL(short, 4)
4132*c217d954SCole FaustDEQUANTIZE_IMPL(int, 4)
4133*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 8)
4134*c217d954SCole FaustDEQUANTIZE_IMPL(char, 8)
4135*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 8)
4136*c217d954SCole FaustDEQUANTIZE_IMPL(int, 8)
4137*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 16)
4138*c217d954SCole FaustDEQUANTIZE_IMPL(char, 16)
4139*c217d954SCole FaustDEQUANTIZE_IMPL(ushort, 16)
4140*c217d954SCole FaustDEQUANTIZE_IMPL(short, 16)
4141*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 16)
4142*c217d954SCole FaustDEQUANTIZE_IMPL(int, 16)
4143*c217d954SCole Faust
/** Instantiations of the fixed-point helper families for widths 1, 2, 3, 4, 8 and 16.
 *
 * Each *_IMPL(size) line is expected to emit the size-suffixed inline function that
 * the matching front-end macro (ASYMM_MULT, ASYMM_RESCALE, ...) resolves to.
 *
 * The order of the families matters: the generated functions call one another, so a
 * callee has to be instantiated before its caller. For example asymm_rescale<size>
 * calls asymm_saturating_rounding_mult_by_pow2<size>, and
 * multiply_by_quantized_multiplier<size> calls asymm_mult<size> and
 * asymm_rounding_divide_by_POW2_<size>.
 */
4144*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
4145*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
4146*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
4147*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
4148*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
4149*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
4150*c217d954SCole Faust
4151*c217d954SCole FaustASYMM_MULT_IMPL(1)
4152*c217d954SCole FaustASYMM_MULT_IMPL(2)
4153*c217d954SCole FaustASYMM_MULT_IMPL(3)
4154*c217d954SCole FaustASYMM_MULT_IMPL(4)
4155*c217d954SCole FaustASYMM_MULT_IMPL(8)
4156*c217d954SCole FaustASYMM_MULT_IMPL(16)
4157*c217d954SCole Faust
4158*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
4159*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
4160*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
4161*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
4162*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
4163*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
4164*c217d954SCole Faust
4165*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(1)
4166*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(2)
4167*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(3)
4168*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(4)
4169*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(8)
4170*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(16)
4171*c217d954SCole Faust
4172*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(1)
4173*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(2)
4174*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(3)
4175*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(4)
4176*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(8)
4177*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(16)
4178*c217d954SCole Faust
4179*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(1)
4180*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(2)
4181*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(3)
4182*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(4)
4183*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(8)
4184*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(16)
4185*c217d954SCole Faust
4186*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(1)
4187*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(2)
4188*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(3)
4189*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(4)
4190*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(8)
4191*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(16)
4192*c217d954SCole Faust
4193*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
4194*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
4195*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3)
4196*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
4197*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
4198*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
4199*c217d954SCole Faust
4200*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
4201*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
4202*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3)
4203*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
4204*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
4205*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
4206*c217d954SCole Faust
4207*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(1)
4208*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(2)
4209*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(3)
4210*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(4)
4211*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(8)
4212*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(16)
4213*c217d954SCole Faust
/* asymm_one_over_one_plus_x_for_x_in_0_1<size> uses the half-sum, mult and
 * saturating-mult-by-pow2 helpers instantiated above. */
4214*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1)
4215*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
4216*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3)
4217*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
4218*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
4219*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
4220*c217d954SCole Faust
4221*c217d954SCole FaustASYMM_RESCALE_IMPL(1)
4222*c217d954SCole FaustASYMM_RESCALE_IMPL(2)
4223*c217d954SCole FaustASYMM_RESCALE_IMPL(3)
4224*c217d954SCole FaustASYMM_RESCALE_IMPL(4)
4225*c217d954SCole FaustASYMM_RESCALE_IMPL(8)
4226*c217d954SCole FaustASYMM_RESCALE_IMPL(16)
4227*c217d954SCole Faust
4228*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
4229*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
4230*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3)
4231*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
4232*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
4233*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
4234*c217d954SCole Faust
4235*c217d954SCole Faust#endif
4236*c217d954SCole Faust
4237*c217d954SCole Faust#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
4238*c217d954SCole Faust#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
4239*c217d954SCole Faust
4240*c217d954SCole Faust
4241*c217d954SCole Faust
4242*c217d954SCole Faust
/** Lookup table TILE_VECTOR_SIZE<W>: number of scalar slots reserved for one tile
 * row of logical width W (1..16). TILE_STR uses it as the length of the s[] array.
 *
 * Widths 1 to 4 map to themselves, 5 to 8 map to 8, and 9 to 16 map to 16, which
 * matches the vector type selected by TILE_VECTOR_TYPE<W> for the same width.
 */
4243*c217d954SCole Faust#define TILE_VECTOR_SIZE1 1
4244*c217d954SCole Faust#define TILE_VECTOR_SIZE2 2
4245*c217d954SCole Faust#define TILE_VECTOR_SIZE3 3
4246*c217d954SCole Faust#define TILE_VECTOR_SIZE4 4
4247*c217d954SCole Faust#define TILE_VECTOR_SIZE5 8
4248*c217d954SCole Faust#define TILE_VECTOR_SIZE6 8
4249*c217d954SCole Faust#define TILE_VECTOR_SIZE7 8
4250*c217d954SCole Faust#define TILE_VECTOR_SIZE8 8
4251*c217d954SCole Faust#define TILE_VECTOR_SIZE9 16
4252*c217d954SCole Faust#define TILE_VECTOR_SIZE10 16
4253*c217d954SCole Faust#define TILE_VECTOR_SIZE11 16
4254*c217d954SCole Faust#define TILE_VECTOR_SIZE12 16
4255*c217d954SCole Faust#define TILE_VECTOR_SIZE13 16
4256*c217d954SCole Faust#define TILE_VECTOR_SIZE14 16
4257*c217d954SCole Faust#define TILE_VECTOR_SIZE15 16
4258*c217d954SCole Faust#define TILE_VECTOR_SIZE16 16
4259*c217d954SCole Faust
/** Lookup table TILE_VECTOR_TYPE<W>(DATA_TYPE): type name used for the vector view
 * of one tile row of logical width W (1..16).
 *
 * The result is DATA_TYPE with a lane count pasted on: 1, 2, 3 or 4 for those
 * widths, 8 for widths 5 to 8, and 16 for widths 9 to 16. For example
 * TILE_VECTOR_TYPE5(float) yields float8.
 *
 * Width 1 yields DATA_TYPE##1 (for example float1); presumably such one-lane
 * names are provided as typedefs elsewhere - TODO confirm.
 * DATA_TYPE is pasted with ##, so it must already be the final type token.
 */
4260*c217d954SCole Faust#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
4261*c217d954SCole Faust#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
4262*c217d954SCole Faust#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
4263*c217d954SCole Faust#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
4264*c217d954SCole Faust#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
4265*c217d954SCole Faust#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
4266*c217d954SCole Faust#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
4267*c217d954SCole Faust#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
4268*c217d954SCole Faust#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
4269*c217d954SCole Faust#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
4270*c217d954SCole Faust#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
4271*c217d954SCole Faust#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
4272*c217d954SCole Faust#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
4273*c217d954SCole Faust#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
4274*c217d954SCole Faust#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
4275*c217d954SCole Faust#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16
4276*c217d954SCole Faust
4277*c217d954SCole Faust
/** Declare a tile named BASENAME: an array of H rows, each row a union with two views
 * of the same storage.
 *
 *  - s: scalar array of TILE_VECTOR_SIZE<W> elements of DATA_TYPE
 *  - v: one vector of type TILE_VECTOR_TYPE<W>(DATA_TYPE)
 *
 * TILE forwards to TILE_STR so that H and W are macro-expanded before TILE_STR
 * pastes the width onto the TILE_VECTOR_SIZE / TILE_VECTOR_TYPE table names.
 * After expansion W must be one of the widths those tables cover (1..16).
 *
 * The expansion is a declaration without a trailing semicolon; the caller supplies it.
 */
#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
#define TILE_STR(ELEM_T, ROWS, COLS, NAME) \
    union                                  \
    {                                      \
        ELEM_T s[TILE_VECTOR_SIZE##COLS];  \
        TILE_VECTOR_TYPE##COLS(ELEM_T) v;  \
    } NAME[ROWS]
4284*c217d954SCole Faust
/** Parameter-list fragment for a tensor argument that has an image view.
 *
 * Expands to a comma-separated list of parameter declarations whose names are all
 * prefixed with name: a __read_only image2d_t name_img, followed by exactly the
 * parameters of TENSOR4D_BUFFER(name) (pointer, stride/step pairs for x, y, z, w,
 * and the offset of the first element in bytes).
 */
4285*c217d954SCole Faust#define TENSOR4D_IMAGE(name)          \
4286*c217d954SCole Faust    __read_only image2d_t name##_img, \
4287*c217d954SCole Faust    __global uchar *name##_ptr,       \
4288*c217d954SCole Faust    uint            name##_stride_x,  \
4289*c217d954SCole Faust    uint            name##_step_x,    \
4290*c217d954SCole Faust    uint            name##_stride_y,  \
4291*c217d954SCole Faust    uint            name##_step_y,    \
4292*c217d954SCole Faust    uint            name##_stride_z,  \
4293*c217d954SCole Faust    uint            name##_step_z,    \
4294*c217d954SCole Faust    uint            name##_stride_w,  \
4295*c217d954SCole Faust    uint            name##_step_w,    \
4296*c217d954SCole Faust    uint            name##_offset_first_element_in_bytes
4297*c217d954SCole Faust
/** Parameter-list fragment for a buffer-only tensor argument: a __global uchar
 * pointer name_ptr, a stride and a step for each of x, y, z and w, and
 * name_offset_first_element_in_bytes. There is no trailing comma.
 */
4298*c217d954SCole Faust#define TENSOR4D_BUFFER(name)    \
4299*c217d954SCole Faust    __global uchar *name##_ptr,  \
4300*c217d954SCole Faust    uint        name##_stride_x, \
4301*c217d954SCole Faust    uint        name##_step_x,   \
4302*c217d954SCole Faust    uint        name##_stride_y, \
4303*c217d954SCole Faust    uint        name##_step_y,   \
4304*c217d954SCole Faust    uint        name##_stride_z, \
4305*c217d954SCole Faust    uint        name##_step_z,   \
4306*c217d954SCole Faust    uint        name##_stride_w, \
4307*c217d954SCole Faust    uint        name##_step_w,   \
4308*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4309*c217d954SCole Faust
/** TENSOR4D(name, type) selects TENSOR4D_<type>(name); with the macros defined here,
 * type must expand to IMAGE or BUFFER. The TENSOR4D_STR level exists so that a macro
 * passed as type is expanded before it is pasted.
 */
4310*c217d954SCole Faust#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
4311*c217d954SCole Faust#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
4312*c217d954SCole Faust
/** Parameter-list fragment for a 4D tensor argument with an image view.
 *
 * Expands to a __read_only image2d_t name_img followed by the same parameters as
 * TENSOR4D_T_BUFFER(name). The resulting list is identical to the one produced by
 * TENSOR4D_RO_T_IMAGE(name).
 */
4313*c217d954SCole Faust#define TENSOR4D_T_IMAGE(name)          \
4314*c217d954SCole Faust    __read_only image2d_t name##_img, \
4315*c217d954SCole Faust    __global uchar *name##_ptr,       \
4316*c217d954SCole Faust    uint        name##_stride_y, \
4317*c217d954SCole Faust    uint        name##_stride_z, \
4318*c217d954SCole Faust    uint        name##_stride_w, \
4319*c217d954SCole Faust    uint        name##_c,   \
4320*c217d954SCole Faust    uint        name##_w,   \
4321*c217d954SCole Faust    uint        name##_h,   \
4322*c217d954SCole Faust    uint        name##_n,   \
4323*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4324*c217d954SCole Faust
/** Parameter-list fragment for a buffer-only 4D tensor argument: a __global uchar
 * pointer name_ptr, strides for y, z and w (no x stride and no step values), four
 * uint values name_c, name_w, name_h, name_n (presumably the tensor dimensions -
 * TODO confirm), and name_offset_first_element_in_bytes.
 */
4325*c217d954SCole Faust#define TENSOR4D_T_BUFFER(name)    \
4326*c217d954SCole Faust    __global uchar *name##_ptr,  \
4327*c217d954SCole Faust    uint        name##_stride_y, \
4328*c217d954SCole Faust    uint        name##_stride_z, \
4329*c217d954SCole Faust    uint        name##_stride_w, \
4330*c217d954SCole Faust    uint        name##_c,   \
4331*c217d954SCole Faust    uint        name##_w,   \
4332*c217d954SCole Faust    uint        name##_h,   \
4333*c217d954SCole Faust    uint        name##_n,   \
4334*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4335*c217d954SCole Faust
/** TENSOR4D_T(name, type) selects TENSOR4D_T_<type>(name); with the macros defined
 * here, type must expand to IMAGE or BUFFER. The TENSOR4D_T_STR level lets a macro
 * passed as type be expanded before it is pasted.
 */
4336*c217d954SCole Faust#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
4337*c217d954SCole Faust
4338*c217d954SCole Faust
4339*c217d954SCole Faust#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
4340*c217d954SCole Faust
/** Read-only and write-only variants of the 4D tensor parameter list.
 *
 * The *_IMAGE forms prepend an image2d_t name_img parameter (qualified __read_only
 * for RO, __write_only for WO) to TENSOR4D_T_BUFFER(name). The *_BUFFER forms are
 * plain aliases of TENSOR4D_T_BUFFER(name), so for buffers the RO and WO variants
 * expand to the same list.
 *
 * TENSOR4D_RO_T(name, type) and TENSOR4D_WO_T(name, type) select the IMAGE or BUFFER
 * form by pasting type; the *_STR level lets a macro passed as type be expanded first.
 */
4341*c217d954SCole Faust#define TENSOR4D_RO_T_IMAGE(name)          \
4342*c217d954SCole Faust    __read_only image2d_t name##_img, \
4343*c217d954SCole Faust    TENSOR4D_T_BUFFER(name)
4344*c217d954SCole Faust
4345*c217d954SCole Faust#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
4346*c217d954SCole Faust
4347*c217d954SCole Faust#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)
4348*c217d954SCole Faust
4349*c217d954SCole Faust
4350*c217d954SCole Faust#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
4351*c217d954SCole Faust
4352*c217d954SCole Faust#define TENSOR4D_WO_T_IMAGE(name)          \
4353*c217d954SCole Faust    __write_only image2d_t name##_img, \
4354*c217d954SCole Faust    TENSOR4D_T_BUFFER(name)
4355*c217d954SCole Faust
4356*c217d954SCole Faust#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
4357*c217d954SCole Faust
4358*c217d954SCole Faust#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)
4359*c217d954SCole Faust
4360*c217d954SCole Faust
4361*c217d954SCole Faust#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
4362*c217d954SCole Faust
/** Parameter-list fragment for a 3D tensor argument with an image view: a
 * __read_only image2d_t name_img followed by the same parameters as
 * TENSOR3D_T_BUFFER(name).
 */
4363*c217d954SCole Faust#define TENSOR3D_T_IMAGE(name)          \
4364*c217d954SCole Faust    __read_only image2d_t name##_img, \
4365*c217d954SCole Faust    __global uchar *name##_ptr,       \
4366*c217d954SCole Faust    uint        name##_stride_y, \
4367*c217d954SCole Faust    uint        name##_stride_z, \
4368*c217d954SCole Faust    uint        name##_w,   \
4369*c217d954SCole Faust    uint        name##_h,   \
4370*c217d954SCole Faust    uint        name##_n,   \
4371*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4372*c217d954SCole Faust
/** Parameter-list fragment for a buffer-only 3D tensor argument: a __global uchar
 * pointer name_ptr, strides for y and z, three uint values name_w, name_h, name_n
 * (presumably the tensor dimensions - TODO confirm), and
 * name_offset_first_element_in_bytes. Compared with TENSOR4D_T_BUFFER there is no
 * name_stride_w and no name_c.
 */
4373*c217d954SCole Faust#define TENSOR3D_T_BUFFER(name)    \
4374*c217d954SCole Faust    __global uchar *name##_ptr,  \
4375*c217d954SCole Faust    uint        name##_stride_y, \
4376*c217d954SCole Faust    uint        name##_stride_z, \
4377*c217d954SCole Faust    uint        name##_w,   \
4378*c217d954SCole Faust    uint        name##_h,   \
4379*c217d954SCole Faust    uint        name##_n,   \
4380*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4381*c217d954SCole Faust
/** TENSOR3D_T(name, type) selects TENSOR3D_T_<type>(name); with the macros defined
 * here, type must expand to IMAGE or BUFFER.
 */
4382*c217d954SCole Faust#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
4383*c217d954SCole Faust#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
4384*c217d954SCole Faust
4385*c217d954SCole Faust#if !defined(UNROLL_WITH_PRAGMA)
/** Manual loop unrolling by macro recursion, compiled only when UNROLL_WITH_PRAGMA
 * is not defined (see the enclosing #if).
 *
 * UNROLL_INCR(idx, step, macro) expands to two statements: idx += (step); then (macro).
 * LOOP_UNROLLING_1 expands to (macro) alone, and LOOP_UNROLLING_N expands to
 * LOOP_UNROLLING_(N-1) followed by one UNROLL_INCR. The net expansion is N copies
 * of (macro) with idx advanced by step between consecutive copies.
 *
 * idx is neither declared nor initialised by these macros. The expansion is a
 * sequence of statements without enclosing braces and without a final semicolon,
 * so it has to be used where a statement sequence is valid.
 */
4386*c217d954SCole Faust#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
4387*c217d954SCole Faust
4388*c217d954SCole Faust#define LOOP_UNROLLING_1(idx, step, macro) (macro)
4389*c217d954SCole Faust#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
4390*c217d954SCole Faust#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
4391*c217d954SCole Faust#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
4392*c217d954SCole Faust#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
4393*c217d954SCole Faust#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
4394*c217d954SCole Faust#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
4395*c217d954SCole Faust#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
4396*c217d954SCole Faust#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
4397*c217d954SCole Faust#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
4398*c217d954SCole Faust#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
4399*c217d954SCole Faust#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
4400*c217d954SCole Faust#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
4401*c217d954SCole Faust#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
4402*c217d954SCole Faust#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
4403*c217d954SCole Faust#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
4404*c217d954SCole Faust#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
4405*c217d954SCole Faust#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
4406*c217d954SCole Faust#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
4407*c217d954SCole Faust#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
4408*c217d954SCole Faust#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
4409*c217d954SCole Faust#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
4410*c217d954SCole Faust#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
4411*c217d954SCole Faust#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
4412*c217d954SCole Faust#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
4413*c217d954SCole Faust#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
4414*c217d954SCole Faust#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
4415*c217d954SCole Faust#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
4416*c217d954SCole Faust#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
4417*c217d954SCole Faust#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
4418*c217d954SCole Faust#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
4419*c217d954SCole Faust#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
4420*c217d954SCole Faust#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
4421*c217d954SCole Faust#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
4422*c217d954SCole Faust#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
4423*c217d954SCole Faust#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
4424*c217d954SCole Faust#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
4425*c217d954SCole Faust#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
4426*c217d954SCole Faust#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
4427*c217d954SCole Faust#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
4428*c217d954SCole Faust#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
4429*c217d954SCole Faust#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
4430*c217d954SCole Faust#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
4431*c217d954SCole Faust#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
4432*c217d954SCole Faust#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
4433*c217d954SCole Faust#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
4434*c217d954SCole Faust#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
4435*c217d954SCole Faust#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
4436*c217d954SCole Faust#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
4437*c217d954SCole Faust#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
4438*c217d954SCole Faust#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
4439*c217d954SCole Faust#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
4440*c217d954SCole Faust#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
4441*c217d954SCole Faust#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
4442*c217d954SCole Faust#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
4443*c217d954SCole Faust#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
4444*c217d954SCole Faust#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
4445*c217d954SCole Faust#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
4446*c217d954SCole Faust#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
4447*c217d954SCole Faust#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
4448*c217d954SCole Faust#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
4449*c217d954SCole Faust#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
4450*c217d954SCole Faust#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
4451*c217d954SCole Faust#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
4452*c217d954SCole Faust#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
4453*c217d954SCole Faust#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
4454*c217d954SCole Faust#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
4455*c217d954SCole Faust#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
4456*c217d954SCole Faust#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
4457*c217d954SCole Faust#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
4458*c217d954SCole Faust#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
4459*c217d954SCole Faust#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
4460*c217d954SCole Faust#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
4461*c217d954SCole Faust#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
4462*c217d954SCole Faust#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
4463*c217d954SCole Faust#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
4464*c217d954SCole Faust#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
4465*c217d954SCole Faust#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
4466*c217d954SCole Faust#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
4467*c217d954SCole Faust#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
4468*c217d954SCole Faust#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
4469*c217d954SCole Faust#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
4470*c217d954SCole Faust#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
4471*c217d954SCole Faust#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
4472*c217d954SCole Faust#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
4473*c217d954SCole Faust#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
4474*c217d954SCole Faust#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
4475*c217d954SCole Faust#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
4476*c217d954SCole Faust#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
4477*c217d954SCole Faust#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
4478*c217d954SCole Faust#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
4479*c217d954SCole Faust#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
4480*c217d954SCole Faust#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
4481*c217d954SCole Faust#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
4482*c217d954SCole Faust#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
4483*c217d954SCole Faust#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
4484*c217d954SCole Faust#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
4485*c217d954SCole Faust#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
4486*c217d954SCole Faust#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
4487*c217d954SCole Faust#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
4488*c217d954SCole Faust#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
4489*c217d954SCole Faust#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
4490*c217d954SCole Faust#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
4491*c217d954SCole Faust#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
4492*c217d954SCole Faust#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
4493*c217d954SCole Faust#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
4494*c217d954SCole Faust#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
4495*c217d954SCole Faust#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
4496*c217d954SCole Faust#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
4497*c217d954SCole Faust#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
4498*c217d954SCole Faust#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
4499*c217d954SCole Faust#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
4500*c217d954SCole Faust#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
4501*c217d954SCole Faust#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
4502*c217d954SCole Faust#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
4503*c217d954SCole Faust#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
4504*c217d954SCole Faust#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
4505*c217d954SCole Faust#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
4506*c217d954SCole Faust#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
4507*c217d954SCole Faust#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
4508*c217d954SCole Faust#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
4509*c217d954SCole Faust#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
4510*c217d954SCole Faust#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
4511*c217d954SCole Faust#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
4512*c217d954SCole Faust#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
4513*c217d954SCole Faust#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
4514*c217d954SCole Faust#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
4515*c217d954SCole Faust#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)
4516*c217d954SCole Faust
/** Ladder-based implementation of LOOP_UNROLLING (selected by the preprocessor conditional that encloses it).
 *
 * Opens a new scope, declares the loop variable and expands the LOOP_UNROLLING_<num> ladder macro.
 * NOTE(review): the ladder presumably repeats @p macro @p num times, advancing @p idx by @p step between
 * repetitions; UNROLL_INCR and the first ladder rungs are defined outside this excerpt - confirm there.
 *
 * @param[in] type  Type of the loop variable
 * @param[in] idx   Name of the loop variable
 * @param[in] start Value the loop variable is initialised with
 * @param[in] step  Step forwarded to the ladder macro
 * @param[in] num   Suffix pasted onto LOOP_UNROLLING_. It is pasted exactly as written, so when this macro is
 *                  invoked directly it must be an integer literal for which a ladder macro exists
 * @param[in] macro Code forwarded to the ladder macro
 */
4517*c217d954SCole Faust#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
4518*c217d954SCole Faust    {                                                          \
4519*c217d954SCole Faust        type idx = start;                                      \
4520*c217d954SCole Faust        LOOP_UNROLLING_##num(idx, step, macro);                \
4521*c217d954SCole Faust    }
4522*c217d954SCole Faust#else
/** Pragma-based implementation of LOOP_UNROLLING: a real for-loop annotated with "#pragma unroll".
 *
 * Executes @p macro exactly @p num times with @p idx taking the values
 * start, start + step, ..., start + (num - 1) * step.
 *
 * The upper bound includes @p start, so the trip count is @p num for any start value (a bound of
 * num * step alone is only correct when start is 0). All arguments are parenthesised so that
 * expression arguments keep their intended precedence.
 *
 * @note @p start, @p num and @p step are re-evaluated on every iteration; pass side-effect-free expressions.
 * @note @p step is expected to be positive.
 *
 * @param[in] type  Type of the loop variable
 * @param[in] idx   Name of the loop variable; visible inside @p macro
 * @param[in] start First value of the loop variable
 * @param[in] step  Increment applied after every iteration
 * @param[in] num   Number of iterations
 * @param[in] macro Code block executed on every iteration
 */
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro)                   \
    {                                                                            \
        _Pragma("unroll")                                                        \
        for(type idx = (start); idx < ((start) + (num) * (step)); idx += (step)) \
        {                                                                        \
            (macro);                                                             \
        }                                                                        \
    }
4531*c217d954SCole Faust#endif
/** Public entry point for loop unrolling.
 *
 * Forwards every argument unchanged to LOOP_UNROLLING_STR. The extra level of indirection lets the
 * preprocessor expand the arguments first, so @p num may itself be a macro.
 *
 * @param[in] type  Type of the loop variable
 * @param[in] idx   Name of the loop variable
 * @param[in] start Initial value of the loop variable
 * @param[in] step  Loop increment
 * @param[in] num   Number of iterations
 * @param[in] macro Code to execute on every iteration
 */
4532*c217d954SCole Faust#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
4533*c217d954SCole Faust
4534*c217d954SCole Faust
/** Compute the starting index of a work-item along one dimension.
 *
 * Evaluates get_global_id(IDX) * N0 - ((N0 - PARTIAL_N0) % N0), converts the result to int and
 * clamps it to a minimum of 0.
 *
 * Every macro argument is parenthesised so that expression arguments (for example "2 + 2") keep
 * their intended precedence inside the multiplication, subtraction and modulo.
 *
 * @param[in] IDX        Dimension passed to get_global_id()
 * @param[in] N0         Multiplier applied to the global id (must be non-zero, it is used as modulo divisor)
 * @param[in] PARTIAL_N0 Value subtracted from @p N0 before the modulo is taken
 */
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * (N0) - ((N0) - (PARTIAL_N0)) % (N0)), 0))
4536*c217d954SCole Faust
4537*c217d954SCole Faust
/** Integer dot product with accumulation: dispatches to DOT_PRODUCT<K0>_INTEGER8.
 *
 * DOT_PRODUCT_INTEGER8 forwards to DOT_PRODUCT_INTEGER8_STR so that K0 is macro-expanded before it is
 * token-pasted into the name of the size-specific macro.
 *
 * @param[in]      A_DATA_TYPE Element type of @p a
 * @param[in]      B_DATA_TYPE Element type of @p b
 * @param[in]      C_DATA_TYPE Type of the accumulator @p c
 * @param[in]      K0          Number of elements to multiply; selects the DOT_PRODUCT<K0>_INTEGER8 macro
 * @param[in]      a           First operand
 * @param[in]      b           Second operand
 * @param[in, out] c           Accumulator updated in place
 */
4538*c217d954SCole Faust#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
4539*c217d954SCole Faust#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
/* Single-element case: both operands are converted to C_DATA_TYPE before the multiplication and the
 * product is added to c. A_DATA_TYPE and B_DATA_TYPE are not used here. */
4540*c217d954SCole Faust#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4541*c217d954SCole Faust    ({                                                \
4542*c217d954SCole Faust        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);     \
4543*c217d954SCole Faust    })
/** 2-, 3- and 4-element integer dot products with accumulation into c.
 *
 * Four alternative implementations are selected at preprocessing time:
 *  -# ARM_COMPUTE_OPENCL_DOT8_ENABLED and cl_khr_integer_dot_product defined: dot()
 *  -# ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED and cl_arm_integer_dot_product_accumulate_int8 defined: arm_dot_acc(),
 *     which receives the accumulator as third argument and whose result is assigned back to c
 *  -# ARM_COMPUTE_OPENCL_DOT8_ENABLED and cl_arm_integer_dot_product_int8 defined: arm_dot()
 *  -# otherwise: one scalar multiply-accumulate per component, with both factors converted to C_DATA_TYPE
 *
 * In the first three variants the 2- and 3-element forms build 4-element vectors by appending zeros to
 * the selected components, so the 4-element built-in can be used for them as well.
 */
4544*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
4545*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
4546*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
4547*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
4548*c217d954SCole Faust#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
4549*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
4550*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
4551*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
4552*c217d954SCole Faust#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
4553*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
4554*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
4555*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
4556*c217d954SCole Faust#else
/* Portable fallback: no dot-product built-in is used, each component pair is multiplied and accumulated
 * separately. The 3-element form reuses the 2-element form for components 0 and 1. */
4557*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
4558*c217d954SCole Faust    ({                                                  \
4559*c217d954SCole Faust        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
4560*c217d954SCole Faust        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
4561*c217d954SCole Faust    })
4562*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
4563*c217d954SCole Faust    ({                                                  \
4564*c217d954SCole Faust        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c);  \
4565*c217d954SCole Faust        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
4566*c217d954SCole Faust    })
/* Same contract as the other DOT_PRODUCT4_INTEGER8 variants; the parameters are merely named
 * x, y and val instead of a, b and c. */
4567*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val)   \
4568*c217d954SCole Faust    ({                                                    \
4569*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
4570*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
4571*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
4572*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
4573*c217d954SCole Faust    })
4574*c217d954SCole Faust#endif
/** 5- to 12-element integer dot products with accumulation into c.
 *
 * Each size is composed from smaller dot products applied to component sub-ranges of a and b:
 *  - 5, 6, 7   : components 0-3 through DOT_PRODUCT4_INTEGER8, then the remaining 1, 2 or 3 components
 *  - 8         : the .lo and .hi halves, each through DOT_PRODUCT4_INTEGER8
 *  - 9 ... 12  : components 0-7 through DOT_PRODUCT8_INTEGER8, then the remaining 1 to 4 components
 *
 * The component selectors used (up to .s6 for sizes 5-7, up to .sB for sizes 9-12) determine the minimum
 * number of components a and b must have.
 */
4575*c217d954SCole Faust#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4576*c217d954SCole Faust    ({                                                \
4577*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
4578*c217d954SCole Faust        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);     \
4579*c217d954SCole Faust    })
4580*c217d954SCole Faust#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4581*c217d954SCole Faust    ({                                                \
4582*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
4583*c217d954SCole Faust        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
4584*c217d954SCole Faust    })
4585*c217d954SCole Faust#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4586*c217d954SCole Faust    ({                                                \
4587*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
4588*c217d954SCole Faust        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);     \
4589*c217d954SCole Faust    })
4590*c217d954SCole Faust#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4591*c217d954SCole Faust    ({                                                \
4592*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);     \
4593*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);     \
4594*c217d954SCole Faust    })
4595*c217d954SCole Faust#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4596*c217d954SCole Faust    ({                                                \
4597*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
4598*c217d954SCole Faust        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);     \
4599*c217d954SCole Faust    })
4600*c217d954SCole Faust#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4601*c217d954SCole Faust    ({                                                \
4602*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
4603*c217d954SCole Faust        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);     \
4604*c217d954SCole Faust    })
4605*c217d954SCole Faust#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4606*c217d954SCole Faust    ({                                                \
4607*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
4608*c217d954SCole Faust        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);     \
4609*c217d954SCole Faust    })
4610*c217d954SCole Faust#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4611*c217d954SCole Faust    ({                                                \
4612*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
4613*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
4614*c217d954SCole Faust    })
/** 13-, 14- and 15-element integer dot products with accumulation into c.
 *
 * Each size is split into three parts that are accumulated in ascending component order:
 *  - components 0-7 through DOT_PRODUCT8_INTEGER8
 *  - components 8-B through DOT_PRODUCT4_INTEGER8
 *  - the remaining 1, 2 or 3 components (from .sC onwards) through DOT_PRODUCT1/2/3_INTEGER8
 *
 * Only 1-, 2-, 3-, 4- and 8-component selections are used. OpenCL C defines vector types with 2, 3, 4, 8
 * and 16 components only, so selections of 5, 6 or 7 components (e.g. .s89ABC) have no matching vector
 * type and are accepted only by lenient compilers.
 *
 * a and b must be 16-component vectors.
 *
 * @param[in]      A_DATA_TYPE Element type of @p a
 * @param[in]      B_DATA_TYPE Element type of @p b
 * @param[in]      C_DATA_TYPE Type of the accumulator @p c
 * @param[in]      a           First operand
 * @param[in]      b           Second operand
 * @param[in, out] c           Accumulator updated in place
 */
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);         \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sC), ((b).sC), c);               \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);         \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCD), ((b).sCD), c);             \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);         \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCDE), ((b).sCDE), c);           \
    })
/** 16-element integer dot product with accumulation into c.
 *
 * Applies DOT_PRODUCT8_INTEGER8 to the .lo half and then to the .hi half of a and b.
 *
 * @param[in]      A_DATA_TYPE Element type of @p a
 * @param[in]      B_DATA_TYPE Element type of @p b
 * @param[in]      C_DATA_TYPE Type of the accumulator @p c
 * @param[in]      a           First operand
 * @param[in]      b           Second operand
 * @param[in, out] c           Accumulator updated in place
 */
4630*c217d954SCole Faust#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
4631*c217d954SCole Faust    ({                                                 \
4632*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);      \
4633*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);      \
4634*c217d954SCole Faust    })
4635*c217d954SCole Faust
4636*c217d954SCole Faust
/** Add the elements of an integer vector to an accumulator.
 *
 * Implemented as DOT_PRODUCT_INTEGER8 of @p a with the constant 1 cast to TILE_VECTOR_TYPE<K0>(B_DATA_TYPE).
 * REDUCE_INTEGER8 forwards to REDUCE_INTEGER8_STR so that K0 is macro-expanded before it is token-pasted.
 * NOTE(review): TILE_VECTOR_TYPE<K0> is defined outside this excerpt; it presumably yields a vector type
 * wide enough for K0 elements - confirm.
 *
 * @param[in]      A_DATA_TYPE Element type of @p a
 * @param[in]      B_DATA_TYPE Element type used for the vector of ones
 * @param[in]      C_DATA_TYPE Type of the accumulator @p c
 * @param[in]      K0          Number of elements to reduce
 * @param[in]      a           Vector to reduce
 * @param[in, out] c           Accumulator updated in place
 */
4637*c217d954SCole Faust#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
4638*c217d954SCole Faust#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
4639*c217d954SCole Faust
4640*c217d954SCole Faust
/** Load a vector from a tensor; the expression evaluates to the loaded value.
 *
 * V_LOAD forwards to V_LOAD_STR, which pastes TENSOR_TYPE onto V_LOAD_ and therefore selects
 * V_LOAD_BUFFER or V_LOAD_IMAGE.
 *
 *  - V_LOAD_BUFFER reads with VLOAD(WIDTH) from the byte address
 *    TENSOR_ptr + TENSOR_offset_first_element_in_bytes + X * sizeof(DATA_TYPE) + Y * STRIDE_Y
 *  - V_LOAD_IMAGE reads through READ_IMAGE2D from TENSOR_img at coordinates (X / 4, Y); STRIDE_Y is not used.
 *    NOTE(review): the division by 4 presumably reflects four elements per image pixel - confirm.
 *
 * @param[in] DATA_TYPE   Element data type
 * @param[in] WIDTH       Number of elements to load
 * @param[in] TENSOR_TYPE Suffix selecting the implementation (BUFFER or IMAGE)
 * @param[in] TENSOR      Tensor basename; _ptr, _offset_first_element_in_bytes or _img is appended to it
 * @param[in] X           X coordinate, in elements
 * @param[in] Y           Y coordinate (row index)
 * @param[in] STRIDE_Y    Stride of one row in bytes (buffer variant only)
 */
4641*c217d954SCole Faust#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
4642*c217d954SCole Faust#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
4643*c217d954SCole Faust#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
4644*c217d954SCole Faust    VLOAD(WIDTH)                                                \
4645*c217d954SCole Faust    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
4646*c217d954SCole Faust#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
4647*c217d954SCole Faust
4648*c217d954SCole Faust
/** Store a vector to a tensor.
 *
 * V_STORE forwards to V_STORE_STR, which pastes TENSOR_TYPE onto V_STORE_ and therefore selects
 * V_STORE_BUFFER or V_STORE_IMAGE.
 *
 *  - V_STORE_BUFFER writes with VSTORE(WIDTH) to the byte address
 *    TENSOR_ptr + TENSOR_offset_first_element_in_bytes + X * sizeof(DATA_TYPE) + Y * STRIDE_Y
 *  - V_STORE_IMAGE writes through WRITE_IMAGE2D to TENSOR_img at coordinates (X / 4, Y); STRIDE_Y is not used.
 *    NOTE(review): the division by 4 presumably reflects four elements per image pixel - confirm.
 *
 * @param[in] DATA_TYPE   Element data type
 * @param[in] WIDTH       Number of elements to store
 * @param[in] TENSOR_TYPE Suffix selecting the implementation (BUFFER or IMAGE)
 * @param[in] TENSOR      Tensor basename; _ptr, _offset_first_element_in_bytes or _img is appended to it
 * @param[in] X           X coordinate, in elements
 * @param[in] Y           Y coordinate (row index)
 * @param[in] STRIDE_Y    Stride of one row in bytes (buffer variant only)
 * @param[in] VALUES      Vector to store
 */
4649*c217d954SCole Faust#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
4650*c217d954SCole Faust#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
4651*c217d954SCole Faust#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
4652*c217d954SCole Faust    VSTORE(WIDTH)                                                \
4653*c217d954SCole Faust    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
4654*c217d954SCole Faust#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
4655*c217d954SCole Faust
4656*c217d954SCole Faust
/** Load a tile of HEIGHT rows from a tensor.
 *
 * For every _i in [0, HEIGHT) a vector of WIDTH elements is read with V_LOAD at column X and row
 * Y + _i * YI_MULTIPLIER and written to dst[_i].v.
 *
 * @param[in]  DATA_TYPE     Element data type
 * @param[in]  HEIGHT        Number of rows to load
 * @param[in]  WIDTH         Number of elements per row
 * @param[in]  TENSOR_TYPE   Tensor type forwarded to V_LOAD (BUFFER or IMAGE)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  X             Starting X coordinate
 * @param[in]  Y             Row of the first vector
 * @param[in]  YI_MULTIPLIER Row distance between consecutive vectors; converted to int
 * @param[in]  STRIDE_Y      Row stride forwarded to V_LOAD
 * @param[out] dst           Destination tile; indexed by row, written through its .v member
 */
4657*c217d954SCole Faust#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
4658*c217d954SCole Faust    ({                                                                                                                 \
4659*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
4660*c217d954SCole Faust        {                                                                                                              \
4661*c217d954SCole Faust            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
4662*c217d954SCole Faust        })                                                                                                             \
4663*c217d954SCole Faust    })
4664*c217d954SCole Faust
4665*c217d954SCole Faust
/** Load a tile of HEIGHT rows whose row coordinates come from an index tile.
 *
 * For every _i in [0, HEIGHT) a vector of WIDTH elements is read with V_LOAD at column X and row
 * indirect_y[_i].v and written to dst[_i].v.
 *
 * @param[in]  DATA_TYPE   Element data type
 * @param[in]  HEIGHT      Number of rows to load
 * @param[in]  WIDTH       Number of elements per row
 * @param[in]  TENSOR_TYPE Tensor type forwarded to V_LOAD (BUFFER or IMAGE)
 * @param[in]  TENSOR      Tensor basename
 * @param[in]  X           Starting X coordinate
 * @param[in]  STRIDE_Y    Row stride forwarded to V_LOAD
 * @param[in]  indirect_y  Tile holding one row coordinate per destination row (read through .v)
 * @param[out] dst         Destination tile; indexed by row, written through its .v member
 */
4666*c217d954SCole Faust#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst)    \
4667*c217d954SCole Faust    ({                                                                                                  \
4668*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                           \
4669*c217d954SCole Faust        {                                                                                               \
4670*c217d954SCole Faust            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
4671*c217d954SCole Faust        })                                                                                              \
4672*c217d954SCole Faust    })
4673*c217d954SCole Faust
4674*c217d954SCole Faust
/** Indirect tile load that switches between two load widths at run time.
 *
 * Rows are visited from HEIGHT - 1 down to 0; the row coordinate of row r is indirect_y[r].v.
 *  - If WIDTH1_CONDITION is true, each row is loaded with VLOAD_PARTIAL(WIDTH0, WIDTH1) directly from the
 *    buffer address TENSOR_ptr + TENSOR_offset_first_element_in_bytes + X * sizeof(DATA_TYPE) + row * STRIDE_Y.
 *    TENSOR_TYPE is not consulted in this branch.
 *    NOTE(review): VLOAD_PARTIAL is defined outside this excerpt; presumably it loads only WIDTH1 of the
 *    WIDTH0 elements - confirm.
 *  - Otherwise each row is loaded with V_LOAD using WIDTH0.
 *
 * @note The last two parameters are (dst, indirect_y), the opposite order of T_LOAD_INDIRECT.
 *
 * @param[in]  DATA_TYPE        Element data type
 * @param[in]  HEIGHT           Number of rows to load
 * @param[in]  WIDTH0           Load width used when WIDTH1_CONDITION is false
 * @param[in]  WIDTH1           Second width passed to VLOAD_PARTIAL when WIDTH1_CONDITION is true
 * @param[in]  TENSOR_TYPE      Tensor type forwarded to V_LOAD
 * @param[in]  TENSOR           Tensor basename
 * @param[in]  X                Starting X coordinate
 * @param[in]  STRIDE_Y         Stride of one row in bytes
 * @param[in]  WIDTH1_CONDITION Condition selecting the VLOAD_PARTIAL branch
 * @param[out] dst              Destination tile; indexed by row, written through its .v member
 * @param[in]  indirect_y       Tile holding one row coordinate per destination row (read through .v)
 */
4675*c217d954SCole Faust#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y)                                                      \
4676*c217d954SCole Faust    ({                                                                                                                                                                                             \
4677*c217d954SCole Faust        if(WIDTH1_CONDITION)                                                                                                                                                                       \
4678*c217d954SCole Faust        {                                                                                                                                                                                          \
4679*c217d954SCole Faust            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
4680*c217d954SCole Faust            {                                                                                                                                                                                      \
4681*c217d954SCole Faust                VLOAD_PARTIAL(WIDTH0, WIDTH1)                                                         \
4682*c217d954SCole Faust                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y));               \
4683*c217d954SCole Faust            })                                                                                                                                                                                     \
4684*c217d954SCole Faust        }                                                                                                                                                                                          \
4685*c217d954SCole Faust        else                                                                                                                                                                                       \
4686*c217d954SCole Faust        {                                                                                                                                                                                          \
4687*c217d954SCole Faust            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
4688*c217d954SCole Faust            {                                                                                                                                                                                      \
4689*c217d954SCole Faust                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
4690*c217d954SCole Faust            })                                                                                                                                                                                     \
4691*c217d954SCole Faust        }                                                                                                                                                                                          \
4692*c217d954SCole Faust    })
4693*c217d954SCole Faust
/** Load a TILE_HEIGHT x TILE_WIDTH tile of channel vectors from an NHWC tensor, skipping out-of-bound positions.
 *
 * For every (_yk, _xk) in [0, TILE_HEIGHT) x [0, TILE_WIDTH):
 *  - the row passed to V_LOAD is (X + _xk) + (Y + _yk) * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT,
 *    i.e. width, height and batch are collapsed into a single row index
 *  - the load is performed only if 0 <= X + _xk < TENSOR_WIDTH and 0 <= Y + _yk < TENSOR_HEIGHT
 *  - the result is written to dst[_xk + _yk * TILE_WIDTH].v
 *
 * Elements of dst whose position is out of bounds are left unmodified, so the caller decides what value
 * they hold.
 *
 * @param[in]  DATA_TYPE     Element data type
 * @param[in]  TILE_HEIGHT   Number of tile rows (along Y)
 * @param[in]  TILE_WIDTH    Number of tile columns (along X)
 * @param[in]  TILE_CHANNELS Number of channel elements loaded per position
 * @param[in]  TENSOR_TYPE   Tensor type forwarded to V_LOAD (BUFFER or IMAGE)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Batch index
 * @param[in]  Y             Starting Y coordinate
 * @param[in]  X             Starting X coordinate
 * @param[in]  C             Starting channel coordinate
 * @param[in]  TENSOR_WIDTH  Tensor width
 * @param[in]  TENSOR_HEIGHT Tensor height
 * @param[in]  STRIDE_Y      Row stride forwarded to V_LOAD
 * @param[out] dst           Destination tile; written through its .v member
 */
4694*c217d954SCole Faust#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst)   \
4695*c217d954SCole Faust    ({                                                                                                                                                \
4696*c217d954SCole Faust        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                   \
4697*c217d954SCole Faust        {                                                                                                                                             \
4698*c217d954SCole Faust            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                \
4699*c217d954SCole Faust            {                                                                                                                                         \
4700*c217d954SCole Faust                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH);                                                                                \
4701*c217d954SCole Faust                _src_y    += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                        \
4702*c217d954SCole Faust                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
4703*c217d954SCole Faust                if(_src_valid_y != 0)                                                                                                                 \
4704*c217d954SCole Faust                {                                                                                                                                     \
4705*c217d954SCole Faust                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                     \
4706*c217d954SCole Faust                }                                                                                                                                     \
4707*c217d954SCole Faust            })                                                                                                                                        \
4708*c217d954SCole Faust        })                                                                                                                                            \
4709*c217d954SCole Faust    })
4710*c217d954SCole Faust
4711*c217d954SCole Faust
/** Load a TILE_HEIGHT x TILE_WIDTH tile of channel vectors from an NHWC tensor, with dilation between tile positions.
 *
 * For every (_yk, _xk) in [0, TILE_HEIGHT) x [0, TILE_WIDTH) the source position is
 *   x = X + _xk * DILATION_X,  y = Y + _yk * DILATION_Y,  batch = B
 * and TILE_CHANNELS elements are read with VLOAD from the byte address
 *   TENSOR_ptr + TENSOR_offset_first_element_in_bytes + C * sizeof(DATA_TYPE)
 *              + x * TENSOR_stride_y + y * TENSOR_stride_z + batch * TENSOR_stride_w
 * into dst[_xk + _yk * TILE_WIDTH].v.
 *
 * When BOUNDARY_CHECK is true the load is skipped for positions outside [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT)
 * and the corresponding dst element is left unmodified. When it is false every position is loaded unconditionally.
 *
 * @note TENSOR_TYPE is accepted for signature compatibility but not used: the load always goes through the
 *       buffer pointer.
 *
 * @param[in]  DATA_TYPE      Element data type
 * @param[in]  TILE_HEIGHT    Number of tile rows (along Y)
 * @param[in]  TILE_WIDTH     Number of tile columns (along X)
 * @param[in]  TILE_CHANNELS  Number of channel elements loaded per position
 * @param[in]  TENSOR_TYPE    Unused
 * @param[in]  TENSOR         Tensor basename; _ptr, _offset_first_element_in_bytes and _stride_y/z/w are appended
 * @param[in]  B              Batch index
 * @param[in]  Y              Starting Y coordinate
 * @param[in]  X              Starting X coordinate
 * @param[in]  C              Starting channel coordinate
 * @param[in]  TENSOR_WIDTH   Tensor width
 * @param[in]  TENSOR_HEIGHT  Tensor height
 * @param[in]  DILATION_X     Distance between consecutive tile columns
 * @param[in]  DILATION_Y     Distance between consecutive tile rows
 * @param[in]  BOUNDARY_CHECK Whether out-of-bound positions must be skipped
 * @param[out] dst            Destination tile; written through its .v member
 */
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
    ({                                                                                                                                                                                          \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                                                             \
        {                                                                                                                                                                                       \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                                                          \
            {                                                                                                                                                                                   \
                int  _src_y       = (X) + _xk * (DILATION_X);                                                                                                                                   \
                int  _src_z       = ((Y) + _yk * (DILATION_Y));                                                                                                                                 \
                int  _src_w       = (B);                                                                                                                                                        \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH))                                                                     \
                                    && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT));                                                                \
                /* Load when no boundary check is requested, or when the position lies inside the tensor */                                                                                     \
                if(!(BOUNDARY_CHECK) || _src_valid_y)                                                                                                                                           \
                {                                                                                                                                                                               \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                                                                                      \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                }                                                                                                                                                                               \
            })                                                                                                                                                                                  \
        })                                                                                                                                                                                      \
    })
4738*c217d954SCole Faust
4739*c217d954SCole Faust
/** Gather TILE_AREA rows from a tensor using per-row x/y offset tiles.
 *
 * For row _i the coordinate is (X + xi[_i].v, Y + yi[_i].v) in batch B. The
 * linear row index is x + y * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT.
 * dst[_i].v is written through V_LOAD only when x lies in [0, TENSOR_WIDTH)
 * and y lies in [0, TENSOR_HEIGHT); otherwise dst[_i] is left untouched.
 *
 * @param[in]  DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y Forwarded unchanged to V_LOAD
 * @param[in]  TILE_AREA     Number of rows to gather
 * @param[in]  B             Batch index
 * @param[in]  Y, X          Base coordinates added to the per-row offsets
 * @param[in]  TENSOR_WIDTH  Upper bound (exclusive) for x
 * @param[in]  TENSOR_HEIGHT Upper bound (exclusive) for y
 * @param[in]  xi, yi        Tiles whose .v member holds the per-row offsets
 * @param[out] dst           Destination tile
 */
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _row_idx = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \
            _row_idx += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
            int _in_bounds = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_in_bounds) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _row_idx, STRIDE_Y); \
            } \
        }) \
    })
4753*c217d954SCole Faust
4754*c217d954SCole Faust
/** Dispatch an indirect 2D tile load on TENSOR_TYPE.
 *
 * Expands to T_LOAD2D_INDIRECT_<TENSOR_TYPE> with the same argument list.
 * The extra _STR level lets TENSOR_TYPE be macro-expanded before it is pasted
 * onto the T_LOAD2D_INDIRECT_ prefix.
 */
#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
/** Indirect 2D tile load, BUFFER variant.
 *
 * Row _row of dst is loaded with V_LOAD from the row index stored in
 * yi[0].s[_row]. A negative index skips the load and leaves dst[_row] as is.
 *
 * @param[in]  DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y Forwarded unchanged to V_LOAD
 * @param[in]  TILE_AREA Number of rows to load
 * @param[in]  yi        Tile whose first row holds one index per destination row
 * @param[out] dst       Destination tile
 */
#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _row, 0, 1, TILE_AREA, \
        { \
            if(yi[0].s[_row] >= 0) \
            { \
                dst[_row].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_row], STRIDE_Y); \
            } \
        }) \
    })
4767*c217d954SCole Faust
/** Indirect 2D tile load, IMAGE variant.
 *
 * Row _row of dst is loaded with V_LOAD from the row index stored in
 * yi[0].s[_row]. Unlike the BUFFER variant, the index is not tested for
 * negativity here: every row is loaded unconditionally.
 *
 * @param[in]  DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y Forwarded unchanged to V_LOAD
 * @param[in]  TILE_AREA Number of rows to load
 * @param[in]  yi        Tile whose first row holds one index per destination row
 * @param[out] dst       Destination tile
 */
#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _row, 0, 1, TILE_AREA, \
        { \
            dst[_row].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_row], STRIDE_Y); \
        }) \
    })
4775*c217d954SCole Faust
4776*c217d954SCole Faust
/** Gather TILE_AREA rows from a tensor using per-row x/y/z offset tiles.
 *
 * For row _i the coordinate is (X + xi[_i].v, Y + yi[_i].v, Z + zi[_i].v) in
 * batch B. The linear row index is
 *   x + y * TENSOR_WIDTH + z * TENSOR_WIDTH * TENSOR_HEIGHT
 *     + B * TENSOR_WIDTH * TENSOR_HEIGHT * TENSOR_DEPTH.
 * dst[_i].v is written through V_LOAD only when x, y and z all lie inside
 * [0, TENSOR_WIDTH), [0, TENSOR_HEIGHT) and [0, TENSOR_DEPTH) respectively;
 * otherwise dst[_i] is left untouched.
 *
 * TENSOR_WIDTH and TENSOR_HEIGHT are parenthesised individually in the plane
 * size so that expression arguments (for example "w + 1") multiply correctly.
 *
 * @param[in]  DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y Forwarded unchanged to V_LOAD
 * @param[in]  TILE_AREA     Number of rows to gather
 * @param[in]  B             Batch index
 * @param[in]  Z, Y, X       Base coordinates added to the per-row offsets
 * @param[in]  TENSOR_WIDTH  Upper bound (exclusive) for x
 * @param[in]  TENSOR_HEIGHT Upper bound (exclusive) for y
 * @param[in]  TENSOR_DEPTH  Upper bound (exclusive) for z
 * @param[in]  xi, yi, zi    Tiles whose .v member holds the per-row offsets
 * @param[out] dst           Destination tile
 */
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * ((TENSOR_WIDTH) * (TENSOR_HEIGHT)); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \
                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })
4791*c217d954SCole Faust
4792*c217d954SCole Faust
/** Store HEIGHT rows of a tile at indirectly addressed rows of a tensor.
 *
 * Each row src[r].v is converted to a WIDTH0-wide vector of DATA_TYPE and
 * written at
 *   TENSOR_ptr + TENSOR_offset_first_element_in_bytes
 *     + X * sizeof(DATA_TYPE) + indirect_y[r].v * STRIDE_Y.
 * Rows are visited from r = HEIGHT - 1 down to r = 0.
 * When WIDTH1_CONDITION is true the store goes through
 * VSTORE_PARTIAL(WIDTH0, WIDTH1), otherwise through VSTORE(WIDTH0).
 *
 * HEIGHT and STRIDE_Y are parenthesised at every use so that expression
 * arguments keep their intended value.
 *
 * @param[in] DATA_TYPE        Element type of the stored vector
 * @param[in] HEIGHT           Number of rows to store
 * @param[in] WIDTH0           Vector width used for the conversion and the full store
 * @param[in] WIDTH1           Second argument of VSTORE_PARTIAL (presumably the
 *                             number of leftover elements - confirm against VSTORE_PARTIAL)
 * @param[in] TENSOR_TYPE      Not referenced by this macro
 * @param[in] TENSOR           Tensor base name; _ptr and _offset_first_element_in_bytes are appended
 * @param[in] X                Starting element index, scaled by sizeof(DATA_TYPE)
 * @param[in] STRIDE_Y         Row stride, added unscaled to the pointer
 * @param[in] WIDTH1_CONDITION Selects the partial store when true
 * @param[in] src              Source tile
 * @param[in] indirect_y       Tile whose .v member holds the destination row of each source row
 */
#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (CONVERT(src[(HEIGHT) - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[(HEIGHT) - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE(WIDTH0) \
                (CONVERT(src[(HEIGHT) - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[(HEIGHT) - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
    })
4812*c217d954SCole Faust
4813*c217d954SCole Faust
/** Add the quantization offset terms to an accumulator tile.
 *
 * For every (m, n):
 *   dst[m].s[n] += WEI_OFFSET * sum_k lhs[m].s[k] + SRC_OFFSET * sum_k rhs[n].s[k]
 * with all products computed in ACC_DATA_TYPE.
 *
 * SRC_OFFSET and WEI_OFFSET are parenthesised before the cast so that an
 * expression argument is cast as a whole, not just its first token.
 *
 * @param[in]      ACC_DATA_TYPE Accumulator type used for the products
 * @param[in]      M0            Number of lhs/dst rows
 * @param[in]      N0            Number of rhs rows and dst columns
 * @param[in]      K0            Number of columns in lhs and rhs
 * @param[in]      SRC_OFFSET    Factor applied to the rhs row sums
 * @param[in]      WEI_OFFSET    Factor applied to the lhs row sums
 * @param[in]      lhs           Tile indexed as lhs[m].s[k]
 * @param[in]      rhs           Tile indexed as rhs[n].s[k]
 * @param[in, out] dst           Tile indexed as dst[m].s[n]; updated in place
 */
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, 1, K0, \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)(WEI_OFFSET)); \
            }) \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                dst[_m0].s[_n0] += _tm; \
                LOOP_UNROLLING(int, _k0, 0, 1, K0, \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)(SRC_OFFSET)); \
                }) \
            }) \
        }) \
    })
4833*c217d954SCole Faust
4834*c217d954SCole Faust
/** Dispatch a tile requantization on QUANTIZATION_TYPE.
 *
 * Expands to T_QUANTIZE8_<QUANTIZATION_TYPE> with QUANTIZATION_TYPE removed
 * from the argument list. The extra _STR level lets QUANTIZATION_TYPE be
 * macro-expanded before it is pasted onto the T_QUANTIZE8_ prefix.
 */
#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
4837*c217d954SCole Faust
4838*c217d954SCole Faust
/** Requantize an M0 x N0 tile using one fixed-point multiplier and one shift for the whole tile.
 *
 * For every element src[_m0].s[_n0] the macro:
 *  -# multiplies the value by (1 << -DST_SHIFT) when DST_SHIFT is negative, by 1 otherwise;
 *  -# multiplies by DST_MULTIPLIER in 64 bits, adds a rounding nudge (1 << 30 for a
 *     non-negative product, 1 - (1 << 30) otherwise) and divides by (1 << 31);
 *     INT_MAX is used instead when the value and DST_MULTIPLIER both equal INT_MIN;
 *  -# when DST_SHIFT >= 0, shifts right by DST_SHIFT and adds 1 if the discarded bits
 *     exceed the threshold (mask >> 1, plus 1 for a negative value);
 *  -# adds DST_OFFSET and writes CONVERT_SAT(result, DST_DATA_TYPE) to dst[_m0].s[_n0].
 *
 * dst_multipliers and dst_shifts are not referenced in the body.
 * NOTE(review): assumes select(a, b, c) yields b when the scalar c is non-zero and that
 * CONVERT_SAT saturates to the range of DST_DATA_TYPE - confirm against their definitions.
 */
4839*c217d954SCole Faust#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
4840*c217d954SCole Faust    ({ \
4841*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
4842*c217d954SCole Faust        { \
4843*c217d954SCole Faust            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
4844*c217d954SCole Faust            { \
4845*c217d954SCole Faust                SRC_DATA_TYPE _tmp = 0; \
4846*c217d954SCole Faust                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
4847*c217d954SCole Faust                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
4848*c217d954SCole Faust                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
4849*c217d954SCole Faust                long a_64 = (long)(_src); \
4850*c217d954SCole Faust                long b_64 = (long)(DST_MULTIPLIER); \
4851*c217d954SCole Faust                long ab_64 = a_64 * b_64; \
4852*c217d954SCole Faust                long mask1 = 1 << 30; \
4853*c217d954SCole Faust                long mask2 = 1 - (1 << 30); \
4854*c217d954SCole Faust                long is_positive_or_zero = ab_64 >= 0; \
4855*c217d954SCole Faust                long nudge = select(mask2, mask1, is_positive_or_zero); \
4856*c217d954SCole Faust                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
4857*c217d954SCole Faust                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
4858*c217d954SCole Faust                if(DST_SHIFT >= 0) \
4859*c217d954SCole Faust                { \
4860*c217d954SCole Faust                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
4861*c217d954SCole Faust                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
4862*c217d954SCole Faust                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
4863*c217d954SCole Faust                } \
4864*c217d954SCole Faust                _tmp += DST_OFFSET; \
4865*c217d954SCole Faust                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
4866*c217d954SCole Faust            })                                                                                                                                          \
4867*c217d954SCole Faust        })                                                                                                                                          \
4868*c217d954SCole Faust    })
4869*c217d954SCole Faust
4870*c217d954SCole Faust
/** Requantize an M0 x N0 tile using a separate fixed-point multiplier and shift per column.
 *
 * For column _n0 the multiplier is dst_multipliers[0].s[_n0] and the shift is
 * dst_shifts[0].s[_n0]; DST_SHIFT and DST_MULTIPLIER are not referenced in the body.
 * For every element src[_m0].s[_n0] the macro:
 *  -# multiplies the value by (1 << -shift) when the shift is negative, by 1 otherwise;
 *  -# multiplies by the multiplier in 64 bits, adds a rounding nudge (1 << 30 for a
 *     non-negative product, 1 - (1 << 30) otherwise) and divides by (1 << 31);
 *     INT_MAX is used instead when the value and the multiplier both equal INT_MIN;
 *  -# always computes the right-shifted value (_tmp2), adding 1 if the discarded bits
 *     exceed the threshold, and keeps it only when the shift is >= 0;
 *  -# adds DST_OFFSET and writes CONVERT_SAT(result, DST_DATA_TYPE) to dst[_m0].s[_n0].
 *
 * NOTE(review): assumes select(a, b, c) yields b when the scalar c is non-zero, that
 * any(_tmp) is 1 for a negative _tmp (so the threshold is raised by one for negative
 * values, as in the per-tensor variant), and that CONVERT_SAT saturates - confirm.
 */
4871*c217d954SCole Faust#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
4872*c217d954SCole Faust    ({ \
4873*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
4874*c217d954SCole Faust        { \
4875*c217d954SCole Faust            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
4876*c217d954SCole Faust            { \
4877*c217d954SCole Faust                SRC_DATA_TYPE _tmp = 0; \
4878*c217d954SCole Faust                SRC_DATA_TYPE _tmp2 = 0; \
4879*c217d954SCole Faust                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
4880*c217d954SCole Faust                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
4881*c217d954SCole Faust                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
4882*c217d954SCole Faust                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
4883*c217d954SCole Faust                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
4884*c217d954SCole Faust                long a_64 = (long)(_src); \
4885*c217d954SCole Faust                long b_64 = (long)(_dst_multiplier); \
4886*c217d954SCole Faust                long ab_64 = a_64 * b_64; \
4887*c217d954SCole Faust                long mask1 = 1 << 30; \
4888*c217d954SCole Faust                long mask2 = 1 - (1 << 30); \
4889*c217d954SCole Faust                long is_positive_or_zero = ab_64 >= 0; \
4890*c217d954SCole Faust                long nudge = select(mask2, mask1, is_positive_or_zero); \
4891*c217d954SCole Faust                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
4892*c217d954SCole Faust                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
4893*c217d954SCole Faust                long mask = ((((int)1) << _dst_shift) - (int)1); \
4894*c217d954SCole Faust                long threshold = (mask >> 1) + any(_tmp); \
4895*c217d954SCole Faust                _tmp2 = _tmp >> _dst_shift; \
4896*c217d954SCole Faust                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
4897*c217d954SCole Faust                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
4898*c217d954SCole Faust                _tmp += DST_OFFSET; \
4899*c217d954SCole Faust                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
4900*c217d954SCole Faust            })                                                                                                                                          \
4901*c217d954SCole Faust        })                                                                                                                                         \
4902*c217d954SCole Faust    })
4903*c217d954SCole Faust
4904*c217d954SCole Faust
/** Requantize an M0 x N0 tile using one fixed-point multiplier and one shift for the whole tile.
 *
 * Takes no per-channel multiplier/shift tiles. For every element src[_m0].s[_n0] the macro:
 *  -# multiplies the value by (1 << -DST_SHIFT) when DST_SHIFT is negative, by 1 otherwise;
 *  -# multiplies by DST_MULTIPLIER in 64 bits, adds a rounding nudge (1 << 30 for a
 *     non-negative product, 1 - (1 << 30) otherwise) and divides by (1 << 31);
 *     INT_MAX is used instead when the value and DST_MULTIPLIER both equal INT_MIN;
 *  -# when DST_SHIFT >= 0, shifts right by DST_SHIFT and adds 1 if the discarded bits
 *     exceed the threshold (mask >> 1, plus 1 for a negative value);
 *  -# adds DST_OFFSET and writes CONVERT_SAT(result, DST_DATA_TYPE) to dst[_m0].s[_n0].
 *
 * NOTE(review): assumes select(a, b, c) yields b when the scalar c is non-zero and that
 * CONVERT_SAT saturates to the range of DST_DATA_TYPE - confirm against their definitions.
 */
4905*c217d954SCole Faust#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst)                          \
4906*c217d954SCole Faust    ({ \
4907*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
4908*c217d954SCole Faust        { \
4909*c217d954SCole Faust            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
4910*c217d954SCole Faust            { \
4911*c217d954SCole Faust                SRC_DATA_TYPE _tmp = 0; \
4912*c217d954SCole Faust                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
4913*c217d954SCole Faust                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
4914*c217d954SCole Faust                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
4915*c217d954SCole Faust                long a_64 = (long)(_src); \
4916*c217d954SCole Faust                long b_64 = (long)(DST_MULTIPLIER); \
4917*c217d954SCole Faust                long ab_64 = a_64 * b_64; \
4918*c217d954SCole Faust                long mask1 = 1 << 30; \
4919*c217d954SCole Faust                long mask2 = 1 - (1 << 30); \
4920*c217d954SCole Faust                long is_positive_or_zero = ab_64 >= 0; \
4921*c217d954SCole Faust                long nudge = select(mask2, mask1, is_positive_or_zero); \
4922*c217d954SCole Faust                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
4923*c217d954SCole Faust                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
4924*c217d954SCole Faust                if(DST_SHIFT >= 0) \
4925*c217d954SCole Faust                { \
4926*c217d954SCole Faust                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
4927*c217d954SCole Faust                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
4928*c217d954SCole Faust                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
4929*c217d954SCole Faust                } \
4930*c217d954SCole Faust                _tmp += DST_OFFSET; \
4931*c217d954SCole Faust                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
4932*c217d954SCole Faust            })                                                                                                                                          \
4933*c217d954SCole Faust        })                                                                                                                                          \
4934*c217d954SCole Faust    })
4935*c217d954SCole Faust
4936*c217d954SCole Faust
/** Overwrite whole rows of a tile wherever a per-row mask is zero.
 *
 * Every element a[_row].s[_col] is replaced by VALUE_TO_SET when mask[_row].v
 * compares equal to zero, and keeps its value otherwise. The choice is made
 * per element through select().
 *
 * @param[in]      DATA_TYPE    Element type of the tile
 * @param[in]      M0           Number of rows
 * @param[in]      N0           Number of columns
 * @param[in]      VALUE_TO_SET Replacement value
 * @param[in, out] a            Tile updated in place
 * @param[in]      mask         Tile whose .v member holds one mask value per row
 */
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _row, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _col, 0, 1, N0, \
            { \
                a[_row].s[_col] = select((DATA_TYPE)(a[_row].s[_col]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_row].v == (DATA_TYPE)0)); \
            }) \
        }) \
    })
4947*c217d954SCole Faust
4948*c217d954SCole Faust
/** Apply an activation to every row of a tile.
 *
 * Each row vector src[_row].v is passed to ACTIVATION together with
 * ACTIVATION_TYPE, DATA_TYPE, N0, A_VAL and B_VAL; the result is stored in
 * dst[_row].v.
 *
 * @param[in]  DATA_TYPE       Element type, forwarded to ACTIVATION
 * @param[in]  M0              Number of rows
 * @param[in]  N0              Row width, forwarded to ACTIVATION
 * @param[in]  ACTIVATION_TYPE Activation selector, forwarded to ACTIVATION
 * @param[in]  A_VAL, B_VAL    Activation constants, forwarded to ACTIVATION
 * @param[in]  src             Source tile
 * @param[out] dst             Destination tile
 */
#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _row, 0, 1, M0, \
        { \
            dst[_row].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_row].v, A_VAL, B_VAL); \
        }) \
    })
4956*c217d954SCole Faust
4957*c217d954SCole Faust
/* Quantized activation primitives.
 *
 * All primitives share the parameter list
 *   (DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
 * so that ACT_OP_QUANTIZED can select one by name. Parameters a primitive does
 * not need are ignored. Every argument is parenthesised at its point of use so
 * that expression arguments are evaluated as a unit.
 */

/** max(ZERO_VALUE, x) */
#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)(ZERO_VALUE), (x)))

/** min(A_VAL, max(ZERO_VALUE, x)) */
#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)(ZERO_VALUE), (x))))

/** min(max(x, B_VAL), A_VAL) */
#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max((x), (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))

/** x * min(max(x + 3, 0), 6) * 0.166666667 */
#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ((x) * ((min(max((DATA_TYPE)((x) + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))

/** x, unchanged */
#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

/** Select the primitive <op>_op_quantized. ACTIVATION_QUANTIZED adds one level
 *  of expansion so that op may itself be a macro.
 */
#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
4970*c217d954SCole Faust
/* Binary arithmetic operators in function-like form, so that an operator can
 * be handed to another macro as a name. Both operands are parenthesised.
 */
#define V_ADD(A_VAL, B_VAL) \
    ((A_VAL) + (B_VAL))
#define V_SUB(A_VAL, B_VAL) \
    ((A_VAL) - (B_VAL))
#define V_DIV(A_VAL, B_VAL) \
    ((A_VAL) / (B_VAL))
#define V_MUL(A_VAL, B_VAL) \
    ((A_VAL) * (B_VAL))
4975*c217d954SCole Faust
4976*c217d954SCole Faust
/** Apply a quantized activation to every row of a tile.
 *
 * Each row vector src[_row].v is passed to ACTIVATION_QUANTIZED together with
 * ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL and B_VAL; the result is
 * stored in dst[_row].v.
 *
 * @param[in]  DATA_TYPE       Element type, forwarded to ACTIVATION_QUANTIZED
 * @param[in]  M0              Number of rows
 * @param[in]  N0              Row width, forwarded to ACTIVATION_QUANTIZED
 * @param[in]  ACTIVATION_TYPE Activation selector
 * @param[in]  ZERO_VALUE      Forwarded to the activation primitive
 * @param[in]  A_VAL, B_VAL    Activation constants
 * @param[in]  src             Source tile
 * @param[out] dst             Destination tile
 */
#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _row, 0, 1, M0, \
        { \
            dst[_row].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_row].v); \
        }) \
    })
4984*c217d954SCole Faust
4985*c217d954SCole Faust
/** Row-wise sum of two tiles: dst[_row].v = lhs[_row].v + rhs[_row].v.
 *
 * @param[in]  DATA_TYPE Not referenced by this macro
 * @param[in]  M0        Number of rows
 * @param[in]  N0        Not referenced by this macro
 * @param[in]  lhs, rhs  Operand tiles
 * @param[out] dst       Destination tile
 */
#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _row, 0, 1, M0, \
        { \
            dst[_row].v = lhs[_row].v + rhs[_row].v; \
        }) \
    })
4993*c217d954SCole Faust
4994*c217d954SCole Faust
/** Add a constant to every row of a tile: dst[_m0].v = lhs[_m0].v + (DATA_TYPE)(rhs_constant).
 *
 * rhs_constant is parenthesised before the cast so that an expression argument
 * is cast and added as a whole.
 *
 * @param[in]  DATA_TYPE    Type the constant is cast to
 * @param[in]  M0           Number of rows
 * @param[in]  N0           Not referenced by this macro
 * @param[in]  lhs          Source tile
 * @param[in]  rhs_constant Value to add
 * @param[out] dst          Destination tile
 */
#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)(rhs_constant); \
        }) \
    })
5002*c217d954SCole Faust
/* Named element-wise broadcast operations.
 *
 * Each alias binds one of V_ADD / V_SUB / V_DIV / V_MUL to a broadcast helper:
 *  - T_ELTWISE_BROADCAST_X     is used by the *_X and *_RHS_X_* aliases;
 *  - T_ELTWISE_BROADCAST_LHS_X is used by the *_LHS_X_* aliases.
 * The remaining arguments are forwarded unchanged.
 */
#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
5014*c217d954SCole Faust
5015*c217d954SCole Faust
/** Scale every row of a tile by a constant: dst[_m0].v = lhs[_m0].v * (DATA_TYPE)(rhs_constant).
 *
 * rhs_constant is parenthesised before the cast so that an expression argument
 * such as "a + b" multiplies as a whole instead of scaling by a and adding b.
 *
 * @param[in]  DATA_TYPE    Type the constant is cast to
 * @param[in]  M0           Number of rows
 * @param[in]  N0           Not referenced by this macro
 * @param[in]  lhs          Source tile
 * @param[in]  rhs_constant Scale factor
 * @param[out] dst          Destination tile
 */
#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)(rhs_constant); \
        }) \
    })
5023*c217d954SCole Faust
5024*c217d954SCole Faust
/** Element-wise operation between each row of lhs and the first row of rhs.
 *
 * dst[_row].v = T_ELWISE_OP(lhs[_row].v, rhs[0].v), with both operands first
 * converted to VEC_DATA_TYPE(DST_DATA_TYPE, N0) through CONVERT.
 *
 * @param[in]  T_ELWISE_OP   Binary operator macro (for example V_ADD)
 * @param[in]  DST_DATA_TYPE Element type the operands are converted to
 * @param[in]  M0            Number of rows in lhs and dst
 * @param[in]  N0            Row width
 * @param[in]  lhs           Left-hand tile, one row per output row
 * @param[in]  rhs           Right-hand tile; only row 0 is read
 * @param[out] dst           Destination tile
 */
#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _row, 0, 1, M0, \
        { \
            dst[_row].v = T_ELWISE_OP(CONVERT(lhs[_row].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })
5032*c217d954SCole Faust
5033*c217d954SCole Faust
/** Element-wise operation between the first row of lhs and each row of rhs.
 *
 * dst[_row].v = T_ELWISE_OP(lhs[0].v, rhs[_row].v), with both operands first
 * converted to VEC_DATA_TYPE(DST_DATA_TYPE, N0) through CONVERT.
 *
 * @param[in]  T_ELWISE_OP   Binary operator macro (for example V_ADD)
 * @param[in]  DST_DATA_TYPE Element type the operands are converted to
 * @param[in]  M0            Number of rows in rhs and dst
 * @param[in]  N0            Row width
 * @param[in]  lhs           Left-hand tile; only row 0 is read
 * @param[in]  rhs           Right-hand tile, one row per output row
 * @param[out] dst           Destination tile
 */
#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _row, 0, 1, M0, \
        { \
            dst[_row].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_row].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })
5041*c217d954SCole Faust
/*
 * Row-by-row element-wise operations on two tiles of the same height.
 * Each alias forwards to T_ELTWISE and differs only in the operator macro
 * it selects: V_ADD, V_SUB, V_DIV or V_MUL.
 */
#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
5046*c217d954SCole Faust
5047*c217d954SCole Faust
/*
 * Generic element-wise binary operation between matching rows of two tiles.
 * For each row index produced by LOOP_UNROLLING (start 0, step 1, count M0):
 *     dst[m].v = T_ELWISE_OP(lhs[m].v, rhs[m].v)
 * Both operands are first converted to VEC_DATA_TYPE(DST_DATA_TYPE, N0).
 */
#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)                                             \
    ({                                                                                                           \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                       \
        {                                                                                                        \
            VEC_DATA_TYPE(DST_DATA_TYPE, N0) _lhs_row = CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0));   \
            VEC_DATA_TYPE(DST_DATA_TYPE, N0) _rhs_row = CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0));   \
            dst[_m0].v = T_ELWISE_OP(_lhs_row, _rhs_row);                                                        \
        })                                                                                                       \
    })
5055*c217d954SCole Faust
5056*c217d954SCole Faust
/*
 * Applies floor to every row of a tile. For each row index produced by
 * LOOP_UNROLLING (start 0, step 1, count M0) the source row is converted to
 * VEC_DATA_TYPE(DST_DATA_TYPE, N0), passed through floor and stored in the
 * matching row of dst.
 */
#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst)                          \
    ({                                                                    \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                \
        {                                                                 \
            dst[_m0].v = floor(                                           \
                CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));   \
        })                                                                \
    })
5064*c217d954SCole Faust
5065*c217d954SCole Faust
/*
 * Tile matrix multiplication entry point.
 *
 * Dispatch happens purely through token pasting:
 *   1. T_MMUL pastes the two layout tokens to pick T_MMUL_<LHS>_<RHS>.
 *   2. T_MMUL_NT_T pastes the three data type tokens to pick
 *      T_MMUL_NT_T_<lhs type>_<rhs type>_<dst type>.
 *   3. The type-specific aliases below route to either the floating point
 *      implementation (T_MMUL_NT_T_FLOAT) or the 8-bit integer one
 *      (T_MMUL_NT_T_INTEGER8).
 * A layout or type combination without an alias here has no macro to expand to.
 */
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) \
    T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

/* Floating point combinations */
#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

/* 8-bit integer combinations */
#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
/*
 * Floating point tile matrix multiplication with accumulation:
 *     dst[m].s[n] = fma(lhs[m].s[k], rhs[n].s[k], dst[m].s[n])
 * for every m (count M0), n (count N0) and k (count K0). The rhs tile is
 * indexed as rhs[n].s[k], i.e. it is read with N0 rows of K0 elements.
 * Both factors are cast to DST_DATA_TYPE before the fused multiply-add.
 * dst is accumulated into, so the caller must initialise it beforehand.
 *
 * LHS_DATA_TYPE and RHS_DATA_TYPE are not referenced by the body.
 *
 * The body is a statement expression, like the sibling T_MMUL_NT_T_INTEGER8,
 * so that an invocation followed by a semicolon forms exactly one statement
 * and stays valid inside an unbraced if/else.
 */
#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                       \
    ({                                                                                                                  \
        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                                               \
        {                                                                                                               \
            LOOP_UNROLLING(int, _n, 0, 1, N0,                                                                           \
            {                                                                                                           \
                LOOP_UNROLLING(int, _k, 0, 1, K0,                                                                       \
                {                                                                                                       \
                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
                })                                                                                                      \
            })                                                                                                          \
        })                                                                                                              \
    })
5087*c217d954SCole Faust
/*
 * 8-bit integer tile matrix multiplication.
 * For every row m of lhs (count M0) and every row n of rhs (count N0) the
 * whole vectors lhs[m].v and rhs[n].v are handed to DOT_PRODUCT_INTEGER8
 * together with dst[m].s[n] as its last argument.
 * NOTE(review): DOT_PRODUCT_INTEGER8 is defined outside this section; it
 * presumably accumulates a K0-long dot product into that last argument.
 */
#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    ({                                                                                               \
        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                            \
        {                                                                                            \
            LOOP_UNROLLING(int, _n, 0, 1, N0,                                                        \
            {                                                                                        \
                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0,                \
                                     (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]);                       \
            })                                                                                       \
        })                                                                                           \
    })
5098*c217d954SCole Faust
5099*c217d954SCole Faust#endif
5100*c217d954SCole Faust
5101*c217d954SCole Faust
5102*c217d954SCole Faust
5103*c217d954SCole Faust
/** Direct convolution kernel for tensors addressed in NHWC order.
 *
 * Each work-item accumulates an M0 x N0 tile of the output: M0 consecutive
 * output positions (flattened over DST_WIDTH x DST_HEIGHT) by N0 output
 * channels, for a single batch index.
 *
 * The body references the following symbols without defining them
 * (presumably supplied as build options - confirm against the host code):
 *   SRC_TENSOR_TYPE, DST_TENSOR_TYPE, WEI_TENSOR_TYPE,
 *   SRC_DATA_TYPE, WEI_DATA_TYPE, DST_DATA_TYPE, ACC_DATA_TYPE,
 *   SRC_WIDTH, SRC_HEIGHT, SRC_CHANNELS, DST_WIDTH, DST_HEIGHT, DST_CHANNELS,
 *   WEI_WIDTH, WEI_HEIGHT, STRIDE_X, STRIDE_Y, PAD_LEFT, PAD_TOP,
 *   M0, N0, K0, PARTIAL_N0, ZERO_VALUE, SRC_OFFSET, WEI_OFFSET,
 *   ACTIVATION_TYPE, A_VAL, B_VAL
 * and, conditionally:
 *   HAS_BIAS       adds the bia argument and the bias addition (BIA_DATA_TYPE)
 *   IS_QUANTIZED   requantizes the accumulators (DST_OFFSET, DST_SHIFT,
 *                  DST_MULTIPLIER) and stores the quantized tile
 *   LEFTOVER_LOOP  processes the channels left over after the K0-wide loop
 *
 * @param src  Source tensor (read only), declared through TENSOR4D_RO_T
 * @param dst  Destination tensor (write only), declared through TENSOR4D_WO_T
 * @param wei  Weights tensor (read only), declared through TENSOR4D_RO_T
 * @param bia  Bias vector, declared through VECTOR_DECLARATION; only present
 *             when HAS_BIAS is defined
 */
__kernel void direct_convolution_nhwc(
    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(bia)
#endif
)
{
    // Kernel-local aliases for the shape constants. All of them are undefined
    // again at the end of the kernel.
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
#define _ISRC_WIDTH SRC_WIDTH
#define _ISRC_HEIGHT SRC_HEIGHT
#define _ISRC_CHANNELS SRC_CHANNELS
#define _IDST_WIDTH DST_WIDTH
#define _IDST_HEIGHT DST_HEIGHT
#define _IDST_CHANNELS DST_CHANNELS
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)

    // Tile that is activated and stored: the requantized tile cq when
    // IS_QUANTIZED is defined, otherwise the accumulator tile c.
#if defined(IS_QUANTIZED)
#define _IOUTPUT_TILE cq
#else
#define _IOUTPUT_TILE c
#endif

    // Work-item coordinates: output channel, flattened output position, batch.
    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0);
    const int mout = GET_SPATIAL_IDX(1, M0, 0);
    const int bout = GET_SPATIAL_IDX(2, 1, 0);

    // Input coordinate of the filter origin for each of the M0 output points.
    TILE(int, 1, M0, xi);
    TILE(int, 1, M0, yi);

    LOOP_UNROLLING(int, m, 0, 1, M0,
    {
        xi[0].s[m] = ((mout + m) % _IDST_WIDTH) * STRIDE_X;
        yi[0].s[m] = ((mout + m) / _IDST_WIDTH) * STRIDE_Y;
        xi[0].s[m] -= PAD_LEFT;
        yi[0].s[m] -= PAD_TOP;
    })

    // Accumulators start from zero.
    TILE(ACC_DATA_TYPE, M0, N0, c);

    LOOP_UNROLLING(int, m, 0, 1, M0,
    {
        c[m].v = 0;
    })

    // Walk the filter window as one flattened index i = yk * WEI_WIDTH + xk.
    // The unrolled helper loops below use their own index names so that i is
    // never shadowed; i is passed to T_LOAD as part of the weights row.
    for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
    {
        int xk = i % _IWEI_WIDTH;
        int yk = i / _IWEI_WIDTH;

        // Flattened source row index per output point. The index is replaced
        // by -1 whenever the sampled coordinate falls outside the source plane.
        // NOTE(review): T_LOAD2D_INDIRECT presumably skips negative indices and
        // leaves the ZERO_VALUE written below in place - confirm.
        TILE(int, 1, M0, my);

        LOOP_UNROLLING(int, m, 0, 1, M0,
        {
            int x_s    = xi[0].s[m] + xk;
            int y_s    = yi[0].s[m] + yk;
            my[0].s[m] = x_s + y_s *_ISRC_WIDTH;
            my[0].s[m] = my[0].s[m] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
            my[0].s[m] = select(-1, my[0].s[m], x_s >= 0);
            my[0].s[m] = select(-1, my[0].s[m], x_s < _ISRC_WIDTH);
            my[0].s[m] = select(-1, my[0].s[m], y_s >= 0);
            my[0].s[m] = select(-1, my[0].s[m], y_s < _ISRC_HEIGHT);
        })

        // Main channel loop: K0 input channels per iteration.
        int ck = 0;
        for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
        {
            TILE(SRC_DATA_TYPE, M0, K0, a);
            TILE(WEI_DATA_TYPE, N0, K0, b);

            LOOP_UNROLLING(int, m, 0, 1, M0,
            {
                a[m].v = ZERO_VALUE;
            })

            LOOP_UNROLLING(int, n, 0, 1, N0,
            {
                b[n].v = ZERO_VALUE;
            })

            // Source rows are gathered through the indirect indices in my.
            T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);

            // Weights rows start at cout * _IY_MULTIPLIER + i and are
            // _IY_MULTIPLIER apart.
            T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);

            // c += a * b
            T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);

            // Quantization offset terms that depend on the loaded data.
            T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
        }

#if defined(LEFTOVER_LOOP)
        // Remaining channels, one at a time.
        for(; ck < _ISRC_CHANNELS; ++ck)
        {
            TILE(SRC_DATA_TYPE, M0, 1, a);
            TILE(WEI_DATA_TYPE, N0, 1, b);

            LOOP_UNROLLING(int, m, 0, 1, M0,
            {
                a[m].v = ZERO_VALUE;
            })

            LOOP_UNROLLING(int, n, 0, 1, N0,
            {
                b[n].v = ZERO_VALUE;
            })

            T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);

            // The weights are read with BUFFER here instead of WEI_TENSOR_TYPE.
            // NOTE(review): presumably because single-element loads cannot use
            // the other tensor type - confirm.
            T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);

            T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);

            T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
        }
#endif
    }

    // Data-independent offset term, applied once to every accumulator.
    T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);

#if defined(HAS_BIAS)
    // One bias row of N0 values starting at cout, broadcast over the M0 rows.
    TILE(BIA_DATA_TYPE, 1, N0, bias0);

    T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);

    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);

#endif

#if defined(IS_QUANTIZED)
    // Requantize the accumulators into the destination data type.
    TILE(DST_DATA_TYPE, M0, N0, cq);

    T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
#endif

    // In-place activation on the tile that will be stored.
    T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE);

    // Destination row per output point. Positions past the last output point
    // are clamped to it, then the batch offset is added.
    TILE(uint, M0, 1, dst_indirect_y);

    LOOP_UNROLLING(int, m, 0, 1, M0,
    {
        dst_indirect_y[m].v = (uint)min(mout + m, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
        dst_indirect_y[m].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
    })

    // True only for the first work-item along dimension 0 when PARTIAL_N0 is
    // non-zero. NOTE(review): presumably selects the PARTIAL_N0-wide store in
    // T_STORE_INDIRECT_WIDTH_SELECT - confirm.
    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;

    T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);

#undef _IWEI_WIDTH
#undef _IWEI_HEIGHT
#undef _ISRC_WIDTH
#undef _ISRC_HEIGHT
#undef _ISRC_CHANNELS
#undef _IDST_WIDTH
#undef _IDST_HEIGHT
#undef _IDST_CHANNELS
#undef _IY_MULTIPLIER
#undef _IOUTPUT_TILE
})"