xref: /aosp_15_r20/external/XNNPACK/src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qu8-gemm/4x16c4-aarch64-neondot-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const uint8_t* restrict a, x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     uint8_t* restrict c,       x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qu8_conv_minmax_params* params)  [sp + 8] -> x11
24
25# params structure is 20 bytes
26#  struct {
27#    uint8_t kernel_zero_point[4];
28#    int32_t right_pre_shift;
29#    int32_t multiplier;
30#    int32_t right_post_shift;
31#    int16_t output_zero_point;
32#    uint8_t output_min;
33#    uint8_t output_max;
34#  } rndnu_neon;
35
36# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
37
38# Register usage
39# A0  x3 v0
40# A1 x15 v1
41# A2 x13 v2
42# A3  x4 v3
43# B   x5 v4  v5  v6  v7
44# C0  x6 v16 v20 v24 v28
45# C1  x8 v17 v21 v25 v29
46# C2  x9 v18 v22 v26 v30
47# C3  x7 v19 v23 v27 v31
48# zero_point v8 v12 v13 v14 v15
49# unused v9 v10 v11
50
// QU8 GEMM microkernel producing a tile of up to 4 rows x 16 columns of
// unsigned 8-bit output per pass of the outer loop (label 0), repeated until
// nc is exhausted.
//
// Per tile:
//   - the 16 int32 accumulators of each row start from 16 values read from w,
//   - UDOT accumulates 4-byte groups of A against the packed data read from w,
//   - v12-v15 accumulate, per row, the dot product of A with the
//     kernel_zero_point bytes; the row total is subtracted from every
//     accumulator of that row,
//   - the result is shifted, multiplied, rounding-shifted, narrowed, offset by
//     output_zero_point and clamped with the min/max params.
//
// kc is rounded up to a multiple of 4, so A rows are consumed in 4-byte groups.
// Rows beyond mr alias the previous row (A and C pointers are clamped), so all
// 4 rows are always computed and stored.
// x0 holds mr only while pointers are clamped; afterwards it is the k counter.
51BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128
52
53        # Clamp A and C pointers
54        CMP     x0, 2                   // if mr < 2
55        LDP     x12, x11, [sp]          // cn_stride, params
56        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
57        ADD     x15, x3, x4             // a1 = a0 + a_stride
58        ADD     x8, x6, x7              // c1 = c0 + cm_stride
59
60        # Save d8,d12-d15 on stack
        // 48-byte frame: d8 at [sp], d12/d13 at [sp, 16], d14/d15 at [sp, 32].
        // The stack arguments were already read above, before sp is moved.
61        STR     d8,  [sp, -48]!
62        CSEL    x15, x3, x15, LO        //   a1 = a0
63        CSEL    x8, x6,  x8, LO         //   c1 = c0
64        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4
65
66        STP     d12, d13, [sp, 16]
67        ADD     x13, x15, x4            // a2 = a1 + a_stride
68        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
69                                        // if mr <= 2 (flags still from CMP x0, 2)
70        CSEL    x13, x15, x13, LS       //   a2 = a1
71        CSEL    x9,  x8,  x9, LS        //   c2 = c1
72
73        STP     d14, d15, [sp, 32]
74        CMP     x0, 4                   // if mr < 4
75        ADD     x4, x13, x4             // a3 = a2 + a_stride
76        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
77
        // Broadcast the 4 kernel_zero_point bytes into every 32-bit lane of v8.
        // This is loaded once; x11 is left pointing at the next params field.
78        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point
79
80        CSEL    x4, x13, x4, LO         //   a3 = a2
81        CSEL    x7,  x9, x7, LO         //   c3 = c2
82
83        .p2align 3
840:
        // Start of one 16-column tile (outer loop over nc).
85        # Load initial bias from w into accumulators
86        LDP     q16, q20, [x5], 32
87
        // Clear the per-row zero-point sums (rows 0-3 -> v12-v15).
88        MOVI    v12.4s, 0
89        MOVI    v13.4s, 0
90        MOVI    v14.4s, 0
91        MOVI    v15.4s, 0
92
        // Rows 1-3 start from the same 16 initial values as row 0.
93        MOV     v17.16b, v16.16b
94        MOV     v18.16b, v16.16b
95        LDP     q24, q28, [x5], 32
96        MOV     v19.16b, v16.16b
97        MOV     v21.16b, v20.16b
98        SUBS    x0, x2, 16              // k = kc - 16
99        MOV     v22.16b, v20.16b
100        MOV     v23.16b, v20.16b
101        MOV     v25.16b, v24.16b
102        MOV     v26.16b, v24.16b
103        MOV     v27.16b, v24.16b
104        MOV     v29.16b, v28.16b
105        MOV     v30.16b, v28.16b
106        MOV     v31.16b, v28.16b
107
108        # Is there at least 16 bytes?
109        B.LO    3f                      // flags from SUBS x0, x2, 16 above
110
111        # Main loop - 16 bytes of A
        // Each iteration consumes 16 bytes from each of the 4 A rows and
        // 256 bytes of w (16 q-register loads, interleaved with the UDOTs).
112        .p2align 3
1131:
114        LDR     q0,  [x3], 16
115        LDR     q4,  [x5], 16
116        LDR     q1, [x15], 16
117        LDR     q2, [x13], 16
118        LDR     q3,  [x4], 16
119        LDR     q5,  [x5], 16
120
121        UDOT    v12.4s, v8.16b,  v0.16b  // update zero point
122        UDOT    v13.4s, v8.16b,  v1.16b
123        UDOT    v14.4s, v8.16b,  v2.16b
124        UDOT    v15.4s, v8.16b,  v3.16b
125
        // For each 4-byte group i of A (vN.4b[i]) all 16 columns are updated:
        // v4/v5/v6/v7 feed columns 0-3/4-7/8-11/12-15 respectively.
        // Group 0:
126        UDOT    v16.4s, v4.16b,  v0.4b[0]
127        UDOT    v17.4s, v4.16b,  v1.4b[0]
128        LDP     q6, q7, [x5], 32
129        UDOT    v18.4s, v4.16b,  v2.4b[0]
130        UDOT    v19.4s, v4.16b,  v3.4b[0]
131        UDOT    v20.4s, v5.16b,  v0.4b[0]
132        UDOT    v21.4s, v5.16b,  v1.4b[0]
133        UDOT    v22.4s, v5.16b,  v2.4b[0]
134        UDOT    v23.4s, v5.16b,  v3.4b[0]
135        UDOT    v24.4s, v6.16b, v0.4b[0]
136        UDOT    v25.4s, v6.16b, v1.4b[0]
137        LDP     q4, q5, [x5], 32
138        UDOT    v26.4s, v6.16b, v2.4b[0]
139        UDOT    v27.4s, v6.16b, v3.4b[0]
140        UDOT    v28.4s, v7.16b, v0.4b[0]
141        UDOT    v29.4s, v7.16b, v1.4b[0]
142        UDOT    v30.4s, v7.16b, v2.4b[0]
143        UDOT    v31.4s, v7.16b, v3.4b[0]
144
        // Group 1:
145        UDOT    v16.4s, v4.16b,  v0.4b[1]
146        UDOT    v17.4s, v4.16b,  v1.4b[1]
147        LDP     q6, q7, [x5], 32
148        UDOT    v18.4s, v4.16b,  v2.4b[1]
149        UDOT    v19.4s, v4.16b,  v3.4b[1]
150        UDOT    v20.4s, v5.16b,  v0.4b[1]
151        UDOT    v21.4s, v5.16b,  v1.4b[1]
152        UDOT    v22.4s, v5.16b,  v2.4b[1]
153        UDOT    v23.4s, v5.16b,  v3.4b[1]
154        UDOT    v24.4s, v6.16b,  v0.4b[1]
155        UDOT    v25.4s, v6.16b,  v1.4b[1]
156        LDP     q4, q5, [x5], 32
157        UDOT    v26.4s, v6.16b,  v2.4b[1]
158        UDOT    v27.4s, v6.16b,  v3.4b[1]
159        UDOT    v28.4s, v7.16b,  v0.4b[1]
160        UDOT    v29.4s, v7.16b,  v1.4b[1]
161        UDOT    v30.4s, v7.16b,  v2.4b[1]
162        UDOT    v31.4s, v7.16b,  v3.4b[1]
163
        // Group 2:
164        UDOT    v16.4s, v4.16b,  v0.4b[2]
165        UDOT    v17.4s, v4.16b,  v1.4b[2]
166        LDP     q6, q7, [x5], 32
167        UDOT    v18.4s, v4.16b,  v2.4b[2]
168        UDOT    v19.4s, v4.16b,  v3.4b[2]
169        UDOT    v20.4s, v5.16b,  v0.4b[2]
170        UDOT    v21.4s, v5.16b,  v1.4b[2]
171        UDOT    v22.4s, v5.16b,  v2.4b[2]
172        UDOT    v23.4s, v5.16b,  v3.4b[2]
173        UDOT    v24.4s, v6.16b,  v0.4b[2]
174        UDOT    v25.4s, v6.16b,  v1.4b[2]
175        LDP     q4, q5, [x5], 32
176        UDOT    v26.4s, v6.16b,  v2.4b[2]
177        UDOT    v27.4s, v6.16b,  v3.4b[2]
178        UDOT    v28.4s, v7.16b,  v0.4b[2]
179        UDOT    v29.4s, v7.16b,  v1.4b[2]
180        UDOT    v30.4s, v7.16b,  v2.4b[2]
181        UDOT    v31.4s, v7.16b,  v3.4b[2]
182
        // Group 3:
183        UDOT    v16.4s, v4.16b,  v0.4b[3]
184        UDOT    v17.4s, v4.16b,  v1.4b[3]
185        LDP     q6, q7, [x5], 32
186        UDOT    v18.4s, v4.16b,  v2.4b[3]
187        UDOT    v19.4s, v4.16b,  v3.4b[3]
188        UDOT    v20.4s, v5.16b,  v0.4b[3]
189        UDOT    v21.4s, v5.16b,  v1.4b[3]
190        UDOT    v22.4s, v5.16b,  v2.4b[3]
191        UDOT    v23.4s, v5.16b,  v3.4b[3]
192        UDOT    v24.4s, v6.16b,  v0.4b[3]
193        UDOT    v25.4s, v6.16b,  v1.4b[3]
194        UDOT    v26.4s, v6.16b,  v2.4b[3]
195        UDOT    v27.4s, v6.16b,  v3.4b[3]
196        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
197        UDOT    v28.4s, v7.16b,  v0.4b[3]
198        UDOT    v29.4s, v7.16b,  v1.4b[3]
199        UDOT    v30.4s, v7.16b,  v2.4b[3]
200        UDOT    v31.4s, v7.16b,  v3.4b[3]
201        B.HS    1b                      // another full 16 bytes of A available
202
203        # Is there a remainder?- 4 to 12 bytes of A
        // x0 is now (remaining k) - 16; its low 4 bits equal the remaining k.
204        TST     x0, 15
205        B.NE    3f
206
2072:
        // Tile epilogue. Each of v12-v15 holds 4 partial zero-point sums;
        // two pairwise adds leave the row total replicated in every lane.
208        ADDP    v0.4s, v12.4s, v12.4s
209        ADDP    v1.4s, v13.4s, v13.4s
210        ADDP    v2.4s, v14.4s, v14.4s
211        ADDP    v3.4s, v15.4s, v15.4s
212        ADDP    v12.4s, v0.4s, v0.4s
213        ADDP    v13.4s, v1.4s, v1.4s
214        ADDP    v14.4s, v2.4s, v2.4s
215        ADDP    v15.4s, v3.4s, v3.4s
216
217        # Subtract zero point from accumulators
218        SUB     v16.4s, v16.4s, v12.4s
219        SUB     v17.4s, v17.4s, v13.4s
220        SUB     v18.4s, v18.4s, v14.4s
221        SUB     v19.4s, v19.4s, v15.4s
222        SUB     v20.4s, v20.4s, v12.4s
223        SUB     v21.4s, v21.4s, v13.4s
224        SUB     v22.4s, v22.4s, v14.4s
225        SUB     v23.4s, v23.4s, v15.4s
226        SUB     v24.4s, v24.4s, v12.4s
227        SUB     v25.4s, v25.4s, v13.4s
228        SUB     v26.4s, v26.4s, v14.4s
229        SUB     v27.4s, v27.4s, v15.4s
230        SUB     v28.4s, v28.4s, v12.4s
231        SUB     v29.4s, v29.4s, v13.4s
232        SUB     v30.4s, v30.4s, v14.4s
233        SUB     v31.4s, v31.4s, v15.4s
234
235        # Apply params - preshift, scale, postshift, bias and clamp
        // The params fields are read in struct order with post-indexed
        // broadcast loads; x11 is rewound further down for the next tile.
236        LD1R    {v4.4s}, [x11], 4       // right_pre_shift
237        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
238        SSHL    v17.4s, v17.4s, v4.4s
239        SSHL    v18.4s, v18.4s, v4.4s
240        SSHL    v19.4s, v19.4s, v4.4s
241        SSHL    v20.4s, v20.4s, v4.4s
242        SSHL    v21.4s, v21.4s, v4.4s
243        SSHL    v22.4s, v22.4s, v4.4s
244        SSHL    v23.4s, v23.4s, v4.4s
245        LD1R    {v5.4s}, [x11], 4       // multiplier
246        SSHL    v24.4s, v24.4s, v4.4s
247        SSHL    v25.4s, v25.4s, v4.4s
248        SSHL    v26.4s, v26.4s, v4.4s
249        SSHL    v27.4s, v27.4s, v4.4s
250        SSHL    v28.4s, v28.4s, v4.4s
251        SSHL    v29.4s, v29.4s, v4.4s
252        SSHL    v30.4s, v30.4s, v4.4s
253        SSHL    v31.4s, v31.4s, v4.4s
254        LD1R    {v6.4s}, [x11], 4       // right_post_shift
255        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
256        SQDMULH v17.4s, v17.4s, v5.4s
257        SQDMULH v18.4s, v18.4s, v5.4s
258        SQDMULH v19.4s, v19.4s, v5.4s
259        SQDMULH v20.4s, v20.4s, v5.4s
260        SQDMULH v21.4s, v21.4s, v5.4s
261        SQDMULH v22.4s, v22.4s, v5.4s
262        SQDMULH v23.4s, v23.4s, v5.4s
263        SQDMULH v24.4s, v24.4s, v5.4s
264        SQDMULH v25.4s, v25.4s, v5.4s
265        SQDMULH v26.4s, v26.4s, v5.4s
266        SQDMULH v27.4s, v27.4s, v5.4s
267        SQDMULH v28.4s, v28.4s, v5.4s
268        SQDMULH v29.4s, v29.4s, v5.4s
269        SQDMULH v30.4s, v30.4s, v5.4s
270        SQDMULH v31.4s, v31.4s, v5.4s
271        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
272        SRSHL   v17.4s, v17.4s, v6.4s
273        SRSHL   v18.4s, v18.4s, v6.4s
274        SRSHL   v19.4s, v19.4s, v6.4s
275        SRSHL   v20.4s, v20.4s, v6.4s
276        SRSHL   v21.4s, v21.4s, v6.4s
277        SRSHL   v22.4s, v22.4s, v6.4s
278        SRSHL   v23.4s, v23.4s, v6.4s
279        SRSHL   v24.4s, v24.4s, v6.4s
280        SRSHL   v25.4s, v25.4s, v6.4s
281        SRSHL   v26.4s, v26.4s, v6.4s
282        SRSHL   v27.4s, v27.4s, v6.4s
283        SRSHL   v28.4s, v28.4s, v6.4s
284        SRSHL   v29.4s, v29.4s, v6.4s
285        SRSHL   v30.4s, v30.4s, v6.4s
286        SRSHL   v31.4s, v31.4s, v6.4s
287
        // Saturating narrow int32 -> int16: columns 0-3 and 8-11 go to the
        // low halves here, columns 4-7 and 12-15 to the high halves below.
288        SQXTN   v16.4h, v16.4s
289        SQXTN   v17.4h, v17.4s
290        SQXTN   v18.4h, v18.4s
291        SQXTN   v19.4h, v19.4s
292        SQXTN   v24.4h, v24.4s
293        SQXTN   v25.4h, v25.4s
294        SQXTN   v26.4h, v26.4s
295        SQXTN   v27.4h, v27.4s
296        LD1R    {v6.8h}, [x11], 2       // output_zero_point (added below)
297
298        SQXTN2  v16.8h, v20.4s
299        SQXTN2  v17.8h, v21.4s
300        SQXTN2  v18.8h, v22.4s
301        SQXTN2  v19.8h, v23.4s
302        SQXTN2  v24.8h, v28.4s
303        SQXTN2  v25.8h, v29.4s
304        SQXTN2  v26.8h, v30.4s
305        SQXTN2  v27.8h, v31.4s
306
        // Saturating add of output_zero_point in 16 bits.
307        SQADD   v16.8h, v16.8h, v6.8h
308        SQADD   v17.8h, v17.8h, v6.8h
309        SQADD   v18.8h, v18.8h, v6.8h
310        SQADD   v19.8h, v19.8h, v6.8h
311        SQADD   v24.8h, v24.8h, v6.8h
312        SQADD   v25.8h, v25.8h, v6.8h
313        SQADD   v26.8h, v26.8h, v6.8h
314        SQADD   v27.8h, v27.8h, v6.8h
315        LD1R    {v4.16b}, [x11], 1      // clamp min value
316
        // Saturating narrow int16 -> unsigned 8-bit; v0-v3 end up holding
        // the 16 output bytes of rows 0-3.
317        SQXTUN  v0.8b, v16.8h
318        SQXTUN  v1.8b, v17.8h
319        SQXTUN  v2.8b, v18.8h
320        SQXTUN  v3.8b, v19.8h
321        LD1R    {v5.16b}, [x11]         // clamp max value
322        SQXTUN2 v0.16b, v24.8h
323        SQXTUN2 v1.16b, v25.8h
324        SQXTUN2 v2.16b, v26.8h
325        SQXTUN2 v3.16b, v27.8h
326
        // 15 = 4 + 4 + 4 + 2 + 1 bytes advanced by the post-indexed LD1Rs
        // since label 2; x11 points at right_pre_shift again afterwards.
327        SUB     x11, x11, 15             // rewind params pointer
328
329        UMAX    v0.16b, v0.16b, v4.16b
330        UMAX    v1.16b, v1.16b, v4.16b
331        UMAX    v2.16b, v2.16b, v4.16b
332        UMAX    v3.16b, v3.16b, v4.16b
333        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.NE below
334        UMIN    v0.16b, v0.16b, v5.16b
335        UMIN    v1.16b, v1.16b, v5.16b
336        UMIN    v2.16b, v2.16b, v5.16b
337        UMIN    v3.16b, v3.16b, v5.16b
338        B.LO    5f                      // fewer than 16 columns were left
339
340        # Store full 4 x 16
        // C pointers advance by cn_stride (x12); A pointers are rewound by
        // the rounded kc so the next tile re-reads the same rows.
341        ST1     {v0.16b}, [x6], x12
342        SUB     x3,  x3, x2             // a0 -= kc
343        ST1     {v1.16b}, [x8], x12
344        SUB     x15, x15, x2            // a1 -= kc
345        ST1     {v2.16b}, [x9], x12
346        SUB     x13, x13, x2            // a2 -= kc
347        ST1     {v3.16b}, [x7], x12
348        SUB     x4,  x4, x2             // a3 -= kc
349        B.NE    0b                      // nc != 0: compute the next 16 columns
350
351        # Restore d8,d12-d15 from stack
352        LDP     d14, d15, [sp, 32]
353        LDP     d12, d13, [sp, 16]
354        LDR     d8,  [sp], 48
355        RET
356
357        # Remainder- 8 bytes of A
        // Reached with 4, 8 or 12 bytes of A left; bit 3 of x0 selects the
        // 8-byte step and bit 2 the 4-byte step (label 4).
358        .p2align 3
3593:
360        # Is there a remainder?- 8 bytes of A
361        TBZ     x0, 3, 4f
362
        // LDR dN clears the upper 64 bits of vN, so the full-width zero-point
        // UDOTs below only accumulate the 8 loaded bytes.
363        LDR     d0,  [x3], 8
364        LDR     q4,  [x5], 16
365        LDR     d1, [x15], 8
366        LDR     d2, [x13], 8
367        LDR     d3,  [x4], 8
368        LDR     q5,  [x5], 16
369
370        UDOT    v12.4s, v8.16b,  v0.16b  // update zero point
371        UDOT    v13.4s, v8.16b,  v1.16b
372        UDOT    v14.4s, v8.16b,  v2.16b
373        UDOT    v15.4s, v8.16b,  v3.16b
374
        // Two 4-byte groups; 128 bytes of w are consumed in total.
375        UDOT    v16.4s, v4.16b,  v0.4b[0]
376        UDOT    v17.4s, v4.16b,  v1.4b[0]
377        LDP     q6, q7, [x5], 32
378        UDOT    v18.4s, v4.16b,  v2.4b[0]
379        UDOT    v19.4s, v4.16b,  v3.4b[0]
380        UDOT    v20.4s, v5.16b,  v0.4b[0]
381        UDOT    v21.4s, v5.16b,  v1.4b[0]
382        UDOT    v22.4s, v5.16b,  v2.4b[0]
383        UDOT    v23.4s, v5.16b,  v3.4b[0]
384        UDOT    v24.4s, v6.16b, v0.4b[0]
385        UDOT    v25.4s, v6.16b, v1.4b[0]
386        LDP     q4, q5, [x5], 32
387        UDOT    v26.4s, v6.16b, v2.4b[0]
388        UDOT    v27.4s, v6.16b, v3.4b[0]
389        UDOT    v28.4s, v7.16b, v0.4b[0]
390        UDOT    v29.4s, v7.16b, v1.4b[0]
391        UDOT    v30.4s, v7.16b, v2.4b[0]
392        UDOT    v31.4s, v7.16b, v3.4b[0]
393        UDOT    v16.4s, v4.16b,  v0.4b[1]
394        UDOT    v17.4s, v4.16b,  v1.4b[1]
395        LDP     q6, q7, [x5], 32
396        UDOT    v18.4s, v4.16b,  v2.4b[1]
397        UDOT    v19.4s, v4.16b,  v3.4b[1]
398        UDOT    v20.4s, v5.16b,  v0.4b[1]
399        UDOT    v21.4s, v5.16b,  v1.4b[1]
400        UDOT    v22.4s, v5.16b,  v2.4b[1]
401        UDOT    v23.4s, v5.16b,  v3.4b[1]
402        UDOT    v24.4s, v6.16b,  v0.4b[1]
403        UDOT    v25.4s, v6.16b,  v1.4b[1]
404        UDOT    v26.4s, v6.16b,  v2.4b[1]
405        UDOT    v27.4s, v6.16b,  v3.4b[1]
406        UDOT    v28.4s, v7.16b,  v0.4b[1]
407        UDOT    v29.4s, v7.16b,  v1.4b[1]
408        UDOT    v30.4s, v7.16b,  v2.4b[1]
409        UDOT    v31.4s, v7.16b,  v3.4b[1]
410        # Is there a remainder?- 4 bytes of A
411        TBZ     x0, 2, 2b               // no 4-byte step left: go to the tile epilogue
412
413        # Remainder- 4 bytes of A
4144:
        // LDR sN clears the upper 96 bits of vN; one 4-byte group and
        // 64 bytes of w are consumed.
415        LDR     s0,  [x3], 4
416        LDR     q4,  [x5], 16
417        LDR     s1, [x15], 4
418        LDR     s2, [x13], 4
419        LDR     s3,  [x4], 4
420        LDR     q5, [x5], 16
421
422        UDOT    v12.4s, v8.16b,  v0.16b  // update zero point
423        UDOT    v13.4s, v8.16b,  v1.16b
424        UDOT    v14.4s, v8.16b,  v2.16b
425        UDOT    v15.4s, v8.16b,  v3.16b
426
427        UDOT    v16.4s, v4.16b,  v0.4b[0]
428        UDOT    v17.4s, v4.16b,  v1.4b[0]
429        UDOT    v18.4s, v4.16b,  v2.4b[0]
430        UDOT    v19.4s, v4.16b,  v3.4b[0]
431        LDP     q6, q7, [x5], 32
432        UDOT    v20.4s, v5.16b,  v0.4b[0]
433        UDOT    v21.4s, v5.16b,  v1.4b[0]
434        UDOT    v22.4s, v5.16b,  v2.4b[0]
435        UDOT    v23.4s, v5.16b,  v3.4b[0]
436        UDOT    v24.4s, v6.16b, v0.4b[0]
437        UDOT    v25.4s, v6.16b, v1.4b[0]
438        UDOT    v26.4s, v6.16b, v2.4b[0]
439        UDOT    v27.4s, v6.16b, v3.4b[0]
440        UDOT    v28.4s, v7.16b, v0.4b[0]
441        UDOT    v29.4s, v7.16b, v1.4b[0]
442        UDOT    v30.4s, v7.16b, v2.4b[0]
443        UDOT    v31.4s, v7.16b, v3.4b[0]
444        B       2b
445
446        # Store odd width
        // x1 is (remaining nc) - 16 here; its low 4 bits equal the remaining
        // column count. Bits 3..0 select stores of 8, 4, 2 and 1 bytes per
        // row; after each partial store the next unstored bytes are moved
        // down to the bottom of v0-v3.
447        .p2align 3
4485:
449        TBZ     x1, 3, 6f
450        STR     d0, [x6], 8
451        STR     d1, [x8], 8
452        DUP     d0, v0.d[1]
453        DUP     d1, v1.d[1]
454        STR     d2, [x9], 8
455        STR     d3, [x7], 8
456        DUP     d2, v2.d[1]
457        DUP     d3, v3.d[1]
4586:
459        TBZ     x1, 2, 7f
460        STR     s0, [x6], 4
461        STR     s1, [x8], 4
462        DUP     s0, v0.s[1]
463        DUP     s1, v1.s[1]
464        STR     s2, [x9], 4
465        STR     s3, [x7], 4
466        DUP     s2, v2.s[1]
467        DUP     s3, v3.s[1]
4687:
469        TBZ     x1, 1, 8f
470        STR     h0, [x6], 2
471        STR     h1, [x8], 2
472        DUP     h0, v0.h[1]
473        DUP     h1, v1.h[1]
474        STR     h2, [x9], 2
475        STR     h3, [x7], 2
476        DUP     h2, v2.h[1]
477        DUP     h3, v3.h[1]
4788:
479        TBZ     x1, 0, 9f
480        STR     b0, [x6]
481        STR     b1, [x8]
482        STR     b2, [x9]
483        STR     b3, [x7]
4849:
485        # Restore d8,d12-d15 from stack
486        LDP     d14, d15, [sp, 32]
487        LDP     d12, d13, [sp, 16]
488        LDR     d8,  [sp], 48
489        RET
490
491END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128
492
493#ifdef __ELF__
// On ELF targets, emit an empty .note.GNU-stack section.
// NOTE(review): by GNU toolchain convention this marks the object as not
// needing an executable stack - confirm against the linker documentation.
494.section ".note.GNU-stack","",%progbits
495#endif
496