// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

// Overview (as implemented below):
//   Produces a tile of up to 6 rows x 8 columns of f32 output per pass of the
//   nc loop (label 0).  Every accumulator row starts from the 8 floats read
//   from w, then accumulates A * B products with FMLA, and is finally clamped
//   with FMAX against v6 and FMIN against v7 before being stored.
//
//   a is an indirection buffer: the ks loop (label 1) consumes 6 row pointers
//   (48 bytes) per pass.  A pointer equal to zero is used as is; any other
//   pointer has a_offset added to it.
//
//   kc, ks, a_offset, cm_stride and cn_stride are all used as byte quantities:
//   kc is compared against 16 for 4 floats, ks is decremented by 48 per group
//   of 6 pointers, and the strides are added directly to the C pointers.
//
//   Rows at index >= mr reuse the C pointer of the preceding row (see the
//   CSEL chain at function entry).  Rows are stored from row 5 down to row 0.
//
// NOTE(review): 128-bit vectors in the main loop are assembled from a 64-bit
// LDR d plus LDR x / INS instead of a single LDR q.  Presumably this is the
// Cortex-A55 specific scheduling the kernel name refers to - confirm against
// the core optimization guide before reordering anything.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x14  v0     v3
# A1  x15  v0[1]  v3[1]
# A2  x20  v1     v4
# A3  x21  v1[1]  v4[1]
# A4  x22  v2     v5
# A5  x23  v2[1]  v5[1]

# B    x5  v12 v13 v14 v15  second set of B
# B        v16 v17 v18 v19  first set

# C0   x6  v20 v21
# C1  x16  v22 v23
# C2  x17  v24 v25
# C3  x10  v26 v27
# C4  x13  v28 v29
# C5   x7  v30 v31

# Clamp v6 v7
# unused A   v8 v9 v10 v11
# x19 temporary vector shadow register

// Lane convention used by every FMLA below: the even row of a pair sits in
// the low 64 bits of its A register (lanes 0 and 1), the odd row in the high
// 64 bits (lanes 2 and 3).  So vN.s[0] / vN.s[2] are the first k element of
// the two rows and vN.s[1] / vN.s[3] the second k element.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

        # Clamp C pointers
        // Each CMP feeds two CSELs: LO covers mr < N, LS covers mr <= N.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x13, x10, x13, LS   //   c4 = c3


        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   c5 = c4
        // From here on x7 holds c5; cm_stride is no longer available.

        # Load a_offset
        LDR x11, [sp, 8]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        // Two consecutive floats at [x8], each broadcast to all 4 lanes:
        // v6 is the FMAX operand (lower bound), v7 the FMIN operand (upper bound).
        LD2R {v6.4s, v7.4s}, [x8]

        # Save x19-x23, d12-d15 on stack
        // 80-byte frame: [sp] d12,d13  [sp+16] d14,d15  [sp+32] x19,x20
        // [sp+48] x21,x22  [sp+64] x23.  Stack arguments of the caller are
        // therefore found at +80 from now on (cn_stride is read at [sp, 80]).
        STP d12, d13, [sp, -80]!
        STP d14, d15, [sp, 16]
        STP x19, x20, [sp, 32]
        STP x21, x22, [sp, 48]
        STR x23, [sp, 64]

// nc loop entry: one pass per 8 output columns.
0:
        # Load initial bias from w into accumulators
        // q20/q21 hold the 8 starting values; they are copied to the other
        // five rows while the following B data is prefetched.
        LDP q20, q21, [x5], 32
        MOV x9, x3                  // p = ks
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 320]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b


// ks loop entry: one pass per group of 6 A row pointers.
1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        // For each row: aN = (aN == zero) ? zero : aN + a_offset
        CMP x14, x12                // if a0 == zero
        ADD x14, x14, x11           // a0 += a_offset
        CSEL x14, x12, x14, EQ      //   a0 = zero, else a0 + a_offset
        CMP x15, x12                // if a1 == zero
        ADD x15, x15, x11           // a1 += a_offset
        CSEL x15, x12, x15, EQ      //   a1 = zero, else a1 + a_offset
        CMP x20, x12                // if a2 == zero
        ADD x20, x20, x11           // a2 += a_offset
        CSEL x20, x12, x20, EQ      //   a2 = zero, else a2 + a_offset
        CMP x21, x12                // if a3 == zero
        ADD x21, x21, x11           // a3 += a_offset
        CSEL x21, x12, x21, EQ      //   a3 = zero, else a3 + a_offset
        CMP x22, x12                // if a4 == zero
        ADD x22, x22, x11           // a4 += a_offset
        CSEL x22, x12, x22, EQ      //   a4 = zero, else a4 + a_offset
        CMP x23, x12                // if a5 == zero
        ADD x23, x23, x11           // a5 += a_offset
        CSEL x23, x12, x23, EQ      //   a5 = zero, else a5 + a_offset

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        // x0 may go negative here and below.  Only multiples of 16 are ever
        // subtracted, so bits 3 and 2 still equal those of kc for the
        // remainder tests at label 5.
        SUBS x0, x2, 16             // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        // Loads the first 2 floats of every A row and the first set of B
        // (v16-v19).  The upper half of v19 is parked in x19 and inserted in
        // BLOCK 0 of the main loop or of the epilogue.
        LDR d0, [x14], 8            // a0
        LDP q16, q17, [x5], 32      // b
        LDR d1, [x20], 8            // a2
        LDR d2, [x22], 8            // a4
        LD1 {v0.d}[1], [x15], 8     // a1
        LD1 {v1.d}[1], [x21], 8     // a3
        LD1 {v2.d}[1], [x23], 8     // a5
        SUBS x0, x0, 16             // flags consumed by B.LO 3f below
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x19, [x5], 8            // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        // Per iteration: 48 FMA, 12 64-bit loads of A and 16 64-bit loads
        // of B (8 B vectors).  x5 advances by 128 bytes at the end.
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x14], 8            // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x19           // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x19, [x15], 8           // a1

        # BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s, v2.s[0]
        INS v3.d[1], x19            // a1 ins
        FMLA v30.4s, v16.4s, v2.s[2]
        LDR x19, [x5, 8]            // b

        # BLOCK 2
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR d4, [x20], 8            // a2
        FMLA v23.4s, v17.4s, v0.s[2]
        INS v12.d[1], x19           // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x21], 8           // a3

        # BLOCK 3
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR d5, [x22], 8            // a4
        FMLA v29.4s, v17.4s, v2.s[0]
        INS v4.d[1], x19            // a3 ins
        FMLA v31.4s, v17.4s, v2.s[2]
        LDR x19, [x23], 8           // a5

        # BLOCK 4
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s, v0.s[3]
        INS v5.d[1], x19            // a5 ins
        FMLA v24.4s, v18.4s, v1.s[1]
        LDR x19, [x5, 24]

        # BLOCK 5
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s, v2.s[1]
        INS v13.d[1], x19           // b
        FMLA v30.4s, v18.4s, v2.s[3]
        LDR x19, [x5, 40]

        # BLOCK 6
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d15, [x5, 48]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v14.d[1], x19           // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x19, [x5, 56]

        # BLOCK 7
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        INS v15.d[1], x19           // b, completes the second set before its first use
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR d0, [x14], 8            // a0
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        LDR x19, [x15], 8           // a1

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR d16, [x5, 64]
        FMLA v28.4s, v12.4s, v5.s[0]
        INS v0.d[1], x19            // a1 ins
        FMLA v30.4s, v12.4s, v5.s[2]
        LDR x19, [x5, 72]           // b

        # BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        LDR d1, [x20], 8            // a2
        FMLA v23.4s, v13.4s, v3.s[2]
        INS v16.d[1], x19           // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x19, [x21], 8           // a3

        # BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        LDR d2, [x22], 8            // a4
        FMLA v29.4s, v13.4s, v5.s[0]
        INS v1.d[1], x19            // a3 ins
        FMLA v31.4s, v13.4s, v5.s[2]
        LDR x19, [x23], 8           // a5

        # BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR d17, [x5, 80]
        FMLA v22.4s, v14.4s, v3.s[3]
        INS v2.d[1], x19            // a5 ins
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR x19, [x5, 88]

        # BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR d18, [x5, 96]
        FMLA v28.4s, v14.4s, v5.s[1]
        INS v17.d[1], x19           // b
        FMLA v30.4s, v14.4s, v5.s[3]
        LDR x19, [x5, 104]

        # BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR d19, [x5, 112]
        FMLA v23.4s, v15.4s, v3.s[3]
        INS v18.d[1], x19           // b
        FMLA v25.4s, v15.4s, v4.s[1]
        LDR x19, [x5, 120]          // upper half of v19, inserted in the next BLOCK 0

        # BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        SUBS x0, x0, 16             // flags consumed by B.HS 2b below
        FMLA v29.4s, v15.4s, v5.s[1]
        ADD x5, x5, 128
        FMLA v31.4s, v15.4s, v5.s[3]
        B.HS 2b

        # Epilogue - 4 floats of A (16 bytes)
        // 48 FMA, 6 64-bit loads of A and 8 64-bit loads of B (4 B vectors).
        // Same first group as the main loop; the second group performs no
        // loads and prefetches the six C rows instead.
3:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x14], 8            // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x19           // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x19, [x15], 8           // a1

        # BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s, v2.s[0]
        INS v3.d[1], x19            // a1 ins
        FMLA v30.4s, v16.4s, v2.s[2]
        LDR x19, [x5, 8]            // b

        # BLOCK 2
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR d4, [x20], 8            // a2
        FMLA v23.4s, v17.4s, v0.s[2]
        INS v12.d[1], x19           // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x21], 8           // a3

        # BLOCK 3
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR d5, [x22], 8            // a4
        FMLA v29.4s, v17.4s, v2.s[0]
        INS v4.d[1], x19            // a3 ins
        FMLA v31.4s, v17.4s, v2.s[2]
        LDR x19, [x23], 8           // a5

        # BLOCK 4
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s, v0.s[3]
        INS v5.d[1], x19            // a5 ins
        FMLA v24.4s, v18.4s, v1.s[1]
        LDR x19, [x5, 24]

        # BLOCK 5
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s, v2.s[1]
        INS v13.d[1], x19           // b
        FMLA v30.4s, v18.4s, v2.s[3]
        LDR x19, [x5, 40]

        # BLOCK 6
        LDR d15, [x5, 48]
        FMLA v21.4s, v19.4s, v0.s[1]
        INS v14.d[1], x19           // b
        FMLA v23.4s, v19.4s, v0.s[3]
        LDR x19, [x5, 56]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 7
        INS v15.d[1], x19           // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        PRFM PSTL1KEEP, [x6]        // Prefetch C0
        FMLA v22.4s, v12.4s, v3.s[2]
        PRFM PSTL1KEEP, [x16]       // Prefetch C1
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PSTL1KEEP, [x17]       // Prefetch C2

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        PRFM PSTL1KEEP, [x10]       // Prefetch C3
        FMLA v28.4s, v12.4s, v5.s[0]
        PRFM PSTL1KEEP, [x13]       // Prefetch C4
        FMLA v30.4s, v12.4s, v5.s[2]
        PRFM PSTL1KEEP, [x7]        // Prefetch C5

        # BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        # BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        # BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        # BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]
        TST x0, 15                  // any of the low 4 bits of k set? consumed by B.NE 5f

        # BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64              // step over the second set of B read at [x5] .. [x5, 56]

        # BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        FMLA v31.4s, v15.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

// Common continuation: the remainder paths at labels 5 and 6 branch back here.
4:
        # ks loop
        SUBS x9, x9, 48             // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // First stack argument; it sits above the 80-byte register save area.
        LDR x0, [sp, 80]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        // These flags serve both B.LO 7f and B.HI 0b below; nothing in
        // between (FMIN, STP, ADD, SUB) modifies them.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                     // fewer than 8 columns left: partial store

        // Rows are written from c5 down to c0 and every C pointer is then
        // advanced by cn_stride.
        STP q30, q31, [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21, [x6]
        ADD x6, x6, x0

        // Rewind the indirection pointer so the next column block walks the
        // same A pointers again.
        SUB x4, x4, x3              // a -= ks

        # nc loop
        B.HI 0b

        # Restore x19-x23, d12-d15 from stack
        LDR x23, [sp, 64]
        LDP x21, x22, [sp, 48]
        LDP x19, x20, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 80
        RET

// kc remainder.  Reached when kc < 16 (straight from label 1) or when the
// epilogue found low bits set in k.  Bit 3 of x0 selects 2 floats, bit 2
// selects 1 float.
5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x14], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x15], 8
        LDR d1, [x20], 8
        LD1 {v1.d}[1], [x21], 8
        LDR d2, [x22], 8
        LD1 {v2.d}[1], [x23], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        // First k element: v16 feeds columns 0-3, v17 columns 4-7.
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        // Second k element: v18 feeds columns 0-3, v19 columns 4-7.
        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 float of A (4 bytes)
        // Single floats go to lane 0 (even row) and lane 2 (odd row), which
        // keeps the lane convention of the FMLAs unchanged.
        LDR s0, [x14], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x15], 4
        LDR s1, [x20], 4
        LD1 {v1.s}[2], [x21], 4
        LDR s2, [x22], 4
        LD1 {v2.s}[2], [x23], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        B 4b

        # Store odd width
        // x1 holds nc - 8 here; its low 3 bits equal those of the remaining
        // column count.  Bit 2 stores 4 floats, bit 1 stores 2, bit 0 stores 1.
        // After each partial store the not yet written lanes are moved down.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore x19-x23, d12-d15 from stack
        LDR x23, [sp, 64]
        LDP x21, x22, [sp, 48]
        LDP x19, x20, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif