// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32(
#     size_t mr,                 x0   (value never read; x0 is reused as the k counter)
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4) (not referenced by this kernel)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7) (not referenced by this kernel)
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# Summary (derived from the instructions below):
#   One pass of label 0 produces up to 16 int8 outputs for a single row of A.
#   Per pass the kernel consumes from w, in this order:
#     - 64 bytes of initial accumulator values (16 lanes of int32),
#     - kc/4 groups of 64 weight bytes, with kc rounded up to a multiple of 4,
#     - 64 bytes of per channel scales (16 lanes of float).
#   SDOT adds to every int32 lane the signed dot product of 4 weight bytes
#   with the same 4 bytes of A.  The sums are converted to float, multiplied
#   by the scales, converted back to int32 with round-to-nearest-even
#   (FCVTNS), saturated to int16, offset by the 16-bit value at params+0,
#   saturated to int8 and finally clamped between the byte at params+2
#   (lower bound) and the byte at params+3 (upper bound).
#   The pointer w (x5) is never rewound, so the next pass continues with the
#   data that follows the scales.  The pointer a (x3) is rewound by the
#   rounded kc after every full 16 wide store.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers is touched here, so nothing is saved or restored.

# Register usage
# A0      x3   v0
# B       x5   v16 v17 v18 v19
# C0      x6   v28 v29 v30 v31
# scales  v4 v5 v6   (v4 is loaded a second time for the fourth vector)
# params  x11  v6 (16-bit offset)  v4 (lower clamp)  v5 (upper clamp)
# output  v0 v2      (narrowed results; v0 is free once the main loop ends)
# unused  v1 v3 v7 v8 v9 v10 v11 v12 v13 v14 v15 v20 v21 v22 v23 v24 v25 v26 v27

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
0:
        # Load initial bias from w into accumulators
        # The ADD/BIC pair is executed again on every pass through label 0.
        # That is harmless: once kc is a multiple of 4 the rounding leaves it
        # unchanged.
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        LDP     q28, q29, [x5], 32
        BIC     x2, x2, 3
        LDP     q30, q31, [x5], 32
        MOV     x0, x2                  // k = kc.  assumes kc > 0
        // x11 must be reloaded on every pass because LD1R below
        // post-increments it.
        LDR     x11, [sp, 8]            // params

        # Main loop - 4 bytes of A
        # Every iteration consumes 4 bytes of A and 64 bytes of w.
        .p2align 3
1:
        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        SDOT    v28.4s, v16.16b, v0.4b[0]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        SUBS    x0, x0, 4               // k -= 4, sets the flags for B.HI
        SDOT    v30.4s, v18.16b, v0.4b[0]
        SDOT    v31.4s, v19.16b, v0.4b[0]
        B.HI    1b                      // loop while k was above 4 before the SUBS

        # Load per channel scale values from weights
        # The loads are interleaved with the int32 -> float conversions.
        SCVTF   v28.4s, v28.4s
        LDR     q4, [x5], 16            // scales for v28
        SCVTF   v29.4s, v29.4s
        LDR     q5, [x5], 16            // scales for v29
        SCVTF   v30.4s, v30.4s
        LDR     q6, [x5], 16            // scales for v30
        SCVTF   v31.4s, v31.4s
        FMUL    v28.4s, v28.4s, v4.4s
        LDR     q4, [x5], 16            // scales for v31; v4 is free after the FMUL above
        FMUL    v29.4s, v29.4s, v5.4s
        FMUL    v30.4s, v30.4s, v6.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // float -> int32, round to nearest with ties to even
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Broadcast the 16-bit value at params+0 into v6; x11 advances to params+2.
        // NOTE(review): the original note calls this a bias; presumably it is
        // the output zero point. Confirm against the params layout.
        LD1R    {v6.8h}, [x11], 2        // add bias
        // Saturating narrow int32 -> int16: v0 = lanes of v28,v29; v2 = lanes of v30,v31
        SQXTN   v0.4h, v28.4s
        SQXTN   v2.4h, v30.4s
        SQXTN2  v0.8h, v29.4s
        SQXTN2  v2.8h, v31.4s

        // v4 = byte at params+2 replicated, v5 = byte at params+3 replicated
        LD2R    {v4.16b, v5.16b}, [x11]   // clamp to min/max
        SQADD   v0.8h, v0.8h, v6.8h     // saturating add of the 16-bit offset
        SQADD   v2.8h, v2.8h, v6.8h
        LDR     x12, [sp]               // cn_stride
        // Saturating narrow int16 -> int8; v0 now holds all 16 outputs in order
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v2.8h
        SUBS    x1, x1, 16              // nc -= 16, flags used by B.LO and B.NE below
        SMAX    v0.16b, v0.16b, v4.16b  // apply lower bound
        SMIN    v0.16b, v0.16b, v5.16b  // apply upper bound
        B.LO    2f                      // fewer than 16 columns were left: partial store

        # Store full 1 x 16
        ST1     {v0.16b}, [x6], x12     // c += cn_stride after the store
        SUB     x3, x3, x2             // a0 -= kc
        // ST1 and SUB do not modify the flags, so this still tests the
        // result of SUBS x1 above: go again while columns remain.
        B.NE    0b
        RET

        # Store odd width
        # x1 holds nc - 16 here (wrapped around).  Subtracting 16 leaves the
        # low four bits untouched, so bits 3..0 still encode the number of
        # columns left.  Each step stores one power-of-two sized piece and
        # moves the following bytes of v0 down to lane 0.
        .p2align 3
2:
        TBZ     x1, 3, 3f               // skip unless 8 bytes are due
        STR     d0, [x6], 8
        DUP     d0, v0.d[1]
3:
        TBZ     x1, 2, 4f               // skip unless 4 bytes are due
        STR     s0, [x6], 4
        DUP     s0, v0.s[1]
4:
        TBZ     x1, 1, 5f               // skip unless 2 bytes are due
        STR     h0, [x6], 2
        DUP     h0, v0.h[1]
5:
        TBZ     x1, 0, 6f               // skip unless 1 byte is due
        STR     b0, [x6]
6:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32

#ifdef __ELF__
// Empty .note.GNU-stack section: by GNU toolchain convention this tells the
// linker that the object does not need an executable stack.
.section ".note.GNU-stack","",%progbits
#endif