// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c16-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# Summary (derived from the code below):
#   Every pass of outer loop 0 produces a tile of 2 rows x 8 columns of
#   int8 output and consumes 8 from nc. When mr < 2 the row 1 pointers
#   alias row 0, so row 1 recomputes and rewrites the row 0 results.
#   The kc value is rounded up to a multiple of 16, and each A row is read
#   in 16-byte steps for that rounded length.
#   Data consumed from w per tile, in order:
#     - eight 32-bit bias values (one per column),
#     - for every 16 bytes of K, eight 16-byte int8 vectors (one per column),
#     - eight 32-bit float scales (one per column).
#   Data consumed from params, in order: one 16-bit value that is added
#   after narrowing to int16, then one byte used as the lower clamp bound
#   and one byte used as the upper clamp bound.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x4  v1
# B   x5  v4  v5  v6  v7
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# unused  v8 v9
#
# Scalars: x0 = k loop counter, x1 = remaining nc, x2 = rounded kc,
#          x10 = cn_stride, x11 = params cursor.
# After the main loop v0-v3 are reused for the reduced sums, v4/v5 for the
# per channel scales, then v5 for the 16-bit params value and v1/v2 for the
# clamp bounds.

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal

        # Clamp A and C pointers
        # CMP sets the flags consumed by both CSELs; the ADD, STP and BIC
        # instructions in between do not write NZCV. The spills of d10-d15
        # (callee-saved, used as temporaries) are interleaved with the
        # pointer setup.
        CMP     x0, 2                   // if mr < 2
        STP     d10, d11, [sp, -48]!    // allocate 48-byte frame, save d10/d11
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d12, d13, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d14, d15, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        ADD     x2, x2, 15              // kc = (kc + 15) & ~15
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 15

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each LDP places one bias in lane 0 of two row 0 accumulators (the
        # scalar load clears the other lanes); the MOVs copy them to row 1.
        MOV     x0, x2                  // k = kc
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        // Stack arguments sit above the 48-byte frame. They are reloaded on
        // every pass because x11 is advanced while the params are read.
        LDP     x10, x11, [sp, 48]      // cn_stride, params
        MOV     v31.16b, v30.16b

        # Main loop - 16 bytes of A
        # Per iteration: one 16-byte vector from each A row and eight B
        # vectors of 16 bytes (one per output column, 128 bytes in total).
        # For each row/column pair SMULL multiplies the low 8 bytes, SMLAL2
        # adds the products of the high 8 bytes into the same int16 lanes,
        # and SADALP adds adjacent int16 pairs into the int32 accumulator.
        # NOTE(review): SMLAL2 does not saturate, so an int16 lane wraps
        # only when both products are (-128)*(-128); presumably the packed
        # weights never contain -128 - confirm against the packing code.
        .p2align 3
1:
        LDR     q0, [x3], 16            // row 0 of A, a0 += 16
        LDP     q4, q5, [x5]            // B columns 0, 1
        LDR     q1, [x4], 16            // row 1 of A, a1 += 16
        LDP     q6, q7, [x5, 32]        // B columns 2, 3
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMLAL2  v2.8h, v4.16b, v0.16b
        SMLAL2  v3.8h, v4.16b, v1.16b
        SMLAL2  v10.8h, v5.16b, v0.16b
        SMLAL2  v11.8h, v5.16b, v1.16b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h           // column 0, row 0
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h           // column 0, row 1
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h          // column 1, row 0
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h          // column 1, row 1
        LDP     q4, q5, [x5, 64]        // B columns 4, 5
        SMLAL2  v12.8h, v6.16b, v0.16b
        SMLAL2  v13.8h, v6.16b, v1.16b
        SMLAL2  v14.8h, v7.16b, v0.16b
        SMLAL2  v15.8h, v7.16b, v1.16b
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h          // column 2, row 0
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h          // column 2, row 1
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h          // column 3, row 0
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h          // column 3, row 1
        LDP     q6, q7, [x5, 96]        // B columns 6, 7

        SMLAL2  v2.8h, v4.16b, v0.16b
        SMLAL2  v3.8h, v4.16b, v1.16b
        SMLAL2  v10.8h, v5.16b, v0.16b
        SMLAL2  v11.8h, v5.16b, v1.16b
        ADD     x5, x5, 128             // w += 8 columns * 16 bytes
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h           // column 4, row 0
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h           // column 4, row 1
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h          // column 5, row 0
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h          // column 5, row 1
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HI below
        SMLAL2  v12.8h, v6.16b, v0.16b
        SMLAL2  v13.8h, v6.16b, v1.16b
        SMLAL2  v14.8h, v7.16b, v0.16b
        SMLAL2  v15.8h, v7.16b, v1.16b
        SADALP  v28.4s, v12.8h          // column 6, row 0
        SADALP  v29.4s, v13.8h          // column 6, row 1
        SADALP  v30.4s, v14.8h          // column 7, row 0
        SADALP  v31.4s, v15.8h          // column 7, row 1
        B.HI    1b                      // loop while k > 0

        # Add columns
        # Each accumulator holds four partial sums for one column. The first
        # ADDP level merges two columns per register, the second level
        # leaves one int32 total per column:
        #   v0 = row 0 columns 0-3, v1 = row 0 columns 4-7
        #   v2 = row 1 columns 0-3, v3 = row 1 columns 4-7
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Load per channel scale values from weights
        # v4 = scales for columns 0-3, v5 = scales for columns 4-7; both
        # rows share them. The sums are converted to float and multiplied
        # by the scales.
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        // Back to int32, rounding to nearest with ties to even.
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        # Narrow to int16 with saturation, add the 16-bit params value to
        # both rows with saturation, narrow to int8 (row 0 in the low half
        # of v0, row 1 in the high half), then clamp with the two byte
        # params values.
        # NOTE(review): the 16-bit value is presumably the output zero
        # point - confirm against the params struct definition.
        LD1R    {v5.8h}, [x11], 2       // broadcast 16-bit value, params += 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s            // v0 = row 0, columns 0-7 (int16)
        SQXTN2  v2.8h, v3.4s            // v2 = row 1, columns 0-7 (int16)
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO / B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // bytes 0-7  = row 0
        SQXTN2  v0.16b, v1.8h           // bytes 8-15 = row 1
        LD1R    {v1.16b}, [x11], 1      // lower clamp bound, params += 1
        LD1R    {v2.16b}, [x11]         // upper clamp bound
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    2f                      // fewer than 8 columns were left

        # Store full 2 x 8
        # The A pointers are rewound by the rounded kc so the next tile
        # rereads the same rows.
        ST1     {v0.8b}, [x6], x10      // row 0, c0 += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10    // row 1, c1 += cn_stride
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // next tile while nc > 0

        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48      // also releases the 48-byte frame
        RET

        # Store odd width
        # Reached with x1 = nc - 8 (negative). Its low three bits equal
        # those of the remaining column count, so bits 2, 1 and 0 select
        # stores of 4, 2 and 1 columns. After each partial store EXT rotates
        # v0 so the next unwritten row 0 byte is at index 0 and the next
        # unwritten row 1 byte is at index 8.
        .p2align 3
2:
        TBZ     x1, 2, 3f
        STR     s0, [x6], 4             // row 0, 4 columns
        ST1     {v0.s}[2], [x7], 4      // row 1, 4 columns
        EXT     v0.16b, v0.16b, v0.16b, 4

3:
        TBZ     x1, 1, 4f
        STR     h0, [x6], 2             // row 0, 2 columns
        ST1     {v0.h}[4], [x7], 2      // row 1, 2 columns
        EXT     v0.16b, v0.16b, v0.16b, 2
4:
        TBZ     x1, 0, 5f
        STR     b0, [x6]                // row 0, last column
        ST1     {v0.b}[8], [x7]         // row 1, last column
5:
        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48      // also releases the 48-byte frame
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif