// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c16-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params  [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0     x13 v0
# A1     x15 v1
# B      x5  v4 v5 v6 v7
# C0     x6  v16 v18 v20 v22 v24 v26 v28 v30
# C1     x7  v17 v19 v21 v23 v25 v27 v29 v31
# temp0  v2 v10 v12 v14
# temp1  v3 v11 v13 v15
# unused v8 v9

BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal

        # Clamp C pointers
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d10, d11, [sp, -48]!
        ADD     x2, x2, 15              // kc = (kc + 15) & ~15
        STP     d12, d13, [sp, 16]
        CSEL    x7, x6, x7, LO          //   c1 = c0
        STP     d14, d15, [sp, 32]
        BIC     x2, x2, 15

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 2 A pointers
        LDP     x13, x15, [x4], 16

        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else += a1 + a_offset

        MOV     x0, x2                  // k = kc

        # Main loop - 16 bytes of A
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDP     q4, q5, [x5]
        LDR     q1, [x15], 16
        LDP     q6, q7, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMLAL2  v2.8h, v4.16b, v0.16b
        SMLAL2  v3.8h, v4.16b, v1.16b
        SMLAL2  v10.8h, v5.16b, v0.16b
        SMLAL2  v11.8h, v5.16b, v1.16b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     q4, q5, [x5, 64]
        SMLAL2  v12.8h, v6.16b, v0.16b
        SMLAL2  v13.8h, v6.16b, v1.16b
        SMLAL2  v14.8h, v7.16b, v0.16b
        SMLAL2  v15.8h, v7.16b, v1.16b
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     q6, q7, [x5, 96]

        SMLAL2  v2.8h, v4.16b, v0.16b
        SMLAL2  v3.8h, v4.16b, v1.16b
        SMLAL2  v10.8h, v5.16b, v0.16b
        SMLAL2  v11.8h, v5.16b, v1.16b
        ADD     x5, x5, 128
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16
        SMLAL2  v12.8h, v6.16b, v0.16b
        SMLAL2  v13.8h, v6.16b, v1.16b
        SMLAL2  v14.8h, v7.16b, v0.16b
        SMLAL2  v15.8h, v7.16b, v1.16b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HI    2b

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Add columns
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 7             // rewind params pointer
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    3f

        # Store full 2 x 8
        ST1     {v0.d}[1], [x7], x10
        SUB     x4, x4, x3              // a -= ks
        ST1     {v0.8b}, [x6], x10

        # nc loop
        B.HI    0b

        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

        # Store odd width
        .p2align 3
3:
        TBZ     x1, 2, 4f
        ST1     {v0.s}[2], [x7], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

4:
        TBZ     x1, 1, 5f
        ST1     {v0.h}[4], [x7], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
5:
        TBZ     x1, 0, 6f
        ST1     {v0.b}[8], [x7]
        STR     b0, [x6]
6:
        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif