// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c16-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x15  v1
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1   x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# unused  v8 v9

// Overview of the loop nest below:
//   0: once per group of 8 output columns (nc loop)
//   1: once per pair of A row pointers    (ks loop, 16 bytes of pointers each)
//   2: once per 16 bytes of K             (main loop)
// x0 (mr) is only needed for the c1 clamp and is then reused as the k counter.
// x3 (ks) stays intact; x9 is the working copy that counts down.

BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal

        # Clamp C pointers
        // The stack arguments are read before sp is moved by the
        // pre-indexed STP, so their offsets are relative to the entry sp.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d10, d11, [sp, -48]!    // open 48-byte frame, save d10-d11
        ADD     x2, x2, 15              // kc = (kc + 15) & ~15
        STP     d12, d13, [sp, 16]      // save d12-d13 (v12/v13 are temps)
        CSEL    x7, x6, x7, LO          //   c1 = c0
        STP     d14, d15, [sp, 32]      // save d14-d15 (v14/v15 are temps)
        BIC     x2, x2, 15              // second half of the kc round-up

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Each S-register load fills lane 0 and zeroes the remaining lanes,
        // so the bias is counted once when the 4 lanes are summed later.
        // Row 1 accumulators start as copies of row 0.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 2 A pointers
        LDP     x13, x15, [x4], 16

        // A pointer equal to the zero buffer is used as-is; any other
        // pointer gets a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 += a_offset

        MOV     x0, x2                  // k = kc

        # Main loop - 16 bytes of A
        // Per iteration: 16 bytes from each A row (v0, v1) and 128 bytes of
        // B (8 columns x 16 bytes, two columns per LDP). For every column,
        // SMULL multiplies the low 8 bytes, SMLAL2 adds the products of the
        // high 8 bytes into the same 16-bit lanes, and SADALP pairwise-adds
        // those 16-bit lanes into the 32-bit accumulators.
        // NOTE(review): SMLAL2 does not saturate, so a 16-bit lane wraps only
        // if both products are (-128)*(-128); presumably the caller excludes
        // that case - confirm.
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDP     q4, q5, [x5]
        LDR     q1, [x15], 16
        LDP     q6, q7, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMLAL2  v2.8h, v4.16b, v0.16b
        SMLAL2  v3.8h, v4.16b, v1.16b
        SMLAL2  v10.8h, v5.16b, v0.16b
        SMLAL2  v11.8h, v5.16b, v1.16b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     q4, q5, [x5, 64]
        SMLAL2  v12.8h, v6.16b, v0.16b
        SMLAL2  v13.8h, v6.16b, v1.16b
        SMLAL2  v14.8h, v7.16b, v0.16b
        SMLAL2  v15.8h, v7.16b, v1.16b
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     q6, q7, [x5, 96]

        SMLAL2  v2.8h, v4.16b, v0.16b
        SMLAL2  v3.8h, v4.16b, v1.16b
        SMLAL2  v10.8h, v5.16b, v0.16b
        SMLAL2  v11.8h, v5.16b, v1.16b
        ADD     x5, x5, 128             // w += 8 columns * 16 bytes
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HI below
        SMLAL2  v12.8h, v6.16b, v0.16b
        SMLAL2  v13.8h, v6.16b, v1.16b
        SMLAL2  v14.8h, v7.16b, v0.16b
        SMLAL2  v15.8h, v7.16b, v1.16b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HI    2b

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Add columns
        // Two levels of ADDP collapse the 4 partial sums held in each
        // accumulator, giving:
        //   v0 = row 0, columns 0-3    v1 = row 0, columns 4-7
        //   v2 = row 1, columns 0-3    v3 = row 1, columns 4-7
        // The first two 32-bit params (used by SQSHL and SQDMULH below) are
        // loaded in between.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        LD1R    {v4.4s}, [x11], 4
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        LD1R    {v7.4s}, [x11], 4
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v0.4s, v0.4s, v4.4s     // shift to upper bits
        SQSHL   v1.4s, v1.4s, v4.4s
        SQSHL   v2.4s, v2.4s, v4.4s
        SQSHL   v3.4s, v3.4s, v4.4s
        SQDMULH v0.4s, v0.4s, v7.4s     // scale without rounding
        SQDMULH v1.4s, v1.4s, v7.4s
        SQDMULH v2.4s, v2.4s, v7.4s
        SQDMULH v3.4s, v3.4s, v7.4s
        SRSHL   v0.4s, v0.4s, v5.4s     // signed rounding shift left
        SRSHL   v1.4s, v1.4s, v5.4s
        SRSHL   v2.4s, v2.4s, v5.4s
        SRSHL   v3.4s, v3.4s, v5.4s

        // Narrow 32 -> 16 bits with saturation, add the 16-bit param, narrow
        // 16 -> 8 bits, then clamp with the two byte params. Afterwards v0
        // holds row 0 in bytes 0-7 and row 1 in bytes 8-15.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags stay live until B.LO / B.HI
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 15            // rewind params pointer (4+4+4+2+1 bytes)
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    3f                      // fewer than 8 columns were left

        # Store full 2 x 8
        ST1     {v0.d}[1], [x7], x10    // row 1; c1 += cn_stride
        SUB     x4, x4, x3              // a -= ks
        ST1     {v0.8b}, [x6], x10      // row 0; c0 += cn_stride

        # nc loop
        B.HI    0b

        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

        # Store odd width
        // x1 holds nc - 8 here, whose low three bits equal those of the
        // remaining nc. Bits 2, 1 and 0 select a 4-, 2- and 1-byte store.
        // After each store EXT rotates v0 so the next row 0 bytes start at
        // byte 0 and the next row 1 bytes start at byte 8.
        .p2align 3
3:
        TBZ     x1, 2, 4f
        ST1     {v0.s}[2], [x7], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

4:
        TBZ     x1, 1, 5f
        ST1     {v0.h}[4], [x7], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
5:
        TBZ     x1, 0, 6f
        ST1     {v0.b}[8], [x7]
        STR     b0, [x6]
6:
        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
