// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mull.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x4  v1
# B   x5  v4  v5  v6  v7
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# unused  v8 v9
#
# v0-v5 are reused as scratch after the main loop (reduction and
# requantization), once the A and B values they held are no longer needed.

// Computes a tile of up to 2 rows by 8 columns of int8 output per pass of the
// outer loop (label 0), repeating until nc columns have been produced.
// Each accumulator register holds four 32-bit partial sums for a single
// (row, column) pair; even-numbered v16..v30 belong to row 0 and odd-numbered
// v17..v31 to row 1.
BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull

        # Clamp A and C pointers
        // The d10-d15 saves are interleaved with the pointer setup. The first
        // STP pre-decrements sp by 48, so the stack arguments of the caller
        // are found at [sp, 48] and [sp, 56] from here on.
        CMP     x0, 2                   // if mr < 2
        STP     d10, d11, [sp, -48]!
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d12, d13, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d14, d15, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Eight 32-bit bias values are read, one per output column. Each
        // scalar load fills lane 0 of a row-0 accumulator and clears the
        // remaining lanes; the MOVs duplicate it into the row-1 accumulator.
        MOV     x0, x2                  // k = kc
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        LDP     x10, x11, [sp, 48]      // cn_stride, params
        MOV     v31.16b, v30.16b

        # Main loop - 8 bytes of A
        // Per iteration: 8 bytes from each A row and 64 bytes of B (eight
        // 8-byte vectors, one per output column). SMULL widens the int8
        // products to 16 bits; SADALP adds adjacent product pairs into the
        // 32-bit accumulator lanes.
        .p2align 3
1:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64              // advance w past the 64 bytes of B just consumed
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SUBS    x0, x0, 8               // k -= 8
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HI    1b                      // loop while k > 0

        # Add columns
        // Two rounds of pairwise adds collapse the four lanes of every
        // accumulator into one sum, leaving:
        //   v0 = row 0, columns 0-3    v1 = row 0, columns 4-7
        //   v2 = row 1, columns 0-3    v3 = row 1, columns 4-7
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Load per channel scale values from weights
        // The sums are converted to float and multiplied by eight scales read
        // from w (v4 = columns 0-3, v5 = columns 4-7, shared by both rows).
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        // Back to int32, rounding to nearest with ties to even.
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        // 16-bit value at params + 0, broadcast and added to the narrowed
        // results below (presumably the output zero point - confirm against
        // the params struct definition).
        LD1R    {v5.8h}, [x11], 2

        // Saturating narrow 32 -> 16 bits: v0 = row 0, v2 = row 1.
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        // nc -= 8. The flags set here are consumed by B.LO 2f and B.HI 0b
        // further down; no instruction in between writes the condition flags.
        SUBS    x1, x1, 8
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        // Saturating narrow 16 -> 8 bits: v0 bytes 0-7 = row 0,
        // bytes 8-15 = row 1.
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1      // byte at params + 2: lower clamp bound
        LD1R    {v2.16b}, [x11]         // byte at params + 3: upper clamp bound
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    2f                      // fewer than 8 columns were left

        # Store full 2 x 8
        ST1     {v0.8b}, [x6], x10      // c0: 8 bytes, then c0 += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10    // c1: 8 bytes, then c1 += cn_stride
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // more columns remain

        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

        # Store odd width
        // Reached only when fewer than 8 columns remained. Subtracting 8 left
        // bits 0-2 of x1 untouched, so they still hold the remaining column
        // count. After each partial store, EXT rotates v0 so the next unstored
        // bytes of row 0 sit at byte 0 and those of row 1 at byte 8.
        .p2align 3
2:
        TBZ     x1, 2, 3f               // no group of 4 columns to store
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

3:
        TBZ     x1, 1, 4f               // no group of 2 columns to store
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
4:
        TBZ     x1, 0, 5f               // no final single column to store
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
5:
        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
