// Auto-generated file. Do not edit!
// Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// void xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64(
//     size_t mr,                                  x0   (overwritten: reused as k counter)
//     size_t nc,                                  x1
//     size_t kc,                                  x2 / x0
//     const int8_t* restrict a,                   x3
//     size_t a_stride,                           (x4)  never read
//     const void* restrict w,                     x5
//     int8_t* restrict c,                         x6
//     size_t cm_stride,                          (x7)  never read
//     size_t cn_stride,                           [sp]     -> x12
//     const union xnn_qs8_minmax_params params)   [sp + 8] -> x11
//
// AAPCS64: d8-d15 and x19-x30 must be preserved if used, and x18 is reserved
// by the OS. This kernel uses none of them, so it needs no stack frame.
//
// Register usage
//   A0       x3   v0
//   B        x5   v16 v17 v18 v19 (first 4 bytes of k), v4 v5 v6 v7 (next 4 bytes of k)
//   C0       x6   v28 v29 v30 v31 (int32 accumulators), v0 v2 (narrowed result)
//   scale         v4 v5 v6        (reused after the k loop)
//   params   x11  v6 (16-bit addend), v4 v5 (clamp bounds)
//   unused        v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
        // Round kc up to a multiple of 4: each SDOT lane consumes 4 bytes of A.
        ADD     x2, x2, 3
        BIC     x2, x2, 3

        .p2align 3
0:
        // Outer loop: one pass per tile of up to 16 output columns.
        // Seed the accumulators with the 16 x int32 values at the head of w.
        LDP     q28, q29, [x5], 32
        SUBS    x0, x2, 8                       // k = kc - 8, flags feed B.LO below
        LDP     q30, q31, [x5], 32
        LDR     x11, [sp, 8]                    // params, reloaded each pass (LD1R below advances x11)

        // Fewer than 8 bytes of A in total: go straight to the 4-byte remainder.
        B.LO    3f

        // Main loop: 8 bytes of A against 128 bytes of packed weights.
        // Loads and SDOTs are interleaved, keep this order.
        .p2align 3
1:
        LDR     d0, [x3], 8                     // a0: two groups of 4 bytes
        LDR     q16, [x5]
        LDR     q17, [x5, 16]
        SDOT    v28.4s, v16.16b, v0.4b[0]       // columns  0-3,  k bytes 0-3
        LDR     q18, [x5, 32]
        SDOT    v29.4s, v17.16b, v0.4b[0]       // columns  4-7
        LDR     q19, [x5, 48]
        SDOT    v30.4s, v18.16b, v0.4b[0]       // columns  8-11
        LDR     q4, [x5, 64]
        SDOT    v31.4s, v19.16b, v0.4b[0]       // columns 12-15
        LDR     q5, [x5, 80]
        SDOT    v28.4s, v4.16b, v0.4b[1]        // columns  0-3,  k bytes 4-7
        LDR     q6, [x5, 96]
        SDOT    v29.4s, v5.16b, v0.4b[1]        // columns  4-7
        LDR     q7, [x5, 112]
        SDOT    v30.4s, v6.16b, v0.4b[1]        // columns  8-11
        ADD     x5, x5, 128
        SDOT    v31.4s, v7.16b, v0.4b[1]        // columns 12-15
        SUBS    x0, x0, 8
        B.HS    1b

        // kc is a multiple of 4, so x0 is now -8 (done) or -4 (bit 2 set,
        // 4 bytes of A still to process).
        TBNZ    x0, 2, 3f

2:
        // Requantize: int32 -> float, multiply by the per-channel scale
        // (16 floats read from w right after the weights), round to nearest int32.
        SCVTF   v28.4s, v28.4s
        LDR     q4, [x5], 16                    // scale, columns  0-3
        SCVTF   v29.4s, v29.4s
        LDR     q5, [x5], 16                    // scale, columns  4-7
        SCVTF   v30.4s, v30.4s
        LDR     q6, [x5], 16                    // scale, columns  8-11
        SCVTF   v31.4s, v31.4s
        FMUL    v28.4s, v28.4s, v4.4s
        LDR     q4, [x5], 16                    // scale, columns 12-15 (v4 is free again)
        FMUL    v29.4s, v29.4s, v5.4s
        FMUL    v30.4s, v30.4s, v6.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Broadcast the 16-bit addend stored at params + 0 (the template labels
        // it bias; presumably the output zero point, confirm against the params layout).
        LD1R    {v6.8h}, [x11], 2
        SQXTN   v0.4h, v28.4s                   // saturating narrow int32 -> int16
        SQXTN   v2.4h, v30.4s
        SQXTN2  v0.8h, v29.4s
        SQXTN2  v2.8h, v31.4s

        // Broadcast the two bytes at params + 2: v4 = lower bound, v5 = upper bound.
        LD2R    {v4.16b, v5.16b}, [x11]
        SQADD   v0.8h, v0.8h, v6.8h
        SQADD   v2.8h, v2.8h, v6.8h
        LDR     x12, [sp]                       // cn_stride
        SQXTN   v0.8b, v0.8h                    // saturating narrow int16 -> int8
        SQXTN2  v0.16b, v2.8h
        SUBS    x1, x1, 16                      // nc -= 16, flags feed B.LO and B.NE below
        SMAX    v0.16b, v0.16b, v4.16b
        SMIN    v0.16b, v0.16b, v5.16b
        B.LO    4f                              // fewer than 16 columns were left

        // Full tile: store 16 bytes and step c by cn_stride.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2                      // a0 -= kc (rewind A for the next tile)
        B.NE    0b                              // more columns remain

        RET

        // Remainder: the last 4 bytes of A against 64 bytes of packed weights.
        .p2align 3
3:
        LDR     s0, [x3], 4
        LDR     q16, [x5]
        LDR     q17, [x5, 16]
        SDOT    v28.4s, v16.16b, v0.4b[0]
        LDR     q18, [x5, 32]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        LDR     q19, [x5, 48]
        SDOT    v30.4s, v18.16b, v0.4b[0]
        ADD     x5, x5, 64
        SDOT    v31.4s, v19.16b, v0.4b[0]
        B       2b

        // Partial tile: x1 = nc - 16 is negative here, but its low four bits
        // equal those of the remaining column count. Store 8, 4, 2 and 1 bytes
        // according to bits 3..0, shifting the next lanes down after each store.
        .p2align 3
4:
        TBZ     x1, 3, 5f
        STR     d0, [x6], 8
        MOV     d0, v0.d[1]
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        MOV     s0, v0.s[1]
6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        MOV     h0, v0.h[1]
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
8:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif