// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// NOTE(review): this file is generated; comment changes made here should
// presumably be mirrored in the template named above - confirm.


#include <xnnpack/assembly.h>

// Signed 8-bit GEMM microkernel producing 1 row x 16 columns of int8 output
// per pass. Each pass:
//   1. loads 16 int32 accumulators from w,
//   2. accumulates 4-byte dot products of A against w with SDOT,
//   3. converts to float, multiplies by a scale, rounds back to int32,
//   4. narrows with saturation to int16, adds a 16-bit bias, narrows to int8,
//   5. clamps to [min, max] and stores 16 bytes (or fewer for the last block).
// The pass repeats, advancing c by cn_stride, until nc columns are written.
//
// Registers shown in parentheses in the argument list below are not read by
// this kernel. x0 (mr) is not read either; it is overwritten and reused as
// the running k counter.

# void xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# B   x5  v4  v5  v6  v7 v16 v17 v18 v19
# C0  x6 v28 v29 v30 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
//
// After accumulation, v0 v2 v4 v5 v6 are reused as temporaries for the
// scale / bias / clamp sequence. No callee-saved register is written and
// sp is only read, so the function has no prologue or epilogue.

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        BIC     x2, x2, 3
        // NOTE(review): with kc rounded up, as many as 3 bytes past the
        // nominal end of the A row are loaded; presumably callers guarantee
        // that this over-read is safe - confirm.

        .p2align 3
0:
        // Start of one pass over a block of up to 16 output columns.
        # Load initial bias from w into accumulators
        LDP     q28, q29, [x5], 32
        SUBS    x0, x2, 8               // k = kc - 8
        LDP     q30, q31, [x5], 32
        // params is reloaded on every pass because x11 is post-incremented
        // while the scale and bias are read below. LDR leaves the flags from
        // SUBS untouched for the branch that follows.
        LDR     x11, [sp, 8]            // params

        # Is there at least 8 bytes?
        B.LO    3f

        # Main loop - 8 bytes of A
        // Each iteration consumes 8 bytes of A (d0) and 128 bytes of w.
        // The first four w vectors pair with bytes 0-3 of A (lane 0), the
        // next four with bytes 4-7 (lane 1). Loads are interleaved with the
        // SDOTs; keep this order.
        .p2align 3
1:
        LDR     d0,  [x3], 8
        LDR     q16, [x5, 0]
        LDR     q17, [x5, 16]
        SDOT    v28.4s, v16.16b, v0.4b[0]
        LDR     q18, [x5, 32]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        LDR     q19, [x5, 48]
        SDOT    v30.4s, v18.16b, v0.4b[0]
        LDR     q4, [x5, 64]
        SDOT    v31.4s, v19.16b, v0.4b[0]
        LDR     q5, [x5, 80]
        SDOT    v28.4s, v4.16b, v0.4b[1]
        LDR     q6, [x5, 96]
        SDOT    v29.4s, v5.16b, v0.4b[1]
        LDR     q7, [x5, 112]
        SDOT    v30.4s, v6.16b, v0.4b[1]
        ADD     x5, x5, 128
        SDOT    v31.4s, v7.16b, v0.4b[1]
        SUBS    x0, x0, 8
        B.HS    1b

        # Is there a remainder? - 4 bytes of A
        // kc is a multiple of 4, so x0 is -8 (nothing left) or -4 (one
        // 4-byte group left) here; bit 2 is set only for -4.
        TBNZ    x0, 2, 3f

2:
        # Apply params - scale, bias and clamp
        SCVTF   v28.4s, v28.4s          // int32 accumulators -> float
        LD1R    {v4.4s}, [x11], 4       // v4 = 32-bit scale in all lanes
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // Float -> int32, rounding to nearest with ties to even.
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        LD1R    {v6.8h}, [x11], 2        // add bias
        // Saturating narrow int32 -> int16: v0 = columns 0-7, v2 = 8-15.
        SQXTN   v0.4h, v28.4s
        SQXTN   v2.4h, v30.4s
        SQXTN2  v0.8h, v29.4s
        SQXTN2  v2.8h, v31.4s

        // v4 = min and v5 = max, read as two consecutive bytes of params.
        LD2R    {v4.16b, v5.16b}, [x11]   // clamp to min/max
        SQADD   v0.8h, v0.8h, v6.8h
        SQADD   v2.8h, v2.8h, v6.8h
        LDR     x12, [sp]   // cn_stride
        // Saturating narrow int16 -> int8: v0 holds all 16 output bytes.
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v2.8h
        // nc -= 16. These flags feed both B.LO and B.NE below; none of the
        // instructions in between modify them.
        SUBS    x1, x1, 16
        SMAX    v0.16b, v0.16b, v4.16b
        SMIN    v0.16b, v0.16b, v5.16b
        B.LO    4f

        # Store full 1 x 16
        ST1     {v0.16b}, [x6], x12     // c += cn_stride after the store
        SUB     x3, x3, x2             // a0 -= kc
        B.NE    0b                      // more columns remain

        RET

        # Remainder - 4 bytes of A
        // Consumes 4 bytes of A (s0) and 64 bytes of w, then rejoins the
        // scale / bias / clamp sequence at label 2.
        .p2align 3
3:
        LDR     s0,  [x3], 4
        LDR     q16, [x5, 0]
        LDR     q17, [x5, 16]
        SDOT    v28.4s, v16.16b, v0.4b[0]
        LDR     q18, [x5, 32]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        LDR     q19, [x5, 48]
        SDOT    v30.4s, v18.16b, v0.4b[0]
        ADD     x5, x5, 64
        SDOT    v31.4s, v19.16b, v0.4b[0]
        B       2b

        # Store odd width
        // Reached with x1 = nc - 16, whose low four bits equal those of the
        // remaining column count (1 to 15). Bits 3, 2, 1, 0 select stores of
        // 8, 4, 2 and 1 bytes; after each store the next unwritten bytes of
        // v0 are moved down to the bottom of the register.
        .p2align 3
4:
        TBZ     x1, 3, 5f
        STR     d0, [x6], 8
        DUP     d0, v0.d[1]
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        DUP     s0, v0.s[1]
6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        DUP     h0, v0.h[1]
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
