// xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

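# Register usage
# A0  x3 v0
# B   x5 v16 v17 v18 v19
# C0  x6 v28 v29 v30 v31
# params x11 -> v4 (scale, later min) v5 (max) v6 (16-bit value added after narrowing)
# temp   v0 v2 (narrowed results; v0 is reused once the main loop has finished)
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
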
BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
0:
        # Top of the outer loop. One pass produces up to 16 output bytes.
        # Rounding kc is idempotent, so repeating it on every pass is harmless.
        # x0 (mr) is overwritten below and is never read by this kernel.
        # Load initial bias from w into accumulators
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        LDP     q28, q29, [x5], 32
        BIC     x2, x2, 3
        LDP     q30, q31, [x5], 32
        MOV     x0, x2                  // k = kc.  assumes kc > 0
        LDR     x11, [sp, 8]            // params (reloaded each pass, x11 is advanced below)

        # Main loop - 4 bytes of A
        # Each pass reads 4 bytes of A and 64 bytes of w, and accumulates one
        # 4-element signed dot product into each of the 16 int32 lanes of v28-v31.
        .p2align 3
1:
        LDR     s0,  [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        SDOT    v28.4s, v16.16b, v0.4b[0]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        SUBS    x0, x0, 4               // k -= 4
        SDOT    v30.4s, v18.16b, v0.4b[0]
        SDOT    v31.4s, v19.16b, v0.4b[0]
        B.HI    1b                      // loop while k > 0

        # Apply params - scale, bias and clamp
        # Sequence: int32 -> float, multiply by the 32-bit scale read from params,
        # round back to int32, narrow to int16, add the 16-bit params value,
        # narrow to int8, then clamp with the two params bytes.
        SCVTF   v28.4s, v28.4s
        LD1R    {v4.4s}, [x11], 4       // v4 = scale broadcast to all lanes
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        # FCVTNS rounds to nearest with ties to even
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        # NOTE(review): the 16-bit value in v6 is presumably the output zero
        # point - confirm against the params struct layout.
        LD1R    {v6.8h}, [x11], 2       // add bias
        SQXTN   v0.4h, v28.4s
        SQXTN   v2.4h, v30.4s
        SQXTN2  v0.8h, v29.4s
        SQXTN2  v2.8h, v31.4s

        # v4 is reused here: v4 = min (applied with SMAX), v5 = max (applied with SMIN)
        LD2R    {v4.16b, v5.16b}, [x11] // clamp to min/max
        SQADD   v0.8h, v0.8h, v6.8h
        SQADD   v2.8h, v2.8h, v6.8h
        LDR     x12, [sp]               // cn_stride
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v2.8h
        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.NE below
        SMAX    v0.16b, v0.16b, v4.16b
        SMIN    v0.16b, v0.16b, v5.16b
        B.LO    2f                      // fewer than 16 columns were left

        # Store full 1 x 16
        ST1     {v0.16b}, [x6], x12     // c0 += cn_stride
        SUB     x3,  x3, x2             // a0 -= kc
        B.NE    0b                      // flags still from SUBS x1: more columns remain
        RET

        # Store odd width
        # x1 = nc - 16 at this point. Its low 4 bits still equal the number of
        # remaining columns, so each tested bit selects one partial store.
        # After each store the next unwritten bytes are moved down to lane 0.
        .p2align 3
2:
        TBZ     x1, 3, 3f
        STR     d0, [x6], 8
        DUP     d0, v0.d[1]
3:
        TBZ     x1, 2, 4f
        STR     s0, [x6], 4
        DUP     s0, v0.s[1]
4:
        TBZ     x1, 1, 5f
        STR     h0, [x6], 2
        DUP     h0, v0.h[1]
5:
        TBZ     x1, 0, 6f
        STR     b0, [x6]
6:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32

#ifdef __ELF__
// An empty .note.GNU-stack section marks this object as not needing an executable stack.
.section ".note.GNU-stack","",%progbits
#endif
