xref: /aosp_15_r20/frameworks/rs/toolkit/Blur_advsimd.S (revision e1eccf28f96817838ad6867f7f39d2351ec11f56)
1*e1eccf28SAndroid Build Coastguard Worker/*
2*e1eccf28SAndroid Build Coastguard Worker * Copyright (C) 2014 The Android Open Source Project
3*e1eccf28SAndroid Build Coastguard Worker *
4*e1eccf28SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*e1eccf28SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*e1eccf28SAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*e1eccf28SAndroid Build Coastguard Worker *
8*e1eccf28SAndroid Build Coastguard Worker *      http://www.apache.org/licenses/LICENSE-2.0
9*e1eccf28SAndroid Build Coastguard Worker *
10*e1eccf28SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*e1eccf28SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*e1eccf28SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*e1eccf28SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*e1eccf28SAndroid Build Coastguard Worker * limitations under the License.
15*e1eccf28SAndroid Build Coastguard Worker */
16*e1eccf28SAndroid Build Coastguard Worker
/* Function-definition helpers.  ENTRY opens a global function symbol in .text,
 * PRIVATE opens one without the .globl directive, and END records the size of
 * the symbol as the distance from its label to the current location.  Both
 * openers issue `.align 4` before the label.
 */
17*e1eccf28SAndroid Build Coastguard Worker#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
18*e1eccf28SAndroid Build Coastguard Worker#define PRIVATE(f) .text; .align 4; .type f,#function; f:
19*e1eccf28SAndroid Build Coastguard Worker#define END(f) .size f, .-f;
20*e1eccf28SAndroid Build Coastguard Worker
21*e1eccf28SAndroid Build Coastguard Worker//#define ARCH_ARM64_USE_BLUR_PRELOAD
22*e1eccf28SAndroid Build Coastguard Worker
23*e1eccf28SAndroid Build Coastguard Worker/* Number of fractional bits to preserve in intermediate results.  The
24*e1eccf28SAndroid Build Coastguard Worker * intermediate storage is 16-bit, and we started with 8 bit data (the integer
25*e1eccf28SAndroid Build Coastguard Worker * part), so this should be between 0 and 8.
26*e1eccf28SAndroid Build Coastguard Worker */
/* FRACTION_BITS feeds the narrowing shifts at the end of the fetch and hconv
 * macros.  MAX_R is the default value of the max_r argument of the fetch
 * macro, i.e. the largest row distance for which vertical taps are emitted.
 */
27*e1eccf28SAndroid Build Coastguard Worker.set FRACTION_BITS, 7
28*e1eccf28SAndroid Build Coastguard Worker.set MAX_R, 25
29*e1eccf28SAndroid Build Coastguard Worker
30*e1eccf28SAndroid Build Coastguard Worker
31*e1eccf28SAndroid Build Coastguard Worker/* A quick way of making a line of code conditional on some other condition.
32*e1eccf28SAndroid Build Coastguard Worker * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
33*e1eccf28SAndroid Build Coastguard Worker * `ifcc`:
34*e1eccf28SAndroid Build Coastguard Worker */
/* ifcc <statement>: emit <statement> unchanged when the assembler symbol cc is
 * non-zero at the point of expansion, and emit nothing otherwise.  The whole
 * remainder of the line is captured through the vararg parameter.
 */
35*e1eccf28SAndroid Build Coastguard Worker.macro ifcc zzz:vararg
36*e1eccf28SAndroid Build Coastguard Worker.if cc
37*e1eccf28SAndroid Build Coastguard Worker            \zzz
38*e1eccf28SAndroid Build Coastguard Worker.endif
39*e1eccf28SAndroid Build Coastguard Worker.endm
40*e1eccf28SAndroid Build Coastguard Worker
41*e1eccf28SAndroid Build Coastguard Worker/* It's not always clear that prefetching is beneficial and this needs further
42*e1eccf28SAndroid Build Coastguard Worker * testing on different cores, so it's made switchable here.
43*e1eccf28SAndroid Build Coastguard Worker */
/* VERTPLD(address operands) expands to a PLDL1KEEP prefetch of the given
 * address when ARCH_ARM64_USE_BLUR_PRELOAD is defined and to a nop otherwise.
 * Either form is a single instruction, so the size of the unrolled loop bodies
 * that use it does not depend on the setting.
 */
44*e1eccf28SAndroid Build Coastguard Worker#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
45*e1eccf28SAndroid Build Coastguard Worker#define VERTPLD(...) prfm        PLDL1KEEP, [__VA_ARGS__]
46*e1eccf28SAndroid Build Coastguard Worker#else
47*e1eccf28SAndroid Build Coastguard Worker#define VERTPLD(...) nop
48*e1eccf28SAndroid Build Coastguard Worker#endif
49*e1eccf28SAndroid Build Coastguard Worker
50*e1eccf28SAndroid Build Coastguard Worker/* Fetch 16 columns of bytes (regardless of image format), convolve these
51*e1eccf28SAndroid Build Coastguard Worker * vertically, and leave them in the register file.  If working near the top or
52*e1eccf28SAndroid Build Coastguard Worker * bottom of an image then clamp the addressing while loading the data in.
53*e1eccf28SAndroid Build Coastguard Worker *
54*e1eccf28SAndroid Build Coastguard Worker * The convolution is fully unrolled for windows up to max_r, with the
55*e1eccf28SAndroid Build Coastguard Worker * outermost edges calculated first.  This way it's possible to branch directly
56*e1eccf28SAndroid Build Coastguard Worker * into the relevant part of the code for an arbitrary convolution radius.  Two
57*e1eccf28SAndroid Build Coastguard Worker * variants of the loop are produced; one eliminates the clamping code for a
58*e1eccf28SAndroid Build Coastguard Worker * slight speed advantage.
59*e1eccf28SAndroid Build Coastguard Worker *
60*e1eccf28SAndroid Build Coastguard Worker * Where the macro is called with reg=x, the specified register is taken to
61*e1eccf28SAndroid Build Coastguard Worker * contain a pre-calculated pointer into one of the two loops.
62*e1eccf28SAndroid Build Coastguard Worker *
63*e1eccf28SAndroid Build Coastguard Worker * Input:
64*e1eccf28SAndroid Build Coastguard Worker *      x1 -- src
65*e1eccf28SAndroid Build Coastguard Worker *      x2 -- pitch
66*e1eccf28SAndroid Build Coastguard Worker *      x5 -- r
67*e1eccf28SAndroid Build Coastguard Worker *      x6 -- rup (r, unless clipped to top of source image)
68*e1eccf28SAndroid Build Coastguard Worker *      x7 -- rdn (r, unless clipped to bottom of source image)
69*e1eccf28SAndroid Build Coastguard Worker *      x12 -- switch index
70*e1eccf28SAndroid Build Coastguard Worker *      v0-v3 -- coefficient table
71*e1eccf28SAndroid Build Coastguard Worker *      x13 = -pitch
72*e1eccf28SAndroid Build Coastguard Worker *      x15 = top-row in
73*e1eccf28SAndroid Build Coastguard Worker *      x19 = bottom-row in
74*e1eccf28SAndroid Build Coastguard Worker * Output:
75*e1eccf28SAndroid Build Coastguard Worker *      x1 += 16
76*e1eccf28SAndroid Build Coastguard Worker *      v10,v11 -- 16 convolved columns
77*e1eccf28SAndroid Build Coastguard Worker * Modifies:
78*e1eccf28SAndroid Build Coastguard Worker *      x10 = upper row pointer
79*e1eccf28SAndroid Build Coastguard Worker *      x11 = lower row pointer
80*e1eccf28SAndroid Build Coastguard Worker *      v12-v16 = temporary sums
81*e1eccf28SAndroid Build Coastguard Worker */
82*e1eccf28SAndroid Build Coastguard Worker.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
/* With the default reg=x12 the entry address into the clamped loop is computed
 * below (cc=1 enables the ifcc lines).  With any other reg the ifcc lines are
 * dropped and the register is branched to exactly as the caller supplied it.
 */
83*e1eccf28SAndroid Build Coastguard Worker  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
84*e1eccf28SAndroid Build Coastguard Worker
/* Load 16 bytes from x1 (post-incremented by 16) and start the upper row
 * pointer x10 at x15.
 */
85*e1eccf28SAndroid Build Coastguard Worker            ld1         {v15.16b}, [x1], #16
86*e1eccf28SAndroid Build Coastguard Worker            mov         x10, x15
87*e1eccf28SAndroid Build Coastguard Worker
/* Widen the 16 loaded bytes to 16-bit lanes in v14 (low half) and v15 (high
 * half).
 */
88*e1eccf28SAndroid Build Coastguard Worker            uxtl        v14.8h, v15.8b
89*e1eccf28SAndroid Build Coastguard Worker            VERTPLD(x1, #16)
90*e1eccf28SAndroid Build Coastguard Worker            uxtl2       v15.8h, v15.16b
/* reg = address of local label 1, which sits at the end of the clamped loop.
 * adr is used for the smaller max_r values and adrp plus add otherwise.
 */
91*e1eccf28SAndroid Build Coastguard Worker  .if \max_r < 16 // approximate
92*e1eccf28SAndroid Build Coastguard Worker    ifcc    adr         \reg, 1f
93*e1eccf28SAndroid Build Coastguard Worker  .else
94*e1eccf28SAndroid Build Coastguard Worker    ifcc    adrp        \reg, 1f
95*e1eccf28SAndroid Build Coastguard Worker    ifcc    add         \reg, \reg, #:lo12:1f
96*e1eccf28SAndroid Build Coastguard Worker  .endif
97*e1eccf28SAndroid Build Coastguard Worker
/* Seed the 32-bit accumulators v12-v15 with the widened data times v0.h[0],
 * interleaved with the rest of the entry-address arithmetic:
 *     reg = label 1 - (x5 << 6) + (x5 << 3) = label 1 - 56 * x5
 * 56 bytes is the size of one emitted vertfetch_clamped iteration (14
 * instructions of 4 bytes each).  x11, the lower row pointer, starts at x19.
 */
98*e1eccf28SAndroid Build Coastguard Worker            umull       v12.4s, v14.4h, v0.h[0]
99*e1eccf28SAndroid Build Coastguard Worker    ifcc    sub         \reg, \reg, x5, LSL #6
100*e1eccf28SAndroid Build Coastguard Worker            umull2      v13.4s, v14.8h, v0.h[0]
101*e1eccf28SAndroid Build Coastguard Worker            mov         x11, x19
102*e1eccf28SAndroid Build Coastguard Worker            umull       v14.4s, v15.4h, v0.h[0]
103*e1eccf28SAndroid Build Coastguard Worker    ifcc    add         \reg, \reg, x5, LSL #3
104*e1eccf28SAndroid Build Coastguard Worker            umull2      v15.4s, v15.8h, v0.h[0]
105*e1eccf28SAndroid Build Coastguard Worker            br          \reg
106*e1eccf28SAndroid Build Coastguard Worker
107*e1eccf28SAndroid Build Coastguard Worker  /* This version of the vertical fetch loop body is used away from the edges
108*e1eccf28SAndroid Build Coastguard Worker   * of the source image.  The pointers start at the top and bottom source rows
109*e1eccf28SAndroid Build Coastguard Worker   * and work their way towards the centre on each iteration.  This way the
110*e1eccf28SAndroid Build Coastguard Worker   * number of taps used can be controlled by jumping directly into the middle
111*e1eccf28SAndroid Build Coastguard Worker   * of the loop and running to completion.
112*e1eccf28SAndroid Build Coastguard Worker   * If the loop body changes size then the code which calculates the address of
113*e1eccf28SAndroid Build Coastguard Worker   * the initial iteration must be updated accordingly.
114*e1eccf28SAndroid Build Coastguard Worker   */
115*e1eccf28SAndroid Build Coastguard Worker  .macro vertfetch_noclamp i, dreg
    /* Emitted only for 0 < i <= max_r; 10 instructions when emitted.  Loads
     * 16 bytes through each row pointer (x10 then steps by x2, x11 by x13),
     * adds the two rows into 16-bit lanes (v16 and v11) and accumulates that
     * sum times dreg into v12-v15.
     */
116*e1eccf28SAndroid Build Coastguard Worker    .if 0 < \i && \i <= \max_r
117*e1eccf28SAndroid Build Coastguard Worker            ld1         {v10.16b}, [x10], x2
118*e1eccf28SAndroid Build Coastguard Worker            ld1         {v11.16b}, [x11], x13
119*e1eccf28SAndroid Build Coastguard Worker            uaddl       v16.8h, v10.8b, v11.8b
120*e1eccf28SAndroid Build Coastguard Worker            uaddl2      v11.8h, v10.16b, v11.16b
121*e1eccf28SAndroid Build Coastguard Worker            umlal       v12.4s, v16.4h, \dreg
122*e1eccf28SAndroid Build Coastguard Worker            umlal2      v13.4s, v16.8h, \dreg
123*e1eccf28SAndroid Build Coastguard Worker            VERTPLD(x10, #32)
124*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v11.4h, \dreg
125*e1eccf28SAndroid Build Coastguard Worker            VERTPLD(x11, #32)
126*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v11.8h, \dreg
127*e1eccf28SAndroid Build Coastguard Worker    .endif
128*e1eccf28SAndroid Build Coastguard Worker  .endm
129*e1eccf28SAndroid Build Coastguard Worker
130*e1eccf28SAndroid Build Coastguard Worker  /* This version of the vertical fetch loop body is used near the edges of the
131*e1eccf28SAndroid Build Coastguard Worker   * source image, where one or both of the accesses may start with a clamped
132*e1eccf28SAndroid Build Coastguard Worker   * value, and the row addresses only begin to change after some number of
133*e1eccf28SAndroid Build Coastguard Worker   * iterations before the end.
134*e1eccf28SAndroid Build Coastguard Worker   * If the loop body changes size then the code which calculates the address of
135*e1eccf28SAndroid Build Coastguard Worker   * the initial iteration must be updated accordingly.
136*e1eccf28SAndroid Build Coastguard Worker   */
137*e1eccf28SAndroid Build Coastguard Worker  .macro vertfetch_clamped i, dreg
    /* Same arithmetic as vertfetch_noclamp plus two cmp/csel pairs, giving 14
     * instructions when emitted.  After each post-indexed load the pointer is
     * put back to its starting row (x15 or x19) while the matching limit (x6
     * or x7) is unsigned-lower than i, so a pointer only keeps its advance
     * once i is within that limit.  The cmp instructions overwrite the flags.
     */
138*e1eccf28SAndroid Build Coastguard Worker    .if 0 < \i && \i <= \max_r
139*e1eccf28SAndroid Build Coastguard Worker            ld1         {v10.16b}, [x10], x2
140*e1eccf28SAndroid Build Coastguard Worker            cmp         x6, #\i
141*e1eccf28SAndroid Build Coastguard Worker            ld1         {v11.16b}, [x11], x13
142*e1eccf28SAndroid Build Coastguard Worker            csel        x10, x15, x10, lo
143*e1eccf28SAndroid Build Coastguard Worker            uaddl       v16.8h, v10.8b, v11.8b
144*e1eccf28SAndroid Build Coastguard Worker            cmp         x7, #\i
145*e1eccf28SAndroid Build Coastguard Worker            uaddl2      v11.8h, v10.16b, v11.16b
146*e1eccf28SAndroid Build Coastguard Worker            csel        x11, x19, x11, lo
147*e1eccf28SAndroid Build Coastguard Worker            umlal       v12.4s, v16.4h, \dreg
148*e1eccf28SAndroid Build Coastguard Worker            umlal2      v13.4s, v16.8h, \dreg
149*e1eccf28SAndroid Build Coastguard Worker            VERTPLD(x10, #32)
150*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v11.4h, \dreg
151*e1eccf28SAndroid Build Coastguard Worker            VERTPLD(x11, #32)
152*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v11.8h, \dreg
153*e1eccf28SAndroid Build Coastguard Worker    .endif
154*e1eccf28SAndroid Build Coastguard Worker  .endm
155*e1eccf28SAndroid Build Coastguard Worker
156*e1eccf28SAndroid Build Coastguard Worker  /* Entry into this unrolled loop is computed as a negative index from
157*e1eccf28SAndroid Build Coastguard Worker   * \labelc at the end of the block.
158*e1eccf28SAndroid Build Coastguard Worker   */
  /* Invocations whose index is above max_r, and the one with index 0, expand
   * to nothing because of the range test inside the macro.
   */
159*e1eccf28SAndroid Build Coastguard Worker  .align 4
160*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 27, v3.h[3]
161*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 26, v3.h[2]
162*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 25, v3.h[1]
163*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 24, v3.h[0]
164*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 23, v2.h[7]
165*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 22, v2.h[6]
166*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 21, v2.h[5]
167*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 20, v2.h[4]
168*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 19, v2.h[3]
169*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 18, v2.h[2]
170*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 17, v2.h[1]
171*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 16, v2.h[0]
172*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 15, v1.h[7]
173*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 14, v1.h[6]
174*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 13, v1.h[5]
175*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 12, v1.h[4]
176*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 11, v1.h[3]
177*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped 10, v1.h[2]
178*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  9, v1.h[1]
179*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  8, v1.h[0]
180*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  7, v0.h[7]
181*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  6, v0.h[6]
182*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  5, v0.h[5]
183*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  4, v0.h[4]
184*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  3, v0.h[3]
185*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  2, v0.h[2]
186*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  1, v0.h[1]
187*e1eccf28SAndroid Build Coastguard Worker  vertfetch_clamped  0, v0.h[0]
  /* Local label 1 is the base used by the adr/adrp sequence above. */
188*e1eccf28SAndroid Build Coastguard Worker  1:
189*e1eccf28SAndroid Build Coastguard Worker  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */
190*e1eccf28SAndroid Build Coastguard Worker
191*e1eccf28SAndroid Build Coastguard Worker  /* Entry into this unrolled loop is computed as a negative index from
192*e1eccf28SAndroid Build Coastguard Worker   * \labelnc at the end of the block.
193*e1eccf28SAndroid Build Coastguard Worker   */
194*e1eccf28SAndroid Build Coastguard Worker  .align 4
195*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 27, v3.h[3]
196*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 26, v3.h[2]
197*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 25, v3.h[1]
198*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 24, v3.h[0]
199*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 23, v2.h[7]
200*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 22, v2.h[6]
201*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 21, v2.h[5]
202*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 20, v2.h[4]
203*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 19, v2.h[3]
204*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 18, v2.h[2]
205*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 17, v2.h[1]
206*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 16, v2.h[0]
207*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 15, v1.h[7]
208*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 14, v1.h[6]
209*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 13, v1.h[5]
210*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 12, v1.h[4]
211*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 11, v1.h[3]
212*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp 10, v1.h[2]
213*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  9, v1.h[1]
214*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  8, v1.h[0]
215*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  7, v0.h[7]
216*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  6, v0.h[6]
217*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  5, v0.h[5]
218*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  4, v0.h[4]
219*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  3, v0.h[3]
220*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  2, v0.h[2]
221*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  1, v0.h[1]
222*e1eccf28SAndroid Build Coastguard Worker  vertfetch_noclamp  0, v0.h[0]
223*e1eccf28SAndroid Build Coastguard Worker  \labelnc :
224*e1eccf28SAndroid Build Coastguard Worker
  /* The two helper macros are removed again so that the next expansion of
   * fetch can define them afresh with its own max_r.
   */
225*e1eccf28SAndroid Build Coastguard Worker  .purgem vertfetch_clamped
226*e1eccf28SAndroid Build Coastguard Worker  .purgem vertfetch_noclamp
227*e1eccf28SAndroid Build Coastguard Worker
  /* Both loops finish here.  The 32-bit sums in v12-v15 are narrowed into the
   * 16-bit lanes of v10 and v11 by a rounding, saturating right shift of
   * 16 - FRACTION_BITS, and x15/x19 are moved on by 16 bytes for the next
   * group of columns.
   */
228*e1eccf28SAndroid Build Coastguard Worker  2:        uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
229*e1eccf28SAndroid Build Coastguard Worker            add         x15, x15, #16
230*e1eccf28SAndroid Build Coastguard Worker            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
231*e1eccf28SAndroid Build Coastguard Worker            add         x19, x19, #16
232*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
233*e1eccf28SAndroid Build Coastguard Worker            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
234*e1eccf28SAndroid Build Coastguard Worker.endm /*}}}*/
235*e1eccf28SAndroid Build Coastguard Worker
236*e1eccf28SAndroid Build Coastguard Worker/* Some portion of the convolution window (as much as will fit, and all of it
237*e1eccf28SAndroid Build Coastguard Worker * for the uchar1 cases) is kept in the register file to avoid unnecessary
238*e1eccf28SAndroid Build Coastguard Worker * memory accesses.  This forces the horizontal loops to be unrolled because
239*e1eccf28SAndroid Build Coastguard Worker * there's no indexed addressing into the register file.
240*e1eccf28SAndroid Build Coastguard Worker *
241*e1eccf28SAndroid Build Coastguard Worker * As in the fetch macro, the operations are ordered from outside to inside, so
242*e1eccf28SAndroid Build Coastguard Worker * that jumping into the middle of the block bypasses the unwanted window taps.
243*e1eccf28SAndroid Build Coastguard Worker *
244*e1eccf28SAndroid Build Coastguard Worker * There are several variants of the macro because of the fixed offsets of the
245*e1eccf28SAndroid Build Coastguard Worker * taps -- the wider the maximum radius the further the centre tap is from the
246*e1eccf28SAndroid Build Coastguard Worker * most recently fetched data.  This means that pre-filling the window requires
247*e1eccf28SAndroid Build Coastguard Worker * more data that won't be used and it means that rotating the window involves
248*e1eccf28SAndroid Build Coastguard Worker * more mov operations.
249*e1eccf28SAndroid Build Coastguard Worker *
250*e1eccf28SAndroid Build Coastguard Worker * When the buffer gets too big the buffer at [x9] is used.
251*e1eccf28SAndroid Build Coastguard Worker *
252*e1eccf28SAndroid Build Coastguard Worker * Input:
253*e1eccf28SAndroid Build Coastguard Worker *      v16-v31,v4-v11 -- convolution window
254*e1eccf28SAndroid Build Coastguard Worker *      x9 -- pointer to additional convolution window data
255*e1eccf28SAndroid Build Coastguard Worker * Output:
256*e1eccf28SAndroid Build Coastguard Worker *      x9 -- updated buffer pointer (if used)
257*e1eccf28SAndroid Build Coastguard Worker *      d31 -- result to be stored
258*e1eccf28SAndroid Build Coastguard Worker * Modifies:
259*e1eccf28SAndroid Build Coastguard Worker *      x12 -- temp buffer pointer
260*e1eccf28SAndroid Build Coastguard Worker *      v12-v13 -- temporaries for load and vext operations.
261*e1eccf28SAndroid Build Coastguard Worker *      v14-v15 -- intermediate sums
262*e1eccf28SAndroid Build Coastguard Worker */
/* NOTE(review): TUNED_LIST1 presumably names the maximum radii for which
 * hconv1_* variants exist (8 and 16); its use is not visible in this part of
 * the file -- confirm.
 */
263*e1eccf28SAndroid Build Coastguard Worker#define TUNED_LIST1 8, 16
/* hconv1_8: horizontal pass over a window held in v8, v9 and v10 as 16-bit
 * lanes, with v9 as the centre vector.  x5 selects how many tap pairs are
 * applied (labels 101 to 108).  The 8-bit result is left in v15.8b and the
 * window is then moved along by one vector.  x16 is used as scratch in
 * addition to x12.
 */
264*e1eccf28SAndroid Build Coastguard Worker.macro hconv1_8/*{{{*/
265*e1eccf28SAndroid Build Coastguard Worker
/* Jump table of signed 16-bit offsets relative to label 100, indexed by x5:
 * entry k (1 <= k <= 8) is the offset of label 10k.  Entry 0 is -4, which
 * would land on the add immediately before label 100.
 * NOTE(review): x5 == 0 is presumably never passed in -- confirm against the
 * callers.
 */
266*e1eccf28SAndroid Build Coastguard Worker.rodata
267*e1eccf28SAndroid Build Coastguard Worker    200:    .hword -4
268*e1eccf28SAndroid Build Coastguard Worker            .hword 101f-100f
269*e1eccf28SAndroid Build Coastguard Worker            .hword 102f-100f
270*e1eccf28SAndroid Build Coastguard Worker            .hword 103f-100f
271*e1eccf28SAndroid Build Coastguard Worker            .hword 104f-100f
272*e1eccf28SAndroid Build Coastguard Worker            .hword 105f-100f
273*e1eccf28SAndroid Build Coastguard Worker            .hword 106f-100f
274*e1eccf28SAndroid Build Coastguard Worker            .hword 107f-100f
275*e1eccf28SAndroid Build Coastguard Worker            .hword 108f-100f
276*e1eccf28SAndroid Build Coastguard Worker            .align      4
277*e1eccf28SAndroid Build Coastguard Worker.text
/* Centre tap: the 32-bit accumulators v14/v15 start as v9 times v0.h[0]. */
278*e1eccf28SAndroid Build Coastguard Worker            umull       v14.4s, v9.4h, v0.h[0]
279*e1eccf28SAndroid Build Coastguard Worker            umull2      v15.4s, v9.8h, v0.h[0]
280*e1eccf28SAndroid Build Coastguard Worker
/* x12 = address of label 100 + table[x5]; the branch at label 100 then enters
 * the tap sequence part-way through and falls through to the end.
 */
281*e1eccf28SAndroid Build Coastguard Worker            adrp        x16, 200b
282*e1eccf28SAndroid Build Coastguard Worker            add         x16, x16, :lo12:200b
283*e1eccf28SAndroid Build Coastguard Worker            ldrsh       x12, [x16, x5, LSL #1]
284*e1eccf28SAndroid Build Coastguard Worker            adr         x16, 100f
285*e1eccf28SAndroid Build Coastguard Worker            add         x12, x12, x16
286*e1eccf28SAndroid Build Coastguard Worker    100:    br          x12
/* Label 10k applies coefficient k to the two vectors that sit k lanes before
 * and k lanes after v9.  At distance 8 these are v8 and v10 themselves; for
 * the other distances they are assembled with ext from (v8, v9) and
 * (v9, v10).
 */
287*e1eccf28SAndroid Build Coastguard Worker    108:    umlal       v14.4s, v8.4h, v1.h[0]
288*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v8.8h, v1.h[0]
289*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v10.4h, v1.h[0]
290*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v10.8h, v1.h[0]
291*e1eccf28SAndroid Build Coastguard Worker    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
292*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #7*2
293*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[7]
294*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[7]
295*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[7]
296*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[7]
297*e1eccf28SAndroid Build Coastguard Worker    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
298*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #6*2
299*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[6]
300*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[6]
301*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[6]
302*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[6]
303*e1eccf28SAndroid Build Coastguard Worker    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
304*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #5*2
305*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[5]
306*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[5]
307*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[5]
308*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[5]
/* Distance 4 needs no ext: an 8-byte ext would just pair the high half of the
 * first vector with the low half of the second, so the halves are addressed
 * directly (umlal2 on the high half into v14, umlal on the low half into v15).
 */
309*e1eccf28SAndroid Build Coastguard Worker    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
310*e1eccf28SAndroid Build Coastguard Worker            //ext         v13.16b, v9.16b, v10.16b, #4*2
311*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v8.8h, v0.h[4]
312*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v9.4h, v0.h[4]
313*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v9.8h, v0.h[4]
314*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v10.4h, v0.h[4]
315*e1eccf28SAndroid Build Coastguard Worker    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
316*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #3*2
317*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[3]
318*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[3]
319*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[3]
320*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[3]
321*e1eccf28SAndroid Build Coastguard Worker    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
322*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #2*2
323*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[2]
324*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[2]
325*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[2]
326*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[2]
327*e1eccf28SAndroid Build Coastguard Worker    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
328*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #1*2
329*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[1]
330*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[1]
331*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[1]
332*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[1]
333*e1eccf28SAndroid Build Coastguard Worker
/* Narrow in two rounding, saturating steps: 32-bit sums to 16 bits by a shift
 * of 16, then 16 bits to 8 bits by a shift of FRACTION_BITS, leaving the
 * result in v15.8b.
 */
334*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v14.4h, v14.4s, #16
335*e1eccf28SAndroid Build Coastguard Worker            uqrshrn2    v14.8h, v15.4s, #16
336*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
337*e1eccf28SAndroid Build Coastguard Worker
/* Move the window along by one vector; v11 supplies the incoming data. */
338*e1eccf28SAndroid Build Coastguard Worker            mov         v8.16b, v9.16b
339*e1eccf28SAndroid Build Coastguard Worker            mov         v9.16b, v10.16b
340*e1eccf28SAndroid Build Coastguard Worker            mov         v10.16b, v11.16b
341*e1eccf28SAndroid Build Coastguard Worker.endm/*}}}*/
342*e1eccf28SAndroid Build Coastguard Worker
343*e1eccf28SAndroid Build Coastguard Worker.macro hconv1_16/*{{{*/
344*e1eccf28SAndroid Build Coastguard Worker.rodata
345*e1eccf28SAndroid Build Coastguard Worker   200:     .hword -4
346*e1eccf28SAndroid Build Coastguard Worker            .hword 101f-100f
347*e1eccf28SAndroid Build Coastguard Worker            .hword 102f-100f
348*e1eccf28SAndroid Build Coastguard Worker            .hword 103f-100f
349*e1eccf28SAndroid Build Coastguard Worker            .hword 104f-100f
350*e1eccf28SAndroid Build Coastguard Worker            .hword 105f-100f
351*e1eccf28SAndroid Build Coastguard Worker            .hword 106f-100f
352*e1eccf28SAndroid Build Coastguard Worker            .hword 107f-100f
353*e1eccf28SAndroid Build Coastguard Worker            .hword 108f-100f
354*e1eccf28SAndroid Build Coastguard Worker            .hword 109f-100f
355*e1eccf28SAndroid Build Coastguard Worker            .hword 110f-100f
356*e1eccf28SAndroid Build Coastguard Worker            .hword 111f-100f
357*e1eccf28SAndroid Build Coastguard Worker            .hword 112f-100f
358*e1eccf28SAndroid Build Coastguard Worker            .hword 113f-100f
359*e1eccf28SAndroid Build Coastguard Worker            .hword 114f-100f
360*e1eccf28SAndroid Build Coastguard Worker            .hword 115f-100f
361*e1eccf28SAndroid Build Coastguard Worker            .hword 116f-100f
362*e1eccf28SAndroid Build Coastguard Worker            .align 4
363*e1eccf28SAndroid Build Coastguard Worker
364*e1eccf28SAndroid Build Coastguard Worker.text
365*e1eccf28SAndroid Build Coastguard Worker            umull       v14.4s, v8.4h, v0.h[0]
366*e1eccf28SAndroid Build Coastguard Worker            umull2      v15.4s, v8.8h, v0.h[0]
367*e1eccf28SAndroid Build Coastguard Worker
368*e1eccf28SAndroid Build Coastguard Worker            adrp        x16, 200b
369*e1eccf28SAndroid Build Coastguard Worker            add         x16, x16, :lo12:200b
370*e1eccf28SAndroid Build Coastguard Worker            ldrsh       x12, [x16, x5, LSL #1]
371*e1eccf28SAndroid Build Coastguard Worker            adr         x16, 100f
372*e1eccf28SAndroid Build Coastguard Worker            add         x12, x12, x16
373*e1eccf28SAndroid Build Coastguard Worker    100:    br          x12
374*e1eccf28SAndroid Build Coastguard Worker    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
375*e1eccf28SAndroid Build Coastguard Worker            //ext         v13.16b, v10.16b, v11.16b, #0*2
376*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v6.4h, v2.h[0]
377*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v6.8h, v2.h[0]
378*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v10.4h, v2.h[0]
379*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v10.8h, v2.h[0]
380*e1eccf28SAndroid Build Coastguard Worker    115:    ext         v12.16b, v6.16b, v7.16b, #1*2
381*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #7*2
382*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[7]
383*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[7]
384*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[7]
385*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[7]
386*e1eccf28SAndroid Build Coastguard Worker    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
387*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #6*2
388*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[6]
389*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[6]
390*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[6]
391*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[6]
392*e1eccf28SAndroid Build Coastguard Worker    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
393*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #5*2
394*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[5]
395*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[5]
396*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[5]
397*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[5]
398*e1eccf28SAndroid Build Coastguard Worker    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
399*e1eccf28SAndroid Build Coastguard Worker            //ext         v13.16b, v9.16b, v10.16b, #4*2
400*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v6.8h, v1.h[4]
401*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v7.4h, v1.h[4]
402*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v9.8h, v1.h[4]
403*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v10.4h, v1.h[4]
404*e1eccf28SAndroid Build Coastguard Worker    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
405*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #3*2
406*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[3]
407*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[3]
408*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[3]
409*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[3]
410*e1eccf28SAndroid Build Coastguard Worker    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
411*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #2*2
412*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[2]
413*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[2]
414*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[2]
415*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[2]
416*e1eccf28SAndroid Build Coastguard Worker    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
417*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #1*2
418*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[1]
419*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[1]
420*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[1]
421*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[1]
422*e1eccf28SAndroid Build Coastguard Worker    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
423*e1eccf28SAndroid Build Coastguard Worker            //ext         v13.16b, v9.16b, v10.16b, #0*2
424*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v7.4h, v1.h[0]
425*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v7.8h, v1.h[0]
426*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v9.4h, v1.h[0]
427*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v9.8h, v1.h[0]
428*e1eccf28SAndroid Build Coastguard Worker    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
429*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #7*2
430*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[7]
431*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[7]
432*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[7]
433*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[7]
434*e1eccf28SAndroid Build Coastguard Worker    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
435*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #6*2
436*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[6]
437*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[6]
438*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[6]
439*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[6]
440*e1eccf28SAndroid Build Coastguard Worker    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
441*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #5*2
442*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[5]
443*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[5]
444*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[5]
445*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[5]
446*e1eccf28SAndroid Build Coastguard Worker    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
447*e1eccf28SAndroid Build Coastguard Worker            //ext         v13.16b, v8.16b, v9.16b, #4*2
448*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v7.8h, v0.h[4]
449*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v8.4h, v0.h[4]
450*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v8.8h, v0.h[4]
451*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v9.4h, v0.h[4]
452*e1eccf28SAndroid Build Coastguard Worker    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
453*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #3*2
454*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[3]
455*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[3]
456*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[3]
457*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[3]
458*e1eccf28SAndroid Build Coastguard Worker    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
459*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #2*2
460*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[2]
461*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[2]
462*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[2]
463*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[2]
464*e1eccf28SAndroid Build Coastguard Worker    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
465*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #1*2
466*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[1]
467*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[1]
468*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[1]
469*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[1]
470*e1eccf28SAndroid Build Coastguard Worker
471*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v14.4h, v14.4s, #16
472*e1eccf28SAndroid Build Coastguard Worker            uqrshrn2    v14.8h, v15.4s, #16
473*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
474*e1eccf28SAndroid Build Coastguard Worker
475*e1eccf28SAndroid Build Coastguard Worker            mov         v6.16b, v7.16b
476*e1eccf28SAndroid Build Coastguard Worker            mov         v7.16b, v8.16b
477*e1eccf28SAndroid Build Coastguard Worker            mov         v8.16b, v9.16b
478*e1eccf28SAndroid Build Coastguard Worker            mov         v9.16b, v10.16b
479*e1eccf28SAndroid Build Coastguard Worker            mov         v10.16b, v11.16b
480*e1eccf28SAndroid Build Coastguard Worker.endm/*}}}*/
481*e1eccf28SAndroid Build Coastguard Worker
/* hconv1_25: one step of a symmetric horizontal convolution with up to 25
 * taps on each side of the centre, over data handled as a single run of
 * 16-bit lanes.
 *
 * Reads:
 *   x5           index into the jump table at 200; entry k enters the tap
 *                chain at label (100+k), so taps k down to 1 run by
 *                fall-through.  Presumably the blur radius, 1..25 -- TODO
 *                confirm against the callers.
 *   v0-v3        16-bit coefficients.  v0.h[0] scales the centre sample; tap k
 *                uses lane (k & 7) of register v(k >> 3).
 *   v31, v4-v11  window of 16-bit samples; consecutive registers are used as
 *                consecutive groups of 8 lanes.  The centre sample for output
 *                lane 0 is lane 7 of v6.
 * Writes:
 *   v15.8b       eight results narrowed to 8 bits.
 *   v14.8h       the same results at 16 bits, before the final narrowing.
 *   v31, v4-v10  window advanced by one register.  v11 is not written here
 *                (presumably refilled by the caller -- TODO confirm).
 * Scratch: x12, x16, v12, v13.
 */
482*e1eccf28SAndroid Build Coastguard Worker.macro hconv1_25/*{{{*/
483*e1eccf28SAndroid Build Coastguard Worker.rodata
            // Jump table: signed 16-bit offsets of each tap label from label 100.
            // Entry 0 is -4, which is not a tap label (it is 4 bytes before 100);
            // x5 == 0 is presumably never passed in -- TODO confirm.
484*e1eccf28SAndroid Build Coastguard Worker   200:     .hword -4
485*e1eccf28SAndroid Build Coastguard Worker            .hword 101f-100f
486*e1eccf28SAndroid Build Coastguard Worker            .hword 102f-100f
487*e1eccf28SAndroid Build Coastguard Worker            .hword 103f-100f
488*e1eccf28SAndroid Build Coastguard Worker            .hword 104f-100f
489*e1eccf28SAndroid Build Coastguard Worker            .hword 105f-100f
490*e1eccf28SAndroid Build Coastguard Worker            .hword 106f-100f
491*e1eccf28SAndroid Build Coastguard Worker            .hword 107f-100f
492*e1eccf28SAndroid Build Coastguard Worker            .hword 108f-100f
493*e1eccf28SAndroid Build Coastguard Worker            .hword 109f-100f
494*e1eccf28SAndroid Build Coastguard Worker            .hword 110f-100f
495*e1eccf28SAndroid Build Coastguard Worker            .hword 111f-100f
496*e1eccf28SAndroid Build Coastguard Worker            .hword 112f-100f
497*e1eccf28SAndroid Build Coastguard Worker            .hword 113f-100f
498*e1eccf28SAndroid Build Coastguard Worker            .hword 114f-100f
499*e1eccf28SAndroid Build Coastguard Worker            .hword 115f-100f
500*e1eccf28SAndroid Build Coastguard Worker            .hword 116f-100f
501*e1eccf28SAndroid Build Coastguard Worker            .hword 117f-100f
502*e1eccf28SAndroid Build Coastguard Worker            .hword 118f-100f
503*e1eccf28SAndroid Build Coastguard Worker            .hword 119f-100f
504*e1eccf28SAndroid Build Coastguard Worker            .hword 120f-100f
505*e1eccf28SAndroid Build Coastguard Worker            .hword 121f-100f
506*e1eccf28SAndroid Build Coastguard Worker            .hword 122f-100f
507*e1eccf28SAndroid Build Coastguard Worker            .hword 123f-100f
508*e1eccf28SAndroid Build Coastguard Worker            .hword 124f-100f
509*e1eccf28SAndroid Build Coastguard Worker            .hword 125f-100f
510*e1eccf28SAndroid Build Coastguard Worker            .align 4
511*e1eccf28SAndroid Build Coastguard Worker.text
            // Centre tap: v12 = the 8 lanes starting at lane 7 of v6, and the
            // accumulators are initialised (not accumulated) with v12 * v0.h[0].
            // v14 holds the 32-bit sums for lanes 0-3, v15 for lanes 4-7.
512*e1eccf28SAndroid Build Coastguard Worker            ext         v12.16b, v6.16b, v7.16b, #7*2
513*e1eccf28SAndroid Build Coastguard Worker            umull       v14.4s, v12.4h, v0.h[0]
514*e1eccf28SAndroid Build Coastguard Worker            umull2      v15.4s, v12.8h, v0.h[0]
515*e1eccf28SAndroid Build Coastguard Worker
            // Computed branch: x12 = address of 100 + table[x5] (signed halfword
            // load, index scaled by 2), then jump into the tap chain.
516*e1eccf28SAndroid Build Coastguard Worker            adrp        x16, 200b
517*e1eccf28SAndroid Build Coastguard Worker            add         x16, x16, :lo12:200b
518*e1eccf28SAndroid Build Coastguard Worker            ldrsh       x12, [x16, x5, LSL #1]
519*e1eccf28SAndroid Build Coastguard Worker            adr         x16, 100f
520*e1eccf28SAndroid Build Coastguard Worker            add         x12, x12, x16
521*e1eccf28SAndroid Build Coastguard Worker    100:    br          x12
            // Tap chain, outermost tap first.  For tap k, v12 is the window k
            // lanes before the centre window and v13 the window k lanes after
            // it; both are multiplied by coefficient k and added into v14/v15.
            // Taps 25-24: the left window straddles v31 and v4.
522*e1eccf28SAndroid Build Coastguard Worker    125:    ext         v12.16b, v31.16b, v4.16b, #6*2
523*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v10.16b, v11.16b, #0*2
524*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v3.h[1]
525*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v3.h[1]
526*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v3.h[1]
527*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v3.h[1]
528*e1eccf28SAndroid Build Coastguard Worker    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
529*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #7*2
530*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v3.h[0]
531*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v3.h[0]
532*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v3.h[0]
533*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v3.h[0]
            // Taps 23-16: coefficients from v2; the left window comes from v4:v5.
534*e1eccf28SAndroid Build Coastguard Worker    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
535*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #6*2
536*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[7]
537*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[7]
538*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v2.h[7]
539*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v2.h[7]
540*e1eccf28SAndroid Build Coastguard Worker    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
541*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #5*2
542*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[6]
543*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[6]
544*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v2.h[6]
545*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v2.h[6]
546*e1eccf28SAndroid Build Coastguard Worker    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
547*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #4*2
548*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[5]
549*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[5]
550*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v2.h[5]
551*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v2.h[5]
552*e1eccf28SAndroid Build Coastguard Worker    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
553*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #3*2
554*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[4]
555*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[4]
556*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v2.h[4]
557*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v2.h[4]
558*e1eccf28SAndroid Build Coastguard Worker    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
559*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #2*2
560*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[3]
561*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[3]
562*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v2.h[3]
563*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v2.h[3]
564*e1eccf28SAndroid Build Coastguard Worker    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
565*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #1*2
566*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[2]
567*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[2]
568*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v2.h[2]
569*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v2.h[2]
570*e1eccf28SAndroid Build Coastguard Worker    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
571*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v9.16b, v10.16b, #0*2
572*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[1]
573*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[1]
574*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v2.h[1]
575*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v2.h[1]
576*e1eccf28SAndroid Build Coastguard Worker    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
577*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #7*2
578*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[0]
579*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[0]
580*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v2.h[0]
581*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v2.h[0]
            // Taps 15-8: coefficients from v1; the left window comes from v5:v6.
582*e1eccf28SAndroid Build Coastguard Worker    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
583*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #6*2
584*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[7]
585*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[7]
586*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[7]
587*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[7]
588*e1eccf28SAndroid Build Coastguard Worker    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
589*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #5*2
590*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[6]
591*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[6]
592*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[6]
593*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[6]
594*e1eccf28SAndroid Build Coastguard Worker    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
595*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #4*2
596*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[5]
597*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[5]
598*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[5]
599*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[5]
600*e1eccf28SAndroid Build Coastguard Worker    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
601*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #3*2
602*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[4]
603*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[4]
604*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[4]
605*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[4]
606*e1eccf28SAndroid Build Coastguard Worker    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
607*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #2*2
608*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[3]
609*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[3]
610*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[3]
611*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[3]
612*e1eccf28SAndroid Build Coastguard Worker    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
613*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #1*2
614*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[2]
615*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[2]
616*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[2]
617*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[2]
618*e1eccf28SAndroid Build Coastguard Worker    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
619*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v8.16b, v9.16b, #0*2
620*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[1]
621*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[1]
622*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[1]
623*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[1]
624*e1eccf28SAndroid Build Coastguard Worker    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
625*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v7.16b, v8.16b, #7*2
626*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v1.h[0]
627*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v1.h[0]
628*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v1.h[0]
629*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v1.h[0]
            // Taps 7-1: coefficients from v0; both windows come from v6:v7:v8.
630*e1eccf28SAndroid Build Coastguard Worker    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
631*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v7.16b, v8.16b, #6*2
632*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[7]
633*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[7]
634*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[7]
635*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[7]
636*e1eccf28SAndroid Build Coastguard Worker    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
637*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v7.16b, v8.16b, #5*2
638*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[6]
639*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[6]
640*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[6]
641*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[6]
642*e1eccf28SAndroid Build Coastguard Worker    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
643*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v7.16b, v8.16b, #4*2
644*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[5]
645*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[5]
646*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[5]
647*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[5]
648*e1eccf28SAndroid Build Coastguard Worker    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
649*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v7.16b, v8.16b, #3*2
650*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[4]
651*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[4]
652*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[4]
653*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[4]
654*e1eccf28SAndroid Build Coastguard Worker    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
655*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v7.16b, v8.16b, #2*2
656*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[3]
657*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[3]
658*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[3]
659*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[3]
660*e1eccf28SAndroid Build Coastguard Worker    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
661*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v7.16b, v8.16b, #1*2
662*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[2]
663*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[2]
664*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[2]
665*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[2]
666*e1eccf28SAndroid Build Coastguard Worker    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
667*e1eccf28SAndroid Build Coastguard Worker            ext         v13.16b, v7.16b, v8.16b, #0*2
668*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v0.h[1]
669*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v0.h[1]
670*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v13.4h, v0.h[1]
671*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v13.8h, v0.h[1]
672*e1eccf28SAndroid Build Coastguard Worker
            // Narrow with rounding and unsigned saturation: 32-bit sums to 16
            // bits (shift 16, both halves packed into v14), then 16 bits to 8
            // bits in v15 (shift FRACTION_BITS).
673*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v14.4h, v14.4s, #16
674*e1eccf28SAndroid Build Coastguard Worker            uqrshrn2    v14.8h, v15.4s, #16
675*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
676*e1eccf28SAndroid Build Coastguard Worker
            // Advance the sample window by one register (8 lanes).
677*e1eccf28SAndroid Build Coastguard Worker            mov         v31.16b, v4.16b
678*e1eccf28SAndroid Build Coastguard Worker            mov         v4.16b, v5.16b
679*e1eccf28SAndroid Build Coastguard Worker            mov         v5.16b, v6.16b
680*e1eccf28SAndroid Build Coastguard Worker            mov         v6.16b, v7.16b
681*e1eccf28SAndroid Build Coastguard Worker            mov         v7.16b, v8.16b
682*e1eccf28SAndroid Build Coastguard Worker            mov         v8.16b, v9.16b
683*e1eccf28SAndroid Build Coastguard Worker            mov         v9.16b, v10.16b
684*e1eccf28SAndroid Build Coastguard Worker            mov         v10.16b, v11.16b
685*e1eccf28SAndroid Build Coastguard Worker.endm/*}}}*/
686*e1eccf28SAndroid Build Coastguard Worker
/* Tap counts that have a dedicated hconv4_<n> macro (hconv4_6 and hconv4_12
 * follow below).  Presumably expanded by a dispatcher elsewhere in this file
 * -- TODO confirm where this list is consumed.
 */
687*e1eccf28SAndroid Build Coastguard Worker#define TUNED_LIST4 6, 12, 20
/* hconv4_6: one step of a symmetric horizontal convolution with up to 6 taps
 * on each side of the centre, where neighbouring taps are 4 lanes (half a
 * register) apart.  Presumably each group of 4 lanes is one pixel of four
 * interleaved 16-bit channels -- TODO confirm against the callers.
 *
 * Reads:
 *   x5        index into the jump table at 200; entry k enters the tap chain
 *             at label (100+k), so taps k down to 1 run by fall-through.
 *             Presumably the blur radius, 1..6 -- TODO confirm.
 *   v0        16-bit coefficients: v0.h[0] scales the centre, v0.h[k] tap k.
 *   v4-v11    window of 16-bit samples; consecutive registers are used as
 *             consecutive groups of 8 lanes.  The centre is v7: its low half
 *             feeds v14 and its high half feeds v15.
 * Writes:
 *   v15.8b    eight results narrowed to 8 bits.
 *   v14.8h    the same results at 16 bits, before the final narrowing.
 *   v4-v10    window advanced by one register.  v11 is not written here
 *             (presumably refilled by the caller -- TODO confirm).
 * Scratch: x12, x16.
 */
688*e1eccf28SAndroid Build Coastguard Worker.macro hconv4_6/*{{{*/
689*e1eccf28SAndroid Build Coastguard Worker.rodata
            // Jump table: signed 16-bit offsets of each tap label from label 100.
            // Entry 0 is -4, which is not a tap label (it is 4 bytes before 100);
            // x5 == 0 is presumably never passed in -- TODO confirm.
690*e1eccf28SAndroid Build Coastguard Worker   200:     .hword -4
691*e1eccf28SAndroid Build Coastguard Worker            .hword 101f-100f
692*e1eccf28SAndroid Build Coastguard Worker            .hword 102f-100f
693*e1eccf28SAndroid Build Coastguard Worker            .hword 103f-100f
694*e1eccf28SAndroid Build Coastguard Worker            .hword 104f-100f
695*e1eccf28SAndroid Build Coastguard Worker            .hword 105f-100f
696*e1eccf28SAndroid Build Coastguard Worker            .hword 106f-100f
697*e1eccf28SAndroid Build Coastguard Worker            .align      4
698*e1eccf28SAndroid Build Coastguard Worker.text
            // Centre tap: initialise the 32-bit accumulators with v7 * v0.h[0]
            // (v14 from the low 4 lanes, v15 from the high 4 lanes).
699*e1eccf28SAndroid Build Coastguard Worker            umull       v14.4s, v7.4h, v0.h[0]
700*e1eccf28SAndroid Build Coastguard Worker            umull2      v15.4s, v7.8h, v0.h[0]
701*e1eccf28SAndroid Build Coastguard Worker
            // Computed branch: x12 = address of 100 + table[x5] (signed halfword
            // load, index scaled by 2), then jump into the tap chain.
702*e1eccf28SAndroid Build Coastguard Worker            adrp        x16, 200b
703*e1eccf28SAndroid Build Coastguard Worker            add         x16, x16, :lo12:200b
704*e1eccf28SAndroid Build Coastguard Worker            ldrsh       x12, [x16, x5, LSL #1]
705*e1eccf28SAndroid Build Coastguard Worker            adr         x16, 100f
706*e1eccf28SAndroid Build Coastguard Worker            add         x12, x12, x16
707*e1eccf28SAndroid Build Coastguard Worker    100:    br          x12
            // Tap chain, outermost tap first.  Tap k adds the half-registers k
            // halves before and k halves after each centre half, scaled by
            // v0.h[k].  Even taps read whole registers (low half into v14, high
            // half into v15); odd taps straddle two registers (high half of one
            // into v14, low half of the next into v15).
708*e1eccf28SAndroid Build Coastguard Worker    106:    umlal       v14.4s, v4.4h,  v0.h[6]
709*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v4.8h,  v0.h[6]
710*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v10.4h, v0.h[6]
711*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v10.8h, v0.h[6]
712*e1eccf28SAndroid Build Coastguard Worker    105:    umlal2      v14.4s, v4.8h,  v0.h[5]
713*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v5.4h, v0.h[5]
714*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v9.8h, v0.h[5]
715*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v10.4h, v0.h[5]
716*e1eccf28SAndroid Build Coastguard Worker    104:    umlal       v14.4s, v5.4h, v0.h[4]
717*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v5.8h, v0.h[4]
718*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v9.4h, v0.h[4]
719*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v9.8h, v0.h[4]
720*e1eccf28SAndroid Build Coastguard Worker    103:    umlal2      v14.4s, v5.8h, v0.h[3]
721*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v6.4h, v0.h[3]
722*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v8.8h, v0.h[3]
723*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v9.4h, v0.h[3]
724*e1eccf28SAndroid Build Coastguard Worker    102:    umlal       v14.4s, v6.4h, v0.h[2]
725*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v6.8h, v0.h[2]
726*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v8.4h, v0.h[2]
727*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v8.8h, v0.h[2]
728*e1eccf28SAndroid Build Coastguard Worker    101:    umlal2      v14.4s, v6.8h, v0.h[1]
729*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v7.4h, v0.h[1]
730*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v7.8h, v0.h[1]
731*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v8.4h, v0.h[1]
732*e1eccf28SAndroid Build Coastguard Worker
            // Narrow with rounding and unsigned saturation: 32-bit sums to 16
            // bits (shift 16, both halves packed into v14), then 16 bits to 8
            // bits in v15 (shift FRACTION_BITS).
733*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v14.4h, v14.4s, #16
734*e1eccf28SAndroid Build Coastguard Worker            uqrshrn2    v14.8h, v15.4s, #16
735*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
736*e1eccf28SAndroid Build Coastguard Worker
            // Advance the sample window by one register (8 lanes).
737*e1eccf28SAndroid Build Coastguard Worker            mov         v4.16b, v5.16b
738*e1eccf28SAndroid Build Coastguard Worker            mov         v5.16b, v6.16b
739*e1eccf28SAndroid Build Coastguard Worker            mov         v6.16b, v7.16b
740*e1eccf28SAndroid Build Coastguard Worker            mov         v7.16b, v8.16b
741*e1eccf28SAndroid Build Coastguard Worker            mov         v8.16b, v9.16b
742*e1eccf28SAndroid Build Coastguard Worker            mov         v9.16b, v10.16b
743*e1eccf28SAndroid Build Coastguard Worker            mov         v10.16b, v11.16b
744*e1eccf28SAndroid Build Coastguard Worker.endm/*}}}*/
745*e1eccf28SAndroid Build Coastguard Worker
/* hconv4_12: one horizontal filter step over a window of 16-bit values held
 * in v26-v31 (left of centre), v4 (centre) and v5-v10 (right of centre).
 *
 * v14 accumulates the result for the low 64-bit half of v4 and v15 the result
 * for the high half.  For distance k the operands are the register halves k
 * positions to the left and to the right of each centre half, so even
 * distances use whole registers (umlal + umlal2 on the same register) and odd
 * distances straddle two adjacent registers.  Both neighbours at a given
 * distance are scaled by the same multiplier.
 *
 * NOTE(review): the name suggests four-channel data with a reach of up to 12
 * taps, i.e. one four-halfword pixel per register half; that layout is set up
 * outside this macro -- confirm against the caller.
 *
 * Inputs:
 *   x5       index into the halfword jump table at 200:.  Index n (1..12)
 *            enters the ladder at label 1nn and falls through to label 101.
 *            Entry 0 is -4, which resolves to the add instruction just before
 *            100:, so index 0 is presumably never used -- TODO confirm.
 *            (Presumably x5 is the blur radius; verify against the caller.)
 *   v0, v1   multipliers: v0.h[0..7] for distances 0..7, v1.h[0..4] for
 *            distances 8..12.
 *   v11      copied into v10 by the final slide; presumably the next block of
 *            input prepared by the caller.
 * Outputs:
 *   v15.8b   eight result bytes (v14.8h holds the 16-bit values they were
 *            narrowed from).
 * Clobbers:
 *   x12, x16, v14, v15; each of v26-v31 and v4-v10 is overwritten with the
 *   register to its right.
 */
746*e1eccf28SAndroid Build Coastguard Worker.macro hconv4_12/*{{{*/
747*e1eccf28SAndroid Build Coastguard Worker.rodata
748*e1eccf28SAndroid Build Coastguard Worker   200:     .hword -4 //Might need to remove these...
749*e1eccf28SAndroid Build Coastguard Worker            .hword 101f-100f
750*e1eccf28SAndroid Build Coastguard Worker            .hword 102f-100f
751*e1eccf28SAndroid Build Coastguard Worker            .hword 103f-100f
752*e1eccf28SAndroid Build Coastguard Worker            .hword 104f-100f
753*e1eccf28SAndroid Build Coastguard Worker            .hword 105f-100f
754*e1eccf28SAndroid Build Coastguard Worker            .hword 106f-100f
755*e1eccf28SAndroid Build Coastguard Worker            .hword 107f-100f
756*e1eccf28SAndroid Build Coastguard Worker            .hword 108f-100f
757*e1eccf28SAndroid Build Coastguard Worker            .hword 109f-100f
758*e1eccf28SAndroid Build Coastguard Worker            .hword 110f-100f
759*e1eccf28SAndroid Build Coastguard Worker            .hword 111f-100f
760*e1eccf28SAndroid Build Coastguard Worker            .hword 112f-100f
761*e1eccf28SAndroid Build Coastguard Worker            .align 4
762*e1eccf28SAndroid Build Coastguard Worker.text
763*e1eccf28SAndroid Build Coastguard Worker            umull       v14.4s, v4.4h, v0.h[0] // distance 0: low half of centre
764*e1eccf28SAndroid Build Coastguard Worker            umull2      v15.4s, v4.8h, v0.h[0] // distance 0: high half of centre
765*e1eccf28SAndroid Build Coastguard Worker
766*e1eccf28SAndroid Build Coastguard Worker            adrp        x16, 200b // x16 = page of the jump table
767*e1eccf28SAndroid Build Coastguard Worker            add         x16, x16, :lo12:200b // x16 = address of the jump table
768*e1eccf28SAndroid Build Coastguard Worker            ldrsh       x12, [x16, x5, LSL #1] // x12 = sign-extended halfword entry number x5
769*e1eccf28SAndroid Build Coastguard Worker            adr         x16, 100f // table entries are relative to label 100
770*e1eccf28SAndroid Build Coastguard Worker            add         x12, x12, x16 // x12 = absolute address of the chosen ladder entry
771*e1eccf28SAndroid Build Coastguard Worker    100:    br          x12
772*e1eccf28SAndroid Build Coastguard Worker    112:    umlal       v14.4s, v26.4h, v1.h[4] // distance 12
773*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v26.8h, v1.h[4]
774*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v10.4h, v1.h[4]
775*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v10.8h, v1.h[4]
776*e1eccf28SAndroid Build Coastguard Worker    111:    umlal2      v14.4s, v26.8h, v1.h[3] // distance 11
777*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v27.4h, v1.h[3]
778*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v9.8h, v1.h[3]
779*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v10.4h, v1.h[3]
780*e1eccf28SAndroid Build Coastguard Worker    110:    umlal       v14.4s, v27.4h, v1.h[2] // distance 10
781*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v27.8h, v1.h[2]
782*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v9.4h, v1.h[2]
783*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v9.8h, v1.h[2]
784*e1eccf28SAndroid Build Coastguard Worker    109:    umlal2      v14.4s, v27.8h, v1.h[1] // distance 9
785*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v28.4h, v1.h[1]
786*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v8.8h, v1.h[1]
787*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v9.4h, v1.h[1]
788*e1eccf28SAndroid Build Coastguard Worker    108:    umlal       v14.4s, v28.4h, v1.h[0] // distance 8
789*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v28.8h, v1.h[0]
790*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v8.4h, v1.h[0]
791*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v8.8h, v1.h[0]
792*e1eccf28SAndroid Build Coastguard Worker    107:    umlal2      v14.4s, v28.8h, v0.h[7] // distance 7
793*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v29.4h, v0.h[7]
794*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v7.8h, v0.h[7]
795*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v8.4h, v0.h[7]
796*e1eccf28SAndroid Build Coastguard Worker    106:    umlal       v14.4s, v29.4h, v0.h[6] // distance 6
797*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v29.8h, v0.h[6]
798*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v7.4h, v0.h[6]
799*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v7.8h, v0.h[6]
800*e1eccf28SAndroid Build Coastguard Worker    105:    umlal2      v14.4s, v29.8h, v0.h[5] // distance 5
801*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v30.4h, v0.h[5]
802*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v6.8h, v0.h[5]
803*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v7.4h, v0.h[5]
804*e1eccf28SAndroid Build Coastguard Worker    104:    umlal       v14.4s, v30.4h, v0.h[4] // distance 4
805*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v30.8h, v0.h[4]
806*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v6.4h, v0.h[4]
807*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v6.8h, v0.h[4]
808*e1eccf28SAndroid Build Coastguard Worker    103:    umlal2      v14.4s, v30.8h, v0.h[3] // distance 3
809*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v31.4h, v0.h[3]
810*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v5.8h, v0.h[3]
811*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v6.4h, v0.h[3]
812*e1eccf28SAndroid Build Coastguard Worker    102:    umlal       v14.4s, v31.4h, v0.h[2] // distance 2
813*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v31.8h, v0.h[2]
814*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v5.4h, v0.h[2]
815*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v5.8h, v0.h[2]
816*e1eccf28SAndroid Build Coastguard Worker    101:    umlal2      v14.4s, v31.8h, v0.h[1] // distance 1
817*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v4.4h,  v0.h[1]
818*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v4.8h,  v0.h[1]
819*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v5.4h, v0.h[1]
820*e1eccf28SAndroid Build Coastguard Worker
821*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v14.4h, v14.4s, #16 // round, saturate and narrow the 32-bit sums to 16 bits
822*e1eccf28SAndroid Build Coastguard Worker            uqrshrn2    v14.8h, v15.4s, #16 // same for v15, into the high half of v14
823*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS // drop the fractional bits: eight result bytes
824*e1eccf28SAndroid Build Coastguard Worker
825*e1eccf28SAndroid Build Coastguard Worker            mov         v26.16b, v27.16b // slide the window one register to the left
826*e1eccf28SAndroid Build Coastguard Worker            mov         v27.16b, v28.16b
827*e1eccf28SAndroid Build Coastguard Worker            mov         v28.16b, v29.16b
828*e1eccf28SAndroid Build Coastguard Worker            mov         v29.16b, v30.16b
829*e1eccf28SAndroid Build Coastguard Worker            mov         v30.16b, v31.16b
830*e1eccf28SAndroid Build Coastguard Worker            mov         v31.16b, v4.16b
831*e1eccf28SAndroid Build Coastguard Worker            mov         v4.16b, v5.16b
832*e1eccf28SAndroid Build Coastguard Worker            mov         v5.16b, v6.16b
833*e1eccf28SAndroid Build Coastguard Worker            mov         v6.16b, v7.16b
834*e1eccf28SAndroid Build Coastguard Worker            mov         v7.16b, v8.16b
835*e1eccf28SAndroid Build Coastguard Worker            mov         v8.16b, v9.16b
836*e1eccf28SAndroid Build Coastguard Worker            mov         v9.16b, v10.16b
837*e1eccf28SAndroid Build Coastguard Worker            mov         v10.16b, v11.16b // v11 is not written here
838*e1eccf28SAndroid Build Coastguard Worker.endm/*}}}*/
839*e1eccf28SAndroid Build Coastguard Worker
/* hconv4_20: one horizontal filter step over a window of 16-bit values held
 * in v18-v27 (left of centre), v28 (centre) and v29-v31 followed by v4-v10
 * (right of centre).
 *
 * v14 accumulates the result for the low 64-bit half of v28 and v15 the
 * result for the high half.  For distance k the operands are the register
 * halves k positions to the left and to the right of each centre half, so
 * even distances use whole registers (umlal + umlal2 on the same register)
 * and odd distances straddle two adjacent registers.  Both neighbours at a
 * given distance are scaled by the same multiplier.
 *
 * NOTE(review): the name suggests four-channel data with a reach of up to 20
 * taps, i.e. one four-halfword pixel per register half; that layout is set up
 * outside this macro -- confirm against the caller.
 *
 * Inputs:
 *   x5       index into the halfword jump table at 200:.  Index n (1..20)
 *            enters the ladder at label 1nn and falls through to label 101.
 *            Entry 0 is -4, which resolves to the add instruction just before
 *            100:, so index 0 is presumably never used -- TODO confirm.
 *            (Presumably x5 is the blur radius; verify against the caller.)
 *   v0-v2    multipliers: v0.h[0..7] for distances 0..7, v1.h[0..7] for
 *            distances 8..15, v2.h[0..4] for distances 16..20.
 *   v11      copied into v10 by the final slide; presumably the next block of
 *            input prepared by the caller.
 * Outputs:
 *   v15.8b   eight result bytes (v14.8h holds the 16-bit values they were
 *            narrowed from).
 * Clobbers:
 *   x12, x16, v14, v15; each of v18-v31 and v4-v10 is overwritten with the
 *   register to its right (v31 takes v4).
 */
840*e1eccf28SAndroid Build Coastguard Worker.macro hconv4_20/*{{{*/
841*e1eccf28SAndroid Build Coastguard Worker.rodata
842*e1eccf28SAndroid Build Coastguard Worker   200:     .hword -4
843*e1eccf28SAndroid Build Coastguard Worker            .hword 101f-100f
844*e1eccf28SAndroid Build Coastguard Worker            .hword 102f-100f
845*e1eccf28SAndroid Build Coastguard Worker            .hword 103f-100f
846*e1eccf28SAndroid Build Coastguard Worker            .hword 104f-100f
847*e1eccf28SAndroid Build Coastguard Worker            .hword 105f-100f
848*e1eccf28SAndroid Build Coastguard Worker            .hword 106f-100f
849*e1eccf28SAndroid Build Coastguard Worker            .hword 107f-100f
850*e1eccf28SAndroid Build Coastguard Worker            .hword 108f-100f
851*e1eccf28SAndroid Build Coastguard Worker            .hword 109f-100f
852*e1eccf28SAndroid Build Coastguard Worker            .hword 110f-100f
853*e1eccf28SAndroid Build Coastguard Worker            .hword 111f-100f
854*e1eccf28SAndroid Build Coastguard Worker            .hword 112f-100f
855*e1eccf28SAndroid Build Coastguard Worker            .hword 113f-100f
856*e1eccf28SAndroid Build Coastguard Worker            .hword 114f-100f
857*e1eccf28SAndroid Build Coastguard Worker            .hword 115f-100f
858*e1eccf28SAndroid Build Coastguard Worker            .hword 116f-100f
859*e1eccf28SAndroid Build Coastguard Worker            .hword 117f-100f
860*e1eccf28SAndroid Build Coastguard Worker            .hword 118f-100f
861*e1eccf28SAndroid Build Coastguard Worker            .hword 119f-100f
862*e1eccf28SAndroid Build Coastguard Worker            .hword 120f-100f
863*e1eccf28SAndroid Build Coastguard Worker            .align 4
864*e1eccf28SAndroid Build Coastguard Worker.text
865*e1eccf28SAndroid Build Coastguard Worker            umull       v14.4s, v28.4h, v0.h[0] // distance 0: low half of centre
866*e1eccf28SAndroid Build Coastguard Worker            umull2      v15.4s, v28.8h, v0.h[0] // distance 0: high half of centre
867*e1eccf28SAndroid Build Coastguard Worker
868*e1eccf28SAndroid Build Coastguard Worker            adrp        x16, 200b // x16 = page of the jump table
869*e1eccf28SAndroid Build Coastguard Worker            add         x16, x16, :lo12:200b // x16 = address of the jump table
870*e1eccf28SAndroid Build Coastguard Worker            ldrsh       x12, [x16, x5, LSL #1] // x12 = sign-extended halfword entry number x5
871*e1eccf28SAndroid Build Coastguard Worker            adr         x16, 100f // table entries are relative to label 100
872*e1eccf28SAndroid Build Coastguard Worker            add         x12, x12, x16 // x12 = absolute address of the chosen ladder entry
873*e1eccf28SAndroid Build Coastguard Worker    100:    br          x12
874*e1eccf28SAndroid Build Coastguard Worker    120:    umlal       v14.4s, v18.4h, v2.h[4] // distance 20
875*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v18.8h, v2.h[4]
876*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v10.4h, v2.h[4]
877*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v10.8h, v2.h[4]
878*e1eccf28SAndroid Build Coastguard Worker    119:    umlal2      v14.4s, v18.8h, v2.h[3] // distance 19
879*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v19.4h, v2.h[3]
880*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v9.8h,  v2.h[3]
881*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v10.4h, v2.h[3]
882*e1eccf28SAndroid Build Coastguard Worker    118:    umlal       v14.4s, v19.4h, v2.h[2] // distance 18
883*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v19.8h, v2.h[2]
884*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v9.4h,  v2.h[2]
885*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v9.8h,  v2.h[2]
886*e1eccf28SAndroid Build Coastguard Worker    117:    umlal2      v14.4s, v19.8h, v2.h[1] // distance 17
887*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v20.4h, v2.h[1]
888*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v8.8h,  v2.h[1]
889*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v9.4h,  v2.h[1]
890*e1eccf28SAndroid Build Coastguard Worker    116:    umlal       v14.4s, v20.4h, v2.h[0] // distance 16
891*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v20.8h, v2.h[0]
892*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v8.4h,  v2.h[0]
893*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v8.8h,  v2.h[0]
894*e1eccf28SAndroid Build Coastguard Worker    115:    umlal2      v14.4s, v20.8h, v1.h[7] // distance 15
895*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v21.4h, v1.h[7]
896*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v7.8h,  v1.h[7]
897*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v8.4h,  v1.h[7]
898*e1eccf28SAndroid Build Coastguard Worker    114:    umlal       v14.4s, v21.4h, v1.h[6] // distance 14
899*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v21.8h, v1.h[6]
900*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v7.4h,  v1.h[6]
901*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v7.8h,  v1.h[6]
902*e1eccf28SAndroid Build Coastguard Worker    113:    umlal2      v14.4s, v21.8h, v1.h[5] // distance 13
903*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v22.4h, v1.h[5]
904*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v6.8h,  v1.h[5]
905*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v7.4h,  v1.h[5]
906*e1eccf28SAndroid Build Coastguard Worker    112:    umlal       v14.4s, v22.4h, v1.h[4] // distance 12
907*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v22.8h, v1.h[4]
908*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v6.4h,  v1.h[4]
909*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v6.8h,  v1.h[4]
910*e1eccf28SAndroid Build Coastguard Worker    111:    umlal2      v14.4s, v22.8h, v1.h[3] // distance 11
911*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v23.4h, v1.h[3]
912*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v5.8h,  v1.h[3]
913*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v6.4h,  v1.h[3]
914*e1eccf28SAndroid Build Coastguard Worker    110:    umlal       v14.4s, v23.4h, v1.h[2] // distance 10
915*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v23.8h, v1.h[2]
916*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v5.4h,  v1.h[2]
917*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v5.8h,  v1.h[2]
918*e1eccf28SAndroid Build Coastguard Worker    109:    umlal2      v14.4s, v23.8h, v1.h[1] // distance 9
919*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v24.4h, v1.h[1]
920*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v4.8h,  v1.h[1]
921*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v5.4h,  v1.h[1]
922*e1eccf28SAndroid Build Coastguard Worker    108:    umlal       v14.4s, v24.4h, v1.h[0] // distance 8
923*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v24.8h, v1.h[0]
924*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v4.4h,  v1.h[0]
925*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v4.8h,  v1.h[0]
926*e1eccf28SAndroid Build Coastguard Worker    107:    umlal2      v14.4s, v24.8h, v0.h[7] // distance 7
927*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v25.4h, v0.h[7]
928*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v31.8h, v0.h[7]
929*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v4.4h,  v0.h[7]
930*e1eccf28SAndroid Build Coastguard Worker    106:    umlal       v14.4s, v25.4h, v0.h[6] // distance 6
931*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v25.8h, v0.h[6]
932*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v31.4h, v0.h[6]
933*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v31.8h, v0.h[6]
934*e1eccf28SAndroid Build Coastguard Worker    105:    umlal2      v14.4s, v25.8h, v0.h[5] // distance 5
935*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v26.4h, v0.h[5]
936*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v30.8h, v0.h[5]
937*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v31.4h, v0.h[5]
938*e1eccf28SAndroid Build Coastguard Worker    104:    umlal       v14.4s, v26.4h, v0.h[4] // distance 4
939*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v26.8h, v0.h[4]
940*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v30.4h, v0.h[4]
941*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v30.8h, v0.h[4]
942*e1eccf28SAndroid Build Coastguard Worker    103:    umlal2      v14.4s, v26.8h, v0.h[3] // distance 3
943*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v27.4h, v0.h[3]
944*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v29.8h, v0.h[3]
945*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v30.4h, v0.h[3]
946*e1eccf28SAndroid Build Coastguard Worker    102:    umlal       v14.4s, v27.4h, v0.h[2] // distance 2
947*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v27.8h, v0.h[2]
948*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v29.4h, v0.h[2]
949*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v29.8h, v0.h[2]
950*e1eccf28SAndroid Build Coastguard Worker    101:    umlal2      v14.4s, v27.8h, v0.h[1] // distance 1
951*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v28.4h, v0.h[1]
952*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v28.8h, v0.h[1]
953*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v29.4h, v0.h[1]
954*e1eccf28SAndroid Build Coastguard Worker
955*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v14.4h, v14.4s, #16 // round, saturate and narrow the 32-bit sums to 16 bits
956*e1eccf28SAndroid Build Coastguard Worker            uqrshrn2    v14.8h, v15.4s, #16 // same for v15, into the high half of v14
957*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS // drop the fractional bits: eight result bytes
958*e1eccf28SAndroid Build Coastguard Worker
959*e1eccf28SAndroid Build Coastguard Worker            mov         v18.16b, v19.16b // slide the window one register to the left
960*e1eccf28SAndroid Build Coastguard Worker            mov         v19.16b, v20.16b
961*e1eccf28SAndroid Build Coastguard Worker            mov         v20.16b, v21.16b
962*e1eccf28SAndroid Build Coastguard Worker            mov         v21.16b, v22.16b
963*e1eccf28SAndroid Build Coastguard Worker            mov         v22.16b, v23.16b
964*e1eccf28SAndroid Build Coastguard Worker            mov         v23.16b, v24.16b
965*e1eccf28SAndroid Build Coastguard Worker            mov         v24.16b, v25.16b
966*e1eccf28SAndroid Build Coastguard Worker            mov         v25.16b, v26.16b
967*e1eccf28SAndroid Build Coastguard Worker            mov         v26.16b, v27.16b
968*e1eccf28SAndroid Build Coastguard Worker            mov         v27.16b, v28.16b
969*e1eccf28SAndroid Build Coastguard Worker            mov         v28.16b, v29.16b
970*e1eccf28SAndroid Build Coastguard Worker            mov         v29.16b, v30.16b
971*e1eccf28SAndroid Build Coastguard Worker            mov         v30.16b, v31.16b
972*e1eccf28SAndroid Build Coastguard Worker            mov         v31.16b, v4.16b // the window continues from v31 into v4
973*e1eccf28SAndroid Build Coastguard Worker            mov         v4.16b, v5.16b
974*e1eccf28SAndroid Build Coastguard Worker            mov         v5.16b, v6.16b
975*e1eccf28SAndroid Build Coastguard Worker            mov         v6.16b, v7.16b
976*e1eccf28SAndroid Build Coastguard Worker            mov         v7.16b, v8.16b
977*e1eccf28SAndroid Build Coastguard Worker            mov         v8.16b, v9.16b
978*e1eccf28SAndroid Build Coastguard Worker            mov         v9.16b, v10.16b
979*e1eccf28SAndroid Build Coastguard Worker            mov         v10.16b, v11.16b // v11 is not written here
980*e1eccf28SAndroid Build Coastguard Worker.endm/*}}}*/
981*e1eccf28SAndroid Build Coastguard Worker
982*e1eccf28SAndroid Build Coastguard Worker.macro hconv4_25/*{{{*/
983*e1eccf28SAndroid Build Coastguard Worker.rodata
984*e1eccf28SAndroid Build Coastguard Worker   200:     .hword -4
985*e1eccf28SAndroid Build Coastguard Worker            .hword 101f-100f
986*e1eccf28SAndroid Build Coastguard Worker            .hword 102f-100f
987*e1eccf28SAndroid Build Coastguard Worker            .hword 103f-100f
988*e1eccf28SAndroid Build Coastguard Worker            .hword 104f-100f
989*e1eccf28SAndroid Build Coastguard Worker            .hword 105f-100f
990*e1eccf28SAndroid Build Coastguard Worker            .hword 106f-100f
991*e1eccf28SAndroid Build Coastguard Worker            .hword 107f-100f
992*e1eccf28SAndroid Build Coastguard Worker            .hword 108f-100f
993*e1eccf28SAndroid Build Coastguard Worker            .hword 109f-100f
994*e1eccf28SAndroid Build Coastguard Worker            .hword 110f-100f
995*e1eccf28SAndroid Build Coastguard Worker            .hword 111f-100f
996*e1eccf28SAndroid Build Coastguard Worker            .hword 112f-100f
997*e1eccf28SAndroid Build Coastguard Worker            .hword 113f-100f
998*e1eccf28SAndroid Build Coastguard Worker            .hword 114f-100f
999*e1eccf28SAndroid Build Coastguard Worker            .hword 115f-100f
1000*e1eccf28SAndroid Build Coastguard Worker            .hword 116f-100f
1001*e1eccf28SAndroid Build Coastguard Worker            .hword 117f-100f
1002*e1eccf28SAndroid Build Coastguard Worker            .hword 118f-100f
1003*e1eccf28SAndroid Build Coastguard Worker            .hword 119f-100f
1004*e1eccf28SAndroid Build Coastguard Worker            .hword 120f-100f
1005*e1eccf28SAndroid Build Coastguard Worker            .hword 121f-100f
1006*e1eccf28SAndroid Build Coastguard Worker            .hword 122f-100f
1007*e1eccf28SAndroid Build Coastguard Worker            .hword 123f-100f
1008*e1eccf28SAndroid Build Coastguard Worker            .hword 124f-100f
1009*e1eccf28SAndroid Build Coastguard Worker            .hword 125f-100f
1010*e1eccf28SAndroid Build Coastguard Worker            .align 4
1011*e1eccf28SAndroid Build Coastguard Worker.text
1012*e1eccf28SAndroid Build Coastguard Worker            umull2      v14.4s, v25.8h, v0.h[0]
1013*e1eccf28SAndroid Build Coastguard Worker            umull       v15.4s, v26.4h, v0.h[0]
1014*e1eccf28SAndroid Build Coastguard Worker
1015*e1eccf28SAndroid Build Coastguard Worker            adrp        x16, 200b
1016*e1eccf28SAndroid Build Coastguard Worker            add         x16, x16, :lo12:200b
1017*e1eccf28SAndroid Build Coastguard Worker            ldrsh       x12, [x16, x5, LSL #1]
1018*e1eccf28SAndroid Build Coastguard Worker            adr         x16, 100f
1019*e1eccf28SAndroid Build Coastguard Worker            add         x12, x12, x16
1020*e1eccf28SAndroid Build Coastguard Worker    100:    br          x12
1021*e1eccf28SAndroid Build Coastguard Worker    125:    ld1         {v12.8h}, [x9]
1022*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v3.h[1]
1023*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v3.h[1]
1024*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v10.4h, v3.h[1]
1025*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v10.8h, v3.h[1]
1026*e1eccf28SAndroid Build Coastguard Worker    124:    add         x12, x9, #0x08
1027*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1028*e1eccf28SAndroid Build Coastguard Worker            ld1         {v12.4h}, [x12], #8
1029*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1030*e1eccf28SAndroid Build Coastguard Worker            ld1         {v13.4h}, [x12]
1031*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v3.h[0]
1032*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v13.4h, v3.h[0]
1033*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v9.8h,  v3.h[0]
1034*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v10.4h, v3.h[0]
1035*e1eccf28SAndroid Build Coastguard Worker    123:    add         x12, x9, #0x10
1036*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1037*e1eccf28SAndroid Build Coastguard Worker            ld1         {v12.8h}, [x12]
1038*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[7]
1039*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[7]
1040*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v9.4h,  v2.h[7]
1041*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v9.8h,  v2.h[7]
1042*e1eccf28SAndroid Build Coastguard Worker    122:    add         x12, x9, #0x18
1043*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1044*e1eccf28SAndroid Build Coastguard Worker            ld1         {v12.4h}, [x12], #8
1045*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1046*e1eccf28SAndroid Build Coastguard Worker            ld1         {v13.4h}, [x12]
1047*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[6]
1048*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v13.4h, v2.h[6]
1049*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v8.8h,  v2.h[6]
1050*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v9.4h,  v2.h[6]
1051*e1eccf28SAndroid Build Coastguard Worker    121:    add         x12, x9, #0x20
1052*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1053*e1eccf28SAndroid Build Coastguard Worker            ld1         {v12.8h}, [x12]
1054*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[5]
1055*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[5]
1056*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v8.4h,  v2.h[5]
1057*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v8.8h,  v2.h[5]
1058*e1eccf28SAndroid Build Coastguard Worker    120:    add         x12, x9, #0x28
1059*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1060*e1eccf28SAndroid Build Coastguard Worker            ld1         {v12.4h}, [x12], #8
1061*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1062*e1eccf28SAndroid Build Coastguard Worker            ld1         {v13.4h}, [x12]
1063*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[4]
1064*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v13.4h, v2.h[4]
1065*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v7.8h,  v2.h[4]
1066*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v8.4h,  v2.h[4]
1067*e1eccf28SAndroid Build Coastguard Worker    119:    add         x12, x9, #0x30
1068*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1069*e1eccf28SAndroid Build Coastguard Worker            ld1         {v12.8h}, [x12]
1070*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[3]
1071*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v12.8h, v2.h[3]
1072*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v7.4h,  v2.h[3]
1073*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v7.8h,  v2.h[3]
1074*e1eccf28SAndroid Build Coastguard Worker    118:    add         x12, x9, #0x38
1075*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #0x40
1076*e1eccf28SAndroid Build Coastguard Worker            ld1         {v12.4h}, [x12]
1077*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v12.4h, v2.h[2]
1078*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v17.4h, v2.h[2]
1079*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v6.8h,  v2.h[2]
1080*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v7.4h,  v2.h[2]
1081*e1eccf28SAndroid Build Coastguard Worker    117:    umlal       v14.4s, v17.4h, v2.h[1]
1082*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v17.8h, v2.h[1]
1083*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v6.4h,  v2.h[1]
1084*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v6.8h,  v2.h[1]
1085*e1eccf28SAndroid Build Coastguard Worker    116:    umlal2      v14.4s, v17.8h, v2.h[0]
1086*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v18.4h, v2.h[0]
1087*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v5.8h,  v2.h[0]
1088*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v6.4h,  v2.h[0]
1089*e1eccf28SAndroid Build Coastguard Worker    115:    umlal       v14.4s, v18.4h, v1.h[7]
1090*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v18.8h, v1.h[7]
1091*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v5.4h,  v1.h[7]
1092*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v5.8h,  v1.h[7]
1093*e1eccf28SAndroid Build Coastguard Worker    114:    umlal2      v14.4s, v18.8h, v1.h[6]
1094*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v19.4h, v1.h[6]
1095*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v4.8h,  v1.h[6]
1096*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v5.4h,  v1.h[6]
1097*e1eccf28SAndroid Build Coastguard Worker    113:    umlal       v14.4s, v19.4h, v1.h[5]
1098*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v19.8h, v1.h[5]
1099*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v4.4h,  v1.h[5]
1100*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v4.8h,  v1.h[5]
1101*e1eccf28SAndroid Build Coastguard Worker    112:    umlal2      v14.4s, v19.8h, v1.h[4]
1102*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v20.4h, v1.h[4]
1103*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v31.8h, v1.h[4]
1104*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v4.4h,  v1.h[4]
1105*e1eccf28SAndroid Build Coastguard Worker    111:    umlal       v14.4s, v20.4h, v1.h[3]
1106*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v20.8h, v1.h[3]
1107*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v31.4h, v1.h[3]
1108*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v31.8h, v1.h[3]
1109*e1eccf28SAndroid Build Coastguard Worker    110:    umlal2      v14.4s, v20.8h, v1.h[2]
1110*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v21.4h, v1.h[2]
1111*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v30.8h, v1.h[2]
1112*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v31.4h, v1.h[2]
1113*e1eccf28SAndroid Build Coastguard Worker    109:    umlal       v14.4s, v21.4h, v1.h[1]
1114*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v21.8h, v1.h[1]
1115*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v30.4h, v1.h[1]
1116*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v30.8h, v1.h[1]
1117*e1eccf28SAndroid Build Coastguard Worker    108:    umlal2      v14.4s, v21.8h, v1.h[0]
1118*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v22.4h, v1.h[0]
1119*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v29.8h, v1.h[0]
1120*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v30.4h, v1.h[0]
1121*e1eccf28SAndroid Build Coastguard Worker    107:    umlal       v14.4s, v22.4h, v0.h[7]
1122*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v22.8h, v0.h[7]
1123*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v29.4h, v0.h[7]
1124*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v29.8h, v0.h[7]
1125*e1eccf28SAndroid Build Coastguard Worker    106:    umlal2      v14.4s, v22.8h, v0.h[6]
1126*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v23.4h, v0.h[6]
1127*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v28.8h, v0.h[6]
1128*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v29.4h, v0.h[6]
1129*e1eccf28SAndroid Build Coastguard Worker    105:    umlal       v14.4s, v23.4h, v0.h[5]
1130*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v23.8h, v0.h[5]
1131*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v28.4h, v0.h[5]
1132*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v28.8h, v0.h[5]
1133*e1eccf28SAndroid Build Coastguard Worker    104:    umlal2      v14.4s, v23.8h, v0.h[4]
1134*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v24.4h, v0.h[4]
1135*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v27.8h, v0.h[4]
1136*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v28.4h, v0.h[4]
1137*e1eccf28SAndroid Build Coastguard Worker    103:    umlal       v14.4s, v24.4h, v0.h[3]
1138*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v24.8h, v0.h[3]
1139*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v27.4h, v0.h[3]
1140*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v27.8h, v0.h[3]
1141*e1eccf28SAndroid Build Coastguard Worker    102:    umlal2      v14.4s, v24.8h, v0.h[2]
1142*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v25.4h, v0.h[2]
1143*e1eccf28SAndroid Build Coastguard Worker            umlal2      v14.4s, v26.8h, v0.h[2]
1144*e1eccf28SAndroid Build Coastguard Worker            umlal       v15.4s, v27.4h, v0.h[2]
1145*e1eccf28SAndroid Build Coastguard Worker    101:    umlal       v14.4s, v25.4h, v0.h[1]
1146*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v25.8h, v0.h[1]
1147*e1eccf28SAndroid Build Coastguard Worker            umlal       v14.4s, v26.4h, v0.h[1]
1148*e1eccf28SAndroid Build Coastguard Worker            umlal2      v15.4s, v26.8h, v0.h[1]
1149*e1eccf28SAndroid Build Coastguard Worker
1150*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v14.4h, v14.4s, #16
1151*e1eccf28SAndroid Build Coastguard Worker            uqrshrn2    v14.8h, v15.4s, #16
1152*e1eccf28SAndroid Build Coastguard Worker            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
1153*e1eccf28SAndroid Build Coastguard Worker
1154*e1eccf28SAndroid Build Coastguard Worker            st1         {v17.16b}, [x9], #16
1155*e1eccf28SAndroid Build Coastguard Worker            bic         x9, x9, #0x40
1156*e1eccf28SAndroid Build Coastguard Worker            mov         v17.16b, v18.16b
1157*e1eccf28SAndroid Build Coastguard Worker            mov         v18.16b, v19.16b
1158*e1eccf28SAndroid Build Coastguard Worker            mov         v19.16b, v20.16b
1159*e1eccf28SAndroid Build Coastguard Worker            mov         v20.16b, v21.16b
1160*e1eccf28SAndroid Build Coastguard Worker            mov         v21.16b, v22.16b
1161*e1eccf28SAndroid Build Coastguard Worker            mov         v22.16b, v23.16b
1162*e1eccf28SAndroid Build Coastguard Worker            mov         v23.16b, v24.16b
1163*e1eccf28SAndroid Build Coastguard Worker            mov         v24.16b, v25.16b
1164*e1eccf28SAndroid Build Coastguard Worker            mov         v25.16b, v26.16b
1165*e1eccf28SAndroid Build Coastguard Worker            mov         v26.16b, v27.16b
1166*e1eccf28SAndroid Build Coastguard Worker            mov         v27.16b, v28.16b
1167*e1eccf28SAndroid Build Coastguard Worker            mov         v28.16b, v29.16b
1168*e1eccf28SAndroid Build Coastguard Worker            mov         v29.16b, v30.16b
1169*e1eccf28SAndroid Build Coastguard Worker            mov         v30.16b, v31.16b
1170*e1eccf28SAndroid Build Coastguard Worker            mov         v31.16b, v4.16b
1171*e1eccf28SAndroid Build Coastguard Worker            mov         v4.16b, v5.16b
1172*e1eccf28SAndroid Build Coastguard Worker            mov         v5.16b, v6.16b
1173*e1eccf28SAndroid Build Coastguard Worker            mov         v6.16b, v7.16b
1174*e1eccf28SAndroid Build Coastguard Worker            mov         v7.16b, v8.16b
1175*e1eccf28SAndroid Build Coastguard Worker            mov         v8.16b, v9.16b
1176*e1eccf28SAndroid Build Coastguard Worker            mov         v9.16b, v10.16b
1177*e1eccf28SAndroid Build Coastguard Worker            mov         v10.16b, v11.16b
1178*e1eccf28SAndroid Build Coastguard Worker.endm/*}}}*/
1179*e1eccf28SAndroid Build Coastguard Worker
1180*e1eccf28SAndroid Build Coastguard Worker/* Dedicated function wrapper for the fetch macro, for the cases where
1181*e1eccf28SAndroid Build Coastguard Worker * performance isn't that important, to keep code size down.
1182*e1eccf28SAndroid Build Coastguard Worker */
1183*e1eccf28SAndroid Build Coastguard WorkerPRIVATE(fetch_generic_asm)
1184*e1eccf28SAndroid Build Coastguard Worker            stp         x10, x11, [sp, #-16]!   /* keep x10/x11 intact across the expansion below */
1185*e1eccf28SAndroid Build Coastguard Worker            fetch                               /* single out-of-line copy of the fetch macro (defined outside this excerpt) */
1186*e1eccf28SAndroid Build Coastguard Worker            ldp         x10, x11, [sp], #16     /* restore x10/x11 and release the 16-byte slot */
1187*e1eccf28SAndroid Build Coastguard Worker            ret
1188*e1eccf28SAndroid Build Coastguard WorkerEND(fetch_generic_asm)
1189*e1eccf28SAndroid Build Coastguard Worker
1190*e1eccf28SAndroid Build Coastguard Worker
1191*e1eccf28SAndroid Build Coastguard Worker/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory
1192*e1eccf28SAndroid Build Coastguard Worker * beyond that limit, and filling the front of the vector with the first legal
1193*e1eccf28SAndroid Build Coastguard Worker * pixel.
1194*e1eccf28SAndroid Build Coastguard Worker * Result is in v10 and v11.  v8 and v9 are filled with the first legal pixel.
1195*e1eccf28SAndroid Build Coastguard Worker * Note: This function can read beyond the right edge of input if the image is
1196*e1eccf28SAndroid Build Coastguard Worker * narrower than 16 bytes.
1197*e1eccf28SAndroid Build Coastguard Worker */
1198*e1eccf28SAndroid Build Coastguard WorkerPRIVATE(fetch_clampleft1)
1199*e1eccf28SAndroid Build Coastguard Worker            stp         x29, x30, [sp, #-16]!   /* save FP/LR: a call follows */
1200*e1eccf28SAndroid Build Coastguard Worker            bl          fetch_generic_asm       /* fetched chunk arrives in v10,v11 */
1201*e1eccf28SAndroid Build Coastguard Worker            dup         v8.8h, v10.h[0]         /* v8 = first fetched lane in all eight lanes */
1202*e1eccf28SAndroid Build Coastguard Worker            dup         v9.8h, v10.h[0]         /* v9 = same padding value */
1203*e1eccf28SAndroid Build Coastguard Worker            ands        x12, x10, #15           /* x12 = number of leading padding lanes; Z set when none */
1204*e1eccf28SAndroid Build Coastguard Worker            beq         1f                      /* x10 is a multiple of 16: nothing to shift */
1205*e1eccf28SAndroid Build Coastguard Worker            sub         x1, x1, x12             /* only 16 - x12 columns are kept, so rewind the row pointers */
1206*e1eccf28SAndroid Build Coastguard Worker            sub         x15, x15, x12
1207*e1eccf28SAndroid Build Coastguard Worker            sub         x19, x19, x12
1208*e1eccf28SAndroid Build Coastguard Worker            sub         x10, x10, x12           /* x10 rounded down to a multiple of 16 */
1209*e1eccf28SAndroid Build Coastguard Worker            sub         x12, sp, x12, LSL #1    /* x12 = sp - 2 bytes per padding lane */
1210*e1eccf28SAndroid Build Coastguard Worker            sub         sp, sp, #64             /* reserve a 64-byte scratch area */
1211*e1eccf28SAndroid Build Coastguard Worker            sub         x12, x12, #32           /* x12 = scratch + 32 - 2 * padding lanes */
1212*e1eccf28SAndroid Build Coastguard Worker            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]     /* scratch = padding (v8,v9) then data (v10,v11) */
1213*e1eccf28SAndroid Build Coastguard Worker            ld1         {v10.8h,v11.8h}, [x12]  /* reload: padding lanes first, then the leading data lanes */
1214*e1eccf28SAndroid Build Coastguard Worker            add         sp, sp, #64             /* release the scratch area */
1215*e1eccf28SAndroid Build Coastguard Worker1:          ldp         x29, x30, [sp], #16
1216*e1eccf28SAndroid Build Coastguard Worker            ret
1217*e1eccf28SAndroid Build Coastguard WorkerEND(fetch_clampleft1)
1218*e1eccf28SAndroid Build Coastguard Worker
1219*e1eccf28SAndroid Build Coastguard WorkerPRIVATE(fetch_clampleft4)               /* as fetch_clampleft1, but the padding value is the first 64-bit group (four 16-bit lanes) */
1220*e1eccf28SAndroid Build Coastguard Worker            stp         x29, x30, [sp, #-16]!   /* save FP/LR: a call follows */
1221*e1eccf28SAndroid Build Coastguard Worker            bl          fetch_generic_asm       /* fetched chunk arrives in v10,v11 */
1222*e1eccf28SAndroid Build Coastguard Worker            dup         v8.2d, v10.d[0]         /* v8 = first four lanes of the chunk, repeated twice */
1223*e1eccf28SAndroid Build Coastguard Worker            dup         v9.2d, v10.d[0]         /* v9 = same padding value */
1224*e1eccf28SAndroid Build Coastguard Worker            ands        x12, x10, #15           /* x12 = number of leading padding lanes; Z set when none */
1225*e1eccf28SAndroid Build Coastguard Worker            beq         1f                      /* x10 is a multiple of 16: nothing to shift */
1226*e1eccf28SAndroid Build Coastguard Worker            sub         x1, x1, x12             /* only 16 - x12 columns are kept, so rewind the row pointers */
1227*e1eccf28SAndroid Build Coastguard Worker            sub         x15, x15, x12
1228*e1eccf28SAndroid Build Coastguard Worker            sub         x19, x19, x12
1229*e1eccf28SAndroid Build Coastguard Worker            sub         x10, x10, x12           /* x10 rounded down to a multiple of 16 */
1230*e1eccf28SAndroid Build Coastguard Worker            sub         x12, sp, x12, LSL #1    /* x12 = sp - 2 bytes per padding lane */
1231*e1eccf28SAndroid Build Coastguard Worker            sub         sp, sp, #64             /* reserve a 64-byte scratch area */
1232*e1eccf28SAndroid Build Coastguard Worker            sub         x12, x12, #32           /* x12 = scratch + 32 - 2 * padding lanes */
1233*e1eccf28SAndroid Build Coastguard Worker            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]     /* scratch = padding (v8,v9) then data (v10,v11) */
1234*e1eccf28SAndroid Build Coastguard Worker            ld1         {v10.8h,v11.8h}, [x12]  /* reload: padding lanes first, then the leading data lanes */
1235*e1eccf28SAndroid Build Coastguard Worker            add         sp, sp, #64             /* release the scratch area */
1236*e1eccf28SAndroid Build Coastguard Worker1:          ldp         x29, x30, [sp], #16
1237*e1eccf28SAndroid Build Coastguard Worker            ret
1238*e1eccf28SAndroid Build Coastguard WorkerEND(fetch_clampleft4)
1239*e1eccf28SAndroid Build Coastguard Worker
1240*e1eccf28SAndroid Build Coastguard Worker/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding
1241*e1eccf28SAndroid Build Coastguard Worker * reading memory beyond that limit, and filling the rest of the vector with
1242*e1eccf28SAndroid Build Coastguard Worker * the last legal pixel.
1243*e1eccf28SAndroid Build Coastguard Worker * Result is in v10 and v11.  v12 and v13 are filled with the last legal pixel.
1244*e1eccf28SAndroid Build Coastguard Worker * Note: This function can read beyond the left edge of input if the image is
1245*e1eccf28SAndroid Build Coastguard Worker * narrower than 16 bytes.
1246*e1eccf28SAndroid Build Coastguard Worker */
1247*e1eccf28SAndroid Build Coastguard WorkerPRIVATE(fetch_clampright1)
1248*e1eccf28SAndroid Build Coastguard Worker            stp         x29, x30, [sp, #-16]!   /* save FP/LR: a call follows on both paths */
1249*e1eccf28SAndroid Build Coastguard Worker            sub         x12, xzr, x11           /* x12 = -x11 */
1250*e1eccf28SAndroid Build Coastguard Worker            ands        x12, x12, #15           /* x12 = (16 - (x11 & 15)) & 15 = lanes to pad on the right */
1251*e1eccf28SAndroid Build Coastguard Worker            beq         1f                      /* x11 is a multiple of 16: the whole chunk is legal */
1252*e1eccf28SAndroid Build Coastguard Worker            sub         x1, x1, x12             /* rewind the row pointers by x12 before the 16-column fetch */
1253*e1eccf28SAndroid Build Coastguard Worker            sub         x15, x15, x12
1254*e1eccf28SAndroid Build Coastguard Worker            sub         x19, x19, x12
1255*e1eccf28SAndroid Build Coastguard Worker            bl          fetch_generic_asm       /* fetched chunk arrives in v10,v11 */
1256*e1eccf28SAndroid Build Coastguard Worker            dup         v12.8h, v11.h[7]        /* v12 = last fetched lane in all eight lanes */
1257*e1eccf28SAndroid Build Coastguard Worker            dup         v13.8h, v11.h[7]        /* v13 = same padding value */
1258*e1eccf28SAndroid Build Coastguard Worker            sub         x12, xzr, x11           /* x12 is recomputed here rather than kept live across the call */
1259*e1eccf28SAndroid Build Coastguard Worker            and         x12, x12, #15
1260*e1eccf28SAndroid Build Coastguard Worker            sub         sp, sp, #64             /* reserve a 64-byte scratch area */
1261*e1eccf28SAndroid Build Coastguard Worker            add         x12, sp, x12, LSL #1    /* x12 = scratch + 2 bytes per padding lane */
1262*e1eccf28SAndroid Build Coastguard Worker            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]     /* scratch = data (v10,v11) then padding (v12,v13) */
1263*e1eccf28SAndroid Build Coastguard Worker            ld1         {v10.8h,v11.8h}, [x12]  /* reload shifted down by x12 lanes; the tail is padding */
1264*e1eccf28SAndroid Build Coastguard Worker            add         sp, sp, #64             /* release the scratch area */
1265*e1eccf28SAndroid Build Coastguard Worker            ldp         x29, x30, [sp], #16
1266*e1eccf28SAndroid Build Coastguard Worker            ret
1267*e1eccf28SAndroid Build Coastguard Worker1:          bl          fetch_generic_asm       /* full chunk: no shifting, only the padding registers are prepared */
1268*e1eccf28SAndroid Build Coastguard Worker            dup         v12.8h, v11.h[7]
1269*e1eccf28SAndroid Build Coastguard Worker            dup         v13.8h, v11.h[7]
1270*e1eccf28SAndroid Build Coastguard Worker            ldp         x29, x30, [sp], #16
1271*e1eccf28SAndroid Build Coastguard Worker            ret
1272*e1eccf28SAndroid Build Coastguard WorkerEND(fetch_clampright1)
1273*e1eccf28SAndroid Build Coastguard Worker
1274*e1eccf28SAndroid Build Coastguard WorkerPRIVATE(fetch_clampright4)              /* as fetch_clampright1, but the padding value is the last 64-bit group (four 16-bit lanes) */
1275*e1eccf28SAndroid Build Coastguard Worker            stp         x29, x30, [sp, #-16]!   /* save FP/LR: a call follows on both paths */
1276*e1eccf28SAndroid Build Coastguard Worker            sub         x12, xzr, x11           /* x12 = -x11 */
1277*e1eccf28SAndroid Build Coastguard Worker            ands        x12, x12, #15           /* x12 = (16 - (x11 & 15)) & 15 = lanes to pad on the right */
1278*e1eccf28SAndroid Build Coastguard Worker            beq         1f                      /* x11 is a multiple of 16: the whole chunk is legal */
1279*e1eccf28SAndroid Build Coastguard Worker            sub         x1, x1, x12             /* rewind the row pointers by x12 before the 16-column fetch */
1280*e1eccf28SAndroid Build Coastguard Worker            sub         x15, x15, x12
1281*e1eccf28SAndroid Build Coastguard Worker            sub         x19, x19, x12
1282*e1eccf28SAndroid Build Coastguard Worker            bl          fetch_generic_asm       /* fetched chunk arrives in v10,v11 */
1283*e1eccf28SAndroid Build Coastguard Worker            dup         v12.2d, v11.d[1]        /* v12 = last four lanes of the chunk, repeated twice */
1284*e1eccf28SAndroid Build Coastguard Worker            dup         v13.2d, v11.d[1]        /* v13 = same padding value */
1285*e1eccf28SAndroid Build Coastguard Worker            sub         x12, xzr, x11           /* x12 is recomputed here rather than kept live across the call */
1286*e1eccf28SAndroid Build Coastguard Worker            and         x12, x12, #15
1287*e1eccf28SAndroid Build Coastguard Worker            sub         sp, sp, #64             /* reserve a 64-byte scratch area */
1288*e1eccf28SAndroid Build Coastguard Worker            add         x12, sp, x12, LSL #1    /* x12 = scratch + 2 bytes per padding lane */
1289*e1eccf28SAndroid Build Coastguard Worker            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]     /* scratch = data (v10,v11) then padding (v12,v13) */
1290*e1eccf28SAndroid Build Coastguard Worker            ld1         {v10.8h,v11.8h}, [x12]  /* reload shifted down by x12 lanes; the tail is padding */
1291*e1eccf28SAndroid Build Coastguard Worker            add         sp, sp, #64             /* release the scratch area */
1292*e1eccf28SAndroid Build Coastguard Worker            ldp         x29, x30, [sp], #16
1293*e1eccf28SAndroid Build Coastguard Worker            ret
1294*e1eccf28SAndroid Build Coastguard Worker1:          bl          fetch_generic_asm       /* full chunk: no shifting, only the padding registers are prepared */
1295*e1eccf28SAndroid Build Coastguard Worker            dup         v12.2d, v11.d[1]
1296*e1eccf28SAndroid Build Coastguard Worker            dup         v13.2d, v11.d[1]
1297*e1eccf28SAndroid Build Coastguard Worker            ldp         x29, x30, [sp], #16
1298*e1eccf28SAndroid Build Coastguard Worker            ret
1299*e1eccf28SAndroid Build Coastguard WorkerEND(fetch_clampright4)
1300*e1eccf28SAndroid Build Coastguard Worker
1301*e1eccf28SAndroid Build Coastguard Worker/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
1302*e1eccf28SAndroid Build Coastguard Worker * value across to fill the rest of the register pair.  Used for filling the
1303*e1eccf28SAndroid Build Coastguard Worker * right hand edge of the window when reading too close to the right hand edge
1304*e1eccf28SAndroid Build Coastguard Worker * of the image.
1305*e1eccf28SAndroid Build Coastguard Worker * Also returns dup-ed copies of the last element in v12 and v13 for the
1306*e1eccf28SAndroid Build Coastguard Worker * tail-fill case (this happens incidentally in the common path, but must be
1307*e1eccf28SAndroid Build Coastguard Worker * done deliberately in the fast-out path).
1308*e1eccf28SAndroid Build Coastguard Worker */
1309*e1eccf28SAndroid Build Coastguard WorkerPRIVATE(prefill_sweepright1)
1310*e1eccf28SAndroid Build Coastguard Worker            ands        x12, x11, #15           /* x12 = count of valid lanes in the chunk; 0 stands for all 16 */
1311*e1eccf28SAndroid Build Coastguard Worker            beq         1f                      /* nothing to sweep: take the fast-out path */
1312*e1eccf28SAndroid Build Coastguard Worker            sub         x12, x12, #1            /* index of the last valid lane */
1313*e1eccf28SAndroid Build Coastguard Worker            sub         sp, sp, #64             /* reserve a 64-byte scratch area */
1314*e1eccf28SAndroid Build Coastguard Worker            st1         {v10.8h,v11.8h}, [sp]   /* chunk goes to the bottom 32 bytes of the scratch area */
1315*e1eccf28SAndroid Build Coastguard Worker            add         x12, sp, x12, LSL #1    /* x12 = address of the last valid 16-bit lane */
1316*e1eccf28SAndroid Build Coastguard Worker            ld1r        {v12.8h}, [x12]         /* v12 = that lane in all eight lanes */
1317*e1eccf28SAndroid Build Coastguard Worker            ld1r        {v13.8h}, [x12]         /* v13 = same value */
1318*e1eccf28SAndroid Build Coastguard Worker            st1         {v12.8h,v13.8h}, [x12]  /* overwrite 32 bytes from that lane on (ends at most 60 bytes into the scratch) */
1319*e1eccf28SAndroid Build Coastguard Worker            ld1         {v10.8h,v11.8h}, [sp]   /* reload the chunk with its tail swept */
1320*e1eccf28SAndroid Build Coastguard Worker            add         sp, sp, #64             /* release the scratch area */
1321*e1eccf28SAndroid Build Coastguard Worker            ret
1322*e1eccf28SAndroid Build Coastguard Worker1:          dup         v12.8h, v11.h[7]        /* fast-out: chunk is untouched, padding = its last lane */
1323*e1eccf28SAndroid Build Coastguard Worker            dup         v13.8h, v11.h[7]
1324*e1eccf28SAndroid Build Coastguard Worker            ret
1325*e1eccf28SAndroid Build Coastguard WorkerEND(prefill_sweepright1)
1326*e1eccf28SAndroid Build Coastguard Worker
/* Four-lanes-per-pixel counterpart of prefill_sweepright1: the unit that is
 * swept is a 64-bit group (four 16-bit lanes) rather than a single lane.
 * In:    v10, v11 -- chunk of sixteen 16-bit lanes
 *        x11      -- stop index; (x11 & 15) lanes of the chunk are valid,
 *                    with 0 standing for all 16
 * Out:   v10, v11 -- every lane after the last valid pixel replaced by it
 *        v12, v13 -- the last valid pixel repeated in both 64-bit halves
 * Uses:  x12, the condition flags, and a 64-byte scratch area that is
 *        reserved and released inside this routine.
 * NOTE(review): assumes (x11 & 15) is a multiple of 4 -- confirm at the call
 * sites, none of which are visible next to this routine.
 */
1327*e1eccf28SAndroid Build Coastguard WorkerPRIVATE(prefill_sweepright4)
1328*e1eccf28SAndroid Build Coastguard Worker            ands        x12, x11, #15           /* x12 = count of valid lanes; Z set when the chunk is full */
1329*e1eccf28SAndroid Build Coastguard Worker            beq         1f                      /* nothing to sweep: take the fast-out path */
1330*e1eccf28SAndroid Build Coastguard Worker            sub         x12, x12, #4            /* lane index where the last valid pixel starts */
1331*e1eccf28SAndroid Build Coastguard Worker            sub         sp, sp, #64             /* reserve a 64-byte scratch area */
1332*e1eccf28SAndroid Build Coastguard Worker            st1         {v10.8h,v11.8h}, [sp]   /* chunk goes to the bottom 32 bytes of the scratch area */
1333*e1eccf28SAndroid Build Coastguard Worker            add         x12, sp, x12, LSL #1    /* x12 = address of the last valid pixel */
1334*e1eccf28SAndroid Build Coastguard Worker            ld1r        {v12.2d}, [x12]         /* v12 = that pixel in both halves */
            ld1r        {v13.2d}, [x12]         /* v13 = same value; it was previously stored without ever being loaded */
1335*e1eccf28SAndroid Build Coastguard Worker            st1         {v12.8h,v13.8h}, [x12]  /* overwrite 32 bytes from that pixel on (ends at most 48 bytes into the scratch) */
1336*e1eccf28SAndroid Build Coastguard Worker            ld1         {v10.8h,v11.8h}, [sp]   /* reload the chunk with its tail swept */
1337*e1eccf28SAndroid Build Coastguard Worker            add         sp, sp, #64             /* release the scratch area */
1338*e1eccf28SAndroid Build Coastguard Worker            ret
1339*e1eccf28SAndroid Build Coastguard Worker1:          dup         v12.2d, v11.d[1]        /* fast-out: chunk is untouched, padding = its last pixel */
1340*e1eccf28SAndroid Build Coastguard Worker            dup         v13.2d, v11.d[1]
1341*e1eccf28SAndroid Build Coastguard Worker            ret
1342*e1eccf28SAndroid Build Coastguard WorkerEND(prefill_sweepright4)
1343*e1eccf28SAndroid Build Coastguard Worker
1344*e1eccf28SAndroid Build Coastguard Worker/* The main loop keeps a sliding window of data that has already been convolved
1345*e1eccf28SAndroid Build Coastguard Worker * in the vertical axis for the current line.  This usually stays in the
1346*e1eccf28SAndroid Build Coastguard Worker * register file, but spills to memory for large windows.  The first thing that
1347*e1eccf28SAndroid Build Coastguard Worker * needs to be done at start-up is to fill this window with image data, taking
1348*e1eccf28SAndroid Build Coastguard Worker * into account the padding needed if the left or right edges of the image fall
1349*e1eccf28SAndroid Build Coastguard Worker * within this window.
1350*e1eccf28SAndroid Build Coastguard Worker */
1351*e1eccf28SAndroid Build Coastguard Worker
1352*e1eccf28SAndroid Build Coastguard Worker/* Because the window is in the register file writes to it cannot be indexed
1353*e1eccf28SAndroid Build Coastguard Worker * by another register.  Consequently the fill loops are unrolled to address
1354*e1eccf28SAndroid Build Coastguard Worker * the registers directly.  This macro distinguishes between writes to the
1355*e1eccf28SAndroid Build Coastguard Worker * register file and writes to the spill buffer (indicated by a destination
1356*e1eccf28SAndroid Build Coastguard Worker * register named xx).
1357*e1eccf28SAndroid Build Coastguard Worker */
1358*e1eccf28SAndroid Build Coastguard Worker.macro prefill_out ra, rb, sra, srb
1359*e1eccf28SAndroid Build Coastguard Worker  .ifc \ra,xx
1360*e1eccf28SAndroid Build Coastguard Worker    .ifc \rb,xx
1361*e1eccf28SAndroid Build Coastguard Worker            st1         {\sra,\srb}, [x9], #32  /* both halves spill: 32 bytes to the buffer at x9, pointer advanced */
1362*e1eccf28SAndroid Build Coastguard Worker    .else
1363*e1eccf28SAndroid Build Coastguard Worker            bic         x9, x9, #0x40           /* clear bit 6 of the spill pointer; NOTE(review): looks like a wrap of the buffer address -- confirm against its allocation */
1364*e1eccf28SAndroid Build Coastguard Worker            st1         {\sra}, [x9], #16       /* first half spills: 16 bytes to the buffer, pointer advanced */
1365*e1eccf28SAndroid Build Coastguard Worker            mov         \rb, \srb               /* second half stays in the register file */
1366*e1eccf28SAndroid Build Coastguard Worker    .endif
1367*e1eccf28SAndroid Build Coastguard Worker  .else
1368*e1eccf28SAndroid Build Coastguard Worker    .ifnc \ra,\sra
1369*e1eccf28SAndroid Build Coastguard Worker            mov         \ra, \sra               /* register destination; omitted when source and destination are the same */
1370*e1eccf28SAndroid Build Coastguard Worker    .endif
1371*e1eccf28SAndroid Build Coastguard Worker    .ifnc \rb,\srb
1372*e1eccf28SAndroid Build Coastguard Worker            mov         \rb, \srb               /* likewise for the second half */
1373*e1eccf28SAndroid Build Coastguard Worker    .endif
1374*e1eccf28SAndroid Build Coastguard Worker  .endif
1375*e1eccf28SAndroid Build Coastguard Worker.endm
1376*e1eccf28SAndroid Build Coastguard Worker
1377*e1eccf28SAndroid Build Coastguard Worker/* This macro provides the list of registers representing the window, and the
1378*e1eccf28SAndroid Build Coastguard Worker * cases where the register file is too small and a spill buffer is used
1379*e1eccf28SAndroid Build Coastguard Worker * instead.
1380*e1eccf28SAndroid Build Coastguard Worker * Since several specialisations of each function are generated, this also
1381*e1eccf28SAndroid Build Coastguard Worker * culls superfluous iterations, and sets the variable `i` for subsequent
1382*e1eccf28SAndroid Build Coastguard Worker * macros indicating the current index into the window.
1383*e1eccf28SAndroid Build Coastguard Worker */
1384*e1eccf28SAndroid Build Coastguard Worker.macro prefill_list, macro, nextmacro, max_r, step, label
1385*e1eccf28SAndroid Build Coastguard Worker  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
1386*e1eccf28SAndroid Build Coastguard Worker    .if windowsize >= (\line * 16)
1387*e1eccf28SAndroid Build Coastguard Worker      .set i, windowsize - (\line * 16)         /* i = window index at which this 16-lane chunk starts */
1388*e1eccf28SAndroid Build Coastguard Worker\label\macro\line:
1389*e1eccf28SAndroid Build Coastguard Worker            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step    /* stage body; it receives the same-chunk and following-chunk labels of the next stage */
1390*e1eccf28SAndroid Build Coastguard Worker    .endif
1391*e1eccf28SAndroid Build Coastguard Worker  .endm
1392*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro, 13, 12, xx,      xx,      \step, \label         /* both halves of this chunk live in the spill buffer */
1393*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro, 12, 11, xx,      xx,      \step, \label         /* both halves of this chunk live in the spill buffer */
1394*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro, 11, 10, xx,      v17.16b, \step, \label         /* first half spills, second half is v17 */
1395*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro, 10,  9, v18.16b, v19.16b, \step, \label
1396*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro,  9,  8, v20.16b, v21.16b, \step, \label
1397*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro,  8,  7, v22.16b, v23.16b, \step, \label
1398*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro,  7,  6, v24.16b, v25.16b, \step, \label
1399*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro,  6,  5, v26.16b, v27.16b, \step, \label
1400*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro,  5,  4, v28.16b, v29.16b, \step, \label
1401*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro,  4,  3, v30.16b, v31.16b, \step, \label
1402*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro,  3,  2, v4.16b,  v5.16b,  \step, \label
1403*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro,  2,  1, v6.16b,  v7.16b,  \step, \label
1404*e1eccf28SAndroid Build Coastguard Worker            ifneeded \macro \nextmacro,  1,  0, v8.16b,  v9.16b,  \step, \label         /* last chunk of the window */
1405*e1eccf28SAndroid Build Coastguard Worker\label\macro\()0:
1406*e1eccf28SAndroid Build Coastguard Worker            b           \label\()_end           /* past the last chunk: skip the stages laid out after this one */
1407*e1eccf28SAndroid Build Coastguard Worker  .purgem ifneeded
1408*e1eccf28SAndroid Build Coastguard Worker.endm
1409*e1eccf28SAndroid Build Coastguard Worker
1410*e1eccf28SAndroid Build Coastguard Worker/* These macros represent the possible stages of filling the window.
1411*e1eccf28SAndroid Build Coastguard Worker * Each macro is unrolled enough times that it can fill the entire window
1412*e1eccf28SAndroid Build Coastguard Worker * itself, but normally it will have to hand control to subsequent macros
1413*e1eccf28SAndroid Build Coastguard Worker * part-way through and this is done using labels named \next and \after, where
1414*e1eccf28SAndroid Build Coastguard Worker * \next is the next macro starting at the same window position and \after is
1415*e1eccf28SAndroid Build Coastguard Worker * the next macro starting after the current window position.
1416*e1eccf28SAndroid Build Coastguard Worker */
1417*e1eccf28SAndroid Build Coastguard Worker
1418*e1eccf28SAndroid Build Coastguard Worker/* leftfill: v8 and v9 contain the left padding value.  While the window
1419*e1eccf28SAndroid Build Coastguard Worker * extends outside of the image on the left-hand side, and at least 16 more
1420*e1eccf28SAndroid Build Coastguard Worker * padding values are needed in the window, store v8 and v9 into the window.
1421*e1eccf28SAndroid Build Coastguard Worker * Otherwise skip forward to storing image data.
1422*e1eccf28SAndroid Build Coastguard Worker */
1423*e1eccf28SAndroid Build Coastguard Worker.macro prefill_leftfill, next, after, ra, rb, step
1424*e1eccf28SAndroid Build Coastguard Worker            cmp         x10, #i+16              /* x10 = window index where legal data begins; i+16 = end of this chunk */
1425*e1eccf28SAndroid Build Coastguard Worker            blo         \next                   /* legal data begins before this chunk ends: hand over at the same chunk */
1426*e1eccf28SAndroid Build Coastguard Worker            prefill_out \ra, \rb, v8.16b, v9.16b        /* the whole chunk is left padding */
1427*e1eccf28SAndroid Build Coastguard Worker.endm
1428*e1eccf28SAndroid Build Coastguard Worker
1429*e1eccf28SAndroid Build Coastguard Worker/* leftedge: The very first non-fill or partial-fill chunk from the image is
1430*e1eccf28SAndroid Build Coastguard Worker * already loaded (as it was used to calculate the left padding value), so
1431*e1eccf28SAndroid Build Coastguard Worker * store it here, and then drop into the regular load/store cycle in the next
1432*e1eccf28SAndroid Build Coastguard Worker * macro.
1433*e1eccf28SAndroid Build Coastguard Worker */
1434*e1eccf28SAndroid Build Coastguard Worker.macro prefill_leftedge, next, after, ra, rb, step
1435*e1eccf28SAndroid Build Coastguard Worker1:          prefill_out \ra, \rb, v10.16b, v11.16b      /* store the chunk that is already in v10,v11; nothing in this macro branches to the local label 1 */
1436*e1eccf28SAndroid Build Coastguard Worker            b           \after                  /* continue with the next stage at the following chunk */
1437*e1eccf28SAndroid Build Coastguard Worker.endm
1438*e1eccf28SAndroid Build Coastguard Worker
1439*e1eccf28SAndroid Build Coastguard Worker/* dofetch: Copy chunks of the image into the window without any complications
1440*e1eccf28SAndroid Build Coastguard Worker * from edge conditions.
1441*e1eccf28SAndroid Build Coastguard Worker */
1442*e1eccf28SAndroid Build Coastguard Worker.macro prefill_dofetch, next, after, ra, rb, step
1443*e1eccf28SAndroid Build Coastguard Worker            cmp         x11, #i+16              /* x11 = window index where legal data ends; i+16 = end of this chunk */
1444*e1eccf28SAndroid Build Coastguard Worker            bls         \next                   /* legal data ends inside or at the end of this chunk: hand over at the same chunk */
1445*e1eccf28SAndroid Build Coastguard Worker            bl          fetch_generic_asm       /* plain 16-column fetch into v10,v11 */
1446*e1eccf28SAndroid Build Coastguard Worker            prefill_out \ra, \rb, v10.16b, v11.16b
1447*e1eccf28SAndroid Build Coastguard Worker.endm
1448*e1eccf28SAndroid Build Coastguard Worker
1449*e1eccf28SAndroid Build Coastguard Worker/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
1450*e1eccf28SAndroid Build Coastguard Worker * the right-hand edge of the image.  In that case sweep the last valid pixel
1451*e1eccf28SAndroid Build Coastguard Worker * across the rest of the chunk, and in either case prepare padding data in v12
1452*e1eccf28SAndroid Build Coastguard Worker * and v13 for the next macro.  This is done in fetch_clampright.
1453*e1eccf28SAndroid Build Coastguard Worker * This only happens once before going on to the next macro.
1454*e1eccf28SAndroid Build Coastguard Worker * Sometimes leftedge also covers the rightedge case, in which case this has
1455*e1eccf28SAndroid Build Coastguard Worker * to be skipped altogether.
1456*e1eccf28SAndroid Build Coastguard Worker */
1457*e1eccf28SAndroid Build Coastguard Worker.macro prefill_rightedge, next, after, ra, rb, step
1458*e1eccf28SAndroid Build Coastguard Worker            cmp         x11, #i                 /* x11 = window index where legal data ends; i = start of this chunk */
1459*e1eccf28SAndroid Build Coastguard Worker            bls         \next                   /* no legal data in this chunk: hand over at the same chunk */
1460*e1eccf28SAndroid Build Coastguard Worker            bl          fetch_clampright\step   /* step selects fetch_clampright1 or fetch_clampright4 */
1461*e1eccf28SAndroid Build Coastguard Worker            prefill_out \ra, \rb, v10.16b, v11.16b
1462*e1eccf28SAndroid Build Coastguard Worker            b           \after                  /* continue with the next stage at the following chunk */
1463*e1eccf28SAndroid Build Coastguard Worker.endm
1464*e1eccf28SAndroid Build Coastguard Worker
1465*e1eccf28SAndroid Build Coastguard Worker/* rightfill: The rest of the window is simply filled with right padding from
1466*e1eccf28SAndroid Build Coastguard Worker * v12 and v13.
1467*e1eccf28SAndroid Build Coastguard Worker */
1468*e1eccf28SAndroid Build Coastguard Worker.macro prefill_rightfill, next, after, ra, rb, step
1469*e1eccf28SAndroid Build Coastguard Worker            prefill_out \ra, \rb, v12.16b, v13.16b      /* unconditional: the next and after labels are not used by this stage */
1470*e1eccf28SAndroid Build Coastguard Worker.endm
1471*e1eccf28SAndroid Build Coastguard Worker
1472*e1eccf28SAndroid Build Coastguard Worker/* Here all of the macros above are unrolled and laid out in the proper order.
1473*e1eccf28SAndroid Build Coastguard Worker */
1474*e1eccf28SAndroid Build Coastguard Worker.macro prefill_body, max_r, step, label
1475*e1eccf28SAndroid Build Coastguard Worker            prefill_list leftfill,  leftedge,   \max_r, \step, \label   /* stage 1: whole chunks of left padding */
1476*e1eccf28SAndroid Build Coastguard Worker            prefill_list leftedge,  dofetch,    \max_r, \step, \label   /* stage 2: the chunk already held in v10,v11 */
1477*e1eccf28SAndroid Build Coastguard Worker            prefill_list dofetch,   rightedge,  \max_r, \step, \label   /* stage 3: plain fetches */
1478*e1eccf28SAndroid Build Coastguard Worker            prefill_list rightedge, rightfill,  \max_r, \step, \label   /* stage 4: the chunk that contains the right edge */
1479*e1eccf28SAndroid Build Coastguard Worker            prefill_list rightfill, oops,       \max_r, \step, \label   /* stage 5: right padding; "oops" only builds label names that this stage never branches to */
1480*e1eccf28SAndroid Build Coastguard Worker\label\()_end:
1481*e1eccf28SAndroid Build Coastguard Worker.endm
1482*e1eccf28SAndroid Build Coastguard Worker
1483*e1eccf28SAndroid Build Coastguard Worker
1484*e1eccf28SAndroid Build Coastguard Worker/* Fill the convolution window with context data.  The aim here is to load
1485*e1eccf28SAndroid Build Coastguard Worker * exactly 2*r columns, and in the main loop to read as many columns as will be
1486*e1eccf28SAndroid Build Coastguard Worker * written.  This is complicated by the window being divided into chunks at
1487*e1eccf28SAndroid Build Coastguard Worker * register boundaries, and the need to handle cases when the input starts very
1488*e1eccf28SAndroid Build Coastguard Worker * close to the left or right (or both) edges of the image and the need to fill
1489*e1eccf28SAndroid Build Coastguard Worker * the spaces that leaves with left and right edge padding values.
1490*e1eccf28SAndroid Build Coastguard Worker *
1491*e1eccf28SAndroid Build Coastguard Worker * Input:
1492*e1eccf28SAndroid Build Coastguard Worker *      x1 -- src
1493*e1eccf28SAndroid Build Coastguard Worker *      x2 -- pitch
1494*e1eccf28SAndroid Build Coastguard Worker *      x3 -- count
1495*e1eccf28SAndroid Build Coastguard Worker *      x4 -- available image data right of src pointer
1496*e1eccf28SAndroid Build Coastguard Worker *      x5 -- r
1497*e1eccf28SAndroid Build Coastguard Worker *      x6 -- rup
1498*e1eccf28SAndroid Build Coastguard Worker *      x7 -- rdn
1499*e1eccf28SAndroid Build Coastguard Worker *      x8 -- available image data left of src pointer
1500*e1eccf28SAndroid Build Coastguard Worker *      x9 -- buffer (if needed)
1501*e1eccf28SAndroid Build Coastguard Worker *      x13 = -pitch
1502*e1eccf28SAndroid Build Coastguard Worker *      x15 = top-row in
1503*e1eccf28SAndroid Build Coastguard Worker *      x19 = bottom-row in
1504*e1eccf28SAndroid Build Coastguard Worker * Output:
1505*e1eccf28SAndroid Build Coastguard Worker *      x4 -= min(inlen, count + windowsize - centertap)
1506*e1eccf28SAndroid Build Coastguard Worker *      x1 += min(inlen, count + windowsize - centertap)
1507*e1eccf28SAndroid Build Coastguard Worker *      x15 += min(inlen, count + windowsize - centertap)
1508*e1eccf28SAndroid Build Coastguard Worker *      x19 += min(inlen, count + windowsize - centertap)
1509*e1eccf28SAndroid Build Coastguard Worker * Modifies:
1510*e1eccf28SAndroid Build Coastguard Worker *      x10 -- fill start index in the window
1511*e1eccf28SAndroid Build Coastguard Worker *      x11 -- fill stop index in the window
1512*e1eccf28SAndroid Build Coastguard Worker *      x12 -- scratch
1513*e1eccf28SAndroid Build Coastguard Worker */
1514*e1eccf28SAndroid Build Coastguard Worker.macro prefill step=1, max_r=25, label=xx
1515*e1eccf28SAndroid Build Coastguard Worker.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)    // 2 * max_r * step, rounded up to a multiple of 16
1516*e1eccf28SAndroid Build Coastguard Worker.set centertap, (windowsize - \max_r * \step)                // window index that the incoming pointers correspond to
1517*e1eccf28SAndroid Build Coastguard Worker            mov         x10, #centertap
1518*e1eccf28SAndroid Build Coastguard Worker            subs        x10, x10, x8
1519*e1eccf28SAndroid Build Coastguard Worker            csel        x10, xzr, x10, lo       // x10 = max(centertap - x8, 0)
1520*e1eccf28SAndroid Build Coastguard Worker
1521*e1eccf28SAndroid Build Coastguard Worker            subs        x11, x4, #windowsize - centertap
1522*e1eccf28SAndroid Build Coastguard Worker            csel        x11, xzr, x11, hs
1523*e1eccf28SAndroid Build Coastguard Worker            add         x11, x11, #windowsize   // x11 = min(x4 + centertap, windowsize)
1524*e1eccf28SAndroid Build Coastguard Worker
1525*e1eccf28SAndroid Build Coastguard Worker            /* x10 indicates where in the window legal image data begins.
1526*e1eccf28SAndroid Build Coastguard Worker             * x11 indicates where in the window legal image data ends.
1527*e1eccf28SAndroid Build Coastguard Worker             * When starting near the centre of a large image these would be
1528*e1eccf28SAndroid Build Coastguard Worker             * zero and windowsize respectively, but when starting near the
1529*e1eccf28SAndroid Build Coastguard Worker             * edges this can change.
1530*e1eccf28SAndroid Build Coastguard Worker             * When starting on the leftmost pixel, x10 will be centertap.
1531*e1eccf28SAndroid Build Coastguard Worker             * When starting on the rightmost pixel, x11 will be centertap+1.
1532*e1eccf28SAndroid Build Coastguard Worker             */
1533*e1eccf28SAndroid Build Coastguard Worker
1534*e1eccf28SAndroid Build Coastguard Worker            /* x4 indicates how much data there is between the current pointers
1535*e1eccf28SAndroid Build Coastguard Worker             * and the right edge of the image.  The pointers currently point
1536*e1eccf28SAndroid Build Coastguard Worker             * to the data needed at centertap.  The subsequent code will
1537*e1eccf28SAndroid Build Coastguard Worker             * consume (windowsize - x10) data, but only the data from
1538*e1eccf28SAndroid Build Coastguard Worker             * centertap to windowsize comes out of x4's budget.
1539*e1eccf28SAndroid Build Coastguard Worker             */
1540*e1eccf28SAndroid Build Coastguard Worker1:          subs        x4, x4, #windowsize - centertap
1541*e1eccf28SAndroid Build Coastguard Worker            csel        x4, xzr, x4, lo         // x4 = max(x4 - (windowsize - centertap), 0)
1542*e1eccf28SAndroid Build Coastguard Worker
1543*e1eccf28SAndroid Build Coastguard Worker            /* And the pointers need to rewind to the start of the window.
1544*e1eccf28SAndroid Build Coastguard Worker             */
1545*e1eccf28SAndroid Build Coastguard Worker            sub         x1, x1, #centertap
1546*e1eccf28SAndroid Build Coastguard Worker            sub         x15, x15, #centertap
1547*e1eccf28SAndroid Build Coastguard Worker            sub         x19, x19, #centertap
1548*e1eccf28SAndroid Build Coastguard Worker
1549*e1eccf28SAndroid Build Coastguard Worker            /* Unless x8 indicated that there wasn't that much data available.
1550*e1eccf28SAndroid Build Coastguard Worker             */
1551*e1eccf28SAndroid Build Coastguard Worker            add         x1, x1, x10             // net rewind of all three pointers is centertap - x10
1552*e1eccf28SAndroid Build Coastguard Worker            add         x15, x15, x10
1553*e1eccf28SAndroid Build Coastguard Worker            add         x19, x19, x10
1554*e1eccf28SAndroid Build Coastguard Worker
1555*e1eccf28SAndroid Build Coastguard Worker            /* Get the first chunk, and add padding to align it to the window
1556*e1eccf28SAndroid Build Coastguard Worker             * if necessary.
1557*e1eccf28SAndroid Build Coastguard Worker             */
1558*e1eccf28SAndroid Build Coastguard Worker            bl          fetch_clampleft\step
1559*e1eccf28SAndroid Build Coastguard Worker
1560*e1eccf28SAndroid Build Coastguard Worker            /* Sometimes the start and the end of the window are in the same
1561*e1eccf28SAndroid Build Coastguard Worker             * chunk.  In that case both ends need filler at the outset.
1562*e1eccf28SAndroid Build Coastguard Worker             */
1563*e1eccf28SAndroid Build Coastguard Worker            sub         x12, x11, #1            // x12 = index of the last legal byte in the window
1564*e1eccf28SAndroid Build Coastguard Worker            eor         x12,  x10, x12          // bits in which the first and last legal index differ
1565*e1eccf28SAndroid Build Coastguard Worker            cmp         x12, #16
1566*e1eccf28SAndroid Build Coastguard Worker            bhs         1f                      // differing at bit 4 or above: different 16-byte chunks, skip the sweep
1567*e1eccf28SAndroid Build Coastguard Worker            bl          prefill_sweepright\step
1568*e1eccf28SAndroid Build Coastguard Worker
1569*e1eccf28SAndroid Build Coastguard Worker            /* Iterate through all the points in the window and fill them in
1570*e1eccf28SAndroid Build Coastguard Worker             * with padding or image data as needed.
1571*e1eccf28SAndroid Build Coastguard Worker             */
1572*e1eccf28SAndroid Build Coastguard Worker1:          prefill_body \max_r, \step, \label
1573*e1eccf28SAndroid Build Coastguard Worker.endm
1574*e1eccf28SAndroid Build Coastguard Worker
1575*e1eccf28SAndroid Build Coastguard Worker/* The main body of the convolve functions.  Having already pre-filled the
1576*e1eccf28SAndroid Build Coastguard Worker * convolution window with 2*r input values, the logic settles into a regular
1577*e1eccf28SAndroid Build Coastguard Worker * pattern of reading and writing at a 1:1 rate until either input or output
1578*e1eccf28SAndroid Build Coastguard Worker * expires.  The input leads the output by r values, so when processing all the
1579*e1eccf28SAndroid Build Coastguard Worker * way to the right-hand edge, or within r pixels of that edge, the input will
1580*e1eccf28SAndroid Build Coastguard Worker * run out first.  In the case of very narrow images, or sub-windows starting
1581*e1eccf28SAndroid Build Coastguard Worker * near the right edge, the input may already have run out while the
1582*e1eccf28SAndroid Build Coastguard Worker * convolution window was being filled and this loop will start with a
1583*e1eccf28SAndroid Build Coastguard Worker * zero-length input.
1584*e1eccf28SAndroid Build Coastguard Worker *
1585*e1eccf28SAndroid Build Coastguard Worker * Once the input runs out, the rest of the output must be processed by padding
1586*e1eccf28SAndroid Build Coastguard Worker * the remainder of the window with pad value from the last valid pixel from
1587*e1eccf28SAndroid Build Coastguard Worker * the source.
1588*e1eccf28SAndroid Build Coastguard Worker *
1589*e1eccf28SAndroid Build Coastguard Worker * Input:
1590*e1eccf28SAndroid Build Coastguard Worker *      x0 = dst
1591*e1eccf28SAndroid Build Coastguard Worker *      x1 = src
1592*e1eccf28SAndroid Build Coastguard Worker *      x2 = pitch
1593*e1eccf28SAndroid Build Coastguard Worker *      x3 = count
1594*e1eccf28SAndroid Build Coastguard Worker *      x4 = inlen
1595*e1eccf28SAndroid Build Coastguard Worker *      x5 = r
1596*e1eccf28SAndroid Build Coastguard Worker *      x6 = rup
1597*e1eccf28SAndroid Build Coastguard Worker *      x7 = rdn
1598*e1eccf28SAndroid Build Coastguard Worker *      x9 = buffer
1599*e1eccf28SAndroid Build Coastguard Worker *      x13 = -pitch
1600*e1eccf28SAndroid Build Coastguard Worker *      x15 = top-row in
1601*e1eccf28SAndroid Build Coastguard Worker *      x19 = bottom-row in
1602*e1eccf28SAndroid Build Coastguard Worker * Modifies
1603*e1eccf28SAndroid Build Coastguard Worker *      x8 = fetch code pointer (x11 and x12 are scratch; x0 is set to 0 on exit)
1604*e1eccf28SAndroid Build Coastguard Worker */
1605*e1eccf28SAndroid Build Coastguard Worker.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
1606*e1eccf28SAndroid Build Coastguard Worker
1607*e1eccf28SAndroid Build Coastguard Worker            /* If x4 >= x3 then there's no need for clipping.  The main loop
1608*e1eccf28SAndroid Build Coastguard Worker             * needs to exit when either x3 or x4 runs out, so clamp x4 to be
1609*e1eccf28SAndroid Build Coastguard Worker             * no greater than x3 and use x4 for the loop.
1610*e1eccf28SAndroid Build Coastguard Worker             * However, if x4 comes out of the loop with less than 16 bytes
1611*e1eccf28SAndroid Build Coastguard Worker             * left, a partial read would be necessary to avoid reading beyond
1612*e1eccf28SAndroid Build Coastguard Worker             * the end of the image.  To avoid this, clamp x4 to the next
1613*e1eccf28SAndroid Build Coastguard Worker             * multiple of 16, which is still sufficient to force it out of the
1614*e1eccf28SAndroid Build Coastguard Worker             * loop but doesn't imply a rewind.
1615*e1eccf28SAndroid Build Coastguard Worker             */
1616*e1eccf28SAndroid Build Coastguard Worker            add         x12, x3, #15
1617*e1eccf28SAndroid Build Coastguard Worker            bic         x12, x12, #15           // x12 = x3 rounded up to a multiple of 16
1618*e1eccf28SAndroid Build Coastguard Worker            cmp         x4, x12
1619*e1eccf28SAndroid Build Coastguard Worker            csel        x4, x12, x4, hi         // x4 = min(x4, x12), unsigned
1620*e1eccf28SAndroid Build Coastguard Worker
1621*e1eccf28SAndroid Build Coastguard Worker            /* First calculate the entry-point into the internal fetch logic.
1622*e1eccf28SAndroid Build Coastguard Worker             * This is done so the same function can service several kernel
1623*e1eccf28SAndroid Build Coastguard Worker             * sizes.
1624*e1eccf28SAndroid Build Coastguard Worker             */
1625*e1eccf28SAndroid Build Coastguard Worker            adrp        x8, \labelnc
1626*e1eccf28SAndroid Build Coastguard Worker            add         x8, x8, #:lo12:\labelnc
1627*e1eccf28SAndroid Build Coastguard Worker            sub         x8, x8, x5, LSL #5
1628*e1eccf28SAndroid Build Coastguard Worker            sub         x8, x8, x5, LSL #3      // x8 = labelnc - 40 * r
1629*e1eccf28SAndroid Build Coastguard Worker            cmp         x5, x6
1630*e1eccf28SAndroid Build Coastguard Worker            ccmp        x5, x7, #0, eq          // Z stays set only if r == rup and r == rdn
1631*e1eccf28SAndroid Build Coastguard Worker            beq         5f                      // no clamping needed: keep the labelnc entry point
1632*e1eccf28SAndroid Build Coastguard Worker
1633*e1eccf28SAndroid Build Coastguard Worker            /* if (r != rup || r != rdn) then the address-clamping table should
1634*e1eccf28SAndroid Build Coastguard Worker             * be used rather than the short-cut version.
1635*e1eccf28SAndroid Build Coastguard Worker             */
1636*e1eccf28SAndroid Build Coastguard Worker            adrp        x8, \labelc
1637*e1eccf28SAndroid Build Coastguard Worker            add         x8, x8, #:lo12:\labelc
1638*e1eccf28SAndroid Build Coastguard Worker            sub         x8, x8, x5, LSL #6
1639*e1eccf28SAndroid Build Coastguard Worker            add         x8, x8, x5, LSL #3      // x8 = labelc - 56 * r
1640*e1eccf28SAndroid Build Coastguard Worker            b           5f                      // enter the loop at its exit test
1641*e1eccf28SAndroid Build Coastguard Worker
1642*e1eccf28SAndroid Build Coastguard Worker            /* Main loop: ... */
1643*e1eccf28SAndroid Build Coastguard Worker            .align  4
1644*e1eccf28SAndroid Build Coastguard Worker3:          /* first perform a vertical convolution from memory to get the next
1645*e1eccf28SAndroid Build Coastguard Worker             * 16 taps of the horizontal window into the register file...
1646*e1eccf28SAndroid Build Coastguard Worker             */
1647*e1eccf28SAndroid Build Coastguard Worker            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8
1648*e1eccf28SAndroid Build Coastguard Worker
1649*e1eccf28SAndroid Build Coastguard Worker            /* ...then perform a horizontal convolution on that window to
1650*e1eccf28SAndroid Build Coastguard Worker             * produce eight output bytes, and slide the window along.
1651*e1eccf28SAndroid Build Coastguard Worker             * This has to be done twice to match the 16-way vertical pass.
1652*e1eccf28SAndroid Build Coastguard Worker             * It would be preferable to have twice the work done in \core, but
1653*e1eccf28SAndroid Build Coastguard Worker             * that would demand yet another variant on those macros and would
1654*e1eccf28SAndroid Build Coastguard Worker             * perturb the register allocation severely.
1655*e1eccf28SAndroid Build Coastguard Worker             */
1656*e1eccf28SAndroid Build Coastguard Worker            \core
1657*e1eccf28SAndroid Build Coastguard Worker            st1         {v15.8b}, [x0], #8
1658*e1eccf28SAndroid Build Coastguard Worker            \core
1659*e1eccf28SAndroid Build Coastguard Worker            st1         {v15.8b}, [x0], #8
1660*e1eccf28SAndroid Build Coastguard Worker
1661*e1eccf28SAndroid Build Coastguard Worker            sub         x3, x3, #16             // account for the 16 bytes stored above
1662*e1eccf28SAndroid Build Coastguard Worker5:          subs        x4, x4, #16
1663*e1eccf28SAndroid Build Coastguard Worker            bhi         3b                      // iterate while more than 16 input bytes remained
1664*e1eccf28SAndroid Build Coastguard Worker            /* Here there's 16 or fewer bytes available before the edge of the
1665*e1eccf28SAndroid Build Coastguard Worker             * source image.  x4 holds that count minus 16 (because it was
1666*e1eccf28SAndroid Build Coastguard Worker             * decremented before the first iteration ran).  The last read may
1667*e1eccf28SAndroid Build Coastguard Worker             * not be a whole chunk, and beyond that a fill value must be used.
1668*e1eccf28SAndroid Build Coastguard Worker             *
1669*e1eccf28SAndroid Build Coastguard Worker             * Of course, none of that matters if there's no more output to
1670*e1eccf28SAndroid Build Coastguard Worker             * produce...
1671*e1eccf28SAndroid Build Coastguard Worker             */
1672*e1eccf28SAndroid Build Coastguard Worker            cbz         x3, 5f                  // nothing left to write: go to the exit
1673*e1eccf28SAndroid Build Coastguard Worker
1674*e1eccf28SAndroid Build Coastguard Worker            /* Oh well. */
1675*e1eccf28SAndroid Build Coastguard Worker            adds        x4, x4, #16             // undo the last decrement; Z is set when no input is left
1676*e1eccf28SAndroid Build Coastguard Worker            bne         1f                      // some input remains: do one clamped fetch
1677*e1eccf28SAndroid Build Coastguard Worker  .if \step==1
1678*e1eccf28SAndroid Build Coastguard Worker            dup         v10.8h, v9.h[7]         // no input left: fill v10 and v11 with the top halfword of v9
1679*e1eccf28SAndroid Build Coastguard Worker            dup         v11.8h, v9.h[7]
1680*e1eccf28SAndroid Build Coastguard Worker  .else
1681*e1eccf28SAndroid Build Coastguard Worker            dup         v10.2d, v9.d[1]         // no input left: fill v10 and v11 with the top doubleword of v9
1682*e1eccf28SAndroid Build Coastguard Worker            dup         v11.2d, v9.d[1]
1683*e1eccf28SAndroid Build Coastguard Worker  .endif
1684*e1eccf28SAndroid Build Coastguard Worker            b           3f
1685*e1eccf28SAndroid Build Coastguard Worker
1686*e1eccf28SAndroid Build Coastguard Worker            /* To avoid reading past end of input, rewind pointers by (16-x4)
1687*e1eccf28SAndroid Build Coastguard Worker             * to ensure that they're exactly 16 bytes from the edge.
1688*e1eccf28SAndroid Build Coastguard Worker             */
1689*e1eccf28SAndroid Build Coastguard Worker1:          mov         x11, x4                 // x11 = remaining input bytes; NOTE(review): presumably the count the clamp-right fetch expects, confirm there
1690*e1eccf28SAndroid Build Coastguard Worker            bl          fetch_clampright\step
1691*e1eccf28SAndroid Build Coastguard Worker            /* Now to put this padding to use, perform any remaining
1692*e1eccf28SAndroid Build Coastguard Worker             * iterations.  This is done at half the rate of the main loop,
1693*e1eccf28SAndroid Build Coastguard Worker             * because there's no longer pressure from a 16-lane window filler.
1694*e1eccf28SAndroid Build Coastguard Worker             */
1695*e1eccf28SAndroid Build Coastguard Worker3:          \core
1696*e1eccf28SAndroid Build Coastguard Worker  .if \step==1
1697*e1eccf28SAndroid Build Coastguard Worker            dup         v11.8h, v11.h[7]        // refill v11 with its own top halfword
1698*e1eccf28SAndroid Build Coastguard Worker  .else
1699*e1eccf28SAndroid Build Coastguard Worker            dup         v11.2d, v11.d[1]        // refill v11 with its own top doubleword
1700*e1eccf28SAndroid Build Coastguard Worker  .endif
1701*e1eccf28SAndroid Build Coastguard Worker            subs        x3, x3, #8
1702*e1eccf28SAndroid Build Coastguard Worker            blo         4f                      // fewer than 8 bytes were still wanted: partial store
1703*e1eccf28SAndroid Build Coastguard Worker            st1         {v15.8b}, [x0], #8
1704*e1eccf28SAndroid Build Coastguard Worker            bne         3b                      // flags are still those of the subs: more output remains
1705*e1eccf28SAndroid Build Coastguard Worker            b           5f
1706*e1eccf28SAndroid Build Coastguard Worker
1707*e1eccf28SAndroid Build Coastguard Worker            /* If the final iteration contained 0 < l < 8 values, then perform
1708*e1eccf28SAndroid Build Coastguard Worker             * a piecewise store of the final vector.
1709*e1eccf28SAndroid Build Coastguard Worker             */
1710*e1eccf28SAndroid Build Coastguard Worker4:          tbz         x3, #2, 1f              // subtracting 8 left the low three bits of the count intact
1711*e1eccf28SAndroid Build Coastguard Worker            st1         {v15.s}[0], [x0], #4
1712*e1eccf28SAndroid Build Coastguard Worker            ext         v15.8b, v15.8b, v15.8b, #4      // bring the next unwritten bytes down to lane 0
1713*e1eccf28SAndroid Build Coastguard Worker1:          tbz         x3, #1, 1f
1714*e1eccf28SAndroid Build Coastguard Worker            st1         {v15.h}[0], [x0], #2
1715*e1eccf28SAndroid Build Coastguard Worker            ext         v15.8b, v15.8b, v15.8b, #2
1716*e1eccf28SAndroid Build Coastguard Worker1:          tbz         x3, #0, 5f
1717*e1eccf28SAndroid Build Coastguard Worker            st1         {v15.b}[0], [x0], #1
1718*e1eccf28SAndroid Build Coastguard Worker            ext         v15.8b, v15.8b, v15.8b, #1
1719*e1eccf28SAndroid Build Coastguard Worker5:          mov         x0, #0                  // every path leaves the macro with x0 = 0
1720*e1eccf28SAndroid Build Coastguard Worker.endm
1721*e1eccf28SAndroid Build Coastguard Worker
1722*e1eccf28SAndroid Build Coastguard Worker
/* Emit one file-local convolve1_<r> routine (step=1) for every entry of
 * TUNED_LIST1 and for 25.  Each routine saves x29/x30, because the expanded
 * prefill and conv_body code contains bl instructions that overwrite x30,
 * then runs the window prefill followed by the main convolution body and
 * returns through the restored x30.
 */
1723*e1eccf28SAndroid Build Coastguard Worker.irp r, TUNED_LIST1, 25
1724*e1eccf28SAndroid Build Coastguard WorkerPRIVATE(convolve1_\r)
1725*e1eccf28SAndroid Build Coastguard Worker            stp         x29,x30, [sp, #-16]!    // push x29 and the return address
1726*e1eccf28SAndroid Build Coastguard Worker
1727*e1eccf28SAndroid Build Coastguard Worker            prefill     step=1, max_r=\r, label=.Lcnv1_\r
1728*e1eccf28SAndroid Build Coastguard Worker
1729*e1eccf28SAndroid Build Coastguard Worker            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
1730*e1eccf28SAndroid Build Coastguard Worker
1731*e1eccf28SAndroid Build Coastguard Worker            ldp         x29,x30, [sp], #16      // pop x29 and the return address
1732*e1eccf28SAndroid Build Coastguard Worker            ret
1733*e1eccf28SAndroid Build Coastguard WorkerEND(convolve1_\r)
1734*e1eccf28SAndroid Build Coastguard Worker.endr
1735*e1eccf28SAndroid Build Coastguard Worker
/* Emit one file-local convolve4_<r> routine (step=4) for every entry of
 * TUNED_LIST4 and for 25.  In addition to saving x29/x30, each routine
 * reserves stack space for an aligned scratch buffer whose address is passed
 * on in x9, then runs the window prefill and the main convolution body.
 */
1736*e1eccf28SAndroid Build Coastguard Worker.irp r, TUNED_LIST4, 25
1737*e1eccf28SAndroid Build Coastguard WorkerPRIVATE(convolve4_\r)
1738*e1eccf28SAndroid Build Coastguard Worker            sub         x9, sp, #0x40           // unaligned buffer address, 0x40 below the incoming sp
1739*e1eccf28SAndroid Build Coastguard Worker            stp         x29,x30, [sp, #-(16 + 0x40 + 0x80)]!    // frame = saved pair + 0x40 buffer + 0x80 alignment slack
1740*e1eccf28SAndroid Build Coastguard Worker            bic         x9, x9, #0x7f           // round down to a 128-byte boundary; stays above the saved pair
1741*e1eccf28SAndroid Build Coastguard Worker
1742*e1eccf28SAndroid Build Coastguard Worker            /* x9 now points to a 0x40 byte buffer on the stack whose address
1743*e1eccf28SAndroid Build Coastguard Worker             * has the low 7 bits clear.  This allows easy address calculation
1744*e1eccf28SAndroid Build Coastguard Worker             * in the wrap-around cases.
1745*e1eccf28SAndroid Build Coastguard Worker             */
1746*e1eccf28SAndroid Build Coastguard Worker
1747*e1eccf28SAndroid Build Coastguard Worker            prefill     step=4, max_r=\r, label=.Lcnv4_\r
1748*e1eccf28SAndroid Build Coastguard Worker
1749*e1eccf28SAndroid Build Coastguard Worker            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
1750*e1eccf28SAndroid Build Coastguard Worker
1751*e1eccf28SAndroid Build Coastguard Worker            ldp         x29,x30, [sp], #(16 + 0x40 + 0x80)      // release the whole frame
1752*e1eccf28SAndroid Build Coastguard Worker            ret
1753*e1eccf28SAndroid Build Coastguard WorkerEND(convolve4_\r)
1754*e1eccf28SAndroid Build Coastguard Worker.endr
1755*e1eccf28SAndroid Build Coastguard Worker
1756*e1eccf28SAndroid Build Coastguard Worker/* void rsdIntrinsicBlurU1_K(
1757*e1eccf28SAndroid Build Coastguard Worker *                  void *out,      // x0
1758*e1eccf28SAndroid Build Coastguard Worker *                  void *in,       // x1
1759*e1eccf28SAndroid Build Coastguard Worker *                  size_t w,       // x2
1760*e1eccf28SAndroid Build Coastguard Worker *                  size_t h,       // x3
1761*e1eccf28SAndroid Build Coastguard Worker *                  size_t p,       // x4
1762*e1eccf28SAndroid Build Coastguard Worker *                  size_t x,       // x5
1763*e1eccf28SAndroid Build Coastguard Worker *                  size_t y,       // x6
1764*e1eccf28SAndroid Build Coastguard Worker *                  size_t count,   // x7
1765*e1eccf28SAndroid Build Coastguard Worker *                  size_t r,       // [sp]
1766*e1eccf28SAndroid Build Coastguard Worker *                  uint16_t *tab); // [sp,#8]
 *
 * Exported entry point.  Saves x19, x30 and the low halves of v8-v15,
 * rearranges the C arguments into the register layout used by the
 * convolve1 routines (x2 = pitch, x3 = count, x4 = inlen, x5 = r, x6 = rup,
 * x7 = rdn, x8 = x, x13 = -pitch, x15/x19 = top/bottom row pointers), loads
 * the first 64 bytes of tab into v0-v3, and branches to the convolve1
 * routine for the smallest listed radius that is >= r (convolve1_25 if
 * none is).  That routine returns to the epilogue through x30.
1767*e1eccf28SAndroid Build Coastguard Worker */
1768*e1eccf28SAndroid Build Coastguard WorkerENTRY(rsdIntrinsicBlurU1_K)
1769*e1eccf28SAndroid Build Coastguard Worker            stp         x19,x30, [sp, #-16]!    // x19 and x30 are both overwritten below
1770*e1eccf28SAndroid Build Coastguard Worker            sub         x8, sp, #32             // address of the upper half of the vector save area
1771*e1eccf28SAndroid Build Coastguard Worker            sub         sp, sp, #64
1772*e1eccf28SAndroid Build Coastguard Worker            st1         {v8.1d - v11.1d}, [sp]  // save the low 64 bits of v8-v11
1773*e1eccf28SAndroid Build Coastguard Worker            st1         {v12.1d - v15.1d}, [x8] // save the low 64 bits of v12-v15
1774*e1eccf28SAndroid Build Coastguard Worker            mov         x8, x5          // x
1775*e1eccf28SAndroid Build Coastguard Worker            ldr         w5, [sp,#80]    // r: first stack argument (80 = 16 + 64 pushed above); low 32 bits only
1776*e1eccf28SAndroid Build Coastguard Worker            sub         x9, x2, x8      // w - x
1777*e1eccf28SAndroid Build Coastguard Worker            sub         x10, x3, x6     // h - y
1778*e1eccf28SAndroid Build Coastguard Worker            mov         x2, x4          // pitch
1779*e1eccf28SAndroid Build Coastguard Worker            mov         x3, x7          // count
1780*e1eccf28SAndroid Build Coastguard Worker            sub         x7, x10, #1     // h - y - 1
1781*e1eccf28SAndroid Build Coastguard Worker            mov         x4, x9          // inlen = (w - x)
1782*e1eccf28SAndroid Build Coastguard Worker
1783*e1eccf28SAndroid Build Coastguard Worker            ldr         x12, [sp, #88] // tab: second stack argument
1784*e1eccf28SAndroid Build Coastguard Worker
1785*e1eccf28SAndroid Build Coastguard Worker            add         x1, x1, x8      // src += x
1786*e1eccf28SAndroid Build Coastguard Worker
1787*e1eccf28SAndroid Build Coastguard Worker            cmp         x6, x5
1788*e1eccf28SAndroid Build Coastguard Worker            csel        x6, x5, x6, hs  // rup = min(r, y)
1789*e1eccf28SAndroid Build Coastguard Worker            cmp         x7, x5
1790*e1eccf28SAndroid Build Coastguard Worker            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)
1791*e1eccf28SAndroid Build Coastguard Worker
1792*e1eccf28SAndroid Build Coastguard Worker            sub         x13, xzr, x2    // -pitch
1793*e1eccf28SAndroid Build Coastguard Worker            msub        x15, x2, x6, x1 // top row = src - pitch * rup
1794*e1eccf28SAndroid Build Coastguard Worker            madd        x19, x2, x7, x1 // bottom row = src + pitch * rdn
1795*e1eccf28SAndroid Build Coastguard Worker
1796*e1eccf28SAndroid Build Coastguard Worker            ld1         {v0.8h,v1.8h}, [x12], #32       // v0-v3 = first 32 halfwords of tab
1797*e1eccf28SAndroid Build Coastguard Worker            ld1         {v2.8h,v3.8h}, [x12], #32
1798*e1eccf28SAndroid Build Coastguard Worker
1799*e1eccf28SAndroid Build Coastguard Worker            adr         x30, 1f         // the convolve routine is entered by plain branch and returns here
1800*e1eccf28SAndroid Build Coastguard Worker  .irp r, TUNED_LIST1
1801*e1eccf28SAndroid Build Coastguard Worker            cmp         x5, #\r
1802*e1eccf28SAndroid Build Coastguard Worker            bls         convolve1_\r
1803*e1eccf28SAndroid Build Coastguard Worker  .endr
1804*e1eccf28SAndroid Build Coastguard Worker            b           convolve1_25    // r is above every listed radius
1805*e1eccf28SAndroid Build Coastguard Worker
1806*e1eccf28SAndroid Build Coastguard Worker1:          ld1         {v8.1d - v11.1d}, [sp], #32     // restore the saved halves of v8-v15
1807*e1eccf28SAndroid Build Coastguard Worker            ld1         {v12.1d - v15.1d}, [sp], #32
1808*e1eccf28SAndroid Build Coastguard Worker            ldp         x19,x30, [sp], #16              // restore x19 and the real return address
1809*e1eccf28SAndroid Build Coastguard Worker            ret
1810*e1eccf28SAndroid Build Coastguard WorkerEND(rsdIntrinsicBlurU1_K)
1811*e1eccf28SAndroid Build Coastguard Worker
1812*e1eccf28SAndroid Build Coastguard Worker/* void rsdIntrinsicBlurU4_K(
1813*e1eccf28SAndroid Build Coastguard Worker *                  void *out,      // x0
1814*e1eccf28SAndroid Build Coastguard Worker *                  void *in,       // x1
1815*e1eccf28SAndroid Build Coastguard Worker *                  size_t w,       // x2
1816*e1eccf28SAndroid Build Coastguard Worker *                  size_t h,       // x3
1817*e1eccf28SAndroid Build Coastguard Worker *                  size_t p,       // x4
1818*e1eccf28SAndroid Build Coastguard Worker *                  size_t x,       // x5
1819*e1eccf28SAndroid Build Coastguard Worker *                  size_t y,       // x6
1820*e1eccf28SAndroid Build Coastguard Worker *                  size_t count,   // x7
1821*e1eccf28SAndroid Build Coastguard Worker *                  size_t r,       // [sp]
1822*e1eccf28SAndroid Build Coastguard Worker *                  uint16_t *tab); // [sp,#8]
 *
 * Exported entry point.  Saves x19, x30 and the low halves of v8-v15,
 * rearranges the C arguments into the register layout used by the
 * convolve4 routines, loads the first 64 bytes of tab into v0-v3, and
 * branches to the convolve4 routine for the smallest listed radius that is
 * >= r (convolve4_25 if none is).  The horizontal quantities x, w and count
 * are multiplied by 4 on the way in; p, y, h and r are used unscaled.
1823*e1eccf28SAndroid Build Coastguard Worker */
1824*e1eccf28SAndroid Build Coastguard WorkerENTRY(rsdIntrinsicBlurU4_K)
1825*e1eccf28SAndroid Build Coastguard Worker            stp         x19,x30, [sp, #-16]!    // x19 and x30 are both overwritten below
1826*e1eccf28SAndroid Build Coastguard Worker            sub         x8, sp, #32             // address of the upper half of the vector save area
1827*e1eccf28SAndroid Build Coastguard Worker            sub         sp, sp, #64
1828*e1eccf28SAndroid Build Coastguard Worker            st1         {v8.1d - v11.1d}, [sp]  // save the low 64 bits of v8-v11
1829*e1eccf28SAndroid Build Coastguard Worker            st1         {v12.1d - v15.1d}, [x8] // save the low 64 bits of v12-v15
1830*e1eccf28SAndroid Build Coastguard Worker            lsl         x8, x5, #2      // x * 4
1831*e1eccf28SAndroid Build Coastguard Worker            lsl         x2, x2, #2      // w * 4
1832*e1eccf28SAndroid Build Coastguard Worker            ldr         w5, [sp,#80]    // r: first stack argument (80 = 16 + 64 pushed above); low 32 bits only
1833*e1eccf28SAndroid Build Coastguard Worker            sub         x9, x2, x8      // (w - x) * 4
1834*e1eccf28SAndroid Build Coastguard Worker            sub         x10, x3, x6     // h - y
1835*e1eccf28SAndroid Build Coastguard Worker            mov         x2, x4          // pitch
1836*e1eccf28SAndroid Build Coastguard Worker            lsl         x3, x7, #2      // count * 4
1837*e1eccf28SAndroid Build Coastguard Worker            sub         x7, x10, #1     // h - y - 1
1838*e1eccf28SAndroid Build Coastguard Worker            mov         x4, x9          // inlen = (w - x) * 4
1839*e1eccf28SAndroid Build Coastguard Worker
1840*e1eccf28SAndroid Build Coastguard Worker            ldr         x12, [sp, #88]  // tab: second stack argument
1841*e1eccf28SAndroid Build Coastguard Worker
1842*e1eccf28SAndroid Build Coastguard Worker            add         x1, x1, x8      // in += x * 4
1843*e1eccf28SAndroid Build Coastguard Worker
1844*e1eccf28SAndroid Build Coastguard Worker            cmp         x6, x5
1845*e1eccf28SAndroid Build Coastguard Worker            csel        x6, x5, x6, hs  // rup = min(r, y)
1846*e1eccf28SAndroid Build Coastguard Worker            cmp         x7, x5
1847*e1eccf28SAndroid Build Coastguard Worker            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)
1848*e1eccf28SAndroid Build Coastguard Worker
1849*e1eccf28SAndroid Build Coastguard Worker
1850*e1eccf28SAndroid Build Coastguard Worker            sub         x13, xzr, x2    // -pitch
1851*e1eccf28SAndroid Build Coastguard Worker            msub        x15, x2, x6, x1 // top row = in - pitch * rup
1852*e1eccf28SAndroid Build Coastguard Worker            madd        x19, x2, x7, x1 // bottom row = in + pitch * rdn
1853*e1eccf28SAndroid Build Coastguard Worker
1854*e1eccf28SAndroid Build Coastguard Worker            ld1         {v0.8h,v1.8h}, [x12], #32       // v0-v3 = first 32 halfwords of tab
1855*e1eccf28SAndroid Build Coastguard Worker            ld1         {v2.8h,v3.8h}, [x12], #32
1856*e1eccf28SAndroid Build Coastguard Worker
1857*e1eccf28SAndroid Build Coastguard Worker            adr         x30, 1f         // the convolve routine is entered by plain branch and returns here
1858*e1eccf28SAndroid Build Coastguard Worker  .irp r, TUNED_LIST4
1859*e1eccf28SAndroid Build Coastguard Worker            cmp         x5, #\r
1860*e1eccf28SAndroid Build Coastguard Worker            bls         convolve4_\r
1861*e1eccf28SAndroid Build Coastguard Worker  .endr
1862*e1eccf28SAndroid Build Coastguard Worker            b           convolve4_25    // r is above every listed radius
1863*e1eccf28SAndroid Build Coastguard Worker
1864*e1eccf28SAndroid Build Coastguard Worker1:          ld1         {v8.1d - v11.1d}, [sp], #32     // restore the saved halves of v8-v15
1865*e1eccf28SAndroid Build Coastguard Worker            ld1         {v12.1d - v15.1d}, [sp], #32
1866*e1eccf28SAndroid Build Coastguard Worker            ldp         x19,x30, [sp], #16              // restore x19 and the real return address
1867*e1eccf28SAndroid Build Coastguard Worker            ret
1868*e1eccf28SAndroid Build Coastguard WorkerEND(rsdIntrinsicBlurU4_K)
1869