/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

#define REST_UNIT_STRIDE (400)

.macro MADD_HU_BU in0, in1, out0, out1
    vsllwil.hu.bu vr12,     \in0,     0
    vexth.hu.bu   vr13,     \in0
    vmadd.h       \out0,    vr12,     \in1
    vmadd.h       \out1,    vr13,     \in1
.endm

const wiener_shuf
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
endconst

/*
void wiener_filter_h_lsx(int32_t *hor_ptr,
                         uint8_t *tmp_ptr,
                         const int16_t filterh[8],
                         const int w, const int h)
*/
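/*
Scalar view of one output sample, reconstructed from the vector loop below
(8bpc case: the centre tap additionally gets a fixed weight of 128, the bias
is 1 << 14, and clamp() is shorthand for clipping to the given range):

    sum = (1 << 14) + tmp_ptr[i + 3] * 128;
    for (k = 0; k < 7; k++)
        sum += tmp_ptr[i + k] * filterh[k];
    hor_ptr[i] = clamp((sum + 4) >> 3, 0, (1 << 13) - 1);
*/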
function wiener_filter_h_8bpc_lsx
    addi.d        sp,       sp,       -40
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    li.w          t7,       1<<14          // clip_limit

    la.local      t1,       wiener_shuf
    vld           vr4,      t1,       0
    vld           vr14,     a2,       0    // filter[0][k]
    vreplvei.h    vr21,     vr14,     0
    vreplvei.h    vr22,     vr14,     1
    vreplvei.h    vr23,     vr14,     2
    vreplvei.h    vr24,     vr14,     3
    vreplvei.h    vr25,     vr14,     4
    vreplvei.h    vr26,     vr14,     5
    vreplvei.h    vr27,     vr14,     6
    vreplgr2vr.w  vr0,      t7

.WIENER_FILTER_H_H:
    addi.w        a4,       a4,       -1    // h
    addi.w        t0,       a3,       0     // w
    addi.d        t1,       a1,       0     // tmp_ptr
    addi.d        t2,       a0,       0     // hor_ptr

.WIENER_FILTER_H_W:
    addi.w        t0,       t0,       -16
    vld           vr5,      t1,       0
    vld           vr13,     t1,       16

    vsubi.bu      vr14,     vr4,      2
    vsubi.bu      vr15,     vr4,      1
    vshuf.b       vr6,      vr13,     vr5,     vr14  // 1 ... 8, 9 ... 16
    vshuf.b       vr7,      vr13,     vr5,     vr15  // 2 ... 9, 10 ... 17
    vshuf.b       vr8,      vr13,     vr5,     vr4   // 3 ... 10, 11 ... 18
    vaddi.bu      vr14,     vr4,      1
    vaddi.bu      vr15,     vr4,      2
    vshuf.b       vr9,      vr13,     vr5,     vr14  // 4 ... 11, 12 ... 19
    vshuf.b       vr10,     vr13,     vr5,     vr15  // 5 ... 12, 13 ... 20
    vaddi.bu      vr14,     vr4,      3
    vshuf.b       vr11,     vr13,     vr5,     vr14  // 6 ... 13, 14 ... 21

    vsllwil.hu.bu vr15,     vr8,      0    //  3  4  5  6  7  8  9 10
    vexth.hu.bu   vr16,     vr8            // 11 12 13 14 15 16 17 18
    vsllwil.wu.hu vr17,     vr15,     7    //  3  4  5  6
    vexth.wu.hu   vr18,     vr15           //  7  8  9 10
    vsllwil.wu.hu vr19,     vr16,     7    // 11 12 13 14
    vexth.wu.hu   vr20,     vr16           // 15 16 17 18
    vslli.w       vr18,     vr18,     7
    vslli.w       vr20,     vr20,     7
    vxor.v        vr15,     vr15,     vr15
    vxor.v        vr14,     vr14,     vr14

    MADD_HU_BU    vr5,   vr21,  vr14,  vr15
    MADD_HU_BU    vr6,   vr22,  vr14,  vr15
    MADD_HU_BU    vr7,   vr23,  vr14,  vr15
    MADD_HU_BU    vr8,   vr24,  vr14,  vr15
    MADD_HU_BU    vr9,   vr25,  vr14,  vr15
    MADD_HU_BU    vr10,  vr26,  vr14,  vr15
    MADD_HU_BU    vr11,  vr27,  vr14,  vr15

    vsllwil.w.h   vr5,      vr14,     0   //  0  1  2  3
    vexth.w.h     vr6,      vr14          //  4  5  6  7
    vsllwil.w.h   vr7,      vr15,     0   //  8  9 10 11
    vexth.w.h     vr8,      vr15          // 12 13 14 15
    vadd.w        vr17,     vr17,     vr5
    vadd.w        vr18,     vr18,     vr6
    vadd.w        vr19,     vr19,     vr7
    vadd.w        vr20,     vr20,     vr8
    vadd.w        vr17,     vr17,     vr0
    vadd.w        vr18,     vr18,     vr0
    vadd.w        vr19,     vr19,     vr0
    vadd.w        vr20,     vr20,     vr0

    vsrli.w       vr1,      vr0,      1
    vsubi.wu      vr1,      vr1,      1
    vxor.v        vr3,      vr3,      vr3
    vsrari.w      vr17,     vr17,     3
    vsrari.w      vr18,     vr18,     3
    vsrari.w      vr19,     vr19,     3
    vsrari.w      vr20,     vr20,     3
    vclip.w       vr17,     vr17,     vr3,     vr1
    vclip.w       vr18,     vr18,     vr3,     vr1
    vclip.w       vr19,     vr19,     vr3,     vr1
    vclip.w       vr20,     vr20,     vr3,     vr1

    vst           vr17,     t2,       0
    vst           vr18,     t2,       16
    vst           vr19,     t2,       32
    vst           vr20,     t2,       48
    addi.d        t1,       t1,       16
    addi.d        t2,       t2,       64
    blt           zero,     t0,       .WIENER_FILTER_H_W

    addi.d        a1,       a1,       REST_UNIT_STRIDE
    addi.d        a0,       a0,       (REST_UNIT_STRIDE << 2)
    bnez          a4,       .WIENER_FILTER_H_H

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    addi.d        sp,       sp,       40
endfunc

.macro APPLY_FILTER in0, in1, in2
    alsl.d         t7,      \in0,     \in1,    2
    vld            vr10,    t7,       0
    vld            vr11,    t7,       16
    vld            vr12,    t7,       32
    vld            vr13,    t7,       48
    vmadd.w        vr14,    vr10,     \in2
    vmadd.w        vr15,    vr11,     \in2
    vmadd.w        vr16,    vr12,     \in2
    vmadd.w        vr17,    vr13,     \in2
.endm

.macro wiener_filter_v_8bpc_core_lsx
    vreplgr2vr.w  vr14,     t6
    vreplgr2vr.w  vr15,     t6
    vreplgr2vr.w  vr16,     t6
    vreplgr2vr.w  vr17,     t6

    addi.w        t7,       t2,       0      // j + index k
    mul.w         t7,       t7,       t8     // (j + index) * REST_UNIT_STRIDE
    add.w         t7,       t7,       t4     // (j + index) * REST_UNIT_STRIDE + i

    APPLY_FILTER  t7, a2, vr2
    APPLY_FILTER  t8, t7, vr3
    APPLY_FILTER  t8, t7, vr4
    APPLY_FILTER  t8, t7, vr5
    APPLY_FILTER  t8, t7, vr6
    APPLY_FILTER  t8, t7, vr7
    APPLY_FILTER  t8, t7, vr8
    vssrarni.hu.w vr15,     vr14,     11
    vssrarni.hu.w vr17,     vr16,     11
    vssrlni.bu.h  vr17,     vr15,     0
.endm

/*
void wiener_filter_v_lsx(uint8_t *p,
                         const ptrdiff_t p_stride,
                         const int32_t *hor,
                         const int16_t filterv[8],
                         const int w, const int h)
*/
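/*
Scalar view of one output pixel, reconstructed from the core macro above
(the accumulator starts at -(1 << 18); the final 11-bit rounded shift is
saturated to the 8-bit pixel range, written here as clamp()):

    sum = -(1 << 18);
    for (k = 0; k < 7; k++)
        sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
    p[j * p_stride + i] = clamp((sum + (1 << 10)) >> 11, 0, 255);
*/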
function wiener_filter_v_8bpc_lsx
    li.w          t6,       -(1 << 18)

    li.w          t8,       REST_UNIT_STRIDE
    ld.h          t0,       a3,       0
    ld.h          t1,       a3,       2
    vreplgr2vr.w  vr2,      t0
    vreplgr2vr.w  vr3,      t1
    ld.h          t0,       a3,       4
    ld.h          t1,       a3,       6
    vreplgr2vr.w  vr4,      t0
    vreplgr2vr.w  vr5,      t1
    ld.h          t0,       a3,       8
    ld.h          t1,       a3,       10
    vreplgr2vr.w  vr6,      t0
    vreplgr2vr.w  vr7,      t1
    ld.h          t0,       a3,       12
    vreplgr2vr.w  vr8,      t0

    andi          t1,       a4,       0xf
    sub.w         t0,       a4,       t1    // w-w%16
    or            t2,       zero,     zero  // j
    or            t4,       zero,     zero
    beqz          t0,       .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_H:
    andi          t1,       a4,       0xf
    add.d         t3,       zero,     a0     // p
    or            t4,       zero,     zero   // i

.WIENER_FILTER_V_W:

    wiener_filter_v_8bpc_core_lsx

    mul.w         t5,       t2,       a1   // j * stride
    add.w         t5,       t5,       t4   // j * stride + i
    add.d         t3,       a0,       t5
    addi.w        t4,       t4,       16
    vst           vr17,     t3,       0
    bne           t0,       t4,       .WIENER_FILTER_V_W

    beqz          t1,       .WIENER_FILTER_V_W_EQ16

    wiener_filter_v_8bpc_core_lsx

    addi.d        t3,       t3,       16
    andi          t1,       a4,       0xf

.WIENER_FILTER_V_ST_REM:
    vstelm.b      vr17,     t3,       0,    0
    vbsrl.v       vr17,     vr17,     1
    addi.d        t3,       t3,       1
    addi.w        t1,       t1,       -1
    bnez          t1,       .WIENER_FILTER_V_ST_REM
.WIENER_FILTER_V_W_EQ16:
    addi.w        t2,       t2,       1
    blt           t2,       a5,       .WIENER_FILTER_V_H
    b              .WIENER_FILTER_V_END

.WIENER_FILTER_V_W_LT16:
    andi          t1,       a4,       0xf
    add.d         t3,       zero,     a0

    wiener_filter_v_8bpc_core_lsx

    mul.w         t5,       t2,       a1   // j * stride
    add.d         t3,       a0,       t5

.WIENER_FILTER_V_ST_REM_1:
    vstelm.b      vr17,     t3,       0,    0
    vbsrl.v       vr17,     vr17,     1
    addi.d        t3,       t3,       1
    addi.w        t1,       t1,       -1
    bnez          t1,       .WIENER_FILTER_V_ST_REM_1

    addi.w        t2,       t2,       1
    blt           t2,       a5,       .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_END:
endfunc

/*
void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src,
               const int w, const int h)
*/
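/*
First (vertical) pass of the 3x3 box sums: each output position below gets
the sum of three vertically adjacent source pixels in sum[] and of their
squares in sumsq[], 16 columns per iteration; boxsum3_v then completes the
box horizontally.
*/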
function boxsum3_h_8bpc_lsx
    addi.d         a2,      a2,      REST_UNIT_STRIDE
    li.w           t0,      1
    addi.w         a3,      a3,      -2
    addi.w         a4,      a4,      -4

.LBS3_H_H:
    alsl.d         t1,      t0,      a1,    1     // sum_v    *sum_v = sum + x
    alsl.d         t2,      t0,      a0,    2     // sumsq_v  *sumsq_v = sumsq + x
    add.d          t3,      t0,      a2           // s
    addi.w         t5,      a3,      0
.LBS3_H_W:
    vld            vr0,     t3,      0
    vld            vr1,     t3,      REST_UNIT_STRIDE
    vld            vr2,     t3,      (REST_UNIT_STRIDE<<1)

    vilvl.b        vr3,     vr1,     vr0
    vhaddw.hu.bu   vr4,     vr3,     vr3
    vilvh.b        vr5,     vr1,     vr0
    vhaddw.hu.bu   vr6,     vr5,     vr5
    vsllwil.hu.bu  vr7,     vr2,     0
    vexth.hu.bu    vr8,     vr2
    // sum_v
    vadd.h         vr4,     vr4,     vr7
    vadd.h         vr6,     vr6,     vr8
    vst            vr4,     t1,      REST_UNIT_STRIDE<<1
    vst            vr6,     t1,      (REST_UNIT_STRIDE<<1)+16
    addi.d         t1,      t1,      32
    // sumsq
    vmulwev.h.bu   vr9,     vr3,     vr3
    vmulwod.h.bu   vr10,    vr3,     vr3
    vmulwev.h.bu   vr11,    vr5,     vr5
    vmulwod.h.bu   vr12,    vr5,     vr5
    vaddwev.w.hu   vr13,    vr10,    vr9
    vaddwod.w.hu   vr14,    vr10,    vr9
    vaddwev.w.hu   vr15,    vr12,    vr11
    vaddwod.w.hu   vr16,    vr12,    vr11
    vmaddwev.w.hu  vr13,    vr7,     vr7
    vmaddwod.w.hu  vr14,    vr7,     vr7
    vmaddwev.w.hu  vr15,    vr8,     vr8
    vmaddwod.w.hu  vr16,    vr8,     vr8
    vilvl.w        vr9,     vr14,    vr13
    vilvh.w        vr10,    vr14,    vr13
    vilvl.w        vr11,    vr16,    vr15
    vilvh.w        vr12,    vr16,    vr15
    vst            vr9,     t2,      REST_UNIT_STRIDE<<2
    vst            vr10,    t2,      (REST_UNIT_STRIDE<<2)+16
    vst            vr11,    t2,      (REST_UNIT_STRIDE<<2)+32
    vst            vr12,    t2,      (REST_UNIT_STRIDE<<2)+48

    addi.d         t2,      t2,      64
    addi.w         t5,      t5,      -16
    addi.d         t3,      t3,      16
    blt            zero,    t5,      .LBS3_H_W

    addi.d         a0,      a0,      REST_UNIT_STRIDE<<2
    addi.d         a1,      a1,      REST_UNIT_STRIDE<<1
    addi.d         a2,      a2,      REST_UNIT_STRIDE
    addi.d         a4,      a4,      -1
    blt            zero,    a4,      .LBS3_H_H
endfunc

/*
void boxsum3_v(int32_t *sumsq, coef *sum,
               const int w, const int h)
*/
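/*
Second (horizontal) pass: each output is the sum of three horizontally
adjacent entries produced by boxsum3_h above; t7/t8 carry the last sum and
sumsq lanes across 8-element chunks so the sliding window stays contiguous.
*/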
function boxsum3_v_8bpc_lsx
    addi.d         a0,      a0,      (REST_UNIT_STRIDE<<2)
    addi.d         a1,      a1,      (REST_UNIT_STRIDE<<1)
    addi.w         a3,      a3,      -4
    addi.w         a2,      a2,      -4

.LBS3_V_H:
    sub.w          t3,      a2,      zero
    addi.d         t0,      a0,      4
    addi.d         t1,      a1,      2
    addi.d         t5,      a0,      8
    addi.d         t6,      a1,      4

    vld            vr0,      t1,      0   // a 0 1 2 3 4 5 6 7
    vld            vr1,      t1,      2   // b 1 2 3 4 5 6 7 8
    vld            vr2,      t1,      4   // c 2 3 4 5 6 7 8 9
    vld            vr3,      t0,      0   // a2 0 1 2 3
    vld            vr4,      t0,      4   // b2 1 2 3 4
    vld            vr5,      t0,      8   // c2 2 3 4 5
    vld            vr6,      t0,      16  //    3 4 5 6
    vld            vr7,      t0,      20  //    4 5 6 7
    vld            vr8,      t0,      24  //    5 6 7 8
    vadd.h         vr9,      vr0,     vr1
    vadd.w         vr10,     vr3,     vr4
    vadd.w         vr11,     vr6,     vr7
    vadd.h         vr9,      vr9,     vr2
    vadd.w         vr10,     vr10,    vr5
    vadd.w         vr11,     vr11,    vr8
    vpickve2gr.h   t7,       vr2,     6
    vpickve2gr.w   t8,       vr8,     2
    vst            vr9,      t6,      0
    vst            vr10,     t5,      0
    vst            vr11,     t5,      16

    addi.d         t1,       t1,      16
    addi.d         t0,       t0,      32
    addi.d         t5,       t5,      32
    addi.d         t6,       t6,      16
    addi.d         t3,       t3,      -8
    ble            t3,       zero,    .LBS3_V_H0

.LBS3_V_W8:
    vld            vr0,      t1,      0   // a 0 1 2 3 4 5 6 7
    vld            vr1,      t1,      2   // b 1 2 3 4 5 6 7 8
    vld            vr2,      t1,      4   // c 2 3 4 5 6 7 8 9
    vld            vr3,      t0,      0   // a2 0 1 2 3
    vld            vr4,      t0,      4   // b2 1 2 3 4
    vld            vr5,      t0,      8   // c2 2 3 4 5
    vld            vr6,      t0,      16  //    3 4 5 6
    vld            vr7,      t0,      20  //    4 5 6 7
    vld            vr8,      t0,      24  //    5 6 7 8
    vinsgr2vr.h    vr0,      t7,      0
    vinsgr2vr.w    vr3,      t8,      0
    vpickve2gr.h   t7,       vr2,     6
    vpickve2gr.w   t8,       vr8,     2
    vadd.h         vr9,      vr0,     vr1
    vadd.w         vr10,     vr3,     vr4
    vadd.w         vr11,     vr6,     vr7
    vadd.h         vr9,      vr9,     vr2
    vadd.w         vr10,     vr10,    vr5
    vadd.w         vr11,     vr11,    vr8
    vst            vr9,      t6,      0
    vst            vr10,     t5,      0
    vst            vr11,     t5,      16
    addi.d         t3,       t3,      -8
    addi.d         t1,       t1,      16
    addi.d         t0,       t0,      32
    addi.d         t5,       t5,      32
    addi.d         t6,       t6,      16
    blt            zero,     t3,      .LBS3_V_W8

.LBS3_V_H0:
    addi.d         a1,       a1,      REST_UNIT_STRIDE<<1
    addi.d         a0,       a0,      REST_UNIT_STRIDE<<2
    addi.w         a3,       a3,      -1
    bnez           a3,       .LBS3_V_H
endfunc

/*
boxsum3_selfguided_filter(int32_t *sumsq, coef *sum,
                          const int w, const int h,
                          const unsigned s)
*/
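/*
Scalar view of one element, reconstructed from the vector loop below
(n = 9 for the 3x3 box, 455 is the one_by_x constant for that box size,
s arrives in a4; min()/max() are plain integer helpers):

    p = max(AA[i] * 9 - BB[i] * BB[i], 0);
    z = (p * s + (1 << 19)) >> 20;
    x = dav1d_sgr_x_by_x[min(z, 255)];
    AA[i] = (x * BB[i] * 455 + (1 << 11)) >> 12;
    BB[i] = 256 - x;
*/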
function boxsum3_sgf_h_8bpc_lsx
    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a0,       a0,        12   // AA
    addi.d        a1,       a1,        REST_UNIT_STRIDE<<1
    addi.d        a1,       a1,        6    // BB
    la.local      t8,       dav1d_sgr_x_by_x
    li.w          t6,       455
    vreplgr2vr.w  vr20,     t6
    li.w          t6,       255
    vreplgr2vr.w  vr22,     t6
    vaddi.wu      vr21,     vr22,      1  // 256
    vreplgr2vr.w  vr6,      a4
    vldi          vr19,     0x809
    addi.w        a2,       a2,        2  // w + 2
    addi.w        a3,       a3,        2  // h + 2

.LBS3SGF_H_H:
    addi.w        t2,       a2,        0
    addi.d        t0,       a0,        -4
    addi.d        t1,       a1,        -2

.LBS3SGF_H_W:
    addi.w        t2,       t2,        -8
    vld           vr0,      t0,        0   // AA[i]
    vld           vr1,      t0,        16
    vld           vr2,      t1,        0   // BB[i]

    vmul.w        vr4,      vr0,       vr19 // a * n
    vmul.w        vr5,      vr1,       vr19 // a * n
    vsllwil.w.h   vr9,      vr2,       0
    vexth.w.h     vr10,     vr2
    vmsub.w       vr4,      vr9,       vr9   // p
    vmsub.w       vr5,      vr10,      vr10   // p
    vmaxi.w       vr4,      vr4,       0
    vmaxi.w       vr5,      vr5,       0    // p
    vmul.w        vr4,      vr4,       vr6  // p * s
    vmul.w        vr5,      vr5,       vr6  // p * s
    vsrlri.w      vr4,      vr4,       20
    vsrlri.w      vr5,      vr5,       20   // z
    vmin.w        vr4,      vr4,       vr22
    vmin.w        vr5,      vr5,       vr22

    vpickve2gr.w  t6,       vr4,       0
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        0
    vpickve2gr.w  t6,       vr4,       1
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        1
    vpickve2gr.w  t6,       vr4,       2
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        2
    vpickve2gr.w  t6,       vr4,       3
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        3

    vpickve2gr.w  t6,       vr5,       0
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        0
    vpickve2gr.w  t6,       vr5,       1
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        1
    vpickve2gr.w  t6,       vr5,       2
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        2
    vpickve2gr.w  t6,       vr5,       3
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        3     // x

    vmul.w        vr9,      vr7,       vr9   // x * BB[i]
    vmul.w        vr10,     vr8,       vr10
    vmul.w        vr9,      vr9,       vr20  // x * BB[i] * sgr_one_by_x
    vmul.w        vr10,     vr10,      vr20
    vsrlri.w      vr9,      vr9,       12
    vsrlri.w      vr10,     vr10,      12
    vsub.w        vr7,      vr21,      vr7
    vsub.w        vr8,      vr21,      vr8
    vpickev.h     vr8,      vr8,       vr7

    vst           vr9,      t0,        0
    vst           vr10,     t0,        16
    vst           vr8,      t1,        0
    addi.d        t0,       t0,        32
    addi.d        t1,       t1,        16
    blt           zero,     t2,        .LBS3SGF_H_W

    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a1,       a1,        REST_UNIT_STRIDE<<1
    addi.w        a3,       a3,        -1
    bnez          a3,       .LBS3SGF_H_H
endfunc

/*
boxsum3_selfguided_filter(coef *dst, pixel *src,
                  int32_t *sumsq, coef *sum,
                  const int w, const int h)
*/
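/*
Scalar view of one output coefficient, reconstructed from the vector loop
below (4 and 3 are the 3x3 neighbourhood weights; A is the sumsq plane and
B the sum plane as rewritten by boxsum3_sgf_h; STRIDE = REST_UNIT_STRIDE):

    a = 4 * (B[i - STRIDE] + B[i - 1] + B[i] + B[i + 1] + B[i + STRIDE]) +
        3 * (B[i - 1 - STRIDE] + B[i + 1 - STRIDE] +
             B[i - 1 + STRIDE] + B[i + 1 + STRIDE]);
    b = the same weighting applied to A[];
    dst[i] = (b + a * src[i] + (1 << 8)) >> 9;
*/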
function boxsum3_sgf_v_8bpc_lsx
    addi.d        a1,        a1,      (3*REST_UNIT_STRIDE+3)   // src
    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
    addi.d        a2,        a2,      (REST_UNIT_STRIDE<<2)+12
    addi.d        a3,        a3,      REST_UNIT_STRIDE<<2
    addi.d        a3,        a3,      6
.LBS3SGF_V_H:
    // A int32_t *sumsq
    addi.d        t0,        a2,      -(REST_UNIT_STRIDE<<2)   // -stride
    addi.d        t1,        a2,      0    // sumsq
    addi.d        t2,        a2,      REST_UNIT_STRIDE<<2      // +stride
    addi.d        t6,        a1,      0
    addi.w        t7,        a4,      0
    addi.d        t8,        a0,      0
    // B coef *sum
    addi.d        t3,        a3,      -(REST_UNIT_STRIDE<<1)   // -stride
    addi.d        t4,        a3,      0
    addi.d        t5,        a3,      REST_UNIT_STRIDE<<1

.LBS3SGF_V_W:
    vld           vr0,       t0,      0   // P[i - REST_UNIT_STRIDE]
    vld           vr1,       t0,      16
    vld           vr2,       t1,      -4  // P[i-1]  -1 0 1 2
    vld           vr3,       t1,      12           // 3 4 5 6
    vld           vr4,       t2,      0   // P[i + REST_UNIT_STRIDE]
    vld           vr5,       t2,      16
    vld           vr6,       t1,      0   // p[i]     0 1 2 3
    vld           vr7,       t1,      16           // 4 5 6 7
    vld           vr8,       t1,      4   // p[i+1]   1 2 3 4
    vld           vr9,       t1,      20           // 5 6 7 8

    vld           vr10,      t0,      -4  // P[i - 1 - REST_UNIT_STRIDE]
    vld           vr11,      t0,      12
    vld           vr12,      t2,      -4  // P[i - 1 + REST_UNIT_STRIDE]
    vld           vr13,      t2,      12
    vld           vr14,      t0,      4   // P[i + 1 - REST_UNIT_STRIDE]
    vld           vr15,      t0,      20
    vld           vr16,      t2,      4   // P[i + 1 + REST_UNIT_STRIDE]
    vld           vr17,      t2,      20

    vadd.w        vr0,       vr2,     vr0
    vadd.w        vr4,       vr6,     vr4
    vadd.w        vr0,       vr0,     vr8
    vadd.w        vr20,      vr0,     vr4
    vslli.w       vr20,      vr20,    2      // 0 1 2 3
    vadd.w        vr0,       vr1,     vr3
    vadd.w        vr4,       vr5,     vr7
    vadd.w        vr0,       vr0,     vr9
    vadd.w        vr21,      vr0,     vr4
    vslli.w       vr21,      vr21,    2      // 4 5 6 7
    vadd.w        vr12,      vr10,    vr12
    vadd.w        vr16,      vr14,    vr16
    vadd.w        vr22,      vr12,    vr16
    vslli.w       vr23,      vr22,    1
    vadd.w        vr22,      vr23,    vr22
    vadd.w        vr11,      vr11,    vr13
    vadd.w        vr15,      vr15,    vr17
    vadd.w        vr0,       vr11,    vr15
    vslli.w       vr23,      vr0,     1
    vadd.w        vr23,      vr23,    vr0
    vadd.w        vr20,      vr20,    vr22   // b
    vadd.w        vr21,      vr21,    vr23

    // B coef *sum
    vld           vr0,       t3,      0   // P[i - REST_UNIT_STRIDE]
    vld           vr1,       t4,      -2  // p[i - 1]
    vld           vr2,       t4,      0   // p[i]
    vld           vr3,       t4,      2   // p[i + 1]
    vld           vr4,       t5,      0   // P[i + REST_UNIT_STRIDE]
    vld           vr5,       t3,      -2  // P[i - 1 - REST_UNIT_STRIDE]
    vld           vr6,       t5,      -2  // P[i - 1 + REST_UNIT_STRIDE]
    vld           vr7,       t3,      2   // P[i + 1 - REST_UNIT_STRIDE]
    vld           vr8,       t5,      2   // P[i + 1 + REST_UNIT_STRIDE]
    vaddwev.w.h   vr9,       vr0,     vr1
    vaddwod.w.h   vr10,      vr0,     vr1
    vaddwev.w.h   vr11,      vr2,     vr3
    vaddwod.w.h   vr12,      vr2,     vr3
    vadd.w        vr9,       vr11,    vr9
    vadd.w        vr10,      vr12,    vr10
    vilvl.w       vr11,      vr10,    vr9    // 0 1 2 3
    vilvh.w       vr12,      vr10,    vr9    // 4 5 6 7
    vsllwil.w.h   vr0,       vr4,     0
    vexth.w.h     vr1,       vr4
    vadd.w        vr0,       vr11,    vr0
    vadd.w        vr1,       vr12,    vr1
    vslli.w       vr0,       vr0,     2
    vslli.w       vr1,       vr1,     2
    vaddwev.w.h   vr9,       vr5,     vr6
    vaddwod.w.h   vr10,      vr5,     vr6
    vaddwev.w.h   vr11,      vr7,     vr8
    vaddwod.w.h   vr12,      vr7,     vr8
    vadd.w        vr9,       vr11,    vr9
    vadd.w        vr10,      vr12,    vr10
    vilvl.w       vr13,      vr10,    vr9
    vilvh.w       vr14,      vr10,    vr9
    vslli.w       vr15,      vr13,    1
    vslli.w       vr16,      vr14,    1
    vadd.w        vr15,      vr13,    vr15   // a
    vadd.w        vr16,      vr14,    vr16
    vadd.w        vr22,      vr0,     vr15
    vadd.w        vr23,      vr1,     vr16
    vld           vr0,       t6,      0      // src
    vsllwil.hu.bu vr0,       vr0,     0
    vsllwil.wu.hu vr1,       vr0,     0
    vexth.wu.hu   vr2,       vr0
    vmadd.w       vr20,      vr22,    vr1
    vmadd.w       vr21,      vr23,    vr2
    vssrlrni.h.w  vr21,      vr20,    9
    vst           vr21,      t8,      0
    addi.d        t8,        t8,      16

    addi.d        t0,        t0,      32
    addi.d        t1,        t1,      32
    addi.d        t2,        t2,      32
    addi.d        t3,        t3,      16
    addi.d        t4,        t4,      16
    addi.d        t5,        t5,      16
    addi.d        t6,        t6,      8
    addi.w        t7,        t7,      -8
    blt           zero,      t7,      .LBS3SGF_V_W

    addi.w        a5,        a5,      -1
    addi.d        a0,        a0,      384*2
    addi.d        a1,        a1,      REST_UNIT_STRIDE
    addi.d        a3,        a3,      REST_UNIT_STRIDE<<1
    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
    bnez          a5,        .LBS3SGF_V_H
endfunc

function boxsum3_sgf_v_8bpc_lasx
    addi.d        a1,        a1,      (3*REST_UNIT_STRIDE+3)   // src
    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
    addi.d        a2,        a2,      (REST_UNIT_STRIDE<<2)+12
    addi.d        a3,        a3,      REST_UNIT_STRIDE<<2
    addi.d        a3,        a3,      6
.LBS3SGF_V_H_LASX:
    // A int32_t *sumsq
    addi.d        t0,        a2,      -(REST_UNIT_STRIDE<<2)   // -stride
    addi.d        t1,        a2,      0    // sumsq
    addi.d        t2,        a2,      REST_UNIT_STRIDE<<2      // +stride
    addi.d        t6,        a1,      0
    addi.w        t7,        a4,      0
    addi.d        t8,        a0,      0
    // B coef *sum
    addi.d        t3,        a3,      -(REST_UNIT_STRIDE<<1)   // -stride
    addi.d        t4,        a3,      0
    addi.d        t5,        a3,      REST_UNIT_STRIDE<<1

.LBS3SGF_V_W_LASX:
    xvld           xr0,       t0,      0   // P[i - REST_UNIT_STRIDE]
    xvld           xr1,       t0,      32
    xvld           xr2,       t1,      -4  // P[i-1]  -1 0 1 2
    xvld           xr3,       t1,      28           // 3 4 5 6
    xvld           xr4,       t2,      0   // P[i + REST_UNIT_STRIDE]
    xvld           xr5,       t2,      32
    xvld           xr6,       t1,      0   // p[i]     0 1 2 3
    xvld           xr7,       t1,      32           // 4 5 6 7
    xvld           xr8,       t1,      4   // p[i+1]   1 2 3 4
    xvld           xr9,       t1,      36           // 5 6 7 8

    xvld           xr10,      t0,      -4  // P[i - 1 - REST_UNIT_STRIDE]
    xvld           xr11,      t0,      28
    xvld           xr12,      t2,      -4  // P[i - 1 + REST_UNIT_STRIDE]
    xvld           xr13,      t2,      28
    xvld           xr14,      t0,      4   // P[i + 1 - REST_UNIT_STRIDE]
    xvld           xr15,      t0,      36
    xvld           xr16,      t2,      4   // P[i + 1 + REST_UNIT_STRIDE]
    xvld           xr17,      t2,      36

    xvadd.w        xr0,       xr2,     xr0
    xvadd.w        xr4,       xr6,     xr4
    xvadd.w        xr0,       xr0,     xr8
    xvadd.w        xr20,      xr0,     xr4
    xvslli.w       xr20,      xr20,    2      // 0 1 2 3
    xvadd.w        xr0,       xr1,     xr3
    xvadd.w        xr4,       xr5,     xr7
    xvadd.w        xr0,       xr0,     xr9
    xvadd.w        xr21,      xr0,     xr4
    xvslli.w       xr21,      xr21,    2      // 4 5 6 7
    xvadd.w        xr12,      xr10,    xr12
    xvadd.w        xr16,      xr14,    xr16
    xvadd.w        xr22,      xr12,    xr16
    xvslli.w       xr23,      xr22,    1
    xvadd.w        xr22,      xr23,    xr22
    xvadd.w        xr11,      xr11,    xr13
    xvadd.w        xr15,      xr15,    xr17
    xvadd.w        xr0,       xr11,    xr15
    xvslli.w       xr23,      xr0,     1
    xvadd.w        xr23,      xr23,    xr0
    xvadd.w        xr20,      xr20,    xr22   // b
    xvadd.w        xr21,      xr21,    xr23

    // B coef *sum
    xvld           xr0,       t3,      0   // P[i - REST_UNIT_STRIDE]
    xvld           xr1,       t4,      -2  // p[i - 1]
    xvld           xr2,       t4,      0   // p[i]
    xvld           xr3,       t4,      2   // p[i + 1]
    xvld           xr4,       t5,      0   // P[i + REST_UNIT_STRIDE]
    xvld           xr5,       t3,      -2  // P[i - 1 - REST_UNIT_STRIDE]
    xvld           xr6,       t5,      -2  // P[i - 1 + REST_UNIT_STRIDE]
    xvld           xr7,       t3,      2   // P[i + 1 - REST_UNIT_STRIDE]
    xvld           xr8,       t5,      2   // P[i + 1 + REST_UNIT_STRIDE]

    xvaddwev.w.h   xr9,       xr0,     xr1
    xvaddwod.w.h   xr10,      xr0,     xr1
    xvaddwev.w.h   xr11,      xr2,     xr3
    xvaddwod.w.h   xr12,      xr2,     xr3
    xvadd.w        xr9,       xr11,    xr9   // 0 2 4 6 8 10 12 14
    xvadd.w        xr10,      xr12,    xr10  // 1 3 5 7 9 11 13 15
    xvilvl.w       xr11,      xr10,    xr9   // 0 1 2 3 8 9 10 11
    xvilvh.w       xr12,      xr10,    xr9   // 4 5 6 7 12 13 14 15
    xvsllwil.w.h   xr0,       xr4,     0     // 0 1 2 3 8 9 10 11
    xvexth.w.h     xr1,       xr4            // 4 5 6 7 12 13 14 15

    xvadd.w        xr0,       xr11,    xr0
    xvadd.w        xr1,       xr12,    xr1
    xvslli.w       xr0,       xr0,     2
    xvslli.w       xr1,       xr1,     2

    xvaddwev.w.h   xr9,       xr5,     xr6
    xvaddwod.w.h   xr10,      xr5,     xr6
    xvaddwev.w.h   xr11,      xr7,     xr8
    xvaddwod.w.h   xr12,      xr7,     xr8
    xvadd.w        xr9,       xr11,    xr9
    xvadd.w        xr10,      xr12,    xr10
    xvilvl.w       xr13,      xr10,    xr9   // 0 1 2 3 8 9 10 11
    xvilvh.w       xr14,      xr10,    xr9   // 4 5 6 7 12 13 14 15

    xvslli.w       xr15,      xr13,    1
    xvslli.w       xr16,      xr14,    1
    xvadd.w        xr15,      xr13,    xr15   // a
    xvadd.w        xr16,      xr14,    xr16
    xvadd.w        xr22,      xr0,     xr15   // A B
    xvadd.w        xr23,      xr1,     xr16   // C D

    vld            vr0,       t6,      0      // src
    vilvh.d        vr2,       vr0,     vr0
    vext2xv.wu.bu  xr1,       xr0
    vext2xv.wu.bu  xr2,       xr2
    xvor.v         xr15,      xr22,    xr22   // A B
    xvpermi.q      xr22,      xr23,    0b00000010  // A C
    xvpermi.q      xr23,      xr15,    0b00110001
    xvmadd.w       xr20,      xr22,    xr1
    xvmadd.w       xr21,      xr23,    xr2
    xvssrlrni.h.w  xr21,      xr20,    9
    xvpermi.d      xr22,      xr21,    0b11011000
    xvst           xr22,      t8,      0
    addi.d         t8,        t8,      32

    addi.d        t0,        t0,      64
    addi.d        t1,        t1,      64
    addi.d        t2,        t2,      64
    addi.d        t3,        t3,      32
    addi.d        t4,        t4,      32
    addi.d        t5,        t5,      32
    addi.d        t6,        t6,      16
    addi.w        t7,        t7,      -16
    blt           zero,      t7,      .LBS3SGF_V_W_LASX

    addi.w        a5,        a5,      -1
    addi.d        a0,        a0,      384*2
    addi.d        a1,        a1,      REST_UNIT_STRIDE
    addi.d        a3,        a3,      REST_UNIT_STRIDE<<1
    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
    bnez          a5,        .LBS3SGF_V_H_LASX
endfunc

#define FILTER_OUT_STRIDE (384)

/*
sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride,
                   const int16_t *dst, const int w1,
                   const int w, const int h);
*/
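/*
Scalar view of one pixel, reconstructed from the vector code below
(w1 is the self-guided filter weight, dst holds the filtered coefficients,
clamp() is shorthand for clipping to the pixel range):

    u = p[i] << 4;
    v = (u << 7) + w1 * (dst[i] - u);
    p[i] = clamp((v + (1 << 10)) >> 11, 0, 255);
*/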
function sgr_3x3_finish_8bpc_lsx
    vreplgr2vr.w  vr3,     a3            // w1
    andi          t4,      a4,       0x7
    sub.w         t5,      a4,       t4

    beq           zero,    t5,       .LSGR3X3_REM

.LSGR3X3_H:
    addi.d        t0,      a0,       0
    addi.d        t1,      a2,       0
    addi.w        t2,      t5,       0
    andi          t4,      a4,       0x7
.LSGR3X3_W:
    vld           vr0,     t0,       0
    vld           vr1,     t1,       0
    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
    vsllwil.wu.hu vr4,     vr2,      0   // p
    vexth.wu.hu   vr5,     vr2           // p
    vslli.w       vr6,     vr4,      7
    vslli.w       vr7,     vr5,      7
    vsllwil.w.h   vr8,     vr1,      0   // dst
    vexth.w.h     vr9,     vr1           // dst
    vsub.w        vr8,     vr8,      vr4
    vsub.w        vr9,     vr9,      vr5
    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
    vssrarni.hu.w vr7,     vr6,      11
    vssrlni.bu.h  vr7,     vr7,      0
    vstelm.d      vr7,     t0,       0,    0
    addi.d        t0,      t0,       8
    addi.d        t1,      t1,       16
    addi.d        t2,      t2,       -8
    bne           zero,    t2,       .LSGR3X3_W

    beq           t4,      zero,     .LSGR3X3_NOREM

    vld           vr0,     t0,       0
    vld           vr1,     t1,       0
    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
    vsllwil.wu.hu vr4,     vr2,      0   // p
    vexth.wu.hu   vr5,     vr2           // p
    vslli.w       vr6,     vr4,      7
    vslli.w       vr7,     vr5,      7
    vsllwil.w.h   vr8,     vr1,      0   // dst
    vexth.w.h     vr9,     vr1           // dst
    vsub.w        vr8,     vr8,      vr4
    vsub.w        vr9,     vr9,      vr5
    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
    vssrarni.hu.w vr7,     vr6,      11
    vssrlni.bu.h  vr7,     vr7,      0

.LSGR3X3_ST:
    vstelm.b      vr7,     t0,       0,    0
    addi.d        t0,      t0,       1
    vbsrl.v       vr7,     vr7,      1
    addi.w        t4,      t4,       -1
    bnez          t4,      .LSGR3X3_ST

.LSGR3X3_NOREM:
    addi.w        a5,      a5,       -1
    add.d         a0,      a0,       a1
    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
    bnez          a5,      .LSGR3X3_H
    b             .LSGR3X3_END

.LSGR3X3_REM:
    andi          t4,      a4,       0x7
    addi.d        t0,      a0,       0
    vld           vr0,     t0,       0
    vld           vr1,     a2,       0
    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
    vsllwil.wu.hu vr4,     vr2,      0   // p
    vexth.wu.hu   vr5,     vr2           // p
    vslli.w       vr6,     vr4,      7
    vslli.w       vr7,     vr5,      7
    vsllwil.w.h   vr8,     vr1,      0   // dst
    vexth.w.h     vr9,     vr1           // dst
    vsub.w        vr8,     vr8,      vr4
    vsub.w        vr9,     vr9,      vr5
    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
    vssrarni.hu.w vr7,     vr6,      11
    vssrlni.bu.h  vr7,     vr7,      0

.LSGR3X3_REM_ST:
    vstelm.b      vr7,     t0,       0,    0
    addi.d        t0,      t0,       1
    vbsrl.v       vr7,     vr7,      1
    addi.w        t4,      t4,       -1
    bnez          t4,      .LSGR3X3_REM_ST
    addi.w        a5,      a5,       -1
    add.d         a0,      a0,       a1
    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
    bnez          a5,      .LSGR3X3_REM

.LSGR3X3_END:
endfunc

/*
void boxsum5(int32_t *sumsq, coef *sum,
             const pixel *const src,
             const int w, const int h)
*/
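/*
First (vertical) pass of the 5x5 box sums: each output position below gets
the sum of five vertically adjacent source pixels in sum[] and of their
squares in sumsq[], 16 columns per iteration; boxsum5_v below finishes the
box horizontally.
*/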
function boxsum5_h_8bpc_lsx
    addi.w        a4,      a4,        -4
    addi.d        a0,      a0,        REST_UNIT_STRIDE<<2
    addi.d        a1,      a1,        REST_UNIT_STRIDE<<1
    li.w          t6,      1
.LBOXSUM5_H_H:
    addi.w        t3,      a3,        0
    addi.d        t2,      a2,        0
    addi.d        t0,      a0,        0
    addi.d        t1,      a1,        0

.LBOXSUM5_H_W:
    vld           vr0,     t2,        0                   // a
    vld           vr1,     t2,        REST_UNIT_STRIDE    // b
    vld           vr2,     t2,        REST_UNIT_STRIDE<<1 // c
    vld           vr3,     t2,        REST_UNIT_STRIDE*3  // d
    vld           vr4,     t2,        REST_UNIT_STRIDE<<2 // e

    vilvl.b       vr5,     vr1,       vr0
    vilvh.b       vr6,     vr1,       vr0
    vilvl.b       vr7,     vr3,       vr2
    vilvh.b       vr8,     vr3,       vr2
    //sum_v
    vhaddw.hu.bu  vr9,     vr5,       vr5  // 0 1  2  3  4  5  6  7
    vhaddw.hu.bu  vr10,    vr6,       vr6  // 8 9 10 11 12 13 14 15  a+b
    vhaddw.hu.bu  vr11,    vr7,       vr7
    vhaddw.hu.bu  vr12,    vr8,       vr8
    vadd.h        vr9,     vr9,       vr11
    vadd.h        vr10,    vr10,      vr12  // a + b + c + d
    vsllwil.hu.bu vr11,    vr4,       0
    vexth.hu.bu   vr12,    vr4
    vadd.h        vr9,     vr9,       vr11
    vadd.h        vr10,    vr10,      vr12
    vst           vr9,     t1,        0
    vst           vr10,    t1,        16
    addi.d        t1,      t1,        32

    // sumsq
    vmulwev.h.bu  vr9,     vr5,       vr5  // a*a 0 1  2  3  4  5  6  7
    vmulwev.h.bu  vr10,    vr6,       vr6  // a*a 8 9 10 11 12 13 14 15
    vmulwod.h.bu  vr13,    vr5,       vr5  // b*b 0 1  2  3  4  5  6  7
    vmulwod.h.bu  vr14,    vr6,       vr6  // b*b 8 9 10 11 12 13 14 15
    vmulwev.h.bu  vr15,    vr7,       vr7  // c*c 0 1  2  3  4  5  6  7
    vmulwev.h.bu  vr16,    vr8,       vr8  // c*c 8 9 10 11 12 13 14 15
    vmulwod.h.bu  vr17,    vr7,       vr7  // d*d 0 1  2  3  4  5  6  7
    vmulwod.h.bu  vr18,    vr8,       vr8  // d*d 8 9 10 11 12 13 14 15
    vaddwev.w.hu  vr5,     vr9,       vr13  // 0 2 4 6
    vaddwod.w.hu  vr6,     vr9,       vr13  // 1 3 5 7
    vaddwev.w.hu  vr7,     vr10,      vr14  // 8 10 12 14
    vaddwod.w.hu  vr8,     vr10,      vr14  // 9 11 13 15   a + b
    vaddwev.w.hu  vr19,    vr15,      vr17  // 0 2 4 6
    vaddwod.w.hu  vr20,    vr15,      vr17  // 1 3 5 7
    vaddwev.w.hu  vr21,    vr16,      vr18  // 8 10 12 14
    vaddwod.w.hu  vr22,    vr16,      vr18  // 9 11 13 15   c + d
    vadd.w        vr5,     vr5,       vr19
    vadd.w        vr6,     vr6,       vr20
    vadd.w        vr7,     vr7,       vr21
    vadd.w        vr8,     vr8,       vr22
    vmaddwev.w.hu vr5,     vr11,      vr11
    vmaddwod.w.hu vr6,     vr11,      vr11
    vmaddwev.w.hu vr7,     vr12,      vr12
    vmaddwod.w.hu vr8,     vr12,      vr12
    vilvl.w       vr19,    vr6,       vr5
    vilvh.w       vr20,    vr6,       vr5
    vilvl.w       vr21,    vr8,       vr7
    vilvh.w       vr22,    vr8,       vr7

    vst           vr19,    t0,        0
    vst           vr20,    t0,        16
    vst           vr21,    t0,        32
    vst           vr22,    t0,        48
    addi.d        t0,      t0,        64
    addi.d        t2,      t2,        16
    addi.w        t3,      t3,        -16
    blt           zero,    t3,        .LBOXSUM5_H_W

    addi.d        a0,      a0,        REST_UNIT_STRIDE<<2
    addi.d        a1,      a1,        REST_UNIT_STRIDE<<1
    addi.d        a2,      a2,        REST_UNIT_STRIDE
    addi.d        a4,      a4,        -1
    bnez          a4,      .LBOXSUM5_H_H
endfunc

/*
void boxsum5_v(int32_t *sumsq, coef *sum,
               const int w, const int h)
*/
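/*
Second (horizontal) pass: each output is the sum of five horizontally
adjacent entries produced by boxsum5_h above; t5 and vr8 carry lanes across
8-element chunks so the sliding window stays contiguous.
*/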
function boxsum5_v_8bpc_lsx
    addi.d         a0,      a0,      (REST_UNIT_STRIDE<<2)
    addi.d         a1,      a1,      (REST_UNIT_STRIDE<<1)
    addi.w         a3,      a3,      -4
    addi.w         a2,      a2,      -4

.LBOXSUM5_V_H:
    addi.w         t3,      a2,      0
    addi.d         t0,      a0,      0
    addi.d         t1,      a1,      0
    addi.d         t2,      a0,      8
    addi.d         t3,      a1,      4
    addi.d         t4,      a2,      0

    vld            vr0,     t1,      0   // a 0 1 2 3 4 5 6 7
    vld            vr1,     t1,      2   // b 1 2 3 4 5 6 7 8
    vld            vr2,     t1,      4   // c 2
    vld            vr3,     t1,      6   // d 3
    vld            vr4,     t1,      8   // e 4 5 6 7 8 9 10 11
    vadd.h         vr5,     vr0,     vr1
    vadd.h         vr6,     vr2,     vr3
    vpickve2gr.w   t5,      vr4,     2
    vadd.h         vr5,     vr5,     vr6
    vadd.h         vr5,     vr5,     vr4
    vst            vr5,     t3,      0

    vld            vr0,     t0,      0  // 0 1 2 3   a
    vld            vr1,     t0,      4  // 1 2 3 4   b
    vld            vr2,     t0,      8  // 2 3 4 5   c
    vld            vr3,     t0,      12 // 3 4 5 6   d
    vld            vr4,     t0,      16 // 4 5 6 7   e  a
    vld            vr5,     t0,      20 // 5 6 7 8      b
    vld            vr6,     t0,      24 // 6 7 8 9      c
    vld            vr7,     t0,      28 // 7 8 9 10     d
    vld            vr8,     t0,      32 // 8 9 10 11    e

    vadd.w         vr9,     vr0,     vr1
    vadd.w         vr10,    vr2,     vr3
    vadd.w         vr9,     vr9,     vr10
    vadd.w         vr9,     vr9,     vr4
    vadd.w         vr10,    vr4,     vr5
    vadd.w         vr11,    vr6,     vr7
    vadd.w         vr10,    vr10,    vr8
    vadd.w         vr10,    vr10,    vr11
    vst            vr9,     t2,      0
    vst            vr10,    t2,      16

    addi.d         t3,      t3,      16
    addi.d         t1,      t1,      16
    addi.d         t0,      t0,      32
    addi.d         t2,      t2,      32
    addi.w         t4,      t4,      -8
    ble            t4,      zero,    .LBOXSUM5_V_H1

.LBOXSUM5_V_W:
    vld            vr0,     t1,      0   // a 0 1 2 3 4 5 6 7
    vld            vr1,     t1,      2   // b 1 2 3 4 5 6 7 8
    vld            vr2,     t1,      4   // c 2
    vld            vr3,     t1,      6   // d 3
    vld            vr4,     t1,      8   // e 4 5 6 7 8 9 10 11
    vinsgr2vr.w    vr0,     t5,      0
    vpickve2gr.w   t5,      vr4,     2
    vextrins.h     vr1,     vr0,     0x01
    vadd.h         vr5,     vr0,     vr1
    vadd.h         vr6,     vr2,     vr3
    vadd.h         vr5,     vr5,     vr6
    vadd.h         vr5,     vr5,     vr4
    vst            vr5,     t3,      0

    vaddi.hu       vr0,     vr8,     0  // 8  9 10 11  a
    vld            vr1,     t0,      4  // 9 10 11 12  b
    vld            vr2,     t0,      8  // 10 11 12 13 c
    vld            vr3,     t0,      12 // 14 15 16 17 d
    vld            vr4,     t0,      16 // 15 16 17 18 e  a
    vld            vr5,     t0,      20 // 16 17 18 19    b
    vld            vr6,     t0,      24 // 17 18 19 20    c
    vld            vr7,     t0,      28 // 18 19 20 21    d
    vld            vr8,     t0,      32 // 19 20 21 22    e
    vextrins.w     vr1,     vr0,     0x01
    vadd.w         vr9,     vr0,     vr1
    vadd.w         vr10,    vr2,     vr3
    vadd.w         vr9,     vr9,     vr10
    vadd.w         vr9,     vr9,     vr4
    vadd.w         vr10,    vr4,     vr5
    vadd.w         vr11,    vr6,     vr7
    vadd.w         vr10,    vr10,    vr8
    vadd.w         vr10,    vr10,    vr11
    vst            vr9,     t2,      0
    vst            vr10,    t2,      16

    addi.d         t3,      t3,      16
    addi.d         t1,      t1,      16
    addi.d         t0,      t0,      32
    addi.d         t2,      t2,      32
    addi.w         t4,      t4,      -8
    blt            zero,    t4,      .LBOXSUM5_V_W

.LBOXSUM5_V_H1:
    addi.d         a1,       a1,      REST_UNIT_STRIDE<<1
    addi.d         a0,       a0,      REST_UNIT_STRIDE<<2
    addi.w         a3,       a3,      -1
    bnez           a3,       .LBOXSUM5_V_H
endfunc

/*
selfguided_filter(int32_t *sumsq, coef *sum,
                  const int w, const int h,
                  const unsigned s)
*/
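/*
Same per-element update as boxsum3_sgf_h, reconstructed from the loop below,
but with the 5x5 box parameters (n = 25, one_by_x = 164) and applied to
every second row only:

    p = max(AA[i] * 25 - BB[i] * BB[i], 0);
    z = (p * s + (1 << 19)) >> 20;
    x = dav1d_sgr_x_by_x[min(z, 255)];
    AA[i] = (x * BB[i] * 164 + (1 << 11)) >> 12;
    BB[i] = 256 - x;
*/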
function boxsum5_sgf_h_8bpc_lsx
    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a0,       a0,        12   // AA
    addi.d        a1,       a1,        REST_UNIT_STRIDE<<1
    addi.d        a1,       a1,        6    // BB
    la.local      t8,       dav1d_sgr_x_by_x
    li.w          t6,       164
    vreplgr2vr.w  vr20,     t6
    li.w          t6,       255
    vreplgr2vr.w  vr22,     t6
    vaddi.wu      vr21,     vr22,      1  // 256
    vreplgr2vr.w  vr6,      a4
    vldi          vr19,     0x819
    addi.w        a2,       a2,        2  // w + 2
    addi.w        a3,       a3,        2  // h + 2

.LBS5SGF_H_H:
    addi.w        t2,       a2,        0
    addi.d        t0,       a0,        -4
    addi.d        t1,       a1,        -2

.LBS5SGF_H_W:
    vld           vr0,      t0,        0   // AA[i]
    vld           vr1,      t0,        16
    vld           vr2,      t1,        0   // BB[i]

    vmul.w        vr4,      vr0,       vr19 // a * n
    vmul.w        vr5,      vr1,       vr19 // a * n
    vsllwil.w.h   vr9,      vr2,       0
    vexth.w.h     vr10,     vr2
    vmsub.w       vr4,      vr9,       vr9   // p
    vmsub.w       vr5,      vr10,      vr10   // p
    vmaxi.w       vr4,      vr4,       0
    vmaxi.w       vr5,      vr5,       0    // p
    vmul.w        vr4,      vr4,       vr6  // p * s
    vmul.w        vr5,      vr5,       vr6  // p * s
    vsrlri.w      vr4,      vr4,       20
    vsrlri.w      vr5,      vr5,       20   // z
    vmin.w        vr4,      vr4,       vr22
    vmin.w        vr5,      vr5,       vr22

    // load table data
    vpickve2gr.w  t6,       vr4,       0
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        0
    vpickve2gr.w  t6,       vr4,       1
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        1
    vpickve2gr.w  t6,       vr4,       2
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        2
    vpickve2gr.w  t6,       vr4,       3
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        3

    vpickve2gr.w  t6,       vr5,       0
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        0
    vpickve2gr.w  t6,       vr5,       1
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        1
    vpickve2gr.w  t6,       vr5,       2
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        2
    vpickve2gr.w  t6,       vr5,       3
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        3     // x

    vmul.w        vr9,      vr7,       vr9   // x * BB[i]
    vmul.w        vr10,     vr8,       vr10
    vmul.w        vr9,      vr9,       vr20  // x * BB[i] * sgr_one_by_x
    vmul.w        vr10,     vr10,      vr20
    vsrlri.w      vr9,      vr9,       12
    vsrlri.w      vr10,     vr10,      12
    vsub.w        vr7,      vr21,      vr7
    vsub.w        vr8,      vr21,      vr8
    vpickev.h     vr8,      vr8,       vr7
    vst           vr9,      t0,        0
    vst           vr10,     t0,        16
    vst           vr8,      t1,        0
    addi.d        t0,       t0,        32
    addi.d        t1,       t1,        16
    addi.w        t2,       t2,        -8
    blt           zero,     t2,        .LBS5SGF_H_W

    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a1,       a1,        REST_UNIT_STRIDE<<2
    addi.w        a3,       a3,        -2
    blt           zero,     a3,        .LBS5SGF_H_H
endfunc

/*
selfguided_filter(coef *dst, pixel *src,
                  int32_t *sumsq, coef *sum,
                  const int w, const int h)
*/
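/*
Scalar view of the two output rows produced per iteration, reconstructed
from the loop below (6 and 5 are the 5x5 neighbourhood weights; A is the
sumsq plane and B the sum plane as rewritten by boxsum5_sgf_h;
STRIDE = REST_UNIT_STRIDE):

    first row:
        a = 6 * (B[i - STRIDE] + B[i + STRIDE]) +
            5 * (B[i - 1 - STRIDE] + B[i + 1 - STRIDE] +
                 B[i - 1 + STRIDE] + B[i + 1 + STRIDE]);
        b = the same weighting applied to A[];
        dst[i] = (b + a * src[i] + (1 << 8)) >> 9;

    second row:
        a = 6 * B[STRIDE + i] + 5 * (B[STRIDE + i - 1] + B[STRIDE + i + 1]);
        b = 6 * A[STRIDE + i] + 5 * (A[STRIDE + i - 1] + A[STRIDE + i + 1]);
        dst[FILTER_OUT_STRIDE + i] = (b + a * src[STRIDE + i] + (1 << 7)) >> 8;
*/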
1199function boxsum5_sgf_v_8bpc_lsx
1200    addi.d        a1,        a1,       3*REST_UNIT_STRIDE+3       // src
1201    addi.d        a2,        a2,       (2*REST_UNIT_STRIDE+3)<<1  // A
1202    addi.d        a2,        a2,       (2*REST_UNIT_STRIDE+3)<<1
1203    addi.d        a3,        a3,       (2*REST_UNIT_STRIDE+3)<<1  // B
1204    addi.w        a5,        a5,       -1
1205    vldi          vr10,      0x806
1206    vldi          vr11,      0x805
1207    vldi          vr22,      0x406
1208
1209.LBS5SGF_V_H:
1210    addi.d        t0,        a0,       0
1211    addi.d        t1,        a1,       0
1212    addi.d        t2,        a2,       0
1213    addi.d        t3,        a3,       0
1214    addi.w        t4,        a4,       0
1215
1216    addi.d        t5,        a0,       384*2
1217    addi.d        t6,        a1,       REST_UNIT_STRIDE
1218    addi.d        t7,        a2,       REST_UNIT_STRIDE<<2
1219    addi.d        t8,        a3,       REST_UNIT_STRIDE<<1   // B
1220.LBS5SGF_V_W:
1221    // a
1222    vld           vr0,       t3,       -REST_UNIT_STRIDE*2
1223    vld           vr1,       t3,       REST_UNIT_STRIDE*2
1224    vld           vr2,       t3,       (-REST_UNIT_STRIDE-1)*2
1225    vld           vr3,       t3,       (REST_UNIT_STRIDE-1)*2
1226    vld           vr4,       t3,       (1-REST_UNIT_STRIDE)*2
1227    vld           vr5,       t3,       (1+REST_UNIT_STRIDE)*2
1228    vaddwev.w.h   vr6,       vr0,      vr1
1229    vaddwod.w.h   vr7,       vr0,      vr1
1230    vmul.w        vr6,       vr6,      vr10
1231    vmul.w        vr7,       vr7,      vr10
1232    vaddwev.w.h   vr8,       vr2,      vr3
1233    vaddwod.w.h   vr9,       vr2,      vr3
1234    vaddwev.w.h   vr12,      vr4,      vr5
1235    vaddwod.w.h   vr13,      vr4,      vr5
1236    vadd.w        vr8,       vr8,      vr12
1237    vadd.w        vr9,       vr9,      vr13
1238    vmadd.w       vr6,       vr8,      vr11
1239    vmadd.w       vr7,       vr9,      vr11
1240    vilvl.w       vr18,      vr7,      vr6
1241    vilvh.w       vr19,      vr7,      vr6
1242    // b
1243    vld           vr0,       t2,       -REST_UNIT_STRIDE*4
1244    vld           vr1,       t2,       -REST_UNIT_STRIDE*4+16
1245    vld           vr2,       t2,       REST_UNIT_STRIDE*4
1246    vld           vr3,       t2,       REST_UNIT_STRIDE*4+16
1247    vld           vr4,       t2,       (-REST_UNIT_STRIDE-1)*4
1248    vld           vr5,       t2,       (-REST_UNIT_STRIDE-1)*4+16
1249    vld           vr8,       t2,       (REST_UNIT_STRIDE-1)*4
1250    vld           vr9,       t2,       (REST_UNIT_STRIDE-1)*4+16
1251    vld           vr12,      t2,       (1-REST_UNIT_STRIDE)*4
1252    vld           vr13,      t2,       (1-REST_UNIT_STRIDE)*4+16
1253    vld           vr14,      t2,       (1+REST_UNIT_STRIDE)*4
1254    vld           vr15,      t2,       (1+REST_UNIT_STRIDE)*4+16
1255    vadd.w        vr0,       vr0,      vr2  // 0 1 2 3
1256    vadd.w        vr1,       vr1,      vr3  // 4 5 6 7
1257    vmul.w        vr20,      vr0,      vr10
1258    vmul.w        vr21,      vr1,      vr10
1259    vadd.w        vr4,       vr4,      vr8  // 0 1 2 3
1260    vadd.w        vr5,       vr5,      vr9  // 4 5 6 7
1261    vadd.w        vr12,      vr12,     vr14
1262    vadd.w        vr13,      vr13,     vr15
1263    vadd.w        vr12,      vr12,     vr4
1264    vadd.w        vr13,      vr13,     vr5
1265    vmadd.w       vr20,      vr12,     vr11
1266    vmadd.w       vr21,      vr13,     vr11
1267    vld           vr2,       t1,       0
1268    vsllwil.hu.bu vr2,       vr2,      0
1269    vsllwil.wu.hu vr3,       vr2,      0
1270    vexth.wu.hu   vr4,       vr2
1271    vmadd.w       vr20,      vr18,     vr3
1272    vmadd.w       vr21,      vr19,     vr4
1273    vssrlrni.h.w  vr21,      vr20,     9
1274    vst           vr21,      t0,       0
1275
1276    addi.d        t1,        t1,       8
1277    addi.d        t2,        t2,       32
1278    addi.d        t3,        t3,       16
1279
1280    // a
1281    vld           vr0,       t8,       0
1282    vld           vr1,       t8,       -2
1283    vld           vr2,       t8,       2
1284    vmulwev.w.h   vr3,       vr0,      vr22
1285    vmulwod.w.h   vr4,       vr0,      vr22
1286    vaddwev.w.h   vr5,       vr1,      vr2
1287    vaddwod.w.h   vr6,       vr1,      vr2
1288    vmadd.w       vr3,       vr5,      vr11
1289    vmadd.w       vr4,       vr6,      vr11
1290    vilvl.w       vr19,      vr4,      vr3
1291    vilvh.w       vr20,      vr4,      vr3
1292    // b
1293    vld           vr0,       t7,       0
1294    vld           vr1,       t7,       -4
1295    vld           vr2,       t7,       4
1296    vld           vr5,       t7,       16
1297    vld           vr6,       t7,       12
1298    vld           vr7,       t7,       20
1299    vmul.w        vr8,       vr0,      vr10
1300    vmul.w        vr9,       vr5,      vr10
1301    vadd.w        vr12,      vr1,      vr2
1302    vadd.w        vr13,      vr6,      vr7
1303    vmadd.w       vr8,       vr12,     vr11
1304    vmadd.w       vr9,       vr13,     vr11
1305    vld           vr2,       t6,       0
1306    vsllwil.hu.bu vr2,       vr2,      0
1307    vsllwil.wu.hu vr3,       vr2,      0
1308    vexth.wu.hu   vr4,       vr2
1309    vmadd.w       vr8,       vr19,     vr3
1310    vmadd.w       vr9,       vr20,     vr4
1311    vssrlrni.h.w  vr9,       vr8,      8
1312    vst           vr9,       t0,       384*2
1313
1314    addi.d        t0,        t0,       16
1315    addi.d        t8,        t8,       16
1316    addi.d        t7,        t7,       32
1317    addi.d        t6,        t6,       8
1318    addi.w        t4,        t4,       -8
1319    blt           zero,      t4,       .LBS5SGF_V_W
1320
1321    addi.w        a5,        a5,       -2
1322    addi.d        a0,        a0,       384*4                // dst
1323    addi.d        a1,        a1,       REST_UNIT_STRIDE<<1  // src
1324    addi.d        a2,        a2,       REST_UNIT_STRIDE<<2  // b (int32): advance two rows,
1325    addi.d        a2,        a2,       REST_UNIT_STRIDE<<2  // split to fit the 12-bit immediate
1326    addi.d        a3,        a3,       REST_UNIT_STRIDE<<2  // a (int16): advance two rows
1327    blt           zero,      a5,       .LBS5SGF_V_H
1328    bnez          a5,        .LBS5SGF_END
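// Tail: the main loop above produces two output rows per pass; this block
// handles the one remaining row (when present), 8 pixels per iteration.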
1329.LBS5SGF_V_W1:
1330    // a
1331    vld           vr0,       a3,       -REST_UNIT_STRIDE*2
1332    vld           vr1,       a3,       REST_UNIT_STRIDE*2
1333    vld           vr2,       a3,       (-REST_UNIT_STRIDE-1)*2
1334    vld           vr3,       a3,       (REST_UNIT_STRIDE-1)*2
1335    vld           vr4,       a3,       (1-REST_UNIT_STRIDE)*2
1336    vld           vr5,       a3,       (1+REST_UNIT_STRIDE)*2
1337    vaddwev.w.h   vr6,       vr0,      vr1
1338    vaddwod.w.h   vr7,       vr0,      vr1
1339    vmul.w        vr6,       vr6,      vr10
1340    vmul.w        vr7,       vr7,      vr10
1341    vaddwev.w.h   vr8,       vr2,      vr3
1342    vaddwod.w.h   vr9,       vr2,      vr3
1343    vaddwev.w.h   vr12,      vr4,      vr5
1344    vaddwod.w.h   vr13,      vr4,      vr5
1345    vadd.w        vr8,       vr8,      vr12
1346    vadd.w        vr9,       vr9,      vr13
1347    vmadd.w       vr6,       vr8,      vr11
1348    vmadd.w       vr7,       vr9,      vr11
1349    vilvl.w       vr18,      vr7,      vr6
1350    vilvh.w       vr19,      vr7,      vr6
1351    // b
1352    vld           vr0,       a2,       -REST_UNIT_STRIDE*4
1353    vld           vr1,       a2,       -REST_UNIT_STRIDE*4+16
1354    vld           vr2,       a2,       REST_UNIT_STRIDE*4
1355    vld           vr3,       a2,       REST_UNIT_STRIDE*4+16
1356    vld           vr4,       a2,       (-REST_UNIT_STRIDE-1)*4
1357    vld           vr5,       a2,       (-REST_UNIT_STRIDE-1)*4+16
1358    vld           vr8,       a2,       (REST_UNIT_STRIDE-1)*4
1359    vld           vr9,       a2,       (REST_UNIT_STRIDE-1)*4+16
1360    vld           vr12,      a2,       (1-REST_UNIT_STRIDE)*4
1361    vld           vr13,      a2,       (1-REST_UNIT_STRIDE)*4+16
1362    vld           vr14,      a2,       (1+REST_UNIT_STRIDE)*4
1363    vld           vr15,      a2,       (1+REST_UNIT_STRIDE)*4+16
1364    vadd.w        vr0,       vr0,      vr2  // 0 1 2 3
1365    vadd.w        vr1,       vr1,      vr3  // 4 5 6 7
1366    vmul.w        vr20,      vr0,      vr10
1367    vmul.w        vr21,      vr1,      vr10
1368    vadd.w        vr4,       vr4,      vr8  // 0 1 2 3
1369    vadd.w        vr5,       vr5,      vr9  // 4 5 6 7
1370    vadd.w        vr12,      vr12,     vr14
1371    vadd.w        vr13,      vr13,     vr15
1372    vadd.w        vr12,      vr12,     vr4
1373    vadd.w        vr13,      vr13,     vr5
1374    vmadd.w       vr20,      vr12,     vr11
1375    vmadd.w       vr21,      vr13,     vr11
1376    vld           vr2,       a1,       0
1377    vsllwil.hu.bu vr2,       vr2,      0
1378    vsllwil.wu.hu vr3,       vr2,      0
1379    vexth.wu.hu   vr4,       vr2
1380    vmadd.w       vr20,      vr18,     vr3
1381    vmadd.w       vr21,      vr19,     vr4
1382    vssrlrni.h.w  vr21,      vr20,     9
1383    vst           vr21,      a0,       0
1384    addi.d        a3,        a3,       16
1385    addi.d        a2,        a2,       32
1386    addi.d        a1,        a1,       8
1387    addi.d        a0,        a0,       16
1388    addi.w        a4,        a4,       -8
1389    blt           zero,      a4,       .LBS5SGF_V_W1
1390.LBS5SGF_END:
1391endfunc
1392
1393/*
1394void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride,
1395                              const int16_t *dst0, const int16_t *dst1,
1396                              const int w0, const int w1,
1397                              const int w, const int h);
1398*/
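/*
 Rough scalar equivalent of the per-pixel computation below (a sketch inferred
 from the vector code; names follow the prototype above):

     const int u = p[i] << 4;
     const int v = (u << 7) + w0 * (dst0[i] - u) + w1 * (dst1[i] - u);
     p[i] = clip((v + (1 << 10)) >> 11, 0, 255);  // vssrarni.hu.w ..., 11 does the rounding shift

 Each row advances p by stride and dst0/dst1 by FILTER_OUT_STRIDE.
*/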
1399function sgr_mix_finish_8bpc_lsx
1400    vreplgr2vr.w  vr3,     a4            // w0
1401    vreplgr2vr.w  vr13,    a5            // w1
1402    andi          t4,      a6,       0x7
1403    sub.w         t5,      a6,       t4
1404
1405    beq           zero,    t5,      .LSGRMIX_REM
1406
1407.LSGRMIX_H:
1408    addi.d        t0,      a0,       0
1409    addi.d        t1,      a2,       0   // dst0
1410    addi.d        t3,      a3,       0   // dst1
1411    addi.w        t2,      t5,       0
1412    andi          t4,      a6,       0x7
1413.LSGRMIX_W:
1414    vld           vr0,     t0,       0
1415    vld           vr1,     t1,       0
1416    vld           vr10,    t3,       0
1417    vsllwil.hu.bu vr2,     vr0,      4   // u = p << 4 (8 halfwords)
1418    vsllwil.wu.hu vr4,     vr2,      0   // u 0 1 2 3
1419    vexth.wu.hu   vr5,     vr2           // u 4 5 6 7
1420    vslli.w       vr6,     vr4,      7
1421    vslli.w       vr7,     vr5,      7
1422    vsllwil.w.h   vr8,     vr1,      0   // dst0
1423    vexth.w.h     vr9,     vr1           // dst0
1424    vsub.w        vr8,     vr8,      vr4
1425    vsub.w        vr9,     vr9,      vr5
1426    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
1427    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
1428
1429    vsllwil.w.h   vr11,    vr10,     0    // dst1
1430    vexth.w.h     vr12,    vr10           // dst1
1431    vsub.w        vr11,    vr11,     vr4
1432    vsub.w        vr12,    vr12,     vr5
1433    vmadd.w       vr6,     vr11,     vr13
1434    vmadd.w       vr7,     vr12,     vr13
1435
1436    vssrarni.hu.w vr7,     vr6,      11
1437    vssrlni.bu.h  vr7,     vr7,      0
1438    vstelm.d      vr7,     t0,       0,    0
1439    addi.d        t0,      t0,       8
1440    addi.d        t1,      t1,       16
1441    addi.d        t3,      t3,       16
1442    addi.d        t2,      t2,       -8
1443    bne           zero,    t2,       .LSGRMIX_W
1444
1445    beq           t4,      zero,     .LSGRMIX_W8
1446
1447    vld           vr0,     t0,       0
1448    vld           vr1,     t1,       0
1449    vld           vr10,    t3,       0
1450    vsllwil.hu.bu vr2,     vr0,      4   // u = p << 4 (8 halfwords)
1451    vsllwil.wu.hu vr4,     vr2,      0   // p
1452    vexth.wu.hu   vr5,     vr2           // p
1453    vslli.w       vr6,     vr4,      7
1454    vslli.w       vr7,     vr5,      7
1455    vsllwil.w.h   vr8,     vr1,      0   // dst
1456    vexth.w.h     vr9,     vr1           // dst
1457    vsub.w        vr8,     vr8,      vr4
1458    vsub.w        vr9,     vr9,      vr5
1459    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
1460    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
1461
1462    vsllwil.w.h   vr11,    vr10,     0    // dst1
1463    vexth.w.h     vr12,    vr10           // dst1
1464    vsub.w        vr11,    vr11,     vr4
1465    vsub.w        vr12,    vr12,     vr5
1466    vmadd.w       vr6,     vr11,     vr13
1467    vmadd.w       vr7,     vr12,     vr13
1468
1469    vssrarni.hu.w vr7,     vr6,      11
1470    vssrlni.bu.h  vr7,     vr7,      0
1471
1472.LSGRMIX_ST:
1473    vstelm.b      vr7,     t0,       0,    0
1474    addi.d        t0,      t0,       1
1475    vbsrl.v       vr7,     vr7,      1
1476    addi.w        t4,      t4,       -1
1477    bnez          t4,      .LSGRMIX_ST
1478
1479.LSGRMIX_W8:
1480    addi.w        a7,      a7,       -1
1481    add.d         a0,      a0,       a1
1482    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
1483    addi.d        a3,      a3,       (FILTER_OUT_STRIDE<<1)
1484    bnez          a7,      .LSGRMIX_H
1485    b             .LSGR_MIX_END
1486
1487.LSGRMIX_REM:
1488    andi          t4,      a6,       0x7
1489    vld           vr0,     a0,       0
1490    vld           vr1,     a2,       0
1491    vld           vr10,    a3,       0
1492    vsllwil.hu.bu vr2,     vr0,      4   // u = p << 4 (8 halfwords)
1493    vsllwil.wu.hu vr4,     vr2,      0   // p
1494    vexth.wu.hu   vr5,     vr2           // p
1495    vslli.w       vr6,     vr4,      7
1496    vslli.w       vr7,     vr5,      7
1497    vsllwil.w.h   vr8,     vr1,      0   // dst
1498    vexth.w.h     vr9,     vr1           // dst
1499    vsub.w        vr8,     vr8,      vr4
1500    vsub.w        vr9,     vr9,      vr5
1501    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
1502    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
1503
1504    vsllwil.w.h   vr11,    vr10,     0    // dst1
1505    vexth.w.h     vr12,    vr10           // dst1
1506    vsub.w        vr11,    vr11,     vr4
1507    vsub.w        vr12,    vr12,     vr5
1508    vmadd.w       vr6,     vr11,     vr13
1509    vmadd.w       vr7,     vr12,     vr13
1510
1511    vssrarni.hu.w vr7,     vr6,      11
1512    vssrlni.bu.h  vr7,     vr7,      0
1513    addi.d        t0,      a0,       0
1514.LSGRMIX_REM_ST:
1515    vstelm.b      vr7,     t0,       0,    0
1516    addi.d        t0,      t0,       1
1517    vbsrl.v       vr7,     vr7,      1
1518    addi.w        t4,      t4,       -1
1519    bnez          t4,      .LSGRMIX_REM_ST
1520
1521    addi.w        a7,      a7,       -1
1522    add.d         a0,      a0,       a1
1523    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
1524    addi.d        a3,      a3,       (FILTER_OUT_STRIDE<<1)
1525    bnez          a7,      .LSGRMIX_REM
1526
1527.LSGR_MIX_END:
1528endfunc
1529
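// Same idea as MADD_HU_BU, but on 256-bit LASX registers: widen the unsigned
// bytes in \in0 to 16 bits (low/high halves of each 128-bit lane) and
// multiply-accumulate them with the replicated filter tap \in1 into \out0/\out1.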
1530.macro MADD_HU_BU_LASX in0, in1, out0, out1
1531    xvsllwil.hu.bu xr12,     \in0,     0
1532    xvexth.hu.bu   xr13,     \in0
1533    xvmadd.h       \out0,    xr12,     \in1
1534    xvmadd.h       \out1,    xr13,     \in1
1535.endm
1536
1537const wiener_shuf_lasx
1538.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
1539.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
1540endconst
1541
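/*
 LASX (256-bit) variant of the horizontal Wiener filter; same arguments as
 wiener_filter_h_8bpc_lsx, but each inner-loop iteration produces 32 pixels.
*/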
1542function wiener_filter_h_8bpc_lasx
1543    addi.d         sp,       sp,       -40
1544    fst.d          f24,      sp,       0
1545    fst.d          f25,      sp,       8
1546    fst.d          f26,      sp,       16
1547    fst.d          f27,      sp,       24
1548    fst.d          f28,      sp,       32
1549    li.w           t7,       1<<14          // clip_limit
1550
1551    la.local       t1,       wiener_shuf_lasx
1552    xvld           xr4,      t1,       0
1553    vld            vr27,     a2,       0    // filter[0][k]
1554    xvpermi.q      xr14,     xr27,     0b00000000
1555    xvrepl128vei.h xr21,     xr14,     0
1556    xvrepl128vei.h xr22,     xr14,     1
1557    xvrepl128vei.h xr23,     xr14,     2
1558    xvrepl128vei.h xr24,     xr14,     3
1559    xvrepl128vei.h xr25,     xr14,     4
1560    xvrepl128vei.h xr26,     xr14,     5
1561    xvrepl128vei.h xr27,     xr14,     6
1562    xvreplgr2vr.w  xr0,      t7
1563
1564.WIENER_FILTER_H_H_LASX:
1565    addi.w         a4,       a4,       -1    // h
1566    addi.w         t0,       a3,       0     // w
1567    addi.d         t1,       a1,       0     // tmp_ptr
1568    addi.d         t2,       a0,       0     // hor_ptr
1569
1570.WIENER_FILTER_H_W_LASX:
1571    addi.w         t0,       t0,       -32
1572    xvld           xr5,      t1,       0
1573    xvld           xr13,     t1,       16
1574
1575    xvsubi.bu      xr14,     xr4,      2
1576    xvsubi.bu      xr15,     xr4,      1
1577    xvshuf.b       xr6,      xr13,     xr5,     xr14  // 1 ... 8, 9 ... 16
1578    xvshuf.b       xr7,      xr13,     xr5,     xr15  // 2 ... 9, 10 ... 17
1579    xvshuf.b       xr8,      xr13,     xr5,     xr4   // 3 ... 10, 11 ... 18
1580    xvaddi.bu      xr14,     xr4,      1
1581    xvaddi.bu      xr15,     xr4,      2
1582    xvshuf.b       xr9,      xr13,     xr5,     xr14  // 4 ... 11, 12 ... 19
1583    xvshuf.b       xr10,     xr13,     xr5,     xr15  // 5 ... 12, 13 ... 20
1584    xvaddi.bu      xr14,     xr4,      3
1585    xvshuf.b       xr11,     xr13,     xr5,     xr14  // 6 ... 13, 14 ... 21
1586
1587    xvsllwil.hu.bu xr15,     xr8,      0    //  3  4  5  6  7  8  9 10
1588    xvexth.hu.bu   xr16,     xr8            // 11 12 13 14 15 16 17 18
1589    xvsllwil.wu.hu xr17,     xr15,     7    //  3  4  5  6
1590    xvexth.wu.hu   xr18,     xr15           //  7  8  9 10
1591    xvsllwil.wu.hu xr19,     xr16,     7    // 11 12 13 14
1592    xvexth.wu.hu   xr20,     xr16           // 15 16 17 18
1593    xvslli.w       xr18,     xr18,     7
1594    xvslli.w       xr20,     xr20,     7
1595    xvxor.v        xr15,     xr15,     xr15
1596    xvxor.v        xr14,     xr14,     xr14
1597
1598    MADD_HU_BU_LASX xr5,  xr21, xr14, xr15
1599    MADD_HU_BU_LASX xr6,  xr22, xr14, xr15
1600    MADD_HU_BU_LASX xr7,  xr23, xr14, xr15
1601    MADD_HU_BU_LASX xr8,  xr24, xr14, xr15
1602    MADD_HU_BU_LASX xr9,  xr25, xr14, xr15
1603    MADD_HU_BU_LASX xr10, xr26, xr14, xr15
1604    MADD_HU_BU_LASX xr11, xr27, xr14, xr15
1605
1606    xvsllwil.w.h   xr5,      xr14,     0   //  0  1  2  3
1607    xvexth.w.h     xr6,      xr14          //  4  5  6  7
1608    xvsllwil.w.h   xr7,      xr15,     0   //  8  9 10 11
1609    xvexth.w.h     xr8,      xr15          // 12 13 14 15
1610    xvadd.w        xr17,     xr17,     xr5
1611    xvadd.w        xr18,     xr18,     xr6
1612    xvadd.w        xr19,     xr19,     xr7
1613    xvadd.w        xr20,     xr20,     xr8
1614    xvadd.w        xr17,     xr17,     xr0
1615    xvadd.w        xr18,     xr18,     xr0
1616    xvadd.w        xr19,     xr19,     xr0
1617    xvadd.w        xr20,     xr20,     xr0
1618
1619    xvsrli.w       xr1,      xr0,      1
1620    xvsubi.wu      xr1,      xr1,      1
1621    xvxor.v        xr3,      xr3,      xr3
1622    xvsrari.w      xr17,     xr17,     3
1623    xvsrari.w      xr18,     xr18,     3
1624    xvsrari.w      xr19,     xr19,     3
1625    xvsrari.w      xr20,     xr20,     3
1626    xvclip.w       xr17,     xr17,     xr3,     xr1
1627    xvclip.w       xr18,     xr18,     xr3,     xr1
1628    xvclip.w       xr19,     xr19,     xr3,     xr1
1629    xvclip.w       xr20,     xr20,     xr3,     xr1
1630
1631    xvor.v         xr5,      xr17,     xr17
1632    xvor.v         xr6,      xr19,     xr19
1633    xvpermi.q      xr17,     xr18,     0b00000010
1634    xvpermi.q      xr19,     xr20,     0b00000010
1635
1636    xvst           xr17,     t2,       0
1637    xvst           xr19,     t2,       32
1638    xvpermi.q      xr18,     xr5,      0b00110001
1639    xvpermi.q      xr20,     xr6,      0b00110001
1640    xvst           xr18,     t2,       64
1641    xvst           xr20,     t2,       96
1642    addi.d         t1,       t1,       32
1643    addi.d         t2,       t2,       128
1644    blt            zero,     t0,       .WIENER_FILTER_H_W_LASX
1645
1646    addi.d         a1,       a1,       REST_UNIT_STRIDE
1647    addi.d         a0,       a0,       (REST_UNIT_STRIDE << 2)
1648    bnez           a4,       .WIENER_FILTER_H_H_LASX
1649
1650    fld.d          f24,      sp,       0
1651    fld.d          f25,      sp,       8
1652    fld.d          f26,      sp,       16
1653    fld.d          f27,      sp,       24
1654    fld.d          f28,      sp,       32
1655    addi.d         sp,       sp,       40
1656endfunc
1657
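// Accumulate one vertical filter tap: load 16 consecutive int32 values from
// \in1 + (\in0 << 2) and multiply-accumulate them with the replicated tap \in2
// into the running sums in xr14/xr16.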
1658.macro APPLY_FILTER_LASX in0, in1, in2
1659    alsl.d         t7,       \in0,     \in1,    2
1660    xvld           xr10,     t7,       0
1661    xvld           xr12,     t7,       32
1662    xvmadd.w       xr14,     xr10,     \in2
1663    xvmadd.w       xr16,     xr12,     \in2
1664.endm
1665
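// Compute 16 output pixels of the vertical Wiener filter: initialize the
// accumulators with the bias in t6, apply the seven taps (xr2-xr8) over
// consecutive rows of the intermediate buffer via APPLY_FILTER_LASX, then
// round, saturate and narrow so vr17 holds the 16 result bytes.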
1666.macro wiener_filter_v_8bpc_core_lasx
1667    xvreplgr2vr.w  xr14,     t6
1668    xvreplgr2vr.w  xr16,     t6
1669
1670    addi.w         t7,       t2,       0      // j + index k
1671    mul.w          t7,       t7,       t8     // (j + index) * REST_UNIT_STRIDE
1672    add.w          t7,       t7,       t4     // (j + index) * REST_UNIT_STRIDE + i
1673
1674    APPLY_FILTER_LASX  t7, a2, xr2
1675    APPLY_FILTER_LASX  t8, t7, xr3
1676    APPLY_FILTER_LASX  t8, t7, xr4
1677    APPLY_FILTER_LASX  t8, t7, xr5
1678    APPLY_FILTER_LASX  t8, t7, xr6
1679    APPLY_FILTER_LASX  t8, t7, xr7
1680    APPLY_FILTER_LASX  t8, t7, xr8
1681    xvssrarni.hu.w xr16,     xr14,      11
1682    xvpermi.d      xr17,     xr16,      0b11011000
1683    xvssrlni.bu.h  xr17,     xr17,      0
1684    xvpermi.d      xr17,     xr17,      0b00001000
1685.endm
1686
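/*
 Vertical pass of the Wiener filter, LASX version. Judging from the register
 usage below, the arguments appear to be (p, stride, hor, filterv, w, h), as
 in the LSX version of this routine; 16 pixels are produced per inner-loop
 iteration, with LSX fallbacks for the width remainder.
*/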
1687function wiener_filter_v_8bpc_lasx
1688    li.w          t6,       -(1 << 18)    // accumulator bias (-round_offset for 8 bpc)
1689
1690    li.w          t8,       REST_UNIT_STRIDE
1691    ld.h          t0,       a3,       0
1692    ld.h          t1,       a3,       2
1693    xvreplgr2vr.w xr2,      t0
1694    xvreplgr2vr.w xr3,      t1
1695    ld.h          t0,       a3,       4
1696    ld.h          t1,       a3,       6
1697    xvreplgr2vr.w xr4,      t0
1698    xvreplgr2vr.w xr5,      t1
1699    ld.h          t0,       a3,       8
1700    ld.h          t1,       a3,       10
1701    xvreplgr2vr.w xr6,      t0
1702    xvreplgr2vr.w xr7,      t1
1703    ld.h          t0,       a3,       12
1704    xvreplgr2vr.w xr8,      t0
1705
1706    andi          t1,       a4,       0xf
1707    sub.w         t0,       a4,       t1    // w-w%16
1708    or            t2,       zero,     zero  // j
1709    or            t4,       zero,     zero
1710    beqz          t0,       .WIENER_FILTER_V_W_LT16_LASX
1711
1712.WIENER_FILTER_V_H_LASX:
1713    andi          t1,       a4,       0xf
1714    add.d         t3,       zero,     a0     // p
1715    or            t4,       zero,     zero   // i
1716
1717.WIENER_FILTER_V_W_LASX:
1718
1719    wiener_filter_v_8bpc_core_lasx
1720
1721    mul.w         t5,       t2,       a1   // j * stride
1722    add.w         t5,       t5,       t4   // j * stride + i
1723    add.d         t3,       a0,       t5
1724    addi.w        t4,       t4,       16
1725    vst           vr17,     t3,       0
1726    bne           t0,       t4,       .WIENER_FILTER_V_W_LASX
1727
1728    beqz          t1,       .WIENER_FILTER_V_W_EQ16_LASX
1729
1730    wiener_filter_v_8bpc_core_lsx
1731
1732    addi.d        t3,       t3,       16
1733    andi          t1,       a4,       0xf
1734
1735.WIENER_FILTER_V_ST_REM_LASX:
1736    vstelm.b      vr17,     t3,       0,    0
1737    vbsrl.v       vr17,     vr17,     1
1738    addi.d        t3,       t3,       1
1739    addi.w        t1,       t1,       -1
1740    bnez          t1,       .WIENER_FILTER_V_ST_REM_LASX
1741.WIENER_FILTER_V_W_EQ16_LASX:
1742    addi.w        t2,       t2,       1
1743    blt           t2,       a5,       .WIENER_FILTER_V_H_LASX
1744    b              .WIENER_FILTER_V_LASX_END
1745
1746.WIENER_FILTER_V_W_LT16_LASX:
1747    andi          t1,       a4,       0xf
1748    add.d         t3,       zero,     a0
1749
1750    wiener_filter_v_8bpc_core_lsx
1751
1752    mul.w         t5,       t2,       a1   // j * stride
1753    add.d         t3,       a0,       t5
1754
1755.WIENER_FILTER_V_ST_REM_1_LASX:
1756    vstelm.b      vr17,     t3,       0,    0
1757    vbsrl.v       vr17,     vr17,     1
1758    addi.d        t3,       t3,       1
1759    addi.w        t1,       t1,       -1
1760    bnez          t1,       .WIENER_FILTER_V_ST_REM_1_LASX
1761
1762    addi.w        t2,       t2,       1
1763    blt           t2,       a5,       .WIENER_FILTER_V_W_LT16_LASX
1764
1765.WIENER_FILTER_V_LASX_END:
1766endfunc
1767
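/*
 Turn the 3x3 box sums into the self-guided filter coefficients. Roughly, per
 element (scalar sketch inferred from the vector code; n = 9, 455 is the
 sgr_one_by_x constant, s is the strength in a4):

     const int p = imax(AA[i] * n - BB[i] * BB[i], 0);
     const int z = (p * s) >> 20;                  // rounded shift
     const int x = dav1d_sgr_x_by_x[imin(z, 255)];
     AA[i] = (x * BB[i] * 455) >> 12;              // rounded shift
     BB[i] = 256 - x;
*/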
1768function boxsum3_sgf_h_8bpc_lasx
1769    addi.d         a0,       a0,        (REST_UNIT_STRIDE<<2)+12  // AA
1770    //addi.d        a0,       a0,        12   // AA
1771    addi.d         a1,       a1,        (REST_UNIT_STRIDE<<1)+6   // BB
1772    //addi.d        a1,       a1,        6    // BB
1773    la.local       t8,       dav1d_sgr_x_by_x
1774    li.w           t6,       455
1775    xvreplgr2vr.w  xr20,     t6
1776    li.w           t6,       255
1777    xvreplgr2vr.w  xr22,     t6
1778    xvaddi.wu      xr21,     xr22,      1  // 256
1779    xvreplgr2vr.w  xr6,      a4
1780    xvldi          xr19,     0x809
1781    addi.w         a2,       a2,        2  // w + 2
1782    addi.w         a3,       a3,        2  // h + 2
1783
1784.LBS3SGF_H_H_LASX:
1785    addi.w         t2,       a2,        0
1786    addi.d         t0,       a0,        -4
1787    addi.d         t1,       a1,        -2
1788
1789.LBS3SGF_H_W_LASX:
1790    addi.w         t2,       t2,        -16
1791    xvld           xr0,      t0,        0   // AA[i]
1792    xvld           xr1,      t0,        32
1793    xvld           xr2,      t1,        0   // BB[i]
1794
1795    xvmul.w        xr4,      xr0,       xr19 // a * n
1796    xvmul.w        xr5,      xr1,       xr19
1797    vext2xv.w.h    xr9,      xr2
1798    xvpermi.q      xr10,     xr2,       0b00000001
1799    vext2xv.w.h    xr10,     xr10
1800    xvmsub.w       xr4,      xr9,       xr9  // p
1801    xvmsub.w       xr5,      xr10,      xr10
1802    xvmaxi.w       xr4,      xr4,       0
1803    xvmaxi.w       xr5,      xr5,       0
1804    xvmul.w        xr4,      xr4,       xr6  // p * s
1805    xvmul.w        xr5,      xr5,       xr6
1806    xvsrlri.w      xr4,      xr4,       20
1807    xvsrlri.w      xr5,      xr5,       20
1808    xvmin.w        xr4,      xr4,       xr22
1809    xvmin.w        xr5,      xr5,       xr22
1810
1811    vpickve2gr.w   t6,       vr4,       0
1812    ldx.bu         t7,       t8,        t6
1813    vinsgr2vr.w    vr7,      t7,        0
1814    vpickve2gr.w   t6,       vr4,       1
1815    ldx.bu         t7,       t8,        t6
1816    vinsgr2vr.w    vr7,      t7,        1
1817    vpickve2gr.w   t6,       vr4,       2
1818    ldx.bu         t7,       t8,        t6
1819    vinsgr2vr.w    vr7,      t7,        2
1820    vpickve2gr.w   t6,       vr4,       3
1821    ldx.bu         t7,       t8,        t6
1822    vinsgr2vr.w    vr7,      t7,        3
1823
1824    xvpickve2gr.w  t6,       xr4,       4
1825    ldx.bu         t7,       t8,        t6
1826    xvinsgr2vr.w   xr7,      t7,        4
1827    xvpickve2gr.w  t6,       xr4,       5
1828    ldx.bu         t7,       t8,        t6
1829    xvinsgr2vr.w   xr7,      t7,        5
1830    xvpickve2gr.w  t6,       xr4,       6
1831    ldx.bu         t7,       t8,        t6
1832    xvinsgr2vr.w   xr7,      t7,        6
1833    xvpickve2gr.w  t6,       xr4,       7
1834    ldx.bu         t7,       t8,        t6
1835    xvinsgr2vr.w   xr7,      t7,        7     // x
1836
1837    vpickve2gr.w   t6,       vr5,       0
1838    ldx.bu         t7,       t8,        t6
1839    vinsgr2vr.w    vr8,      t7,        0
1840    vpickve2gr.w   t6,       vr5,       1
1841    ldx.bu         t7,       t8,        t6
1842    vinsgr2vr.w    vr8,      t7,        1
1843    vpickve2gr.w   t6,       vr5,       2
1844    ldx.bu         t7,       t8,        t6
1845    vinsgr2vr.w    vr8,      t7,        2
1846    vpickve2gr.w   t6,       vr5,       3
1847    ldx.bu         t7,       t8,        t6
1848    vinsgr2vr.w    vr8,      t7,        3
1849
1850    xvpickve2gr.w  t6,       xr5,       4
1851    ldx.bu         t7,       t8,        t6
1852    xvinsgr2vr.w   xr8,      t7,        4
1853    xvpickve2gr.w  t6,       xr5,       5
1854    ldx.bu         t7,       t8,        t6
1855    xvinsgr2vr.w   xr8,      t7,        5
1856    xvpickve2gr.w  t6,       xr5,       6
1857    ldx.bu         t7,       t8,        t6
1858    xvinsgr2vr.w   xr8,      t7,        6
1859    xvpickve2gr.w  t6,       xr5,       7
1860    ldx.bu         t7,       t8,        t6
1861    xvinsgr2vr.w   xr8,      t7,        7     // x
1862
1863    xvmul.w        xr9,      xr7,       xr9   // x * BB[i]
1864    xvmul.w        xr10,     xr8,       xr10
1865    xvmul.w        xr9,      xr9,       xr20  // x * BB[i] * sgr_one_by_x
1866    xvmul.w        xr10,     xr10,      xr20
1867    xvsrlri.w      xr9,      xr9,       12
1868    xvsrlri.w      xr10,     xr10,      12
1869    xvsub.w        xr7,      xr21,      xr7
1870    xvsub.w        xr8,      xr21,      xr8
1871    xvpickev.h     xr12,     xr8,       xr7
1872    xvpermi.d      xr11,     xr12,      0b11011000
1873
1874    xvst           xr9,      t0,        0
1875    xvst           xr10,     t0,        32
1876    xvst           xr11,     t1,        0
1877    addi.d         t0,       t0,        64
1878    addi.d         t1,       t1,        32
1879    blt            zero,     t2,        .LBS3SGF_H_W_LASX
1880
1881    addi.d         a0,       a0,        REST_UNIT_STRIDE<<2
1882    addi.d         a1,       a1,        REST_UNIT_STRIDE<<1
1883    addi.w         a3,       a3,        -1
1884    bnez           a3,       .LBS3SGF_H_H_LASX
1885endfunc
1886
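// 3-tap vertical box sums for the 3x3 self-guided filter: for each column,
// sum three consecutive source rows into the 16-bit `sum` buffer and the
// corresponding squared pixels into the 32-bit `sumsq` buffer, 32 columns per
// inner-loop iteration.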
1887function boxsum3_h_8bpc_lasx
1888    addi.d         a2,      a2,      REST_UNIT_STRIDE
1889    li.w           t0,      1
1890    addi.w         a3,      a3,      -2
1891    addi.w         a4,      a4,      -4
1892.LBS3_H_H_LASX:
1893    alsl.d         t1,      t0,      a1,    1     // sum_v    *sum_v = sum + x
1894    alsl.d         t2,      t0,      a0,    2     // sumsq_v  *sumsq_v = sumsq + x
1895    add.d          t3,      t0,      a2           // s
1896    addi.w         t5,      a3,      0
1897
1898.LBS3_H_W_LASX:
1899    xvld           xr0,     t3,      0
1900    xvld           xr1,     t3,      REST_UNIT_STRIDE
1901    xvld           xr2,     t3,      (REST_UNIT_STRIDE<<1)
1902
1903    xvilvl.b       xr3,     xr1,     xr0
1904    xvhaddw.hu.bu  xr4,     xr3,     xr3
1905    xvilvh.b       xr5,     xr1,     xr0
1906    xvhaddw.hu.bu  xr6,     xr5,     xr5
1907    xvsllwil.hu.bu xr7,     xr2,     0
1908    xvexth.hu.bu   xr8,     xr2
1909    // sum_v
1910    xvadd.h        xr4,     xr4,     xr7  // 0 2
1911    xvadd.h        xr6,     xr6,     xr8  // 1 3
1912    xvor.v         xr9,     xr4,     xr4
1913    xvpermi.q      xr4,     xr6,     0b00000010
1914    xvpermi.q      xr6,     xr9,     0b00110001
1915    xvst           xr4,     t1,      REST_UNIT_STRIDE<<1
1916    xvst           xr6,     t1,      (REST_UNIT_STRIDE<<1)+32
1917    addi.d         t1,      t1,      64
1918    // sumsq
1919    xvmulwev.h.bu  xr9,     xr3,     xr3
1920    xvmulwod.h.bu  xr10,    xr3,     xr3
1921    xvmulwev.h.bu  xr11,    xr5,     xr5
1922    xvmulwod.h.bu  xr12,    xr5,     xr5
1923    xvaddwev.w.hu  xr13,    xr10,    xr9
1924    xvaddwod.w.hu  xr14,    xr10,    xr9
1925    xvaddwev.w.hu  xr15,    xr12,    xr11
1926    xvaddwod.w.hu  xr16,    xr12,    xr11
1927    xvmaddwev.w.hu xr13,    xr7,     xr7
1928    xvmaddwod.w.hu xr14,    xr7,     xr7
1929    xvmaddwev.w.hu xr15,    xr8,     xr8
1930    xvmaddwod.w.hu xr16,    xr8,     xr8
1931    xvilvl.w       xr9,     xr14,    xr13
1932    xvilvh.w       xr10,    xr14,    xr13
1933    xvilvl.w       xr11,    xr16,    xr15
1934    xvilvh.w       xr12,    xr16,    xr15
1935    xvor.v         xr7,     xr9,     xr9
1936    xvor.v         xr8,     xr11,    xr11
1937    xvpermi.q      xr9,     xr10,    0b00000010
1938    xvpermi.q      xr10,    xr7,     0b00110001
1939    xvpermi.q      xr11,    xr12,    0b00000010
1940    xvpermi.q      xr12,    xr8,     0b00110001
1941    xvst           xr9,     t2,      REST_UNIT_STRIDE<<2
1942    xvst           xr11,    t2,      (REST_UNIT_STRIDE<<2)+32
1943    xvst           xr10,    t2,      (REST_UNIT_STRIDE<<2)+64
1944    xvst           xr12,    t2,      (REST_UNIT_STRIDE<<2)+96
1945
1946    addi.d         t2,      t2,      128
1947    addi.w         t5,      t5,      -32
1948    addi.d         t3,      t3,      32
1949    blt            zero,    t5,      .LBS3_H_W_LASX
1950
1951    addi.d         a0,      a0,      REST_UNIT_STRIDE<<2
1952    addi.d         a1,      a1,      REST_UNIT_STRIDE<<1
1953    addi.d         a2,      a2,      REST_UNIT_STRIDE
1954    addi.d         a4,      a4,      -1
1955    blt            zero,    a4,      .LBS3_H_H_LASX
1956endfunc
1957