xref: /aosp_15_r20/external/libdav1d/src/loongarch/cdef.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1/*
2 * Copyright © 2024, VideoLAN and dav1d authors
3 * Copyright © 2024, Loongson Technology Corporation Limited
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/loongarch/loongson_asm.S"
29
30// static int cdef_find_dir_lsx(const pixel *img, const ptrdiff_t stride,
31//                            unsigned *const var HIGHBD_DECL_SUFFIX)
32// param: img: a0, stride: a1, var: a2
33function cdef_find_dir_8bpc_lsx
34    addi.d         sp,    sp,    -64
35    fst.d          f24,   sp,    0
36    fst.d          f25,   sp,    8
37    fst.d          f26,   sp,    16
38    fst.d          f27,   sp,    24
39    fst.d          f28,   sp,    32
40    fst.d          f29,   sp,    40
41    fst.d          f30,   sp,    48
42    fst.d          f31,   sp,    56
43
44    li.d           a3,    128
45    vreplgr2vr.w   vr31,  a3
46
47    // hv: vr0-vr3  diag: vr4-vr11  alt: vr12-vr23
48.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, vr9, vr10, \
49        vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \
50        vr20, vr21, vr22, vr23
51    vxor.v      \i,       \i,       \i
52.endr
53
54.CFDL01:  // 8
55    // 0
56    fld.d          f24,   a0,    0  //img
57    vpermi.w       vr25,  vr24,  0x01
58
59    vsllwil.hu.bu  vr24,  vr24,  0
60    vsllwil.hu.bu  vr24,  vr24,  0
61    vsllwil.hu.bu  vr25,  vr25,  0
62    vsllwil.hu.bu  vr25,  vr25,  0
63
64    vsub.w         vr24,  vr24,  vr31  //px
65    vsub.w         vr25,  vr25,  vr31
66
67    vadd.w         vr4,   vr4,   vr24  //diag[0][y+x]
68    vadd.w         vr5,   vr5,   vr25
69
70    vpackev.w      vr26,  vr25,  vr24
71    vpackod.w      vr27,  vr25,  vr24
72    vpermi.w       vr26,  vr26,  0xd8 //px0246
73    vpermi.w       vr27,  vr27,  0xd8 //px1357
74    vadd.w         vr12,  vr12,  vr26
75    vadd.w         vr12,  vr12,  vr27  //alt[0][y+(x>>1)]
76
77    vhaddw.d.w     vr28,  vr24,  vr24
78    vhaddw.q.d     vr28,  vr28,  vr28
79    vpickve2gr.d   a3,    vr28,  0
80    vhaddw.d.w     vr28,  vr25,  vr25
81    vhaddw.q.d     vr28,  vr28,  vr28
82    vpickve2gr.d   a4,    vr28,  0
83    add.d          a3,    a3,    a4
84    vinsgr2vr.w    vr0,   a3,    0    //hv[0][y]
85
86    vadd.w         vr15,  vr15,  vr26
87    vadd.w         vr15,  vr15,  vr27  //alt[1][3+y-(x>>1)]
88    vpermi.w       vr15,  vr15,  0x1b
89
90    vadd.w         vr9,   vr9,   vr24
91    vadd.w         vr8,   vr8,   vr25
92    vpermi.w       vr8,   vr8,   0x1b
93    vpermi.w       vr9,   vr9,   0x1b  //diag[1][7+y-x]
94
95    vxor.v         vr28,  vr28,  vr28
96    vxor.v         vr29,  vr29,  vr29
97    vadd.w         vr28,  vr28,  vr24
98    vadd.w         vr29,  vr29,  vr25
99    vextrins.w     vr18,  vr28,  0x30
100    vshuf4i.w      vr19,  vr28,  0x39
101    vextrins.w     vr19,  vr29,  0x30
102    vshuf4i.w      vr20,  vr29,  0x39  //alt[2][3-(y>>1)+7]
103    vinsgr2vr.w    vr20,  zero,  3
104
105    vadd.w         vr2,   vr2,   vr24
106    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
107
108    vadd.w         vr21,  vr21,  vr24
109    vadd.w         vr22,  vr22,  vr25  //alt[3][(y>>1)+x]
110
111    add.d          a0,    a0,    a1
112
113    // 1
114    fld.d          f24,   a0,    0  //img
115    vpermi.w       vr25,  vr24,  0x01
116
117    vsllwil.hu.bu  vr24,  vr24,  0
118    vsllwil.hu.bu  vr24,  vr24,  0
119    vsllwil.hu.bu  vr25,  vr25,  0
120    vsllwil.hu.bu  vr25,  vr25,  0
121
122    vsub.w         vr24,  vr24,  vr31  //px
123    vsub.w         vr25,  vr25,  vr31
124
125    vbsrl.v        vr28,  vr4,   4  //1-4
126    vbsrl.v        vr29,  vr5,   4  //5-8
127    vextrins.w     vr28,  vr5,   0x30
128    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
129    vadd.w         vr29,  vr29,  vr25
130    vbsll.v        vr5,   vr29,  4
131    vextrins.w     vr5,   vr28,  0x03
132    vextrins.w     vr6,   vr29,  0x03
133    vextrins.w     vr28,  vr4,   0x30
134    vshuf4i.w      vr4,   vr28,  0x93
135
136    vbsrl.v        vr28,  vr12,  4
137    vextrins.w     vr28,  vr13,  0x30
138    vpackev.w      vr26,  vr25,  vr24
139    vpackod.w      vr27,  vr25,  vr24
140    vpermi.w       vr26,  vr26,  0xd8 //px0246
141    vpermi.w       vr27,  vr27,  0xd8 //px1357
142    vadd.w         vr28,  vr28,  vr26
143    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
144    vextrins.w     vr13,  vr28,  0x03
145    vextrins.w     vr28,  vr12,  0x30
146    vshuf4i.w      vr12,  vr28,  0x93
147
148    vhaddw.d.w     vr28,  vr24,  vr24
149    vhaddw.q.d     vr28,  vr28,  vr28
150    vpickve2gr.d   a3,    vr28,  0
151    vhaddw.d.w     vr28,  vr25,  vr25
152    vhaddw.q.d     vr28,  vr28,  vr28
153    vpickve2gr.d   a4,    vr28,  0
154    add.d          a3,    a3,    a4
155    vinsgr2vr.w    vr0,   a3,    1    //hv[0][y]
156
157    vbsrl.v        vr28,  vr15,  4
158    vextrins.w     vr28,  vr16,  0x30
159    vpermi.w       vr28,  vr28,  0x1b
160    vadd.w         vr28,  vr28,  vr26
161    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
162    vextrins.w     vr16,  vr28,  0x00
163    vextrins.w     vr28,  vr15,  0x00
164    vshuf4i.w      vr15,  vr28,  0x6c
165
166    vbsrl.v        vr28,  vr8,   4     //4321
167    vbsrl.v        vr29,  vr9,   4     //8765
168    vextrins.w     vr28,  vr9,   0x30
169    vpermi.w       vr28,  vr28,  0x1b
170    vpermi.w       vr29,  vr29,  0x1b
171    vadd.w         vr29,  vr29,  vr24
172    vadd.w         vr28,  vr28,  vr25  //diag[1][7+y-x]
173    vextrins.w     vr10,  vr29,  0x00
174    vextrins.w     vr29,  vr28,  0x00
175    vshuf4i.w      vr9,   vr29,  0x6c
176    vextrins.w     vr28,  vr8,   0x00
177    vshuf4i.w      vr8,   vr28,  0x6c
178
179    vbsll.v        vr28,  vr19,  4
180    vextrins.w     vr28,  vr18,  0x03
181    vbsll.v        vr29,  vr20,  4
182    vextrins.w     vr29,  vr19,  0x03
183    vadd.w         vr28,  vr28,  vr24
184    vadd.w         vr29,  vr29,  vr25  //alt[2][3-(y>>1)+7]
185    vextrins.w     vr18,  vr28,  0x30
186    vextrins.w     vr28,  vr29,  0x00
187    vshuf4i.w      vr19,  vr28,  0x39
188    vbsrl.v        vr20,  vr29,  4
189
190    vadd.w         vr2,   vr2,   vr24
191    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
192
193    vadd.w         vr21,  vr21,  vr24
194    vadd.w         vr22,  vr22,  vr25  //alt[3][(y>>1)+x]
195
196    add.d          a0,    a0,    a1
197
198    // 2
199    fld.d          f24,   a0,    0  //img
200    vpermi.w       vr25,  vr24,  0x01
201
202    vsllwil.hu.bu  vr24,  vr24,  0
203    vsllwil.hu.bu  vr24,  vr24,  0
204    vsllwil.hu.bu  vr25,  vr25,  0
205    vsllwil.hu.bu  vr25,  vr25,  0
206
207    vsub.w         vr24,  vr24,  vr31  //px
208    vsub.w         vr25,  vr25,  vr31
209
210    vbsrl.v        vr28,  vr4,   8
211    vbsrl.v        vr29,  vr5,   8
212    vextrins.d     vr28,  vr5,   0x10  //2-5
213    vextrins.d     vr29,  vr6,   0x10  //6-9
214    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
215    vadd.w         vr29,  vr29,  vr25
216    vextrins.d     vr4,   vr28,  0x10
217    vextrins.d     vr5,   vr28,  0x01
218    vextrins.d     vr5,   vr29,  0x10
219    vextrins.d     vr6,   vr29,  0x01
220
221    vbsrl.v        vr28,  vr12,  8
222    vextrins.d     vr28,  vr13,  0x10
223    vpackev.w      vr26,  vr25,  vr24
224    vpackod.w      vr27,  vr25,  vr24
225    vpermi.w       vr26,  vr26,  0xd8 //px0246
226    vpermi.w       vr27,  vr27,  0xd8 //px1357
227    vadd.w         vr28,  vr28,  vr26
228    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
229    vextrins.d     vr12,  vr28,  0x10
230    vextrins.d     vr13,  vr28,  0x01
231
232    vhaddw.d.w     vr28,  vr24,  vr24
233    vhaddw.q.d     vr28,  vr28,  vr28
234    vpickve2gr.d   a3,    vr28,  0
235    vhaddw.d.w     vr28,  vr25,  vr25
236    vhaddw.q.d     vr28,  vr28,  vr28
237    vpickve2gr.d   a4,    vr28,  0
238    add.d          a3,    a3,    a4
239    vinsgr2vr.w    vr0,   a3,    2    //hv[0][y]
240
241    vbsrl.v        vr28,  vr15,  8
242    vextrins.d     vr28,  vr16,  0x10
243    vpermi.w       vr28,  vr28,  0x1b
244    vadd.w         vr28,  vr28,  vr26
245    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
246    vpermi.w       vr28,  vr28,  0x1b
247    vextrins.d     vr15,  vr28,  0x10
248    vextrins.d     vr16,  vr28,  0x01
249
250    vbsrl.v        vr28,  vr8,   8
251    vextrins.d     vr28,  vr9,   0x10
252    vbsrl.v        vr29,  vr9,   8
253    vextrins.d     vr29,  vr10,  0x10
254    vpermi.w       vr28,  vr28,  0x1b  //5432
255    vpermi.w       vr29,  vr29,  0x1b  //9876
256    vadd.w         vr29,  vr29,  vr24
257    vadd.w         vr28,  vr28,  vr25
258    vpermi.w       vr28,  vr28,  0x1b
259    vpermi.w       vr29,  vr29,  0x1b
260    vextrins.d     vr8,   vr28,  0x10
261    vextrins.d     vr9,   vr28,  0x01
262    vextrins.d     vr9,   vr29,  0x10
263    vextrins.d     vr10,  vr29,  0x01  //diag[1][7+y-x]
264
265    vbsrl.v        vr28,  vr18,  8
266    vextrins.d     vr28,  vr19,  0x10 //2345
267    vbsrl.v        vr29,  vr19,  8
268    vextrins.d     vr29,  vr20,  0x10 //6789
269    vadd.w         vr28,  vr28,  vr24
270    vadd.w         vr29,  vr29,  vr25
271    vextrins.d     vr18,  vr28,  0x10
272    vextrins.d     vr19,  vr28,  0x01
273    vextrins.d     vr19,  vr29,  0x10
274    vextrins.d     vr20,  vr29,  0x01   //alt[2][3-(y>>1)+7]
275
276    vadd.w         vr2,   vr2,   vr24
277    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
278
279    vbsrl.v        vr28,  vr21,  4
280    vextrins.w     vr28,  vr22,  0x30  //1234
281    vbsrl.v        vr29,  vr22,  4     //5678
282    vadd.w         vr28,  vr28,  vr24
283    vadd.w         vr29,  vr29,  vr25  //alt[3][(y>>1)+x]
284    vextrins.w     vr23,  vr29,  0x03
285    vextrins.w     vr29,  vr28,  0x33
286    vshuf4i.w      vr22,  vr29,  0x93
287    vextrins.w     vr28,  vr21,  0x30
288    vshuf4i.w      vr21,  vr28,  0x93
289
290    add.d          a0,    a0,    a1
291
292    // 3
293    fld.d          f24,   a0,    0  //img
294    vpermi.w       vr25,  vr24,  0x01
295
296    vsllwil.hu.bu  vr24,  vr24,  0
297    vsllwil.hu.bu  vr24,  vr24,  0
298    vsllwil.hu.bu  vr25,  vr25,  0
299    vsllwil.hu.bu  vr25,  vr25,  0
300
301    vsub.w         vr24,  vr24,  vr31  //px
302    vsub.w         vr25,  vr25,  vr31
303
304    vbsll.v        vr28,  vr5,   4
305    vextrins.w     vr28,  vr4,   0x03 //3456
306    vbsll.v        vr29,  vr6,   4
307    vextrins.w     vr29,  vr5,   0x03 //78910
308    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
309    vadd.w         vr29,  vr29,  vr25
310    vextrins.w     vr4,   vr28,  0x30
311    vextrins.w     vr28,  vr29,  0x00
312    vshuf4i.w      vr5,   vr28,  0x39
313    vbsrl.v        vr6,   vr29,  4
314
315    vbsll.v        vr28,  vr13,  4
316    vextrins.w     vr28,  vr12,  0x03
317    vpackev.w      vr26,  vr25,  vr24
318    vpackod.w      vr27,  vr25,  vr24
319    vpermi.w       vr26,  vr26,  0xd8 //px0246
320    vpermi.w       vr27,  vr27,  0xd8 //px1357
321    vadd.w         vr28,  vr28,  vr26
322    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
323    vextrins.w     vr12,  vr28,  0x30
324    vbsrl.v        vr13,  vr28,  4
325
326    vhaddw.d.w     vr28,  vr24,  vr24
327    vhaddw.q.d     vr28,  vr28,  vr28
328    vpickve2gr.d   a3,    vr28,  0
329    vhaddw.d.w     vr28,  vr25,  vr25
330    vhaddw.q.d     vr28,  vr28,  vr28
331    vpickve2gr.d   a4,    vr28,  0
332    add.d          a3,    a3,    a4
333    vinsgr2vr.w    vr0,   a3,    3    //hv[0][y]
334
335    vbsll.v        vr28,  vr16,  4
336    vextrins.w     vr28,  vr15,  0x03
337    vpermi.w       vr28,  vr28,  0x1b  //6543
338    vadd.w         vr28,  vr28,  vr26
339    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
340    vextrins.w     vr15,  vr28,  0x33
341    vshuf4i.w      vr16,  vr28,  0xc6
342    vinsgr2vr.w    vr16,  zero,  3
343
344    vbsll.v        vr28,  vr9,   4
345    vextrins.w     vr28,  vr8,   0x03  //3456
346    vbsll.v        vr29,  vr10,  4
347    vextrins.w     vr29,  vr9,   0x03  //78910
348    vpermi.w       vr28,  vr28,  0x1b  //6543
349    vpermi.w       vr29,  vr29,  0x1b  //10987
350    vadd.w         vr29,  vr29,  vr24
351    vadd.w         vr28,  vr28,  vr25  //diag[1][7+y-x]
352    vextrins.w     vr8,   vr28,  0x33
353    vextrins.w     vr28,  vr29,  0x33
354    vshuf4i.w      vr9,   vr28,  0xc6
355    vshuf4i.w      vr10,  vr29,  0xc6
356    vinsgr2vr.w    vr10,  zero,  3
357
358    vbsrl.v        vr28,  vr18,  8
359    vextrins.d     vr28,  vr19,  0x10 //2345
360    vbsrl.v        vr29,  vr19,  8
361    vextrins.d     vr29,  vr20,  0x10 //6789
362    vadd.w         vr28,  vr28,  vr24
363    vadd.w         vr29,  vr29,  vr25
364    vextrins.d     vr18,  vr28,  0x10
365    vextrins.d     vr19,  vr28,  0x01
366    vextrins.d     vr19,  vr29,  0x10
367    vextrins.d     vr20,  vr29,  0x01   //alt[2][3-(y>>1)+7]
368
369    vadd.w         vr2,   vr2,   vr24
370    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
371
372    vbsrl.v        vr28,  vr21,  4
373    vextrins.w     vr28,  vr22,  0x30  //1234
374    vbsrl.v        vr29,  vr22,  4     //5678
375    vextrins.w     vr29,  vr23,  0x30
376    vadd.w         vr28,  vr28,  vr24
377    vadd.w         vr29,  vr29,  vr25  //alt[3][(y>>1)+x]
378    vextrins.w     vr23,  vr29,  0x03
379    vextrins.w     vr29,  vr28,  0x33
380    vshuf4i.w      vr22,  vr29,  0x93
381    vextrins.w     vr28,  vr21,  0x30
382    vshuf4i.w      vr21,  vr28,  0x93
383
384    add.d          a0,    a0,    a1
385
386    // 4
387    fld.d          f24,   a0,    0  //img
388    vpermi.w       vr25,  vr24,  0x01
389
390    vsllwil.hu.bu  vr24,  vr24,  0
391    vsllwil.hu.bu  vr24,  vr24,  0
392    vsllwil.hu.bu  vr25,  vr25,  0
393    vsllwil.hu.bu  vr25,  vr25,  0
394
395    vsub.w         vr24,  vr24,  vr31  //px
396    vsub.w         vr25,  vr25,  vr31
397
398    vadd.w         vr5,   vr5,   vr24  //diag[0][y+x]
399    vadd.w         vr6,   vr6,   vr25
400
401    vpackev.w      vr26,  vr25,  vr24
402    vpackod.w      vr27,  vr25,  vr24
403    vpermi.w       vr26,  vr26,  0xd8 //px0246
404    vpermi.w       vr27,  vr27,  0xd8 //px1357
405    vadd.w         vr13,  vr13,  vr26
406    vadd.w         vr13,  vr13,  vr27  //alt[0][y+(x>>1)]
407
408    vhaddw.d.w     vr28,  vr24,  vr24
409    vhaddw.q.d     vr28,  vr28,  vr28
410    vpickve2gr.d   a3,    vr28,  0
411    vhaddw.d.w     vr28,  vr25,  vr25
412    vhaddw.q.d     vr28,  vr28,  vr28
413    vpickve2gr.d   a4,    vr28,  0
414    add.d          a3,    a3,    a4
415    vinsgr2vr.w    vr1,   a3,    0    //hv[0][y]
416
417    vpermi.w       vr16,  vr16,  0x1b
418    vadd.w         vr16,  vr16,  vr26
419    vadd.w         vr16,  vr16,  vr27  //alt[1][3+y-(x>>1)]
420    vpermi.w       vr16,  vr16,  0x1b
421
422    vpermi.w       vr9,   vr9,   0x1b
423    vpermi.w       vr10,  vr10,  0x1b
424    vadd.w         vr10,  vr10,  vr24
425    vadd.w         vr9,   vr9,   vr25
426    vpermi.w       vr9,   vr9,   0x1b
427    vpermi.w       vr10,  vr10,  0x1b  //diag[1][7+y-x]
428
429    vbsrl.v        vr28,  vr18,  4
430    vextrins.w     vr28,  vr19,  0x30  //1234
431    vbsrl.v        vr29,  vr19,  4
432    vextrins.w     vr29,  vr20,  0x30  //5678
433    vadd.w         vr28,  vr28,  vr24
434    vadd.w         vr29,  vr29,  vr25  //alt[2][3-(y>>1)+7]
435    vextrins.w     vr20,  vr29,  0x03
436    vextrins.w     vr29,  vr28,  0x33
437    vshuf4i.w      vr19,  vr29,  0x93
438    vbsll.v        vr18,  vr28,  4
439
440    vadd.w         vr2,   vr2,   vr24
441    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
442
443    vbsrl.v        vr28,  vr21,  8
444    vextrins.d     vr28,  vr22,  0x10
445    vbsrl.v        vr29,  vr22,  8
446    vextrins.d     vr29,  vr23,  0x10
447    vadd.w         vr28,  vr28,  vr24
448    vadd.w         vr29,  vr29,  vr25
449    vextrins.d     vr21,  vr28,  0x10
450    vextrins.d     vr22,  vr28,  0x01
451    vextrins.d     vr22,  vr29,  0x10
452    vextrins.d     vr23,  vr29,  0x01  //alt[3][(y>>1)+x]
453
454    add.d          a0,    a0,    a1
455
456    // 5
457    fld.d          f24,   a0,    0  //img
458    vpermi.w       vr25,  vr24,  0x01
459
460    vsllwil.hu.bu  vr24,  vr24,  0
461    vsllwil.hu.bu  vr24,  vr24,  0
462    vsllwil.hu.bu  vr25,  vr25,  0
463    vsllwil.hu.bu  vr25,  vr25,  0
464
465    vsub.w         vr24,  vr24,  vr31  //px
466    vsub.w         vr25,  vr25,  vr31
467
468    vbsrl.v        vr28,  vr5,   4  //5-8
469    vbsrl.v        vr29,  vr6,   4  //9-12
470    vextrins.w     vr28,  vr6,   0x30
471    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
472    vadd.w         vr29,  vr29,  vr25
473    vextrins.w     vr7,   vr29,  0x03
474    vextrins.w     vr29,  vr28,  0x33
475    vshuf4i.w      vr6,   vr29,  0x93
476    vextrins.w     vr28,  vr5,   0x30
477    vshuf4i.w      vr5,   vr28,  0x93
478
479    vbsrl.v        vr28,  vr13,  4
480    vextrins.w     vr28,  vr14,  0x30
481    vpackev.w      vr26,  vr25,  vr24
482    vpackod.w      vr27,  vr25,  vr24
483    vpermi.w       vr26,  vr26,  0xd8 //px0246
484    vpermi.w       vr27,  vr27,  0xd8 //px1357
485    vadd.w         vr28,  vr28,  vr26
486    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
487    vextrins.w     vr14,  vr28,  0x03
488    vextrins.w     vr28,  vr13,  0x30
489    vshuf4i.w      vr13,  vr28,  0x93
490
491    vhaddw.d.w     vr28,  vr24,  vr24
492    vhaddw.q.d     vr28,  vr28,  vr28
493    vpickve2gr.d   a3,    vr28,  0
494    vhaddw.d.w     vr28,  vr25,  vr25
495    vhaddw.q.d     vr28,  vr28,  vr28
496    vpickve2gr.d   a4,    vr28,  0
497    add.d          a3,    a3,    a4
498    vinsgr2vr.w    vr1,   a3,    1    //hv[0][y]
499
500    vbsrl.v        vr28,  vr16,  4
501    vextrins.w     vr28,  vr17,  0x30
502    vpermi.w       vr28,  vr28,  0x1b
503    vadd.w         vr28,  vr28,  vr26
504    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
505    vextrins.w     vr17,  vr28,  0x00
506    vextrins.w     vr28,  vr16,  0x00
507    vshuf4i.w      vr16,  vr28,  0x6c
508
509    vbsrl.v        vr28,  vr9,   4
510    vbsrl.v        vr29,  vr10,  4
511    vextrins.w     vr28,  vr10,  0x30
512    vpermi.w       vr28,  vr28,  0x1b  //8-5
513    vpermi.w       vr29,  vr29,  0x1b  //12-9
514    vadd.w         vr29,  vr29,  vr24
515    vadd.w         vr28,  vr28,  vr25  //diag[1][7+y-x]
516    vextrins.w     vr11,  vr29,  0x00
517    vextrins.w     vr29,  vr28,  0x00
518    vshuf4i.w      vr10,  vr29,  0x6c
519    vextrins.w     vr28,  vr9,   0x00
520    vshuf4i.w      vr9,   vr28,  0x6c
521
522    vbsrl.v        vr28,  vr18,  4
523    vextrins.w     vr28,  vr19,  0x30  //1234
524    vbsrl.v        vr29,  vr19,  4
525    vextrins.w     vr29,  vr20,  0x30  //5678
526    vadd.w         vr28,  vr28,  vr24
527    vadd.w         vr29,  vr29,  vr25  //alt[2][3-(y>>1)+7]
528    vextrins.w     vr20,  vr29,  0x03
529    vextrins.w     vr29,  vr28,  0x33
530    vshuf4i.w      vr19,  vr29,  0x93
531    vbsll.v        vr18,  vr28,  4
532
533    vadd.w         vr2,   vr2,   vr24
534    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
535
536    vbsrl.v        vr28,  vr21,  8
537    vextrins.d     vr28,  vr22,  0x10
538    vbsrl.v        vr29,  vr22,  8
539    vextrins.d     vr29,  vr23,  0x10
540    vadd.w         vr28,  vr28,  vr24
541    vadd.w         vr29,  vr29,  vr25
542    vextrins.d     vr21,  vr28,  0x10
543    vextrins.d     vr22,  vr28,  0x01
544    vextrins.d     vr22,  vr29,  0x10
545    vextrins.d     vr23,  vr29,  0x01  //alt[3][(y>>1)+x]
546
547    add.d          a0,    a0,    a1
548
549    // 6
550    fld.d          f24,   a0,    0  //img
551    vpermi.w       vr25,  vr24,  0x01
552
553    vsllwil.hu.bu  vr24,  vr24,  0
554    vsllwil.hu.bu  vr24,  vr24,  0
555    vsllwil.hu.bu  vr25,  vr25,  0
556    vsllwil.hu.bu  vr25,  vr25,  0
557
558    vsub.w         vr24,  vr24,  vr31  //px
559    vsub.w         vr25,  vr25,  vr31
560
561    vbsrl.v        vr28,  vr5,   8
562    vbsrl.v        vr29,  vr6,   8
563    vextrins.d     vr28,  vr6,   0x10  //6-9
564    vextrins.d     vr29,  vr7,   0x10  //10-13
565    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
566    vadd.w         vr29,  vr29,  vr25
567    vextrins.d     vr5,   vr28,  0x10
568    vextrins.d     vr6,   vr28,  0x01
569    vextrins.d     vr6,   vr29,  0x10
570    vextrins.d     vr7,   vr29,  0x01
571
572    vbsrl.v        vr28,  vr13,  8
573    vextrins.d     vr28,  vr14,  0x10
574    vpackev.w      vr26,  vr25,  vr24
575    vpackod.w      vr27,  vr25,  vr24
576    vpermi.w       vr26,  vr26,  0xd8 //px0246
577    vpermi.w       vr27,  vr27,  0xd8 //px1357
578    vadd.w         vr28,  vr28,  vr26
579    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
580    vextrins.d     vr13,  vr28,  0x10
581    vextrins.d     vr14,  vr28,  0x01
582
583    vhaddw.d.w     vr28,  vr24,  vr24
584    vhaddw.q.d     vr28,  vr28,  vr28
585    vpickve2gr.d   a3,    vr28,  0
586    vhaddw.d.w     vr28,  vr25,  vr25
587    vhaddw.q.d     vr28,  vr28,  vr28
588    vpickve2gr.d   a4,    vr28,  0
589    add.d          a3,    a3,    a4
590    vinsgr2vr.w    vr1,   a3,    2    //hv[0][y]
591
592    vbsrl.v        vr28,  vr16,  8
593    vextrins.d     vr28,  vr17,  0x10
594    vpermi.w       vr28,  vr28,  0x1b
595    vadd.w         vr28,  vr28,  vr26
596    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
597    vpermi.w       vr28,  vr28,  0x1b
598    vextrins.d     vr16,  vr28,  0x10
599    vextrins.d     vr17,  vr28,  0x01
600
601    vbsrl.v        vr28,  vr9,   8
602    vextrins.d     vr28,  vr10,  0x10
603    vbsrl.v        vr29,  vr10,  8
604    vextrins.d     vr29,  vr11,  0x10
605    vpermi.w       vr28,  vr28,  0x1b  //9876
606    vpermi.w       vr29,  vr29,  0x1b  //13-10
607    vadd.w         vr29,  vr29,  vr24
608    vadd.w         vr28,  vr28,  vr25
609    vpermi.w       vr28,  vr28,  0x1b
610    vpermi.w       vr29,  vr29,  0x1b
611    vextrins.d     vr9,   vr28,  0x10
612    vextrins.d     vr10,  vr28,  0x01
613    vextrins.d     vr10,  vr29,  0x10
614    vextrins.d     vr11,  vr29,  0x01  //diag[1][7+y-x]
615
616    vadd.w         vr18,  vr18,  vr24 //0123
617    vadd.w         vr19,  vr19,  vr25 //4567 alt[2][3-(y>>1)+7]
618
619    vadd.w         vr2,   vr2,   vr24
620    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
621
622    vbsll.v        vr28,  vr22,  4
623    vextrins.w     vr28,  vr21,  0x03  //3456
624    vbsll.v        vr29,  vr23,  4
625    vextrins.w     vr29,  vr22,  0x03  //78910
626    vadd.w         vr28,  vr28,  vr24
627    vadd.w         vr29,  vr29,  vr25  //alt[3][(y>>1)+x]
628    vextrins.w     vr21,  vr28,  0x30
629    vextrins.w     vr28,  vr29,  0x00
630    vshuf4i.w      vr22,  vr28,  0x39
631    vbsrl.v        vr23,  vr29,  4
632
633    add.d          a0,    a0,    a1
634
635    // 7
636    fld.d          f24,   a0,    0  //img
637    vpermi.w       vr25,  vr24,  0x01
638
639    vsllwil.hu.bu  vr24,  vr24,  0
640    vsllwil.hu.bu  vr24,  vr24,  0
641    vsllwil.hu.bu  vr25,  vr25,  0
642    vsllwil.hu.bu  vr25,  vr25,  0
643
644    vsub.w         vr24,  vr24,  vr31  //px
645    vsub.w         vr25,  vr25,  vr31
646
647    vbsll.v        vr28,  vr6,   4
648    vextrins.w     vr28,  vr5,   0x03 //78910
649    vbsll.v        vr29,  vr7,   4
650    vextrins.w     vr29,  vr6,   0x03 //11-14
651    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
652    vadd.w         vr29,  vr29,  vr25
653    vextrins.w     vr5,   vr28,  0x30
654    vextrins.w     vr28,  vr29,  0x00
655    vshuf4i.w      vr6,   vr28,  0x39
656    vbsrl.v        vr7,   vr29,  4
657
658    vbsll.v        vr28,  vr14,  4
659    vextrins.w     vr28,  vr13,  0x03
660    vpackev.w      vr26,  vr25,  vr24
661    vpackod.w      vr27,  vr25,  vr24
662    vpermi.w       vr26,  vr26,  0xd8 //px0246
663    vpermi.w       vr27,  vr27,  0xd8 //px1357
664    vadd.w         vr28,  vr28,  vr26
665    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
666    vextrins.w     vr13,  vr28,  0x30
667    vbsrl.v        vr14,  vr28,  4
668
669    vhaddw.d.w     vr28,  vr24,  vr24
670    vhaddw.q.d     vr28,  vr28,  vr28
671    vpickve2gr.d   a3,    vr28,  0
672    vhaddw.d.w     vr28,  vr25,  vr25
673    vhaddw.q.d     vr28,  vr28,  vr28
674    vpickve2gr.d   a4,    vr28,  0
675    add.d          a3,    a3,    a4
676    vinsgr2vr.w    vr1,   a3,    3    //hv[0][y]
677
678    vbsll.v        vr28,  vr17,  4
679    vextrins.w     vr28,  vr16,  0x03
680    vpermi.w       vr28,  vr28,  0x1b  //10987
681    vadd.w         vr28,  vr28,  vr26
682    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
683    vextrins.w     vr16,  vr28,  0x33
684    vshuf4i.w      vr17,  vr28,  0xc6
685    vinsgr2vr.w    vr17,  zero,  3
686
687    vbsll.v        vr28,  vr10,  4
688    vextrins.w     vr28,  vr9,   0x03  //7-10
689    vbsll.v        vr29,  vr11,  4
690    vextrins.w     vr29,  vr10,  0x03  //11-14
691    vpermi.w       vr28,  vr28,  0x1b  //10-7
692    vpermi.w       vr29,  vr29,  0x1b  //14-11
693    vadd.w         vr29,  vr29,  vr24
694    vadd.w         vr28,  vr28,  vr25  //diag[1][7+y-x]
695    vextrins.w     vr9,   vr28,  0x33
696    vextrins.w     vr28,  vr29,  0x33
697    vshuf4i.w      vr10,  vr28,  0xc6
698    vshuf4i.w      vr11,  vr29,  0xc6
699    vinsgr2vr.w    vr11,  zero,  3
700
701    vadd.w         vr18,  vr18,  vr24 //0123
702    vadd.w         vr19,  vr19,  vr25 //4567 alt[2][3-(y>>1)+7]
703
704    vadd.w         vr2,   vr2,   vr24
705    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
706
707    vbsll.v        vr28,  vr22,  4
708    vextrins.w     vr28,  vr21,  0x03  //3456
709    vbsll.v        vr29,  vr23,  4
710    vextrins.w     vr29,  vr22,  0x03  //78910
711    vadd.w         vr28,  vr28,  vr24
712    vadd.w         vr29,  vr29,  vr25  //alt[3][(y>>1)+x]
713    vextrins.w     vr21,  vr28,  0x30
714    vextrins.w     vr28,  vr29,  0x00
715    vshuf4i.w      vr22,  vr28,  0x39
716    vbsrl.v        vr23,  vr29,  4
717
718    add.d          a0,    a0,    a1
719
720    vxor.v         vr24,  vr24,  vr24  //unsigned cost[8]
721    vxor.v         vr25,  vr25,  vr25
722
723    vmul.w         vr26,  vr0,   vr0
724    vmul.w         vr27,  vr1,   vr1
725    vhaddw.d.w     vr28,  vr26,  vr26
726    vhaddw.q.d     vr28,  vr28,  vr28
727    vpickve2gr.d   a3,    vr28,  0
728    vhaddw.d.w     vr28,  vr27,  vr27
729    vhaddw.q.d     vr28,  vr28,  vr28
730    vpickve2gr.d   a4,    vr28,  0
731    add.d          a3,    a3,    a4
732
733    vmul.w         vr26,  vr2,   vr2
734    vmul.w         vr27,  vr3,   vr3
735    vhaddw.d.w     vr28,  vr26,  vr26
736    vhaddw.q.d     vr28,  vr28,  vr28
737    vpickve2gr.d   a4,    vr28,  0
738    vhaddw.d.w     vr28,  vr27,  vr27
739    vhaddw.q.d     vr28,  vr28,  vr28
740    vpickve2gr.d   a5,    vr28,  0
741    add.d          a4,    a4,    a5
742
743    li.d           a6,    105
744    mul.w          a3,    a3,    a6
745    mul.w          a4,    a4,    a6
746    vinsgr2vr.w    vr24,  a3,    2
747    vinsgr2vr.w    vr25,  a4,    2
748
749    vxor.v         vr30,  vr30,  vr30  //div_table
750    vxor.v         vr31,  vr31,  vr31
751    li.d           t0,    840
752    vinsgr2vr.w    vr30,  t0,    0
753    li.d           t0,    420
754    vinsgr2vr.w    vr30,  t0,    1
755    li.d           t0,    280
756    vinsgr2vr.w    vr30,  t0,    2
757    li.d           t0,    210
758    vinsgr2vr.w    vr30,  t0,    3
759    li.d           t0,    168
760    vinsgr2vr.w    vr31,  t0,    0
761    li.d           t0,    140
762    vinsgr2vr.w    vr31,  t0,    1
763    li.d           t0,    120
764    vinsgr2vr.w    vr31,  t0,    2
765
766    vbsll.v        vr27,  vr7,   4
767    vextrins.w     vr27,  vr6,   0x03
768    vpermi.w       vr27,  vr27,  0x1b
769    vmul.w         vr26,  vr4,   vr4
770    vmadd.w        vr26,  vr27,  vr27
771    vmul.w         vr26,  vr26,  vr30
772    vhaddw.d.w     vr28,  vr26,  vr26
773    vhaddw.q.d     vr28,  vr28,  vr28
774    vpickve2gr.d   a3,    vr28,  0
775    vbsll.v        vr27,  vr6,   4
776    vpermi.w       vr27,  vr27,  0x1b
777    vmul.w         vr26,  vr5,   vr5
778    vmadd.w        vr26,  vr27,  vr27
779    vmul.w         vr26,  vr26,  vr31
780    vextrins.w     vr26,  vr31,  0x33
781    vhaddw.d.w     vr28,  vr26,  vr26
782    vhaddw.q.d     vr28,  vr28,  vr28
783    vpickve2gr.d   a4,    vr28,  0
784    add.d          a3,    a3,    a4   //cost[0]
785
786    vbsll.v        vr27,  vr11,  4
787    vextrins.w     vr27,  vr10,  0x03
788    vpermi.w       vr27,  vr27,  0x1b
789    vmul.w         vr26,  vr8,   vr8
790    vmadd.w        vr26,  vr27,  vr27
791    vmul.w         vr26,  vr26,  vr30
792    vhaddw.d.w     vr28,  vr26,  vr26
793    vhaddw.q.d     vr28,  vr28,  vr28
794    vpickve2gr.d   a4,    vr28,  0
795    vbsll.v        vr27,  vr10,  4
796    vpermi.w       vr27,  vr27,  0x1b
797    vmul.w         vr26,  vr9,   vr9
798    vmadd.w        vr26,  vr27,  vr27
799    vmul.w         vr26,  vr26,  vr31
800    vextrins.w     vr26,  vr31,  0x33
801    vhaddw.d.w     vr28,  vr26,  vr26
802    vhaddw.q.d     vr28,  vr28,  vr28
803    vpickve2gr.d   a5,    vr28,  0
804    add.d          a4,    a4,    a5   //cost[4]
805
806    vpickve2gr.w   a5,    vr5,   3
807    mul.w          a5,    a5,    a5
808    mul.w          a5,    a5,    a6
809    add.w          a3,    a3,    a5
810    vinsgr2vr.w    vr24,  a3,    0
811    vpickve2gr.w   a5,    vr9,   3
812    mul.w          a5,    a5,    a5
813    mul.w          a5,    a5,    a6
814    add.w          a4,    a4,    a5
815    vinsgr2vr.w    vr25,  a4,    0
816
817    //n=0
818    vpickve2gr.w   a3,    vr24,  1
819    vmul.w         vr26,  vr13,  vr13
820    vhaddw.d.w     vr28,  vr26,  vr26
821    vhaddw.q.d     vr28,  vr28,  vr28
822    vpickve2gr.d   a4,    vr28,  0
823    vpickve2gr.w   a5,    vr12,  3
824    mul.w          a5,    a5,    a5
825    add.d          a3,    a3,    a4
826    add.d          a3,    a3,    a5
827    mul.w          a3,    a3,    a6  //*cost_ptr
828
829    vextrins.w     vr29,  vr30,  0x01
830    vextrins.w     vr29,  vr30,  0x13
831    vextrins.w     vr29,  vr31,  0x21
832    vextrins.w     vr29,  vr31,  0x33
833    vbsll.v        vr27,  vr14,  4
834    vpermi.w       vr27,  vr27,  0x1b
835    vmul.w         vr28,  vr12,  vr12
836    vextrins.w     vr28,  vr31,  0x33
837    vmadd.w        vr28,  vr27,  vr27
838    vmul.w         vr26,  vr28,  vr29
839    vhaddw.d.w     vr28,  vr26,  vr26
840    vhaddw.q.d     vr28,  vr28,  vr28
841    vpickve2gr.d   a4,    vr28,  0
842    add.d          a3,    a3,    a4
843    vinsgr2vr.w    vr24,  a3,    1
844
845    //n=1
846    vpickve2gr.w   a3,    vr24,  3
847    vmul.w         vr26,  vr16,  vr16
848    vhaddw.d.w     vr28,  vr26,  vr26
849    vhaddw.q.d     vr28,  vr28,  vr28
850    vpickve2gr.d   a4,    vr28,  0
851    vpickve2gr.w   a5,    vr15,  3
852    mul.w          a5,    a5,    a5
853    add.d          a3,    a3,    a4
854    add.d          a3,    a3,    a5
855    mul.w          a3,    a3,    a6  //*cost_ptr
856
857    vbsll.v        vr27,  vr17,  4
858    vpermi.w       vr27,  vr27,  0x1b
859    vmul.w         vr28,  vr15,  vr15
860    vextrins.w     vr28,  vr31,  0x33
861    vmadd.w        vr28,  vr27,  vr27
862    vmul.w         vr26,  vr28,  vr29
863    vhaddw.d.w     vr28,  vr26,  vr26
864    vhaddw.q.d     vr28,  vr28,  vr28
865    vpickve2gr.d   a4,    vr28,  0
866    add.d          a3,    a3,    a4
867    vinsgr2vr.w    vr24,  a3,    3
868
869    //n=2
870    vpickve2gr.w   a3,    vr25,  1
871    vmul.w         vr26,  vr19,  vr19
872    vhaddw.d.w     vr28,  vr26,  vr26
873    vhaddw.q.d     vr28,  vr28,  vr28
874    vpickve2gr.d   a4,    vr28,  0
875    vpickve2gr.w   a5,    vr18,  3
876    mul.w          a5,    a5,    a5
877    add.d          a3,    a3,    a4
878    add.d          a3,    a3,    a5
879    mul.w          a3,    a3,    a6  //*cost_ptr
880
881    vbsll.v        vr27,  vr20,  4
882    vpermi.w       vr27,  vr27,  0x1b
883    vmul.w         vr28,  vr18,  vr18
884    vextrins.w     vr28,  vr31,  0x33
885    vmadd.w        vr28,  vr27,  vr27
886    vmul.w         vr26,  vr28,  vr29
887    vhaddw.d.w     vr28,  vr26,  vr26
888    vhaddw.q.d     vr28,  vr28,  vr28
889    vpickve2gr.d   a4,    vr28,  0
890    add.d          a3,    a3,    a4
891    vinsgr2vr.w    vr25,  a3,    1
892
893    //n=3
894    vpickve2gr.w   a3,    vr25,  3
895    vmul.w         vr26,  vr22,  vr22
896    vhaddw.d.w     vr28,  vr26,  vr26
897    vhaddw.q.d     vr28,  vr28,  vr28
898    vpickve2gr.d   a4,    vr28,  0
899    vpickve2gr.w   a5,    vr21,  3
900    mul.w          a5,    a5,    a5
901    add.d          a3,    a3,    a4
902    add.d          a3,    a3,    a5
903    mul.w          a3,    a3,    a6  //*cost_ptr
904
905    vbsll.v        vr27,  vr23,  4
906    vpermi.w       vr27,  vr27,  0x1b
907    vmul.w         vr28,  vr21,  vr21
908    vextrins.w     vr28,  vr31,  0x33
909    vmadd.w        vr28,  vr27,  vr27
910    vmul.w         vr26,  vr28,  vr29
911    vhaddw.d.w     vr28,  vr26,  vr26
912    vhaddw.q.d     vr28,  vr28,  vr28
913    vpickve2gr.d   a4,    vr28,  0
914    add.d          a3,    a3,    a4
915    vinsgr2vr.w    vr25,  a3,    3
916
917    xor            a3,    a3,    a3  //best_dir
918    vpickve2gr.w   a4,    vr24,  0   //best_cost
919.BSETDIR01:
920    vpickve2gr.w   a5,    vr24,  1
921    bge            a4,    a5,    .BSETDIR02
922    or             a4,    a5,    a5
923    ori            a3,    zero,  1
924.BSETDIR02:
925    vpickve2gr.w   a5,    vr24,  2
926    bge            a4,    a5,    .BSETDIR03
927    or             a4,    a5,    a5
928    ori            a3,    zero,  2
929.BSETDIR03:
930    vpickve2gr.w   a5,    vr24,  3
931    bge            a4,    a5,    .BSETDIR04
932    or             a4,    a5,    a5
933    ori            a3,    zero,  3
934.BSETDIR04:
935    vpickve2gr.w   a5,    vr25,  0
936    bge            a4,    a5,    .BSETDIR05
937    or             a4,    a5,    a5
938    ori            a3,    zero,  4
939.BSETDIR05:
940    vpickve2gr.w   a5,    vr25,  1
941    bge            a4,    a5,    .BSETDIR06
942    or             a4,    a5,    a5
943    ori            a3,    zero,  5
944.BSETDIR06:
945    vpickve2gr.w   a5,    vr25,  2
946    bge            a4,    a5,    .BSETDIR07
947    or             a4,    a5,    a5
948    ori            a3,    zero,  6
949.BSETDIR07:
950    vpickve2gr.w   a5,    vr25,  3
951    bge            a4,    a5,    .BSETDIREND
952    or             a4,    a5,    a5
953    ori            a3,    zero,  7
954.BSETDIREND:
955    xori           a5,    a3,    4
956    li.d           a1,    4
957    bge            a5,    a1,    .GETCOST01
958    vreplve.w      vr26,  vr24,  a5
959    b              .GETCOST02
960.GETCOST01:
961    vreplve.w      vr26,  vr25,  a5
962.GETCOST02:
963    vpickve2gr.w   a5,    vr26,  0
964    sub.w          a5,    a4,    a5
965    srai.d         a5,    a5,    10
966    st.w           a5,    a2,    0
967    or             a0,    a3,    a3
968
969    fld.d          f24,   sp,    0
970    fld.d          f25,   sp,    8
971    fld.d          f26,   sp,    16
972    fld.d          f27,   sp,    24
973    fld.d          f28,   sp,    32
974    fld.d          f29,   sp,    40
975    fld.d          f30,   sp,    48
976    fld.d          f31,   sp,    56
977    addi.d         sp,    sp,    64
978
979endfunc
980
981.macro cdef_fill tmp, stride, w, h
982    beqz          \h,     700f         //h
983    or            t0,     zero,  zero  //y
984100:
985    or            t1,     zero,  zero  //xx
986    srai.d        s6,     \w,    3     //x
987    beqz          s6,     300f
988200:
989    vstx          vr18,   \tmp,    t1
990    addi.d        t1,     t1,    16
991    addi.d        s6,     s6,    -1
992    bnez          s6,     200b
993300:
994    andi          s6,     \w,    4
995    beqz          s6,     400f
996    fstx.d        f18,    \tmp,    t1
997    addi.d        t1,     t1,    8
998400:
999    andi          s6,     \w,    2
1000    beqz          s6,     500f
1001    fstx.s        f18,    \tmp,    t1
1002    addi.d        t1,     t1,    4
1003500:
1004    andi          s6,     \w,    1
1005    beqz          s6,     600f
1006    li.w          s6,     -16384
1007    stx.h         s6,     \tmp,    t1
1008    addi.d        t1,     t1,    2
1009600:
1010    add.d         \tmp,     \tmp,    \stride
1011    add.d         \tmp,     \tmp,    \stride
1012    addi.d        t0,     t0,    1
1013    blt           t0,     \h,    100b
1014700:
1015.endm
1016
1017const dav1d_cdef_directions
1018.byte   1 * 12 + 0,  2 * 12 + 0
1019.byte   1 * 12 + 0,  2 * 12 - 1
1020.byte   -1 * 12 + 1, -2 * 12 + 2
1021.byte   0 * 12 + 1, -1 * 12 + 2
1022.byte   0 * 12 + 1,  0 * 12 + 2
1023.byte   0 * 12 + 1,  1 * 12 + 2
1024.byte   1 * 12 + 1,  2 * 12 + 2
1025.byte   1 * 12 + 0,  2 * 12 + 1
1026.byte   1 * 12 + 0,  2 * 12 + 0
1027.byte   1 * 12 + 0,  2 * 12 - 1
1028.byte   -1 * 12 + 1, -2 * 12 + 2
1029.byte   0 * 12 + 1, -1 * 12 + 2
1030endconst
1031
1032.macro constrain_vrh in0, in1, in2, tmp0, tmp1, out
1033    vabsd.h        \tmp0, \in0,  vr23   //adiff
1034    vsra.h         \tmp1, \tmp0, \in2
1035    vsub.h         \tmp1, \in1,  \tmp1
1036    vmax.h         \tmp1, vr23,  \tmp1  //imax
1037    vmin.h         \tmp0, \tmp0, \tmp1  //imin
1038
1039    //apply_sign
1040    vslt.h         \tmp1, \in0,  vr23
1041    vandn.v        \in0,  \tmp1, \tmp0
1042    vsigncov.h     \tmp0, \tmp1, \tmp0
1043    vor.v          \out,  \in0,  \tmp0
1044.endm
1045
1046.macro iclip_vrh in0, in1, in2, tmp0, tmp1, out
1047    vmin.h         \tmp0, \in2,  \in0
1048    vslt.h         \in0,  \in0,  \in1
1049    vand.v         \tmp1, \in0,  \in1
1050    vandn.v        \tmp0, \in0,  \tmp0
1051    vor.v          \out,  \tmp1, \tmp0
1052.endm
1053
1054.macro cdef_padding_data
1055    //y < 0
1056    beqz          t7,     90f
10574:
1058    or            t4,     t5,    t5  //data index xx
1059    slli.d        t0,     t4,    1
1060    mul.w         t2,     t7,    s5
1061    slli.d        t2,     t2,    1
1062    add.d         t2,     s4,    t2
1063
1064    sub.d         t3,     t6,    t5  //loop param x
1065    srai.d        t3,     t3,    3
1066    add.d         t3,     t3,    t5
1067    beq           t5,     t3,    6f
10685:  // /8
1069    fldx.d        f18,    a3,    t4
1070    vsllwil.hu.bu vr18,   vr18,  0
1071    vstx          vr18,   t2,    t0
1072    addi.d        t0,     t0,    16
1073    addi.d        t4,     t4,    8
1074
1075    addi.d        t3,     t3,    -1
1076    bne           t5,     t3,    5b
10776:  // &4
1078    sub.d         t1,     t6,    t5
1079    andi          t1,     t1,    4
1080    beqz          t1,     7f
1081
1082    fldx.s        f18,    a3,    t4
1083    vsllwil.hu.bu vr18,   vr18,  0
1084    fstx.d        f18,    t2,    t0
1085    addi.d        t0,     t0,    8
1086    addi.d        t4,     t4,    4
10877:  // &2
1088    sub.d         t1,     t6,    t5
1089    andi          t1,     t1,    2
1090    beqz          t1,     9f
1091
1092    ldx.bu        t1,     a3,    t4
1093    stx.h         t1,     t2,    t0
1094    addi.d        t0,     t0,    2
1095    addi.d        t4,     t4,    1
1096    ldx.bu        t1,     a3,    t4
1097    stx.h         t1,     t2,    t0
1098    addi.d        t0,     t0,    2
1099    addi.d        t4,     t4,    1
11009:
1101    add.d         a3,     a3,    a1
1102    addi.d        t7,     t7,    1
1103    bnez          t7,     4b
1104
110590:
1106    // y < h
1107    beqz          s1,     12f
1108    beqz          t5,     12f
1109    or            t7,     zero,  zero  //y
111010:
1111    or            t4,     t5,    t5  //data index x
111211:
1113    slli.d        t3,     t7,    1
1114    addi.d        t3,     t3,    2
1115    add.d         t3,     t3,    t4
1116    ldx.bu        t1,     a2,    t3
1117
1118    mul.w         t3,     t7,    s5
1119    add.d         t3,     t3,    t4
1120    slli.d        t3,     t3,    1
1121    stx.h         t1,     s4,    t3
1122
1123    addi.d        t4,     t4,    1
1124    bnez          t4,     11b
1125
1126    addi.d        t7,     t7,    1
1127    bne           t7,     s1,    10b
1128
112912:
1130    // y = 0 ; y < h
1131    or            s0,     s4,    s4
1132    beqz          s1,     20f
1133    or            s6,     a0,    a0
1134    or            t7,     zero,  zero  //y
1135    srai.d        t4,     t6,    3    //loop max
113613:
1137    or            t0,     zero,  zero //loop param
1138    or            t3,     t0,    t0   //data index src
1139    or            t1,     t0,    t0   //data index tmp
1140    beqz          t4,     16f
114115:  // /8
1142    fldx.d        f18,    s6,    t3
1143    vsllwil.hu.bu vr18,   vr18,  0
1144    vstx          vr18,   s0,    t1
1145    addi.d        t3,     t3,    8
1146    addi.d        t1,     t1,    16
1147
1148    addi.d        t0,     t0,    1
1149    blt           t0,     t4,    15b
115016:  // &4
1151    andi          t0,     t6,    4
1152    beqz          t0,     17f
1153
1154    fldx.s        f18,    s6,    t3
1155    vsllwil.hu.bu vr18,   vr18,  0
1156    fstx.d        f18,    s0,    t1
1157    addi.d        t3,     t3,    4
1158    addi.d        t1,     t1,    8
115917:  // &2
1160    andi          t0,     t6,    2
1161    beqz          t0,     19f
1162
1163    ldx.bu        t2,     s6,    t3
1164    stx.h         t2,     s0,    t1
1165    addi.d        t3,     t3,    1
1166    addi.d        t1,     t1,    2
1167    ldx.bu        t2,     s6,    t3
1168    stx.h         t2,     s0,    t1
1169    addi.d        t3,     t3,    1
1170    addi.d        t1,     t1,    2
117119: // src+ tmp+
1172    add.d         s6,     s6,    a1
1173    add.d         s0,     s0,    s5
1174    add.d         s0,     s0,    s5
1175
1176    addi.d        t7,     t7,    1
1177    blt           t7,     s1,    13b
1178
1179    // y = h ; y < y_end
118020:
1181    beq           s1,     t8,    27f
1182    or            t7,     s1,    s1  //y
1183    sub.d         t4,     t6,    t5
1184    srai.d        t4,     t4,    3
1185    add.d         t4,     t4,    t5   //8 loop max
118621:
1187    or            t0,     t5,    t5   //xx
1188    or            t3,     t0,    t0   //data index bottom
1189    slli.d        t1,     t0,    1    //data index tmp
1190    beq           t5,     t4,    23f
119122:  // /8
1192    fldx.d        f18,    a4,    t3
1193    vsllwil.hu.bu vr18,   vr18,  0
1194    vstx          vr18,   s0,    t1
1195    addi.d        t3,     t3,    8
1196    addi.d        t1,     t1,    16
1197
1198    addi.d        t0,     t0,    1
1199    blt           t0,     t4,    22b
120023:  // &4
1201    sub.d         t0,     t6,    t5
1202    andi          t0,     t0,    4
1203    beqz          t0,     24f
1204
1205    fldx.s        f18,    a4,    t3
1206    vsllwil.hu.bu vr18,   vr18,  0
1207    fstx.d        f18,    s0,    t1
1208    addi.d        t3,     t3,    4
1209    addi.d        t1,     t1,    8
121024:  // &2
1211    sub.d         t0,     t6,    t5
1212    andi          t0,     t0,    2
1213    beqz          t0,     26f
1214
1215    ldx.bu        t2,     a4,    t3
1216    stx.h         t2,     s0,    t1
1217    addi.d        t3,     t3,    1
1218    addi.d        t1,     t1,    2
1219    ldx.bu        t2,     a4,    t3
1220    stx.h         t2,     s0,    t1
1221    addi.d        t3,     t3,    1
1222    addi.d        t1,     t1,    2
122326: // bottom+ tmp+
1224    add.d         a4,     a4,    a1
1225    add.d         s0,     s0,    s5
1226    add.d         s0,     s0,    s5
1227
1228    addi.d        t7,     t7,    1
1229    blt           t7,     t8,    21b
123027:
1231    // padding end
1232.endm
1233
1234.macro cdef_pri_sec_init
1235    clz.w          t3,    a6
1236    sub.w          t3,    t2,    t3
1237    sub.w          t3,    s7,    t3  //sec_shift
1238
1239    vreplgr2vr.h   vr4,   t0         //pri_tap_k
1240    vreplgr2vr.h   vr9,   a5         //pri_strength
1241    vreplgr2vr.h   vr10,  t1         //pri_shift
1242    vreplgr2vr.h   vr18,  a6         //sec_strength
1243    vreplgr2vr.h   vr19,  t3         //sec_shift
1244
1245    or             t2,    s1,    s1  //dowhile loop param
1246    addi.d         s1,    a7,    2
1247    slli.d         s1,    s1,    1   //directions dir+2
1248    addi.d         s2,    a7,    4
1249    slli.d         s2,    s2,    1   //directions dir+4
1250    slli.d         s3,    a7,    1   //directions dir+0
1251
1252    la.local       t0,    dav1d_cdef_directions
1253    add.d          s1,    t0,    s1
1254    ld.b           a2,    s1,    0  //off01
1255    ld.b           a3,    s1,    1  //off11
1256    add.d          s2,    t0,    s2
1257    ld.b           s1,    s2,    0  //off02
1258    ld.b           s2,    s2,    1  //off12
1259    add.d          s3,    t0,    s3
1260    ld.b           t0,    s3,    0  //off03
1261    ld.b           s3,    s3,    1  //off13
1262
1263    slli.d         a2,    a2,    1
1264    slli.d         a3,    a3,    1
1265    slli.d         s1,    s1,    1
1266    slli.d         s2,    s2,    1
1267    slli.d         t0,    t0,    1
1268    slli.d         s3,    s3,    1
1269.endm
1270
1271.macro cdef_pri_init
1272    vreplgr2vr.h   vr4,   t0         //pri_tap_k
1273    vreplgr2vr.h   vr9,   a5         //pri_strength
1274    vreplgr2vr.h   vr10,  t1         //pri_shift
1275
1276    or             t2,    s1,    s1  //dowhile loop param
1277    addi.d         s1,    a7,    2
1278    slli.d         s1,    s1,    1   //directions dir+2
1279
1280    la.local       t0,    dav1d_cdef_directions
1281    add.d          s1,    t0,    s1
1282    ld.b           a2,    s1,    0  //off01
1283    ld.b           a3,    s1,    1  //off11
1284
1285    slli.d         a2,    a2,    1
1286    slli.d         a3,    a3,    1
1287.endm
1288
1289.macro cdef_sec_init
1290    clz.w          t3,    a6
1291    li.w           t2,    31
1292    sub.w          t3,    t2,    t3
1293    sub.w          t3,    s7,    t3  //sec_shift
1294
1295    vreplgr2vr.h   vr18,  a6         //sec_strength
1296    vreplgr2vr.h   vr19,  t3         //sec_shift
1297
1298    or             t2,    s1,    s1  //dowhile loop param
1299    addi.d         s2,    a7,    4
1300    slli.d         s2,    s2,    1   //directions dir+4
1301    slli.d         s3,    a7,    1   //directions dir+0
1302
1303    la.local       t0,    dav1d_cdef_directions
1304    add.d          s1,    t0,    s1
1305    add.d          s2,    t0,    s2
1306    ld.b           s1,    s2,    0  //off02
1307    ld.b           s2,    s2,    1  //off12
1308    add.d          s3,    t0,    s3
1309    ld.b           t0,    s3,    0  //off03
1310    ld.b           s3,    s3,    1  //off13
1311
1312    slli.d         s1,    s1,    1
1313    slli.d         s2,    s2,    1
1314    slli.d         t0,    t0,    1
1315    slli.d         s3,    s3,    1
1316.endm
1317
1318.macro cdef_process_data_w8 in0, in1
1319    vsub.h       vr11,   vr5,   vr0
1320    vsub.h       vr12,   vr6,   vr0
1321    vsub.h       vr13,   vr7,   vr0
1322    vsub.h       vr14,   vr8,   vr0
1323
1324    constrain_vrh   vr11,  \in0,   \in1,  vr16,  vr17,  vr11
1325    constrain_vrh   vr12,  \in0,   \in1,  vr16,  vr17,  vr12
1326    constrain_vrh   vr13,  \in0,   \in1,  vr16,  vr17,  vr13
1327    constrain_vrh   vr14,  \in0,   \in1,  vr16,  vr17,  vr14
1328.endm
1329
1330.macro cdef_process_data_w4 in0, in1
1331    vpermi.w       vr6,  vr5,  0x44
1332    vpermi.w       vr8,  vr7,  0x44
1333
1334    vsub.h         vr12,  vr6,   vr0
1335    vsub.h         vr14,  vr8,   vr0
1336
1337    constrain_vrh   vr12,  \in0,   \in1,  vr16,  vr17,  vr12
1338    constrain_vrh   vr14,  \in0,   \in1,  vr16,  vr17,  vr14
1339.endm
1340
1341.macro cdef_calc_sum_tapchange_w8
1342    vmul.h         vr1,   vr15,  vr11  //sum
1343    vmadd.h        vr1,   vr15,  vr12  //sum
1344    vand.v         vr15,  vr15,  vr21
1345    vor.v          vr15,  vr15,  vr22
1346    vmadd.h        vr1,   vr15,  vr13  //sum
1347    vmadd.h        vr1,   vr15,  vr14  //sum
1348.endm
1349
1350.macro cdef_calc_sum_tapchange_w4
1351    vmul.h         vr1,   vr15,  vr12  //sum
1352    vand.v         vr15,  vr15,  vr21
1353    vor.v          vr15,  vr15,  vr22
1354    vmadd.h        vr1,   vr15,  vr14  //sum
1355.endm
1356
1357.macro cdef_calc_sum_no_tapchange_w4 in0
1358    vmadd.h        vr1,   \in0,  vr12
1359    vmadd.h        vr1,   \in0,  vr14
1360.endm
1361
1362.macro cdef_calc_sum_no_tapchange_w8 in0
1363    vmadd.h        vr1,   \in0,  vr11  //sum
1364    vmadd.h        vr1,   \in0,  vr12
1365    vmadd.h        vr1,   \in0,  vr13
1366    vmadd.h        vr1,   \in0,  vr14
1367.endm
1368
1369.macro cdef_calc_maxmin_w4
1370    vmin.hu        vr3,   vr6,   vr3
1371    vmax.h         vr2,   vr6,   vr2
1372    vmin.hu        vr3,   vr8,   vr3  //min
1373    vmax.h         vr2,   vr8,   vr2  //max
1374.endm
1375
1376.macro cdef_calc_maxmin_w8
1377    vmin.hu        vr3,   vr5,   vr3
1378    vmax.h         vr2,   vr5,   vr2
1379    vmin.hu        vr3,   vr6,   vr3
1380    vmax.h         vr2,   vr6,   vr2
1381    vmin.hu        vr3,   vr7,   vr3
1382    vmax.h         vr2,   vr7,   vr2
1383    vmin.hu        vr3,   vr8,   vr3  //min
1384    vmax.h         vr2,   vr8,   vr2  //max
1385.endm
1386
1387.macro cdef_calc_dst
1388    vslti.h        vr5,   vr1,   0
1389    vand.v         vr5,   vr5,   vr20
1390    vsub.h         vr5,   vr1,   vr5
1391    vaddi.hu       vr5,   vr5,   8
1392    vsrai.h        vr5,   vr5,   4
1393    vadd.h         vr5,   vr0,   vr5
1394.endm
1395
1396//static NOINLINE void cdef_filter_block_lsx
1397//                    (pixel *dst, const ptrdiff_t dst_stride,
1398//                     const pixel (*left)[2], const pixel *const top,
1399//                     const int pri_strength, const int sec_strength,
1400//                     const int dir, const int damping, const int w, int h,
1401//                     const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
1402// w=4 h=4
1403//param: dst:a0, dst_stride:a1, left:a2, top:a3, bottom:a4, pri_strength:a5
1404//sec_strength:a6, dir:a7, damping:s7, w:s0, h:s1, edges:s2
1405function cdef_filter_block_4x4_8bpc_lsx
1406    ld.w           t0,    sp,    0
1407    ld.w           t1,    sp,    8
1408    addi.d         sp,    sp,    -(64+288)
1409    st.d           s0,    sp,    0
1410    st.d           s1,    sp,    8
1411    st.d           s2,    sp,    16
1412    st.d           s3,    sp,    24
1413    st.d           s4,    sp,    32
1414    st.d           s5,    sp,    40
1415    st.d           s6,    sp,    48
1416    st.d           s7,    sp,    56
1417
1418    li.w           s0,    4         //w
1419    li.w           s1,    4         //h
1420    or             s2,    t1,    t1 //edges
1421    or             s7,    t0,    t0 //damping
1422
1423    li.d           s5,    12         //tmp_stride
1424    addi.d         s4,    sp,    64
1425    slli.d         t0,    s5,    1
1426    addi.d         t0,    t0,    2
1427    slli.d         t0,    t0,    1
1428    add.d          s4,    s4,    t0  //ptr tmp
1429    vxor.v         vr23,  vr23,  vr23
1430    li.w           t2,    1
1431    vreplgr2vr.h   vr20,  t2
1432    vaddi.hu       vr21,  vr20,  2
1433    vaddi.hu       vr22,  vr20,  1
1434
1435    li.w          t0,     -16384
1436    vreplgr2vr.h  vr18,   t0
1437
1438    //padding
1439    li.w          t5,     -2        //x_start
1440    addi.d        t6,     s0,    2  //x_end
1441    li.w          t7,     -2        //y_start
1442    addi.d        t8,     s1,    2  //y_end
1443    li.w          t2,     2
1444
1445    andi          t4,     s2,    4
1446    bnez          t4,     1f
1447
1448    //CDEF_HAVE_TOP
1449    slli.d        t3,     s5,    2
1450    addi.d        t4,     s4,    -4
1451    sub.d         t4,     t4,    t3
1452    addi.d        t3,     s0,    4
1453
1454    cdef_fill     t4,     s5,    t3,     t2
1455
1456    or            t7,     zero,  zero
1457
14581:  //CDEF_HAVE_BOTTOM
1459    andi          t4,     s2,8
1460    bnez          t4,     2f
1461
1462    mul.w         t3,     s1,    s5
1463    slli.d        t3,     t3,  1
1464    add.d         t4,     s4,  t3
1465    addi.d        t4,     t4,    -4
1466    li.d          t3,     8
1467
1468    cdef_fill     t4,     s5,    t3,     t2
1469
1470    addi.d        t8,     t8,    -2
1471
14722:  //CDEF_HAVE_LEFT
1473    andi          t4,     s2,1
1474    bnez          t4,     3f
1475
1476    mul.w         t3,     t7,    s5
1477    slli.d        t3,     t3,    1
1478    add.d         t4,     s4,    t3
1479    addi.d        t4,     t4,    -4
1480    sub.d         t3,     t8,    t7
1481
1482    cdef_fill     t4,     s5,    t2,     t3
1483
1484    or            t5,     zero,  zero
1485
14863:  //CDEF_HAVE_RIGHT
1487    andi          t4,     s2,2
1488    bnez          t4,     40f
1489
1490    mul.w         t3,     t7,    s5
1491    slli.d        t3,     t3,    1
1492    add.d         t4,     s4,    t3
1493    addi.d        t4,     t4,    8
1494    sub.d         t3,     t8,    t7
1495
1496    cdef_fill     t4,     s5,    t2,     t3
1497
1498    addi.d        t6,     t6,    -2
1499
150040:
1501    cdef_padding_data
1502
1503    beqz           a5,    33f
1504
150528:  //if (pri_strength)
1506    li.w           t0,    4
1507    andi           t1,    a5,    1
1508    sub.d          t0,    t0,    t1  //pri_tap
1509
1510    clz.w          t1,    a5
1511    li.d           t2,    31
1512    sub.w          t1,    t2,    t1
1513    sub.w          t1,    s7,    t1
1514
1515    blt            t1,    zero,  281f
1516    or             t1,    t1,    t1
1517    b              282f
1518281:
1519    or             t1,    zero,  zero   //t1: pri_shift
1520282:
1521
1522    beqz           a6,    31f
1523
152429:  //if (sec_strength)
1525    cdef_pri_sec_init
1526
152730:
1528    fld.s          f0,    a0,    0     //px
1529    vsllwil.hu.bu  vr0,   vr0,   0
1530    vpermi.w       vr0,   vr0,   0x44
1531
1532    vxor.v         vr1,   vr1,   vr1   //sum
1533    vor.v          vr2,   vr0,   vr0   //max
1534    vor.v          vr3,   vr0,   vr0   //min
1535    vor.v          vr15,  vr4,   vr4   //pri_tap_k
1536
1537    sub.d          t4,    s4,    a2
1538    sub.d          t5,    s4,    a3
1539
1540    fldx.d         f5,    s4,    a2   //p0_00
1541    fld.d          f6,    t4,    0    //p0_01
1542    fldx.d         f7,    s4,    a3   //p0_10
1543    fld.d          f8,    t5,    0    //p0_11
1544
1545    cdef_process_data_w4 vr9,   vr10
1546    cdef_calc_sum_tapchange_w4
1547    cdef_calc_maxmin_w4
1548
1549    sub.d          t4,    s4,    s1  //tmp[-off02]
1550    sub.d          t5,    s4,    t0  //tmp[-off03]
1551
1552    fldx.d         f5,    s4,    s1   //s0_00
1553    fld.d          f6,    t4,    0    //s0_01
1554    fldx.d         f7,    s4,    t0   //s0_02
1555    fld.d          f8,    t5,    0    //s0_03
1556
1557    cdef_process_data_w4 vr18, vr19
1558    cdef_calc_sum_no_tapchange_w4 vr22
1559    cdef_calc_maxmin_w4
1560
1561    sub.d          t4,    s4,    s2  //tmp[-off12]
1562    sub.d          t5,    s4,    s3  //tmp[-off13]
1563
1564    fldx.d         f5,    s4,    s2   //s0_10
1565    fld.d          f6,    t4,    0    //s0_11
1566    fldx.d         f7,    s4,    s3   //s0_12
1567    fld.d          f8,    t5,    0    //s0_13
1568
1569    cdef_process_data_w4 vr18, vr19
1570    cdef_calc_sum_no_tapchange_w4 vr20
1571    cdef_calc_maxmin_w4
1572
1573    vshuf4i.w      vr5,   vr1,   0x0e
1574    vshuf4i.w      vr6,   vr3,   0x0e
1575    vshuf4i.w      vr7,   vr2,   0x0e
1576    vadd.h         vr1,   vr1,   vr5
1577    vmin.hu        vr3,   vr6,   vr3
1578    vmax.h         vr2,   vr7,   vr2
1579
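    // The folds above combine the two 64-bit halves of the sum/min/max
    // accumulators (vshuf4i 0x0e); cdef_calc_dst presumably applies the C
    // reference rounding px + ((8 + sum - (sum < 0)) >> 4) and iclip_vrh
    // clamps the result into [min, max].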
1580    cdef_calc_dst
1581    iclip_vrh       vr5,   vr3,   vr2,  vr16,  vr17,  vr5
1582
1583    vsrlni.b.h     vr5,   vr5,   0
1584    fst.s          f5,    a0,    0
1585
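    // Advance dst by one row (a1 = dst_stride) and tmp by one row: s5 is the
    // element stride (12) and tmp holds int16_t, so it is added twice
    // (2 * 12 = 24 bytes).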
1586    add.d          a0,    a0,    a1
1587    add.d          s4,    s4,    s5
1588    add.d          s4,    s4,    s5
1589
1590    addi.d         t2,    t2,    -1
1591    blt            zero,  t2,    30b
1592    b              35f
1593
159431:  // pri_strength only
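    // Primary-only path: same primary taps as loop 30, but without the
    // secondary rings and without min/max tracking, since no clamp (iclip_vrh)
    // is applied here.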
1595    cdef_pri_init
1596
159732:
1598    fld.s          f0,    a0,    0     //px
1599    vsllwil.hu.bu  vr0,   vr0,   0
1600    vpermi.w       vr0,   vr0,   0x44
1601
1602    vxor.v         vr1,   vr1,   vr1   //sum
1603    vor.v          vr15,  vr4,   vr4   //pri_tap_k
1604
1605    sub.d          t4,    s4,    a2
1606    sub.d          t5,    s4,    a3
1607
1608    fldx.d         f5,    s4,    a2   //p0_00
1609    fld.d          f6,    t4,    0    //p0_01
1610    fldx.d         f7,    s4,    a3   //p0_10
1611    fld.d          f8,    t5,    0    //p0_11
1612
1613    cdef_process_data_w4 vr9,   vr10
1614    cdef_calc_sum_tapchange_w4
1615
1616    vshuf4i.w      vr5,   vr1,   0x0e
1617    vadd.h         vr1,   vr1,   vr5
1618
1619    cdef_calc_dst
1620
1621    vsrlni.b.h     vr5,   vr5,   0
1622    fst.s          f5,    a0,    0
1623
1624    add.d          a0,    a0,    a1
1625    add.d          s4,    s4,    s5
1626    add.d          s4,    s4,    s5
1627
1628    addi.d         t2,    t2,    -1
1629    blt            zero,  t2,    32b
1630    b              35f
1631
163233:   // sec_strength only
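    // Secondary-only path: just the two secondary rings with fixed tap weights
    // (vr22 = 2, vr20 = 1); as in the primary-only path, no min/max clamp is
    // applied.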
1633    cdef_sec_init
1634
163534:
1636    fld.s          f0,    a0,    0     //px
1637    vsllwil.hu.bu  vr0,   vr0,   0
1638    vpermi.w       vr0,   vr0,   0x44
1639
1640    vxor.v         vr1,   vr1,   vr1   //sum
1641
1642    sub.d          t4,    s4,    s1  //tmp[-off02]
1643    sub.d          t5,    s4,    t0  //tmp[-off03]
1644
1645    fldx.d         f5,    s4,    s1   //s0_00
1646    fld.d          f6,    t4,    0    //s0_01
1647    fldx.d         f7,    s4,    t0   //s0_02
1648    fld.d          f8,    t5,    0    //s0_03
1649
1650    cdef_process_data_w4 vr18, vr19
1651    cdef_calc_sum_no_tapchange_w4 vr22
1652
1653    sub.d          t4,    s4,    s2  //tmp[-off12]
1654    sub.d          t5,    s4,    s3  //tmp[-off13]
1655
1656    fldx.d         f5,    s4,    s2   //s0_10
1657    fld.d          f6,    t4,    0    //s0_11
1658    fldx.d         f7,    s4,    s3   //s0_12
1659    fld.d          f8,    t5,    0    //s0_13
1660
1661    cdef_process_data_w4 vr18, vr19
1662    cdef_calc_sum_no_tapchange_w4 vr20
1663
1664    vshuf4i.w      vr5,   vr1,   0x0e
1665    vadd.h         vr1,   vr1,   vr5
1666
1667    cdef_calc_dst
1668
1669    vsrlni.b.h     vr5,   vr5,   0
1670    fst.s          f5,    a0,    0
1671
1672    add.d          a0,    a0,    a1
1673    add.d          s4,    s4,    s5
1674    add.d          s4,    s4,    s5
1675
1676    addi.d         t2,    t2,    -1
1677    blt            zero,  t2,    34b
1678
167935:
1680    ld.d           s0,    sp,    0
1681    ld.d           s1,    sp,    8
1682    ld.d           s2,    sp,    16
1683    ld.d           s3,    sp,    24
1684    ld.d           s4,    sp,    32
1685    ld.d           s5,    sp,    40
1686    ld.d           s6,    sp,    48
1687    ld.d           s7,    sp,    56
1688    addi.d         sp,    sp,    (64+288)
1689endfunc
1690
1691function cdef_filter_block_4x8_8bpc_lsx
1692    ld.w           t0,    sp,    0
1693    ld.w           t1,    sp,    8
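    // damping (sp + 0) and edges (sp + 8) are the 9th and 10th arguments of
    // dav1d's cdef_fn prototype; with a0-a7 already used they arrive on the
    // caller's stack and must be read before the frame is allocated below.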
1694    addi.d         sp,    sp,    -(64+288)
1695    st.d           s0,    sp,    0
1696    st.d           s1,    sp,    8
1697    st.d           s2,    sp,    16
1698    st.d           s3,    sp,    24
1699    st.d           s4,    sp,    32
1700    st.d           s5,    sp,    40
1701    st.d           s6,    sp,    48
1702    st.d           s7,    sp,    56
1703
1704    li.w           s0,    4         //w
1705    li.w           s1,    8         //h
1706    or             s2,    t1,    t1 //edges
1707    or             s7,    t0,    t0 //damping
1708
1709    li.d           s5,    12         //tmp_stride
1710    addi.d         s4,    sp,    64
1711    slli.d         t0,    s5,    1
1712    addi.d         t0,    t0,    2
1713    slli.d         t0,    t0,    1
1714    add.d          s4,    s4,    t0  //ptr tmp
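    // s4 = &tmp[2 * tmp_stride + 2] in the int16_t scratch buffer at sp + 64
    // (byte offset (2 * 12 + 2) * 2 = 52), leaving a 2-element border above
    // and to the left of the block for the padding.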
1715    vxor.v         vr23,  vr23,  vr23
1716    li.w           t2,    1
1717    vreplgr2vr.h   vr20,  t2
1718    vaddi.hu       vr21,  vr20,  2
1719    vaddi.hu       vr22,  vr20,  1
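    // vr20 = 1, vr22 = 2, vr21 = 3: tap weights kept in registers for the sum
    // macros. vr22/vr20 are passed below for the two secondary rings (the CDEF
    // secondary taps are {2, 1}); vr21 is presumably a primary tap weight.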
1720
1721    li.w          t0,     -16384
1722    vreplgr2vr.h  vr18,   t0
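    // vr18 = -16384 (0xC000) replicated: presumably the padding sentinel used
    // by cdef_fill/cdef_padding_data. Like INT16_MIN in the C reference it is
    // very negative as a signed value and very large as an unsigned one, so
    // padded cells are neutralised by constrain() and by the signed-max /
    // unsigned-min tracking.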
1723
1724    //padding
1725    li.w          t5,     -2        //x_start
1726    addi.d        t6,     s0,    2  //x_end
1727    li.w          t7,     -2        //y_start
1728    addi.d        t8,     s1,    2  //y_end
1729    li.w          t2,     2
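    // Edge handling, as in the C reference: for each edge that is not present
    // in edges (LEFT = 1, RIGHT = 2, TOP = 4, BOTTOM = 8) the matching 2-wide
    // border strip of tmp is pre-filled with the sentinel and the copy window
    // [x_start, x_end) x [y_start, y_end) used by cdef_padding_data is shrunk
    // accordingly.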
1730
1731    andi          t4,     s2,    4
1732    bnez          t4,     1f
1733
1734    //CDEF_HAVE_TOP
1735    slli.d        t3,     s5,    2
1736    addi.d        t4,     s4,    -4
1737    sub.d         t4,     t4,    t3
1738    addi.d        t3,     s0,    4
1739
1740    cdef_fill     t4,     s5,    t3,     t2
1741
1742    or            t7,     zero,  zero
1743
17441:  //CDEF_HAVE_BOTTOM
1745    andi          t4,     s2,    8
1746    bnez          t4,     2f
1747
1748    mul.w         t3,     s1,    s5
1749    slli.d        t3,     t3,    1
1750    add.d         t4,     s4,    t3
1751    addi.d        t4,     t4,    -4
1752    li.d          t3,     8
1753
1754    cdef_fill     t4,     s5,    t3,     t2
1755
1756    addi.d        t8,     t8,    -2
1757
17582:  //CDEF_HAVE_LEFT
1759    andi          t4,     s2,    1
1760    bnez          t4,     3f
1761
1762    mul.w         t3,     t7,    s5
1763    slli.d        t3,     t3,    1
1764    add.d         t4,     s4,    t3
1765    addi.d        t4,     t4,    -4
1766    sub.d         t3,     t8,    t7
1767
1768    cdef_fill     t4,     s5,    t2,     t3
1769
1770    or            t5,     zero,  zero
1771
17723:  //CDEF_HAVE_RIGHT
1773    andi          t4,     s2,    2
1774    bnez          t4,     40f
1775
1776    mul.w         t3,     t7,    s5
1777    slli.d        t3,     t3,    1
1778    add.d         t4,     s4,    t3
1779    addi.d        t4,     t4,    8
1780    sub.d         t3,     t8,    t7
1781
1782    cdef_fill     t4,     s5,    t2,     t3
1783
1784    addi.d        t6,     t6,    -2
1785
178640:
1787    cdef_padding_data
1788
1789    beqz           a5,    33f
1790
179128:  //if (pri_strength)
1792    li.w           t0,    4
1793    andi           t1,    a5,    1
1794    sub.d          t0,    t0,    t1  //pri_tap
1795
1796    clz.w          t1,    a5
1797    li.d           t2,    31
1798    sub.w          t1,    t2,    t1
1799    sub.w          t1,    s7,    t1
1800
1801    blt            t1,    zero,  281f
1802    or             t1,    t1,    t1
1803    b              282f
1804281:
1805    or             t1,    zero,  zero   //t1: pri_shift
1806282:
1807
1808    beqz           a6,    31f
1809
181029:  //if (sec_strength)
1811    cdef_pri_sec_init
1812
181330:
1814    fld.s          f0,    a0,    0     //px
1815    vsllwil.hu.bu  vr0,   vr0,   0
1816    vpermi.w       vr0,   vr0,   0x44
1817
1818    vxor.v         vr1,   vr1,   vr1   //sum
1819    vor.v          vr2,   vr0,   vr0   //max
1820    vor.v          vr3,   vr0,   vr0   //min
1821    vor.v          vr15,  vr4,   vr4   //pri_tap_k
1822
1823    sub.d          t4,    s4,    a2
1824    sub.d          t5,    s4,    a3
1825
1826    fldx.d         f5,    s4,    a2   //p0_00
1827    fld.d          f6,    t4,    0    //p0_01
1828    fldx.d         f7,    s4,    a3   //p0_10
1829    fld.d          f8,    t5,    0    //p0_11
1830
1831    cdef_process_data_w4 vr9,   vr10
1832    cdef_calc_sum_tapchange_w4
1833    cdef_calc_maxmin_w4
1834
1835    sub.d          t4,    s4,    s1  //tmp[-off02]
1836    sub.d          t5,    s4,    t0  //tmp[-off03]
1837
1838    fldx.d         f5,    s4,    s1   //s0_00
1839    fld.d          f6,    t4,    0    //s0_01
1840    fldx.d         f7,    s4,    t0   //s0_02
1841    fld.d          f8,    t5,    0    //s0_03
1842
1843    cdef_process_data_w4 vr18, vr19
1844    cdef_calc_sum_no_tapchange_w4 vr22
1845    cdef_calc_maxmin_w4
1846
1847    sub.d          t4,    s4,    s2  //tmp[-off12]
1848    sub.d          t5,    s4,    s3  //tmp[-off13]
1849
1850    fldx.d         f5,    s4,    s2   //s0_10
1851    fld.d          f6,    t4,    0    //s0_11
1852    fldx.d         f7,    s4,    s3   //s0_12
1853    fld.d          f8,    t5,    0    //s0_13
1854
1855    cdef_process_data_w4 vr18, vr19
1856    cdef_calc_sum_no_tapchange_w4 vr20
1857    cdef_calc_maxmin_w4
1858
1859    vshuf4i.w      vr5,   vr1,   0x0e
1860    vshuf4i.w      vr6,   vr3,   0x0e
1861    vshuf4i.w      vr7,   vr2,   0x0e
1862    vadd.h         vr1,   vr1,   vr5
1863    vmin.hu        vr3,   vr6,   vr3
1864    vmax.h         vr2,   vr7,   vr2
1865
1866    cdef_calc_dst
1867    iclip_vrh       vr5,   vr3,   vr2,  vr16,  vr17,  vr5
1868
1869    vsrlni.b.h     vr5,   vr5,   0
1870    fst.s          f5,    a0,    0
1871
1872    add.d          a0,    a0,    a1
1873    add.d          s4,    s4,    s5
1874    add.d          s4,    s4,    s5
1875
1876    addi.d         t2,    t2,    -1
1877    blt            zero,  t2,    30b
1878    b              35f
1879
188031:  // pri_strength only
1881    cdef_pri_init
1882
188332:
1884    fld.s          f0,    a0,    0     //px
1885    vsllwil.hu.bu  vr0,   vr0,   0
1886    vpermi.w       vr0,   vr0,   0x44
1887
1888    vxor.v         vr1,   vr1,   vr1   //sum
1889    vor.v          vr15,  vr4,   vr4   //pri_tap_k
1890
1891    sub.d          t4,    s4,    a2
1892    sub.d          t5,    s4,    a3
1893
1894    fldx.d         f5,    s4,    a2   //p0_00
1895    fld.d          f6,    t4,    0    //p0_01
1896    fldx.d         f7,    s4,    a3   //p0_10
1897    fld.d          f8,    t5,    0    //p0_11
1898
1899    cdef_process_data_w4 vr9,   vr10
1900    cdef_calc_sum_tapchange_w4
1901
1902    vshuf4i.w      vr5,   vr1,   0x0e
1903    vadd.h         vr1,   vr1,   vr5
1904
1905    cdef_calc_dst
1906
1907    vsrlni.b.h     vr5,   vr5,   0
1908    fst.s          f5,    a0,    0
1909
1910    add.d          a0,    a0,    a1
1911    add.d          s4,    s4,    s5
1912    add.d          s4,    s4,    s5
1913
1914    addi.d         t2,    t2,    -1
1915    blt            zero,  t2,    32b
1916    b              35f
1917
191833:   // sec_strength only
1919    cdef_sec_init
1920
192134:
1922    fld.s          f0,    a0,    0     //px
1923    vsllwil.hu.bu  vr0,   vr0,   0
1924    vpermi.w       vr0,   vr0,   0x44
1925
1926    vxor.v         vr1,   vr1,   vr1   //sum
1927
1928    sub.d          t4,    s4,    s1  //tmp[-off02]
1929    sub.d          t5,    s4,    t0  //tmp[-off03]
1930
1931    fldx.d         f5,    s4,    s1   //s0_00
1932    fld.d          f6,    t4,    0    //s0_01
1933    fldx.d         f7,    s4,    t0   //s0_02
1934    fld.d          f8,    t5,    0    //s0_03
1935
1936    cdef_process_data_w4 vr18, vr19
1937    cdef_calc_sum_no_tapchange_w4 vr22
1938
1939    sub.d          t4,    s4,    s2  //tmp[-off12]
1940    sub.d          t5,    s4,    s3  //tmp[-off13]
1941
1942    fldx.d         f5,    s4,    s2   //s0_10
1943    fld.d          f6,    t4,    0    //s0_11
1944    fldx.d         f7,    s4,    s3   //s0_12
1945    fld.d          f8,    t5,    0    //s0_13
1946
1947    cdef_process_data_w4 vr18, vr19
1948    cdef_calc_sum_no_tapchange_w4 vr20
1949
1950    vshuf4i.w      vr5,   vr1,   0x0e
1951    vadd.h         vr1,   vr1,   vr5
1952
1953    cdef_calc_dst
1954
1955    vsrlni.b.h     vr5,   vr5,   0
1956    fst.s          f5,    a0,    0
1957
1958    add.d          a0,    a0,    a1
1959    add.d          s4,    s4,    s5
1960    add.d          s4,    s4,    s5
1961
1962    addi.d         t2,    t2,    -1
1963    blt            zero,  t2,    34b
1964
196535:
1966    ld.d           s0,    sp,    0
1967    ld.d           s1,    sp,    8
1968    ld.d           s2,    sp,    16
1969    ld.d           s3,    sp,    24
1970    ld.d           s4,    sp,    32
1971    ld.d           s5,    sp,    40
1972    ld.d           s6,    sp,    48
1973    ld.d           s7,    sp,    56
1974    addi.d         sp,    sp,    (64+288)
1975endfunc
1976
1977function cdef_filter_block_8x8_8bpc_lsx
1978    ld.w           t0,    sp,    0
1979    ld.w           t1,    sp,    8
1980    addi.d         sp,    sp,    -(64+288)
1981    st.d           s0,    sp,    0
1982    st.d           s1,    sp,    8
1983    st.d           s2,    sp,    16
1984    st.d           s3,    sp,    24
1985    st.d           s4,    sp,    32
1986    st.d           s5,    sp,    40
1987    st.d           s6,    sp,    48
1988    st.d           s7,    sp,    56
1989
1990    li.w           s0,    8         //w
1991    li.w           s1,    8         //h
1992    or             s2,    t1,    t1 //edges
1993    or             s7,    t0,    t0 //damping
1994
1995    // cdef_filter_block_kernel
1996    li.d           s5,    12         //tmp_stride
1997    addi.d         s4,    sp,    64
1998    slli.d         t0,    s5,    1
1999    addi.d         t0,    t0,    2
2000    slli.d         t0,    t0,    1
2001    add.d          s4,    s4,    t0  //ptr tmp
2002    vxor.v         vr23,  vr23,  vr23
2003    li.w           t2,    1
2004    vreplgr2vr.h   vr20,  t2
2005    vaddi.hu       vr21,  vr20,  2
2006    vaddi.hu       vr22,  vr20,  1
2007
2008    li.w          t0,     -16384
2009    vreplgr2vr.h  vr18,   t0
2010
2011    //padding
2012    li.w          t5,     -2        //x_start
2013    addi.d        t6,     s0,    2  //x_end
2014    li.w          t7,     -2        //y_start
2015    addi.d        t8,     s1,    2  //y_end
2016    li.w          t2,     2
2017
2018    andi          t4,     s2,    4
2019    bnez          t4,     1f
2020
2021    //CDEF_HAVE_TOP
2022    slli.d        t3,     s5,    2
2023    addi.d        t4,     s4,    -4
2024    sub.d         t4,     t4,    t3
2025    addi.d        t3,     s0,    4
2026
2027    cdef_fill     t4,     s5,    t3,     t2
2028
2029    or            t7,     zero,  zero
2030
20311:  //CDEF_HAVE_BOTTOM
2032    andi          t4,     s2,    8
2033    bnez          t4,     2f
2034
2035    mul.w         t3,     s1,    s5
2036    slli.d        t3,     t3,    1
2037    add.d         t4,     s4,    t3
2038    addi.d        t4,     t4,    -4
2039    li.d          t3,     12
2040
2041    cdef_fill     t4,     s5,    t3,    t2
2042
2043    addi.d        t8,     t8,    -2
2044
20452:  //CDEF_HAVE_LEFT
2046    andi          t4,     s2,    1
2047    bnez          t4,     3f
2048
2049    mul.w         t3,     t7,    s5
2050    slli.d        t3,     t3,    1
2051    add.d         t4,     s4,    t3
2052    addi.d        t4,     t4,    -4
2053    sub.d         t3,     t8,    t7
2054    li.d          t2,     2
2055
2056    cdef_fill     t4,     s5,    t2,    t3
2057
2058    or            t5,     zero,  zero
2059
20603:  //CDEF_HAVE_RIGHT
2061    andi          t4,     s2,    2
2062    bnez          t4,     40f
2063
2064    mul.w         t3,     t7,    s5
2065    slli.d        t3,     t3,    1
2066    add.d         t4,     s4,    t3
2067    addi.d        t4,     t4,    16
2068    sub.d         t3,     t8,    t7
2069    li.d          t2,     2
2070
2071    cdef_fill     t4,     s5,    t2,    t3
2072
2073    addi.d        t6,     t6,    -2
2074
207540:
2076    cdef_padding_data
2077
2078    beqz           a5,    33f
2079
208028:  //if (pri_strength)
2081    li.w           t0,    4
2082    andi           t1,    a5,    1
2083    sub.d          t0,    t0,    t1  //pri_tap
2084
2085    // branchless pri_shift = imax(0, damping - ulog2(pri_strength)); cf. the branchy form in the functions above
2086    clz.w          t1,    a5
2087    li.d           t2,    31
2088    sub.w          t3,    t2,    t1
2089    sub.w          t3,    s7,    t3
2090
2091    or             t1,    zero,  zero   //t1: pri_shift
2092    blt            t3,    zero,  281f
2093    or             t1,    t3,    t3
2094281:
2095
2096    beqz           a6,    31f
2097
209829:  //if (sec_strength)
2099    cdef_pri_sec_init
2100
2101301:
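    // 8-wide variant of loop 30: a full row of 8 pixels per iteration, so tmp
    // rows are loaded as 8 x int16 (vld/vldx, 16 bytes), no cross-half folding
    // is needed, and the result is packed back to 8 x u8 and stored with fst.d.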
2102    fld.d          f0,    a0,    0     //px
2103    vsllwil.hu.bu  vr0,   vr0,   0
2104
2105    vxor.v         vr1,   vr1,   vr1   //sum
2106    vor.v          vr2,   vr0,   vr0   //max
2107    vor.v          vr3,   vr0,   vr0   //min
2108    vor.v          vr15,  vr4,   vr4   //pri_tap_k
2109
2110    sub.d          t4,    s4,    a2
2111    sub.d          t5,    s4,    a3
2112
2113    vldx           vr5,  s4,    a2
2114    vld            vr6,  t4,    0
2115    vldx           vr7,  s4,    a3
2116    vld            vr8,  t5,    0
2117
2118    cdef_process_data_w8 vr9, vr10
2119    cdef_calc_sum_tapchange_w8
2120    cdef_calc_maxmin_w8
2121
2122    //s 00-03
2123    sub.d          t4,    s4,    s1  //tmp[-off02]
2124    sub.d          t5,    s4,    t0  //tmp[-off03]
2125
2126    vldx           vr5,  s4,    s1
2127    vld            vr6,  t4,    0
2128    vldx           vr7,  s4,    t0
2129    vld            vr8,  t5,    0
2130
2131    cdef_process_data_w8 vr18, vr19
2132    cdef_calc_sum_no_tapchange_w8 vr22
2133    cdef_calc_maxmin_w8
2134
2135    //s 10-13
2136    sub.d          t4,    s4,    s2  //tmp[-off12]
2137    sub.d          t5,    s4,    s3  //tmp[-off13]
2138
2139    vldx           vr5,  s4,    s2
2140    vld            vr6,  t4,    0
2141    vldx           vr7,  s4,    s3
2142    vld            vr8,  t5,    0
2143
2144    cdef_process_data_w8 vr18, vr19
2145    cdef_calc_sum_no_tapchange_w8 vr20
2146
2147    cdef_calc_maxmin_w8
2148    cdef_calc_dst
2149
2150    iclip_vrh       vr5,   vr3,   vr2,  vr16,  vr17,  vr5
2151
2152    vsrlni.b.h     vr5,   vr5,   0
2153    fst.d          f5,    a0,    0
2154
2155    add.d          a0,    a0,    a1
2156    add.d          s4,    s4,    s5
2157    add.d          s4,    s4,    s5
2158
2159    addi.d         t2,    t2,    -1
2160    blt            zero,  t2,    301b
2161    b              35f
2162
216331:  // pri_strength only
2164    cdef_pri_init
2165
216632:
2167    fld.d          f0,    a0,    0     //px
2168    vsllwil.hu.bu  vr0,   vr0,   0
2169
2170    vxor.v         vr1,   vr1,   vr1   //sum
2171    vor.v          vr15,  vr4,   vr4   //pri_tap_k
2172
2173    sub.d          t4,    s4,    a2
2174    sub.d          t5,    s4,    a3
2175
2176    vldx           vr5,  s4,    a2
2177    vld            vr6,  t4,    0
2178    vldx           vr7,  s4,    a3
2179    vld            vr8,  t5,    0
2180
2181    cdef_process_data_w8 vr9, vr10
2182    cdef_calc_sum_tapchange_w8
2183    cdef_calc_dst
2184
2185    vsrlni.b.h     vr5,   vr5,   0
2186    fst.d          f5,    a0,    0
2187
2188    add.d          a0,    a0,    a1
2189    add.d          s4,    s4,    s5
2190    add.d          s4,    s4,    s5
2191
2192    addi.d         t2,    t2,    -1
2193    blt            zero,  t2,    32b
2194    b              35f
2195
219633:   // sec_strength only
2197    cdef_sec_init
2198
219934:
2200    fld.d          f0,    a0,    0     //px
2201    vsllwil.hu.bu  vr0,   vr0,   0
2202
2203    vxor.v         vr1,   vr1,   vr1   //sum
2204
2205    sub.d          t4,    s4,    s1  //tmp[-off02]
2206    sub.d          t5,    s4,    t0  //tmp[-off03]
2207
2208    vldx           vr5,  s4,    s1
2209    vld            vr6,  t4,    0
2210    vldx           vr7,  s4,    t0
2211    vld            vr8,  t5,    0
2212
2213    cdef_process_data_w8 vr18,  vr19
2214    cdef_calc_sum_no_tapchange_w8 vr22
2215
2216    sub.d          t4,    s4,    s2  //tmp[-off12]
2217    sub.d          t5,    s4,    s3  //tmp[-off13]
2218
2219    vldx           vr5,  s4,    s2
2220    vld            vr6,  t4,    0
2221    vldx           vr7,  s4,    s3
2222    vld            vr8,  t5,    0
2223
2224    cdef_process_data_w8 vr18,  vr19
2225    cdef_calc_sum_no_tapchange_w8 vr20
2226    cdef_calc_dst
2227
2228    vsrlni.b.h     vr5,   vr5,   0
2229    fst.d          f5,    a0,    0
2230
2231    add.d          a0,    a0,    a1
2232    add.d          s4,    s4,    s5
2233    add.d          s4,    s4,    s5
2234
2235    addi.d         t2,    t2,    -1
2236    blt            zero,  t2,    34b
2237
223835:
2239    ld.d           s0,    sp,    0
2240    ld.d           s1,    sp,    8
2241    ld.d           s2,    sp,    16
2242    ld.d           s3,    sp,    24
2243    ld.d           s4,    sp,    32
2244    ld.d           s5,    sp,    40
2245    ld.d           s6,    sp,    48
2246    ld.d           s7,    sp,    56
2247    addi.d         sp,    sp,    (64+288)
2248endfunc
2249
2250