; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
wiener_lshuf5: db  4,  5,  4,  5,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_lshuf7: db  8,  9,  8,  9,  8,  9,  8,  9,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15

wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round:  dd 1049600, 1048832

pb_m10_m9:     times 2 db -10, -9
pb_m6_m5:      times 2 db  -6, -5
pb_m2_m1:      times 2 db  -2, -1
pb_2_3:        times 2 db   2,  3
pb_6_7:        times 2 db   6,  7
pw_1023:       times 2 dw 1023
pw_164_24:     dw 164, 24
pw_455_24:     dw 455, 24
pd_8:          dd 8
pd_25:         dd 25
pd_4096:       dd 4096
pd_34816:      dd 34816
pd_m262128:    dd -262128
pf_256:        dd 256.0

%define pw_256 sgr_lshuf5

cextern pb_0to63

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
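; t0-t6 form a ring of row pointers into the stack line buffer: .hv writes
; each newly filtered row through t0, then the pointers rotate by one slot
; (see the end of .hv), so the vertical pass always sees the most recent
; rows of horizontal intermediates without copying any data.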

INIT_YMM avx2
cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                     w, h, edge, flt
%define base t4-wiener_hshift
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastd   m12, [fltq+ 0] ; x0 x1
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufB]
    add             wd, wd
    vpbroadcastd   m13, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vpbroadcastd   m14, [fltq+16] ; y0 y1
    add           lpfq, wq
    vpbroadcastd   m15, [fltq+20] ; y2 y3
    add           dstq, wq
    vbroadcasti128  m8, [wiener_shufC]
    lea             t1, [rsp+wq+16]
    vbroadcasti128  m9, [wiener_shufD]
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd   m10, [base+wiener_round+t3*4]
    vpbroadcastd   m11, [base+wiener_vshift+t3*4]
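; t3 = pixel_max >> 11 is 0 for 10-bit and 1 for 12-bit input; it selects
; the matching entries from the wiener_hshift/wiener_round/wiener_vshift
; tables above so one code path handles both bit depths.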
    pmullw         m12, m0 ; upshift filter coefs to make the
    pmullw         m13, m0 ; horizontal downshift constant
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
.extend_right:
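; Extend the row past the right edge by replicating the last valid pixel.
; A sketch of the mechanism: r10 holds the negative byte count remaining,
; so (base - r10) is the in-register byte position of the last valid pixel
; pair (the pb_* base constants differ because m3/m4/m5 were loaded at
; different offsets); taking the minimum against the pb_0to63 ramp yields
; pshufb indices that leave in-range lanes untouched and pin out-of-range
; lanes to the edge pixel.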
    movd           xm1, r10d
    vpbroadcastd    m0, [pb_6_7]
    mova            m2, [pb_0to63]
    vpbroadcastb    m1, xm1
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m3, m0
    vpbroadcastd    m0, [pb_m2_m1]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m4, m0
    vpbroadcastd    m0, [pb_m10_m9]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m5, m0
    ret
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq           xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m3, [lpfq+r10] ; avoid accessing memory located
    mova            m4, [lpfq+r10] ; before the start of the buffer
    shufpd          m3, m4, 0x05
    pshufb          m3, [wiener_lshuf7]
    jmp .h_main2
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-8]
.h_main:
    mova            m4, [lpfq+r10+0]
.h_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
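; 7-tap horizontal filter: the wiener_shufA-D shuffles pair up neighboring
; pixels so each pmaddwd applies a packed x0x1 or x2x3 coefficient pair;
; pd_m262128 adds the rounding bias plus a negative offset that keeps the
; packed 16-bit intermediate within signed range for the vertical pass.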
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128] ; (1 << 4) - (1 << 18)
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq           xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-8]
    pshufb          m3, [wiener_lshuf7]
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-8]
.hv_main:
    mova            m4, [lpfq+r10+0]
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128]
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m5, [t3+r10]
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    mova            m4, [t5+r10]
    paddw           m4, [t1+r10]
    psraw           m0, 1
    paddw           m3, m0, [t6+r10]
    mova      [t0+r10], m0
    punpcklwd       m0, m2, m5
    pmaddwd         m0, m15
    punpckhwd       m2, m5
    pmaddwd         m2, m15
    punpcklwd       m1, m3, m4
    pmaddwd         m1, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m2, m10
    paddd           m0, m1
    paddd           m2, m3
    psrad           m0, 5
    psrad           m2, 5
    packusdw        m0, m2
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 32
    jl .hv_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add           dstq, strideq
    ret
.v:
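; Symmetric 7-tap vertical filter: rows are summed in mirrored pairs and
; each pmaddwd applies a packed y0y1 or y2y3 coefficient pair; adding
; wiener_round and multiplying by wiener_vshift (pmulhuw) performs the
; bit-depth-dependent rounding, downshift and clamp to the output range.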
    mov            r10, wq
.v_loop:
    mova            m1, [t4+r10]
    paddw           m1, [t2+r10]
    mova            m2, [t3+r10]
    mova            m4, [t1+r10]
    paddw           m3, m4, [t6+r10]
    paddw           m4, [t5+r10]
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m15
    punpckhwd       m1, m2
    pmaddwd         m1, m15
    punpcklwd       m2, m3, m4
    pmaddwd         m2, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m1, m10
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 32
    jl .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
    ret

cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                                   w, h, edge, flt
%define base t4-wiener_hshift
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m5, [wiener_shufE]
    vpbroadcastw   m11, [fltq+ 2] ; x1
    vbroadcasti128  m6, [wiener_shufB]
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufD]
    add             wd, wd
    vpbroadcastd   m12, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vpbroadcastd    m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add           lpfq, wq
    vpbroadcastw   m13, [fltq+18] ; y1
    add           dstq, wq
    vpbroadcastd   m14, [fltq+20] ; y2 y3
    lea             t1, [rsp+wq+16]
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd    m9, [base+wiener_round+t3*4]
    vpbroadcastd   m10, [base+wiener_vshift+t3*4]
    mova           m15, [wiener_lshuf5]
    pmullw         m11, m0
    pmullw         m12, m0
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t3, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
.v2:
    call .v
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
.v1:
    call .v
    jmp .end
.extend_right:
    movd           xm2, r10d
    vpbroadcastd    m0, [pb_2_3]
    vpbroadcastd    m1, [pb_m6_m5]
    vpbroadcastb    m2, xm2
    psubb           m0, m2
    psubb           m1, m2
    mova            m2, [pb_0to63]
    pminub          m0, m2
    pminub          m1, m2
    pshufb          m3, m0
    pshufb          m4, m1
    ret
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd           xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m4, [lpfq+r10] ; avoid accessing memory located
    mova            m3, [lpfq+r10] ; before the start of the buffer
    palignr         m3, m4, 12
    pshufb          m3, m15
    jmp .h_main
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-4]
.h_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd           xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-4]
    pshufb          m3, m15
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-4]
.hv_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -34
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    mova            m2, [t3+r10]
    paddw           m2, [t1+r10]
    paddd           m1, m3
    mova            m4, [t2+r10]
    punpckhwd       m3, m2, m4
    pmaddwd         m3, m14
    punpcklwd       m2, m4
    mova            m4, [t4+r10]
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    pmaddwd         m2, m14
    psraw           m0, 1
    mova      [t0+r10], m0
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova    [dstq+r10], m0
    add            r10, 32
    jl .hv_loop
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
    add           dstq, strideq
    ret
.v:
    mov            r10, wq
.v_loop:
    mova            m0, [t1+r10]
    paddw           m2, m0, [t3+r10]
    mova            m1, [t2+r10]
    mova            m4, [t4+r10]
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova    [dstq+r10], m0
    add            r10, 32
    jl .v_loop
    ret

cglobal sgr_filter_5x5_16bpc, 4, 14, 16, 400*24+16, dst, stride, left, lpf, \
                                                    w, h, edge, params
%define base r13-pb_m10_m9
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [pb_m10_m9]
    movifnidn       hd, hm
    mov          edged, r7m
    vpbroadcastw    m7, [paramsq+8] ; w0
    add             wd, wd
    vpbroadcastd    m8, [base+pd_8]
    add           lpfq, wq
    vpbroadcastd    m9, [base+pd_25]
    add           dstq, wq
    mova          xm10, [base+sgr_lshuf5]
    lea             t3, [rsp+wq*2+400*12+16]
    vpbroadcastd   m11, [paramsq+0] ; s0
    lea             t4, [rsp+wq+400*20+16]
    vpbroadcastd   m12, [base+pw_164_24]
    lea             t1, [rsp+wq+20]
    vbroadcastss   m13, [base+pf_256]
    neg             wq
    vpbroadcastd   m14, [base+pd_34816] ; (1 << 11) + (1 << 15)
    pxor            m6, m6
    vpbroadcastd   m15, [base+pw_1023]
    psllw           m7, 4
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call .top_fixup
    add             t1, 400*6
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    mov             t0, t2
    dec             hd
    jz .height1
    or           edged, 16
    call .h
.main:
    add           lpfq, strideq
    call .hv
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    test            hd, hd
    jz .odd_height
    call .h
    add           lpfq, strideq
    call .hv
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .h_top
    add           lpfq, strideq
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea             t2, [t1+400*6]
    call .top_fixup
    dec             hd
    jz .no_top_height1
    or           edged, 16
    mov             t0, t1
    mov             t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+ 0]
    movu            m2, [r13+r10+16]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
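; 5-wide horizontal boxsum: palignr/shufpd/shufps build the four shifted
; copies of the source row, a plain paddw chain accumulates the 5-tap sum,
; and each punpcklwd/punpckhwd + pmaddwd(x, x) pair accumulates the
; matching 32-bit sum of squares (sumsq) alongside it.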
    palignr         m2, m5, m4, 2
    paddw           m0, m4, m2
    palignr         m3, m5, m4, 6
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    paddd           m1, m3
    punpckhwd       m3, m4, m5
    pmaddwd         m3, m3
    shufps          m4, m5, q2121
    paddw           m0, m4             ; sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m2, m3
    test         edgeb, 16             ; y > 0
    jz .h_loop_end
    paddw           m0, [t1+r10+400*0]
    paddd           m1, [t1+r10+400*2]
    paddd           m2, [t1+r10+400*4]
.h_loop_end:
    paddd           m1, m5             ; sumsq
    paddd           m2, m4
    mova [t1+r10+400*0], m0
    mova [t1+r10+400*2], m1
    mova [t1+r10+400*4], m2
    add            r10, 32
    jl .h_loop
    ret
.top_fixup:
    lea            r10, [wq-4]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add            r10, 32
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv_main
.hv_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv_main
.hv_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10- 2]
.hv_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    palignr         m3, m5, m4, 2
    paddw           m0, m4, m3
    palignr         m1, m5, m4, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m1, m4, m5
    pmaddwd         m1, m1
    paddd           m2, m1
    punpckhwd       m1, m4, m5
    pmaddwd         m1, m1
    shufps          m4, m5, q2121
    paddw           m0, m4            ; h sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m3, m1
    paddd           m2, m5            ; h sumsq
    paddd           m3, m4
    paddw           m1, m0, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    test            hd, hd
    jz .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10+400*0] ; hv sum
    paddd           m4, [t2+r10+400*2] ; hv sumsq
    paddd           m5, [t2+r10+400*4]
    mova [t0+r10+400*0], m0
    mova [t0+r10+400*2], m2
    mova [t0+r10+400*4], m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9             ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2             ; p
    psubd           m5, m3
    pmulld          m4, m11            ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12            ; b * 164
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
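; z = (p * s + (1 << 19)) >> 20 and x ~ 256 / (z + 1), computed with a float
; reciprocal instead of a table lookup. Note the constant reuse: m12
; (pw_164_24) is used by pmaddwd above to multiply b by 164, while paddw
; adds 24 in the high words, i.e. 24 << 16 = (1 << 19) + (1 << 20), which is
; the rounding bias for the shift plus the +1 of z + 1 (the +164 landing in
; the low words is noise well below the shift). pf_256 (m13) is likewise
; reused: comparing its raw bits as integers is order-preserving for
; positive floats and yields the z < 255 mask, and mulps scales by 256.
; A rough scalar sketch of the block below (illustrative only):
;   z = (p * s + (1 << 19)) >> 20
;   x = z < 255 ? lrintf(256.0f / (z + 1)) : 0   (rcpps approximates this)
;   b = (x * b * 164 + (1 << 11) + (1 << 15)) >> 12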
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13            ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m14            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    mova    [t4+r10+4], m2
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova         [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova         [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .hv_loop
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+r10+400*0], m1
    paddw            m1, m0
    mova [t1+r10+400*2], m4
    paddd            m4, m2
    mova [t1+r10+400*4], m5
    paddd            m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea            r10, [wq-4]
.v_loop:
    mova            m0, [t1+r10+400*0]
    mova            m2, [t1+r10+400*2]
    mova            m3, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    paddw           m0, m0
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0             ; hv sum
    paddd           m4, m2             ; hv sumsq
    paddd           m5, m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9             ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2             ; p
    psubd           m5, m3
    pmulld          m4, m11            ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12            ; b * 164
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13            ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m14            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    mova    [t4+r10+4], m2
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova         [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova         [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
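; Combine the per-row a/b values into 5-6-5 weighted column sums: with
; s = left + center + right, the stored value is center + 5*s
; = 5*left + 6*center + 5*right, hence the "a 565"/"b 565" labels.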
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    add            r10, 32
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    paddw           m3, m0, [t4+r10*1+400*2+ 0]
    paddd           m4, m1, [t3+r10*2+400*4+ 0]
    paddd           m5, m2, [t3+r10*2+400*4+32]
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1              ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2              ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m15
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m0, [dstq+r10]
    mova            m3, [t4+r10*1+400*2+ 0]
    mova            m4, [t3+r10*2+400*4+ 0]
    mova            m5, [t3+r10*2+400*4+32]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2              ; b - a * src + (1 << 7)
    psubd           m4, m3
    psrad           m1, 8
    psrad           m4, 8
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m15
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret

cglobal sgr_filter_3x3_16bpc, 4, 14, 15, 400*42+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [pb_m10_m9]
    add             wd, wd
    movifnidn       hd, hm
    mov          edged, r7m
    vpbroadcastw    m7, [paramsq+10] ; w1
    add           lpfq, wq
    vpbroadcastd    m8, [base+pd_8]
    add           dstq, wq
    vpbroadcastd    m9, [paramsq+ 4] ; s1
    lea             t3, [rsp+wq*2+400*12+8]
    mova          xm10, [base+sgr_lshuf3]
    lea             t4, [rsp+wq+400*32+8]
    vpbroadcastd   m11, [base+pw_455_24]
    lea             t1, [rsp+wq+12]
    vbroadcastss   m12, [base+pf_256]
    neg             wq
    vpbroadcastd   m13, [base+pd_34816]
    pxor            m6, m6
    vpbroadcastd   m14, [base+pw_1023]
    psllw           m7, 4
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    add             t1, 400*6
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+400*6]
.top_fixup_loop:
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add            r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+ 2]
    movu            m2, [r13+r10+18]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10+ 0]
.h_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .hv0_main
.hv0_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10+ 0]
.hv0_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -34
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    paddw           m0, m1, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11            ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12            ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m2
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .hv1_main
.hv1_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10+ 0]
.hv1_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -34
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    palignr         m1, m5, m4, 2
    paddw           m0, m4, m1
    punpcklwd       m2, m4, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m0, m5             ; h sum
    punpcklwd       m1, m5, m6
    pmaddwd         m1, m1
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m1             ; h sumsq
    paddd           m3, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m2
    mova [t2+r10+400*4], m3
    paddd           m4, m8
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    psrld           m5, 4
    pslld           m2, m4, 3
    pslld           m3, m5, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11            ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12            ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11            ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12            ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m2
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11            ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12            ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
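; The 3x3 filter alternates 4-4-4 and 3-4-3 weighted column sums between
; rows: with s = left + center + right, 444 = 4*s and
; 343 = 4*s - (left + right) = 3*left + 4*center + 3*right, matching the
; a[-1]/a[ 0] 444/343 labels below.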
1682    mov            r10, wq
1683.prep_n_loop:
1684    mova           xm0, [t4+r10*1+400*0+0]
1685    paddw          xm0, [t4+r10*1+400*0+4]
1686    paddw          xm2, xm0, [t4+r10*1+400*0+2]
1687    mova            m1, [t3+r10*2+400*0+0]
1688    paddd           m1, [t3+r10*2+400*0+8]
1689    paddd           m3, m1, [t3+r10*2+400*0+4]
1690    psllw          xm2, 2                ; a[-1] 444
1691    pslld           m3, 2                ; b[-1] 444
1692    psubw          xm2, xm0              ; a[-1] 343
1693    psubd           m3, m1               ; b[-1] 343
1694    mova [t4+r10*1+400* 4], xm2
1695    mova [t3+r10*2+400* 8], m3
1696    mova           xm0, [t4+r10*1+400*2+0]
1697    paddw          xm0, [t4+r10*1+400*2+4]
1698    paddw          xm2, xm0, [t4+r10*1+400*2+2]
1699    mova            m1, [t3+r10*2+400*4+0]
1700    paddd           m1, [t3+r10*2+400*4+8]
1701    paddd           m3, m1, [t3+r10*2+400*4+4]
1702    psllw          xm2, 2                 ; a[ 0] 444
1703    pslld           m3, 2                 ; b[ 0] 444
1704    mova [t4+r10*1+400* 6], xm2
1705    mova [t3+r10*2+400*12], m3
1706    psubw          xm2, xm0               ; a[ 0] 343
1707    psubd           m3, m1                ; b[ 0] 343
1708    mova [t4+r10*1+400* 8], xm2
1709    mova [t3+r10*2+400*16], m3
1710    add            r10, 16
1711    jl .prep_n_loop
1712    ret
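; The neighbor stage weights a (x) and b' over a 3x3 window with the
; kernel
;   3 4 3
;   4 4 4   (weights sum to 32)
;   3 4 3
; split per source row into "444" = 4*(l+c+r) and "343" = 444 - (l+r):
; each row serves as the 343 term of the output rows above and below it
; and as the 444 term of its own row, so only one 444 and two 343 ring
; slots per plane need priming here. One row, as a scalar sketch
; (illustrative names):
;   sum444 = 4 * (v[x-1] + v[x] + v[x+1]);
;   sum343 = sum444 - (v[x-1] + v[x+1]);    /* 3*l + 4*c + 3*r */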
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    mova            m3, [t4+r10*1+400*0+0]
    paddw           m3, [t4+r10*1+400*0+4]
    paddw           m1, m3, [t4+r10*1+400*0+2]
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*4]
    paddw           m3, [t4+r10*1+400*6]
    mova [t4+r10*1+400*4], m2
    mova [t4+r10*1+400*6], m1
    mova            m4, [t3+r10*2+400*0+0]
    paddd           m4, [t3+r10*2+400*0+8]
    paddd           m1, m4, [t3+r10*2+400*0+4]
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400* 8+ 0]
    paddd           m4, [t3+r10*2+400*12+ 0]
    mova [t3+r10*2+400* 8+ 0], m2
    mova [t3+r10*2+400*12+ 0], m1
    mova            m5, [t3+r10*2+400*0+32]
    paddd           m5, [t3+r10*2+400*0+40]
    paddd           m1, m5, [t3+r10*2+400*0+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400* 8+32]
    paddd           m5, [t3+r10*2+400*12+32]
    mova [t3+r10*2+400* 8+32], m2
    mova [t3+r10*2+400*12+32], m1
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2               ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n0_loop
    add           dstq, strideq
    ret
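; The per-pixel output math in .n0/.n1, as a scalar sketch (names are
; illustrative; w1' is the w1-derived factor the prologue leaves in m7):
;   t   = (sum(b') - sum(x) * src + (1 << 8)) >> 9;   /* kernel sums */
;   dst = CLAMP(src + ((t * w1' + (1 << 14)) >> 15), 0, max);
; pmulhrsw implements the (t * w1' + (1 << 14)) >> 15 rounding multiply,
; and the pmaxsw/pminsw pair clamps against zero (m6) and the clip
; maximum kept in m14.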
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m3, [t4+r10*1+400*2+0]
    paddw           m3, [t4+r10*1+400*2+4]
    paddw           m1, m3, [t4+r10*1+400*2+2]
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*6]
    paddw           m3, [t4+r10*1+400*8]
    mova [t4+r10*1+400*6], m1
    mova [t4+r10*1+400*8], m2
    mova            m4, [t3+r10*2+400*4+0]
    paddd           m4, [t3+r10*2+400*4+8]
    paddd           m1, m4, [t3+r10*2+400*4+4]
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400*12+ 0]
    paddd           m4, [t3+r10*2+400*16+ 0]
    mova [t3+r10*2+400*12+ 0], m1
    mova [t3+r10*2+400*16+ 0], m2
    mova            m5, [t3+r10*2+400*4+32]
    paddd           m5, [t3+r10*2+400*4+40]
    paddd           m1, m5, [t3+r10*2+400*4+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400*12+32]
    paddd           m5, [t3+r10*2+400*16+32]
    mova [t3+r10*2+400*12+32], m1
    mova [t3+r10*2+400*16+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2               ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret

cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [pb_m10_m9]
    add             wd, wd
    movifnidn       hd, hm
    mov          edged, r7m
    add           lpfq, wq
    vpbroadcastd   m15, [paramsq+8] ; w0 w1
    add           dstq, wq
    vpbroadcastd   m13, [paramsq+0] ; s0
    lea             t3, [rsp+wq*2+400*24+8]
    vpbroadcastd   m14, [paramsq+4] ; s1
    lea             t4, [rsp+wq+400*52+8]
    vpbroadcastd    m9, [base+pd_8]
    lea             t1, [rsp+wq+12]
    vpbroadcastd   m10, [base+pd_34816]
    neg             wq
    vbroadcastss   m11, [base+pf_256]
    pxor            m7, m7
    vpbroadcastd   m12, [base+pw_455_24]
    psllw          m15, 2
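; Buffer and register roles for the mix filter, as set up above (the
; buffer roles are inferred from their uses below; treat this summary as
; a guide, not a spec):
;   m13 = s0 (5x5 strength), m14 = s1 (3x3 strength)
;   m15 = (w0, w1) word pairs, pre-shifted left by 2
;   m9  = pd_8, m10 = pd_34816, m11 = 256.0f, m12 = pw_455_24, m7 = 0
;   t1/t2 = per-row box sum ring: sum5/sumsq5 at +400*0/2/4,
;           sum3/sumsq3 at +400*6/8/10
;   t3 = b' planes (dwords), t4 = x planes (words), the 5x5 pair at
;        offset 0 and the 3x3 pairs above it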
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
    add             t1, 400*12
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10 ; below
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+400*12]
.top_fixup_loop:
    mova            m0, [t1+r10+400* 0]
    mova            m1, [t1+r10+400* 2]
    mova            m2, [t1+r10+400* 4]
    paddw           m0, m0
    mova            m3, [t1+r10+400* 6]
    paddd           m1, m1
    mova            m4, [t1+r10+400* 8]
    paddd           m2, m2
    mova            m5, [t1+r10+400*10]
    mova [t2+r10+400* 0], m0
    mova [t2+r10+400* 2], m1
    mova [t2+r10+400* 4], m2
    mova [t2+r10+400* 6], m3
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    add            r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
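; Without LR_HAVE_TOP, row 0 stands in for the rows above it: the loop
; above doubles the 5x5 sums into the second ring slot (row 0 counted
; twice) and copies the 3x3 sums as-is, mirroring the shared 5x5
; top_fixup used on the LR_HAVE_TOP path; .v0 then consumes the result.
; Roughly:
;   sum5[-1][x] = 2 * sum5[0][x];    /* sumsq5 likewise */
;   sum3[-1][x] = sum3[0][x];        /* sumsq3 likewise */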
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.h_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6             ; sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m8, m1             ; sum5
    paddd           m5, m2             ; sumsq5
    paddd           m6, m3
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    add            r10, 32
    jl .h_loop
    ret
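; One horizontal pass serves both radii: the three center taps give the
; 3x3 sums, and the two outer taps are then folded in for the 5x5 sums.
; Per pixel, with v the (edge-extended) source row, this is:
;   sum3   = v[x-1] + v[x] + v[x+1];
;   sumsq3 = v[x-1]^2 + v[x]^2 + v[x+1]^2;
;   sum5   = sum3 + v[x-2] + v[x+2];
;   sumsq5 = sumsq3 + v[x-2]^2 + v[x+2]^2;
; and both sets land in t1 for the vertical passes to combine.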
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv0_main
.hv0_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10- 2]
.hv0_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -36
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv0_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; h sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6             ; h sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    paddw           m8, m1             ; h sum5
    paddd           m5, m2             ; h sumsq5
    paddd           m6, m3
    mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
    mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd
    mova [t3+r10*2+400*0+40], m6
    paddw           m8, [t1+r10+400* 0]
    paddd           m5, [t1+r10+400* 2]
    paddd           m6, [t1+r10+400* 4]
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    paddw           m0, m1, [t1+r10+400* 6]
    paddd           m4, m2, [t1+r10+400* 8]
    paddd           m5, m3, [t1+r10+400*10]
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20             ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11            ; 256 / (z3 + 1)
    mulps           m3, m11
    psrld           m4, 24             ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x3
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .hv0_loop
    ret
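; Even rows: the 5x5 filter only produces a/b every other row, so .hv0
; merely accumulates this row's sum5/sumsq5 into t1 (parking an unsummed
; copy in t3 for the odd-height case), while a full 3x3 a/b is made for
; every row: the three newest sum3 rows are combined and pushed through
; the same z3 -> x3 -> b3' mapping as in the 3x3-only filter (n = 9,
; strength s1, 455 as the one_by_x factor).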
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv1_main
.hv1_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10- 2]
.hv1_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -36
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv1_have_right:
    palignr         m6, m5, m4, 2
    palignr         m3, m5, m4, 4
    paddw           m2, m6, m3
    punpcklwd       m0, m6, m3
    pmaddwd         m0, m0
    punpckhwd       m6, m3
    pmaddwd         m6, m6
    palignr         m3, m5, m4, 6
    paddw           m2, m3             ; h sum3
    punpcklwd       m1, m3, m7
    pmaddwd         m1, m1
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    paddd           m0, m1             ; h sumsq3
    shufpd          m1, m4, m5, 0x05
    punpckhwd       m5, m4, m1
    paddw           m8, m4, m1
    pmaddwd         m5, m5
    punpcklwd       m4, m1
    pmaddwd         m4, m4
    paddd           m6, m3
    paddw           m1, m2, [t2+r10+400* 6]
    mova [t2+r10+400* 6], m2
    paddw           m8, m2             ; h sum5
    paddd           m2, m0, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 8], m0
    mova [t2+r10+400*10], m6
    paddd           m4, m0             ; h sumsq5
    paddd           m5, m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m0, m2, 3
    pslld           m6, m3, 3
    paddd           m2, m0             ; ((a3 + 8) >> 4) * 9
    paddd           m3, m6
    psrlw           m6, m1, 1
    pavgw           m6, m7             ; (b3 + 2) >> 2
    punpcklwd       m0, m6, m7
    pmaddwd         m0, m0
    punpckhwd       m6, m7
    pmaddwd         m6, m6
    pmaxud          m2, m0
    psubd           m2, m0             ; p3
    pmaxud          m3, m6
    psubd           m3, m6
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmulld          m2, m14            ; p3 * s1
    pmulld          m3, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddw           m2, m12
    paddw           m3, m12
    psrld           m2, 20             ; z3 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m6, m2             ; 1 / (z3 + 1)
    rcpps           m7, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m6, m11            ; 256 / (z3 + 1)
    mulps           m7, m11
    psrld           m2, 24             ; z3 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m6, m6
    cvtps2dq        m7, m7
    pminsw          m6, m2             ; x3
    pminsw          m7, m3
    pmulld          m0, m6
    packssdw        m6, m7
    pmulld          m7, m1
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m7, m10
    psrld           m0, 12
    psrld           m7, 12
    paddw           m1, m8, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m8
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    mova         [t4+r10*1+400*4+ 4], m6
    mova         [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova         [t3+r10*2+400*8+24], xm7
    vextracti128 [t3+r10*2+400*8+56], m7, 1
    vpbroadcastd    m4, [base+pd_25]
    vpbroadcastd    m6, [base+pw_164_24]
    pxor            m7, m7
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7         ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4             ; p5
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13            ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m6             ; b5 * 164
    pmaddwd         m1, m6
    paddw           m2, m6
    paddw           m3, m6
    psrld           m2, 20             ; z5 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m4, m2             ; 1 / (z5 + 1)
    rcpps           m5, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m4, m11            ; 256 / (z5 + 1)
    mulps           m5, m11
    psrld           m2, 24             ; z5 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2             ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m4
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
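; Odd rows do double duty: first the 3x3 a/b for this row (x3 to
; t4+400*4, b3' to t3+400*8), then the accumulated 5x5 sums - the two
; stored row pairs plus the current row - are reduced with the 5x5
; constants. The 5x5-specific part, sketched per element:
;   aa5 = ((a5 + 8) >> 4) * 25;
;   bb5 = (b5 + 2) >> 2;
;   p5  = MAX(aa5 - bb5 * bb5, 0);
;   z5 + 1 comes from the same high-word add trick, here via pw_164_24;
;   b5' = (x5 * b5 * 164 + (1 << 11) + (1 << 15)) >> 12;
; m7 is repurposed as an rcpps destination in the 3x3 part, hence the
; pxor re-zeroing it before the 5x5 math.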
.v0: ; vertical boxsums + ab3 (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400* 6]
    mova            m4, [t1+r10+400* 8]
    mova            m5, [t1+r10+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20             ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11            ; 256 / (z3 + 1)
    mulps           m3, m11
    psrld           m4, 24             ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x3
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova            m3, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    mova [t3+r10*2+400*8+ 8], m3
    mova [t3+r10*2+400*0+ 8], m4
    mova [t3+r10*2+400*0+40], m5
    paddw           m3, m3 ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova [t1+r10+400*0], m3
    mova [t1+r10+400*2], m4
    mova [t1+r10+400*4], m5
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .v0_loop
    ret
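; .v0 is the even-row pass for when no new input row is read (bottom
; extension and odd heights): the missing row is synthesized by doubling
; the newest stored sums - directly for the 3x3 ring, and for the 5x5
; sums after parking a clean single-row copy in t3 for .v1 to pick up.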
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova            m4, [t1+r10+400* 6]
    mova            m5, [t1+r10+400* 8]
    mova            m6, [t1+r10+400*10]
    paddw           m1, m4, [t2+r10+400* 6]
    paddd           m2, m5, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 6], m4
    mova [t2+r10+400* 8], m5
    mova [t2+r10+400*10], m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20             ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11            ; 256 / (z3 + 1)
    mulps           m3, m11
    psrld           m4, 24             ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x3
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m8, m1, 12
    mova [t4+r10*1+400*4+4], m2
    mova            m4, [t3+r10*2+400*8+ 8]
    mova            m5, [t3+r10*2+400*0+ 8]
    mova            m6, [t3+r10*2+400*0+40]
    paddw           m1, m4, [t2+r10+400*0]
    paddd           m2, m5, [t2+r10+400*2]
    paddd           m3, m6, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m4
    mova [t2+r10+400*2], m5
    mova [t2+r10+400*4], m6
    mova         [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova         [t3+r10*2+400*8+24], xm8
    vextracti128 [t3+r10*2+400*8+56], m8, 1
    vpbroadcastd    m4, [base+pd_25]
    vpbroadcastd    m6, [base+pw_164_24]
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7         ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4             ; p5
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13            ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m6             ; b5 * 164
    pmaddwd         m1, m6
    paddw           m2, m6
    paddw           m3, m6
    psrld           m2, 20             ; z5 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m4, m2             ; 1 / (z5 + 1)
    rcpps           m5, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m4, m11            ; 256 / (z5 + 1)
    mulps           m5, m11
    psrld           m2, 24             ; z5 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2             ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m4
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu           xm0, [t4+r10*1+400*0+2]
    paddw          xm2, xm0, [t4+r10*1+400*0+0]
    paddw          xm2, [t4+r10*1+400*0+4]
    movu            m1, [t3+r10*2+400*0+4]
    paddd           m3, m1, [t3+r10*2+400*0+0]
    paddd           m3, [t3+r10*2+400*0+8]
    paddw          xm0, xm2
    paddd           m1, m3
    psllw          xm2, 2
    pslld           m3, 2
    paddw          xm0, xm2              ; a5 565
    paddd           m1, m3               ; b5 565
    mova [t4+r10*1+400* 6], xm0
    mova [t3+r10*2+400*12], m1
    mova           xm0, [t4+r10*1+400*2+0]
    paddw          xm0, [t4+r10*1+400*2+4]
    paddw          xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw          xm2, 2                ; a3[-1] 444
    pslld           m3, 2                ; b3[-1] 444
    psubw          xm2, xm0              ; a3[-1] 343
    psubd           m3, m1               ; b3[-1] 343
    mova [t4+r10*1+400* 8], xm2
    mova [t3+r10*2+400*16], m3
    mova           xm0, [t4+r10*1+400*4+0]
    paddw          xm0, [t4+r10*1+400*4+4]
    paddw          xm2, xm0, [t4+r10*1+400*4+2]
    mova            m1, [t3+r10*2+400*8+0]
    paddd           m1, [t3+r10*2+400*8+8]
    paddd           m3, m1, [t3+r10*2+400*8+4]
    psllw          xm2, 2                 ; a3[ 0] 444
    pslld           m3, 2                 ; b3[ 0] 444
    mova [t4+r10*1+400*10], xm2
    mova [t3+r10*2+400*20], m3
    psubw          xm2, xm0               ; a3[ 0] 343
    psubd           m3, m1                ; b3[ 0] 343
    mova [t4+r10*1+400*12], xm2
    mova [t3+r10*2+400*24], m3
    add            r10, 16
    jl .prep_n_loop
    ret
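; Neighbor kernels for the mix output, matching the single-radius
; filters: the 5x5 planes get the "565" row kernel applied across row
; pairs,
;   565: 5*l + 6*c + 5*r = (l + 2*c + r) + 4*(l + c + r)
; and the 3x3 planes get the 343/444 split described for the 3x3-only
; filter; .prep_n just primes the ring buffers for rows -1 and 0.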
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
    vpbroadcastd    m6, [base+pd_4096]
.n0_loop:
    movu           xm2, [t4+r10*1+2]
    paddw          xm0, xm2, [t4+r10*1+0]
    paddw          xm0, [t4+r10*1+4]
    paddw          xm2, xm0
    psllw          xm0, 2
    paddw          xm0, xm2              ; a5 565
    movu            m1, [t3+r10*2+4]
    paddd           m4, m1, [t3+r10*2+0]
    paddd           m4, [t3+r10*2+8]
    paddd           m1, m4
    pslld           m4, 2
    paddd           m4, m1               ; b5 565
    paddw          xm2, xm0, [t4+r10*1+400* 6]
    mova [t4+r10*1+400* 6], xm0
    paddd           m0, m4, [t3+r10*2+400*12]
    mova [t3+r10*2+400*12], m4
    mova           xm3, [t4+r10*1+400*2+0]
    paddw          xm3, [t4+r10*1+400*2+4]
    paddw          xm5, xm3, [t4+r10*1+400*2+2]
    psllw          xm5, 2                ; a3[ 1] 444
    psubw          xm4, xm5, xm3         ; a3[ 1] 343
    paddw          xm3, xm4, [t4+r10*1+400* 8]
    paddw          xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400* 8], xm4
    mova [t4+r10*1+400*10], xm5
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m5, m1, [t3+r10*2+400*4+4]
    pslld           m5, 2                ; b3[ 1] 444
    psubd           m4, m5, m1           ; b3[ 1] 343
    paddd           m1, m4, [t3+r10*2+400*16]
    paddd           m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*16], m4
    mova [t3+r10*2+400*20], m5
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m2, xm2              ; a5
    pmovzxwd        m3, xm3              ; a3
    pmaddwd         m2, m4               ; a5 * src
    pmaddwd         m3, m4               ; a3 * src
    pslld           m4, 13
    psubd           m0, m2               ; b5 - a5 * src + (1 << 8)
    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
    psrld           m0, 9
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    paddd           m4, m6
    paddd           m0, m4
    psrad           m0, 7
    vextracti128   xm1, m0, 1
    packusdw       xm0, xm1              ; clip
    psrlw          xm0, 6
    mova    [dstq+r10], xm0
    add            r10, 16
    jl .n0_loop
    add           dstq, strideq
    ret
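; Final blend, as a scalar sketch (names illustrative): with t5 and t3
; the two corrections (b' - x * src plus the bias), brought to a common
; scale by the shifts above (>> 9 for both here; .n1 uses >> 8 for t5
; since its b5 covers a single 565 row), pblendw packs t5 into the low
; and t3 into the high word of each dword, so a single pmaddwd against
; the (w0, w1) pairs in m15 forms the weighted sum:
;   dst = ((src << 13) + 4 * (w0 * t5 + w1 * t3) + (1 << 12)) >> 13;
; the factor 4 cancels the psllw m15, 2 from the prologue; packusdw
; saturates and the trailing psrlw 6 completes the >> 13.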
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
    vpbroadcastd    m6, [base+pd_4096]
.n1_loop:
    mova           xm3, [t4+r10*1+400*4+0]
    paddw          xm3, [t4+r10*1+400*4+4]
    paddw          xm5, xm3, [t4+r10*1+400*4+2]
    psllw          xm5, 2                ; a3[ 1] 444
    psubw          xm4, xm5, xm3         ; a3[ 1] 343
    paddw          xm3, xm4, [t4+r10*1+400*12]
    paddw          xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400*10], xm5
    mova [t4+r10*1+400*12], xm4
    mova            m1, [t3+r10*2+400*8+0]
    paddd           m1, [t3+r10*2+400*8+8]
    paddd           m5, m1, [t3+r10*2+400*8+4]
    pslld           m5, 2                ; b3[ 1] 444
    psubd           m4, m5, m1           ; b3[ 1] 343
    paddd           m1, m4, [t3+r10*2+400*24]
    paddd           m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*20], m5
    mova [t3+r10*2+400*24], m4
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m2, [t4+r10*1+400* 6]
    pmovzxwd        m3, xm3
    mova            m0, [t3+r10*2+400*12]
    pmaddwd         m2, m4               ; a5 * src
    pmaddwd         m3, m4               ; a3 * src
    pslld           m4, 13
    psubd           m0, m2               ; b5 - a5 * src + (1 << 7)
    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
    psrld           m0, 8
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    paddd           m4, m6
    paddd           m0, m4
    psrad           m0, 7
    vextracti128   xm1, m0, 1
    packusdw       xm0, xm1              ; clip
    psrlw          xm0, 6
    mova    [dstq+r10], xm0
    add            r10, 16
    jl .n1_loop
    add           dstq, strideq
    ret

%endif ; ARCH_X86_64