; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 16

wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
r_ext_mask:    times 72 db -1
               times  8 db  0
wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round:  dd 1049600, 1048832

pw_164_455:    dw 164, 455
pw_1023:       times 2 dw 1023
pw_61448:      times 2 dw 61448
pd_m262128:    dd -262128
pd_m34816:     dd -34816
pd_m25:        dd -25
pd_m9:         dd -9
pd_8:          dd 8
pd_2147483648: dd 2147483648
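
; wiener_hshift/wiener_vshift/wiener_round are indexed by pixel_max >> 11
; (0 for 10-bit, 1 for 12-bit content); wiener_round is (1 << 20) + (1 << 10)
; and (1 << 20) + (1 << 8) respectively. pd_m262128 = (1 << 4) - (1 << 18),
; pw_61448 = (15 << 12) + (1 << 3), pd_m34816 = -((1 << 11) + (1 << 15)).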

cextern sgr_x_by_x

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
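; t0-t6 hold pointers into a ring of intermediate rows on the stack; after
; each row pass the pointers are rotated (t6 <- t5 <- ... <- t1 <- t0)
; instead of copying row data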

INIT_ZMM avx512icl
cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \
                                                     w, h, edge, flt
%define base t4-wiener_hshift
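; RIP-relative addressing cannot be combined with an index register, so t4 is
; pointed at wiener_hshift and the [base+...+t3*4] loads below resolve through it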
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastd   m12, [fltq+ 0] ; x0 x1
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufB]
    add             wd, wd
    vpbroadcastd   m13, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vpbroadcastd   m14, [fltq+16] ; y0 y1
    add           lpfq, wq
    vpbroadcastd   m15, [fltq+20] ; y2 y3
    add           dstq, wq
    vbroadcasti128  m8, [wiener_shufC]
    lea             t1, [rsp+wq+16]
    vbroadcasti128  m9, [wiener_shufD]
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    mov           r10d, 0xfe
    vpbroadcastd   m10, [base+wiener_round+t3*4]
    kmovb           k1, r10d
    vpbroadcastd   m11, [base+wiener_vshift+t3*4]
    pmullw         m12, m0 ; upshift filter coefs to make the
    vpbroadcastd   m16, [pd_m262128]
    pmullw         m13, m0 ; horizontal downshift constant
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
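; .h does one horizontal pass. The 7-tap Wiener filter is symmetric, so only
; the coefficient pairs x0 x1 (m12) and x2 x3 (m13) are kept; the wiener_shuf
; shuffles line up mirrored samples, which are summed with paddw before each
; vpdpwssd so that the symmetric taps share multiplies. With LR_HAVE_LEFT,
; k1 = 0xfe makes vmovdqu64 load qwords 1-7 from the frame row while qword 0
; keeps the four left-edge pixels from leftq.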
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq           xm3, [leftq]
    vmovdqu64   m3{k1}, [lpfq+r10-8]
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    mova            m4, [lpfq+r10+0]
    vpbroadcastw   xm3, xm4
    vmovdqu64   m3{k1}, [lpfq+r10-8]
    jmp .h_main2
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-8]
.h_main:
    mova            m4, [lpfq+r10+0]
.h_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -68
    jl .h_have_right
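    ; past the right edge, blend in the replicated last pixel: r_ext_mask is
    ; all-ones inside the row and zero beyond it, and vpternlogd with imm8
    ; 0xe4 selects its first source where the mask is set and the broadcast
    ; pixel where it is clear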
    push            r0
    lea             r0, [r_ext_mask+66]
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b
    vpternlogd      m4, m0, [r0+r10+ 8], 0xe4
    vpternlogd      m5, m0, [r0+r10+16], 0xe4
    pop             r0
.h_have_right:
    pshufb          m2, m3, m6
    pshufb          m1, m4, m7
    paddw           m2, m1
    pshufb          m3, m8
    mova            m0, m16
    vpdpwssd        m0, m2, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    vpdpwssd        m0, m3, m13
    pshufb          m2, m5, m7
    paddw           m2, m1
    mova            m1, m16
    pshufb          m4, m8
    vpdpwssd        m1, m2, m12
    pshufb          m5, m9
    paddw           m4, m5
    vpdpwssd        m1, m4, m13
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 64
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq           xm3, [leftq]
    vmovdqu64   m3{k1}, [lpfq+r10-8]
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    mova            m4, [lpfq+r10+0]
    vpbroadcastw   xm3, xm4
    vmovdqu64   m3{k1}, [lpfq+r10-8]
    jmp .hv_main2
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-8]
.hv_main:
    mova            m4, [lpfq+r10+0]
.hv_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -68
    jl .hv_have_right
    push            r0
    lea             r0, [r_ext_mask+66]
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m3, m0, [r0+r10+ 0], 0xe4
    vpternlogd      m4, m0, [r0+r10+ 8], 0xe4
    vpternlogd      m5, m0, [r0+r10+16], 0xe4
    pop             r0
.hv_have_right:
    pshufb          m2, m3, m6
    pshufb          m1, m4, m7
    paddw           m2, m1
    pshufb          m3, m8
    mova            m0, m16
    vpdpwssd        m0, m2, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    vpdpwssd        m0, m3, m13
    pshufb          m2, m5, m7
    paddw           m2, m1
    pshufb          m4, m8
    mova            m1, m16
    vpdpwssd        m1, m2, m12
    pshufb          m5, m9
    paddw           m4, m5
    vpdpwssd        m1, m4, m13
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m5, [t3+r10]
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    mova            m4, [t5+r10]
    paddw           m4, [t1+r10]
    psraw           m0, 1
    paddw           m3, m0, [t6+r10]
    mova      [t0+r10], m0
    punpcklwd       m1, m2, m5
    mova            m0, m10
    vpdpwssd        m0, m1, m15
    punpckhwd       m2, m5
    mova            m1, m10
    vpdpwssd        m1, m2, m15
    punpcklwd       m2, m3, m4
    vpdpwssd        m0, m2, m14
    punpckhwd       m3, m4
    vpdpwssd        m1, m3, m14
    psrad           m0, 5
    psrad           m1, 5
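    ; pmulhuw by wiener_vshift (1024 or 4096) keeps the high 16 bits of the
    ; product, i.e. an unsigned shift right by 6 (10-bit) or 4 (12-bit)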
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 64
    jl .hv_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add           dstq, strideq
    ret
.v:
    mov            r10, wq
.v_loop:
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m3, [t3+r10]
    punpcklwd       m1, m2, m3
    mova            m0, m10
    vpdpwssd        m0, m1, m15
    punpckhwd       m2, m3
    mova            m1, m10
    vpdpwssd        m1, m2, m15
    mova            m4, [t1+r10]
    paddw           m3, m4, [t6+r10]
    paddw           m4, [t5+r10]
    punpcklwd       m2, m3, m4
    vpdpwssd        m0, m2, m14
    punpckhwd       m3, m4
    vpdpwssd        m1, m3, m14
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 64
    jl .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
    ret

cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \
                                                   w, h, edge, flt
%define base r13-r_ext_mask-70
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m5, [wiener_shufE]
    vpbroadcastw   m11, [fltq+ 2] ; x1
    vbroadcasti128  m6, [wiener_shufB]
    lea            r13, [r_ext_mask+70]
    vbroadcasti128  m7, [wiener_shufD]
    add             wd, wd
    vpbroadcastd   m12, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vpbroadcastd    m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add           lpfq, wq
    vpbroadcastw   m13, [fltq+18] ; y1
    add           dstq, wq
    vpbroadcastd   m14, [fltq+20] ; y2 y3
    lea             t1, [rsp+wq+16]
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    neg             wq
    vpbroadcastd    m9, [base+wiener_round+t3*4]
    mov           r10d, 0xfffe
    vpbroadcastd   m10, [base+wiener_vshift+t3*4]
    kmovw           k1, r10d
    pmullw         m11, m0
    pmullw         m12, m0
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t3, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
.v2:
    call .v
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
.v1:
    call .v
    jmp .end
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd           xm3, [leftq+4]
    vmovdqu32   m3{k1}, [lpfq+r10-4]
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vpbroadcastw   xm3, [lpfq+r10]
    vmovdqu32   m3{k1}, [lpfq+r10-4]
    jmp .h_main
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-4]
.h_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -66
    jl .h_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m3, m0, [r13+r10+0], 0xe4 ; c ? a : b
    vpternlogd      m4, m0, [r13+r10+8], 0xe4
.h_have_right:
    pshufb          m1, m3, m5
    mova            m0, m8
    vpdpwssd        m0, m1, m11
    pshufb          m2, m4, m5
    mova            m1, m8
    vpdpwssd        m1, m2, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    vpdpwssd        m0, m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    vpdpwssd        m1, m3, m12
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 64
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd           xm3, [leftq+4]
    vmovdqu32   m3{k1}, [lpfq+r10-4]
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    vpbroadcastw   xm3, [lpfq+r10]
    vmovdqu32   m3{k1}, [lpfq+r10-4]
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-4]
.hv_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -66
    jl .hv_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m3, m0, [r13+r10+0], 0xe4
    vpternlogd      m4, m0, [r13+r10+8], 0xe4
.hv_have_right:
    pshufb          m1, m3, m5
    mova            m0, m8
    vpdpwssd        m0, m1, m11
    pshufb          m2, m4, m5
    mova            m1, m8
    vpdpwssd        m1, m2, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    vpdpwssd        m0, m2, m12
    pshufb          m4, m7
    paddw           m4, m3
    vpdpwssd        m1, m4, m12
    mova            m2, [t3+r10]
    paddw           m2, [t1+r10]
    mova            m3, [t2+r10]
    punpcklwd       m4, m2, m3
    punpckhwd       m2, m3
    mova            m3, m9
    vpdpwssd        m3, m2, m14
    mova            m2, m9
    vpdpwssd        m2, m4, m14
    mova            m4, [t4+r10]
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t0+r10], m0
    punpcklwd       m1, m0, m4
    vpdpwssd        m2, m1, m13
    punpckhwd       m0, m4
    vpdpwssd        m3, m0, m13
    psrad           m2, 5
    psrad           m3, 5
    packusdw        m2, m3
    pmulhuw         m2, m10
    mova    [dstq+r10], m2
    add            r10, 64
    jl .hv_loop
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
    add           dstq, strideq
    ret
.v:
    mov            r10, wq
.v_loop:
    mova            m0, [t1+r10]
    paddw           m2, m0, [t3+r10]
    mova            m1, [t2+r10]
    mova            m4, [t4+r10]
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova    [dstq+r10], m0
    add            r10, 64
    jl .v_loop
    ret

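; self-guided restoration, 5x5 box: .h computes per-row box sums and sums of
; squares, .hv/.v combine them vertically and derive the a/b coefficients via
; the sgr_x_by_x lookup, and .prep_n/.n0/.n1 apply the weighted neighbor
; averaging before blending the correction into the source at weight w0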
cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
%define base r13-r_ext_mask-72
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [r_ext_mask+72]
    mov          edged, r7m
    movifnidn       hd, hm
    pxor            m6, m6
    vpbroadcastw    m7, [paramsq+8] ; w0
    add             wd, wd
    vpbroadcastd    m8, [base+pd_8]
    add           lpfq, wq
    vpbroadcastd    m9, [base+pd_m25]
    add           dstq, wq
    vpsubd         m10, m6, [paramsq+0] {1to16} ; -s0
    lea             t3, [rsp+wq*2+416*12+8]
    vpbroadcastd   m11, [base+pw_164_455]
    lea             t4, [rsp+wq+416*20+8]
    vpbroadcastd   m12, [base+pw_61448]  ; (15 << 12) + (1 << 3)
    lea             t1, [rsp+wq+12]
    vpbroadcastd   m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15))
    neg             wq
    vpbroadcastd   m14, [base+pw_1023]
    psllw           m7, 4
    mova           m18, [sgr_x_by_x+64*0]
    mov           r10d, 0xfffffff8
    mova           m19, [sgr_x_by_x+64*1]
    kmovd           k1, r10d
    mova           m20, [sgr_x_by_x+64*2]
    mov            r10, 0x3333333333333333
    mova           m21, [sgr_x_by_x+64*3]
    kmovq           k2, r10
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call .top_fixup
    add             t1, 416*6
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    mov             t0, t2
    dec             hd
    jz .height1
    or           edged, 16
    call .h
.main:
    add           lpfq, strideq
    call .hv
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    test            hd, hd
    jz .odd_height
    call .h
    add           lpfq, strideq
    call .hv
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .h_top
    add           lpfq, strideq
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea             t2, [t1+416*6]
    call .top_fixup
    dec             hd
    jz .no_top_height1
    or           edged, 16
    mov             t0, t1
    mov             t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq          xm16, [leftq+2]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu           m16, [lpfq+r10- 2]
.h_main:
    movu           m17, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -68
    jl .h_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4 ; c ? a : b
    vpternlogd     m17, m0, [r13+r10+16], 0xe4
.h_have_right:
    palignr         m2, m17, m16, 2
    paddw           m0, m16, m2
    palignr         m3, m17, m16, 6
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    shufpd         m17, m16, m17, 0x55
    paddw           m0, m17
    punpcklwd       m3, m16, m17
    vpdpwssd        m1, m3, m3
    punpckhwd       m3, m16, m17
    vpdpwssd        m2, m3, m3
    shufps         m16, m17, q2121
    paddw           m0, m16            ; sum
    test         edgeb, 16             ; y > 0
    jz .h_loop_end
    paddw           m0, [t1+r10+416*0]
    paddd           m1, [t1+r10+416*2]
    paddd           m2, [t1+r10+416*4]
.h_loop_end:
    punpcklwd      m17, m16, m6
    vpdpwssd        m1, m17, m17       ; sumsq
    punpckhwd      m16, m6
    vpdpwssd        m2, m16, m16
    mova [t1+r10+416*0], m0
    mova [t1+r10+416*2], m1
    mova [t1+r10+416*4], m2
    add            r10, 64
    jl .h_loop
    ret
.top_fixup:
    lea            r10, [wq-4]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+r10+416*0]
    mova            m1, [t1+r10+416*2]
    mova            m2, [t1+r10+416*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova [t2+r10+416*0], m0
    mova [t2+r10+416*2], m1
    mova [t2+r10+416*4], m2
    add            r10, 64
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq          xm16, [leftq+2]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    jmp .hv_main
.hv_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu           m16, [lpfq+r10- 2]
.hv_main:
    movu           m17, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -68
    jl .hv_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd     m17, m0, [r13+r10+16], 0xe4
.hv_have_right:
    palignr         m3, m17, m16, 2
    paddw           m0, m16, m3
    palignr         m1, m17, m16, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    shufpd         m17, m16, m17, 0x55
    paddw           m0, m17
    punpcklwd       m1, m16, m17
    vpdpwssd        m2, m1, m1
    punpckhwd       m1, m16, m17
    vpdpwssd        m3, m1, m1
    shufps         m16, m17, q2121
    paddw           m0, m16           ; h sum
    punpcklwd      m17, m16, m6
    vpdpwssd        m2, m17, m17      ; h sumsq
    punpckhwd      m16, m6
    vpdpwssd        m3, m16, m16
    paddw           m1, m0, [t1+r10+416*0]
    paddd          m16, m2, [t1+r10+416*2]
    paddd          m17, m3, [t1+r10+416*4]
    test            hd, hd
    jz .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10+416*0] ; hv sum
    paddd          m16, [t2+r10+416*2] ; hv sumsq
    paddd          m17, [t2+r10+416*4]
    mova [t0+r10+416*0], m0
    mova [t0+r10+416*2], m2
    mova [t0+r10+416*4], m3
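    ; pavgw against the zero register computes (x + 1) >> 1, so psrlw by 1
    ; followed by pavgw yields ((b >> 1) + 1) >> 1 = (b + 2) >> 2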
    psrlw           m3, m1, 1
    paddd          m16, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd          m17, m8
    psrld          m16, 4              ; (a + 8) >> 4
    psrld          m17, 4
    pmulld         m16, m9             ; -a * 25
    pmulld         m17, m9
    punpcklwd       m2, m3, m6
    vpdpwssd       m16, m2, m2         ; -p
    punpckhwd       m3, m6
    vpdpwssd       m17, m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmulld         m16, m10            ; p * s
    pmulld         m17, m10
    pmaddwd         m0, m11            ; b * 164
    pmaddwd         m1, m11
    vpalignr   m17{k2}, m16, m16, 2
    mova           m16, m20
    pmaxsw         m17, m6
    paddusw        m17, m12
    psraw          m17, 4              ; min(z, 255) - 256
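    ; 256-byte sgr_x_by_x lookup split across m18-m21: vpmovb2m extracts
    ; bit 7 of each byte index to pick between the table's low half
    ; (m18:m19, vpermi2b) and high half (m20:m21, vpermt2b), each indexed
    ; by the remaining 7 bits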
    vpermt2b       m16, m17, m21       ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b       m17, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8   m17{k3}, m16            ; x
    pandn          m16, m13, m17
    psrld          m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    packssdw       m16, m17
    psubd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
    psubd           m1, m13
    mova    [t4+r10+4], m16
    psrld          m16, m0, 12         ; b
    psrld          m17, m1, 12
    mova          [t3+r10*2+  8], xm16
    mova          [t3+r10*2+ 24], xm17
    vextracti128  [t3+r10*2+ 40], ym16, 1
    vextracti128  [t3+r10*2+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+ 72], m16, 2
    vextracti32x4 [t3+r10*2+ 88], m17, 2
    vextracti32x4 [t3+r10*2+104], m16, 3
    vextracti32x4 [t3+r10*2+120], m17, 3
    add            r10, 64
    jl .hv_loop
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+r10+416*0], m1
    paddw            m1, m0
    mova [t1+r10+416*2], m16
    paddd           m16, m2
    mova [t1+r10+416*4], m17
    paddd           m17, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea            r10, [wq-4]
.v_loop:
    mova            m2, [t1+r10+416*2]
    mova            m3, [t1+r10+416*4]
    mova            m0, [t1+r10+416*0]
    paddd          m16, m2, [t2+r10+416*2]
    paddd          m17, m3, [t2+r10+416*4]
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m2, m2
    paddd           m3, m3
    paddd          m16, m2             ; hv sumsq
    paddd          m17, m3
    paddd          m16, m8
    paddd          m17, m8
    psrld          m16, 4              ; (a + 8) >> 4
    psrld          m17, 4
    pmulld         m16, m9             ; -a * 25
    pmulld         m17, m9
    paddw           m0, m0
    paddw           m1, m0             ; hv sum
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    vpdpwssd       m16, m2, m2         ; -p
    punpckhwd       m3, m6
    vpdpwssd       m17, m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmulld         m16, m10            ; p * s
    pmulld         m17, m10
    pmaddwd         m0, m11            ; b * 164
    pmaddwd         m1, m11
    vpalignr   m17{k2}, m16, m16, 2
    mova           m16, m20
    pmaxsw         m17, m6
    paddusw        m17, m12
    psraw          m17, 4              ; min(z, 255) - 256
    vpermt2b       m16, m17, m21       ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b       m17, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8   m17{k3}, m16            ; x
    pandn          m16, m13, m17
    psrld          m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    packssdw       m16, m17
    psubd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
    psubd           m1, m13
    mova    [t4+r10+4], m16
    psrld          m16, m0, 12         ; b
    psrld          m17, m1, 12
    mova          [t3+r10*2+  8], xm16
    mova          [t3+r10*2+ 24], xm17
    vextracti128  [t3+r10*2+ 40], ym16, 1
    vextracti128  [t3+r10*2+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+ 72], m16, 2
    vextracti32x4 [t3+r10*2+ 88], m17, 2
    vextracti32x4 [t3+r10*2+104], m16, 3
    vextracti32x4 [t3+r10*2+120], m17, 3
    add            r10, 64
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
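    ; the +0/+2/+4 loads sum three neighboring columns; adding the center
    ; column and 4x the three-column sum gives c + 5*(l+c+r) =
    ; 5*l + 6*c + 5*r, the 5:6:5 weighting the "565" comments refer to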
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+68]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd          m16, m1, [t3+r10*2+ 0]
    paddd          m17, m2, [t3+r10*2+64]
    paddw           m3, [t4+r10*1+ 4]
    paddd          m16, [t3+r10*2+ 8]
    paddd          m17, [t3+r10*2+72]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m16
    pslld          m16, 2
    paddd           m2, m17
    pslld          m17, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m16            ; b 565
    paddd           m2, m17
    mova [t4+r10*1+416*2+ 0], m0
    mova [t3+r10*2+416*4+ 0], m1
    mova [t3+r10*2+416*4+64], m2
    add            r10, 64
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+68]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd          m16, m1, [t3+r10*2+ 0]
    paddd          m17, m2, [t3+r10*2+64]
    paddw           m3, [t4+r10*1+ 4]
    paddd          m16, [t3+r10*2+ 8]
    paddd          m17, [t3+r10*2+72]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m16
    pslld          m16, 2
    paddd           m2, m17
    pslld          m17, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m16            ; b 565
    paddd           m2, m17
    paddw           m3, m0, [t4+r10*1+416*2+ 0]
    paddd          m16, m1, [t3+r10*2+416*4+ 0]
    paddd          m17, m2, [t3+r10*2+416*4+64]
    mova [t4+r10*1+416*2+ 0], m0
    mova [t3+r10*2+416*4+ 0], m1
    mova [t3+r10*2+416*4+64], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1              ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vshufi32x4      m1, m16, m17, q2020
    vshufi32x4     m16, m17, q3131
    psubd           m1, m2              ; b - a * src + (1 << 8)
    psubd          m16, m3
    psrad           m1, 9
    psrad          m16, 9
    packssdw        m1, m16
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 64
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m0, [dstq+r10]
    mova            m3, [t4+r10*1+416*2+ 0]
    mova           m16, [t3+r10*2+416*4+ 0]
    mova           m17, [t3+r10*2+416*4+64]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vshufi32x4      m1, m16, m17, q2020
    vshufi32x4     m16, m17, q3131
    psubd           m1, m2              ; b - a * src + (1 << 7)
    psubd          m16, m3
    psrad           m1, 8
    psrad          m16, 8
    packssdw        m1, m16
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 64
    jl .n1_loop
    add           dstq, strideq
    ret

cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [r_ext_mask+72]
    mov          edged, r7m
    movifnidn       hd, hm
    pxor            m6, m6
    vpbroadcastw    m7, [paramsq+10] ; w1
    add             wd, wd
    vpbroadcastd    m8, [base+pd_8]
    add           lpfq, wq
    vpbroadcastd    m9, [base+pd_m9]
    add           dstq, wq
    vpsubd         m10, m6, [paramsq+4] {1to16} ; -s1
    lea             t3, [rsp+wq*2+416*12+8]
    vpbroadcastd   m11, [base+pw_164_455]
    lea             t4, [rsp+wq+416*32+8]
    vpbroadcastd   m12, [base+pw_61448]
    lea             t1, [rsp+wq+12]
    vpbroadcastd   m13, [base+pd_m34816]
    neg             wq
    vpbroadcastd   m14, [base+pw_1023]
    psllw           m7, 4
    mova           m18, [sgr_x_by_x+64*0]
    mov           r10d, 0xfffffffc
    mova           m19, [sgr_x_by_x+64*1]
    kmovd           k1, r10d
    mova           m20, [sgr_x_by_x+64*2]
    mov            r10, 0x3333333333333333
    mova           m21, [sgr_x_by_x+64*3]
    kmovq           k2, r10
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    add             t1, 416*6
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+416*6]
.top_fixup_loop:
    mova            m0, [t1+r10+416*0]
    mova            m1, [t1+r10+416*2]
    mova            m2, [t1+r10+416*4]
    mova [t2+r10+416*0], m0
    mova [t2+r10+416*2], m1
    mova [t2+r10+416*4], m2
    add            r10, 64
    jl .top_fixup_loop
    call .v0
    jmp .main
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd          xm16, [leftq+4]
    vmovdqu16  m16{k1}, [lpfq+wq-4]
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16  m16{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu           m16, [lpfq+r10+ 0]
.h_main:
    movu           m17, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -66
    jl .h_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd     m17, m0, [r13+r10+16], 0xe4
.h_have_right:
    palignr         m0, m17, m16, 2
    paddw           m1, m16, m0
    punpcklwd       m2, m16, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m16, m0
    pmaddwd         m3, m3
    palignr        m17, m16, 4
    paddw           m1, m17            ; sum
    punpcklwd      m16, m17, m6
    vpdpwssd        m2, m16, m16       ; sumsq
    punpckhwd      m17, m6
    vpdpwssd        m3, m17, m17
    mova [t1+r10+416*0], m1
    mova [t1+r10+416*2], m2
    mova [t1+r10+416*4], m3
    add            r10, 64
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movd          xm16, [leftq+4]
    vmovdqu16  m16{k1}, [lpfq+wq-4]
    add          leftq, 8
    jmp .hv0_main
.hv0_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16  m16{k1}, [lpfq+wq-4]
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu           m16, [lpfq+r10+ 0]
.hv0_main:
    movu           m17, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -66
    jl .hv0_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd     m17, m0, [r13+r10+16], 0xe4
.hv0_have_right:
    palignr         m0, m17, m16, 2
    paddw           m1, m16, m0
    punpcklwd       m2, m16, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m16, m0
    pmaddwd         m3, m3
    palignr        m17, m16, 4
    paddw           m1, m17            ; sum
    punpcklwd      m16, m17, m6
    vpdpwssd        m2, m16, m16       ; sumsq
    punpckhwd      m17, m6
    vpdpwssd        m3, m17, m17
    paddw           m0, m1, [t1+r10+416*0]
    paddd          m16, m2, [t1+r10+416*2]
    paddd          m17, m3, [t1+r10+416*4]
    mova [t1+r10+416*0], m1
    mova [t1+r10+416*2], m2
    mova [t1+r10+416*4], m3
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m2, m16, [t2+r10+416*2]
    paddd           m3, m17, [t2+r10+416*4]
    mova [t2+r10+416*0], m0
    mova [t2+r10+416*2], m16
    mova [t2+r10+416*4], m17
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m9             ; -((a + 8) >> 4) * 9
    pmulld          m3, m9
    psrlw          m17, m1, 1
    pavgw          m17, m6             ; (b + 2) >> 2
    punpcklwd      m16, m17, m6
    vpdpwssd        m2, m16, m16       ; -p
    punpckhwd      m17, m6
    vpdpwssd        m3, m17, m17
    punpcklwd      m16, m6, m1         ; b
    punpckhwd      m17, m6, m1
    pminsd          m2, m6
    pminsd          m3, m6
    pmulld          m2, m10            ; p * s
    pmulld          m3, m10
    pmaddwd        m16, m11            ; b * 455
    pmaddwd        m17, m11
    vpalignr    m3{k2}, m2, m2, 2
    mova            m2, m20
    paddusw         m3, m12
    psraw           m3, 4              ; min(z, 255) - 256
    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8    m3{k3}, m2             ; x
    pandn           m2, m13, m3
    psrld           m3, 16
    pmulld         m16, m2
    pmulld         m17, m3
    packssdw        m2, m3
    psubd          m16, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    psubd          m17, m13
    mova [t4+r10*1+416*0+4], m2
    psrld          m16, 12
    psrld          m17, 12
    mova          [t3+r10*2+416*0+  8], xm16
    mova          [t3+r10*2+416*0+ 24], xm17
    vextracti128  [t3+r10*2+416*0+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*0+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*0+104], m16, 3
    vextracti32x4 [t3+r10*2+416*0+120], m17, 3
    add            r10, 64
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movd          xm16, [leftq+4]
    vmovdqu16  m16{k1}, [lpfq+wq-4]
    add          leftq, 8
    jmp .hv1_main
.hv1_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16  m16{k1}, [lpfq+wq-4]
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu           m16, [lpfq+r10+ 0]
.hv1_main:
    movu           m17, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -66
    jl .hv1_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd     m17, m0, [r13+r10+16], 0xe4
.hv1_have_right:
    palignr         m1, m17, m16, 2
    paddw           m0, m16, m1
    punpcklwd       m2, m16, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m16, m1
    pmaddwd         m3, m3
    palignr        m17, m16, 4
    paddw           m0, m17            ; h sum
    punpcklwd       m1, m17, m6
    vpdpwssd        m2, m1, m1         ; h sumsq
    punpckhwd      m17, m6
    vpdpwssd        m3, m17, m17
    paddw           m1, m0, [t2+r10+416*0]
    paddd          m16, m2, [t2+r10+416*2]
    paddd          m17, m3, [t2+r10+416*4]
    mova [t2+r10+416*0], m0
    mova [t2+r10+416*2], m2
    mova [t2+r10+416*4], m3
    paddd          m16, m8
    paddd          m17, m8
    psrld          m16, 4              ; (a + 8) >> 4
    psrld          m17, 4
    pmulld         m16, m9             ; -((a + 8) >> 4) * 9
    pmulld         m17, m9
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    vpdpwssd       m16, m2, m2         ; -p
    punpckhwd       m3, m6
    vpdpwssd       m17, m3, m3
    punpcklwd       m0, m6, m1         ; b
    punpckhwd       m1, m6, m1
    pminsd         m16, m6
    pminsd         m17, m6
    pmulld         m16, m10            ; p * s
    pmulld         m17, m10
    pmaddwd         m0, m11            ; b * 455
    pmaddwd         m1, m11
    vpalignr   m17{k2}, m16, m16, 2
    mova           m16, m20
    paddusw        m17, m12
    psraw          m17, 4              ; min(z, 255) - 256
    vpermt2b       m16, m17, m21       ; sgr_x_by_x[128..255]
    vpmovb2m       k3, m17
    vpermi2b       m17, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8   m17{k3}, m16            ; x
    pandn          m16, m13, m17
    psrld          m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    packssdw       m16, m17
    psubd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    psubd           m1, m13
    mova [t4+r10*1+416*2+4], m16
    psrld          m16, m0, 12
    psrld          m17, m1, 12
    mova          [t3+r10*2+416*4+  8], xm16
    mova          [t3+r10*2+416*4+ 24], xm17
    vextracti128  [t3+r10*2+416*4+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*4+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*4+104], m16, 3
    vextracti32x4 [t3+r10*2+416*4+120], m17, 3
    add            r10, 64
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+416*0]
    mova           m16, [t1+r10+416*2]
    mova           m17, [t1+r10+416*4]
    paddw           m0, m0
    paddd          m16, m16
    paddd          m17, m17
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m2, m16, [t2+r10+416*2]
    paddd           m3, m17, [t2+r10+416*4]
    mova [t2+r10+416*0], m0
    mova [t2+r10+416*2], m16
    mova [t2+r10+416*4], m17
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m9             ; -((a + 8) >> 4) * 9
    pmulld          m3, m9
    psrlw          m17, m1, 1
    pavgw          m17, m6             ; (b + 2) >> 2
    punpcklwd      m16, m17, m6
    vpdpwssd        m2, m16, m16       ; -p
    punpckhwd      m17, m6
    vpdpwssd        m3, m17, m17
    punpcklwd      m16, m6, m1         ; b
    punpckhwd      m17, m6, m1
    pminsd          m2, m6
    pminsd          m3, m6
    pmulld          m2, m10            ; p * s
    pmulld          m3, m10
    pmaddwd        m16, m11            ; b * 455
    pmaddwd        m17, m11
    vpalignr    m3{k2}, m2, m2, 2
    mova            m2, m20
    paddusw         m3, m12
    psraw           m3, 4              ; min(z, 255) - 256
    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8    m3{k3}, m2             ; x
    pandn           m2, m13, m3
    psrld           m3, 16
    pmulld         m16, m2
    pmulld         m17, m3
    packssdw        m2, m3
    psubd          m16, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    psubd          m17, m13
    mova [t4+r10*1+416*0+4], m2
    psrld          m16, 12
    psrld          m17, 12
    mova          [t3+r10*2+416*0+  8], xm16
    mova          [t3+r10*2+416*0+ 24], xm17
    vextracti128  [t3+r10*2+416*0+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*0+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*0+104], m16, 3
    vextracti32x4 [t3+r10*2+416*0+120], m17, 3
    add            r10, 64
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova            m0, [t1+r10+416*0]
    mova           m16, [t1+r10+416*2]
    mova           m17, [t1+r10+416*4]
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m2, m16, [t2+r10+416*2]
    paddd           m3, m17, [t2+r10+416*4]
    mova [t2+r10+416*0], m0
    mova [t2+r10+416*2], m16
    mova [t2+r10+416*4], m17
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m9             ; -((a + 8) >> 4) * 9
    pmulld          m3, m9
    psrlw          m17, m1, 1
    pavgw          m17, m6             ; (b + 2) >> 2
    punpcklwd      m16, m17, m6
    vpdpwssd        m2, m16, m16       ; -p
    punpckhwd      m17, m6
    vpdpwssd        m3, m17, m17
    punpcklwd      m16, m6, m1         ; b
    punpckhwd      m17, m6, m1
    pminsd          m2, m6
    pminsd          m3, m6
    pmulld          m2, m10            ; p * s
    pmulld          m3, m10
    pmaddwd        m16, m11            ; b * 455
    pmaddwd        m17, m11
    vpalignr    m3{k2}, m2, m2, 2
    mova            m2, m20
    paddusw         m3, m12
    psraw           m3, 4              ; min(z, 255) - 256
    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8    m3{k3}, m2             ; x
    pandn           m2, m13, m3
    psrld           m3, 16
    pmulld         m16, m2
    pmulld         m17, m3
    packssdw        m2, m3
    psubd          m16, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    psubd          m17, m13
    mova [t4+r10*1+416*2+4], m2
    psrld          m16, 12
    psrld          m17, 12
    mova          [t3+r10*2+416*4+  8], xm16
    mova          [t3+r10*2+416*4+ 24], xm17
    vextracti128  [t3+r10*2+416*4+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*4+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*4+104], m16, 3
    vextracti32x4 [t3+r10*2+416*4+120], m17, 3
    add            r10, 64
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
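    ; 4*(l+c+r) is the 4:4:4 column weighting; subtracting the outer columns
    ; (l+r) gives 3:4:3 = 3*l + 4*c + 3*r. .n0/.n1 below sum one 444 row and
    ; two 343 rows to form each output row's neighbor weights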
1579    mov            r10, wq
1580.prep_n_loop:
1581    mova          ym16, [t4+r10*1+416*0+0]
1582    paddw         ym16, [t4+r10*1+416*0+4]
1583    paddw         ym17, ym16, [t4+r10*1+416*0+2]
1584    mova            m0, [t3+r10*2+416*0+0]
1585    paddd           m0, [t3+r10*2+416*0+8]
1586    paddd           m1, m0, [t3+r10*2+416*0+4]
1587    psllw         ym17, 2                ; a[-1] 444
1588    pslld           m1, 2                ; b[-1] 444
1589    psubw         ym17, ym16             ; a[-1] 343
1590    psubd           m1, m0               ; b[-1] 343
1591    vmovdqa32 [t4+r10*1+416* 4], ym17
1592    vmovdqa32 [t3+r10*2+416* 8], m1
1593    mova          ym16, [t4+r10*1+416*2+0]
1594    paddw         ym16, [t4+r10*1+416*2+4]
1595    paddw         ym17, ym16, [t4+r10*1+416*2+2]
1596    mova            m0, [t3+r10*2+416*4+0]
1597    paddd           m0, [t3+r10*2+416*4+8]
1598    paddd           m1, m0, [t3+r10*2+416*4+4]
1599    psllw         ym17, 2                 ; a[ 0] 444
1600    pslld           m1, 2                 ; b[ 0] 444
1601    vmovdqa32 [t4+r10*1+416* 6], ym17
1602    vmovdqa32 [t3+r10*2+416*12], m1
1603    psubw         ym17, ym16              ; a[ 0] 343
1604    psubd           m1, m0                ; b[ 0] 343
1605    vmovdqa32 [t4+r10*1+416* 8], ym17
1606    vmovdqa32 [t3+r10*2+416*16], m1
1607    add            r10, 32
1608    jl .prep_n_loop
1609    ret
1610ALIGN function_align
1611.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    mova            m3, [t4+r10*1+416*0+0]
    paddw           m3, [t4+r10*1+416*0+4]
    paddw           m1, m3, [t4+r10*1+416*0+2]
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+416*4]
    paddw           m3, [t4+r10*1+416*6]
    mova [t4+r10*1+416*4], m2
    mova [t4+r10*1+416*6], m1
    mova           m16, [t3+r10*2+416*0+0]
    paddd          m16, [t3+r10*2+416*0+8]
    paddd           m1, m16, [t3+r10*2+416*0+4]
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m16          ; b[ 1] 343
    paddd          m16, m2, [t3+r10*2+416* 8+ 0]
    paddd          m16, [t3+r10*2+416*12+ 0]
    mova [t3+r10*2+416* 8+ 0], m2
    mova [t3+r10*2+416*12+ 0], m1
    mova           m17, [t3+r10*2+416*0+64]
    paddd          m17, [t3+r10*2+416*0+72]
    paddd           m1, m17, [t3+r10*2+416*0+68]
    pslld           m1, 2
    psubd           m2, m1, m17
    paddd          m17, m2, [t3+r10*2+416* 8+64]
    paddd          m17, [t3+r10*2+416*12+64]
    mova [t3+r10*2+416* 8+64], m2
    mova [t3+r10*2+416*12+64], m1
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vshufi32x4      m1, m16, m17, q2020
    vshufi32x4     m16, m17, q3131
    psubd           m1, m2               ; b - a * src + (1 << 8)
    psubd          m16, m3
    psrad           m1, 9
    psrad          m16, 9
    packssdw        m1, m16
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 64
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m3, [t4+r10*1+416*2+0]
    paddw           m3, [t4+r10*1+416*2+4]
    paddw           m1, m3, [t4+r10*1+416*2+2]
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+416*6]
    paddw           m3, [t4+r10*1+416*8]
    mova [t4+r10*1+416*6], m1
    mova [t4+r10*1+416*8], m2
    mova           m16, [t3+r10*2+416*4+0]
    paddd          m16, [t3+r10*2+416*4+8]
    paddd           m1, m16, [t3+r10*2+416*4+4]
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m16          ; b[ 1] 343
    paddd          m16, m2, [t3+r10*2+416*12+ 0]
    paddd          m16, [t3+r10*2+416*16+ 0]
    mova [t3+r10*2+416*12+ 0], m1
    mova [t3+r10*2+416*16+ 0], m2
    mova           m17, [t3+r10*2+416*4+64]
    paddd          m17, [t3+r10*2+416*4+72]
    paddd           m1, m17, [t3+r10*2+416*4+68]
    pslld           m1, 2
    psubd           m2, m1, m17
    paddd          m17, m2, [t3+r10*2+416*12+64]
    paddd          m17, [t3+r10*2+416*16+64]
    mova [t3+r10*2+416*12+64], m1
    mova [t3+r10*2+416*16+64], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vshufi32x4      m1, m16, m17, q2020
    vshufi32x4     m16, m17, q3131
    psubd           m1, m2               ; b - a * src + (1 << 8)
    psubd          m16, m3
    psrad           m1, 9
    psrad          m16, 9
    packssdw        m1, m16
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 64
    jl .n1_loop
    add           dstq, strideq
    ret

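; Mixed self-guided filter: runs the 5x5 and 3x3 box filters in one
; pass and blends their outputs with the w0/w1 weights from params.
; The 5x5 filter is evaluated at half vertical rate, so even rows
; (.hv0/.v0) only produce the 3x3 ab terms while accumulating 5x5
; sums, and odd rows (.hv1/.v1) complete both.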
cglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [r_ext_mask+72]
    mov          edged, r7m
    movifnidn       hd, hm
    vpbroadcastd    m7, [paramsq+8] ; w0 w1
    pxor            m6, m6
    vpbroadcastd    m8, [base+pd_8]
    add             wd, wd
    vpbroadcastd    m9, [base+pd_m9]
    add           lpfq, wq
    vpbroadcastd   m10, [base+pd_m25]
    add           dstq, wq
    vpsubd         m11, m6, [paramsq+0] {1to16} ; -s0
    lea             t3, [rsp+wq*2+416*24+8]
    vpsubd         m12, m6, [paramsq+4] {1to16} ; -s1
    lea             t4, [rsp+wq+416*52+8]
    vpbroadcastd   m13, [base+pw_164_455]
    lea             t1, [rsp+wq+12]
    vpbroadcastd   m14, [base+pw_61448]
    neg             wq
    vpbroadcastd   m15, [base+pd_m34816]
    psllw           m7, 2
    vpbroadcastd   m22, [base+pd_2147483648]
    mov           r10d, 0xfffffff8
    mova           m18, [sgr_x_by_x+64*0]
    kmovd           k1, r10d
    mova           m19, [sgr_x_by_x+64*1]
    mov            r10, 0x3333333333333333
    mova           m20, [sgr_x_by_x+64*2]
    kmovq           k2, r10
    mova           m21, [sgr_x_by_x+64*3]
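; k1 (0xfffffff8) drops the low three words of a row load so they can
; be replaced by pixels from the left[] edge buffer; k2 (0x3333...)
; is a byte mask selecting the low word of each dword, used below to
; interleave paired intermediates.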
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup
    add             t1, 416*12
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+416*12]
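; no top edge: emulate a duplicated top row by doubling the first
; row's 5x5 sums; the 3x3 sums are carried over unchanged.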
.top_fixup_loop:
    mova            m0, [t1+r10+416* 0]
    mova            m1, [t1+r10+416* 2]
    mova            m2, [t1+r10+416* 4]
    paddw           m0, m0
    mova            m3, [t1+r10+416* 6]
    paddd           m1, m1
    mova            m4, [t1+r10+416* 8]
    paddd           m2, m2
    mova            m5, [t1+r10+416*10]
    mova [t2+r10+416* 0], m0
    mova [t2+r10+416* 2], m1
    mova [t2+r10+416* 4], m2
    mova [t2+r10+416* 6], m3
    mova [t2+r10+416* 8], m4
    mova [t2+r10+416*10], m5
    add            r10, 64
    jl .top_fixup_loop
    call .v0
    jmp .main
.h: ; horizontal boxsums
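; both box sums come out of a single pass over the row; per pixel x:
;   sum3[x] = p[x-1] + p[x] + p[x+1]
;   sum5[x] = sum3[x] + p[x-2] + p[x+2]
; and the same for the squared sums, accumulated with vpdpwssd.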
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq          xm16, [leftq+2]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu           m16, [lpfq+r10- 2]
.h_main:
    movu           m17, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -68
    jl .h_have_right
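; partial right edge: vpternlogd with imm 0xe4 performs a bitwise
; select (mask ? loaded pixel : broadcast last pixel), with the
; all-ones prefix from r_ext_mask marking the in-bounds lanes.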
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd     m17, m0, [r13+r10+16], 0xe4
.h_have_right:
    palignr         m3, m17, m16, 2
    palignr         m0, m17, m16, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m17, m16, 6
    paddw           m1, m0             ; sum3
    punpcklwd       m4, m0, m6
    vpdpwssd        m2, m4, m4         ; sumsq3
    punpckhwd       m0, m6
    vpdpwssd        m3, m0, m0
    shufpd          m4, m16, m17, 0x55
    punpcklwd      m17, m4, m16
    paddw           m0, m16, m4
    punpckhwd       m4, m16
    mova [t1+r10+416* 6], m1
    mova [t1+r10+416* 8], m2
    mova [t1+r10+416*10], m3
    paddw           m1, m0             ; sum5
    vpdpwssd        m2, m17, m17       ; sumsq5
    vpdpwssd        m3, m4, m4
    mova [t1+r10+416* 0], m1
    mova [t1+r10+416* 2], m2
    mova [t1+r10+416* 4], m3
    add            r10, 64
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsums + vertical boxsums + ab3 (even rows)
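; even rows produce only the ab3 terms; the single-row 5x5 sums are
; stashed in t3 as a clean copy (needed when the height is odd) and
; accumulated into t1 so that .hv1 can finish the two-row ab5 terms.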
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movq          xm16, [leftq+2]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    add          leftq, 8
    jmp .hv0_main
.hv0_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu           m16, [lpfq+r10- 2]
.hv0_main:
    movu           m17, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -68
    jl .hv0_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd     m17, m0, [r13+r10+16], 0xe4
.hv0_have_right:
    palignr         m3, m17, m16, 2
    palignr         m0, m17, m16, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m17, m16, 6
    paddw           m1, m0             ; h sum3
    punpcklwd       m4, m0, m6
    vpdpwssd        m2, m4, m4         ; h sumsq3
    punpckhwd       m0, m6
    vpdpwssd        m3, m0, m0
    shufpd         m17, m16, m17, 0x55
    paddw           m4, m1, [t1+r10+416* 6]
    paddd           m5, m2, [t1+r10+416* 8]
    mova [t1+r10+416* 6], m1
    mova [t1+r10+416* 8], m2
    paddw           m1, m16
    paddw           m1, m17            ; h sum5
    punpcklwd       m0, m17, m16
    vpdpwssd        m2, m0, m0         ; h sumsq5
    paddd           m0, m3, [t1+r10+416*10]
    mova [t1+r10+416*10], m3
    punpckhwd      m17, m16
    vpdpwssd        m3, m17, m17
    mova [t3+r10*2+416*8+ 8], m1       ; we need a clean copy of the last row
    mova [t3+r10*2+416*0+ 8], m2       ; in case height is odd
    mova [t3+r10*2+416*0+72], m3
    paddw           m1, [t1+r10+416* 0]
    paddd           m2, [t1+r10+416* 2]
    paddd           m3, [t1+r10+416* 4]
    mova [t1+r10+416* 0], m1
    mova [t1+r10+416* 2], m2
    mova [t1+r10+416* 4], m3
    paddw          m17, m4, [t2+r10+416* 6]
    paddd           m2, m5, [t2+r10+416* 8]
    paddd           m3, m0, [t2+r10+416*10]
    mova [t2+r10+416* 6], m4
    mova [t2+r10+416* 8], m5
    mova [t2+r10+416*10], m0
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m9             ; -((a3 + 8) >> 4) * 9
    pmulld          m3, m9
    psrlw           m5, m17, 1
    pavgw           m5, m6             ; (b3 + 2) >> 2
    punpcklwd       m4, m5, m6
    vpdpwssd        m2, m4, m4         ; -p3
    punpckhwd       m5, m6
    vpdpwssd        m3, m5, m5
    punpcklwd      m16, m6, m17        ; b3
    punpckhwd      m17, m6, m17
    pminsd          m2, m6
    pminsd          m3, m6
    pmulld          m2, m12            ; p3 * s1
    pmulld          m3, m12
    pmaddwd        m16, m13            ; b3 * 455
    pmaddwd        m17, m13
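; z3 -> x3 lookup: the 256-byte sgr_x_by_x table is kept in m18-m21
; and indexed per byte with vpermi2b/vpermt2b (two 128-byte halves);
; vpmovb2m grabs the sign bit of the biased index, i.e. whether
; min(z3, 255) >= 128, to select between the two halves.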
    vpalignr    m3{k2}, m2, m2, 2
    mova            m2, m20
    paddusw         m3, m14
    psraw           m3, 4              ; min(z3, 255) - 256
    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8    m3{k3}, m2             ; x3
    pandn           m2, m15, m3
    psrld           m3, 16
    pmulld         m16, m2
    pmulld         m17, m3
    packssdw        m2, m3
    mova [t4+r10*1+416*2+4], m2
    psubd          m16, m15            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    psubd          m17, m15
    psrld          m16, 12
    psrld          m17, 12
    mova          [t3+r10*2+416*4+  8], xm16
    mova          [t3+r10*2+416*4+ 24], xm17
    vextracti128  [t3+r10*2+416*4+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*4+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*4+104], m16, 3
    vextracti32x4 [t3+r10*2+416*4+120], m17, 3
    add            r10, 64
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
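; odd rows finish both windows: ab3 for the current row, then ab5
; from the 5x5 sums accumulated over this row and the previous one.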
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movq          xm16, [leftq+2]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    add          leftq, 8
    jmp .hv1_main
.hv1_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16  m16{k1}, [lpfq+wq-6]
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu           m16, [lpfq+r10- 2]
.hv1_main:
    movu           m17, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -68
    jl .hv1_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd     m17, m0, [r13+r10+16], 0xe4
.hv1_have_right:
    palignr         m1, m17, m16, 2
    palignr         m3, m17, m16, 4
    paddw           m2, m1, m3
    punpcklwd       m0, m1, m3
    pmaddwd         m0, m0
    punpckhwd       m1, m3
    pmaddwd         m1, m1
    palignr         m3, m17, m16, 6
    paddw           m2, m3             ; h sum3
    punpcklwd       m5, m3, m6
    vpdpwssd        m0, m5, m5         ; h sumsq3
    punpckhwd       m3, m6
    vpdpwssd        m1, m3, m3
    shufpd          m3, m16, m17, 0x55
    punpcklwd       m5, m16, m3
    paddw           m4, m16, m3
    punpckhwd      m16, m3
    paddw          m17, m2, [t2+r10+416* 6]
    mova [t2+r10+416* 6], m2
    paddw           m4, m2             ; h sum5
    paddd           m2, m0, [t2+r10+416* 8]
    paddd           m3, m1, [t2+r10+416*10]
    mova [t2+r10+416* 8], m0
    mova [t2+r10+416*10], m1
    vpdpwssd        m0, m5, m5         ; h sumsq5
    vpdpwssd        m1, m16, m16
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m9             ; -((a3 + 8) >> 4) * 9
    pmulld          m3, m9
    psrlw          m16, m17, 1
    pavgw          m16, m6             ; (b3 + 2) >> 2
    punpcklwd       m5, m16, m6
    vpdpwssd        m2, m5, m5         ; -p3
    punpckhwd      m16, m6
    vpdpwssd        m3, m16, m16
    punpcklwd      m16, m6, m17        ; b3
    punpckhwd      m17, m6, m17
    pminsd          m2, m6
    pminsd          m3, m6
    pmulld          m2, m12            ; p3 * s1
    pmulld          m3, m12
    pmaddwd        m16, m13            ; b3 * 455
    pmaddwd        m17, m13
    vpalignr    m3{k2}, m2, m2, 2
    mova            m2, m20
    paddusw         m3, m14
    psraw           m3, 4              ; min(z3, 255) - 256
    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8    m3{k3}, m2             ; x3
    pandn           m2, m15, m3
    psrld           m3, 16
    pmulld         m16, m2
    pmulld         m17, m3
    packssdw        m2, m3
    mova [t4+r10*1+416*4+4], m2
    psubd          m16, m15            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    psubd          m17, m15
    psrld          m16, 12
    psrld          m17, 12
    paddw           m5, m4, [t2+r10+416*0]
    paddd           m2, m0, [t2+r10+416*2]
    paddd           m3, m1, [t2+r10+416*4]
    paddw           m5, [t1+r10+416*0]
    paddd           m2, [t1+r10+416*2]
    paddd           m3, [t1+r10+416*4]
    mova [t2+r10+416*0], m4
    mova [t2+r10+416*2], m0
    mova [t2+r10+416*4], m1
    mova          [t3+r10*2+416*8+  8], xm16
    mova          [t3+r10*2+416*8+ 24], xm17
    vextracti128  [t3+r10*2+416*8+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*8+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*8+104], m16, 3
    vextracti32x4 [t3+r10*2+416*8+120], m17, 3
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m10            ; -((a5 + 8) >> 4) * 25
    pmulld          m3, m10
    psrlw          m17, m5, 1
    pavgw          m17, m6             ; (b5 + 2) >> 2
    punpcklwd      m16, m17, m6
    vpdpwssd        m2, m16, m16       ; -p5
    punpckhwd      m17, m6
    vpdpwssd        m3, m17, m17
    punpcklwd      m16, m5, m6         ; b5
    punpckhwd      m17, m5, m6
    pmulld          m2, m11            ; p5 * s0
    pmulld          m3, m11
    pmaddwd        m16, m13            ; b5 * 164
    pmaddwd        m17, m13
    vpalignr    m3{k2}, m2, m2, 2
    mova            m2, m20
    pmaxsw          m3, m6
    paddusw         m3, m14
    psraw           m3, 4              ; min(z5, 255) - 256
    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8    m3{k3}, m2             ; x5
    pandn           m2, m15, m3
    psrld           m3, 16
    pmulld         m16, m2
    pmulld         m17, m3
    packssdw        m2, m3
    mova [t4+r10*1+416*0+4], m2
    psubd          m16, m15            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    psubd          m17, m15
    psrld          m16, 12
    psrld          m17, 12
    mova          [t3+r10*2+416*0+  8], xm16
    mova          [t3+r10*2+416*0+ 24], xm17
    vextracti128  [t3+r10*2+416*0+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*0+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*0+104], m16, 3
    vextracti32x4 [t3+r10*2+416*0+120], m17, 3
    add            r10, 64
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova           m16, [t1+r10+416* 6]
    mova            m2, [t1+r10+416* 8]
    mova            m3, [t1+r10+416*10]
    paddw          m16, m16
    paddd           m2, m2
    paddd           m3, m3
    paddw          m17, m16, [t2+r10+416* 6]
    paddd           m4, m2, [t2+r10+416* 8]
    paddd           m5, m3, [t2+r10+416*10]
    mova [t2+r10+416* 6], m16
    mova [t2+r10+416* 8], m2
    mova [t2+r10+416*10], m3
    paddd           m4, m8
    paddd           m5, m8
    psrld           m4, 4              ; (a3 + 8) >> 4
    psrld           m5, 4
    pmulld          m4, m9             ; -((a3 + 8) >> 4) * 9
    pmulld          m5, m9
    psrlw           m3, m17, 1
    pavgw           m3, m6             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m6
    vpdpwssd        m4, m2, m2         ; -p3
    punpckhwd       m3, m6
    vpdpwssd        m5, m3, m3
    punpcklwd      m16, m6, m17        ; b3
    punpckhwd      m17, m6, m17
    pminsd          m4, m6
    pminsd          m5, m6
    pmulld          m4, m12            ; p3 * s1
    pmulld          m5, m12
    pmaddwd        m16, m13            ; b3 * 455
    pmaddwd        m17, m13
    vpalignr    m5{k2}, m4, m4, 2
    mova            m4, m20
    paddusw         m5, m14
    psraw           m5, 4              ; min(z3, 255) - 256
    vpermt2b        m4, m5, m21        ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m5
    vpermi2b        m5, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8    m5{k3}, m4             ; x3
    pandn           m4, m15, m5
    psrld           m5, 16
    pmulld         m16, m4
    pmulld         m17, m5
    packssdw        m4, m5
    mova [t4+r10*1+416*2+4], m4
    psubd          m16, m15            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    psubd          m17, m15
    psrld          m16, 12
    psrld          m17, 12
    mova            m3, [t1+r10+416*0]
    mova            m4, [t1+r10+416*2]
    mova            m5, [t1+r10+416*4]
    mova [t3+r10*2+416*8+ 8], m3
    mova [t3+r10*2+416*0+ 8], m4
    mova [t3+r10*2+416*0+72], m5
    paddw           m3, m3              ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova [t1+r10+416*0], m3
    mova [t1+r10+416*2], m4
    mova [t1+r10+416*4], m5
    mova          [t3+r10*2+416*4+  8], xm16
    mova          [t3+r10*2+416*4+ 24], xm17
    vextracti128  [t3+r10*2+416*4+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*4+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*4+104], m16, 3
    vextracti32x4 [t3+r10*2+416*4+120], m17, 3
    add            r10, 64
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova           m16, [t1+r10+416* 6]
    mova            m2, [t1+r10+416* 8]
    mova            m3, [t1+r10+416*10]
    paddw          m17, m16, [t2+r10+416* 6]
    paddd           m4, m2, [t2+r10+416* 8]
    paddd           m5, m3, [t2+r10+416*10]
    mova [t2+r10+416* 6], m16
    mova [t2+r10+416* 8], m2
    mova [t2+r10+416*10], m3
    paddd           m4, m8
    paddd           m5, m8
    psrld           m4, 4              ; (a3 + 8) >> 4
    psrld           m5, 4
    pmulld          m4, m9             ; -((a3 + 8) >> 4) * 9
    pmulld          m5, m9
    psrlw           m3, m17, 1
    pavgw           m3, m6             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m6
    vpdpwssd        m4, m2, m2         ; -p3
    punpckhwd       m3, m6
    vpdpwssd        m5, m3, m3
    punpcklwd      m16, m6, m17        ; b3
    punpckhwd      m17, m6, m17
    pminsd          m4, m6
    pminsd          m5, m6
    pmulld          m4, m12            ; p3 * s1
    pmulld          m5, m12
    pmaddwd        m16, m13            ; b3 * 455
    pmaddwd        m17, m13
    vpalignr    m5{k2}, m4, m4, 2
    mova            m4, m20
    paddusw         m5, m14
    psraw           m5, 4              ; min(z3, 255) - 256
    vpermt2b        m4, m5, m21        ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m5
    vpermi2b        m5, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8    m5{k3}, m4             ; x3
    pandn           m4, m15, m5
    psrld           m5, 16
    pmulld         m16, m4
    pmulld         m17, m5
    packssdw        m4, m5
    mova [t4+r10*1+416*4+4], m4
    psubd          m16, m15            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    psubd          m17, m15
    psrld          m16, 12
    psrld          m17, 12
    mova            m0, [t3+r10*2+416*8+ 8]
    mova            m4, [t3+r10*2+416*0+ 8]
    mova            m5, [t3+r10*2+416*0+72]
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m2, m4, [t2+r10+416*2]
    paddd           m3, m5, [t2+r10+416*4]
    paddw           m1, [t1+r10+416*0]
    paddd           m2, [t1+r10+416*2]
    paddd           m3, [t1+r10+416*4]
    mova [t2+r10+416*0], m0
    mova [t2+r10+416*2], m4
    mova [t2+r10+416*4], m5
    mova          [t3+r10*2+416*8+  8], xm16
    mova          [t3+r10*2+416*8+ 24], xm17
    vextracti128  [t3+r10*2+416*8+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*8+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*8+104], m16, 3
    vextracti32x4 [t3+r10*2+416*8+120], m17, 3
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m10            ; -((a5 + 8) >> 4) * 25
    pmulld          m3, m10
    psrlw           m5, m1, 1
    pavgw           m5, m6             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m6
    vpdpwssd        m2, m4, m4         ; -p5
    punpckhwd       m5, m6
    vpdpwssd        m3, m5, m5
    punpcklwd      m16, m1, m6         ; b5
    punpckhwd      m17, m1, m6
    pmulld          m2, m11            ; p5 * s0
    pmulld          m3, m11
    pmaddwd        m16, m13            ; b5 * 164
    pmaddwd        m17, m13
    vpalignr    m3{k2}, m2, m2, 2
    mova            m2, m20
    pmaxsw          m3, m6
    paddusw         m3, m14
    psraw           m3, 4              ; min(z5, 255) - 256
    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
    vmovdqu8    m3{k3}, m2             ; x5
    pandn           m2, m15, m3
    psrld           m3, 16
    pmulld         m16, m2
    pmulld         m17, m3
    packssdw        m2, m3
    mova [t4+r10*1+416*0+4], m2
    psubd          m16, m15            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    psubd          m17, m15
    psrld          m16, 12
    psrld          m17, 12
    mova          [t3+r10*2+416*0+  8], xm16
    mova          [t3+r10*2+416*0+ 24], xm17
    vextracti128  [t3+r10*2+416*0+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*0+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*0+104], m16, 3
    vextracti32x4 [t3+r10*2+416*0+120], m17, 3
    add            r10, 64
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
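; the 5x5 neighbour smoothing uses 5:6:5 weights; in scalar terms,
; with s = v[x-1] + v[x] + v[x+1]:
;   v565[x] = s * 4 + (s + v[x]) = 5*v[x-1] + 6*v[x] + 5*v[x+1]
; the 3x3 terms keep the same 343/444 scheme as the 3x3-only filter.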
    mov            r10, wq
.prep_n_loop:
    movu           ym0, [t4+r10*1+416*0+2]
    paddw          ym2, ym0, [t4+r10*1+416*0+0]
    paddw          ym2, [t4+r10*1+416*0+4]
    movu            m1, [t3+r10*2+416*0+4]
    paddd           m3, m1, [t3+r10*2+416*0+0]
    paddd           m3, [t3+r10*2+416*0+8]
    paddw          ym0, ym2
    paddd           m1, m3
    psllw          ym2, 2
    pslld           m3, 2
    paddw          ym0, ym2              ; a5 565
    paddd           m1, m3               ; b5 565
    mova [t4+r10*1+416* 6], ym0
    mova [t3+r10*2+416*12], m1
    mova           ym0, [t4+r10*1+416*2+0]
    paddw          ym0, [t4+r10*1+416*2+4]
    paddw          ym2, ym0, [t4+r10*1+416*2+2]
    mova            m1, [t3+r10*2+416*4+0]
    paddd           m1, [t3+r10*2+416*4+8]
    paddd           m3, m1, [t3+r10*2+416*4+4]
    psllw          ym2, 2                ; a3[-1] 444
    pslld           m3, 2                ; b3[-1] 444
    psubw          ym2, ym0              ; a3[-1] 343
    psubd           m3, m1               ; b3[-1] 343
    mova [t4+r10*1+416* 8], ym2
    mova [t3+r10*2+416*16], m3
    mova           ym0, [t4+r10*1+416*4+0]
    paddw          ym0, [t4+r10*1+416*4+4]
    paddw          ym2, ym0, [t4+r10*1+416*4+2]
    mova            m1, [t3+r10*2+416*8+0]
    paddd           m1, [t3+r10*2+416*8+8]
    paddd           m3, m1, [t3+r10*2+416*8+4]
    psllw          ym2, 2                 ; a3[ 0] 444
    pslld           m3, 2                 ; b3[ 0] 444
    mova [t4+r10*1+416*10], ym2
    mova [t3+r10*2+416*20], m3
    psubw          ym2, ym0               ; a3[ 0] 343
    psubd           m3, m1                ; b3[ 0] 343
    mova [t4+r10*1+416*12], ym2
    mova [t3+r10*2+416*24], m3
    add            r10, 32
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
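; blend both filter outputs: t5 = b5 - a5*src and t3 = b3 - a3*src
; are formed as dwords; t5 is shifted right by 9 while t3 is shifted
; left by 7 so that its high word equals t3 >> 9, and the k2 byte
; mask merges them into one word-interleaved vector. vpdpwssd then
; accumulates w0*(t5 >> 9) + w1*(t3 >> 9) on top of src << 13 (with
; the rounding bias from m22), and the total is scaled back down and
; clipped via vpmovusdw.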
    mov            r10, wq
.n0_loop:
    movu           ym2, [t4+r10*1+2]
    paddw          ym0, ym2, [t4+r10*1+0]
    paddw          ym0, [t4+r10*1+4]
    paddw          ym2, ym0
    psllw          ym0, 2
    paddw          ym0, ym2              ; a5
    movu            m1, [t3+r10*2+4]
    paddd           m4, m1, [t3+r10*2+0]
    paddd           m4, [t3+r10*2+8]
    paddd           m1, m4
    pslld           m4, 2
    paddd           m4, m1               ; b5
    paddw          ym2, ym0, [t4+r10*1+416* 6]
    mova [t4+r10*1+416* 6], ym0
    paddd           m0, m4, [t3+r10*2+416*12]
    mova [t3+r10*2+416*12], m4
    mova           ym3, [t4+r10*1+416*2+0]
    paddw          ym3, [t4+r10*1+416*2+4]
    paddw          ym5, ym3, [t4+r10*1+416*2+2]
    psllw          ym5, 2                ; a3[ 1] 444
    psubw          ym4, ym5, ym3         ; a3[ 1] 343
    paddw          ym3, ym4, [t4+r10*1+416* 8]
    paddw          ym3, [t4+r10*1+416*10]
    mova [t4+r10*1+416* 8], ym4
    mova [t4+r10*1+416*10], ym5
    mova            m1, [t3+r10*2+416*4+0]
    paddd           m1, [t3+r10*2+416*4+8]
    paddd           m5, m1, [t3+r10*2+416*4+4]
    pslld           m5, 2                ; b3[ 1] 444
    psubd           m4, m5, m1           ; b3[ 1] 343
    paddd           m1, m4, [t3+r10*2+416*16]
    paddd           m1, [t3+r10*2+416*20]
    mova [t3+r10*2+416*16], m4
    mova [t3+r10*2+416*20], m5
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m2, ym2              ; a5
    pmovzxwd        m3, ym3              ; a3
    pmaddwd         m2, m4               ; a5 * src
    pmaddwd         m3, m4               ; a3 * src
    vpshldd         m4, m22, 13
    psubd           m0, m2               ; b5 - a5 * src + (1 << 8)
    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
    psrld           m0, 9
    pslld           m1, 7
    vpblendmb   m0{k2}, m1, m0
    vpdpwssd        m4, m0, m7
    psrad           m4, 7
    pmaxsd          m4, m6
    vpmovusdw     ym16, m4               ; clip
    psrlw         ym16, 6
    mova    [dstq+r10], ym16
    add            r10, 32
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova           ym3, [t4+r10*1+416*4+0]
    paddw          ym3, [t4+r10*1+416*4+4]
    paddw          ym5, ym3, [t4+r10*1+416*4+2]
    psllw          ym5, 2                ; a3[ 1] 444
    psubw          ym4, ym5, ym3         ; a3[ 1] 343
    paddw          ym3, ym4, [t4+r10*1+416*12]
    paddw          ym3, [t4+r10*1+416*10]
    mova [t4+r10*1+416*10], ym5
    mova [t4+r10*1+416*12], ym4
    mova            m0, [t3+r10*2+416*8+0]
    paddd           m0, [t3+r10*2+416*8+8]
    paddd           m5, m0, [t3+r10*2+416*8+4]
    pslld           m5, 2                ; b3[ 1] 444
    psubd           m4, m5, m0           ; b3[ 1] 343
    paddd           m0, m4, [t3+r10*2+416*24]
    paddd           m0, [t3+r10*2+416*20]
    mova [t3+r10*2+416*20], m5
    mova [t3+r10*2+416*24], m4
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m2, [t4+r10*1+416* 6]
    pmovzxwd        m3, ym3
    mova            m1, [t3+r10*2+416*12]
    pmaddwd         m2, m4               ; a5 * src
    pmaddwd         m3, m4               ; a3 * src
    vpshldd         m4, m22, 13
    psubd           m1, m2               ; b5 - a5 * src + (1 << 8)
    psubd           m0, m3               ; b3 - a3 * src + (1 << 8)
    pslld           m0, 7
    vpalignr    m0{k2}, m1, m1, 1
    vpdpwssd        m4, m0, m7
    psrad           m4, 7
    pmaxsd          m4, m6
    vpmovusdw     ym16, m4               ; clip
    psrlw         ym16, 6
    mova    [dstq+r10], ym16
    add            r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret

%endif ; ARCH_X86_64