xref: /aosp_15_r20/external/libdav1d/src/x86/looprestoration_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 32
32
33wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
34               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
35wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
36wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
37wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
38sgr_l_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
39sgr_r_ext:     times 16 db 1
40               times 16 db 9
41sgr_shuf:      db  1, -1,  2, -1,  3, -1,  4, -1,  5, -1,  6, -1,  7, -1,  8, -1
42               db  9, -1, 10, -1, 11, -1, 12, -1
43
44pb_m5:         times 4 db -5
45pb_3:          times 4 db 3
46pw_5_6:        dw 5, 6
47pw_164_24:     dw 164, 24
48pw_455_24:     dw 455, 24
49pw_256:        times 2 dw 256
50pw_2056:       times 2 dw 2056
51pw_m16380:     times 2 dw -16380
52pd_25:         dd 25
53pd_34816:      dd 34816
54pd_m4096:      dd -4096
55pf_256:        dd 256.0
56
57cextern pb_0to63
58
59SECTION .text
60
61DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
62
63INIT_YMM avx2
64cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
65                                                    w, h, edge, flt
66    mov           fltq, r6mp
67    movifnidn       hd, hm
68    mov          edged, r7m
69    mov             wd, wm
70    vbroadcasti128  m6, [wiener_shufA]
71    vpbroadcastb   m11, [fltq+ 0] ; x0 x0
72    vbroadcasti128  m7, [wiener_shufB]
73    vpbroadcastd   m12, [fltq+ 2]
74    vbroadcasti128  m8, [wiener_shufC]
75    packsswb       m12, m12       ; x1 x2
76    vpbroadcastw   m13, [fltq+ 6] ; x3
77    vbroadcasti128  m9, [sgr_shuf+6]
78    add           lpfq, wq
79    vpbroadcastd   m10, [pw_m16380]
80    vpbroadcastd   m14, [fltq+16] ; y0 y1
81    add           dstq, wq
82    vpbroadcastd   m15, [fltq+20] ; y2 y3
83    lea             t1, [rsp+wq*2+16]
84    psllw          m14, 5
85    neg             wq
86    psllw          m15, 5
87    test         edgeb, 4 ; LR_HAVE_TOP
88    jz .no_top
89    call .h_top
90    add           lpfq, strideq
91    mov             t6, t1
92    mov             t5, t1
93    add             t1, 384*2
94    call .h_top
95    lea            r10, [lpfq+strideq*4]
96    mov           lpfq, dstq
97    mov             t4, t1
98    add             t1, 384*2
99    add            r10, strideq
100    mov          [rsp], r10 ; below
101    call .h
102    mov             t3, t1
103    mov             t2, t1
104    dec             hd
105    jz .v1
106    add           lpfq, strideq
107    add             t1, 384*2
108    call .h
109    mov             t2, t1
110    dec             hd
111    jz .v2
112    add           lpfq, strideq
113    add             t1, 384*2
114    call .h
115    dec             hd
116    jz .v3
117.main:
118    lea             t0, [t1+384*2]
119.main_loop:
120    call .hv
121    dec             hd
122    jnz .main_loop
123    test         edgeb, 8 ; LR_HAVE_BOTTOM
124    jz .v3
125    mov           lpfq, [rsp]
126    call .hv_bottom
127    add           lpfq, strideq
128    call .hv_bottom
129.v1:
130    call .v
131    RET
132.no_top:
133    lea            r10, [lpfq+strideq*4]
134    mov           lpfq, dstq
135    lea            r10, [r10+strideq*2]
136    mov          [rsp], r10
137    call .h
138    mov             t6, t1
139    mov             t5, t1
140    mov             t4, t1
141    mov             t3, t1
142    mov             t2, t1
143    dec             hd
144    jz .v1
145    add           lpfq, strideq
146    add             t1, 384*2
147    call .h
148    mov             t2, t1
149    dec             hd
150    jz .v2
151    add           lpfq, strideq
152    add             t1, 384*2
153    call .h
154    dec             hd
155    jz .v3
156    lea             t0, [t1+384*2]
157    call .hv
158    dec             hd
159    jz .v3
160    add             t0, 384*8
161    call .hv
162    dec             hd
163    jnz .main
164.v3:
165    call .v
166.v2:
167    call .v
168    jmp .v1
169.extend_right:
170    movd           xm2, r10d
171    vpbroadcastd    m0, [pb_3]
172    vpbroadcastd    m1, [pb_m5]
173    vpbroadcastb    m2, xm2
174    mova            m3, [pb_0to63]
175    psubb           m0, m2
176    psubb           m1, m2
177    pminub          m0, m3
178    pminub          m1, m3
179    pshufb          m4, m0
180    pshufb          m5, m1
181    ret
182.h:
183    mov            r10, wq
184    test         edgeb, 1 ; LR_HAVE_LEFT
185    jz .h_extend_left
186    movd           xm4, [leftq]
187    vpblendd        m4, [lpfq+r10-4], 0xfe
188    add          leftq, 4
189    jmp .h_main
190.h_extend_left:
191    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
192    mova            m4, [lpfq+r10] ; before the start of the buffer
193    palignr         m4, m5, 12
194    pshufb          m4, [wiener_l_shuf]
195    jmp .h_main
196.h_top:
197    mov            r10, wq
198    test         edgeb, 1 ; LR_HAVE_LEFT
199    jz .h_extend_left
200.h_loop:
201    movu            m4, [lpfq+r10-4]
202.h_main:
203    movu            m5, [lpfq+r10+4]
204    test         edgeb, 2 ; LR_HAVE_RIGHT
205    jnz .h_have_right
206    cmp           r10d, -34
207    jl .h_have_right
208    call .extend_right
209.h_have_right:
210    pshufb          m0, m4, m6
211    pmaddubsw       m0, m11
212    pshufb          m1, m5, m6
213    pmaddubsw       m1, m11
214    pshufb          m2, m4, m7
215    pmaddubsw       m2, m12
216    pshufb          m3, m5, m7
217    pmaddubsw       m3, m12
218    paddw           m0, m2
219    pshufb          m2, m4, m8
220    pmaddubsw       m2, m12
221    paddw           m1, m3
222    pshufb          m3, m5, m8
223    pmaddubsw       m3, m12
224    pshufb          m4, m9
225    paddw           m0, m2
226    pmullw          m2, m4, m13
227    pshufb          m5, m9
228    paddw           m1, m3
229    pmullw          m3, m5, m13
230    psllw           m4, 7
231    psllw           m5, 7
232    paddw           m4, m10
233    paddw           m5, m10
234    paddw           m0, m2
235    vpbroadcastd    m2, [pw_2056]
236    paddw           m1, m3
237    paddsw          m0, m4
238    paddsw          m1, m5
239    psraw           m0, 3
240    psraw           m1, 3
241    paddw           m0, m2
242    paddw           m1, m2
243    mova [t1+r10*2+ 0], m0
244    mova [t1+r10*2+32], m1
245    add            r10, 32
246    jl .h_loop
247    ret
248ALIGN function_align
249.hv:
250    add           lpfq, strideq
251    mov            r10, wq
252    test         edgeb, 1 ; LR_HAVE_LEFT
253    jz .hv_extend_left
254    movd           xm4, [leftq]
255    vpblendd        m4, [lpfq+r10-4], 0xfe
256    add          leftq, 4
257    jmp .hv_main
258.hv_extend_left:
259    movu            m4, [lpfq+r10-4]
260    pshufb          m4, [wiener_l_shuf]
261    jmp .hv_main
262.hv_bottom:
263    mov            r10, wq
264    test         edgeb, 1 ; LR_HAVE_LEFT
265    jz .hv_extend_left
266.hv_loop:
267    movu            m4, [lpfq+r10-4]
268.hv_main:
269    movu            m5, [lpfq+r10+4]
270    test         edgeb, 2 ; LR_HAVE_RIGHT
271    jnz .hv_have_right
272    cmp           r10d, -34
273    jl .hv_have_right
274    call .extend_right
275.hv_have_right:
276    pshufb          m0, m4, m6
277    pmaddubsw       m0, m11
278    pshufb          m1, m5, m6
279    pmaddubsw       m1, m11
280    pshufb          m2, m4, m7
281    pmaddubsw       m2, m12
282    pshufb          m3, m5, m7
283    pmaddubsw       m3, m12
284    paddw           m0, m2
285    pshufb          m2, m4, m8
286    pmaddubsw       m2, m12
287    paddw           m1, m3
288    pshufb          m3, m5, m8
289    pmaddubsw       m3, m12
290    pshufb          m4, m9
291    paddw           m0, m2
292    pmullw          m2, m4, m13
293    pshufb          m5, m9
294    paddw           m1, m3
295    pmullw          m3, m5, m13
296    psllw           m4, 7
297    psllw           m5, 7
298    paddw           m4, m10
299    paddw           m5, m10
300    paddw           m0, m2
301    paddw           m1, m3
302    mova            m2, [t4+r10*2]
303    paddw           m2, [t2+r10*2]
304    mova            m3, [t3+r10*2]
305    paddsw          m0, m4
306    vpbroadcastd    m4, [pw_2056]
307    paddsw          m1, m5
308    mova            m5, [t5+r10*2]
309    paddw           m5, [t1+r10*2]
310    psraw           m0, 3
311    psraw           m1, 3
312    paddw           m0, m4
313    paddw           m1, m4
314    paddw           m4, m0, [t6+r10*2]
315    mova    [t0+r10*2], m0
316    punpcklwd       m0, m2, m3
317    pmaddwd         m0, m15
318    punpckhwd       m2, m3
319    pmaddwd         m2, m15
320    punpcklwd       m3, m4, m5
321    pmaddwd         m3, m14
322    punpckhwd       m4, m5
323    pmaddwd         m4, m14
324    paddd           m0, m3
325    paddd           m4, m2
326    mova            m2, [t4+r10*2+32]
327    paddw           m2, [t2+r10*2+32]
328    mova            m3, [t3+r10*2+32]
329    mova            m5, [t5+r10*2+32]
330    paddw           m5, [t1+r10*2+32]
331    packuswb        m0, m4
332    paddw           m4, m1, [t6+r10*2+32]
333    mova [t0+r10*2+32], m1
334    punpcklwd       m1, m2, m3
335    pmaddwd         m1, m15
336    punpckhwd       m2, m3
337    pmaddwd         m2, m15
338    punpcklwd       m3, m4, m5
339    pmaddwd         m3, m14
340    punpckhwd       m4, m5
341    pmaddwd         m4, m14
342    paddd           m1, m3
343    paddd           m2, m4
344    packuswb        m1, m2
345    psrlw           m0, 8
346    psrlw           m1, 8
347    packuswb        m0, m1
348    mova    [dstq+r10], m0
349    add            r10, 32
350    jl .hv_loop
351    mov             t6, t5
352    mov             t5, t4
353    mov             t4, t3
354    mov             t3, t2
355    mov             t2, t1
356    mov             t1, t0
357    mov             t0, t6
358    add           dstq, strideq
359    ret
360.v:
361    mov            r10, wq
362.v_loop:
363    mova            m2, [t4+r10*2+ 0]
364    paddw           m2, [t2+r10*2+ 0]
365    mova            m4, [t3+r10*2+ 0]
366    mova            m6, [t1+r10*2+ 0]
367    paddw           m8, m6, [t6+r10*2+ 0]
368    paddw           m6, [t5+r10*2+ 0]
369    mova            m3, [t4+r10*2+32]
370    paddw           m3, [t2+r10*2+32]
371    mova            m5, [t3+r10*2+32]
372    mova            m7, [t1+r10*2+32]
373    paddw           m9, m7, [t6+r10*2+32]
374    paddw           m7, [t5+r10*2+32]
375    punpcklwd       m0, m2, m4
376    pmaddwd         m0, m15
377    punpckhwd       m2, m4
378    pmaddwd         m2, m15
379    punpcklwd       m4, m8, m6
380    pmaddwd         m4, m14
381    punpckhwd       m6, m8, m6
382    pmaddwd         m6, m14
383    punpcklwd       m1, m3, m5
384    pmaddwd         m1, m15
385    punpckhwd       m3, m5
386    pmaddwd         m3, m15
387    punpcklwd       m5, m9, m7
388    pmaddwd         m5, m14
389    punpckhwd       m7, m9, m7
390    pmaddwd         m7, m14
391    paddd           m0, m4
392    paddd           m2, m6
393    paddd           m1, m5
394    paddd           m3, m7
395    packuswb        m0, m2
396    packuswb        m1, m3
397    psrlw           m0, 8
398    psrlw           m1, 8
399    packuswb        m0, m1
400    mova    [dstq+r10], m0
401    add            r10, 32
402    jl .v_loop
403    mov             t6, t5
404    mov             t5, t4
405    mov             t4, t3
406    mov             t3, t2
407    mov             t2, t1
408    add           dstq, strideq
409    ret
410
411cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
412                                                  w, h, edge, flt
413    mov           fltq, r6mp
414    movifnidn       hd, hm
415    mov          edged, r7m
416    mov             wd, wm
417    vbroadcasti128  m6, [wiener_shufB]
418    vpbroadcastd   m12, [fltq+ 2]
419    vbroadcasti128  m7, [wiener_shufC]
420    packsswb       m12, m12       ; x1 x2
421    vpbroadcastw   m13, [fltq+ 6] ; x3
422    vbroadcasti128  m8, [sgr_shuf+6]
423    add           lpfq, wq
424    vpbroadcastd    m9, [pw_m16380]
425    vpbroadcastd   m10, [pw_2056]
426    mova           m11, [wiener_l_shuf]
427    vpbroadcastd   m14, [fltq+16] ; __ y1
428    add           dstq, wq
429    vpbroadcastd   m15, [fltq+20] ; y2 y3
430    lea             t1, [rsp+wq*2+16]
431    psllw          m14, 5
432    neg             wq
433    psllw          m15, 5
434    test         edgeb, 4 ; LR_HAVE_TOP
435    jz .no_top
436    call .h_top
437    add           lpfq, strideq
438    mov             t4, t1
439    add             t1, 384*2
440    call .h_top
441    lea            r10, [lpfq+strideq*4]
442    mov           lpfq, dstq
443    mov             t3, t1
444    add             t1, 384*2
445    add            r10, strideq
446    mov          [rsp], r10 ; below
447    call .h
448    mov             t2, t1
449    dec             hd
450    jz .v1
451    add           lpfq, strideq
452    add             t1, 384*2
453    call .h
454    dec             hd
455    jz .v2
456.main:
457    mov             t0, t4
458.main_loop:
459    call .hv
460    dec             hd
461    jnz .main_loop
462    test         edgeb, 8 ; LR_HAVE_BOTTOM
463    jz .v2
464    mov           lpfq, [rsp]
465    call .hv_bottom
466    add           lpfq, strideq
467    call .hv_bottom
468.end:
469    RET
470.no_top:
471    lea            r10, [lpfq+strideq*4]
472    mov           lpfq, dstq
473    lea            r10, [r10+strideq*2]
474    mov          [rsp], r10
475    call .h
476    mov             t4, t1
477    mov             t3, t1
478    mov             t2, t1
479    dec             hd
480    jz .v1
481    add           lpfq, strideq
482    add             t1, 384*2
483    call .h
484    dec             hd
485    jz .v2
486    lea             t0, [t1+384*2]
487    call .hv
488    dec             hd
489    jz .v2
490    add             t0, 384*6
491    call .hv
492    dec             hd
493    jnz .main
494.v2:
495    call .v
496    mov             t4, t3
497    mov             t3, t2
498    mov             t2, t1
499    add           dstq, strideq
500.v1:
501    call .v
502    jmp .end
503.h:
504    mov            r10, wq
505    test         edgeb, 1 ; LR_HAVE_LEFT
506    jz .h_extend_left
507    movd           xm4, [leftq]
508    vpblendd        m4, [lpfq+r10-4], 0xfe
509    add          leftq, 4
510    jmp .h_main
511.h_extend_left:
512    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
513    mova            m4, [lpfq+r10] ; before the start of the buffer
514    palignr         m4, m5, 12
515    pshufb          m4, m11
516    jmp .h_main
517.h_top:
518    mov            r10, wq
519    test         edgeb, 1 ; LR_HAVE_LEFT
520    jz .h_extend_left
521.h_loop:
522    movu            m4, [lpfq+r10-4]
523.h_main:
524    movu            m5, [lpfq+r10+4]
525    test         edgeb, 2 ; LR_HAVE_RIGHT
526    jnz .h_have_right
527    cmp           r10d, -33
528    jl .h_have_right
529    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
530.h_have_right:
531    pshufb          m0, m4, m6
532    pmaddubsw       m0, m12
533    pshufb          m1, m5, m6
534    pmaddubsw       m1, m12
535    pshufb          m2, m4, m7
536    pmaddubsw       m2, m12
537    pshufb          m3, m5, m7
538    pmaddubsw       m3, m12
539    pshufb          m4, m8
540    paddw           m0, m2
541    pmullw          m2, m4, m13
542    pshufb          m5, m8
543    paddw           m1, m3
544    pmullw          m3, m5, m13
545    psllw           m4, 7
546    psllw           m5, 7
547    paddw           m4, m9
548    paddw           m5, m9
549    paddw           m0, m2
550    paddw           m1, m3
551    paddsw          m0, m4
552    paddsw          m1, m5
553    psraw           m0, 3
554    psraw           m1, 3
555    paddw           m0, m10
556    paddw           m1, m10
557    mova [t1+r10*2+ 0], m0
558    mova [t1+r10*2+32], m1
559    add            r10, 32
560    jl .h_loop
561    ret
562ALIGN function_align
563.hv:
564    add           lpfq, strideq
565    mov            r10, wq
566    test         edgeb, 1 ; LR_HAVE_LEFT
567    jz .hv_extend_left
568    movd           xm4, [leftq]
569    vpblendd        m4, [lpfq+r10-4], 0xfe
570    add          leftq, 4
571    jmp .hv_main
572.hv_extend_left:
573    movu            m4, [lpfq+r10-4]
574    pshufb          m4, m11
575    jmp .hv_main
576.hv_bottom:
577    mov            r10, wq
578    test         edgeb, 1 ; LR_HAVE_LEFT
579    jz .hv_extend_left
580.hv_loop:
581    movu            m4, [lpfq+r10-4]
582.hv_main:
583    movu            m5, [lpfq+r10+4]
584    test         edgeb, 2 ; LR_HAVE_RIGHT
585    jnz .hv_have_right
586    cmp           r10d, -33
587    jl .hv_have_right
588    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
589.hv_have_right:
590    pshufb          m0, m4, m6
591    pmaddubsw       m0, m12
592    pshufb          m1, m5, m6
593    pmaddubsw       m1, m12
594    pshufb          m2, m4, m7
595    pmaddubsw       m2, m12
596    pshufb          m3, m5, m7
597    pmaddubsw       m3, m12
598    pshufb          m4, m8
599    paddw           m0, m2
600    pmullw          m2, m4, m13
601    pshufb          m5, m8
602    paddw           m1, m3
603    pmullw          m3, m5, m13
604    psllw           m4, 7
605    psllw           m5, 7
606    paddw           m4, m9
607    paddw           m5, m9
608    paddw           m0, m2
609    paddw           m1, m3
610    mova            m2, [t3+r10*2]
611    paddw           m2, [t1+r10*2]
612    mova            m3, [t2+r10*2]
613    paddsw          m0, m4
614    paddsw          m1, m5
615    psraw           m0, 3
616    psraw           m1, 3
617    paddw           m0, m10
618    paddw           m1, m10
619    paddw           m4, m0, [t4+r10*2]
620    mova    [t0+r10*2], m0
621    punpcklwd       m0, m2, m3
622    pmaddwd         m0, m15
623    punpckhwd       m2, m3
624    pmaddwd         m2, m15
625    punpcklwd       m3, m4, m4
626    pmaddwd         m3, m14
627    punpckhwd       m4, m4
628    pmaddwd         m4, m14
629    paddd           m0, m3
630    paddd           m4, m2
631    mova            m2, [t3+r10*2+32]
632    paddw           m2, [t1+r10*2+32]
633    mova            m3, [t2+r10*2+32]
634    packuswb        m0, m4
635    paddw           m4, m1, [t4+r10*2+32]
636    mova [t0+r10*2+32], m1
637    punpcklwd       m1, m2, m3
638    pmaddwd         m1, m15
639    punpckhwd       m2, m3
640    pmaddwd         m2, m15
641    punpcklwd       m3, m4, m4
642    pmaddwd         m3, m14
643    punpckhwd       m4, m4
644    pmaddwd         m4, m14
645    paddd           m1, m3
646    paddd           m2, m4
647    packuswb        m1, m2
648    psrlw           m0, 8
649    psrlw           m1, 8
650    packuswb        m0, m1
651    mova    [dstq+r10], m0
652    add            r10, 32
653    jl .hv_loop
654    mov             t4, t3
655    mov             t3, t2
656    mov             t2, t1
657    mov             t1, t0
658    mov             t0, t4
659    add           dstq, strideq
660    ret
661.v:
662    mov            r10, wq
663    psrld          m13, m14, 16 ; y1 __
664.v_loop:
665    mova            m6, [t1+r10*2+ 0]
666    paddw           m2, m6, [t3+r10*2+ 0]
667    mova            m4, [t2+r10*2+ 0]
668    mova            m7, [t1+r10*2+32]
669    paddw           m3, m7, [t3+r10*2+32]
670    mova            m5, [t2+r10*2+32]
671    paddw           m6, [t4+r10*2+ 0]
672    paddw           m7, [t4+r10*2+32]
673    punpcklwd       m0, m2, m4
674    pmaddwd         m0, m15
675    punpckhwd       m2, m4
676    pmaddwd         m2, m15
677    punpcklwd       m1, m3, m5
678    pmaddwd         m1, m15
679    punpckhwd       m3, m5
680    pmaddwd         m3, m15
681    punpcklwd       m5, m7, m6
682    pmaddwd         m4, m5, m14
683    punpckhwd       m7, m6
684    pmaddwd         m6, m7, m14
685    pmaddwd         m5, m13
686    pmaddwd         m7, m13
687    paddd           m0, m4
688    paddd           m2, m6
689    paddd           m1, m5
690    paddd           m3, m7
691    packuswb        m0, m2
692    packuswb        m1, m3
693    psrlw           m0, 8
694    psrlw           m1, 8
695    packuswb        m0, m1
696    mova    [dstq+r10], m0
697    add            r10, 32
698    jl .v_loop
699    ret
700
701cglobal sgr_filter_5x5_8bpc, 4, 12, 16, 400*24+16, dst, stride, left, lpf, \
702                                                   w, h, edge, params
703    mov        paramsq, r6mp
704    mov             wd, wm
705    movifnidn       hd, hm
706    vbroadcasti128  m8, [sgr_shuf+0]
707    mov          edged, r7m
708    vbroadcasti128  m9, [sgr_shuf+8]
709    add           lpfq, wq
710    vbroadcasti128 m10, [sgr_shuf+2]
711    add           dstq, wq
712    vbroadcasti128 m11, [sgr_shuf+6]
713    lea             t3, [rsp+wq*4+16+400*12]
714    vpbroadcastw    m7, [paramsq+8] ; w0
715    pxor            m6, m6
716    vpbroadcastd   m12, [paramsq+0] ; s0
717    lea             t1, [rsp+wq*2+20]
718    vpbroadcastd   m13, [pw_164_24]
719    neg             wq
720    vbroadcastss   m14, [pf_256]
721    psllw           m7, 4
722    vpbroadcastd   m15, [pd_m4096]
723    test         edgeb, 4 ; LR_HAVE_TOP
724    jz .no_top
725    call .h_top
726    add           lpfq, strideq
727    mov             t2, t1
728    call .top_fixup
729    add             t1, 400*6
730    call .h_top
731    lea            r10, [lpfq+strideq*4]
732    mov           lpfq, dstq
733    add            r10, strideq
734    mov          [rsp], r10 ; below
735    mov             t0, t2
736    dec             hd
737    jz .height1
738    or           edged, 16
739    call .h
740.main:
741    add           lpfq, strideq
742    call .hv
743    call .prep_n
744    sub             hd, 2
745    jl .extend_bottom
746.main_loop:
747    add           lpfq, strideq
748    test            hd, hd
749    jz .odd_height
750    call .h
751    add           lpfq, strideq
752    call .hv
753    call .n0
754    call .n1
755    sub             hd, 2
756    jge .main_loop
757    test         edgeb, 8 ; LR_HAVE_BOTTOM
758    jz .extend_bottom
759    mov           lpfq, [rsp]
760    call .h_top
761    add           lpfq, strideq
762    call .hv_bottom
763.end:
764    call .n0
765    call .n1
766.end2:
767    RET
768.height1:
769    call .hv
770    call .prep_n
771    jmp .odd_height_end
772.odd_height:
773    call .hv
774    call .n0
775    call .n1
776.odd_height_end:
777    call .v
778    call .n0
779    jmp .end2
780.extend_bottom:
781    call .v
782    jmp .end
783.no_top:
784    lea            r10, [lpfq+strideq*4]
785    mov           lpfq, dstq
786    lea            r10, [r10+strideq*2]
787    mov          [rsp], r10
788    call .h
789    lea             t2, [t1+400*6]
790    call .top_fixup
791    dec             hd
792    jz .no_top_height1
793    or           edged, 16
794    mov             t0, t1
795    mov             t1, t2
796    jmp .main
797.no_top_height1:
798    call .v
799    call .prep_n
800    jmp .odd_height_end
801.extend_right:
802    movd           xm2, r10d
803    mova            m0, [sgr_r_ext]
804    vpbroadcastb    m2, xm2
805    psubb           m0, m2
806    pminub          m0, [pb_0to63]
807    pshufb          m5, m0
808    ret
809.h: ; horizontal boxsum
810    lea            r10, [wq-2]
811    test         edgeb, 1 ; LR_HAVE_LEFT
812    jz .h_extend_left
813    vpbroadcastd   xm0, [leftq]
814    mova           xm5, [lpfq+wq]
815    palignr        xm5, xm0, 12
816    add          leftq, 4
817    jmp .h_main
818.h_extend_left:
819    mova           xm5, [lpfq+wq]
820    pshufb         xm5, [sgr_l_shuf]
821    jmp .h_main
822.h_top:
823    lea            r10, [wq-2]
824    test         edgeb, 1 ; LR_HAVE_LEFT
825    jz .h_extend_left
826.h_loop:
827    movu           xm5, [lpfq+r10-2]
828.h_main:
829    vinserti128     m5, [lpfq+r10+6], 1
830    test         edgeb, 2 ; LR_HAVE_RIGHT
831    jnz .h_have_right
832    cmp           r10d, -18
833    jl .h_have_right
834    call .extend_right
835.h_have_right:
836    pshufb          m3, m5, m8
837    pmullw          m4, m3, m3
838    pshufb          m2, m5, m9
839    paddw           m0, m3, m2
840    shufps          m3, m2, q2121
841    paddw           m0, m3
842    punpcklwd       m1, m2, m3
843    pmaddwd         m1, m1
844    punpckhwd       m2, m3
845    pmaddwd         m2, m2
846    punpcklwd       m3, m4, m6
847    paddd           m1, m3
848    punpckhwd       m4, m6
849    paddd           m2, m4
850    pshufb          m4, m5, m10
851    paddw           m0, m4
852    pshufb          m5, m11
853    paddw           m0, m5 ; sum
854    punpcklwd       m3, m4, m5
855    pmaddwd         m3, m3
856    punpckhwd       m4, m5
857    pmaddwd         m4, m4
858    test         edgeb, 16 ; y > 0
859    jz .h_loop_end
860    paddw           m0, [t1+r10*2+400*0]
861    paddd           m1, [t1+r10*2+400*2]
862    paddd           m2, [t1+r10*2+400*4]
863.h_loop_end:
864    paddd           m1, m3 ; sumsq
865    paddd           m2, m4
866    mova [t1+r10*2+400*0], m0
867    mova [t1+r10*2+400*2], m1
868    mova [t1+r10*2+400*4], m2
869    add            r10, 16
870    jl .h_loop
871    ret
872.top_fixup:
873    lea            r10, [wq-2]
874.top_fixup_loop: ; the sums of the first row needs to be doubled
875    mova            m0, [t1+r10*2+400*0]
876    mova            m1, [t1+r10*2+400*2]
877    mova            m2, [t1+r10*2+400*4]
878    paddw           m0, m0
879    paddd           m1, m1
880    paddd           m2, m2
881    mova [t2+r10*2+400*0], m0
882    mova [t2+r10*2+400*2], m1
883    mova [t2+r10*2+400*4], m2
884    add            r10, 16
885    jl .top_fixup_loop
886    ret
887ALIGN function_align
888.hv: ; horizontal boxsum + vertical boxsum + ab
889    lea            r10, [wq-2]
890    test         edgeb, 1 ; LR_HAVE_LEFT
891    jz .hv_extend_left
892    vpbroadcastd   xm0, [leftq]
893    mova           xm5, [lpfq+wq]
894    palignr        xm5, xm0, 12
895    add          leftq, 4
896    jmp .hv_main
897.hv_extend_left:
898    mova           xm5, [lpfq+wq]
899    pshufb         xm5, [sgr_l_shuf]
900    jmp .hv_main
901.hv_bottom:
902    lea            r10, [wq-2]
903    test         edgeb, 1 ; LR_HAVE_LEFT
904    jz .hv_extend_left
905.hv_loop:
906    movu           xm5, [lpfq+r10-2]
907.hv_main:
908    vinserti128     m5, [lpfq+r10+6], 1
909    test         edgeb, 2 ; LR_HAVE_RIGHT
910    jnz .hv_have_right
911    cmp           r10d, -18
912    jl .hv_have_right
913    call .extend_right
914.hv_have_right:
915    pshufb          m1, m5, m8
916    pmullw          m4, m1, m1
917    pshufb          m3, m5, m9
918    paddw           m0, m1, m3
919    shufps          m1, m3, q2121
920    paddw           m0, m1
921    punpcklwd       m2, m3, m1
922    pmaddwd         m2, m2
923    punpckhwd       m3, m1
924    pmaddwd         m3, m3
925    punpcklwd       m1, m4, m6
926    paddd           m2, m1
927    punpckhwd       m4, m6
928    paddd           m3, m4
929    pshufb          m1, m5, m10
930    paddw           m0, m1
931    pshufb          m5, m11
932    paddw           m0, m5               ; h sum
933    punpcklwd       m4, m5, m1
934    pmaddwd         m4, m4
935    punpckhwd       m5, m1
936    pmaddwd         m5, m5
937    paddw           m1, m0, [t1+r10*2+400*0]
938    paddd           m2, m4               ; h sumsq
939    paddd           m3, m5
940    paddd           m4, m2, [t1+r10*2+400*2]
941    paddd           m5, m3, [t1+r10*2+400*4]
942    test            hd, hd
943    jz .hv_last_row
944.hv_main2:
945    paddw           m1, [t2+r10*2+400*0] ; hv sum
946    paddd           m4, [t2+r10*2+400*2] ; hv sumsq
947    paddd           m5, [t2+r10*2+400*4]
948    mova [t0+r10*2+400*0], m0
949    mova [t0+r10*2+400*2], m2
950    mova [t0+r10*2+400*4], m3
951    vpbroadcastd    m2, [pd_25]
952    punpcklwd       m0, m1, m6           ; b
953    punpckhwd       m1, m6
954    pmulld          m4, m2               ; a * 25
955    pmulld          m5, m2
956    pmaddwd         m2, m0, m0           ; b * b
957    pmaddwd         m3, m1, m1
958    psubd           m4, m2               ; p
959    psubd           m5, m3
960    pmulld          m4, m12              ; p * s
961    pmulld          m5, m12
962    pmaddwd         m0, m13              ; b * 164
963    pmaddwd         m1, m13
964    paddw           m4, m13
965    paddw           m5, m13
966    psrld           m4, 20               ; z + 1
967    psrld           m5, 20
968    cvtdq2ps        m4, m4
969    cvtdq2ps        m5, m5
970    rcpps           m2, m4               ; 1 / (z + 1)
971    rcpps           m3, m5
972    pcmpgtd         m4, m14, m4
973    pcmpgtd         m5, m14, m5
974    mulps           m2, m14              ; 256 / (z + 1)
975    mulps           m3, m14
976    psrld           m4, 24               ; z < 255 ? 255 : 0
977    psrld           m5, 24
978    cvtps2dq        m2, m2
979    cvtps2dq        m3, m3
980    pminsw          m2, m4               ; x
981    pminsw          m3, m5
982    vpbroadcastd    m4, [pd_34816]
983    pmulld          m0, m2
984    pmulld          m1, m3
985    paddd           m0, m4               ; x * b * 164 + (1 << 11) + (1 << 15)
986    paddd           m1, m4
987    pand            m0, m15
988    pand            m1, m15
989    por             m0, m2               ; a | (b << 12)
990    por             m1, m3
991    mova         [t3+r10*4+ 8], xm0      ; The neighbor calculations requires
992    vextracti128 [t3+r10*4+40], m0, 1    ; 13 bits for a and 21 bits for b.
993    mova         [t3+r10*4+24], xm1      ; Packing them allows for 12+20, but
994    vextracti128 [t3+r10*4+56], m1, 1    ; that gets us most of the way.
995    add            r10, 16
996    jl .hv_loop
997    mov             t2, t1
998    mov             t1, t0
999    mov             t0, t2
1000    ret
1001.hv_last_row: ; esoteric edge case for odd heights
1002    mova [t1+r10*2+400*0], m1
1003    paddw              m1, m0
1004    mova [t1+r10*2+400*2], m4
1005    paddd              m4, m2
1006    mova [t1+r10*2+400*4], m5
1007    paddd              m5, m3
1008    jmp .hv_main2
1009.v: ; vertical boxsum + ab
1010    lea            r10, [wq-2]
1011.v_loop:
1012    mova            m0, [t1+r10*2+400*0]
1013    mova            m2, [t1+r10*2+400*2]
1014    mova            m3, [t1+r10*2+400*4]
1015    paddw           m1, m0, [t2+r10*2+400*0]
1016    paddd           m4, m2, [t2+r10*2+400*2]
1017    paddd           m5, m3, [t2+r10*2+400*4]
1018    paddw           m0, m0
1019    paddd           m2, m2
1020    paddd           m3, m3
1021    paddw           m1, m0               ; hv sum
1022    paddd           m4, m2               ; hv sumsq
1023    paddd           m5, m3
1024    vpbroadcastd    m2, [pd_25]
1025    punpcklwd       m0, m1, m6           ; b
1026    punpckhwd       m1, m6
1027    pmulld          m4, m2               ; a * 25
1028    pmulld          m5, m2
1029    pmaddwd         m2, m0, m0           ; b * b
1030    pmaddwd         m3, m1, m1
1031    psubd           m4, m2               ; p
1032    psubd           m5, m3
1033    pmulld          m4, m12              ; p * s
1034    pmulld          m5, m12
1035    pmaddwd         m0, m13              ; b * 164
1036    pmaddwd         m1, m13
1037    paddw           m4, m13
1038    paddw           m5, m13
1039    psrld           m4, 20               ; z + 1
1040    psrld           m5, 20
1041    cvtdq2ps        m4, m4
1042    cvtdq2ps        m5, m5
1043    rcpps           m2, m4               ; 1 / (z + 1)
1044    rcpps           m3, m5
1045    pcmpgtd         m4, m14, m4
1046    pcmpgtd         m5, m14, m5
1047    mulps           m2, m14              ; 256 / (z + 1)
1048    mulps           m3, m14
1049    psrld           m4, 24               ; z < 255 ? 255 : 0
1050    psrld           m5, 24
1051    cvtps2dq        m2, m2
1052    cvtps2dq        m3, m3
1053    pminsw          m2, m4               ; x
1054    pminsw          m3, m5
1055    vpbroadcastd    m4, [pd_34816]
1056    pmulld          m0, m2
1057    pmulld          m1, m3
1058    paddd           m0, m4               ; x * b * 164 + (1 << 11) + (1 << 15)
1059    paddd           m1, m4
1060    pand            m0, m15
1061    pand            m1, m15
1062    por             m0, m2               ; a | (b << 12)
1063    por             m1, m3
1064    mova         [t3+r10*4+ 8], xm0
1065    vextracti128 [t3+r10*4+40], m0, 1
1066    mova         [t3+r10*4+24], xm1
1067    vextracti128 [t3+r10*4+56], m1, 1
1068    add            r10, 16
1069    jl .v_loop
1070    ret
1071.prep_n: ; initial neighbor setup
1072    mov            r10, wq
1073.prep_n_loop:
1074    movu            m0, [t3+r10*4+ 4]
1075    movu            m1, [t3+r10*4+36]
1076    paddd           m2, m0, [t3+r10*4+ 0]
1077    paddd           m3, m1, [t3+r10*4+32]
1078    paddd           m2, [t3+r10*4+ 8]
1079    paddd           m3, [t3+r10*4+40]
1080    paddd           m0, m2
1081    pslld           m2, 2
1082    paddd           m1, m3
1083    pslld           m3, 2
1084    paddd           m2, m0                ; ab 565
1085    paddd           m3, m1
1086    pandn           m0, m15, m2           ; a
1087    psrld           m2, 12                ; b
1088    pandn           m1, m15, m3
1089    psrld           m3, 12
1090    mova [t3+r10*4+400*4+ 0], m0
1091    mova [t3+r10*4+400*8+ 0], m2
1092    mova [t3+r10*4+400*4+32], m1
1093    mova [t3+r10*4+400*8+32], m3
1094    add            r10, 16
1095    jl .prep_n_loop
1096    ret
1097ALIGN function_align
1098.n0: ; neighbor + output (even rows)
1099    mov            r10, wq
1100.n0_loop:
1101    movu            m0, [t3+r10*4+ 4]
1102    movu            m1, [t3+r10*4+36]
1103    paddd           m2, m0, [t3+r10*4+ 0]
1104    paddd           m3, m1, [t3+r10*4+32]
1105    paddd           m2, [t3+r10*4+ 8]
1106    paddd           m3, [t3+r10*4+40]
1107    paddd           m0, m2
1108    pslld           m2, 2
1109    paddd           m1, m3
1110    pslld           m3, 2
1111    paddd           m2, m0
1112    paddd           m3, m1
1113    pandn           m0, m15, m2
1114    psrld           m2, 12
1115    pandn           m1, m15, m3
1116    psrld           m3, 12
1117    paddd           m4, m0, [t3+r10*4+400*4+ 0] ; a
1118    paddd           m5, m1, [t3+r10*4+400*4+32]
1119    mova [t3+r10*4+400*4+ 0], m0
1120    mova [t3+r10*4+400*4+32], m1
1121    paddd           m0, m2, [t3+r10*4+400*8+ 0] ; b
1122    paddd           m1, m3, [t3+r10*4+400*8+32]
1123    mova [t3+r10*4+400*8+ 0], m2
1124    mova [t3+r10*4+400*8+32], m3
1125    pmovzxbd        m2, [dstq+r10+0]
1126    pmovzxbd        m3, [dstq+r10+8]
1127    pmaddwd         m4, m2 ; a * src
1128    pmaddwd         m5, m3
1129    packssdw        m2, m3
1130    psubd           m0, m4 ; b - a * src + (1 << 8)
1131    psubd           m1, m5
1132    psrad           m0, 9
1133    psrad           m1, 9
1134    packssdw        m0, m1
1135    pmulhrsw        m0, m7
1136    paddw           m0, m2
1137    vextracti128   xm1, m0, 1
1138    packuswb       xm0, xm1
1139    pshufd         xm0, xm0, q3120
1140    mova    [dstq+r10], xm0
1141    add            r10, 16
1142    jl .n0_loop
1143    add           dstq, strideq
1144    ret
1145ALIGN function_align
1146.n1: ; neighbor + output (odd rows)
1147    mov            r10, wq
1148.n1_loop:
1149    pmovzxbd        m2, [dstq+r10+0]
1150    pmovzxbd        m3, [dstq+r10+8]
1151    pmaddwd         m4, m2, [t3+r10*4+400*4+ 0] ; a * src
1152    pmaddwd         m5, m3, [t3+r10*4+400*4+32]
1153    mova            m0, [t3+r10*4+400*8+ 0]     ; b
1154    mova            m1, [t3+r10*4+400*8+32]
1155    packssdw        m2, m3
1156    psubd           m0, m4                      ; b - a * src + (1 << 7)
1157    psubd           m1, m5
1158    psrad           m0, 8
1159    psrad           m1, 8
1160    packssdw        m0, m1
1161    pmulhrsw        m0, m7
1162    paddw           m0, m2
1163    vextracti128   xm1, m0, 1
1164    packuswb       xm0, xm1
1165    pshufd         xm0, xm0, q3120
1166    mova    [dstq+r10], xm0
1167    add            r10, 16
1168    jl .n1_loop
1169    add           dstq, strideq
1170    ret
1171
1172cglobal sgr_filter_3x3_8bpc, 4, 14, 16, -400*28-16, dst, stride, left, lpf, \
1173                                                    w, h, edge, params
1174    mov        paramsq, r6mp
1175    mov             wd, wm
1176    movifnidn       hd, hm
1177    vbroadcasti128  m8, [sgr_shuf+2]
1178    mov          edged, r7m
1179    vbroadcasti128  m9, [sgr_shuf+4]
1180    add           lpfq, wq
1181    vbroadcasti128 m10, [sgr_shuf+6]
1182    add           dstq, wq
1183    vpbroadcastw    m7, [paramsq+10] ; w1
1184    lea             t3, [rsp+wq*4+16+400*12]
1185    vpbroadcastd   m11, [paramsq+ 4] ; s1
1186    pxor            m6, m6
1187    vpbroadcastd   m12, [pw_455_24]
1188    lea             t1, [rsp+wq*2+20]
1189    vbroadcastss   m13, [pf_256]
1190    neg             wq
1191    vpbroadcastd   m14, [pd_34816] ; (1 << 11) + (1 << 15)
1192    psllw           m7, 4
1193    vpbroadcastd   m15, [pd_m4096]
1194    test         edgeb, 4 ; LR_HAVE_TOP
1195    jz .no_top
1196    call .h_top
1197    add           lpfq, strideq
1198    mov             t2, t1
1199    add             t1, 400*6
1200    call .h_top
1201    lea             t4, [lpfq+strideq*4]
1202    mov           lpfq, dstq
1203    add             t4, strideq
1204    mov          [rsp], t4 ; below
1205    mov             t0, t2
1206    call .hv
1207.main:
1208    mov             t5, t3
1209    add             t3, 400*4
1210    dec             hd
1211    jz .height1
1212    add           lpfq, strideq
1213    call .hv
1214    call .prep_n
1215    dec             hd
1216    jz .extend_bottom
1217.main_loop:
1218    add           lpfq, strideq
1219    call .hv
1220    call .n
1221    dec             hd
1222    jnz .main_loop
1223    test         edgeb, 8 ; LR_HAVE_BOTTOM
1224    jz .extend_bottom
1225    mov           lpfq, [rsp]
1226    call .hv_bottom
1227    call .n
1228    add           lpfq, strideq
1229    call .hv_bottom
1230.end:
1231    call .n
1232    RET
1233.height1:
1234    call .v
1235    call .prep_n
1236    mov             t2, t1
1237    call .v
1238    jmp .end
1239.extend_bottom:
1240    call .v
1241    call .n
1242    mov             t2, t1
1243    call .v
1244    jmp .end
1245.no_top:
1246    lea             t4, [lpfq+strideq*4]
1247    mov           lpfq, dstq
1248    lea             t4, [t4+strideq*2]
1249    mov          [rsp], t4
1250    call .h
1251    lea             t0, [t1+400*6]
1252    mov             t2, t1
1253    call .v
1254    jmp .main
1255.h: ; horizontal boxsum
1256    lea            r10, [wq-2]
1257    test         edgeb, 1 ; LR_HAVE_LEFT
1258    jz .h_extend_left
1259    vpbroadcastd   xm0, [leftq]
1260    mova           xm5, [lpfq+wq]
1261    palignr        xm5, xm0, 12
1262    add          leftq, 4
1263    jmp .h_main
1264.h_extend_left:
1265    mova           xm5, [lpfq+wq]
1266    pshufb         xm5, [sgr_l_shuf]
1267    jmp .h_main
1268.h_top:
1269    lea            r10, [wq-2]
1270    test         edgeb, 1 ; LR_HAVE_LEFT
1271    jz .h_extend_left
1272.h_loop:
1273    movu           xm5, [lpfq+r10-2]
1274.h_main:
1275    vinserti128     m5, [lpfq+r10+6], 1
1276    test         edgeb, 2 ; LR_HAVE_RIGHT
1277    jnz .h_have_right
1278    cmp           r10d, -17
1279    jl .h_have_right
1280    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1281.h_have_right:
1282    pshufb          m0, m5, m8
1283    pmullw          m2, m0, m0
1284    pshufb          m4, m5, m9
1285    paddw           m0, m4
1286    pshufb          m5, m10
1287    paddw           m0, m5 ; sum
1288    punpcklwd       m3, m4, m5
1289    pmaddwd         m3, m3
1290    punpckhwd       m4, m5
1291    pmaddwd         m4, m4
1292    punpcklwd       m1, m2, m6
1293    punpckhwd       m2, m6
1294    mova [t1+r10*2+400*0], m0
1295    paddd           m1, m3 ; sumsq
1296    paddd           m2, m4
1297    mova [t1+r10*2+400*2], m1
1298    mova [t1+r10*2+400*4], m2
1299    add            r10, 16
1300    jl .h_loop
1301    ret
1302ALIGN function_align
1303.hv: ; horizontal boxsum + vertical boxsum + ab
1304    lea            r10, [wq-2]
1305    test         edgeb, 1 ; LR_HAVE_LEFT
1306    jz .hv_extend_left
1307    vpbroadcastd   xm0, [leftq]
1308    mova           xm5, [lpfq+wq]
1309    palignr        xm5, xm0, 12
1310    add          leftq, 4
1311    jmp .hv_main
1312.hv_extend_left:
1313    mova           xm5, [lpfq+wq]
1314    pshufb         xm5, [sgr_l_shuf]
1315    jmp .hv_main
1316.hv_bottom:
1317    lea            r10, [wq-2]
1318    test         edgeb, 1 ; LR_HAVE_LEFT
1319    jz .hv_extend_left
1320.hv_loop:
1321    movu           xm5, [lpfq+r10-2]
1322.hv_main:
1323    vinserti128     m5, [lpfq+r10+6], 1
1324    test         edgeb, 2 ; LR_HAVE_RIGHT
1325    jnz .hv_have_right
1326    cmp           r10d, -17
1327    jl .hv_have_right
1328    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1329.hv_have_right:
1330    pshufb          m0, m5, m8
1331    pmullw          m3, m0, m0
1332    pshufb          m1, m5, m9
1333    paddw           m0, m1
1334    pshufb          m5, m10
1335    paddw           m0, m5               ; h sum
1336    punpcklwd       m4, m5, m1
1337    pmaddwd         m4, m4
1338    punpckhwd       m5, m1
1339    pmaddwd         m5, m5
1340    paddw           m1, m0, [t2+r10*2+400*0]
1341    paddw           m1, [t1+r10*2+400*0] ; hv sum
1342    punpcklwd       m2, m3, m6
1343    punpckhwd       m3, m6
1344    paddd           m4, m2               ; h sumsq
1345    paddd           m5, m3
1346    paddd           m2, m4, [t2+r10*2+400*2]
1347    paddd           m3, m5, [t2+r10*2+400*4]
1348    paddd           m2, [t1+r10*2+400*2] ; hv sumsq
1349    paddd           m3, [t1+r10*2+400*4]
1350    mova [t0+r10*2+400*0], m0
1351    punpcklwd       m0, m1, m6           ; b
1352    punpckhwd       m1, m6
1353    mova [t0+r10*2+400*2], m4
1354    pslld           m4, m2, 3
1355    mova [t0+r10*2+400*4], m5
1356    pslld           m5, m3, 3
1357    paddd           m4, m2               ; a * 9
1358    pmaddwd         m2, m0, m0           ; b * b
1359    paddd           m5, m3
1360    pmaddwd         m3, m1, m1
1361    psubd           m4, m2               ; p
1362    psubd           m5, m3
1363    pmulld          m4, m11              ; p * s
1364    pmulld          m5, m11
1365    pmaddwd         m0, m12              ; b * 455
1366    pmaddwd         m1, m12
1367    paddw           m4, m12
1368    paddw           m5, m12
1369    psrld           m4, 20               ; z + 1
1370    psrld           m5, 20
1371    cvtdq2ps        m4, m4
1372    cvtdq2ps        m5, m5
1373    rcpps           m2, m4               ; 1 / (z + 1)
1374    rcpps           m3, m5
1375    pcmpgtd         m4, m13, m4
1376    pcmpgtd         m5, m13, m5
1377    mulps           m2, m13              ; 256 / (z + 1)
1378    mulps           m3, m13
1379    psrld           m4, 24               ; z < 255 ? 255 : 0
1380    psrld           m5, 24
1381    cvtps2dq        m2, m2
1382    cvtps2dq        m3, m3
1383    pminsw          m2, m4               ; x
1384    pminsw          m3, m5
1385    pmulld          m0, m2
1386    pmulld          m1, m3
1387    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
1388    paddd           m1, m14
1389    pand            m0, m15
1390    pand            m1, m15
1391    por             m0, m2               ; a | (b << 12)
1392    por             m1, m3
1393    mova         [t3+r10*4+ 8], xm0
1394    vextracti128 [t3+r10*4+40], m0, 1
1395    mova         [t3+r10*4+24], xm1
1396    vextracti128 [t3+r10*4+56], m1, 1
1397    add            r10, 16
1398    jl .hv_loop
1399    mov             t2, t1
1400    mov             t1, t0
1401    mov             t0, t2
1402    ret
1403.v: ; vertical boxsum + ab
1404    lea            r10, [wq-2]
1405.v_loop:
1406    mova            m1, [t1+r10*2+400*0]
1407    paddw           m1, m1
1408    paddw           m1, [t2+r10*2+400*0] ; hv sum
1409    mova            m2, [t1+r10*2+400*2]
1410    mova            m3, [t1+r10*2+400*4]
1411    paddd           m2, m2
1412    paddd           m3, m3
1413    paddd           m2, [t2+r10*2+400*2] ; hv sumsq
1414    paddd           m3, [t2+r10*2+400*4]
1415    punpcklwd       m0, m1, m6           ; b
1416    punpckhwd       m1, m6
1417    pslld           m4, m2, 3
1418    pslld           m5, m3, 3
1419    paddd           m4, m2               ; a * 9
1420    pmaddwd         m2, m0, m0           ; b * b
1421    paddd           m5, m3
1422    pmaddwd         m3, m1, m1
1423    psubd           m4, m2               ; p
1424    psubd           m5, m3
1425    pmulld          m4, m11              ; p * s
1426    pmulld          m5, m11
1427    pmaddwd         m0, m12              ; b * 455
1428    pmaddwd         m1, m12
1429    paddw           m4, m12
1430    paddw           m5, m12
1431    psrld           m4, 20               ; z + 1
1432    psrld           m5, 20
1433    cvtdq2ps        m4, m4
1434    cvtdq2ps        m5, m5
1435    rcpps           m2, m4               ; 1 / (z + 1)
1436    rcpps           m3, m5
1437    pcmpgtd         m4, m13, m4
1438    pcmpgtd         m5, m13, m5
1439    mulps           m2, m13              ; 256 / (z + 1)
1440    mulps           m3, m13
1441    psrld           m4, 24               ; z < 255 ? 255 : 0
1442    psrld           m5, 24
1443    cvtps2dq        m2, m2
1444    cvtps2dq        m3, m3
1445    pminsw          m2, m4               ; x
1446    pminsw          m3, m5
1447    pmulld          m0, m2
1448    pmulld          m1, m3
1449    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
1450    paddd           m1, m14
1451    pand            m0, m15
1452    pand            m1, m15
1453    por             m0, m2               ; a | (b << 12)
1454    por             m1, m3
1455    mova         [t3+r10*4+ 8], xm0
1456    vextracti128 [t3+r10*4+40], m0, 1
1457    mova         [t3+r10*4+24], xm1
1458    vextracti128 [t3+r10*4+56], m1, 1
1459    add            r10, 16
1460    jl .v_loop
1461    ret
1462.prep_n: ; initial neighbor setup
1463    mov            r10, wq
1464    mov             t4, t3
1465    add             t3, 400*4
1466.prep_n_loop:
1467    mova            m2, [t5+r10*4+0]
1468    mova            m3, [t4+r10*4+0]
1469    paddd           m2, [t5+r10*4+8]
1470    paddd           m3, [t4+r10*4+8]
1471    paddd           m0, m2, [t5+r10*4+4]
1472    paddd           m1, m3, [t4+r10*4+4]
1473    pslld           m0, 2
1474    paddd           m1, m1                ; ab[ 0] 222
1475    psubd           m0, m2                ; ab[-1] 343
1476    mova [t3+r10*4+400*4], m1
1477    paddd           m1, m1
1478    mova    [t5+r10*4], m0
1479    psubd           m1, m3                ; ab[ 0] 343
1480    mova    [t4+r10*4], m1
1481    add            r10, 8
1482    jl .prep_n_loop
1483    ret
1484; a+b are packed together in a single dword, but we can't do the
1485; full neighbor calculations before splitting them since we don't
1486; have sufficient precision. The solution is to do the calculations
1487; in two equal halves and split a and b before doing the final sum.
1488ALIGN function_align
1489.n: ; neighbor + output
1490    mov            r10, wq
1491.n_loop:
1492    mova            m4, [t3+r10*4+ 0]
1493    paddd           m4, [t3+r10*4+ 8]
1494    paddd           m5, m4, [t3+r10*4+ 4]
1495    paddd           m5, m5                ; ab[+1] 222
1496    mova            m2, [t3+r10*4+400*4+ 0]
1497    paddd           m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
1498    mova            m3, [t3+r10*4+400*4+32]
1499    paddd           m1, m3, [t5+r10*4+32]
1500    mova [t3+r10*4+400*4+ 0], m5
1501    paddd           m5, m5
1502    psubd           m5, m4                ; ab[+1] 343
1503    mova [t5+r10*4+ 0], m5
1504    paddd           m2, m5                ; ab[ 0] 222 + ab[+1] 343
1505    mova            m4, [t3+r10*4+32]
1506    paddd           m4, [t3+r10*4+40]
1507    paddd           m5, m4, [t3+r10*4+36]
1508    paddd           m5, m5
1509    mova [t3+r10*4+400*4+32], m5
1510    paddd           m5, m5
1511    psubd           m5, m4
1512    mova [t5+r10*4+32], m5
1513    pandn           m4, m15, m0
1514    psrld           m0, 12
1515    paddd           m3, m5
1516    pandn           m5, m15, m2
1517    psrld           m2, 12
1518    paddd           m4, m5                ; a
1519    pandn           m5, m15, m1
1520    psrld           m1, 12
1521    paddd           m0, m2                ; b + (1 << 8)
1522    pandn           m2, m15, m3
1523    psrld           m3, 12
1524    paddd           m5, m2
1525    pmovzxbd        m2, [dstq+r10+0]
1526    paddd           m1, m3
1527    pmovzxbd        m3, [dstq+r10+8]
1528    pmaddwd         m4, m2                ; a * src
1529    pmaddwd         m5, m3
1530    packssdw        m2, m3
1531    psubd           m0, m4                ; b - a * src + (1 << 8)
1532    psubd           m1, m5
1533    psrad           m0, 9
1534    psrad           m1, 9
1535    packssdw        m0, m1
1536    pmulhrsw        m0, m7
1537    paddw           m0, m2
1538    vextracti128   xm1, m0, 1
1539    packuswb       xm0, xm1
1540    pshufd         xm0, xm0, q3120
1541    mova    [dstq+r10], xm0
1542    add            r10, 16
1543    jl .n_loop
1544    mov            r10, t5
1545    mov             t5, t4
1546    mov             t4, r10
1547    add           dstq, strideq
1548    ret
1549
1550cglobal sgr_filter_mix_8bpc, 4, 12, 16, 400*56+8, dst, stride, left, lpf, \
1551                                                  w, h, edge, params
1552    mov        paramsq, r6mp
1553    mov             wd, wm
1554    movifnidn       hd, hm
1555    mov          edged, r7m
1556    vbroadcasti128  m9, [sgr_shuf+0]
1557    vbroadcasti128 m10, [sgr_shuf+8]
1558    add           lpfq, wq
1559    vbroadcasti128 m11, [sgr_shuf+2]
1560    vbroadcasti128 m12, [sgr_shuf+6]
1561    add           dstq, wq
1562    vpbroadcastd   m15, [paramsq+8] ; w0 w1
1563    lea             t3, [rsp+wq*4+400*24+8]
1564    vpbroadcastd   m13, [paramsq+0] ; s0
1565    pxor            m7, m7
1566    vpbroadcastd   m14, [paramsq+4] ; s1
1567    lea             t1, [rsp+wq*2+12]
1568    neg             wq
1569    psllw          m15, 2 ; to reuse existing pd_m4096 register for rounding
1570    test         edgeb, 4 ; LR_HAVE_TOP
1571    jz .no_top
1572    call .h_top
1573    add           lpfq, strideq
1574    mov             t2, t1
1575    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
1576    add             t1, 400*12
1577    call .h_top
1578    lea            r10, [lpfq+strideq*4]
1579    mov           lpfq, dstq
1580    add            r10, strideq
1581    mov          [rsp], r10 ; below
1582    call .hv0
1583.main:
1584    dec             hd
1585    jz .height1
1586    add           lpfq, strideq
1587    call .hv1
1588    call .prep_n
1589    sub             hd, 2
1590    jl .extend_bottom
1591.main_loop:
1592    add           lpfq, strideq
1593    call .hv0
1594    test            hd, hd
1595    jz .odd_height
1596    add           lpfq, strideq
1597    call .hv1
1598    call .n0
1599    call .n1
1600    sub             hd, 2
1601    jge .main_loop
1602    test         edgeb, 8 ; LR_HAVE_BOTTOM
1603    jz .extend_bottom
1604    mov           lpfq, [rsp]
1605    call .hv0_bottom
1606    add           lpfq, strideq
1607    call .hv1_bottom
1608.end:
1609    call .n0
1610    call .n1
1611.end2:
1612    RET
1613.height1:
1614    call .v1
1615    call .prep_n
1616    jmp .odd_height_end
1617.odd_height:
1618    call .v1
1619    call .n0
1620    call .n1
1621.odd_height_end:
1622    call .v0
1623    call .v1
1624    call .n0
1625    jmp .end2
1626.extend_bottom:
1627    call .v0
1628    call .v1
1629    jmp .end
1630.no_top:
1631    lea            r10, [lpfq+strideq*4]
1632    mov           lpfq, dstq
1633    lea            r10, [r10+strideq*2]
1634    mov          [rsp], r10
1635    call .h
1636    lea             t2, [t1+400*12]
1637    lea            r10, [wq-2]
1638.top_fixup_loop:
1639    mova            m0, [t1+r10*2+400* 0]
1640    mova            m1, [t1+r10*2+400* 2]
1641    mova            m2, [t1+r10*2+400* 4]
1642    paddw           m0, m0
1643    mova            m3, [t1+r10*2+400* 6]
1644    paddd           m1, m1
1645    mova            m4, [t1+r10*2+400* 8]
1646    paddd           m2, m2
1647    mova            m5, [t1+r10*2+400*10]
1648    mova [t2+r10*2+400* 0], m0
1649    mova [t2+r10*2+400* 2], m1
1650    mova [t2+r10*2+400* 4], m2
1651    mova [t2+r10*2+400* 6], m3
1652    mova [t2+r10*2+400* 8], m4
1653    mova [t2+r10*2+400*10], m5
1654    add            r10, 16
1655    jl .top_fixup_loop
1656    call .v0
1657    jmp .main
1658.h: ; horizontal boxsums
1659    lea            r10, [wq-2]
1660    test         edgeb, 1 ; LR_HAVE_LEFT
1661    jz .h_extend_left
1662    vpbroadcastd   xm0, [leftq]
1663    mova           xm5, [lpfq+wq]
1664    palignr        xm5, xm0, 12
1665    add          leftq, 4
1666    jmp .h_main
1667.h_extend_left:
1668    mova           xm5, [lpfq+wq]
1669    pshufb         xm5, [sgr_l_shuf]
1670    jmp .h_main
1671.h_top:
1672    lea            r10, [wq-2]
1673    test         edgeb, 1 ; LR_HAVE_LEFT
1674    jz .h_extend_left
1675.h_loop:
1676    movu           xm5, [lpfq+r10-2]
1677.h_main:
1678    vinserti128     m5, [lpfq+r10+6], 1
1679    test         edgeb, 2 ; LR_HAVE_RIGHT
1680    jnz .h_have_right
1681    cmp           r10d, -18
1682    jl .h_have_right
1683    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1684.h_have_right:
1685    pshufb          m6, m5, m9
1686    pshufb          m4, m5, m10
1687    paddw           m8, m6, m4
1688    shufps          m0, m6, m4, q2121
1689    pmullw          m3, m0, m0
1690    pshufb          m2, m5, m11
1691    paddw           m0, m2
1692    pshufb          m5, m12
1693    paddw           m0, m5 ; sum3
1694    punpcklwd       m1, m2, m5
1695    pmaddwd         m1, m1
1696    punpckhwd       m2, m5
1697    pmaddwd         m2, m2
1698    punpcklwd       m5, m6, m4
1699    pmaddwd         m5, m5
1700    punpckhwd       m6, m4
1701    pmaddwd         m6, m6
1702    punpcklwd       m4, m3, m7
1703    paddd           m1, m4 ; sumsq3
1704    punpckhwd       m3, m7
1705    paddd           m2, m3
1706    mova [t1+r10*2+400* 6], m0
1707    mova [t1+r10*2+400* 8], m1
1708    mova [t1+r10*2+400*10], m2
1709    paddw           m8, m0 ; sum5
1710    paddd           m5, m1 ; sumsq5
1711    paddd           m6, m2
1712    mova [t1+r10*2+400* 0], m8
1713    mova [t1+r10*2+400* 2], m5
1714    mova [t1+r10*2+400* 4], m6
1715    add            r10, 16
1716    jl .h_loop
1717    ret
1718ALIGN function_align
1719.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
1720    lea            r10, [wq-2]
1721    test         edgeb, 1 ; LR_HAVE_LEFT
1722    jz .hv0_extend_left
1723    vpbroadcastd   xm0, [leftq]
1724    mova           xm5, [lpfq+wq]
1725    palignr        xm5, xm0, 12
1726    add          leftq, 4
1727    jmp .hv0_main
1728.hv0_extend_left:
1729    mova           xm5, [lpfq+wq]
1730    pshufb         xm5, [sgr_l_shuf]
1731    jmp .hv0_main
1732.hv0_bottom:
1733    lea            r10, [wq-2]
1734    test         edgeb, 1 ; LR_HAVE_LEFT
1735    jz .hv0_extend_left
1736.hv0_loop:
1737    movu           xm5, [lpfq+r10-2]
1738.hv0_main:
1739    vinserti128     m5, [lpfq+r10+6], 1
1740    test         edgeb, 2 ; LR_HAVE_RIGHT
1741    jnz .hv0_have_right
1742    cmp           r10d, -18
1743    jl .hv0_have_right
1744    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1745.hv0_have_right:
1746    pshufb          m6, m5, m9
1747    pshufb          m4, m5, m10
1748    paddw           m8, m6, m4
1749    shufps          m1, m6, m4, q2121
1750    pmullw          m0, m1, m1
1751    pshufb          m3, m5, m11
1752    paddw           m1, m3
1753    pshufb          m5, m12
1754    paddw           m1, m5 ; sum3
1755    punpcklwd       m2, m3, m5
1756    pmaddwd         m2, m2
1757    punpckhwd       m3, m5
1758    pmaddwd         m3, m3
1759    punpcklwd       m5, m6, m4
1760    pmaddwd         m5, m5
1761    punpckhwd       m6, m4
1762    pmaddwd         m6, m6
1763    punpcklwd       m4, m0, m7
1764    paddd           m2, m4 ; sumsq3
1765    punpckhwd       m0, m7
1766    paddd           m3, m0
1767    paddw           m8, m1 ; sum5
1768    paddd           m5, m2 ; sumsq5
1769    paddd           m6, m3
1770    mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
1771    mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
1772    mova [t3+r10*4+400*0+40], m6
1773    paddw           m8, [t1+r10*2+400* 0]
1774    paddd           m5, [t1+r10*2+400* 2]
1775    paddd           m6, [t1+r10*2+400* 4]
1776    mova [t1+r10*2+400* 0], m8
1777    mova [t1+r10*2+400* 2], m5
1778    mova [t1+r10*2+400* 4], m6
1779    paddw           m0, m1, [t1+r10*2+400* 6]
1780    paddd           m4, m2, [t1+r10*2+400* 8]
1781    paddd           m5, m3, [t1+r10*2+400*10]
1782    mova [t1+r10*2+400* 6], m1
1783    mova [t1+r10*2+400* 8], m2
1784    mova [t1+r10*2+400*10], m3
1785    paddw           m1, m0, [t2+r10*2+400* 6]
1786    paddd           m2, m4, [t2+r10*2+400* 8]
1787    paddd           m3, m5, [t2+r10*2+400*10]
1788    mova [t2+r10*2+400* 6], m0
1789    mova [t2+r10*2+400* 8], m4
1790    mova [t2+r10*2+400*10], m5
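    ; ab3 math (the same pattern recurs in .hv1/.v0/.v1): with a3 = 3x3 sum of
    ; squares and b3 = 3x3 sum, p3 = a3*9 - b3*b3, and z3+1 is obtained as
    ; (p3*s1 + bias) >> 20, where the 24 packed into the high words of
    ; pw_455_24 supplies both the rounding term and the +1
    ; (24 << 16 == (1 << 19) + (1 << 20)). x3 ~= 256/(z3+1) comes from
    ; rcpps/mulps, clamped to 255 (and forced to 0 once z3 reaches 255).
    ; 455 ~= 4096/9, so the stored dword is x3 | ((x3*b3*455 + rounding) & ~0xfff),
    ; i.e. the scaled b3 term lands in the bits above the low 12.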
1791    vpbroadcastd    m8, [pw_455_24]
1792    punpcklwd       m0, m1, m7           ; b3
1793    vbroadcastss    m6, [pf_256]
1794    punpckhwd       m1, m7
1795    pslld           m4, m2, 3
1796    pslld           m5, m3, 3
1797    paddd           m4, m2               ; a3 * 9
1798    pmaddwd         m2, m0, m0           ; b3 * b3
1799    paddd           m5, m3
1800    pmaddwd         m3, m1, m1
1801    psubd           m4, m2               ; p3
1802    psubd           m5, m3
1803    pmulld          m4, m14              ; p3 * s1
1804    pmulld          m5, m14
1805    pmaddwd         m0, m8               ; b3 * 455
1806    pmaddwd         m1, m8
1807    paddw           m4, m8
1808    paddw           m5, m8
1809    vpbroadcastd    m8, [pd_34816]
1810    psrld           m4, 20               ; z3 + 1
1811    psrld           m5, 20
1812    cvtdq2ps        m4, m4
1813    cvtdq2ps        m5, m5
1814    rcpps           m2, m4               ; 1 / (z3 + 1)
1815    rcpps           m3, m5
1816    pcmpgtd         m4, m6, m4
1817    pcmpgtd         m5, m6, m5
1818    mulps           m2, m6               ; 256 / (z3 + 1)
1819    mulps           m3, m6
1820    vpbroadcastd    m6, [pd_m4096]
1821    psrld           m4, 24               ; z3 < 255 ? 255 : 0
1822    psrld           m5, 24
1823    cvtps2dq        m2, m2
1824    cvtps2dq        m3, m3
1825    pminsw          m2, m4               ; x3
1826    pminsw          m3, m5
1827    pmulld          m0, m2
1828    pmulld          m1, m3
1829    paddd           m0, m8               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1830    paddd           m1, m8
1831    pand            m0, m6
1832    pand            m1, m6
1833    por             m0, m2               ; a3 | (b3 << 12)
1834    por             m1, m3
1835    mova         [t3+r10*4+400*4+ 8], xm0
1836    vextracti128 [t3+r10*4+400*4+40], m0, 1
1837    mova         [t3+r10*4+400*4+24], xm1
1838    vextracti128 [t3+r10*4+400*4+56], m1, 1
1839    add            r10, 16
1840    jl .hv0_loop
1841    ret
1842ALIGN function_align
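; .hv1 (odd rows): horizontal pass plus the vertical combines for both box
; sizes; the packed ab3 values go to t3+400*8 and the packed ab5 values to
; t3+400*0. t1/t2 are swapped on exit so the stale row slot is reused.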
1843.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
1844    lea            r10, [wq-2]
1845    test         edgeb, 1 ; LR_HAVE_LEFT
1846    jz .hv1_extend_left
1847    vpbroadcastd   xm0, [leftq]
1848    mova           xm5, [lpfq+wq]
1849    palignr        xm5, xm0, 12
1850    add          leftq, 4
1851    jmp .hv1_main
1852.hv1_extend_left:
1853    mova           xm5, [lpfq+wq]
1854    pshufb         xm5, [sgr_l_shuf]
1855    jmp .hv1_main
1856.hv1_bottom:
1857    lea            r10, [wq-2]
1858    test         edgeb, 1 ; LR_HAVE_LEFT
1859    jz .hv1_extend_left
1860.hv1_loop:
1861    movu           xm5, [lpfq+r10-2]
1862.hv1_main:
1863    vinserti128     m5, [lpfq+r10+6], 1
1864    test         edgeb, 2 ; LR_HAVE_RIGHT
1865    jnz .hv1_have_right
1866    cmp           r10d, -18
1867    jl .hv1_have_right
1868    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1869.hv1_have_right:
1870    pshufb          m6, m5, m9
1871    pshufb          m3, m5, m10
1872    paddw           m8, m6, m3
1873    shufps          m2, m6, m3, q2121
1874    pmullw          m1, m2, m2
1875    pshufb          m0, m5, m11
1876    paddw           m2, m0
1877    pshufb          m5, m12
1878    paddw           m2, m5 ; sum3
1879    punpcklwd       m4, m5, m0
1880    pmaddwd         m4, m4
1881    punpckhwd       m5, m0
1882    pmaddwd         m5, m5
1883    punpcklwd       m0, m6, m3
1884    pmaddwd         m0, m0
1885    punpckhwd       m6, m3
1886    pmaddwd         m6, m6
1887    punpcklwd       m3, m1, m7
1888    paddd           m4, m3 ; sumsq3
1889    punpckhwd       m1, m7
1890    paddd           m5, m1
1891    paddw           m1, m2, [t2+r10*2+400* 6]
1892    mova [t2+r10*2+400* 6], m2
1893    paddw           m8, m2 ; sum5
1894    paddd           m2, m4, [t2+r10*2+400* 8]
1895    paddd           m3, m5, [t2+r10*2+400*10]
1896    mova [t2+r10*2+400* 8], m4
1897    mova [t2+r10*2+400*10], m5
1898    vpbroadcastd    m9, [pw_455_24]
1899    paddd           m4, m0 ; sumsq5
1900    paddd           m5, m6
1901    punpcklwd       m0, m1, m7           ; b3
1902    punpckhwd       m1, m7
1903    pslld           m6, m2, 3
1904    pslld           m7, m3, 3
1905    paddd           m6, m2               ; a3 * 9
1906    pmaddwd         m2, m0, m0           ; b3 * b3
1907    paddd           m7, m3
1908    pmaddwd         m3, m1, m1
1909    psubd           m6, m2               ; p3
1910    psubd           m7, m3
1911    pmulld          m6, m14              ; p3 * s1
1912    pmulld          m7, m14
1913    pmaddwd         m0, m9               ; b3 * 455
1914    pmaddwd         m1, m9
1915    paddw           m6, m9
1916    paddw           m7, m9
1917    vbroadcastss    m9, [pf_256]
1918    psrld           m6, 20               ; z3 + 1
1919    psrld           m7, 20
1920    cvtdq2ps        m6, m6
1921    cvtdq2ps        m7, m7
1922    rcpps           m2, m6               ; 1 / (z3 + 1)
1923    rcpps           m3, m7
1924    pcmpgtd         m6, m9, m6
1925    pcmpgtd         m7, m9, m7
1926    mulps           m2, m9               ; 256 / (z3 + 1)
1927    mulps           m3, m9
1928    vpbroadcastd    m9, [pd_34816]
1929    psrld           m6, 24               ; z3 < 255 ? 255 : 0
1930    psrld           m7, 24
1931    cvtps2dq        m2, m2
1932    cvtps2dq        m3, m3
1933    pminsw          m2, m6               ; x3
1934    vpbroadcastd    m6, [pd_m4096]
1935    pminsw          m3, m7
1936    pmulld          m0, m2
1937    pmulld          m1, m3
1938    paddd           m0, m9               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1939    paddd           m1, m9
1940    pand            m0, m6
1941    pand            m7, m6, m1
1942    por             m0, m2               ; a3 | (b3 << 12)
1943    por             m7, m3
1944    paddw           m1, m8, [t2+r10*2+400*0]
1945    paddd           m2, m4, [t2+r10*2+400*2]
1946    paddd           m3, m5, [t2+r10*2+400*4]
1947    paddw           m1, [t1+r10*2+400*0]
1948    paddd           m2, [t1+r10*2+400*2]
1949    paddd           m3, [t1+r10*2+400*4]
1950    mova [t2+r10*2+400*0], m8
1951    mova [t2+r10*2+400*2], m4
1952    mova [t2+r10*2+400*4], m5
1953    mova         [t3+r10*4+400*8+ 8], xm0
1954    vextracti128 [t3+r10*4+400*8+40], m0, 1
1955    mova         [t3+r10*4+400*8+24], xm7
1956    vextracti128 [t3+r10*4+400*8+56], m7, 1
1957    vpbroadcastd    m4, [pd_25]
1958    pxor            m7, m7
1959    vpbroadcastd    m8, [pw_164_24]
1960    punpcklwd       m0, m1, m7           ; b5
1961    punpckhwd       m1, m7
1962    pmulld          m2, m4               ; a5 * 25
1963    pmulld          m3, m4
1964    pmaddwd         m4, m0, m0           ; b5 * b5
1965    pmaddwd         m5, m1, m1
1966    psubd           m2, m4               ; p5
1967    psubd           m3, m5
1968    pmulld          m2, m13              ; p5 * s0
1969    pmulld          m3, m13
1970    pmaddwd         m0, m8               ; b5 * 164
1971    pmaddwd         m1, m8
1972    paddw           m2, m8
1973    paddw           m3, m8
1974    vbroadcastss    m8, [pf_256]
1975    psrld           m2, 20               ; z5 + 1
1976    psrld           m3, 20
1977    cvtdq2ps        m2, m2
1978    cvtdq2ps        m3, m3
1979    rcpps           m4, m2               ; 1 / (z5 + 1)
1980    rcpps           m5, m3
1981    pcmpgtd         m2, m8, m2
1982    pcmpgtd         m3, m8, m3
1983    mulps           m4, m8               ; 256 / (z5 + 1)
1984    mulps           m5, m8
1985    psrld           m2, 24               ; z5 < 255 ? 255 : 0
1986    psrld           m3, 24
1987    cvtps2dq        m4, m4
1988    cvtps2dq        m5, m5
1989    pminsw          m4, m2               ; x5
1990    pminsw          m5, m3
1991    pmulld          m0, m4
1992    pmulld          m1, m5
1993    paddd           m0, m9               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
1994    paddd           m1, m9
1995    vbroadcasti128  m9, [sgr_shuf]
1996    pand            m0, m6
1997    pand            m1, m6
1998    por             m0, m4               ; a5 | (b5 << 12)
1999    por             m1, m5
2000    mova         [t3+r10*4+400*0+ 8], xm0
2001    vextracti128 [t3+r10*4+400*0+40], m0, 1
2002    mova         [t3+r10*4+400*0+24], xm1
2003    vextracti128 [t3+r10*4+400*0+56], m1, 1
2004    add            r10, 16
2005    jl .hv1_loop
2006    mov            r10, t2
2007    mov             t2, t1
2008    mov             t1, r10
2009    ret
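; .v0/.v1 mirror .hv0/.hv1 but take no new input row; they are used on the
; top/bottom-extension and odd-height paths where a row is repeated, so .v0
; doubles the sums already sitting in t1 to stand in for the missing row.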
2010.v0: ; vertical boxsums + ab3 (even rows)
2011    lea            r10, [wq-2]
2012    vpbroadcastd    m6, [pd_34816]
2013.v0_loop:
2014    mova            m0, [t1+r10*2+400* 6]
2015    mova            m4, [t1+r10*2+400* 8]
2016    mova            m5, [t1+r10*2+400*10]
2017    paddw           m0, m0
2018    paddd           m4, m4
2019    paddd           m5, m5
2020    paddw           m1, m0, [t2+r10*2+400* 6]
2021    paddd           m2, m4, [t2+r10*2+400* 8]
2022    paddd           m3, m5, [t2+r10*2+400*10]
2023    mova [t2+r10*2+400* 6], m0
2024    mova [t2+r10*2+400* 8], m4
2025    mova [t2+r10*2+400*10], m5
2026    vpbroadcastd    m8, [pw_455_24]
2027    punpcklwd       m0, m1, m7           ; b3
2028    punpckhwd       m1, m7
2029    pslld           m4, m2, 3
2030    pslld           m5, m3, 3
2031    paddd           m4, m2               ; a3 * 9
2032    pmaddwd         m2, m0, m0           ; b3 * b3
2033    paddd           m5, m3
2034    pmaddwd         m3, m1, m1
2035    psubd           m4, m2               ; p3
2036    psubd           m5, m3
2037    pmulld          m4, m14              ; p3 * s1
2038    pmulld          m5, m14
2039    pmaddwd         m0, m8               ; b3 * 455
2040    pmaddwd         m1, m8
2041    paddw           m4, m8
2042    paddw           m5, m8
2043    vbroadcastss    m8, [pf_256]
2044    psrld           m4, 20               ; z3 + 1
2045    psrld           m5, 20
2046    cvtdq2ps        m4, m4
2047    cvtdq2ps        m5, m5
2048    rcpps           m2, m4               ; 1 / (z3 + 1)
2049    rcpps           m3, m5
2050    pcmpgtd         m4, m8, m4
2051    pcmpgtd         m5, m8, m5
2052    mulps           m2, m8               ; 256 / (z3 + 1)
2053    mulps           m3, m8
2054    vpbroadcastd    m8, [pd_m4096]
2055    psrld           m4, 24               ; z3 < 255 ? 255 : 0
2056    psrld           m5, 24
2057    cvtps2dq        m2, m2
2058    cvtps2dq        m3, m3
2059    pminsw          m2, m4               ; x3
2060    pminsw          m3, m5
2061    pmulld          m0, m2
2062    pmulld          m1, m3
2063    paddd           m0, m6               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2064    paddd           m1, m6
2065    pand            m0, m8
2066    pand            m1, m8
2067    por             m0, m2               ; a3 | (b3 << 12)
2068    por             m1, m3
2069    mova            m2, [t1+r10*2+400*0]
2070    mova            m3, [t1+r10*2+400*2]
2071    mova            m4, [t1+r10*2+400*4]
2072    mova [t3+r10*4+400*8+ 8], m2
2073    mova [t3+r10*4+400*0+ 8], m3
2074    mova [t3+r10*4+400*0+40], m4
2075    paddw           m2, m2               ; cc5 (row counted twice; no new input row on this path)
2076    paddd           m3, m3
2077    paddd           m4, m4
2078    mova [t1+r10*2+400*0], m2
2079    mova [t1+r10*2+400*2], m3
2080    mova [t1+r10*2+400*4], m4
2081    mova         [t3+r10*4+400*4+ 8], xm0
2082    vextracti128 [t3+r10*4+400*4+40], m0, 1
2083    mova         [t3+r10*4+400*4+24], xm1
2084    vextracti128 [t3+r10*4+400*4+56], m1, 1
2085    add            r10, 16
2086    jl .v0_loop
2087    ret
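; .v1: odd-row counterpart of .v0; combines the buffered t1/t2 sums (and the
; clean copy kept in t3) into ab3 and ab5 without reading pixels, then swaps
; the t1/t2 ring-buffer pointers like .hv1 does.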
2088.v1: ; vertical boxsums + ab (odd rows)
2089    lea            r10, [wq-2]
2090.v1_loop:
2091    mova            m4, [t1+r10*2+400* 6]
2092    mova            m5, [t1+r10*2+400* 8]
2093    mova            m6, [t1+r10*2+400*10]
2094    paddw           m1, m4, [t2+r10*2+400* 6]
2095    paddd           m2, m5, [t2+r10*2+400* 8]
2096    paddd           m3, m6, [t2+r10*2+400*10]
2097    mova [t2+r10*2+400* 6], m4
2098    mova [t2+r10*2+400* 8], m5
2099    mova [t2+r10*2+400*10], m6
2100    vpbroadcastd    m8, [pw_455_24]
2101    punpcklwd       m0, m1, m7           ; b3
2102    punpckhwd       m1, m7
2103    pslld           m4, m2, 3
2104    pslld           m5, m3, 3
2105    paddd           m4, m2               ; a3 * 9
2106    pmaddwd         m2, m0, m0           ; b3 * b3
2107    paddd           m5, m3
2108    pmaddwd         m3, m1, m1
2109    psubd           m4, m2               ; p3
2110    psubd           m5, m3
2111    pmulld          m4, m14              ; p3 * s1
2112    pmulld          m5, m14
2113    pmaddwd         m0, m8               ; b3 * 455
2114    pmaddwd         m1, m8
2115    paddw           m4, m8
2116    paddw           m5, m8
2117    vbroadcastss    m8, [pf_256]
2118    psrld           m4, 20               ; z3 + 1
2119    psrld           m5, 20
2120    cvtdq2ps        m4, m4
2121    cvtdq2ps        m5, m5
2122    rcpps           m2, m4               ; 1 / (z3 + 1)
2123    rcpps           m3, m5
2124    pcmpgtd         m4, m8, m4
2125    pcmpgtd         m5, m8, m5
2126    mulps           m2, m8               ; 256 / (z3 + 1)
2127    mulps           m3, m8
2128    vpbroadcastd    m8, [pd_m4096]
2129    psrld           m4, 24               ; z3 < 255 ? 255 : 0
2130    psrld           m5, 24
2131    cvtps2dq        m2, m2
2132    cvtps2dq        m3, m3
2133    pminsw          m2, m4               ; x3
2134    vpbroadcastd    m4, [pd_34816]
2135    pminsw          m3, m5
2136    pmulld          m0, m2
2137    pmulld          m1, m3
2138    paddd           m0, m4               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2139    paddd           m1, m4
2140    pand            m0, m8
2141    pand            m8, m1
2142    por             m0, m2               ; a3 | (b3 << 12)
2143    por             m8, m3
2144    mova            m4, [t3+r10*4+400*8+ 8]
2145    mova            m5, [t3+r10*4+400*0+ 8]
2146    mova            m6, [t3+r10*4+400*0+40]
2147    paddw           m1, m4, [t2+r10*2+400*0]
2148    paddd           m2, m5, [t2+r10*2+400*2]
2149    paddd           m3, m6, [t2+r10*2+400*4]
2150    paddw           m1, [t1+r10*2+400*0]
2151    paddd           m2, [t1+r10*2+400*2]
2152    paddd           m3, [t1+r10*2+400*4]
2153    mova [t2+r10*2+400*0], m4
2154    mova [t2+r10*2+400*2], m5
2155    mova [t2+r10*2+400*4], m6
2156    vpbroadcastd    m4, [pd_25]
2157    mova         [t3+r10*4+400*8+ 8], xm0
2158    vextracti128 [t3+r10*4+400*8+40], m0, 1
2159    mova         [t3+r10*4+400*8+24], xm8
2160    vextracti128 [t3+r10*4+400*8+56], m8, 1
2161    vpbroadcastd    m8, [pw_164_24]
2162    punpcklwd       m0, m1, m7           ; b5
2163    vbroadcastss    m6, [pf_256]
2164    punpckhwd       m1, m7
2165    pmulld          m2, m4               ; a5 * 25
2166    pmulld          m3, m4
2167    pmaddwd         m4, m0, m0           ; b5 * b5
2168    pmaddwd         m5, m1, m1
2169    psubd           m2, m4               ; p5
2170    psubd           m3, m5
2171    pmulld          m2, m13              ; p5 * s0
2172    pmulld          m3, m13
2173    pmaddwd         m0, m8               ; b5 * 164
2174    pmaddwd         m1, m8
2175    paddw           m2, m8
2176    paddw           m3, m8
2177    vpbroadcastd    m8, [pd_34816]
2178    psrld           m2, 20               ; z5 + 1
2179    psrld           m3, 20
2180    cvtdq2ps        m2, m2
2181    cvtdq2ps        m3, m3
2182    rcpps           m4, m2               ; 1 / (z5 + 1)
2183    rcpps           m5, m3
2184    pcmpgtd         m2, m6, m2
2185    pcmpgtd         m3, m6, m3
2186    mulps           m4, m6               ; 256 / (z5 + 1)
2187    mulps           m5, m6
2188    vpbroadcastd    m6, [pd_m4096]
2189    psrld           m2, 24               ; z5 < 255 ? 255 : 0
2190    psrld           m3, 24
2191    cvtps2dq        m4, m4
2192    cvtps2dq        m5, m5
2193    pminsw          m4, m2               ; x5
2194    pminsw          m5, m3
2195    pmulld          m0, m4
2196    pmulld          m1, m5
2197    paddd           m0, m8               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
2198    paddd           m1, m8
2199    pand            m0, m6
2200    pand            m1, m6
2201    por             m0, m4               ; a5 | (b5 << 12)
2202    por             m1, m5
2203    mova         [t3+r10*4+400*0+ 8], xm0
2204    vextracti128 [t3+r10*4+400*0+40], m0, 1
2205    mova         [t3+r10*4+400*0+24], xm1
2206    vextracti128 [t3+r10*4+400*0+56], m1, 1
2207    add            r10, 16
2208    jl .v1_loop
2209    mov            r10, t2
2210    mov             t2, t1
2211    mov             t1, r10
2212    ret
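; Neighbour smoothing of the packed ab values before output. The "565", "343"
; and "222" tags refer to the horizontal weights applied to three adjacent
; packed ab dwords (5-6-5 for the 5x5 box, 3-4-3 and 2-2-2 on alternating
; rows for the 3x3 box); t3+400*12/16 hold the smoothed a5/b5 halves and
; t3+400*20/24/28 the rolling 222/343 state for the 3x3 box.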
2213.prep_n: ; initial neighbor setup
2214    mov            r10, wq
2215.prep_n_loop:
2216    movu            m0, [t3+r10*4+400*0+4]
2217    paddd           m1, m0, [t3+r10*4+400*0+0]
2218    mova            m4, [t3+r10*4+400*4+0]
2219    paddd           m1, [t3+r10*4+400*0+8]
2220    mova            m5, [t3+r10*4+400*8+0]
2221    paddd           m4, [t3+r10*4+400*4+8]
2222    paddd           m5, [t3+r10*4+400*8+8]
2223    paddd           m2, m4, [t3+r10*4+400*4+4]
2224    paddd           m3, m5, [t3+r10*4+400*8+4]
2225    paddd           m0, m1
2226    pslld           m1, 2
2227    pslld           m2, 2
2228    paddd           m1, m0                ; ab5 565
2229    paddd           m3, m3                ; ab3[ 0] 222
2230    psubd           m2, m4                ; ab3[-1] 343
2231    mova [t3+r10*4+400*20], m3
2232    pandn           m0, m6, m1            ; a5 565
2233    mova [t3+r10*4+400*24], m2
2234    psrld           m1, 12                ; b5 565
2235    mova [t3+r10*4+400*12], m0
2236    paddd           m3, m3
2237    mova [t3+r10*4+400*16], m1
2238    psubd           m3, m5                ; ab3[ 0] 343
2239    mova [t3+r10*4+400*28], m3
2240    add            r10, 8
2241    jl .prep_n_loop
2242    ret
2243ALIGN function_align
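; .n0 (even output rows): finish the 565 smoothing of ab5, split each packed
; dword back into a (low 12 bits, via pandn with pd_m4096) and b (bits 12+),
; and advance the 343/222 state of the 3x3 box. The two b - a*src terms are
; then blended with pblendw and weighted by pmaddwd with m15 (presumably
; holding the two SGR mix weights, set up before this section), rounded by
; subtracting pd_m4096 (i.e. adding 1 << 12), shifted down and added back to
; the source pixels.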
2244.n0: ; neighbor + output (even rows)
2245    mov            r10, wq
2246.n0_loop:
2247    movu            m0, [t3+r10*4+4]
2248    paddd           m4, m0, [t3+r10*4+0]
2249    paddd           m4, [t3+r10*4+8]
2250    paddd           m0, m4
2251    pslld           m4, 2
2252    paddd           m4, m0
2253    pandn           m0, m6, m4
2254    psrld           m4, 12
2255    paddd           m2, m0, [t3+r10*4+400*12] ; a5
2256    mova [t3+r10*4+400*12], m0
2257    paddd           m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
2258    mova [t3+r10*4+400*16], m4
2259    mova            m3, [t3+r10*4+400*4+0]
2260    paddd           m3, [t3+r10*4+400*4+8]
2261    paddd           m5, m3, [t3+r10*4+400*4+4]
2262    paddd           m5, m5                    ; ab3[ 1] 222
2263    mova            m4, [t3+r10*4+400*20]
2264    paddd           m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
2265    mova [t3+r10*4+400*20], m5
2266    paddd           m5, m5
2267    psubd           m5, m3                    ; ab3[ 1] 343
2268    mova [t3+r10*4+400*24], m5
2269    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
2270    pandn           m3, m6, m1
2271    psrld           m1, 12
2272    pandn           m5, m6, m4
2273    psrld           m4, 12
2274    paddd           m3, m5                    ; a3
2275    paddd           m1, m4                    ; b3 + (1 << 8)
2276    pmovzxbd        m4, [dstq+r10]
2277    pmaddwd         m2, m4                    ; a5 * src
2278    pmaddwd         m3, m4                    ; a3 * src
2279    psubd           m0, m2                    ; b5 - a5 * src + (1 << 8)
2280    psubd           m1, m3                    ; b3 - a3 * src + (1 << 8)
2281    psrld           m0, 9
2282    pslld           m1, 7
2283    pblendw         m0, m1, 0xaa
2284    pmaddwd         m0, m15
2285    psubd           m0, m6
2286    psrad           m0, 13
2287    paddd           m0, m4
2288    vextracti128   xm1, m0, 1
2289    packssdw       xm0, xm1
2290    packuswb       xm0, xm0
2291    movq    [dstq+r10], xm0
2292    add            r10, 8
2293    jl .n0_loop
2294    add           dstq, strideq
2295    ret
2296ALIGN function_align
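; .n1 (odd output rows): reuses the a5/b5 values .n0 left in t3+400*12/16
; rather than recomputing them, which appears to be why this path uses a
; (1 << 7)/psrld 8 rounding instead of the (1 << 8)/psrld 9 used in .n0;
; only the 3x3 343/222 state is advanced here.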
2297.n1: ; neighbor + output (odd rows)
2298    mov            r10, wq
2299.n1_loop:
2300    mova            m3, [t3+r10*4+400*8+0]
2301    paddd           m3, [t3+r10*4+400*8+8]
2302    paddd           m5, m3, [t3+r10*4+400*8+4]
2303    paddd           m5, m5                    ; ab3[ 1] 222
2304    mova            m4, [t3+r10*4+400*20]
2305    paddd           m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
2306    mova [t3+r10*4+400*20], m5
2307    paddd           m5, m5
2308    psubd           m5, m3                    ; ab3[ 1] 343
2309    mova [t3+r10*4+400*28], m5
2310    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
2311    pandn           m3, m6, m1
2312    psrld           m1, 12
2313    pandn           m5, m6, m4
2314    psrld           m4, 12
2315    paddd           m3, m5                    ; -a3
2316    paddd           m1, m4                    ;  b3 + (1 << 8)
2317    pmovzxbd        m4, [dstq+r10]
2318    pmaddwd         m2, m4, [t3+r10*4+400*12] ; -a5 * src
2319    mova            m0, [t3+r10*4+400*16]     ;  b5 + (1 << 7)
2320    pmaddwd         m3, m4                    ; -a3 * src
2321    psubd           m0, m2                    ; a5 * src + b5 + (1 << 7)
2322    psubd           m1, m3                    ; a3 * src + b3 + (1 << 8)
2323    psrld           m0, 8
2324    pslld           m1, 7
2325    pblendw         m0, m1, 0xaa
2326    pmaddwd         m0, m15
2327    psubd           m0, m6
2328    psrad           m0, 13
2329    paddd           m0, m4
2330    vextracti128   xm1, m0, 1
2331    packssdw       xm0, xm1
2332    packuswb       xm0, xm0
2333    movq    [dstq+r10], xm0
2334    add            r10, 8
2335    jl .n1_loop
2336    add           dstq, strideq
2337    ret
2338
2339%endif ; ARCH_X86_64
2340