; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

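; Byte-shuffle masks used to gather overlapping 16-bit pixel pairs for the
; horizontal Wiener passes (shufA-shufE); the lshuf tables replicate the
; first pixel pair to pad rows on the left edge.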
wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
wiener_lshuf5: db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
wiener_lshuf7: db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

pb_m14_m13:    times 8 db -14,-13
pb_m10_m9:     times 8 db -10, -9
pb_m6_m5:      times 8 db  -6, -5
pb_m2_m1:      times 8 db  -2, -1
pb_2_3:        times 8 db   2,  3
pb_6_7:        times 8 db   6,  7
pw_256:        times 8 dw 256
pw_1023:       times 8 dw 1023
pd_8:          times 4 dd 8
pd_4096:       times 4 dd 4096
pd_34816:      times 4 dd 34816
pd_m262128:    times 4 dd -262128
pd_0xffff:     times 4 dd 0xffff
pd_0xf00800a4: times 4 dd 0xf00800a4
pd_0xf00801c7: times 4 dd 0xf00801c7
pd_0xfffffff0: times 4 dd 0xfffffff0

wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
wiener_round:  dd 1049600, 1048832

cextern sgr_x_by_x

SECTION .text

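; Conditional-mov helpers: the mov is only emitted on the matching
; architecture, which lets the 32-bit and 64-bit builds share most of the
; code paths below.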
%macro movif64 2 ; dst, src
 %if ARCH_X86_64
    mov             %1, %2
 %endif
%endmacro

%macro movif32 2 ; dst, src
 %if ARCH_X86_32
    mov             %1, %2
 %endif
%endmacro

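; wiener_filter7: 7-tap separable Wiener filter for 16 bpc. The horizontal
; pass writes clipped intermediate rows (384*2 bytes each) into a ring
; buffer addressed through t0-t6; the vertical pass combines seven rows
; per line of output.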
INIT_XMM ssse3
%if ARCH_X86_32
DECLARE_REG_TMP 5, 6
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 13*16
 %else
  %assign extra_stack 12*16
 %endif
cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \
                              dst, stride, left, lpf, w, flt
 %if STACK_ALIGNMENT < 16
  %define lpfm        dword [esp+calloff+16*12+ 0]
  %define wm          dword [esp+calloff+16*12+ 4]
  %define hd          dword [esp+calloff+16*12+ 8]
  %define edgeb        byte [esp+calloff+16*12+12]
  %define edged       dword [esp+calloff+16*12+12]
 %else
  %define hd dword r5m
  %define edgeb byte r7m
 %endif
 %define PICmem dword [esp+calloff+4*0]
 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
 %define t1m    dword [esp+calloff+4*2]
 %define t2m    dword [esp+calloff+4*3]
 %define t3m    dword [esp+calloff+4*4]
 %define t4m    dword [esp+calloff+4*5]
 %define t5m    dword [esp+calloff+4*6]
 %define t6m    dword [esp+calloff+4*7]
 %define t2 t2m
 %define t3 t3m
 %define t4 t4m
 %define t5 t5m
 %define t6 t6m
 %define  m8 [esp+calloff+16*2]
 %define  m9 [esp+calloff+16*3]
 %define m10 [esp+calloff+16*4]
 %define m11 [esp+calloff+16*5]
 %define m12 [esp+calloff+16*6]
 %define m13 [esp+calloff+16*7]
 %define m14 [esp+calloff+16*8]
 %define m15 [esp+calloff+16*9]
 %define r10 r4
 %define base t0-wiener_shifts
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov             wd, [rstk+stack_offset+20]
    mov             wm, wd
    mov             r5, [rstk+stack_offset+24]
    mov             hd, r5
    mov             r5, [rstk+stack_offset+32]
    mov          edged, r5 ; edge
 %endif
%else
DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                     w, h, edge, flt
 %define base
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn       wd, wm
%endif
%if ARCH_X86_64
    mov           fltq, r6mp
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    movq           m13, [fltq]
    movq           m15, [fltq+16]
%else
 %if STACK_ALIGNMENT < 16
    mov             t0, [rstk+stack_offset+28]
    mov             t1, [rstk+stack_offset+36] ; pixel_max
    movq            m1, [t0]    ; fx
    movq            m3, [t0+16] ; fy
    LEA             t0, wiener_shifts
 %else
    mov           fltq, r6m
    movq            m1, [fltq]
    movq            m3, [fltq+16]
    LEA             t0, wiener_shifts
    mov             t1, r8m ; pixel_max
 %endif
    mov         PICmem, t0
%endif
    mova            m6, [base+wiener_shufA]
    mova            m7, [base+wiener_shufB]
%if ARCH_X86_64
    lea             t4, [wiener_shifts]
    add             wd, wd
    pshufd         m12, m13, q0000 ; x0 x1
    pshufd         m13, m13, q1111 ; x2 x3
    pshufd         m14, m15, q0000 ; y0 y1
    pshufd         m15, m15, q1111 ; y2 y3
    mova            m8, [wiener_shufC]
    mova            m9, [wiener_shufD]
    add           lpfq, wq
    lea             t1, [rsp+wq+16]
    add           dstq, wq
    neg             wq
    shr            t3d, 11
 %define base t4-wiener_shifts
    movd           m10, [base+wiener_round+t3*4]
    movq           m11, [base+wiener_shifts+t3*8]
    pshufd         m10, m10, q0000
    pshufd          m0, m11, q0000
    pshufd         m11, m11, q1111
    pmullw         m12, m0 ; upshift filter coefs to make the
    pmullw         m13, m0 ; horizontal downshift constant
 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
 %define base
 %define wiener_lshuf7_mem [wiener_lshuf7]
 %define pd_m262128_mem [pd_m262128]
%else
    add             wd, wd
    mova            m4, [base+wiener_shufC]
    mova            m5, [base+wiener_shufD]
    pshufd          m0, m1, q0000
    pshufd          m1, m1, q1111
    pshufd          m2, m3, q0000
    pshufd          m3, m3, q1111
    mova            m8, m4
    mova            m9, m5
    mova           m14, m2
    mova           m15, m3
    shr             t1, 11
    add           lpfq, wq
    mova            m3, [base+pd_m262128]
    movd            m4, [base+wiener_round+t1*4]
    movq            m5, [base+wiener_shifts+t1*8]
    lea             t1, [esp+extra_stack+wq+16]
    add           dstq, wq
    neg             wq
    pshufd          m4, m4, q0000
    pshufd          m2, m5, q0000
    pshufd          m5, m5, q1111
    mov             wm, wq
    pmullw          m0, m2
    pmullw          m1, m2
    mova            m2, [base+wiener_lshuf7]
 %define pd_m262128_mem [esp+calloff+16*10]
    mova pd_m262128_mem, m3
    mova           m10, m4
    mova           m11, m5
    mova           m12, m0
    mova           m13, m1
 %define wiener_lshuf7_mem [esp+calloff+16*11]
    mova wiener_lshuf7_mem, m2
%endif
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    add            r10, strideq
    mov           lpfm, r10 ; below
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov           lpfq, lpfm
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov           lpfm, r10
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call .v
    movif32         wq, wm
.v2:
    call .v
    movif32         wq, wm
    jmp .v1
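; right-edge extension: broadcast the (negative) x offset and subtract it
; from per-lane byte indices, then clamp against pb_0to15 so that lanes
; past the end of the row keep re-reading the last valid pixel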
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
    movif32         t0, PICmem
    pxor            m0, m0
    movd            m1, wd
    mova            m2, [base+pb_0to15]
    pshufb          m1, m0
    mova            m0, [base+pb_6_7]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m3, m0
    mova            m0, [base+pb_m2_m1]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m4, m0
    mova            m0, [base+pb_m10_m9]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m5, m0
    movif32         t0, t0m
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
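; horizontal pass: 7-tap filter over one row, producing a clipped
; intermediate row in the ring buffer slot at t1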
.h:
    movif64         wq, r4
    movif32         wq, wm
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq            m3, [leftq]
    movhps          m3, [lpfq+wq]
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    mova            m3, [lpfq+wq]         ; avoid accessing memory located
    pshufb          m3, wiener_lshuf7_mem ; before the start of the buffer
    jmp .h_main
.h_top:
    movif64         wq, r4
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+wq-8]
.h_main:
    mova            m4, [lpfq+wq+0]
    movu            m5, [lpfq+wq+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -20
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    mova            m2, pd_m262128_mem ; (1 << 4) - (1 << 18)
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova       [t1+wq], m0
    add             wq, 16
    jl .h_loop
    movif32         wq, wm
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    movif64         wq, r4
    movif32        t0m, t0
    movif32        t1m, t1
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq            m3, [leftq]
    movhps          m3, [lpfq+wq]
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    mova            m3, [lpfq+wq]
    pshufb          m3, wiener_lshuf7_mem
    jmp .hv_main
.hv_bottom:
    movif64         wq, r4
    movif32        t0m, t0
    movif32        t1m, t1
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+wq-8]
.hv_main:
    mova            m4, [lpfq+wq+0]
    movu            m5, [lpfq+wq+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             wd, -20
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    movif32         t1, t4m
    movif32         t0, t2m
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    mova            m2, pd_m262128_mem
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
%if ARCH_X86_64
    mova            m2, [t4+wq]
    paddw           m2, [t2+wq]
    mova            m5, [t3+wq]
%else
    mova            m2, [t1+wq]
    paddw           m2, [t0+wq]
    mov             t1, t3m
    mov             t0, t5m
    mova            m5, [t1+wq]
    mov             t1, t1m
%endif
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
%if ARCH_X86_64
    mova            m4, [t5+wq]
    paddw           m4, [t1+wq]
    psraw           m0, 1
    paddw           m3, m0, [t6+wq]
%else
    mova            m4, [t0+wq]
    paddw           m4, [t1+wq]
    mov             t0, t0m
    mov             t1, t6m
    psraw           m0, 1
    paddw           m3, m0, [t1+wq]
%endif
    mova       [t0+wq], m0
    punpcklwd       m0, m2, m5
    pmaddwd         m0, m15
    punpckhwd       m2, m5
    pmaddwd         m2, m15
    punpcklwd       m1, m3, m4
    pmaddwd         m1, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m2, m10
    paddd           m0, m1
    paddd           m2, m3
    psrad           m0, 6
    psrad           m2, 6
    packssdw        m0, m2
    pmulhw          m0, m11
    pxor            m1, m1
    pmaxsw          m0, m1
    mova     [dstq+wq], m0
    add             wq, 16
    jl .hv_loop
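; advance the ring buffer: every row pointer shifts by one and the new
; write target t0 aliases the oldest row, which is safe because the
; vertical filter above reads [t6] before [t0] is stored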
%if ARCH_X86_64
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
%else
    mov             r4, t5m
    mov             t1, t4m
    mov            t6m, r4
    mov            t5m, t1
    mov             r4, t3m
    mov             t1, t2m
    mov            t4m, r4
    mov            t3m, t1
    mov             r4, t1m
    mov             t1, t0
    mov            t2m, r4
    mov             t0, t6m
    mov             wq, wm
%endif
    add           dstq, strideq
    ret
.v:
    movif64         wq, r4
    movif32        t0m, t0
    movif32        t1m, t1
.v_loop:
%if ARCH_X86_64
    mova            m1, [t4+wq]
    paddw           m1, [t2+wq]
    mova            m2, [t3+wq]
    mova            m4, [t1+wq]
    paddw           m3, m4, [t6+wq]
    paddw           m4, [t5+wq]
%else
    mov             t0, t4m
    mov             t1, t2m
    mova            m1, [t0+wq]
    paddw           m1, [t1+wq]
    mov             t0, t3m
    mov             t1, t1m
    mova            m2, [t0+wq]
    mova            m4, [t1+wq]
    mov             t0, t6m
    mov             t1, t5m
    paddw           m3, m4, [t0+wq]
    paddw           m4, [t1+wq]
%endif
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m15
    punpckhwd       m1, m2
    pmaddwd         m1, m15
    punpcklwd       m2, m3, m4
    pmaddwd         m2, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m1, m10
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 6
    psrad           m1, 6
    packssdw        m0, m1
    pmulhw          m0, m11
    pxor            m1, m1
    pmaxsw          m0, m1
    mova     [dstq+wq], m0
    add             wq, 16
    jl .v_loop
%if ARCH_X86_64
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
%else
    mov             t0, t5m
    mov             t1, t4m
    mov             r4, t3m
    mov            t6m, t0
    mov            t5m, t1
    mov            t4m, r4
    mov             r4, t2m
    mov             t1, t1m
    mov             t0, t0m
    mov            t3m, r4
    mov            t2m, t1
%endif
    add           dstq, strideq
    ret

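; wiener_filter5: 5-tap variant of the above; same overall structure, but
; with a shorter ring buffer addressed through t0-t4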
%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign stack_size 12*16+384*8
 %else
  %assign stack_size 11*16+384*8
 %endif
cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \
                                                    lpf, w, flt
 %if STACK_ALIGNMENT < 16
  %define lpfm        dword [esp+calloff+4*6]
  %define wm          dword [esp+calloff+4*7]
  %define hd          dword [esp+calloff+16*10+0]
  %define edgeb        byte [esp+calloff+16*10+4]
  %define edged       dword [esp+calloff+16*10+4]
 %else
  %define hd dword r5m
  %define edgeb byte r7m
 %endif
 %define PICmem dword [esp+calloff+4*0]
 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
 %define t1m    dword [esp+calloff+4*2]
 %define t2m    dword [esp+calloff+4*3]
 %define t3m    dword [esp+calloff+4*4]
 %define t4m    dword [esp+calloff+4*5]
 %define t2 t2m
 %define t3 t3m
 %define t4 t4m
 %define  m8 [esp+calloff+16*2]
 %define  m9 [esp+calloff+16*3]
 %define m10 [esp+calloff+16*4]
 %define m11 [esp+calloff+16*5]
 %define m12 [esp+calloff+16*6]
 %define m13 [esp+calloff+16*7]
 %define m14 [esp+calloff+16*8]
 %define m15 [esp+calloff+16*9]
 %define base t0-wiener_shifts
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov             wd, [rstk+stack_offset+20]
    mov             wm, wd
    mov             r5, [rstk+stack_offset+24]
    mov             hd, r5
    mov             r5, [rstk+stack_offset+32]
    mov          edged, r5 ; edge
 %endif
%else
cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \
                                                   w, h, edge, flt
 %define base
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn       wd, wm
%endif
%if ARCH_X86_64
    mov           fltq, r6mp
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    movq           m12, [fltq]
    movq           m14, [fltq+16]
%else
 %if STACK_ALIGNMENT < 16
    mov             t0, [rstk+stack_offset+28]
    mov             t1, [rstk+stack_offset+36] ; pixel_max
    movq            m1, [t0]    ; fx
    movq            m3, [t0+16] ; fy
    LEA             t0, wiener_shifts
 %else
    mov           fltq, r6m
    movq            m1, [fltq]
    movq            m3, [fltq+16]
    LEA             t0, wiener_shifts
    mov             t1, r8m ; pixel_max
 %endif
    mov         PICmem, t0
%endif
    mova            m5, [base+wiener_shufE]
    mova            m6, [base+wiener_shufB]
    mova            m7, [base+wiener_shufD]
%if ARCH_X86_64
    lea             t4, [wiener_shifts]
    add             wd, wd
    punpcklwd      m11, m12, m12
    pshufd         m11, m11, q1111 ; x1
    pshufd         m12, m12, q1111 ; x2 x3
    punpcklwd      m13, m14, m14
    pshufd         m13, m13, q1111 ; y1
    pshufd         m14, m14, q1111 ; y2 y3
    shr            t3d, 11
    mova            m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add           lpfq, wq
    lea             t1, [rsp+wq+16]
    add           dstq, wq
    neg             wq
 %define base t4-wiener_shifts
    movd            m9, [base+wiener_round+t3*4]
    movq           m10, [base+wiener_shifts+t3*8]
    pshufd          m9, m9, q0000
    pshufd          m0, m10, q0000
    pshufd         m10, m10, q1111
    mova           m15, [wiener_lshuf5]
    pmullw         m11, m0
    pmullw         m12, m0
 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
 %define base
%else
    add             wd, wd
    punpcklwd       m0, m1, m1
    pshufd          m0, m0, q1111 ; x1
    pshufd          m1, m1, q1111 ; x2 x3
    punpcklwd       m2, m3, m3
    pshufd          m2, m2, q1111 ; y1
    pshufd          m3, m3, q1111 ; y2 y3
    mova            m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
    mova           m13, m2
    mova           m14, m3
    mova            m8, m4
    shr             t1, 11
    add           lpfq, wq
    movd            m2, [base+wiener_round+t1*4]
    movq            m3, [base+wiener_shifts+t1*8]
 %if STACK_ALIGNMENT < 16
    lea             t1, [esp+16*11+wq+16]
 %else
    lea             t1, [esp+16*10+wq+16]
 %endif
    add           dstq, wq
    neg             wq
    pshufd          m2, m2, q0000
    pshufd          m4, m3, q0000
    pshufd          m3, m3, q1111
    mov             wm, wq
    pmullw          m0, m4
    pmullw          m1, m4
    mova            m4, [base+wiener_lshuf5]
    mova            m9, m2
    mova           m10, m3
    mova           m11, m0
    mova           m12, m1
    mova           m15, m4
%endif
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t3, t1
    add             t1, 384*2
    add            r10, strideq
    mov           lpfm, r10 ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov           lpfq, lpfm
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov           lpfm, r10
    call .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
.v2:
    call .v
%if ARCH_X86_64
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
%else
    mov             t0, t3m
    mov             r4, t2m
    mov             t1, t1m
    mov            t4m, t0
    mov            t3m, r4
    mov            t2m, t1
    mov             wq, wm
%endif
    add           dstq, strideq
.v1:
    call .v
    jmp .end
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
    movif32         t0, PICmem
    pxor            m1, m1
    movd            m2, wd
    mova            m0, [base+pb_2_3]
    pshufb          m2, m1
    mova            m1, [base+pb_m6_m5]
    psubb           m0, m2
    psubb           m1, m2
    mova            m2, [base+pb_0to15]
    pminub          m0, m2
    pminub          m1, m2
    pshufb          m3, m0
    pshufb          m4, m1
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h:
    movif64         wq, r4
    movif32         wq, wm
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    mova            m4, [lpfq+wq]
    movd            m3, [leftq+4]
    pslldq          m4, 4
    por             m3, m4
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    mova            m3, [lpfq+wq] ; avoid accessing memory located
    pshufb          m3, m15       ; before the start of the buffer
    jmp .h_main
.h_top:
    movif64         wq, r4
    movif32         wq, wm
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+wq-4]
.h_main:
    movu            m4, [lpfq+wq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova       [t1+wq], m0
    add             wq, 16
    jl .h_loop
    movif32         wq, wm
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    movif64         wq, r4
    movif32        t0m, t0
    movif32        t1m, t1
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    mova            m4, [lpfq+wq]
    movd            m3, [leftq+4]
    pslldq          m4, 4
    por             m3, m4
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    mova            m3, [lpfq+wq]
    pshufb          m3, m15
    jmp .hv_main
.hv_bottom:
    movif64         wq, r4
    movif32        t0m, t0
    movif32        t1m, t1
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+wq-4]
.hv_main:
    movu            m4, [lpfq+wq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             wd, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    movif32         t1, t1m
    movif32         t0, t3m
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
%if ARCH_X86_64
    mova            m2, [t3+wq]
    paddw           m2, [t1+wq]
    paddd           m1, m3
    mova            m4, [t2+wq]
%else
    mova            m2, [t0+wq]
    mov             t0, t2m
    paddw           m2, [t1+wq]
    mov             t1, t4m
    paddd           m1, m3
    mova            m4, [t0+wq]
    mov             t0, t0m
%endif
    punpckhwd       m3, m2, m4
    pmaddwd         m3, m14
    punpcklwd       m2, m4
%if ARCH_X86_64
    mova            m4, [t4+wq]
%else
    mova            m4, [t1+wq]
%endif
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    pmaddwd         m2, m14
    psraw           m0, 1
    mova       [t0+wq], m0
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 6
    psrad           m0, 6
    packssdw        m0, m1
    pmulhw          m0, m10
    pxor            m1, m1
    pmaxsw          m0, m1
    mova     [dstq+wq], m0
    add             wq, 16
    jl .hv_loop
%if ARCH_X86_64
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
%else
    mov             r4, t3m
    mov             t1, t2m
    mov            t4m, r4
    mov            t3m, t1
    mov             r4, t1m
    mov             t1, t0
    mov            t2m, r4
    mov             t0, t4m
    mov             wq, wm
%endif
    add           dstq, strideq
    ret
.v:
    movif64         wq, r4
    movif32        t1m, t1
.v_loop:
%if ARCH_X86_64
    mova            m0, [t1+wq]
    paddw           m2, m0, [t3+wq]
    mova            m1, [t2+wq]
    mova            m4, [t4+wq]
%else
    mov             t0, t3m
    mova            m0, [t1+wq]
    mov             t1, t2m
    paddw           m2, m0, [t0+wq]
    mov             t0, t4m
    mova            m1, [t1+wq]
    mova            m4, [t0+wq]
%endif
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 6
    psrad           m0, 6
    packssdw        m0, m1
    pmulhw          m0, m10
    pxor            m1, m1
    pmaxsw          m0, m1
    mova     [dstq+wq], m0
    add             wq, 16
%if ARCH_X86_64
    jl .v_loop
%else
    jge .v_end
    mov             t1, t1m
    jmp .v_loop
.v_end:
%endif
    ret

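; emulated dword gather from the sgr_x_by_x table: SSSE3 has no gather
; instruction, so the indices are extracted with pextrw and the table
; entries inserted one at a time with pinsrw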
%macro GATHERDD 3 ; dst, src, tmp
    movd           %3d, %2
 %if ARCH_X86_64
    movd            %1, [r13+%3]
    pextrw         %3d, %2, 2
    pinsrw          %1, [r13+%3+2], 3
    pextrw         %3d, %2, 4
    pinsrw          %1, [r13+%3+2], 5
    pextrw         %3d, %2, 6
    pinsrw          %1, [r13+%3+2], 7
 %else
    movd            %1, [base+sgr_x_by_x-0xf03+%3]
    pextrw          %3, %2, 2
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 3
    pextrw          %3, %2, 4
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 5
    pextrw          %3, %2, 6
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 7
 %endif
%endmacro

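; look up x = sgr_x_by_x[z] for all lanes of two index vectors and pack
; the results down to one vector of words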
%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
 %if ARCH_X86_64
  %define tmp r14
 %else
  %define tmp %4
 %endif
    GATHERDD        %1, %2, tmp
    GATHERDD        %2, %3, tmp
    movif32         %4, %5
    psrld           %1, 24
    psrld           %2, 24
    packssdw        %1, %2
%endmacro

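; per-lane signed 32-bit maximum (pmaxsd is SSE4.1) emulated with a
; compare-and-select sequence; the optional 4th argument re-zeroes tmp,
; which callers use as their zero register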
%macro MAXSD 3-4 0 ; dst, src, tmp[, restore_tmp]
    pcmpgtd         %3, %1, %2
    pand            %1, %3
    pandn           %3, %2
    por             %1, %3
 %if %4 == 1
    pxor            %3, %3
 %endif
%endmacro

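; dword multiply emulated with word multiplies (pmulld is SSE4.1); src is
; expected to hold the same 16-bit factor in both halves of each dword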
%macro MULLD 3 ; dst, src, tmp
    pmulhuw         %3, %1, %2
    pmullw          %1, %2
    pslld           %3, 16
    paddd           %1, %3
%endmacro

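; sgr_filter_5x5: self-guided restoration with a 5x5 box. t1/t2 hold the
; box sums (sum and sum of squares), t4 receives the per-pixel a values
; and t3 the b values; rows are handled in pairs, with the final weighted
; output produced in .n0/.n1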
%if ARCH_X86_32
DECLARE_REG_TMP 0, 1, 2, 3, 5
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 5*16
 %else
  %assign extra_stack 3*16
 %endif
cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \
                              dst, stride, left, lpf, w
 %if STACK_ALIGNMENT < 16
  %define dstm         dword [esp+calloff+16*0+4*6]
  %define stridemp     dword [esp+calloff+16*0+4*7]
  %define leftm        dword [esp+calloff+16*3+4*0]
  %define lpfm         dword [esp+calloff+16*3+4*1]
  %define w0m          dword [esp+calloff+16*3+4*2]
  %define hd           dword [esp+calloff+16*3+4*3]
  %define edgeb         byte [esp+calloff+16*3+4*4]
  %define edged        dword [esp+calloff+16*3+4*4]
  %define leftmp leftm
 %else
  %define w0m wm
  %define hd dword r5m
  %define edgeb  byte r7m
  %define edged dword r7m
 %endif
 %define hvsrcm dword [esp+calloff+4*0]
 %define w1m    dword [esp+calloff+4*1]
 %define t0m    dword [esp+calloff+4*2]
 %define t2m    dword [esp+calloff+4*3]
 %define t3m    dword [esp+calloff+4*4]
 %define t4m    dword [esp+calloff+4*5]
 %define  m8 [base+pd_8]
 %define  m9 [base+pd_0xfffffff0]
 %define m10 [esp+calloff+16*2]
 %define m11 [base+pd_0xf00800a4]
 %define m12 [base+sgr_lshuf5]
 %define m13 [base+pd_34816]
 %define m14 [base+pw_1023]
 %define r10 r4
 %define base r6-$$
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov        strideq, [rstk+stack_offset+ 8]
    mov          leftq, [rstk+stack_offset+12]
    mov           lpfq, [rstk+stack_offset+16]
    mov             wd, [rstk+stack_offset+20]
    mov           dstm, dstq
    mov       stridemp, strideq
    mov          leftm, leftq
    mov             r1, [rstk+stack_offset+24]
    mov             r2, [rstk+stack_offset+32]
    mov           lpfm, lpfq
    mov             hd, r1
    mov          edged, r2
 %endif
%else
cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
                                                     w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn       wd, wm
%endif
%if ARCH_X86_64
    mov        paramsq, r6mp
    lea            r13, [sgr_x_by_x-0xf03]
    movifnidn       hd, hm
    add             wd, wd
    mov          edged, r7m
    movu           m10, [paramsq]
    mova           m12, [sgr_lshuf5]
    add           lpfq, wq
    mova            m8, [pd_8]
    lea             t1, [rsp+wq+20]
    mova            m9, [pd_0xfffffff0]
    add           dstq, wq
    lea             t3, [rsp+wq*2+400*12+16]
    mova           m11, [pd_0xf00800a4]
    lea             t4, [rsp+wq+400*20+16]
    pshufhw         m7, m10, q0000
    pshufb         m10, [pw_256]  ; s0
    punpckhqdq      m7, m7        ; w0
    neg             wq
    mova           m13, [pd_34816]  ; (1 << 11) + (1 << 15)
    pxor            m6, m6
    mova           m14, [pw_1023]
    psllw           m7, 4
 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm        [rsp]
%else
    mov             r1, [rstk+stack_offset+28] ; params
    LEA             r6, $$
    add             wd, wd
    movu            m1, [r1]
    add           lpfm, wq
    lea             t1, [rsp+extra_stack+wq+20]
    add           dstq, wq
    lea             t3, [rsp+extra_stack+wq*2+400*12+16]
    mov           dstm, dstq
    lea             t4, [rsp+extra_stack+wq+400*20+16]
    mov            t3m, t3
    pshufhw         m7, m1, q0000
    mov            t4m, t4
    pshufb          m1, [base+pw_256] ; s0
    punpckhqdq      m7, m7            ; w0
    psllw           m7, 4
    neg             wq
    mova           m10, m1
    pxor            m6, m6
    mov            w1m, wd
    sub             wd, 4
    mov           lpfq, lpfm
    mov            w0m, wd
 %define strideq r5
%endif
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, stridemp
    movif32        t2m, t1
    mov             t2, t1
    call .top_fixup
    add             t1, 400*6
    call .h_top
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov           lpfm, r10 ; below
    movif32        t0m, t2
    mov             t0, t2
    dec             hd
    jz .height1
    or           edged, 16
    call .h
.main:
    add           lpfq, stridemp
    movif32         t4, t4m
    call .hv
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
%if ARCH_X86_64
    test            hb, hb
%else
    mov             r4, hd
    test            r4, r4
%endif
    jz .odd_height
    call .h
    add           lpfq, stridemp
    call .hv
    movif32       dstq, dstm
    call .n0
    call .n1
    sub             hd, 2
    movif32         t0, t0m
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, lpfm
    call .h_top
    add           lpfq, stridemp
    call .hv_bottom
.end:
    movif32       dstq, dstm
    call .n0
    call .n1
.end2:
    RET
.height1:
    movif32         t4, t4m
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    movif32       dstq, dstm
    call .n0
    call .n1
.odd_height_end:
    call .v
    movif32       dstq, dstm
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov           lpfm, r10
    call .h
    lea             t2, [t1+400*6]
    movif32        t2m, t2
    call .top_fixup
    dec             hd
    jz .no_top_height1
    or           edged, 16
    mov             t0, t1
    mov             t1, t2
    movif32        t0m, t0
    jmp .main
.no_top_height1:
    movif32         t3, t3m
    movif32         t4, t4m
    call .v
    call .prep_n
    jmp .odd_height_end
.extend_right:
    movd            m0, wd
    movd            m1, [lpfq-2]
    mova            m2, [base+pw_256]
    mova            m3, [base+pb_m14_m13]
    pshufb          m0, m6
    pshufb          m1, m2
    psubb           m2, m0
    psubb           m3, m0
    mova            m0, [base+pb_0to15]
    pcmpgtb         m2, m0
    pcmpgtb         m3, m0
    pand            m4, m2
    pand            m5, m3
    pandn           m2, m1
    pandn           m3, m1
    por             m4, m2
    por             m5, m3
    ret
%assign stack_offset stack_offset+4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
 %define leftq r4
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32      leftq, leftm
    movddup         m5, [leftq]
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    add         leftmp, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    pshufb          m4, m12
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea             wq, [r4-4]
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32         wq, w0m
.h_loop:
    movu            m4, [lpfq+wq- 2]
.h_main:
    movu            m5, [lpfq+wq+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -20
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr         m2, m5, m4, 2
    paddw           m0, m4, m2
    palignr         m3, m5, m4, 6
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    palignr         m5, m4, 8
    paddw           m0, m5
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    paddd           m1, m3
    punpckhwd       m3, m4, m5
    pmaddwd         m3, m3
    shufps          m4, m5, q2121
    paddw           m0, m4             ; sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m2, m3
    test         edgeb, 16             ; y > 0
    jz .h_loop_end
    paddw           m0, [t1+wq+400*0]
    paddd           m1, [t1+wq+400*2]
    paddd           m2, [t1+wq+400*4]
.h_loop_end:
    paddd           m1, m5             ; sumsq
    paddd           m2, m4
    mova [t1+wq+400*0], m0
    mova [t1+wq+400*2], m1
    mova [t1+wq+400*4], m2
    add             wq, 16
    jl .h_loop
    ret
.top_fixup:
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wd, w0m
%endif
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+wq+400*0]
    mova            m1, [t1+wq+400*2]
    mova            m2, [t1+wq+400*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova [t2+wq+400*0], m0
    mova [t2+wq+400*2], m1
    mova [t2+wq+400*4], m2
    add             wq, 16
    jl .top_fixup_loop
    ret
ALIGN function_align
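; per-pixel coefficients: with a = boxsum of squares and b = boxsum (both
; rescaled for 16 bpc), p = a*25 - b*b, z = min(p*s >> 20, 255),
; x = sgr_x_by_x[z], and the stored b term is
; (x*b*164 + (1 << 11) + (1 << 15)) >> 12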
.hv: ; horizontal boxsum + vertical boxsum + ab
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movif32      leftq, leftm
    movddup         m5, [leftq]
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    add         leftmp, 8
    palignr         m4, m5, 10
    jmp .hv_main
.hv_extend_left:
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    pshufb          m4, m12
    jmp .hv_main
.hv_bottom:
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv_loop_start
%endif
.hv_loop:
    movif32       lpfq, hvsrcm
.hv_loop_start:
    movu            m4, [lpfq+wq- 2]
.hv_main:
    movu            m5, [lpfq+wq+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             wd, -20
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    movif32         t3, hd
    palignr         m3, m5, m4, 2
    paddw           m0, m4, m3
    palignr         m1, m5, m4, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 8
    paddw           m0, m5
    punpcklwd       m1, m4, m5
    pmaddwd         m1, m1
    paddd           m2, m1
    punpckhwd       m1, m4, m5
    pmaddwd         m1, m1
    shufps          m4, m5, q2121
    paddw           m0, m4            ; h sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m3, m1
    paddd           m2, m5            ; h sumsq
    paddd           m3, m4
    paddw           m1, m0, [t1+wq+400*0]
    paddd           m4, m2, [t1+wq+400*2]
    paddd           m5, m3, [t1+wq+400*4]
%if ARCH_X86_64
    test            hd, hd
%else
    test            t3, t3
%endif
    jz .hv_last_row
.hv_main2:
    paddw           m1, [t2+wq+400*0] ; hv sum
    paddd           m4, [t2+wq+400*2] ; hv sumsq
    paddd           m5, [t2+wq+400*4]
    mova [t0+wq+400*0], m0
    mova [t0+wq+400*2], m2
    mova [t0+wq+400*4], m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    pand            m4, m9             ; ((a + 8) >> 4) << 4
    pand            m5, m9
    psrld           m2, m4, 4
    psrld           m0, m5, 4
    paddd           m2, m4
    psrld           m4, 1
    paddd           m0, m5
    psrld           m5, 1
    paddd           m4, m2             ; a * 25
    paddd           m5, m0
    punpcklwd       m2, m3, m6
    punpckhwd       m3, m6
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m6
    MAXSD           m5, m3, m6, 1
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m10, m2        ; p * s
    MULLD           m5, m10, m2
    pmaddwd         m0, m11            ; b * 164
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20             ; min(z, 255)
    movif32         t3, t3m
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, t2, t2m
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m2
    MULLD           m1, m5, m2
    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    mova     [t4+wq+4], m3
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova  [t3+wq*2+ 8], m0
    mova  [t3+wq*2+24], m1
    add             wq, 16
    jl .hv_loop
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    movif32        t2m, t2
    movif32        t0m, t0
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+wq+400*0], m1
    paddw           m1, m0
    mova [t1+wq+400*2], m4
    paddd           m4, m2
    mova [t1+wq+400*4], m5
    paddd           m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wd, w0m
%endif
.v_loop:
    mova            m0, [t1+wq+400*0]
    mova            m2, [t1+wq+400*2]
    mova            m3, [t1+wq+400*4]
    paddw           m1, m0, [t2+wq+400*0]
    paddd           m4, m2, [t2+wq+400*2]
    paddd           m5, m3, [t2+wq+400*4]
    paddw           m0, m0
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0             ; hv sum
    paddd           m4, m2             ; hv sumsq
    paddd           m5, m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    pand            m4, m9             ; ((a + 8) >> 4) << 4
    pand            m5, m9
    psrld           m2, m4, 4
    psrld           m0, m5, 4
    paddd           m2, m4
    psrld           m4, 1
    paddd           m0, m5
    psrld           m5, 1
    paddd           m4, m2             ; a * 25
    paddd           m5, m0
    punpcklwd       m2, m3, m6
    punpckhwd       m3, m6
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m6
    MAXSD           m5, m3, m6, 1
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m10, m2        ; p * s
    MULLD           m5, m10, m2
    pmaddwd         m0, m11            ; b * 164
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20             ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, t2, t2m
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m2
    MULLD           m1, m5, m2
    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    mova     [t4+wq+4], m3
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova  [t3+wq*2+ 8], m0
    mova  [t3+wq*2+24], m1
    add             wq, 16
    jl .v_loop
    ret
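; the a/b planes are smoothed horizontally with 5:6:5 weights
; (5*left + 6*center + 5*right) and cached for reuse by the following
; row pair, hence the "565" naming below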
.prep_n: ; initial neighbor setup
    movif64         wq, r4
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*1+ 2]
    movu            m3, [t4+wq*1+ 4]
    movu            m1, [t3+wq*2+ 4]
    movu            m4, [t3+wq*2+ 8]
    movu            m2, [t3+wq*2+20]
    movu            m5, [t3+wq*2+24]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    paddw           m3, [t4+wq*1+ 0]
    paddd           m4, [t3+wq*2+ 0]
    paddd           m5, [t3+wq*2+16]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    mova [t4+wq*1+400*2+ 0], m0
    mova [t3+wq*2+400*4+ 0], m1
    mova [t3+wq*2+400*4+16], m2
    add             wq, 16
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64         wq, r4
    movif32         wd, w1m
.n0_loop:
    movu            m0, [t4+wq*1+ 2]
    movu            m3, [t4+wq*1+ 4]
    movu            m1, [t3+wq*2+ 4]
    movu            m4, [t3+wq*2+ 8]
    movu            m2, [t3+wq*2+20]
    movu            m5, [t3+wq*2+24]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    paddw           m3, [t4+wq*1+ 0]
    paddd           m4, [t3+wq*2+ 0]
    paddd           m5, [t3+wq*2+16]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    paddw           m3, m0, [t4+wq*1+400*2+ 0]
    paddd           m4, m1, [t3+wq*2+400*4+ 0]
    paddd           m5, m2, [t3+wq*2+400*4+16]
    mova [t4+wq*1+400*2+ 0], m0
    mova [t3+wq*2+400*4+ 0], m1
    mova [t3+wq*2+400*4+16], m2
    mova            m0, [dstq+wq]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1              ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2              ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova     [dstq+wq], m0
    add             wq, 16
    jl .n0_loop
    add           dstq, stridemp
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    mova            m0, [dstq+wq]
    mova            m3, [t4+wq*1+400*2+ 0]
    mova            m4, [t3+wq*2+400*4+ 0]
    mova            m5, [t3+wq*2+400*4+16]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2              ; b - a * src + (1 << 7)
    psubd           m5, m3
    psrad           m4, 8
    psrad           m5, 8
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova     [dstq+wq], m0
    add             wq, 16
    jl .n1_loop
    add           dstq, stridemp
    movif32       dstm, dstq
    ret

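; sgr_filter_3x3: self-guided restoration with a 3x3 box; same overall
; structure as the 5x5 variant above, with three-tap box sums and a
; larger a/b scratch area (t3/t4)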
%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 4*16
 %else
  %assign extra_stack 2*16
 %endif
cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
                              dst, stride, left, lpf, w
 %if STACK_ALIGNMENT < 16
  %define dstm         dword [esp+calloff+16*2+4*0]
  %define stridemp     dword [esp+calloff+16*2+4*1]
  %define leftm        dword [esp+calloff+16*2+4*2]
  %define lpfm         dword [esp+calloff+16*2+4*3]
  %define w0m          dword [esp+calloff+16*2+4*4]
  %define hd           dword [esp+calloff+16*2+4*5]
  %define edgeb         byte [esp+calloff+16*2+4*6]
  %define edged        dword [esp+calloff+16*2+4*6]
  %define leftmp leftm
 %else
  %define w0m wm
  %define hd dword r5m
  %define edgeb  byte r7m
  %define edged dword r7m
 %endif
 %define hvsrcm dword [esp+calloff+4*0]
 %define w1m    dword [esp+calloff+4*1]
 %define t3m    dword [esp+calloff+4*2]
 %define t4m    dword [esp+calloff+4*3]
 %define  m8 [base+pd_8]
 %define  m9 [esp+calloff+16*1]
 %define m10 [base+pd_0xf00801c7]
 %define m11 [base+pd_34816]
 %define m12 [base+sgr_lshuf3]
 %define m13 [base+pw_1023]
 %define m14 m6
 %define base r6-$$
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov        strideq, [rstk+stack_offset+ 8]
    mov          leftq, [rstk+stack_offset+12]
    mov           lpfq, [rstk+stack_offset+16]
    mov             wd, [rstk+stack_offset+20]
    mov           dstm, dstq
    mov       stridemp, strideq
    mov          leftm, leftq
    mov             r1, [rstk+stack_offset+24]
    mov             r2, [rstk+stack_offset+32]
    mov           lpfm, lpfq
    mov             hd, r1
    mov          edged, r2
 %endif
%else
cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \
                                                    w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn       wd, wm
%endif
%if ARCH_X86_64
    mov        paramsq, r6mp
    lea            r13, [sgr_x_by_x-0xf03]
    movifnidn       hd, hm
    add             wd, wd
    mov          edged, r7m
    movq            m9, [paramsq+4]
    add           lpfq, wq
    lea             t1, [rsp+wq+12]
    mova            m8, [pd_8]
    add           dstq, wq
    lea             t3, [rsp+wq*2+400*12+8]
    mova           m10, [pd_0xf00801c7]
    lea             t4, [rsp+wq+400*32+8]
    mova           m11, [pd_34816]
    pshuflw         m7, m9, q3333
    pshufb          m9, [pw_256]  ; s1
    punpcklqdq      m7, m7        ; w1
    neg             wq
    pxor            m6, m6
    mova           m13, [pw_1023]
    psllw           m7, 4
    mova           m12, [sgr_lshuf3]
 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov             r1, [rstk+stack_offset+28] ; params
    LEA             r6, $$
    add             wd, wd
    movq            m1, [r1+4]
    add           lpfm, wq
    lea             t1, [rsp+extra_stack+wq+20]
    add           dstq, wq
    lea             t3, [rsp+extra_stack+wq*2+400*12+16]
    mov           dstm, dstq
    lea             t4, [rsp+extra_stack+wq+400*32+16]
    mov            t3m, t3
    pshuflw         m7, m1, q3333
    mov            t4m, t4
    pshufb          m1, [base+pw_256] ; s1
    punpcklqdq      m7, m7            ; w1
    psllw           m7, 4
    neg             wq
    mova            m9, m1
    pxor            m6, m6
    mov            w1m, wd
    sub             wd, 4
    mov           lpfq, lpfm
    mov            w0m, wd
 %define strideq r5
%endif
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, stridemp
    mov             t2, t1
    add             t1, 400*6
    call .h_top
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov           lpfm, r10 ; below
    movif32         t4, t4m
    call .hv0
1880.main:
1881    dec             hd
1882    jz .height1
1883    movif32       lpfq, hvsrcm
1884    add           lpfq, stridemp
1885    call .hv1
1886    call .prep_n
1887    sub             hd, 2
1888    jl .extend_bottom
1889.main_loop:
1890    movif32       lpfq, hvsrcm
1891    add           lpfq, stridemp
1892    call .hv0
1893%if ARCH_X86_64
1894    test            hb, hb
1895%else
1896    mov             r4, hd
1897    test            r4, r4
1898%endif
1899    jz .odd_height
1900    movif32       lpfq, hvsrcm
1901    add           lpfq, stridemp
1902    call .hv1
1903    call .n0
1904    call .n1
1905    sub             hd, 2
1906    jge .main_loop
1907    test         edgeb, 8 ; LR_HAVE_BOTTOM
1908    jz .extend_bottom
1909    mov           lpfq, lpfm
1910    call .hv0_bottom
1911    movif32       lpfq, hvsrcm
1912    add           lpfq, stridemp
1913    call .hv1_bottom
1914.end:
1915    call .n0
1916    call .n1
1917.end2:
1918    RET
1919.height1:
1920    call .v1
1921    call .prep_n
1922    jmp .odd_height_end
1923.odd_height:
1924    call .v1
1925    call .n0
1926    call .n1
1927.odd_height_end:
1928    call .v0
1929    call .v1
1930    call .n0
1931    jmp .end2
1932.extend_bottom:
1933    call .v0
1934    call .v1
1935    jmp .end
1936.no_top:
1937    movif32    strideq, stridemp
1938    lea            r10, [lpfq+strideq*4]
1939    mov           lpfq, dstq
1940    lea            r10, [r10+strideq*2]
1941    mov           lpfm, r10
1942    call .h
1943%if ARCH_X86_64
1944    lea             wq, [r4-4]
1945%else
1946    mov             wq, w0m
1947    mov         hvsrcm, lpfq
1948%endif
1949    lea             t2, [t1+400*6]
1950.top_fixup_loop:
1951    mova            m0, [t1+wq+400*0]
1952    mova            m1, [t1+wq+400*2]
1953    mova            m2, [t1+wq+400*4]
1954    mova [t2+wq+400*0], m0
1955    mova [t2+wq+400*2], m1
1956    mova [t2+wq+400*4], m2
1957    add             wq, 16
1958    jl .top_fixup_loop
1959    movif32         t3, t3m
1960    movif32         t4, t4m
1961    call .v0
1962    jmp .main
1963.extend_right:
1964    movd            m1, wd
1965    movd            m5, [lpfq-2]
1966    mova            m2, [base+pw_256]
1967    mova            m3, [base+pb_0to15]
1968    pshufb          m1, m6
1969    pshufb          m5, m2
1970    psubb           m2, m1
1971    pcmpgtb         m2, m3
1972    pand            m4, m2
1973    pandn           m2, m5
1974    por             m4, m2
1975    ret
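; .extend_right replicates the rightmost valid pixel: a broadcast of the
; clip position (derived from wd) is compared against pb_0to15, and the
; resulting lane mask selects between the loaded data in m4 and the
; broadcast last sample from [lpfq-2].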
%assign stack_offset stack_offset+4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
 %define leftq r4
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32      leftq, leftm
    movddup         m5, [leftq]
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    add         leftmp, 8
    palignr         m4, m5, 12
    jmp .h_main
.h_extend_left:
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    pshufb          m4, m12
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea             wq, [r4-4]
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32         wq, w0m
.h_loop:
    movu            m4, [lpfq+wq+ 0]
.h_main:
    movu            m5, [lpfq+wq+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    mova [t1+wq+400*0], m1
    mova [t1+wq+400*2], m2
    mova [t1+wq+400*4], m3
    add             wq, 16
    jl .h_loop
    ret
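; Each horizontal pass above stores the 3-wide sum and sum of squares for
; one row; the hv/v passes below accumulate three such rows (the current
; one plus the two kept in t1/t2) before deriving the a/b coefficients.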
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32      leftq, leftm
    movddup         m5, [leftq]
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    add         leftmp, 8
    palignr         m4, m5, 12
    jmp .hv0_main
.hv0_extend_left:
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    pshufb          m4, m12
    jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv0_loop_start
%endif
.hv0_loop:
    movif32       lpfq, hvsrcm
.hv0_loop_start:
    movu            m4, [lpfq+wq+ 0]
.hv0_main:
    movu            m5, [lpfq+wq+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp             wd, -18
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    paddw           m0, m1, [t1+wq+400*0]
    paddd           m4, m2, [t1+wq+400*2]
    paddd           m5, m3, [t1+wq+400*4]
    mova [t1+wq+400*0], m1
    mova [t1+wq+400*2], m2
    mova [t1+wq+400*4], m3
    paddw           m1, m0, [t2+wq+400*0]
    paddd           m2, m4, [t2+wq+400*2]
    paddd           m3, m5, [t2+wq+400*4]
    mova [t2+wq+400*0], m0
    mova [t2+wq+400*2], m4
    mova [t2+wq+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m14
    MAXSD           m5, m3, m14
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m14        ; p * s
    MULLD           m5, m9, m14
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    movif32         t3, t3m
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m14
    MULLD           m1, m5, m14
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova     [t4+wq+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova  [t3+wq*2+ 8], m0
    mova  [t3+wq*2+24], m1
    add             wq, 16
    jl .hv0_loop
    ret
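; The ab derivation above follows the SGR math in the inline comments:
; a = ((sumsq + 8) >> 4) * 9, b = (sum + 2) >> 2, p = max(a - b*b, 0),
; z = min((p * s) >> 20, 255) (rounding and clamping are folded into the
; saturating add of pd_0xf00801c7), x = sgr_x_by_x[z], and finally
; b' = (x * b * 455 + (1 << 11) + (1 << 15)) >> 12, where 455 ~= 2^12 / 9
; is the fixed-point reciprocal of the 3x3 box area.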
ALIGN function_align
.hv1: ; horizontal boxsum + vertical boxsum + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32      leftq, leftm
    movddup         m5, [leftq]
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    add         leftmp, 8
    palignr         m4, m5, 12
    jmp .hv1_main
.hv1_extend_left:
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    pshufb          m4, m12
    jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv1_loop_start
%endif
.hv1_loop:
    movif32       lpfq, hvsrcm
.hv1_loop_start:
    movu            m4, [lpfq+wq+ 0]
.hv1_main:
    movu            m5, [lpfq+wq+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp             wd, -18
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    palignr         m1, m5, m4, 2
    paddw           m0, m4, m1
    punpcklwd       m2, m4, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m0, m5             ; h sum
    punpcklwd       m1, m5, m6
    pmaddwd         m1, m1
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m1             ; h sumsq
    paddd           m3, m5
    paddw           m1, m0, [t2+wq+400*0]
    paddd           m4, m2, [t2+wq+400*2]
    paddd           m5, m3, [t2+wq+400*4]
    mova [t2+wq+400*0], m0
    mova [t2+wq+400*2], m2
    mova [t2+wq+400*4], m3
    paddd           m4, m8
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    psrld           m5, 4
    pslld           m2, m4, 3
    pslld           m3, m5, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m14
    MAXSD           m5, m3, m14
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m14        ; p * s
    MULLD           m5, m9, m14
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    movif32         t3, t3m
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m14
    MULLD           m1, m5, m14
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova [t4+wq*1+400*2+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova [t3+wq*2+400*4+ 8], m0
    mova [t3+wq*2+400*4+24], m1
    add             wq, 16
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
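; t1 and t2 are swapped after every odd row so that the sliding two-row
; window of box sums advances without any copying.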
.v0: ; vertical boxsums + ab (even rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wd, w0m
%endif
.v0_loop:
    mova            m0, [t1+wq+400*0]
    mova            m4, [t1+wq+400*2]
    mova            m5, [t1+wq+400*4]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+wq+400*0]
    paddd           m2, m4, [t2+wq+400*2]
    paddd           m3, m5, [t2+wq+400*4]
    mova [t2+wq+400*0], m0
    mova [t2+wq+400*2], m4
    mova [t2+wq+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m14
    MAXSD           m5, m3, m14
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m14        ; p * s
    MULLD           m5, m9, m14
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m14
    MULLD           m1, m5, m14
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova [t4+wq*1+400*0+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova [t3+wq*2+400*0+ 8], m0
    mova [t3+wq*2+400*0+24], m1
    add             wq, 16
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wd, w0m
%endif
.v1_loop:
    mova            m0, [t1+wq+400*0]
    mova            m4, [t1+wq+400*2]
    mova            m5, [t1+wq+400*4]
    paddw           m1, m0, [t2+wq+400*0]
    paddd           m2, m4, [t2+wq+400*2]
    paddd           m3, m5, [t2+wq+400*4]
    mova [t2+wq+400*0], m0
    mova [t2+wq+400*2], m4
    mova [t2+wq+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m14
    MAXSD           m5, m3, m14
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m14        ; p * s
    MULLD           m5, m9, m14
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m14
    MULLD           m1, m5, m14
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova [t4+wq*1+400*2+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova [t3+wq*2+400*4+ 8], m0
    mova [t3+wq*2+400*4+24], m1
    add             wq, 16
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
    movif64         wq, r4
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*1+400*0+ 4]
    movu            m1, [t3+wq*2+400*0+ 8]
    movu            m2, [t3+wq*2+400*0+24]
    movu            m3, [t4+wq*1+400*0+ 2]
    movu            m4, [t3+wq*2+400*0+ 4]
    movu            m5, [t3+wq*2+400*0+20]
    paddw           m0, [t4+wq*1+400*0+ 0]
    paddd           m1, [t3+wq*2+400*0+ 0]
    paddd           m2, [t3+wq*2+400*0+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                ; a[-1] 444
    pslld           m4, 2                ; b[-1] 444
    pslld           m5, 2
    psubw           m3, m0               ; a[-1] 343
    psubd           m4, m1               ; b[-1] 343
    psubd           m5, m2
    mova [t4+wq*1+400*4], m3
    mova [t3+wq*2+400*8+ 0], m4
    mova [t3+wq*2+400*8+16], m5
    movu            m0, [t4+wq*1+400*2+ 4]
    movu            m1, [t3+wq*2+400*4+ 8]
    movu            m2, [t3+wq*2+400*4+24]
    movu            m3, [t4+wq*1+400*2+ 2]
    movu            m4, [t3+wq*2+400*4+ 4]
    movu            m5, [t3+wq*2+400*4+20]
    paddw           m0, [t4+wq*1+400*2+ 0]
    paddd           m1, [t3+wq*2+400*4+ 0]
    paddd           m2, [t3+wq*2+400*4+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                 ; a[ 0] 444
    pslld           m4, 2                 ; b[ 0] 444
    pslld           m5, 2
    mova [t4+wq*1+400* 6], m3
    mova [t3+wq*2+400*12+ 0], m4
    mova [t3+wq*2+400*12+16], m5
    psubw           m3, m0                ; a[ 0] 343
    psubd           m4, m1                ; b[ 0] 343
    psubd           m5, m2
    mova [t4+wq*1+400* 8], m3
    mova [t3+wq*2+400*16+ 0], m4
    mova [t3+wq*2+400*16+16], m5
    add             wq, 16
    jl .prep_n_loop
    ret
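; "444" and "343" name the horizontal weight patterns (4,4,4) and
; (3,4,3): 444 is four times the 3-tap sum, and subtracting the two
; outer taps once turns it into 343. Each output row in .n0/.n1 then
; combines a 343 of the rows above and below with a 444 of its own row.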
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64         wq, r4
    movif32         wd, w1m
.n0_loop:
    movu            m3, [t4+wq*1+400*0+4]
    movu            m1, [t4+wq*1+400*0+2]
    paddw           m3, [t4+wq*1+400*0+0]
    paddw           m1, m3
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+wq*1+400*4]
    paddw           m3, [t4+wq*1+400*6]
    mova [t4+wq*1+400*4], m2
    mova [t4+wq*1+400*6], m1
    movu            m4, [t3+wq*2+400*0+8]
    movu            m1, [t3+wq*2+400*0+4]
    paddd           m4, [t3+wq*2+400*0+0]
    paddd           m1, m4
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+wq*2+400* 8+ 0]
    paddd           m4, [t3+wq*2+400*12+ 0]
    mova [t3+wq*2+400* 8+ 0], m2
    mova [t3+wq*2+400*12+ 0], m1
    movu            m5, [t3+wq*2+400*0+24]
    movu            m1, [t3+wq*2+400*0+20]
    paddd           m5, [t3+wq*2+400*0+16]
    paddd           m1, m5
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+wq*2+400* 8+16]
    paddd           m5, [t3+wq*2+400*12+16]
    mova [t3+wq*2+400* 8+16], m2
    mova [t3+wq*2+400*12+16], m1
    mova            m0, [dstq+wq]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2               ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    pmaxsw          m0, m6
    pminsw          m0, m13
    mova     [dstq+wq], m0
    add             wq, 16
    jl .n0_loop
    add           dstq, stridemp
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    movu            m3, [t4+wq*1+400*2+4]
    movu            m1, [t4+wq*1+400*2+2]
    paddw           m3, [t4+wq*1+400*2+0]
    paddw           m1, m3
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+wq*1+400*6]
    paddw           m3, [t4+wq*1+400*8]
    mova [t4+wq*1+400*6], m1
    mova [t4+wq*1+400*8], m2
    movu            m4, [t3+wq*2+400*4+8]
    movu            m1, [t3+wq*2+400*4+4]
    paddd           m4, [t3+wq*2+400*4+0]
    paddd           m1, m4
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+wq*2+400*12+ 0]
    paddd           m4, [t3+wq*2+400*16+ 0]
    mova [t3+wq*2+400*12+ 0], m1
    mova [t3+wq*2+400*16+ 0], m2
    movu            m5, [t3+wq*2+400*4+24]
    movu            m1, [t3+wq*2+400*4+20]
    paddd           m5, [t3+wq*2+400*4+16]
    paddd           m1, m5
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+wq*2+400*12+16]
    paddd           m5, [t3+wq*2+400*16+16]
    mova [t3+wq*2+400*12+16], m1
    mova [t3+wq*2+400*16+16], m2
    mova            m0, [dstq+wq]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2               ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    pmaxsw          m0, m6
    pminsw          m0, m13
    mova     [dstq+wq], m0
    add             wq, 16
    jl .n1_loop
    add           dstq, stridemp
    movif32       dstm, dstq
    ret

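; sgr_filter_mix: combined 5x5 + 3x3 self-guided filter. Both box sizes
; are maintained side by side (sum3/sum5 in .h below), the a/b planes are
; built per box size, and the two filtered results are blended with the
; w0/w1 weights unpacked from params.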
%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 10*16
 %else
  %assign extra_stack 8*16
 %endif
cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
                              dst, stride, left, lpf, w
 %if STACK_ALIGNMENT < 16
  %define dstm         dword [esp+calloff+16*8+4*0]
  %define stridemp     dword [esp+calloff+16*8+4*1]
  %define leftm        dword [esp+calloff+16*8+4*2]
  %define lpfm         dword [esp+calloff+16*8+4*3]
  %define w0m          dword [esp+calloff+16*8+4*4]
  %define hd           dword [esp+calloff+16*8+4*5]
  %define edgeb         byte [esp+calloff+16*8+4*6]
  %define edged        dword [esp+calloff+16*8+4*6]
  %define leftmp leftm
 %else
  %define w0m wm
  %define hd dword r5m
  %define edgeb  byte r7m
  %define edged dword r7m
 %endif
 %define hvsrcm dword [esp+calloff+4*0]
 %define w1m    dword [esp+calloff+4*1]
 %define t3m    dword [esp+calloff+4*2]
 %define t4m    dword [esp+calloff+4*3]
 %xdefine m8 m6
 %define  m9 [base+pd_8]
 %define m10 [base+pd_34816]
 %define m11 [base+pd_0xf00801c7]
 %define m12 [base+pd_0xf00800a4]
 %define m13 [esp+calloff+16*4]
 %define m14 [esp+calloff+16*5]
 %define m15 [esp+calloff+16*6]
 %define  m6 [esp+calloff+16*7]
 %define base r6-$$
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov        strideq, [rstk+stack_offset+ 8]
    mov          leftq, [rstk+stack_offset+12]
    mov           lpfq, [rstk+stack_offset+16]
    mov             wd, [rstk+stack_offset+20]
    mov           dstm, dstq
    mov       stridemp, strideq
    mov          leftm, leftq
    mov             r1, [rstk+stack_offset+24]
    mov             r2, [rstk+stack_offset+32]
    mov           lpfm, lpfq
    mov             hd, r1
    mov          edged, r2
 %endif
%else
cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
                                                     w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn       wd, wm
%endif
%if ARCH_X86_64
    mov        paramsq, r6mp
    lea            r13, [sgr_x_by_x-0xf03]
    movifnidn       hd, hm
    add             wd, wd
    mov          edged, r7m
    mova           m14, [paramsq]
    add           lpfq, wq
    mova            m9, [pd_8]
    lea             t1, [rsp+wq+44]
    mova           m10, [pd_34816]
    add           dstq, wq
    mova           m11, [pd_0xf00801c7]
    lea             t3, [rsp+wq*2+400*24+40]
    mova           m12, [pd_0xf00800a4]
    lea             t4, [rsp+wq+400*52+40]
    neg             wq
    pshufd         m15, m14, q2222 ; w0 w1
    punpcklwd      m14, m14
    pshufd         m13, m14, q0000 ; s0
    pshufd         m14, m14, q2222 ; s1
    pxor            m6, m6
    psllw          m15, 2
 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov             r1, [rstk+stack_offset+28] ; params
    LEA             r6, $$
    add             wd, wd
    mova            m2, [r1]
    add           lpfm, wq
    lea             t1, [rsp+extra_stack+wq+52]
    add           dstq, wq
    lea             t3, [rsp+extra_stack+wq*2+400*24+48]
    mov           dstm, dstq
    lea             t4, [rsp+extra_stack+wq+400*52+48]
    mov            t3m, t3
    mov            t4m, t4
    neg             wq
    pshuflw         m0, m2, q0000
    pshuflw         m1, m2, q2222
    pshufhw         m2, m2, q1010
    punpcklqdq      m0, m0 ; s0
    punpcklqdq      m1, m1 ; s1
    punpckhqdq      m2, m2 ; w0 w1
    mov            w1m, wd
    pxor            m3, m3
    psllw           m2, 2
    mova           m13, m0
    mova           m14, m1
    sub             wd, 4
    mova           m15, m2
    mova            m6, m3
    mov           lpfq, lpfm
    mov            w0m, wd
 %define strideq r5
%endif
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, stridemp
    mov             t2, t1
%if ARCH_X86_64
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
%else
    mov             wq, w0m
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop
%endif
    add             t1, 400*12
    call .h_top
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov           lpfm, r10 ; below
    movif32         t4, t4m
    call .hv0
.main:
    dec             hd
    jz .height1
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv0
%if ARCH_X86_64
    test            hd, hd
%else
    mov             r4, hd
    test            r4, r4
%endif
    jz .odd_height
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, lpfm
    call .hv0_bottom
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov           lpfm, r10
    call .h
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wq, w0m
    mov         hvsrcm, lpfq
%endif
    lea             t2, [t1+400*12]
.top_fixup_loop:
    mova            m0, [t1+wq+400* 0]
    mova            m1, [t1+wq+400* 2]
    mova            m2, [t1+wq+400* 4]
    paddw           m0, m0
    mova            m3, [t1+wq+400* 6]
    paddd           m1, m1
    mova            m4, [t1+wq+400* 8]
    paddd           m2, m2
    mova            m5, [t1+wq+400*10]
    mova [t2+wq+400* 0], m0
    mova [t2+wq+400* 2], m1
    mova [t2+wq+400* 4], m2
    mova [t2+wq+400* 6], m3
    mova [t2+wq+400* 8], m4
    mova [t2+wq+400*10], m5
    add             wq, 16
    jl .top_fixup_loop
    movif32         t3, t3m
    movif32         t4, t4m
    call .v0
    jmp .main
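; In the missing-top fixup above, the 5x5 row sums are doubled while the
; 3x3 ones are copied verbatim; this presumably accounts for how many
; times the replicated top row is counted by each box size.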
.h: ; horizontal boxsum
%assign stack_offset stack_offset+4
%assign calloff 4
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
 %define leftq r4
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32      leftq, leftm
    movddup         m5, [leftq]
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    add         leftmp, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    pshufb          m4, [base+sgr_lshuf5]
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea             wq, [r4-4]
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32         wq, w0m
.h_loop:
    movu            m4, [lpfq+wq- 2]
.h_main:
    movu            m5, [lpfq+wq+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -20
    jl .h_have_right
%if ARCH_X86_32
    pxor            m8, m8
%endif
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
.h_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; sum3
    punpcklwd       m7, m0, m6
    pmaddwd         m7, m7
    punpckhwd       m0, m6
    pmaddwd         m0, m0
    paddd           m2, m7             ; sumsq3
    palignr         m5, m4, 8
    punpcklwd       m7, m5, m4
    paddw           m8, m4, m5
    pmaddwd         m7, m7
    punpckhwd       m5, m4
    pmaddwd         m5, m5
    paddd           m3, m0
    mova [t1+wq+400* 6], m1
    mova [t1+wq+400* 8], m2
    mova [t1+wq+400*10], m3
    paddw           m8, m1             ; sum5
    paddd           m7, m2             ; sumsq5
    paddd           m5, m3
    mova [t1+wq+400* 0], m8
    mova [t1+wq+400* 2], m7
    mova [t1+wq+400* 4], m5
    add             wq, 16
    jl .h_loop
    ret
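; The 5-tap sums reuse the 3-tap partials: sum5 = sum3 plus the two
; outermost samples (and likewise for sumsq5), as the "sum5"/"sumsq5"
; lines above show.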
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32      leftq, leftm
    movddup         m5, [leftq]
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    add         leftmp, 8
    palignr         m4, m5, 10
    jmp .hv0_main
.hv0_extend_left:
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    pshufb          m4, [base+sgr_lshuf5]
    jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv0_loop_start
%endif
.hv0_loop:
    movif32       lpfq, hvsrcm
.hv0_loop_start:
    movu            m4, [lpfq+wq- 2]
.hv0_main:
    movu            m5, [lpfq+wq+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp             wd, -20
    jl .hv0_have_right
%if ARCH_X86_32
    pxor            m8, m8
%endif
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
.hv0_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    movif32         t3, t3m
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; h sum3
    punpcklwd       m7, m0, m6
    pmaddwd         m7, m7
    punpckhwd       m0, m6
    pmaddwd         m0, m0
    paddd           m2, m7             ; h sumsq3
    palignr         m5, m4, 8
    punpcklwd       m7, m5, m4
    paddw           m8, m4, m5
    pmaddwd         m7, m7
    punpckhwd       m5, m4
    pmaddwd         m5, m5
    paddd           m3, m0
    paddw           m8, m1             ; h sum5
    paddd           m7, m2             ; h sumsq5
    paddd           m5, m3
    mova [t3+wq*2+400*8+ 8], m8
    mova [t3+wq*2+400*0+ 8], m7
    mova [t3+wq*2+400*0+24], m5
    paddw           m8, [t1+wq+400* 0]
    paddd           m7, [t1+wq+400* 2]
    paddd           m5, [t1+wq+400* 4]
    mova [t1+wq+400* 0], m8
    mova [t1+wq+400* 2], m7
    mova [t1+wq+400* 4], m5
    paddw           m0, m1, [t1+wq+400* 6]
    paddd           m4, m2, [t1+wq+400* 8]
    paddd           m5, m3, [t1+wq+400*10]
    mova [t1+wq+400* 6], m1
    mova [t1+wq+400* 8], m2
    mova [t1+wq+400*10], m3
    paddw           m1, m0, [t2+wq+400* 6]
    paddd           m2, m4, [t2+wq+400* 8]
    paddd           m3, m5, [t2+wq+400*10]
    mova [t2+wq+400* 6], m0
    mova [t2+wq+400* 8], m4
    mova [t2+wq+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MAXSD           m4, m2, m7
    MAXSD           m5, m3, m7
    psubd           m4, m2             ; p3
    psubd           m5, m3
    MULLD           m4, m14, m7        ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20             ; min(z3, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m7
    MULLD           m1, m5, m7
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*1+400*2+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova [t3+wq*2+400*4+ 8], m0
    mova [t3+wq*2+400*4+24], m1
    add             wq, 16
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsum + vertical boxsum + ab3 + ab5 (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32      leftq, leftm
    movddup         m5, [leftq]
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    add         leftmp, 8
    palignr         m4, m5, 10
    jmp .hv1_main
.hv1_extend_left:
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    pshufb          m4, [base+sgr_lshuf5]
    jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv1_loop_start
%endif
.hv1_loop:
    movif32       lpfq, hvsrcm
.hv1_loop_start:
    movu            m4, [lpfq+wq- 2]
.hv1_main:
    movu            m5, [lpfq+wq+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp             wd, -20
    jl .hv1_have_right
%if ARCH_X86_32
    pxor            m8, m8
%endif
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
.hv1_have_right:
    palignr         m7, m5, m4, 2
    palignr         m3, m5, m4, 4
    paddw           m2, m7, m3
    punpcklwd       m0, m7, m3
    pmaddwd         m0, m0
    punpckhwd       m7, m3
    pmaddwd         m7, m7
    palignr         m3, m5, m4, 6
    paddw           m2, m3             ; h sum3
    punpcklwd       m1, m3, m6
    pmaddwd         m1, m1
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    paddd           m0, m1             ; h sumsq3
    palignr         m5, m4, 8
    punpckhwd       m1, m4, m5
    paddw           m8, m4, m5
    pmaddwd         m1, m1
    punpcklwd       m4, m5
    pmaddwd         m4, m4
    paddd           m7, m3
    paddw           m5, m2, [t2+wq+400* 6]
    mova [t2+wq+400* 6], m2
    paddw           m8, m2             ; h sum5
    paddd           m2, m0, [t2+wq+400* 8]
    paddd           m3, m7, [t2+wq+400*10]
    mova [t2+wq+400* 8], m0
    mova [t2+wq+400*10], m7
    paddd           m4, m0             ; h sumsq5
    paddd           m1, m7
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m0, m2, 3
    pslld           m7, m3, 3
    paddd           m2, m0             ; ((a3 + 8) >> 4) * 9
    paddd           m3, m7
    psrlw           m7, m5, 1
    pavgw           m7, m6             ; (b3 + 2) >> 2
    punpcklwd       m0, m7, m6
    pmaddwd         m0, m0
    punpckhwd       m7, m6
    pmaddwd         m7, m7
%if ARCH_X86_32
    mova      [esp+20], m8
%else
    SWAP            m8, m6
%endif
    MAXSD           m2, m0, m8
    MAXSD           m3, m7, m8
    pxor            m8, m8
    psubd           m2, m0             ; p3
    psubd           m3, m7
    punpcklwd       m0, m5, m8         ; b3
    punpckhwd       m5, m8
    MULLD           m2, m14, m8        ; p3 * s1
    MULLD           m3, m14, m8
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m5, m11
    paddusw         m2, m11
    paddusw         m3, m11
    psrld           m2, 20             ; min(z3, 255)
    movif32         t3, t3m
    psrld           m3, 20
    GATHER_X_BY_X   m8, m2, m3, r0, dstm
    punpcklwd       m2, m8, m8
    punpckhwd       m3, m8, m8
    MULLD           m0, m2, m7
    MULLD           m5, m3, m7
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m5, m10
    psrld           m0, 12
    psrld           m5, 12
    mova [t4+wq*1+400*4+4], m8
    mova [t3+wq*2+400*8+ 8], m0
    mova [t3+wq*2+400*8+24], m5
%if ARCH_X86_32
    mova            m8, [esp+20]
%else
    SWAP            m6, m8
    pxor            m6, m6
%endif
    paddw           m5, m8, [t2+wq+400*0]
    paddd           m2, m4, [t2+wq+400*2]
    paddd           m3, m1, [t2+wq+400*4]
    paddw           m5, [t1+wq+400*0]
    paddd           m2, [t1+wq+400*2]
    paddd           m3, [t1+wq+400*4]
    mova [t2+wq+400*0], m8
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    mova [t2+wq+400*2], m4
    pslld           m8, m2, 4
    mova [t2+wq+400*4], m1
    pslld           m4, m3, 4
    paddd           m8, m2
    pslld           m2, 3
    paddd           m4, m3
    pslld           m3, 3
    paddd           m2, m8             ; ((a5 + 8) >> 4) * 25
    paddd           m3, m4
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    psrlw           m1, m5, 1
    pavgw           m1, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m1, m7
    pmaddwd         m4, m4
    punpckhwd       m1, m7
    pmaddwd         m1, m1
    punpcklwd       m0, m5, m7         ; b5
    punpckhwd       m5, m7
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MAXSD           m2, m4, m7
    psubd           m2, m4             ; p5
    MAXSD           m3, m1, m7
    psubd           m3, m1
    MULLD           m2, m13, m7        ; p5 * s0
    MULLD           m3, m13, m7
    pmaddwd         m0, m12            ; b5 * 164
    pmaddwd         m5, m12
    paddusw         m2, m12
    paddusw         m3, m12
    psrld           m2, 20             ; min(z5, 255)
    psrld           m3, 20
    GATHER_X_BY_X   m1, m2, m3, r0, dstm
    punpcklwd       m2, m1, m1
    punpckhwd       m3, m1, m1
    MULLD           m0, m2, m7
    MULLD           m5, m3, m7
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m5, m10
    mova [t4+wq*1+400*0+ 4], m1
    psrld           m0, 12
    psrld           m5, 12
    mova [t3+wq*2+400*0+ 8], m0
    mova [t3+wq*2+400*0+24], m5
    add             wq, 16
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
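; 164 plays the same role for the 5x5 box as 455 does for the 3x3 one:
; 164 ~= 2^12 / 25 and 455 ~= 2^12 / 9, the fixed-point reciprocals of
; the respective box areas used when forming b5/b3.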
.v0: ; vertical boxsums + ab3 (even rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wd, w0m
%endif
.v0_loop:
    mova            m0, [t1+wq+400* 6]
    mova            m4, [t1+wq+400* 8]
    mova            m5, [t1+wq+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+wq+400* 6]
    paddd           m2, m4, [t2+wq+400* 8]
    paddd           m3, m5, [t2+wq+400*10]
    mova [t2+wq+400* 6], m0
    mova [t2+wq+400* 8], m4
    mova [t2+wq+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MAXSD           m4, m2, m7
    MAXSD           m5, m3, m7
    psubd           m4, m2             ; p3
    psubd           m5, m3
    MULLD           m4, m14, m7        ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20             ; min(z3, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m7
    MULLD           m1, m5, m7
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*1+400*2+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            m3, [t1+wq+400*0]
    mova            m4, [t1+wq+400*2]
    mova            m5, [t1+wq+400*4]
    mova [t3+wq*2+400*8+ 8], m3
    mova [t3+wq*2+400*0+ 8], m4
    mova [t3+wq*2+400*0+24], m5
    paddw           m3, m3 ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova [t1+wq+400*0], m3
    mova [t1+wq+400*2], m4
    mova [t1+wq+400*4], m5
    mova [t3+wq*2+400*4+ 8], m0
    mova [t3+wq*2+400*4+24], m1
    add             wq, 16
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab3 + ab5 (odd rows)
3294%if ARCH_X86_64
3295    lea             wq, [r4-4]
3296%else
3297    mov             wd, w0m
3298%endif
3299.v1_loop:
3300    mova            m4, [t1+wq+400* 6]
3301    mova            m5, [t1+wq+400* 8]
3302    mova            m7, [t1+wq+400*10]
3303    paddw           m1, m4, [t2+wq+400* 6]
3304    paddd           m2, m5, [t2+wq+400* 8]
3305    paddd           m3, m7, [t2+wq+400*10]
3306    mova [t2+wq+400* 6], m4
3307    mova [t2+wq+400* 8], m5
3308    mova [t2+wq+400*10], m7
3309    paddd           m2, m9
3310    paddd           m3, m9
3311    psrld           m2, 4              ; (a3 + 8) >> 4
3312    psrld           m3, 4
3313%if ARCH_X86_32
3314    pxor            m7, m7
3315%else
3316    SWAP            m7, m6
3317%endif
3318    pslld           m4, m2, 3
3319    pslld           m5, m3, 3
3320    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
3321    paddd           m5, m3
3322    psrlw           m3, m1, 1
3323    pavgw           m3, m7             ; (b3 + 2) >> 2
3324    punpcklwd       m2, m3, m7
3325    pmaddwd         m2, m2
3326    punpckhwd       m3, m7
3327    pmaddwd         m3, m3
3328    punpcklwd       m0, m1, m7         ; b3
3329    punpckhwd       m1, m7
3330%if ARCH_X86_64
3331    SWAP            m7, m6
3332%endif
3333    MAXSD           m4, m2, m7
3334    MAXSD           m5, m3, m7
3335    psubd           m4, m2             ; p3
3336    psubd           m5, m3
3337    MULLD           m4, m14, m7        ; p3 * s1
3338    MULLD           m5, m14, m7
3339    pmaddwd         m0, m11            ; b3 * 455
3340    pmaddwd         m1, m11
3341    paddusw         m4, m11
3342    paddusw         m5, m11
3343    psrld           m4, 20             ; min(z3, 255)
3344    psrld           m5, 20
3345    GATHER_X_BY_X   m3, m4, m5, r0, dstm
3346    punpcklwd       m4, m3, m3
3347    punpckhwd       m5, m3, m3
3348    MULLD           m0, m4, m7
3349    MULLD           m1, m5, m7
3350    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3351    paddd           m1, m10
3352    mova [t4+wq*1+400*4+4], m3
3353    psrld           m0, 12
3354    psrld           m8, m1, 12
3355    mova            m4, [t3+wq*2+400*8+ 8]
3356    mova            m5, [t3+wq*2+400*0+ 8]
3357    mova            m7, [t3+wq*2+400*0+24]
3358    paddw           m1, m4, [t2+wq+400*0]
3359    paddd           m2, m5, [t2+wq+400*2]
3360    paddd           m3, m7, [t2+wq+400*4]
3361    paddw           m1, [t1+wq+400*0]
3362    paddd           m2, [t1+wq+400*2]
3363    paddd           m3, [t1+wq+400*4]
3364    mova [t2+wq+400*0], m4
3365    mova [t2+wq+400*2], m5
3366    mova [t2+wq+400*4], m7
3367    paddd           m2, m9
3368    paddd           m3, m9
3369    psrld           m2, 4              ; (a5 + 8) >> 4
3370    psrld           m3, 4
3371    mova         [t3+wq*2+400*8+ 8], m0
3372    pslld           m4, m2, 4
3373    mova         [t3+wq*2+400*8+24], m8
3374    pslld           m5, m3, 4
3375    paddd           m4, m2
3376    pslld           m2, 3
3377    paddd           m5, m3
3378    pslld           m3, 3
3379    paddd           m2, m4
3380    paddd           m3, m5
3381%if ARCH_X86_32
3382    pxor            m7, m7
3383%else
3384    SWAP            m7, m6
3385%endif
3386    psrlw           m5, m1, 1
3387    pavgw           m5, m7             ; (b5 + 2) >> 2
3388    punpcklwd       m4, m5, m7
3389    pmaddwd         m4, m4
3390    punpckhwd       m5, m7
3391    pmaddwd         m5, m5
3392    punpcklwd       m0, m1, m7         ; b5
3393    punpckhwd       m1, m7
3394%if ARCH_X86_64
3395    SWAP            m7, m6
3396%endif
3397    MAXSD           m2, m4, m7
3398    psubd           m2, m4             ; p5
3399    MAXSD           m3, m5, m7
3400    psubd           m3, m5
3401    MULLD           m2, m13, m7        ; p5 * s0
3402    MULLD           m3, m13, m7
3403    pmaddwd         m0, m12            ; b5 * 164
3404    pmaddwd         m1, m12
3405    paddusw         m2, m12
3406    paddusw         m3, m12
3407    psrld           m2, 20             ; min(z5, 255)
3408    psrld           m3, 20
3409    GATHER_X_BY_X   m4, m2, m3, r0, dstm
3410    punpcklwd       m2, m4, m4
3411    punpckhwd       m3, m4, m4
3412    MULLD           m0, m2, m7
3413    MULLD           m1, m3, m7
3414    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
3415    paddd           m1, m10
3416    mova [t4+wq*1+400*0+ 4], m4
3417    psrld           m0, 12
3418    psrld           m1, 12
3419    mova [t3+wq*2+400*0+ 8], m0
3420    mova [t3+wq*2+400*0+24], m1
3421    add             wq, 16
3422    jl .v1_loop
3423    mov            r10, t2
3424    mov             t2, t1
3425    mov             t1, r10
3426    ret
3427.prep_n: ; initial neighbor setup
3428    movif64         wq, r4
3429    movif32         wd, w1m
3430.prep_n_loop:
3431    movu            m0, [t4+wq*1+400*0+ 2]
3432    movu            m1, [t3+wq*2+400*0+ 4]
3433    movu            m2, [t3+wq*2+400*0+20]
3434    movu            m7, [t4+wq*1+400*0+ 4]
3435    movu            m8, [t3+wq*2+400*0+ 8]
3436    paddw           m3, m0, [t4+wq*1+400*0+ 0]
3437    paddd           m4, m1, [t3+wq*2+400*0+ 0]
3438    paddd           m5, m2, [t3+wq*2+400*0+16]
3439    paddw           m3, m7
3440    paddd           m4, m8
3441    movu            m7, [t3+wq*2+400*0+24]
3442    paddw           m0, m3
3443    paddd           m1, m4
3444    psllw           m3, 2
3445    pslld           m4, 2
3446    paddd           m5, m7
3447    paddd           m2, m5
3448    pslld           m5, 2
3449    paddw           m0, m3               ; a5 565
3450    paddd           m1, m4               ; b5 565
3451    paddd           m2, m5
3452    mova [t4+wq*1+400* 6+ 0], m0
3453    mova [t3+wq*2+400*12+ 0], m1
3454    mova [t3+wq*2+400*12+16], m2
    movu            m0, [t4+wq*1+400*2+ 4]
    movu            m1, [t3+wq*2+400*4+ 8]
    movu            m2, [t3+wq*2+400*4+24]
    movu            m3, [t4+wq*1+400*2+ 2]
    movu            m4, [t3+wq*2+400*4+ 4]
    movu            m5, [t3+wq*2+400*4+20]
    paddw           m0, [t4+wq*1+400*2+ 0]
    paddd           m1, [t3+wq*2+400*4+ 0]
    paddd           m2, [t3+wq*2+400*4+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                ; a3[-1] 444
    pslld           m4, 2                ; b3[-1] 444
    pslld           m5, 2
    psubw           m3, m0               ; a3[-1] 343
    psubd           m4, m1               ; b3[-1] 343
    psubd           m5, m2
    mova [t4+wq*1+400* 8+ 0], m3
    mova [t3+wq*2+400*16+ 0], m4
    mova [t3+wq*2+400*16+16], m5
    movu            m0, [t4+wq*1+400*4+ 4]
    movu            m1, [t3+wq*2+400*8+ 8]
    movu            m2, [t3+wq*2+400*8+24]
    movu            m3, [t4+wq*1+400*4+ 2]
    movu            m4, [t3+wq*2+400*8+ 4]
    movu            m5, [t3+wq*2+400*8+20]
    paddw           m0, [t4+wq*1+400*4+ 0]
    paddd           m1, [t3+wq*2+400*8+ 0]
    paddd           m2, [t3+wq*2+400*8+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                 ; a3[ 0] 444
    pslld           m4, 2                 ; b3[ 0] 444
    pslld           m5, 2
    mova [t4+wq*1+400*10+ 0], m3
    mova [t3+wq*2+400*20+ 0], m4
    mova [t3+wq*2+400*20+16], m5
    psubw           m3, m0                ; a3[ 0] 343
    psubd           m4, m1                ; b3[ 0] 343
    psubd           m5, m2
    mova [t4+wq*1+400*12+ 0], m3
    mova [t3+wq*2+400*24+ 0], m4
    mova [t3+wq*2+400*24+16], m5
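; Added note: the t4 slots [400* 8]/[400*10]/[400*12] (and the matching t3
; slots [400*16]/[400*20]/[400*24]) hold 343/444 sums for three consecutive
; 3x3 rows; .n0 and .n1 rotate through them as the filter moves down.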
    add             wq, 16
    jl .prep_n_loop
    ret
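; Added note: even output rows compute a fresh 565 row sum, add it to the one
; saved by .prep_n (or by the previous .n0 pass), and emit one row of pixels
; while also advancing the 3x3 343/444 state by one row.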
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64         wq, r4
    movif32         wd, w1m
.n0_loop:
    movu            m0, [t4+wq*1+ 4]
    movu            m2, [t4+wq*1+ 2]
    paddw           m0, [t4+wq*1+ 0]
    paddw           m0, m2
    paddw           m2, m0
    psllw           m0, 2
    paddw           m0, m2               ; a5
    movu            m4, [t3+wq*2+ 8]
    movu            m5, [t3+wq*2+24]
    movu            m1, [t3+wq*2+ 4]
    movu            m3, [t3+wq*2+20]
    paddd           m4, [t3+wq*2+ 0]
    paddd           m5, [t3+wq*2+16]
    paddd           m4, m1
    paddd           m5, m3
    paddd           m1, m4
    paddd           m3, m5
    pslld           m4, 2
    pslld           m5, 2
    paddd           m4, m1               ; b5
    paddd           m5, m3
    movu            m2, [t4+wq*1+400* 6]
    paddw           m2, m0
    mova [t4+wq*1+400* 6], m0
    paddd           m0, m4, [t3+wq*2+400*12+ 0]
    paddd           m1, m5, [t3+wq*2+400*12+16]
    mova [t3+wq*2+400*12+ 0], m4
    mova [t3+wq*2+400*12+16], m5
    mova [rsp+16+ARCH_X86_32*4], m1
    movu            m3, [t4+wq*1+400*2+4]
    movu            m5, [t4+wq*1+400*2+2]
    paddw           m3, [t4+wq*1+400*2+0]
    paddw           m5, m3
    psllw           m5, 2                ; a3[ 1] 444
    psubw           m4, m5, m3           ; a3[ 1] 343
    movu            m3, [t4+wq*1+400* 8]
    paddw           m3, [t4+wq*1+400*10]
    paddw           m3, m4
    mova [t4+wq*1+400* 8], m4
    mova [t4+wq*1+400*10], m5
    movu            m1, [t3+wq*2+400*4+ 8]
    movu            m5, [t3+wq*2+400*4+ 4]
    movu            m7, [t3+wq*2+400*4+24]
    movu            m8, [t3+wq*2+400*4+20]
    paddd           m1, [t3+wq*2+400*4+ 0]
    paddd           m7, [t3+wq*2+400*4+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2                ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1           ; b3[ 1] 343
%if ARCH_X86_32
    mova      [esp+52], m8
    psubd           m8, m7
%else
    psubd           m6, m8, m7
    SWAP            m8, m6
%endif
    paddd           m1, m4, [t3+wq*2+400*16+ 0]
    paddd           m7, m8, [t3+wq*2+400*16+16]
    paddd           m1, [t3+wq*2+400*20+ 0]
    paddd           m7, [t3+wq*2+400*20+16]
    mova [t3+wq*2+400*16+ 0], m4
    mova [t3+wq*2+400*16+16], m8
    mova [t3+wq*2+400*20+ 0], m5
%if ARCH_X86_32
    mova            m8, [esp+52]
%else
    SWAP            m8, m6
    pxor            m6, m6
%endif
    mova [t3+wq*2+400*20+16], m8
    mova [rsp+32+ARCH_X86_32*4], m7
    movu            m5, [dstq+wq]
    punpcklwd       m4, m5, m6
    punpcklwd       m7, m2, m6
    pmaddwd         m7, m4               ; a5 * src
    punpcklwd       m8, m3, m6
    pmaddwd         m8, m4               ; a3 * src
    punpckhwd       m5, m6
    punpckhwd       m2, m6
    pmaddwd         m2, m5
    punpckhwd       m3, m6
    pmaddwd         m3, m5
    pslld           m4, 13
    pslld           m5, 13
    psubd           m0, m7               ; b5 - a5 * src + (1 << 8)
    psubd           m1, m8               ; b3 - a3 * src + (1 << 8)
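; Added note: the 5x5 and 3x3 intermediates are packed into the low and high
; 16 bits of each dword so that a single pmaddwd against m15 (loaded earlier
; with the two per-filter weights) blends both filter outputs at once.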
    mova            m7, [base+pd_0xffff]
    psrld           m0, 9
    pslld           m1, 7
    pand            m0, m7
    pandn           m8, m7, m1
    por             m0, m8
    mova            m1, [rsp+16+ARCH_X86_32*4]
    mova            m8, [rsp+32+ARCH_X86_32*4]
    psubd           m1, m2
    psubd           m8, m3
    mova            m2, [base+pd_4096]
    psrld           m1, 9
    pslld           m8, 7
    pand            m1, m7
    pandn           m7, m8
    por             m1, m7
    pmaddwd         m0, m15
    pmaddwd         m1, m15
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    paddd           m4, m2
    paddd           m5, m2
    paddd           m0, m4
    paddd           m1, m5
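; Added note: m4/m5 hold (src << 13) + 4096, so m0/m1 are now the biased sum
; of source and weighted filter outputs; >> 8, clamp to zero, then >> 5
; (13 bits total) produces the final pixels.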
    psrad           m0, 8
    psrad           m1, 8
    packssdw        m0, m1               ; clip
    pmaxsw          m0, m7
    psrlw           m0, 5
    mova     [dstq+wq], m0
    add             wq, 16
    jl .n0_loop
    add           dstq, stridemp
    ret
%if ARCH_X86_64
    SWAP            m6, m7
%endif
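; Added note: odd output rows reuse the 5x5 sums saved by .n0 (new 5x5 box
; sums are only produced on even rows), so only the 3x3 343/444 state is
; advanced before applying the same weighting and clipping as in .n0.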
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    movu            m3, [t4+wq*1+400*4+4]
    movu            m5, [t4+wq*1+400*4+2]
    paddw           m3, [t4+wq*1+400*4+0]
    paddw           m5, m3
    psllw           m5, 2                ; a3[ 1] 444
    psubw           m4, m5, m3           ; a3[ 1] 343
    paddw           m3, m4, [t4+wq*1+400*12]
    paddw           m3, [t4+wq*1+400*10]
    mova [t4+wq*1+400*10], m5
    mova [t4+wq*1+400*12], m4
    movu            m1, [t3+wq*2+400*8+ 8]
    movu            m5, [t3+wq*2+400*8+ 4]
    movu            m7, [t3+wq*2+400*8+24]
    movu            m8, [t3+wq*2+400*8+20]
    paddd           m1, [t3+wq*2+400*8+ 0]
    paddd           m7, [t3+wq*2+400*8+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2                ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1           ; b3[ 1] 343
    psubd           m0, m8, m7
    paddd           m1, m4, [t3+wq*2+400*24+ 0]
    paddd           m7, m0, [t3+wq*2+400*24+16]
    paddd           m1, [t3+wq*2+400*20+ 0]
    paddd           m7, [t3+wq*2+400*20+16]
    mova [t3+wq*2+400*20+ 0], m5
    mova [t3+wq*2+400*20+16], m8
    mova [t3+wq*2+400*24+ 0], m4
    mova [t3+wq*2+400*24+16], m0
    mova            m5, [dstq+wq]
    mova            m2, [t4+wq*1+400* 6]
    punpcklwd       m4, m5, m6
    punpcklwd       m8, m2, m6
    pmaddwd         m8, m4               ; a5 * src
    punpcklwd       m0, m3, m6
    pmaddwd         m0, m4               ; a3 * src
    punpckhwd       m5, m6
    punpckhwd       m2, m6
    pmaddwd         m2, m5
    punpckhwd       m3, m6
    pmaddwd         m3, m5
    psubd           m1, m0               ; b3 - a3 * src + (1 << 8)
    pslld           m4, 13
    pslld           m5, 13
    mova            m0, [t3+wq*2+400*12+ 0]
    psubd           m0, m8               ; b5 - a5 * src + (1 << 8)
    mova            m8, [t3+wq*2+400*12+16]
    psubd           m8, m2
    psubd           m7, m3
    mova            m2, [base+pd_0xffff]
    pslld           m1, 7
    psrld           m0, 8
    psrld           m8, 8
    pslld           m7, 7
    pand            m0, m2
    pandn           m3, m2, m1
    por             m0, m3
    pand            m8, m2
    pandn           m2, m7
    por             m2, m8
    mova            m1, [base+pd_4096]
    pmaddwd         m0, m15
    pmaddwd         m2, m15
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    pxor            m7, m7
    paddd           m4, m1
    paddd           m5, m1
    paddd           m0, m4
    paddd           m2, m5
    psrad           m0, 8
    psrad           m2, 8
    packssdw        m0, m2              ; clip
    pmaxsw          m0, m7
    psrlw           m0, 5
    mova     [dstq+wq], m0
    add             wq, 16
    jl .n1_loop
    add           dstq, stridemp
    movif32       dstm, dstq
    ret
