; xref: /aosp_15_r20/external/libaom/aom_dsp/x86/subpel_variance_ssse3.asm (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constant: eight words of 8, added before the >>4 shift that
; follows each bilinear filter multiply.
pw_8: times  8 dw  8

; Bilinear filter taps for pmaddubsw, one 16-byte row per subpel offset.
; Rows are indexed by (offset << filter_idx_shift), i.e. offset*16 bytes.
; Each row interleaves the byte tap pair (16 - x, x) eight times, stepping
; x by 2 per row; the middle row (offset 4, i.e. half-pel) is sixteen 8s.
bilin_filter_m_ssse3: times  8 db 16,  0
                      times  8 db 14,  2
                      times  8 db 12,  4
                      times  8 db 10,  6
                      times 16 db  8
                      times  8 db  6, 10
                      times  8 db  4, 12
                      times  8 db  2, 14
27
28SECTION .text
29
; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.

; SUM_SSE: accumulate sum and sum-of-squares of the element-wise difference
; between two pairs of word vectors.
;   %1, %3 = source rows (words); %2, %4 = reference rows (words)
;   %5     = running sum accumulator (words)
;   %6     = running SSE accumulator (dwords)
; Clobbers %1 and %3 (they end up holding the squared differences).
; The two difference chains are interleaved to hide pmaddwd latency.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4           ; %3 = src2 - dst2
  psubw                %1, %2           ; %1 = src1 - dst1
  paddw                %5, %3           ; sum += diff2
  pmaddwd              %3, %3           ; diff2^2, pairwise-summed to dwords
  paddw                %5, %1           ; sum += diff1
  pmaddwd              %1, %1           ; diff1^2, pairwise-summed to dwords
  paddd                %6, %3           ; sse += diff2^2
  paddd                %6, %1           ; sse += diff1^2
%endmacro
47
; STORE_AND_RET: horizontally reduce the accumulators built by SUM_SSE —
; m6 (sum, signed words) and m7 (SSE, dwords) — then store the SSE through
; the sse pointer argument and return the sum in rax.
;   %1 = block width W; W > 4 means all 8 word lanes of m6 are live,
;        W == 4 means only the low 4 lanes are live.
; Assumes m5 is the dedicated zero register (cleared by the caller); it is
; clobbered here to build the sign mask for word->dword extension.
%macro STORE_AND_RET 1
%if %1 > 4
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputing to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x (m5 was zero)
  movhlps              m3, m7           ; fold high half of sse into low
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4           ; sum now 4 dwords in m6
  pshufd               m3, m7, 0x1
  movhlps              m4, m6
  paddd                m7, m3           ; sse reduced to one dword
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1
  movd               [r1], m7           ; store sse
  paddd                m6, m4           ; sum reduced to one dword
  movd               raxd, m6           ; store sum as return value
%else ; 4xh
  ; Only the low 4 word/dword lanes are populated, so reduce within the
  ; low 64 bits first.
  pshuflw              m4, m6, 0xe
  pshuflw              m3, m7, 0xe
  paddw                m6, m4
  paddd                m7, m3
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd               [r1], m7           ; store sse
  pshuflw              m4, m6, 0xe
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value
%endif
  RET
%endmacro
84
; INC_SRC_BY_SRC_STRIDE: advance srcq by one source row.
; On x86-32 PIC builds the stride is read from its stack/memory arg slot
; (src_stridemp) — NOTE(review): presumably because the register is
; repurposed in that configuration, as the PIC paths below reuse arg regs;
; confirm against the full macro body.  Elsewhere it is in src_strideq.
%macro INC_SRC_BY_SRC_STRIDE  0
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
%else
  add                srcq, src_strideq
%endif
%endmacro
92
93%macro SUBPEL_VARIANCE 1-2 0 ; W
94%if cpuflag(ssse3)
95%define bilin_filter_m bilin_filter_m_ssse3
96%define filter_idx_shift 4
97%endif
98; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
99; 11, not 13, if the registers are ordered correctly. May make a minor speed
100; difference on Win64
101
102%if AOM_ARCH_X86_64
103  %if %2 == 1 ; avg
104    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
105                                        x_offset, y_offset, dst, dst_stride, \
106                                        sec, sec_stride, height, sse
107    %define sec_str sec_strideq
108  %else
109    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
110                                    x_offset, y_offset, dst, dst_stride, \
111                                    height, sse
112  %endif
113  %define block_height heightd
114  %define bilin_filter sseq
115%else
116  %if CONFIG_PIC=1
117    %if %2 == 1 ; avg
118      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
119                                          x_offset, y_offset, dst, dst_stride, \
120                                          sec, sec_stride, height, sse
121      %define block_height dword heightm
122      %define sec_str sec_stridemp
123    %else
124      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
125                                      x_offset, y_offset, dst, dst_stride, \
126                                      height, sse
127      %define block_height heightd
128    %endif
129
130    ; reuse argument stack space
131    %define g_bilin_filterm x_offsetm
132    %define g_pw_8m y_offsetm
133
134    ;Store bilin_filter and pw_8 location in stack
135    %if GET_GOT_DEFINED == 1
136      GET_GOT eax
137      add esp, 4                ; restore esp
138    %endif
139
140    lea ecx, [GLOBAL(bilin_filter_m)]
141    mov g_bilin_filterm, ecx
142
143    lea ecx, [GLOBAL(pw_8)]
144    mov g_pw_8m, ecx
145
146    LOAD_IF_USED 0, 1         ; load eax, ecx back
147  %else
148    %if %2 == 1 ; avg
149      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
150                                          x_offset, y_offset, \
151                                          dst, dst_stride, sec, sec_stride, \
152                                          height, sse
153      %define block_height dword heightm
154      %define sec_str sec_stridemp
155    %else
156      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
157                                      x_offset, y_offset, dst, dst_stride, \
158                                      height, sse
159      %define block_height heightd
160    %endif
161    %define bilin_filter bilin_filter_m
162  %endif
163%endif
164
165%if %1 == 4
166  %define movx movd
167%else
168  %define movx movh
169%endif
170
171  ASSERT               %1 <= 16         ; m6 overflows if w > 16
172  pxor                 m6, m6           ; sum
173  pxor                 m7, m7           ; sse
174  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
175  ; could perhaps use it for something more productive then
176  pxor                 m5, m5           ; dedicated zero register
177%if %1 < 16
178  sar                   block_height, 1
179%if %2 == 1 ; avg
180  shl             sec_str, 1
181%endif
182%endif
183
184  ; FIXME(rbultje) replace by jumptable?
185  test          x_offsetd, x_offsetd
186  jnz .x_nonzero
187  ; x_offset == 0
188  test          y_offsetd, y_offsetd
189  jnz .x_zero_y_nonzero
190
191  ; x_offset == 0 && y_offset == 0
192.x_zero_y_zero_loop:
193%if %1 == 16
194  movu                 m0, [srcq]
195  mova                 m1, [dstq]
196%if %2 == 1 ; avg
197  pavgb                m0, [secq]
198  punpckhbw            m3, m1, m5
199  punpcklbw            m1, m5
200%endif
201  punpckhbw            m2, m0, m5
202  punpcklbw            m0, m5
203
204%if %2 == 0 ; !avg
205  punpckhbw            m3, m1, m5
206  punpcklbw            m1, m5
207%endif
208  SUM_SSE              m0, m1, m2, m3, m6, m7
209
210  add                srcq, src_strideq
211  add                dstq, dst_strideq
212%else ; %1 < 16
213  movx                 m0, [srcq]
214%if %2 == 1 ; avg
215%if %1 > 4
216  movhps               m0, [srcq+src_strideq]
217%else ; 4xh
218  movx                 m1, [srcq+src_strideq]
219  punpckldq            m0, m1
220%endif
221%else ; !avg
222  movx                 m2, [srcq+src_strideq]
223%endif
224
225  movx                 m1, [dstq]
226  movx                 m3, [dstq+dst_strideq]
227
228%if %2 == 1 ; avg
229%if %1 > 4
230  pavgb                m0, [secq]
231%else
232  movh                 m2, [secq]
233  pavgb                m0, m2
234%endif
235  punpcklbw            m3, m5
236  punpcklbw            m1, m5
237%if %1 > 4
238  punpckhbw            m2, m0, m5
239  punpcklbw            m0, m5
240%else ; 4xh
241  punpcklbw            m0, m5
242  movhlps              m2, m0
243%endif
244%else ; !avg
245  punpcklbw            m0, m5
246  punpcklbw            m2, m5
247  punpcklbw            m3, m5
248  punpcklbw            m1, m5
249%endif
250  SUM_SSE              m0, m1, m2, m3, m6, m7
251
252  lea                srcq, [srcq+src_strideq*2]
253  lea                dstq, [dstq+dst_strideq*2]
254%endif
255%if %2 == 1 ; avg
256  add                secq, sec_str
257%endif
258  dec                   block_height
259  jg .x_zero_y_zero_loop
260  STORE_AND_RET %1
261
262.x_zero_y_nonzero:
263  cmp           y_offsetd, 4
264  jne .x_zero_y_nonhalf
265
266  ; x_offset == 0 && y_offset == 0.5
267.x_zero_y_half_loop:
268%if %1 == 16
269  movu                 m0, [srcq]
270  movu                 m4, [srcq+src_strideq]
271  mova                 m1, [dstq]
272  pavgb                m0, m4
273  punpckhbw            m3, m1, m5
274%if %2 == 1 ; avg
275  pavgb                m0, [secq]
276%endif
277  punpcklbw            m1, m5
278  punpckhbw            m2, m0, m5
279  punpcklbw            m0, m5
280  SUM_SSE              m0, m1, m2, m3, m6, m7
281
282  add                srcq, src_strideq
283  add                dstq, dst_strideq
284%else ; %1 < 16
285  movx                 m0, [srcq]
286  movx                 m2, [srcq+src_strideq]
287%if %2 == 1 ; avg
288%if %1 > 4
289  movhps               m2, [srcq+src_strideq*2]
290%else ; 4xh
291  movx                 m1, [srcq+src_strideq*2]
292  punpckldq            m2, m1
293%endif
294  movx                 m1, [dstq]
295%if %1 > 4
296  movlhps              m0, m2
297%else ; 4xh
298  punpckldq            m0, m2
299%endif
300  movx                 m3, [dstq+dst_strideq]
301  pavgb                m0, m2
302  punpcklbw            m1, m5
303%if %1 > 4
304  pavgb                m0, [secq]
305  punpcklbw            m3, m5
306  punpckhbw            m2, m0, m5
307  punpcklbw            m0, m5
308%else ; 4xh
309  movh                 m4, [secq]
310  pavgb                m0, m4
311  punpcklbw            m3, m5
312  punpcklbw            m0, m5
313  movhlps              m2, m0
314%endif
315%else ; !avg
316  movx                 m4, [srcq+src_strideq*2]
317  movx                 m1, [dstq]
318  pavgb                m0, m2
319  movx                 m3, [dstq+dst_strideq]
320  pavgb                m2, m4
321  punpcklbw            m0, m5
322  punpcklbw            m2, m5
323  punpcklbw            m3, m5
324  punpcklbw            m1, m5
325%endif
326  SUM_SSE              m0, m1, m2, m3, m6, m7
327
328  lea                srcq, [srcq+src_strideq*2]
329  lea                dstq, [dstq+dst_strideq*2]
330%endif
331%if %2 == 1 ; avg
332  add                secq, sec_str
333%endif
334  dec                   block_height
335  jg .x_zero_y_half_loop
336  STORE_AND_RET %1
337
338.x_zero_y_nonhalf:
339  ; x_offset == 0 && y_offset == bilin interpolation
340%if AOM_ARCH_X86_64
341  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
342%endif
343  shl           y_offsetd, filter_idx_shift
344%if AOM_ARCH_X86_64 && %1 > 4
345  mova                 m8, [bilin_filter+y_offsetq]
346%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
347  mova                 m9, [bilin_filter+y_offsetq+16]
348%endif
349  mova                m10, [GLOBAL(pw_8)]
350%define filter_y_a m8
351%define filter_y_b m9
352%define filter_rnd m10
353%else ; x86-32 or mmx
354%if AOM_ARCH_X86=1 && CONFIG_PIC=1
355; x_offset == 0, reuse x_offset reg
356%define tempq x_offsetq
357  add y_offsetq, g_bilin_filterm
358%define filter_y_a [y_offsetq]
359%define filter_y_b [y_offsetq+16]
360  mov tempq, g_pw_8m
361%define filter_rnd [tempq]
362%else
363  add           y_offsetq, bilin_filter
364%define filter_y_a [y_offsetq]
365%define filter_y_b [y_offsetq+16]
366%define filter_rnd [GLOBAL(pw_8)]
367%endif
368%endif
369
370.x_zero_y_other_loop:
371%if %1 == 16
372  movu                 m0, [srcq]
373  movu                 m4, [srcq+src_strideq]
374  mova                 m1, [dstq]
375%if cpuflag(ssse3)
376  punpckhbw            m2, m0, m4
377  punpcklbw            m0, m4
378  pmaddubsw            m2, filter_y_a
379  pmaddubsw            m0, filter_y_a
380  paddw                m2, filter_rnd
381  paddw                m0, filter_rnd
382%else
383  punpckhbw            m2, m0, m5
384  punpckhbw            m3, m4, m5
385  punpcklbw            m0, m5
386  punpcklbw            m4, m5
387  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
388  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
389  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
390  ; slightly faster because of pmullw latency. It would also cut our rodata
391  ; tables in half for this function, and save 1-2 registers on x86-64.
392  pmullw               m2, filter_y_a
393  pmullw               m3, filter_y_b
394  paddw                m2, filter_rnd
395  pmullw               m0, filter_y_a
396  pmullw               m4, filter_y_b
397  paddw                m0, filter_rnd
398  paddw                m2, m3
399  paddw                m0, m4
400%endif
401  psraw                m2, 4
402  psraw                m0, 4
403%if %2 == 1 ; avg
404  ; FIXME(rbultje) pipeline
405  packuswb             m0, m2
406  pavgb                m0, [secq]
407  punpckhbw            m2, m0, m5
408  punpcklbw            m0, m5
409%endif
410  punpckhbw            m3, m1, m5
411  punpcklbw            m1, m5
412  SUM_SSE              m0, m1, m2, m3, m6, m7
413
414  add                srcq, src_strideq
415  add                dstq, dst_strideq
416%else ; %1 < 16
417  movx                 m0, [srcq]
418  movx                 m2, [srcq+src_strideq]
419  movx                 m4, [srcq+src_strideq*2]
420  movx                 m3, [dstq+dst_strideq]
421%if cpuflag(ssse3)
422  movx                 m1, [dstq]
423  punpcklbw            m0, m2
424  punpcklbw            m2, m4
425  pmaddubsw            m0, filter_y_a
426  pmaddubsw            m2, filter_y_a
427  punpcklbw            m3, m5
428  paddw                m2, filter_rnd
429  paddw                m0, filter_rnd
430%else
431  punpcklbw            m0, m5
432  punpcklbw            m2, m5
433  punpcklbw            m4, m5
434  pmullw               m0, filter_y_a
435  pmullw               m1, m2, filter_y_b
436  punpcklbw            m3, m5
437  paddw                m0, filter_rnd
438  pmullw               m2, filter_y_a
439  pmullw               m4, filter_y_b
440  paddw                m0, m1
441  paddw                m2, filter_rnd
442  movx                 m1, [dstq]
443  paddw                m2, m4
444%endif
445  psraw                m0, 4
446  psraw                m2, 4
447%if %2 == 1 ; avg
448  ; FIXME(rbultje) pipeline
449%if %1 == 4
450  movlhps              m0, m2
451%endif
452  packuswb             m0, m2
453%if %1 > 4
454  pavgb                m0, [secq]
455  punpckhbw            m2, m0, m5
456  punpcklbw            m0, m5
457%else ; 4xh
458  movh                 m2, [secq]
459  pavgb                m0, m2
460  punpcklbw            m0, m5
461  movhlps              m2, m0
462%endif
463%endif
464  punpcklbw            m1, m5
465  SUM_SSE              m0, m1, m2, m3, m6, m7
466
467  lea                srcq, [srcq+src_strideq*2]
468  lea                dstq, [dstq+dst_strideq*2]
469%endif
470%if %2 == 1 ; avg
471  add                secq, sec_str
472%endif
473  dec                   block_height
474  jg .x_zero_y_other_loop
475%undef filter_y_a
476%undef filter_y_b
477%undef filter_rnd
478  STORE_AND_RET %1
479
480.x_nonzero:
481  cmp           x_offsetd, 4
482  jne .x_nonhalf
483  ; x_offset == 0.5
484  test          y_offsetd, y_offsetd
485  jnz .x_half_y_nonzero
486
487  ; x_offset == 0.5 && y_offset == 0
488.x_half_y_zero_loop:
489%if %1 == 16
490  movu                 m0, [srcq]
491  movu                 m4, [srcq+1]
492  mova                 m1, [dstq]
493  pavgb                m0, m4
494  punpckhbw            m3, m1, m5
495%if %2 == 1 ; avg
496  pavgb                m0, [secq]
497%endif
498  punpcklbw            m1, m5
499  punpckhbw            m2, m0, m5
500  punpcklbw            m0, m5
501  SUM_SSE              m0, m1, m2, m3, m6, m7
502
503  add                srcq, src_strideq
504  add                dstq, dst_strideq
505%else ; %1 < 16
506  movx                 m0, [srcq]
507  movx                 m4, [srcq+1]
508%if %2 == 1 ; avg
509%if %1 > 4
510  movhps               m0, [srcq+src_strideq]
511  movhps               m4, [srcq+src_strideq+1]
512%else ; 4xh
513  movx                 m1, [srcq+src_strideq]
514  punpckldq            m0, m1
515  movx                 m2, [srcq+src_strideq+1]
516  punpckldq            m4, m2
517%endif
518  movx                 m1, [dstq]
519  movx                 m3, [dstq+dst_strideq]
520  pavgb                m0, m4
521  punpcklbw            m3, m5
522%if %1 > 4
523  pavgb                m0, [secq]
524  punpcklbw            m1, m5
525  punpckhbw            m2, m0, m5
526  punpcklbw            m0, m5
527%else ; 4xh
528  movh                 m2, [secq]
529  pavgb                m0, m2
530  punpcklbw            m1, m5
531  punpcklbw            m0, m5
532  movhlps              m2, m0
533%endif
534%else ; !avg
535  movx                 m2, [srcq+src_strideq]
536  movx                 m1, [dstq]
537  pavgb                m0, m4
538  movx                 m4, [srcq+src_strideq+1]
539  movx                 m3, [dstq+dst_strideq]
540  pavgb                m2, m4
541  punpcklbw            m0, m5
542  punpcklbw            m2, m5
543  punpcklbw            m3, m5
544  punpcklbw            m1, m5
545%endif
546  SUM_SSE              m0, m1, m2, m3, m6, m7
547
548  lea                srcq, [srcq+src_strideq*2]
549  lea                dstq, [dstq+dst_strideq*2]
550%endif
551%if %2 == 1 ; avg
552  add                secq, sec_str
553%endif
554  dec                   block_height
555  jg .x_half_y_zero_loop
556  STORE_AND_RET %1
557
558.x_half_y_nonzero:
559  cmp           y_offsetd, 4
560  jne .x_half_y_nonhalf
561
562  ; x_offset == 0.5 && y_offset == 0.5
563%if %1 == 16
564  movu                 m0, [srcq]
565  movu                 m3, [srcq+1]
566  add                srcq, src_strideq
567  pavgb                m0, m3
568.x_half_y_half_loop:
569  movu                 m4, [srcq]
570  movu                 m3, [srcq+1]
571  mova                 m1, [dstq]
572  pavgb                m4, m3
573  punpckhbw            m3, m1, m5
574  pavgb                m0, m4
575%if %2 == 1 ; avg
576  punpcklbw            m1, m5
577  pavgb                m0, [secq]
578  punpckhbw            m2, m0, m5
579  punpcklbw            m0, m5
580%else
581  punpckhbw            m2, m0, m5
582  punpcklbw            m0, m5
583  punpcklbw            m1, m5
584%endif
585  SUM_SSE              m0, m1, m2, m3, m6, m7
586  mova                 m0, m4
587
588  add                srcq, src_strideq
589  add                dstq, dst_strideq
590%else ; %1 < 16
591  movx                 m0, [srcq]
592  movx                 m3, [srcq+1]
593  add                srcq, src_strideq
594  pavgb                m0, m3
595.x_half_y_half_loop:
596  movx                 m2, [srcq]
597  movx                 m3, [srcq+1]
598%if %2 == 1 ; avg
599%if %1 > 4
600  movhps               m2, [srcq+src_strideq]
601  movhps               m3, [srcq+src_strideq+1]
602%else
603  movx                 m1, [srcq+src_strideq]
604  punpckldq            m2, m1
605  movx                 m1, [srcq+src_strideq+1]
606  punpckldq            m3, m1
607%endif
608  pavgb                m2, m3
609%if %1 > 4
610  movlhps              m0, m2
611  movhlps              m4, m2
612%else ; 4xh
613  punpckldq            m0, m2
614  pshuflw              m4, m2, 0xe
615%endif
616  movx                 m1, [dstq]
617  pavgb                m0, m2
618  movx                 m3, [dstq+dst_strideq]
619%if %1 > 4
620  pavgb                m0, [secq]
621%else
622  movh                 m2, [secq]
623  pavgb                m0, m2
624%endif
625  punpcklbw            m3, m5
626  punpcklbw            m1, m5
627%if %1 > 4
628  punpckhbw            m2, m0, m5
629  punpcklbw            m0, m5
630%else
631  punpcklbw            m0, m5
632  movhlps              m2, m0
633%endif
634%else ; !avg
635  movx                 m4, [srcq+src_strideq]
636  movx                 m1, [srcq+src_strideq+1]
637  pavgb                m2, m3
638  pavgb                m4, m1
639  pavgb                m0, m2
640  pavgb                m2, m4
641  movx                 m1, [dstq]
642  movx                 m3, [dstq+dst_strideq]
643  punpcklbw            m0, m5
644  punpcklbw            m2, m5
645  punpcklbw            m3, m5
646  punpcklbw            m1, m5
647%endif
648  SUM_SSE              m0, m1, m2, m3, m6, m7
649  mova                 m0, m4
650
651  lea                srcq, [srcq+src_strideq*2]
652  lea                dstq, [dstq+dst_strideq*2]
653%endif
654%if %2 == 1 ; avg
655  add                secq, sec_str
656%endif
657  dec                   block_height
658  jg .x_half_y_half_loop
659  STORE_AND_RET %1
660
661.x_half_y_nonhalf:
662  ; x_offset == 0.5 && y_offset == bilin interpolation
663%if AOM_ARCH_X86_64
664  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
665%endif
666  shl           y_offsetd, filter_idx_shift
667%if AOM_ARCH_X86_64 && %1 > 4
668  mova                 m8, [bilin_filter+y_offsetq]
669%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
670  mova                 m9, [bilin_filter+y_offsetq+16]
671%endif
672  mova                m10, [GLOBAL(pw_8)]
673%define filter_y_a m8
674%define filter_y_b m9
675%define filter_rnd m10
676%else  ;x86_32
677%if AOM_ARCH_X86=1 && CONFIG_PIC=1
678; x_offset == 0.5. We can reuse x_offset reg
679%define tempq x_offsetq
680  add y_offsetq, g_bilin_filterm
681%define filter_y_a [y_offsetq]
682%define filter_y_b [y_offsetq+16]
683  mov tempq, g_pw_8m
684%define filter_rnd [tempq]
685%else
686  add           y_offsetq, bilin_filter
687%define filter_y_a [y_offsetq]
688%define filter_y_b [y_offsetq+16]
689%define filter_rnd [GLOBAL(pw_8)]
690%endif
691%endif
692
693%if %1 == 16
694  movu                 m0, [srcq]
695  movu                 m3, [srcq+1]
696  add                srcq, src_strideq
697  pavgb                m0, m3
698.x_half_y_other_loop:
699  movu                 m4, [srcq]
700  movu                 m2, [srcq+1]
701  mova                 m1, [dstq]
702  pavgb                m4, m2
703%if cpuflag(ssse3)
704  punpckhbw            m2, m0, m4
705  punpcklbw            m0, m4
706  pmaddubsw            m2, filter_y_a
707  pmaddubsw            m0, filter_y_a
708  paddw                m2, filter_rnd
709  paddw                m0, filter_rnd
710  psraw                m2, 4
711%else
712  punpckhbw            m2, m0, m5
713  punpckhbw            m3, m4, m5
714  pmullw               m2, filter_y_a
715  pmullw               m3, filter_y_b
716  paddw                m2, filter_rnd
717  punpcklbw            m0, m5
718  paddw                m2, m3
719  punpcklbw            m3, m4, m5
720  pmullw               m0, filter_y_a
721  pmullw               m3, filter_y_b
722  paddw                m0, filter_rnd
723  psraw                m2, 4
724  paddw                m0, m3
725%endif
726  punpckhbw            m3, m1, m5
727  psraw                m0, 4
728%if %2 == 1 ; avg
729  ; FIXME(rbultje) pipeline
730  packuswb             m0, m2
731  pavgb                m0, [secq]
732  punpckhbw            m2, m0, m5
733  punpcklbw            m0, m5
734%endif
735  punpcklbw            m1, m5
736  SUM_SSE              m0, m1, m2, m3, m6, m7
737  mova                 m0, m4
738
739  add                srcq, src_strideq
740  add                dstq, dst_strideq
741%else ; %1 < 16
742  movx                 m0, [srcq]
743  movx                 m3, [srcq+1]
744  add                srcq, src_strideq
745  pavgb                m0, m3
746%if notcpuflag(ssse3)
747  punpcklbw            m0, m5
748%endif
749.x_half_y_other_loop:
750  movx                 m2, [srcq]
751  movx                 m1, [srcq+1]
752  movx                 m4, [srcq+src_strideq]
753  movx                 m3, [srcq+src_strideq+1]
754  pavgb                m2, m1
755  pavgb                m4, m3
756  movx                 m3, [dstq+dst_strideq]
757%if cpuflag(ssse3)
758  movx                 m1, [dstq]
759  punpcklbw            m0, m2
760  punpcklbw            m2, m4
761  pmaddubsw            m0, filter_y_a
762  pmaddubsw            m2, filter_y_a
763  punpcklbw            m3, m5
764  paddw                m0, filter_rnd
765  paddw                m2, filter_rnd
766%else
767  punpcklbw            m2, m5
768  punpcklbw            m4, m5
769  pmullw               m0, filter_y_a
770  pmullw               m1, m2, filter_y_b
771  punpcklbw            m3, m5
772  paddw                m0, filter_rnd
773  pmullw               m2, filter_y_a
774  paddw                m0, m1
775  pmullw               m1, m4, filter_y_b
776  paddw                m2, filter_rnd
777  paddw                m2, m1
778  movx                 m1, [dstq]
779%endif
780  psraw                m0, 4
781  psraw                m2, 4
782%if %2 == 1 ; avg
783  ; FIXME(rbultje) pipeline
784%if %1 == 4
785  movlhps              m0, m2
786%endif
787  packuswb             m0, m2
788%if %1 > 4
789  pavgb                m0, [secq]
790  punpckhbw            m2, m0, m5
791  punpcklbw            m0, m5
792%else
793  movh                 m2, [secq]
794  pavgb                m0, m2
795  punpcklbw            m0, m5
796  movhlps              m2, m0
797%endif
798%endif
799  punpcklbw            m1, m5
800  SUM_SSE              m0, m1, m2, m3, m6, m7
801  mova                 m0, m4
802
803  lea                srcq, [srcq+src_strideq*2]
804  lea                dstq, [dstq+dst_strideq*2]
805%endif
806%if %2 == 1 ; avg
807  add                secq, sec_str
808%endif
809  dec                   block_height
810  jg .x_half_y_other_loop
811%undef filter_y_a
812%undef filter_y_b
813%undef filter_rnd
814  STORE_AND_RET %1
815
816.x_nonhalf:
817  test          y_offsetd, y_offsetd
818  jnz .x_nonhalf_y_nonzero
819
820  ; x_offset == bilin interpolation && y_offset == 0
821%if AOM_ARCH_X86_64
822  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
823%endif
824  shl           x_offsetd, filter_idx_shift
825%if AOM_ARCH_X86_64 && %1 > 4
826  mova                 m8, [bilin_filter+x_offsetq]
827%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
828  mova                 m9, [bilin_filter+x_offsetq+16]
829%endif
830  mova                m10, [GLOBAL(pw_8)]
831%define filter_x_a m8
832%define filter_x_b m9
833%define filter_rnd m10
834%else    ; x86-32
835%if AOM_ARCH_X86=1 && CONFIG_PIC=1
836;y_offset == 0. We can reuse y_offset reg.
837%define tempq y_offsetq
838  add x_offsetq, g_bilin_filterm
839%define filter_x_a [x_offsetq]
840%define filter_x_b [x_offsetq+16]
841  mov tempq, g_pw_8m
842%define filter_rnd [tempq]
843%else
844  add           x_offsetq, bilin_filter
845%define filter_x_a [x_offsetq]
846%define filter_x_b [x_offsetq+16]
847%define filter_rnd [GLOBAL(pw_8)]
848%endif
849%endif
850
851.x_other_y_zero_loop:
852%if %1 == 16
853  movu                 m0, [srcq]
854  movu                 m4, [srcq+1]
855  mova                 m1, [dstq]
856%if cpuflag(ssse3)
857  punpckhbw            m2, m0, m4
858  punpcklbw            m0, m4
859  pmaddubsw            m2, filter_x_a
860  pmaddubsw            m0, filter_x_a
861  paddw                m2, filter_rnd
862  paddw                m0, filter_rnd
863%else
864  punpckhbw            m2, m0, m5
865  punpckhbw            m3, m4, m5
866  punpcklbw            m0, m5
867  punpcklbw            m4, m5
868  pmullw               m2, filter_x_a
869  pmullw               m3, filter_x_b
870  paddw                m2, filter_rnd
871  pmullw               m0, filter_x_a
872  pmullw               m4, filter_x_b
873  paddw                m0, filter_rnd
874  paddw                m2, m3
875  paddw                m0, m4
876%endif
877  psraw                m2, 4
878  psraw                m0, 4
879%if %2 == 1 ; avg
880  ; FIXME(rbultje) pipeline
881  packuswb             m0, m2
882  pavgb                m0, [secq]
883  punpckhbw            m2, m0, m5
884  punpcklbw            m0, m5
885%endif
886  punpckhbw            m3, m1, m5
887  punpcklbw            m1, m5
888  SUM_SSE              m0, m1, m2, m3, m6, m7
889
890  add                srcq, src_strideq
891  add                dstq, dst_strideq
892%else ; %1 < 16
893  movx                 m0, [srcq]
894  movx                 m1, [srcq+1]
895  movx                 m2, [srcq+src_strideq]
896  movx                 m4, [srcq+src_strideq+1]
897  movx                 m3, [dstq+dst_strideq]
898%if cpuflag(ssse3)
899  punpcklbw            m0, m1
900  movx                 m1, [dstq]
901  punpcklbw            m2, m4
902  pmaddubsw            m0, filter_x_a
903  pmaddubsw            m2, filter_x_a
904  punpcklbw            m3, m5
905  paddw                m0, filter_rnd
906  paddw                m2, filter_rnd
907%else
908  punpcklbw            m0, m5
909  punpcklbw            m1, m5
910  punpcklbw            m2, m5
911  punpcklbw            m4, m5
912  pmullw               m0, filter_x_a
913  pmullw               m1, filter_x_b
914  punpcklbw            m3, m5
915  paddw                m0, filter_rnd
916  pmullw               m2, filter_x_a
917  pmullw               m4, filter_x_b
918  paddw                m0, m1
919  paddw                m2, filter_rnd
920  movx                 m1, [dstq]
921  paddw                m2, m4
922%endif
923  psraw                m0, 4
924  psraw                m2, 4
925%if %2 == 1 ; avg
926  ; FIXME(rbultje) pipeline
927%if %1 == 4
928  movlhps              m0, m2
929%endif
930  packuswb             m0, m2
931%if %1 > 4
932  pavgb                m0, [secq]
933  punpckhbw            m2, m0, m5
934  punpcklbw            m0, m5
935%else
936  movh                 m2, [secq]
937  pavgb                m0, m2
938  punpcklbw            m0, m5
939  movhlps              m2, m0
940%endif
941%endif
942  punpcklbw            m1, m5
943  SUM_SSE              m0, m1, m2, m3, m6, m7
944
945  lea                srcq, [srcq+src_strideq*2]
946  lea                dstq, [dstq+dst_strideq*2]
947%endif
948%if %2 == 1 ; avg
949  add                secq, sec_str
950%endif
951  dec                   block_height
952  jg .x_other_y_zero_loop
953%undef filter_x_a
954%undef filter_x_b
955%undef filter_rnd
956  STORE_AND_RET %1
957
958.x_nonhalf_y_nonzero:
959  cmp           y_offsetd, 4
960  jne .x_nonhalf_y_nonhalf
961
962  ; x_offset == bilin interpolation && y_offset == 0.5
963%if AOM_ARCH_X86_64
964  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
965%endif
966  shl           x_offsetd, filter_idx_shift
967%if AOM_ARCH_X86_64 && %1 > 4
968  mova                 m8, [bilin_filter+x_offsetq]
969%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
970  mova                 m9, [bilin_filter+x_offsetq+16]
971%endif
972  mova                m10, [GLOBAL(pw_8)]
973%define filter_x_a m8
974%define filter_x_b m9
975%define filter_rnd m10
976%else    ; x86-32
977%if AOM_ARCH_X86=1 && CONFIG_PIC=1
978; y_offset == 0.5. We can reuse y_offset reg.
979%define tempq y_offsetq
980  add x_offsetq, g_bilin_filterm
981%define filter_x_a [x_offsetq]
982%define filter_x_b [x_offsetq+16]
983  mov tempq, g_pw_8m
984%define filter_rnd [tempq]
985%else
986  add           x_offsetq, bilin_filter
987%define filter_x_a [x_offsetq]
988%define filter_x_b [x_offsetq+16]
989%define filter_rnd [GLOBAL(pw_8)]
990%endif
991%endif
992
993%if %1 == 16
994  movu                 m0, [srcq]
995  movu                 m1, [srcq+1]
996%if cpuflag(ssse3)
997  punpckhbw            m2, m0, m1
998  punpcklbw            m0, m1
999  pmaddubsw            m2, filter_x_a
1000  pmaddubsw            m0, filter_x_a
1001  paddw                m2, filter_rnd
1002  paddw                m0, filter_rnd
1003%else
1004  punpckhbw            m2, m0, m5
1005  punpckhbw            m3, m1, m5
1006  punpcklbw            m0, m5
1007  punpcklbw            m1, m5
1008  pmullw               m0, filter_x_a
1009  pmullw               m1, filter_x_b
1010  paddw                m0, filter_rnd
1011  pmullw               m2, filter_x_a
1012  pmullw               m3, filter_x_b
1013  paddw                m2, filter_rnd
1014  paddw                m0, m1
1015  paddw                m2, m3
1016%endif
1017  psraw                m0, 4
1018  psraw                m2, 4
1019  add                srcq, src_strideq
1020  packuswb             m0, m2
1021.x_other_y_half_loop:
1022  movu                 m4, [srcq]
1023  movu                 m3, [srcq+1]
1024%if cpuflag(ssse3)
1025  mova                 m1, [dstq]
1026  punpckhbw            m2, m4, m3
1027  punpcklbw            m4, m3
1028  pmaddubsw            m2, filter_x_a
1029  pmaddubsw            m4, filter_x_a
1030  paddw                m2, filter_rnd
1031  paddw                m4, filter_rnd
1032  psraw                m2, 4
1033  psraw                m4, 4
1034  packuswb             m4, m2
1035  pavgb                m0, m4
1036  punpckhbw            m3, m1, m5
1037  punpcklbw            m1, m5
1038%else
1039  punpckhbw            m2, m4, m5
1040  punpckhbw            m1, m3, m5
1041  punpcklbw            m4, m5
1042  punpcklbw            m3, m5
1043  pmullw               m4, filter_x_a
1044  pmullw               m3, filter_x_b
1045  paddw                m4, filter_rnd
1046  pmullw               m2, filter_x_a
1047  pmullw               m1, filter_x_b
1048  paddw                m2, filter_rnd
1049  paddw                m4, m3
1050  paddw                m2, m1
1051  mova                 m1, [dstq]
1052  psraw                m4, 4
1053  psraw                m2, 4
1054  punpckhbw            m3, m1, m5
1055  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
1056  ; have a 1-register shortage to be able to store the backup of the bilin
1057  ; filtered second line as words as cache for the next line. Packing into
1058  ; a byte costs 1 pack and 2 unpacks, but saves a register.
1059  packuswb             m4, m2
1060  punpcklbw            m1, m5
1061  pavgb                m0, m4
1062%endif
1063%if %2 == 1 ; avg
1064  ; FIXME(rbultje) pipeline
1065  pavgb                m0, [secq]
1066%endif
1067  punpckhbw            m2, m0, m5
1068  punpcklbw            m0, m5
1069  SUM_SSE              m0, m1, m2, m3, m6, m7
1070  mova                 m0, m4
1071
1072  add                srcq, src_strideq
1073  add                dstq, dst_strideq
1074%else ; %1 < 16
1075  movx                 m0, [srcq]
1076  movx                 m1, [srcq+1]
1077%if cpuflag(ssse3)
1078  punpcklbw            m0, m1
1079  pmaddubsw            m0, filter_x_a
1080  paddw                m0, filter_rnd
1081%else
1082  punpcklbw            m0, m5
1083  punpcklbw            m1, m5
1084  pmullw               m0, filter_x_a
1085  pmullw               m1, filter_x_b
1086  paddw                m0, filter_rnd
1087  paddw                m0, m1
1088%endif
1089  add                srcq, src_strideq
1090  psraw                m0, 4
1091.x_other_y_half_loop:
1092  movx                 m2, [srcq]
1093  movx                 m1, [srcq+1]
1094  movx                 m4, [srcq+src_strideq]
1095  movx                 m3, [srcq+src_strideq+1]
1096%if cpuflag(ssse3)
1097  punpcklbw            m2, m1
1098  punpcklbw            m4, m3
1099  pmaddubsw            m2, filter_x_a
1100  pmaddubsw            m4, filter_x_a
1101  movx                 m1, [dstq]
1102  movx                 m3, [dstq+dst_strideq]
1103  paddw                m2, filter_rnd
1104  paddw                m4, filter_rnd
1105%else
1106  punpcklbw            m2, m5
1107  punpcklbw            m1, m5
1108  punpcklbw            m4, m5
1109  punpcklbw            m3, m5
1110  pmullw               m2, filter_x_a
1111  pmullw               m1, filter_x_b
1112  paddw                m2, filter_rnd
1113  pmullw               m4, filter_x_a
1114  pmullw               m3, filter_x_b
1115  paddw                m4, filter_rnd
1116  paddw                m2, m1
1117  movx                 m1, [dstq]
1118  paddw                m4, m3
1119  movx                 m3, [dstq+dst_strideq]
1120%endif
1121  psraw                m2, 4
1122  psraw                m4, 4
1123  pavgw                m0, m2
1124  pavgw                m2, m4
1125%if %2 == 1 ; avg
1126  ; FIXME(rbultje) pipeline - also consider going to bytes here
1127%if %1 == 4
1128  movlhps              m0, m2
1129%endif
1130  packuswb             m0, m2
1131%if %1 > 4
1132  pavgb                m0, [secq]
1133  punpckhbw            m2, m0, m5
1134  punpcklbw            m0, m5
1135%else
1136  movh                 m2, [secq]
1137  pavgb                m0, m2
1138  punpcklbw            m0, m5
1139  movhlps              m2, m0
1140%endif
1141%endif
1142  punpcklbw            m3, m5
1143  punpcklbw            m1, m5
1144  SUM_SSE              m0, m1, m2, m3, m6, m7
1145  mova                 m0, m4
1146
1147  lea                srcq, [srcq+src_strideq*2]
1148  lea                dstq, [dstq+dst_strideq*2]
1149%endif
1150%if %2 == 1 ; avg
1151  add                secq, sec_str
1152%endif
1153  dec                   block_height
1154  jg .x_other_y_half_loop
1155%undef filter_x_a
1156%undef filter_x_b
1157%undef filter_rnd
1158  STORE_AND_RET %1
1159
1160.x_nonhalf_y_nonhalf:
1161%if AOM_ARCH_X86_64
1162  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
1163%endif
1164  shl           x_offsetd, filter_idx_shift
1165  shl           y_offsetd, filter_idx_shift
1166%if AOM_ARCH_X86_64 && %1 > 4
1167  mova                 m8, [bilin_filter+x_offsetq]
1168%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
1169  mova                 m9, [bilin_filter+x_offsetq+16]
1170%endif
1171  mova                m10, [bilin_filter+y_offsetq]
1172%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
1173  mova                m11, [bilin_filter+y_offsetq+16]
1174%endif
1175  mova                m12, [GLOBAL(pw_8)]
1176%define filter_x_a m8
1177%define filter_x_b m9
1178%define filter_y_a m10
1179%define filter_y_b m11
1180%define filter_rnd m12
1181%else   ; x86-32
1182%if AOM_ARCH_X86=1 && CONFIG_PIC=1
1183; In this case, there is NO unused register. Used src_stride register. Later,
1184; src_stride has to be loaded from stack when it is needed.
1185%define tempq src_strideq
1186  mov tempq, g_bilin_filterm
1187  add           x_offsetq, tempq
1188  add           y_offsetq, tempq
1189%define filter_x_a [x_offsetq]
1190%define filter_x_b [x_offsetq+16]
1191%define filter_y_a [y_offsetq]
1192%define filter_y_b [y_offsetq+16]
1193
1194  mov tempq, g_pw_8m
1195%define filter_rnd [tempq]
1196%else
1197  add           x_offsetq, bilin_filter
1198  add           y_offsetq, bilin_filter
1199%define filter_x_a [x_offsetq]
1200%define filter_x_b [x_offsetq+16]
1201%define filter_y_a [y_offsetq]
1202%define filter_y_b [y_offsetq+16]
1203%define filter_rnd [GLOBAL(pw_8)]
1204%endif
1205%endif
1206
1207  ; x_offset == bilin interpolation && y_offset == bilin interpolation
1208%if %1 == 16
1209  movu                 m0, [srcq]
1210  movu                 m1, [srcq+1]
1211%if cpuflag(ssse3)
1212  punpckhbw            m2, m0, m1
1213  punpcklbw            m0, m1
1214  pmaddubsw            m2, filter_x_a
1215  pmaddubsw            m0, filter_x_a
1216  paddw                m2, filter_rnd
1217  paddw                m0, filter_rnd
1218%else
1219  punpckhbw            m2, m0, m5
1220  punpckhbw            m3, m1, m5
1221  punpcklbw            m0, m5
1222  punpcklbw            m1, m5
1223  pmullw               m0, filter_x_a
1224  pmullw               m1, filter_x_b
1225  paddw                m0, filter_rnd
1226  pmullw               m2, filter_x_a
1227  pmullw               m3, filter_x_b
1228  paddw                m2, filter_rnd
1229  paddw                m0, m1
1230  paddw                m2, m3
1231%endif
1232  psraw                m0, 4
1233  psraw                m2, 4
1234
1235  INC_SRC_BY_SRC_STRIDE
1236
1237  packuswb             m0, m2
1238.x_other_y_other_loop:
1239%if cpuflag(ssse3)
1240  movu                 m4, [srcq]
1241  movu                 m3, [srcq+1]
1242  mova                 m1, [dstq]
1243  punpckhbw            m2, m4, m3
1244  punpcklbw            m4, m3
1245  pmaddubsw            m2, filter_x_a
1246  pmaddubsw            m4, filter_x_a
1247  punpckhbw            m3, m1, m5
1248  paddw                m2, filter_rnd
1249  paddw                m4, filter_rnd
1250  psraw                m2, 4
1251  psraw                m4, 4
1252  packuswb             m4, m2
1253  punpckhbw            m2, m0, m4
1254  punpcklbw            m0, m4
1255  pmaddubsw            m2, filter_y_a
1256  pmaddubsw            m0, filter_y_a
1257  punpcklbw            m1, m5
1258  paddw                m2, filter_rnd
1259  paddw                m0, filter_rnd
1260  psraw                m2, 4
1261  psraw                m0, 4
1262%else
1263  movu                 m3, [srcq]
1264  movu                 m4, [srcq+1]
1265  punpckhbw            m1, m3, m5
1266  punpckhbw            m2, m4, m5
1267  punpcklbw            m3, m5
1268  punpcklbw            m4, m5
1269  pmullw               m3, filter_x_a
1270  pmullw               m4, filter_x_b
1271  paddw                m3, filter_rnd
1272  pmullw               m1, filter_x_a
1273  pmullw               m2, filter_x_b
1274  paddw                m1, filter_rnd
1275  paddw                m3, m4
1276  paddw                m1, m2
1277  psraw                m3, 4
1278  psraw                m1, 4
1279  packuswb             m4, m3, m1
1280  punpckhbw            m2, m0, m5
1281  punpcklbw            m0, m5
1282  pmullw               m2, filter_y_a
1283  pmullw               m1, filter_y_b
1284  paddw                m2, filter_rnd
1285  pmullw               m0, filter_y_a
1286  pmullw               m3, filter_y_b
1287  paddw                m2, m1
1288  mova                 m1, [dstq]
1289  paddw                m0, filter_rnd
1290  psraw                m2, 4
1291  paddw                m0, m3
1292  punpckhbw            m3, m1, m5
1293  psraw                m0, 4
1294  punpcklbw            m1, m5
1295%endif
1296%if %2 == 1 ; avg
1297  ; FIXME(rbultje) pipeline
1298  packuswb             m0, m2
1299  pavgb                m0, [secq]
1300  punpckhbw            m2, m0, m5
1301  punpcklbw            m0, m5
1302%endif
1303  SUM_SSE              m0, m1, m2, m3, m6, m7
1304  mova                 m0, m4
1305
1306  INC_SRC_BY_SRC_STRIDE
1307  add                dstq, dst_strideq
1308%else ; %1 < 16
1309  movx                 m0, [srcq]
1310  movx                 m1, [srcq+1]
1311%if cpuflag(ssse3)
1312  punpcklbw            m0, m1
1313  pmaddubsw            m0, filter_x_a
1314  paddw                m0, filter_rnd
1315%else
1316  punpcklbw            m0, m5
1317  punpcklbw            m1, m5
1318  pmullw               m0, filter_x_a
1319  pmullw               m1, filter_x_b
1320  paddw                m0, filter_rnd
1321  paddw                m0, m1
1322%endif
1323  psraw                m0, 4
1324%if cpuflag(ssse3)
1325  packuswb             m0, m0
1326%endif
1327
1328  INC_SRC_BY_SRC_STRIDE
1329
1330.x_other_y_other_loop:
1331  movx                 m2, [srcq]
1332  movx                 m1, [srcq+1]
1333
1334  INC_SRC_BY_SRC_STRIDE
1335  movx                 m4, [srcq]
1336  movx                 m3, [srcq+1]
1337
1338%if cpuflag(ssse3)
1339  punpcklbw            m2, m1
1340  punpcklbw            m4, m3
1341  pmaddubsw            m2, filter_x_a
1342  pmaddubsw            m4, filter_x_a
1343  movx                 m3, [dstq+dst_strideq]
1344  movx                 m1, [dstq]
1345  paddw                m2, filter_rnd
1346  paddw                m4, filter_rnd
1347  psraw                m2, 4
1348  psraw                m4, 4
1349  packuswb             m2, m2
1350  packuswb             m4, m4
1351  punpcklbw            m0, m2
1352  punpcklbw            m2, m4
1353  pmaddubsw            m0, filter_y_a
1354  pmaddubsw            m2, filter_y_a
1355  punpcklbw            m3, m5
1356  paddw                m0, filter_rnd
1357  paddw                m2, filter_rnd
1358  psraw                m0, 4
1359  psraw                m2, 4
1360  punpcklbw            m1, m5
1361%else
1362  punpcklbw            m2, m5
1363  punpcklbw            m1, m5
1364  punpcklbw            m4, m5
1365  punpcklbw            m3, m5
1366  pmullw               m2, filter_x_a
1367  pmullw               m1, filter_x_b
1368  paddw                m2, filter_rnd
1369  pmullw               m4, filter_x_a
1370  pmullw               m3, filter_x_b
1371  paddw                m4, filter_rnd
1372  paddw                m2, m1
1373  paddw                m4, m3
1374  psraw                m2, 4
1375  psraw                m4, 4
1376  pmullw               m0, filter_y_a
1377  pmullw               m3, m2, filter_y_b
1378  paddw                m0, filter_rnd
1379  pmullw               m2, filter_y_a
1380  pmullw               m1, m4, filter_y_b
1381  paddw                m2, filter_rnd
1382  paddw                m0, m3
1383  movx                 m3, [dstq+dst_strideq]
1384  paddw                m2, m1
1385  movx                 m1, [dstq]
1386  psraw                m0, 4
1387  psraw                m2, 4
1388  punpcklbw            m3, m5
1389  punpcklbw            m1, m5
1390%endif
1391%if %2 == 1 ; avg
1392  ; FIXME(rbultje) pipeline
1393%if %1 == 4
1394  movlhps              m0, m2
1395%endif
1396  packuswb             m0, m2
1397%if %1 > 4
1398  pavgb                m0, [secq]
1399  punpckhbw            m2, m0, m5
1400  punpcklbw            m0, m5
1401%else
1402  movh                 m2, [secq]
1403  pavgb                m0, m2
1404  punpcklbw            m0, m5
1405  movhlps              m2, m0
1406%endif
1407%endif
1408  SUM_SSE              m0, m1, m2, m3, m6, m7
1409  mova                 m0, m4
1410
1411  INC_SRC_BY_SRC_STRIDE
1412  lea                dstq, [dstq+dst_strideq*2]
1413%endif
1414%if %2 == 1 ; avg
1415  add                secq, sec_str
1416%endif
1417  dec                   block_height
1418  jg .x_other_y_other_loop
1419%undef filter_x_a
1420%undef filter_x_b
1421%undef filter_y_a
1422%undef filter_y_b
1423%undef filter_rnd
1424%undef movx
1425  STORE_AND_RET %1
1426%endmacro
1427
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are identical
; between the ssse3 and non-ssse3 versions. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.
1433
; Instantiate the SUBPEL_VARIANCE macro (defined above) for the SSSE3
; instruction set, using XMM registers.  The first macro argument is
; presumably the block width in pixels (4, 8 or 16) — consistent with the
; `%if %1 == 16` / `%if %1 > 4` branches in the macro body and the
; aom_sub_pixel_varianceNxh() prototype; height is a runtime argument.
INIT_XMM ssse3
SUBPEL_VARIANCE  4
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

; Second macro argument == 1 selects the "avg" variants: the macro's
; `%if %2 == 1 ; avg` paths additionally average the bilinear-filtered
; prediction with a second predictor read from [secq] (pavgb) before
; computing the sum/SSE.
INIT_XMM ssse3
SUBPEL_VARIANCE  4, 1
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1
1443