xref: /aosp_15_r20/external/libaom/aom_dsp/x86/subtract_sse2.asm (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker;
2*77c1e3ccSAndroid Build Coastguard Worker; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker;
4*77c1e3ccSAndroid Build Coastguard Worker; This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker; was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker; Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker;
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker;
13*77c1e3ccSAndroid Build Coastguard Worker
14*77c1e3ccSAndroid Build Coastguard Worker%include "third_party/x86inc/x86inc.asm"
15*77c1e3ccSAndroid Build Coastguard Worker
16*77c1e3ccSAndroid Build Coastguard WorkerSECTION .text
17*77c1e3ccSAndroid Build Coastguard Worker
18*77c1e3ccSAndroid Build Coastguard Worker; void aom_subtract_block(int rows, int cols,
19*77c1e3ccSAndroid Build Coastguard Worker;                         int16_t *diff, ptrdiff_t diff_stride,
20*77c1e3ccSAndroid Build Coastguard Worker;                         const uint8_t *src, ptrdiff_t src_stride,
21*77c1e3ccSAndroid Build Coastguard Worker;                         const uint8_t *pred, ptrdiff_t pred_stride)
22*77c1e3ccSAndroid Build Coastguard Worker
; aom_subtract_block, SSE2 version: for every pixel, diff = src - pred, with
; the 8-bit inputs widened to 16-bit words before the subtraction.
;
; cglobal operands follow the x86inc convention: 7 arguments loaded into
; registers, 7 general-purpose registers used, 8 vector registers used.
; The eighth argument (pred_stride) is not loaded here; each width path
; fetches it from pred_stridemp after cols has been consumed.
23*77c1e3ccSAndroid Build Coastguard WorkerINIT_XMM sse2
24*77c1e3ccSAndroid Build Coastguard Workercglobal subtract_block, 7, 7, 8, \
25*77c1e3ccSAndroid Build Coastguard Worker                        rows, cols, diff, diff_stride, src, src_stride, \
26*77c1e3ccSAndroid Build Coastguard Worker                        pred, pred_stride
; pred_str reuses the register that holds cols, so it may only be written
; once the width dispatch below has finished reading cols.
27*77c1e3ccSAndroid Build Coastguard Worker%define pred_str colsq
; INIT_XMM is active, so m7 here is xmm7. The widening unpacks interleave
; pixel bytes with this register to zero-extend them to words.
28*77c1e3ccSAndroid Build Coastguard Worker  pxor                  m7, m7         ; dedicated zero register
; Dispatch on block width. Any width other than 4/8/16/32/64 falls through to
; .loop_128, which always processes 128 pixels per row.
29*77c1e3ccSAndroid Build Coastguard Worker  cmp                colsd, 4
30*77c1e3ccSAndroid Build Coastguard Worker  je .case_4
31*77c1e3ccSAndroid Build Coastguard Worker  cmp                colsd, 8
32*77c1e3ccSAndroid Build Coastguard Worker  je .case_8
33*77c1e3ccSAndroid Build Coastguard Worker  cmp                colsd, 16
34*77c1e3ccSAndroid Build Coastguard Worker  je .case_16
35*77c1e3ccSAndroid Build Coastguard Worker  cmp                colsd, 32
36*77c1e3ccSAndroid Build Coastguard Worker  je .case_32
37*77c1e3ccSAndroid Build Coastguard Worker  cmp                colsd, 64
38*77c1e3ccSAndroid Build Coastguard Worker  je .case_64
39*77c1e3ccSAndroid Build Coastguard Worker
; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
;
; Subtracts two mmsize-byte groups of pixels. For group k = 0, 1:
;   src  is loaded from [srcq  + src_offk]  with mova (x86inc's aligned move)
;   pred is loaded from [predq + pred_offk] with movu (the unaligned move)
;   both are widened to 16-bit words by interleaving with the zero register m7
;   src - pred is stored as 2*mmsize bytes of words starting at
;   [diffq + diff_offk] (mova, so the destination is expected to be aligned)
; Requires m7 == 0. Overwrites m0-m5.
40*77c1e3ccSAndroid Build Coastguard Worker%macro loop16 6
41*77c1e3ccSAndroid Build Coastguard Worker  mova                  m0, [srcq+%1]
42*77c1e3ccSAndroid Build Coastguard Worker  mova                  m4, [srcq+%2]
43*77c1e3ccSAndroid Build Coastguard Worker  movu                  m1, [predq+%3]
44*77c1e3ccSAndroid Build Coastguard Worker  movu                  m5, [predq+%4]
; Group 0: m2/m3 receive the widened high halves first (the three-operand
; form leaves m0/m1 intact), then m0/m1 are widened in place to the low halves.
45*77c1e3ccSAndroid Build Coastguard Worker  punpckhbw             m2, m0, m7
46*77c1e3ccSAndroid Build Coastguard Worker  punpckhbw             m3, m1, m7
47*77c1e3ccSAndroid Build Coastguard Worker  punpcklbw             m0, m7
48*77c1e3ccSAndroid Build Coastguard Worker  punpcklbw             m1, m7
49*77c1e3ccSAndroid Build Coastguard Worker  psubw                 m2, m3
50*77c1e3ccSAndroid Build Coastguard Worker  psubw                 m0, m1
; Group 1: m1 and m3 are free again and are reused as scratch, so the results
; end up in m4 (low half) and m1 (high half).
51*77c1e3ccSAndroid Build Coastguard Worker  punpckhbw             m1, m4, m7
52*77c1e3ccSAndroid Build Coastguard Worker  punpckhbw             m3, m5, m7
53*77c1e3ccSAndroid Build Coastguard Worker  punpcklbw             m4, m7
54*77c1e3ccSAndroid Build Coastguard Worker  punpcklbw             m5, m7
55*77c1e3ccSAndroid Build Coastguard Worker  psubw                 m1, m3
56*77c1e3ccSAndroid Build Coastguard Worker  psubw                 m4, m5
; For each group the low-half words go first and the high-half words follow
; mmsize bytes later.
57*77c1e3ccSAndroid Build Coastguard Worker  mova [diffq+mmsize*0+%5], m0
58*77c1e3ccSAndroid Build Coastguard Worker  mova [diffq+mmsize*1+%5], m2
59*77c1e3ccSAndroid Build Coastguard Worker  mova [diffq+mmsize*0+%6], m4
60*77c1e3ccSAndroid Build Coastguard Worker  mova [diffq+mmsize*1+%6], m1
61*77c1e3ccSAndroid Build Coastguard Worker%endmacro
62*77c1e3ccSAndroid Build Coastguard Worker
; Fall-through path for widths not matched by the dispatch: one row of 128
; pixels per iteration.
; cols is no longer needed, so its register is reloaded with pred_stride.
63*77c1e3ccSAndroid Build Coastguard Worker  mov             pred_str, pred_stridemp
64*77c1e3ccSAndroid Build Coastguard Worker.loop_128:
; Four loop16 expansions cover src/pred byte offsets 0..8*mmsize-1 and write
; 16*mmsize bytes of diff (each pixel becomes a 2-byte word).
65*77c1e3ccSAndroid Build Coastguard Worker  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
66*77c1e3ccSAndroid Build Coastguard Worker  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
67*77c1e3ccSAndroid Build Coastguard Worker  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
68*77c1e3ccSAndroid Build Coastguard Worker  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
; diff_stride is scaled by 2 because diff holds 16-bit elements; the src and
; pred strides are applied unscaled, i.e. as byte counts.
69*77c1e3ccSAndroid Build Coastguard Worker  lea                diffq, [diffq+diff_strideq*2]
70*77c1e3ccSAndroid Build Coastguard Worker  add                predq, pred_str
71*77c1e3ccSAndroid Build Coastguard Worker  add                 srcq, src_strideq
; This loop exits only when the row counter reaches exactly zero (jnz); the
; narrower paths exit as soon as it is <= 0 (jg).
72*77c1e3ccSAndroid Build Coastguard Worker  sub                rowsd, 1
73*77c1e3ccSAndroid Build Coastguard Worker  jnz .loop_128
74*77c1e3ccSAndroid Build Coastguard Worker  RET
75*77c1e3ccSAndroid Build Coastguard Worker
; Width 64: one row per iteration.
76*77c1e3ccSAndroid Build Coastguard Worker.case_64:
; cols has been consumed by the dispatch; reuse its register for pred_stride.
77*77c1e3ccSAndroid Build Coastguard Worker  mov             pred_str, pred_stridemp
78*77c1e3ccSAndroid Build Coastguard Worker.loop_64:
; Two loop16 expansions cover src/pred byte offsets 0..4*mmsize-1 and write
; 8*mmsize bytes of diff.
79*77c1e3ccSAndroid Build Coastguard Worker  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
80*77c1e3ccSAndroid Build Coastguard Worker  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
; Advance one row; diff_stride is doubled because diff elements are 2 bytes.
81*77c1e3ccSAndroid Build Coastguard Worker  lea                diffq, [diffq+diff_strideq*2]
82*77c1e3ccSAndroid Build Coastguard Worker  add                predq, pred_str
83*77c1e3ccSAndroid Build Coastguard Worker  add                 srcq, src_strideq
; Loop while the remaining row count is still positive.
84*77c1e3ccSAndroid Build Coastguard Worker  dec                rowsd
85*77c1e3ccSAndroid Build Coastguard Worker  jg .loop_64
86*77c1e3ccSAndroid Build Coastguard Worker  RET
87*77c1e3ccSAndroid Build Coastguard Worker
; Width 32: one row per iteration.
88*77c1e3ccSAndroid Build Coastguard Worker.case_32:
; cols has been consumed by the dispatch; reuse its register for pred_stride.
89*77c1e3ccSAndroid Build Coastguard Worker  mov             pred_str, pred_stridemp
90*77c1e3ccSAndroid Build Coastguard Worker.loop_32:
; A single loop16 handles both mmsize-byte halves of the row and writes
; 4*mmsize bytes of diff.
91*77c1e3ccSAndroid Build Coastguard Worker  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
; Advance one row; diff_stride is doubled because diff elements are 2 bytes.
92*77c1e3ccSAndroid Build Coastguard Worker  lea                diffq, [diffq+diff_strideq*2]
93*77c1e3ccSAndroid Build Coastguard Worker  add                predq, pred_str
94*77c1e3ccSAndroid Build Coastguard Worker  add                 srcq, src_strideq
; Loop while the remaining row count is still positive.
95*77c1e3ccSAndroid Build Coastguard Worker  dec                rowsd
96*77c1e3ccSAndroid Build Coastguard Worker  jg .loop_32
97*77c1e3ccSAndroid Build Coastguard Worker  RET
98*77c1e3ccSAndroid Build Coastguard Worker
; Width 16: two rows per iteration.
99*77c1e3ccSAndroid Build Coastguard Worker.case_16:
; cols has been consumed by the dispatch; reuse its register for pred_stride.
100*77c1e3ccSAndroid Build Coastguard Worker  mov             pred_str, pred_stridemp
101*77c1e3ccSAndroid Build Coastguard Worker.loop_16:
; loop16's two groups are pointed at consecutive rows rather than adjacent
; columns: group 0 is the current row, group 1 is one stride further on
; (diff_stride doubled because diff elements are 2 bytes).
102*77c1e3ccSAndroid Build Coastguard Worker  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
; Advance all three pointers by two rows.
103*77c1e3ccSAndroid Build Coastguard Worker  lea                diffq, [diffq+diff_strideq*4]
104*77c1e3ccSAndroid Build Coastguard Worker  lea                predq, [predq+pred_str*2]
105*77c1e3ccSAndroid Build Coastguard Worker  lea                 srcq, [srcq+src_strideq*2]
; Rows are consumed in pairs and the loop stops once the count is <= 0, so an
; odd row count would still process a full final pair.
106*77c1e3ccSAndroid Build Coastguard Worker  sub                rowsd, 2
107*77c1e3ccSAndroid Build Coastguard Worker  jg .loop_16
108*77c1e3ccSAndroid Build Coastguard Worker  RET
109*77c1e3ccSAndroid Build Coastguard Worker
; loop_h: subtract two rows of mmsize/2 pixels each.
;
; movh loads half a register's worth of bytes per row (x86inc maps it to movq
; under INIT_XMM and movd under INIT_MMX), the bytes are widened to words
; against m7, and each row's src - pred is stored as one full register with
; mova. Row 1 is addressed one stride after row 0 (diff_stride doubled because
; diff elements are 2 bytes).
; This macro is expanded under both INIT_XMM (.case_8) and INIT_MMX (.case_4);
; m7 must be zero in whichever register set is active at the expansion site.
; Overwrites m0-m3.
110*77c1e3ccSAndroid Build Coastguard Worker%macro loop_h 0
; m0/m2 = src rows 0/1, m1/m3 = pred rows 0/1.
111*77c1e3ccSAndroid Build Coastguard Worker  movh                  m0, [srcq]
112*77c1e3ccSAndroid Build Coastguard Worker  movh                  m2, [srcq+src_strideq]
113*77c1e3ccSAndroid Build Coastguard Worker  movh                  m1, [predq]
114*77c1e3ccSAndroid Build Coastguard Worker  movh                  m3, [predq+pred_str]
; Zero-extend the loaded bytes to 16-bit words.
115*77c1e3ccSAndroid Build Coastguard Worker  punpcklbw             m0, m7
116*77c1e3ccSAndroid Build Coastguard Worker  punpcklbw             m1, m7
117*77c1e3ccSAndroid Build Coastguard Worker  punpcklbw             m2, m7
118*77c1e3ccSAndroid Build Coastguard Worker  punpcklbw             m3, m7
119*77c1e3ccSAndroid Build Coastguard Worker  psubw                 m0, m1
120*77c1e3ccSAndroid Build Coastguard Worker  psubw                 m2, m3
121*77c1e3ccSAndroid Build Coastguard Worker  mova             [diffq], m0
122*77c1e3ccSAndroid Build Coastguard Worker  mova [diffq+diff_strideq*2], m2
123*77c1e3ccSAndroid Build Coastguard Worker%endmacro
124*77c1e3ccSAndroid Build Coastguard Worker
; Width 8: two rows per iteration, still using XMM registers (INIT_XMM is in
; effect), so loop_h loads 8 pixels per row and stores 8 words per row.
125*77c1e3ccSAndroid Build Coastguard Worker.case_8:
; cols has been consumed by the dispatch; reuse its register for pred_stride.
126*77c1e3ccSAndroid Build Coastguard Worker  mov             pred_str, pred_stridemp
127*77c1e3ccSAndroid Build Coastguard Worker.loop_8:
128*77c1e3ccSAndroid Build Coastguard Worker  loop_h
; Advance all three pointers by two rows (diff_stride*4 bytes = two rows of
; 2-byte elements).
129*77c1e3ccSAndroid Build Coastguard Worker  lea                diffq, [diffq+diff_strideq*4]
130*77c1e3ccSAndroid Build Coastguard Worker  lea                 srcq, [srcq+src_strideq*2]
131*77c1e3ccSAndroid Build Coastguard Worker  lea                predq, [predq+pred_str*2]
; Rows are consumed in pairs; loop while the remaining count is positive.
132*77c1e3ccSAndroid Build Coastguard Worker  sub                rowsd, 2
133*77c1e3ccSAndroid Build Coastguard Worker  jg .loop_8
134*77c1e3ccSAndroid Build Coastguard Worker  RET
135*77c1e3ccSAndroid Build Coastguard Worker
; Width 4: the same two-rows-per-iteration scheme as .case_8, but on 64-bit
; MMX registers. INIT_MMX remaps m0..m7 to mm0..mm7, so loop_h now loads
; 4 pixels per row (movh -> movd) and stores 4 words per row (mova -> movq).
135*77c1e3ccSAndroid Build Coastguard WorkerINIT_MMX
136*77c1e3ccSAndroid Build Coastguard Worker.case_4:
; cols has been consumed by the dispatch; reuse its register for pred_stride.
137*77c1e3ccSAndroid Build Coastguard Worker  mov             pred_str, pred_stridemp
; loop_h widens bytes against m7, which is mm7 from here on. The pxor at
; function entry ran under INIT_XMM and cleared xmm7 only; MMX registers are a
; separate register file (aliased onto the x87 registers), so mm7 has to be
; zeroed explicitly or the high byte of every widened word is whatever mm7
; last held.
  pxor                  m7, m7
138*77c1e3ccSAndroid Build Coastguard Worker.loop_4:
139*77c1e3ccSAndroid Build Coastguard Worker  loop_h
; Advance all three pointers by two rows (diff_stride*4 bytes = two rows of
; 2-byte elements).
140*77c1e3ccSAndroid Build Coastguard Worker  lea                diffq, [diffq+diff_strideq*4]
141*77c1e3ccSAndroid Build Coastguard Worker  lea                 srcq, [srcq+src_strideq*2]
142*77c1e3ccSAndroid Build Coastguard Worker  lea                predq, [predq+pred_str*2]
; Rows are consumed in pairs; loop while the remaining count is positive.
143*77c1e3ccSAndroid Build Coastguard Worker  sub                rowsd, 2
144*77c1e3ccSAndroid Build Coastguard Worker  jg .loop_4
; Mark the x87 register stack empty again before returning, since the MMX
; instructions above left it tagged as in use.
145*77c1e3ccSAndroid Build Coastguard Worker  emms
146*77c1e3ccSAndroid Build Coastguard Worker  RET
148