xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
; Rounding terms added before the final shift of the DC predictors.
; pw_4 and pw_8 each hold eight 16-bit words.  pw_16 and pw_32 keep the pw_
; prefix but hold four 32-bit dwords: the 16x16 and 32x32 DC predictors add
; them with paddd.
SECTION_RODATA
pw_4:  dw 4, 4, 4, 4, 4, 4, 4, 4
pw_8:  dw 8, 8, 8, 8, 8, 8, 8, 8
pw_16: dd 16, 16, 16, 16
pw_32: dd 32, 32, 32, 32
18
19SECTION .text
;------------------------------------------------------------------------------
; highbd_dc_predictor_4x4
; Args (cglobal order): dst, stride, above, left
;
; Fills a 4x4 block of 16-bit pixels with the rounded mean of the four words
; at `above` and the four words at `left`: (sum + 4) >> 3.
; Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m1, [leftq]
  movq                  m0, [aboveq]
  paddw                 m0, m1          ; above[i] + left[i] in words 0..3
  pshuflw               m1, m0, 0xe     ; words 2,3 moved down to words 0,1
  paddw                 m0, m1
  pshuflw               m1, m0, 0x1     ; word 1 moved down to word 0
  paddw                 m0, m1          ; word 0 = sum of all eight samples
  paddw                 m0, [GLOBAL(pw_4)]
  psraw                 m0, 3
  pshuflw               m0, m0, 0x0     ; broadcast the DC value to words 0..3

  ; Rows 0..2 are reachable from the original dst; advance two rows for row 3.
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+strideq*4], m0
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq+strideq*2], m0

  RESTORE_GOT
  RET
42
;------------------------------------------------------------------------------
; highbd_dc_predictor_8x8
; Args (cglobal order): dst, stride, above, left
;
; Fills an 8x8 block of 16-bit pixels with (sum + 8) >> 4, where sum covers
; the eight words at `above` and the eight words at `left`.
; Both inputs are read with aligned 16-byte loads; every row is written with
; an aligned 16-byte store.  Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  paddw                 m0, m2          ; above[i] + left[i], eight words
  ; above/left are no longer needed; their registers are reused below.
  DEFINE_ARGS dst, stride, stride_x3, ones
  mov                onesd, 0x00010001
  movd                  m3, onesd
  pshufd                m3, m3, 0x0     ; every word = 1
  lea           stride_x3q, [strideq*3]
  pxor                  m1, m1

  ; Each pmaddwd by all-ones adds adjacent word pairs into dwords; packssdw
  ; against zero narrows those dwords back to words.  Three rounds leave the
  ; grand total in dword 0.
  pmaddwd               m0, m3
  packssdw              m0, m1
  pmaddwd               m0, m3
  packssdw              m0, m1
  pmaddwd               m0, m3

  ; The rounding add and shift below operate on words, so they rely on the
  ; total fitting in 16 bits (presumably guaranteed by the supported bit
  ; depths -- confirm against callers).
  paddw                 m0, [GLOBAL(pw_8)]
  psrlw                 m0, 4
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0          ; DC value in all eight words

  ; Rows 0..3
  mova   [dstq             ], m0
  mova   [dstq+strideq*2   ], m0
  mova   [dstq+strideq*4   ], m0
  mova   [dstq+stride_x3q*2], m0
  lea                 dstq, [dstq+strideq*8]
  ; Rows 4..7
  mova   [dstq             ], m0
  mova   [dstq+strideq*2   ], m0
  mova   [dstq+strideq*4   ], m0
  mova   [dstq+stride_x3q*2], m0

  RESTORE_GOT
  RET
77
;------------------------------------------------------------------------------
; highbd_dc_predictor_16x16
; Args (cglobal order): dst, stride, above, left
;
; Fills a 16x16 block of 16-bit pixels with (sum + 16) >> 5, where sum covers
; the 16 words at `above` and the 16 words at `left`.
; Inputs are read with aligned 16-byte loads and rows are written with aligned
; 16-byte stores.  Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  mova                  m3, [aboveq+16]
  mova                  m2, [leftq]
  mova                  m4, [leftq+16]
  paddw                 m0, m3          ; above, both halves
  paddw                 m2, m4          ; left, both halves
  paddw                 m0, m2          ; eight partial word sums
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stride_x3, cnt
  mov                 cntd, 4           ; 4 iterations x 4 rows = 16 rows
  lea           stride_x3q, [strideq*3]

  ; Horizontal reduction: fold the high half onto the low half, then widen
  ; with zeros so the last two folds are done on dwords.
  movhlps               m2, m0
  paddw                 m0, m2
  punpcklwd             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2
  punpckldq             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2          ; dword 0 = total of all 32 samples
  paddd                 m0, [GLOBAL(pw_16)]
  psrad                 m0, 5
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0          ; DC value in all eight words
.four_rows:
  mova   [dstq                ], m0
  mova   [dstq+16             ], m0
  mova   [dstq+strideq*2      ], m0
  mova   [dstq+strideq*2+16   ], m0
  mova   [dstq+strideq*4      ], m0
  mova   [dstq+strideq*4+16   ], m0
  mova   [dstq+stride_x3q*2   ], m0
  mova   [dstq+stride_x3q*2+16], m0
  lea                 dstq, [dstq+strideq*8]
  dec                 cntd
  jnz .four_rows

  RESTORE_GOT
  REP_RET
120
;------------------------------------------------------------------------------
; highbd_dc_predictor_32x32
; Args (cglobal order): dst, stride, above, left
;
; Fills a 32x32 block of 16-bit pixels with (sum + 32) >> 6, where sum covers
; the 32 words at `above` and the 32 words at `left`.
; Inputs are accessed as aligned 16-byte operands and rows are written with
; aligned 16-byte stores.  Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  ; Accumulate all eight 8-word vectors into m0 (word-wise, wrapping adds).
  mova                  m0, [aboveq]
  paddw                 m0, [aboveq+16]
  paddw                 m0, [aboveq+32]
  paddw                 m0, [aboveq+48]
  paddw                 m0, [leftq]
  paddw                 m0, [leftq+16]
  paddw                 m0, [leftq+32]
  paddw                 m0, [leftq+48]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stride_x3, cnt
  mov                 cntd, 8           ; 8 iterations x 4 rows = 32 rows
  lea           stride_x3q, [strideq*3]

  ; Horizontal reduction: one word-wise fold, then widen with zeros and
  ; finish on dwords.
  movhlps               m2, m0
  paddw                 m0, m2
  punpcklwd             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2
  punpckldq             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2          ; dword 0 = total of all 64 samples
  paddd                 m0, [GLOBAL(pw_32)]
  psrad                 m0, 6
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0          ; DC value in all eight words
.four_rows:
  mova [dstq                ], m0
  mova [dstq+16             ], m0
  mova [dstq+32             ], m0
  mova [dstq+48             ], m0
  mova [dstq+strideq*2      ], m0
  mova [dstq+strideq*2+16   ], m0
  mova [dstq+strideq*2+32   ], m0
  mova [dstq+strideq*2+48   ], m0
  mova [dstq+strideq*4      ], m0
  mova [dstq+strideq*4+16   ], m0
  mova [dstq+strideq*4+32   ], m0
  mova [dstq+strideq*4+48   ], m0
  mova [dstq+stride_x3q*2   ], m0
  mova [dstq+stride_x3q*2+16], m0
  mova [dstq+stride_x3q*2+32], m0
  mova [dstq+stride_x3q*2+48], m0
  lea                 dstq, [dstq+strideq*8]
  dec                 cntd
  jnz .four_rows

  RESTORE_GOT
  REP_RET
179
;------------------------------------------------------------------------------
; highbd_v_predictor_4x4
; Args (cglobal order): dst, stride, above
;
; Copies the four 16-bit words at `above` into each of the four rows of dst.
; Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]
  ; Rows 0..2 are reachable from the original dst; advance two rows for row 3.
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+strideq*4], m0
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq+strideq*2], m0
  RET
189
;------------------------------------------------------------------------------
; highbd_v_predictor_8x8
; Args (cglobal order): dst, stride, above
;
; Copies the eight 16-bit words at `above` (aligned 16-byte load) into each of
; the eight rows of dst (aligned 16-byte stores).
; Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
  mova                  m0, [aboveq]
  ; `above` is dead after the load; its register now holds stride*3.
  DEFINE_ARGS dst, stride, stride_x3
  lea           stride_x3q, [strideq*3]
  ; Rows 0..3
  mova   [dstq             ], m0
  mova   [dstq+strideq*2   ], m0
  mova   [dstq+strideq*4   ], m0
  mova   [dstq+stride_x3q*2], m0
  lea                 dstq, [dstq+strideq*8]
  ; Rows 4..7
  mova   [dstq             ], m0
  mova   [dstq+strideq*2   ], m0
  mova   [dstq+strideq*4   ], m0
  mova   [dstq+stride_x3q*2], m0
  RET
205
;------------------------------------------------------------------------------
; highbd_v_predictor_16x16
; Args (cglobal order): dst, stride, above
;
; Copies the 16 words at `above` (two aligned 16-byte loads) into each of the
; 16 rows of dst.  Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  DEFINE_ARGS dst, stride, stride_x3, cnt
  mov                 cntd, 4           ; 4 iterations x 4 rows = 16 rows
  lea           stride_x3q, [strideq*3]
.four_rows:
  mova    [dstq                ], m0
  mova    [dstq+16             ], m1
  mova    [dstq+strideq*2      ], m0
  mova    [dstq+strideq*2+16   ], m1
  mova    [dstq+strideq*4      ], m0
  mova    [dstq+strideq*4+16   ], m1
  mova    [dstq+stride_x3q*2   ], m0
  mova    [dstq+stride_x3q*2+16], m1
  lea                 dstq, [dstq+strideq*8]
  dec                 cntd
  jnz .four_rows
  REP_RET
226
;------------------------------------------------------------------------------
; highbd_v_predictor_32x32
; Args (cglobal order): dst, stride, above
;
; Copies the 32 words at `above` (four aligned 16-byte loads) into each of the
; 32 rows of dst.  Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  mova                  m2, [aboveq+32]
  mova                  m3, [aboveq+48]
  DEFINE_ARGS dst, stride, stride_x3, cnt
  mov                 cntd, 8           ; 8 iterations x 4 rows = 32 rows
  lea           stride_x3q, [strideq*3]
.four_rows:
  mova [dstq                ], m0
  mova [dstq+16             ], m1
  mova [dstq+32             ], m2
  mova [dstq+48             ], m3
  mova [dstq+strideq*2      ], m0
  mova [dstq+strideq*2+16   ], m1
  mova [dstq+strideq*2+32   ], m2
  mova [dstq+strideq*2+48   ], m3
  mova [dstq+strideq*4      ], m0
  mova [dstq+strideq*4+16   ], m1
  mova [dstq+strideq*4+32   ], m2
  mova [dstq+strideq*4+48   ], m3
  mova [dstq+stride_x3q*2   ], m0
  mova [dstq+stride_x3q*2+16], m1
  mova [dstq+stride_x3q*2+32], m2
  mova [dstq+stride_x3q*2+48], m3
  lea                 dstq, [dstq+strideq*8]
  dec                 cntd
  jnz .four_rows
  REP_RET
257
;------------------------------------------------------------------------------
; highbd_tm_predictor_4x4
; Args (cglobal order): dst, stride, above, left, bd
;
; For each of the 4x4 16-bit pixels writes
;   clamp(left[row] + above[col] - above[-1], 0, (1 << bd) - 1)
; where above[-1] is the word immediately before `above`.
; Two rows are produced per 128-bit register: low half = one row, high half =
; the next.  Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd
  movd                  m1, [aboveq-2] ; word 0 = above[-1] (top-left)
  movq                  m0, [aboveq]
  pshuflw               m1, m1, 0x0
  movlhps               m0, m0         ; t1 t2 t3 t4 t1 t2 t3 t4
  movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
  ; Get the values to compute the maximum value at this bit depth
  pcmpeqw               m3, m3
  movd                  m4, bdd
  psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl (twice)
  psllw                 m3, m4         ; 0xffff << bd
  pcmpeqw               m2, m2
  pxor                  m4, m4         ; min possible value
  pxor                  m3, m2         ; max possible value: (1 << bd) - 1
  ; Only the four left samples (8 bytes) are needed: every consumer below
  ; (pshuflw, then movlhps overwriting the high half) reads just the low
  ; quadword.  movq avoids reading 8 bytes past the left column and does not
  ; require `left` to be 16-byte aligned.
  movq                  m1, [leftq]
  pshuflw               m2, m1, 0x0
  pshuflw               m5, m1, 0x55
  movlhps               m2, m5         ; l1 l1 l1 l1 l2 l2 l2 l2
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m2, m3
  pmaxsw                m2, m4
  ;Store the values
  movq    [dstq          ], m2         ; row 0
  movhpd  [dstq+strideq*2], m2         ; row 1
  lea                 dstq, [dstq+strideq*4]
  pshuflw               m2, m1, 0xaa
  pshuflw               m5, m1, 0xff
  movlhps               m2, m5         ; l3 l3 l3 l3 l4 l4 l4 l4
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m2, m3
  pmaxsw                m2, m4
  ;Store the values
  movq    [dstq          ], m2         ; row 2
  movhpd  [dstq+strideq*2], m2         ; row 3
  RET
296
;------------------------------------------------------------------------------
; highbd_tm_predictor_8x8
; Args (cglobal order): dst, stride, above, left, bd
; (`one` in the cglobal list is a scratch register, not an argument.)
;
; For each of the 8x8 16-bit pixels writes
;   clamp(left[row] + above[col] - above[-1], 0, (1 << bd) - 1)
; where above[-1] is the word immediately before `above`.
; `above` is read with an aligned 16-byte load and rows are written with
; aligned 16-byte stores.  Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one
  movd                  m1, [aboveq-2] ; word 0 = above[-1] (top-left)
  mova                  m0, [aboveq]
  pshuflw               m1, m1, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m3, m3
  pxor                  m4, m4
  pinsrw                m3, oned, 0
  pinsrw                m4, bdd, 0     ; shift count = bd
  pshuflw               m3, m3, 0x0
  ; above/bd are dead from here; `line` takes over the old `above` register.
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq            m3, m3         ; 1 in every word
  mov                lineq, -4         ; 4 iterations x 2 rows, counts up to 0
  mova                  m2, m3
  punpcklqdq            m1, m1         ; tl in every word
  psllw                 m3, m4         ; 1 << bd
  add                leftq, 16         ; point at the end of the left column
  psubw                 m3, m2 ; max possible value
  pxor                  m4, m4 ; min possible value
  psubw                 m0, m1         ; above[col] - tl
.loop:
  ; One 4-byte load fetches both left samples for this row pair.  Loading the
  ; second sample separately at +2 would read 2 bytes past the eight-entry
  ; left column on the final iteration.
  movd                  m1, [leftq+lineq*4]
  pshuflw               m2, m1, 0x55   ; second left sample in words 0..3
  pshuflw               m1, m1, 0x0    ; first left sample in words 0..3
  punpcklqdq            m1, m1
  punpcklqdq            m2, m2
  paddw                 m1, m0
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m1, m3
  pminsw                m2, m3
  pmaxsw                m1, m4
  pmaxsw                m2, m4
  ;Store the values
  mova      [dstq          ], m1
  mova      [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]
  inc                lineq
  jnz .loop
  REP_RET
340
;------------------------------------------------------------------------------
; highbd_tm_predictor_16x16
; Args (cglobal order): dst, stride, above, left, bd
;
; For each of the 16x16 16-bit pixels writes
;   clamp(left[row] + above[col] - above[-1], 0, (1 << bd) - 1)
; where above[-1] is the word immediately before `above`.
; `above` is read with aligned 16-byte loads and rows are written with aligned
; 16-byte stores.  Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  movd                  m2, [aboveq-2]  ; word 0 = above[-1] (top-left)
  pshuflw               m2, m2, 0x0
  punpcklqdq            m2, m2          ; tl in every word
  psubw                 m0, m2          ; above[0..7]  - tl
  psubw                 m1, m2          ; above[8..15] - tl
  ; Clamp bounds: m3 = (1 << bd) - 1, m4 = 0
  movd                  m4, bdd
  pcmpeqw               m3, m3
  psllw                 m3, m4          ; 0xffff << bd
  pcmpeqw               m5, m5
  pxor                  m3, m5          ; max possible value
  pxor                  m4, m4          ; min possible value
  ; above/bd are dead from here; `cnt` takes over the old `above` register.
  DEFINE_ARGS dst, stride, cnt, left
  mov                 cntq, -8          ; 8 iterations x 2 rows, counts up to 0
.two_rows:
  movd                  m7, [leftq]     ; two consecutive left samples
  pshuflw               m5, m7, 0x0
  pshuflw               m2, m7, 0x55
  punpcklqdq            m5, m5          ; first left sample in every word
  punpcklqdq            m2, m2          ; second left sample in every word
  ; First row of the pair
  mova                  m6, m5
  paddw                 m6, m0          ; columns 0..7
  paddw                 m5, m1          ; columns 8..15
  pminsw                m6, m3
  pmaxsw                m6, m4
  pminsw                m5, m3
  pmaxsw                m5, m4
  mova   [dstq             ], m6
  mova   [dstq+16          ], m5
  ; Second row of the pair
  mova                  m6, m2
  paddw                 m6, m0          ; columns 0..7
  paddw                 m2, m1          ; columns 8..15
  pminsw                m6, m3
  pmaxsw                m6, m4
  pminsw                m2, m3
  pmaxsw                m2, m4
  mova   [dstq+strideq*2   ], m6
  mova   [dstq+strideq*2+16], m2
  ; Pointer updates use lea so that the flags set by inc reach jnz intact.
  lea                leftq, [leftq+4]
  lea                 dstq, [dstq+strideq*4]
  inc                 cntq
  jnz .two_rows
  REP_RET
387
;------------------------------------------------------------------------------
; highbd_tm_predictor_32x32
; Args (cglobal order): dst, stride, above, left, bd
;
; For each of the 32x32 16-bit pixels writes
;   clamp(left[row] + above[col] - above[-1], 0, (1 << bd) - 1)
; where above[-1] is the word immediately before `above`.
; `above` is read with aligned 16-byte loads and rows are written with aligned
; 16-byte stores.  Consecutive rows are stride*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd
  mova                  m1, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  movd                  m0, [aboveq-2]  ; word 0 = above[-1] (top-left)
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0          ; tl in every word
  psubw                 m1, m0          ; above[0..7]   - tl
  psubw                 m2, m0          ; above[8..15]  - tl
  psubw                 m3, m0          ; above[16..23] - tl
  psubw                 m4, m0          ; above[24..31] - tl
  ; Clamp bounds: m5 = (1 << bd) - 1, m6 = 0
  movd                  m6, bdd
  pcmpeqw               m5, m5
  psllw                 m5, m6          ; 0xffff << bd
  pcmpeqw               m7, m7
  pxor                  m5, m7          ; max possible value
  pxor                  m6, m6          ; min possible value
  ; above/bd are dead from here; `cnt` takes over the old `above` register.
  DEFINE_ARGS dst, stride, cnt, left
  mov                 cntq, -16         ; 16 iterations x 2 rows, counts up to 0
.two_rows:
  ; First row of the pair: m7 = left[0] of this pair in every word, m0 scratch.
  movd                  m7, [leftq]
  pshuflw               m7, m7, 0x0
  punpcklqdq            m7, m7
  mova                  m0, m7
  paddw                 m0, m1
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq             ], m0
  mova                  m0, m7
  paddw                 m0, m2
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+16          ], m0
  mova                  m0, m7
  paddw                 m0, m3
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+32          ], m0
  mova                  m0, m7
  paddw                 m0, m4
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+48          ], m0
  ; Second row of the pair: m7 = left[1] of this pair in every word.
  movd                  m7, [leftq+2]
  pshuflw               m7, m7, 0x0
  punpcklqdq            m7, m7
  mova                  m0, m7
  paddw                 m0, m1
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+strideq*2   ], m0
  mova                  m0, m7
  paddw                 m0, m2
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+strideq*2+16], m0
  mova                  m0, m7
  paddw                 m0, m3
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+strideq*2+32], m0
  mova                  m0, m7
  paddw                 m0, m4
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+strideq*2+48], m0
  ; Pointer updates use lea so that the flags set by inc reach jnz intact.
  lea                leftq, [leftq+4]
  lea                 dstq, [dstq+strideq*4]
  inc                 cntq
  jnz .two_rows
  REP_RET
454