xref: /aosp_15_r20/external/libvpx/config/arm-neon/vpx_dsp/arm/intrapred_neon_asm.asm.S (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1@ This file was created from a .asm file
2@  using the ads2gas.pl script.
3.syntax unified
4@
5@  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
6@
7@  Use of this source code is governed by a BSD-style license
8@  that can be found in the LICENSE file in the root of the source
9@  tree. An additional intellectual property rights grant can be found
10@  in the file PATENTS.  All contributing project authors may
11@  be found in the AUTHORS file in the root of the source tree.
12@
13
@ Exported entry points. Each symbol is declared global and given the ELF
@ function type; the three families (vertical, horizontal, TrueMotion) are
@ listed in 4x4, 8x8, 16x16, 32x32 order.
    .irp fn, vpx_v_predictor_4x4_neon, vpx_v_predictor_8x8_neon, vpx_v_predictor_16x16_neon, vpx_v_predictor_32x32_neon
    .global \fn
    .type \fn, function
    .endr
    .irp fn, vpx_h_predictor_4x4_neon, vpx_h_predictor_8x8_neon, vpx_h_predictor_16x16_neon, vpx_h_predictor_32x32_neon
    .global \fn
    .type \fn, function
    .endr
    .irp fn, vpx_tm_predictor_4x4_neon, vpx_tm_predictor_8x8_neon, vpx_tm_predictor_16x16_neon, vpx_tm_predictor_32x32_neon
    .global \fn
    .type \fn, function
    .endr

@ Everything below is assembled as ARM (not Thumb) code.
    .arm
    .eabi_attribute 24, 1 @Tag_ABI_align_needed
    .eabi_attribute 25, 1 @Tag_ABI_align_preserved

@ Code goes into .text, aligned to 2^2 = 4 bytes.
    .text
    .p2align 2

44
@ void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Vertical prediction, 4x4: the 4 bytes at above[0..3] are replicated into
@ each of the 4 output rows.
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above
@   r3  left       not read
@ Scratch: d0.

vpx_v_predictor_4x4_neon:
    vld1.32             {d0[0]}, [r2]           @ d0[0] = above[0..3]
    @ One 4-byte store per row; r0 steps to the next row each time.
    .rept 4
    vst1.32             {d0[0]}, [r0], r1
    .endr
    bx                  lr
.size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon
61
@ void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Vertical prediction, 8x8: the 8 bytes at above[0..7] are replicated into
@ each of the 8 output rows.
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above
@   r3  left       not read
@ Scratch: d0.

vpx_v_predictor_8x8_neon:
    vld1.8              {d0}, [r2]              @ d0 = above[0..7]
    @ One 8-byte store per row.
    .rept 8
    vst1.8              {d0}, [r0], r1
    .endr
    bx                  lr
.size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon
82
@ void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Vertical prediction, 16x16: the 16 bytes at above[0..15] are replicated
@ into each of the 16 output rows.
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above
@   r3  left       not read
@ Scratch: q0.

vpx_v_predictor_16x16_neon:
    vld1.8              {q0}, [r2]              @ q0 = above[0..15]
    @ One 16-byte store per row.
    .rept 16
    vst1.8              {q0}, [r0], r1
    .endr
    bx                  lr
.size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon
111
@ void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Vertical prediction, 32x32: the 32 bytes at above[0..31] are replicated
@ into each of the 32 output rows.
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above on entry; reused as the pass counter once above is loaded
@   r3  left       not read
@ Scratch: q0, q1, flags.

vpx_v_predictor_32x32_neon:
    vld1.8              {q0, q1}, [r2]          @ q0:q1 = above[0..31]
    mov                 r2, #2                  @ 2 passes x 16 rows = 32 rows
.Lv32_next_16_rows:
    .rept 16
    vst1.8              {q0, q1}, [r0], r1
    .endr
    subs                r2, r2, #1
    bgt                 .Lv32_next_16_rows
    bx                  lr
.size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon
144
@ void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Horizontal prediction, 4x4: output row i is filled with the byte left[i].
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above      not read
@   r3  left
@ Scratch: d0, d1.

vpx_h_predictor_4x4_neon:
    vld1.32             {d1[0]}, [r3]           @ d1 bytes 0..3 = left[0..3]
    @ For each row: splat left[row] across d0, then store its low 4 bytes.
    .irp lane, 0, 1, 2, 3
    vdup.8              d0, d1[\lane]
    vst1.32             {d0[0]}, [r0], r1
    .endr
    bx                  lr
.size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon
165
@ void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Horizontal prediction, 8x8: output row i is filled with the byte left[i].
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above      not read
@   r3  left
@ Scratch: d0, d1.

vpx_h_predictor_8x8_neon:
    vld1.64             {d1}, [r3]              @ d1 = left[0..7]
    @ For each row: splat left[row] across d0, then store all 8 bytes.
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              d0, d1[\lane]
    vst1.64             {d0}, [r0], r1
    .endr
    bx                  lr
.size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon
194
@ void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Horizontal prediction, 16x16: output row i is filled with the byte left[i].
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above      not read
@   r3  left
@ Scratch: q0, q1.

vpx_h_predictor_16x16_neon:
    vld1.8              {q1}, [r3]              @ d2 = left[0..7], d3 = left[8..15]
    @ Rows 0..7 take their value from d2.
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d2[\lane]
    vst1.8              {q0}, [r0], r1
    .endr
    @ Rows 8..15 take their value from d3.
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d3[\lane]
    vst1.8              {q0}, [r0], r1
    .endr
    bx                  lr
.size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon
239
@ void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Horizontal prediction, 32x32: output row i is filled with the byte left[i].
@ Every row is written as two 16-byte stores.
@
@   r0  dst        advanced by y_stride in total after every row
@   r1  y_stride   reduced by 16 on entry (see below)
@   r2  above      not read; reused as the pass counter
@   r3  left       advanced by 16 per pass
@ Scratch: q0, q1, flags.

vpx_h_predictor_32x32_neon:
    @ The first store of a row post-increments r0 by 16, so the second store
    @ only has to add the remaining (y_stride - 16) to reach the next row.
    sub                 r1, r1, #16
    mov                 r2, #2                  @ 2 passes x 16 rows = 32 rows
.Lh32_next_16_rows:
    vld1.8              {q1}, [r3]!             @ d2:d3 = next 16 left pixels
    @ First 8 rows of this pass come from d2.
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d2[\lane]
    vst1.8              {q0}, [r0]!
    vst1.8              {q0}, [r0], r1
    .endr
    @ Last 8 rows of this pass come from d3.
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d3[\lane]
    vst1.8              {q0}, [r0]!
    vst1.8              {q0}, [r0], r1
    .endr
    subs                r2, r2, #1
    bgt                 .Lh32_next_16_rows
    bx                  lr
.size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon
305
@ void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@
@ TrueMotion prediction, 4x4:
@   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above      above[-1] is the top-left pixel
@   r3  left       advanced while the left pixels are fetched
@   r12 scratch    address of above[-1]
@ Scratch: q0-q3.

    @ Turns the two splatted left pixels in d2 and d4 into two finished rows:
    @ widen to 16 bit, add the (above - top_left) row held in q3, narrow with
    @ unsigned saturation and store 4 bytes per row.
    .macro TM4_TWO_ROWS
    vmovl.u8            q1, d2
    vmovl.u8            q2, d4
    vadd.s16            q1, q1, q3
    vadd.s16            q2, q2, q3
    vqmovun.s16         d0, q1
    vqmovun.s16         d1, q2
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d1[0]}, [r0], r1
    .endm

vpx_tm_predictor_4x4_neon:
    sub                 r12, r2, #1
    vld1.u8             {d0[]}, [r12]           @ every lane of d0 = above[-1]
    vld1.32             {d2[0]}, [r2]           @ d2 bytes 0..3 = above[0..3]
    vsubl.u8            q3, d2, d0              @ q3 = above - top_left (16 bit)

    @ Rows 0 and 1.
    vld1.u8             {d2[]}, [r3]!           @ splat left[0]
    vld1.u8             {d4[]}, [r3]!           @ splat left[1]
    TM4_TWO_ROWS

    @ Rows 2 and 3. The last load leaves r3 untouched.
    vld1.u8             {d2[]}, [r3]!           @ splat left[2]
    vld1.u8             {d4[]}, [r3]            @ splat left[3]
    TM4_TWO_ROWS
    bx                  lr
.size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon
    .purgem TM4_TWO_ROWS
351
@ void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@
@ TrueMotion prediction, 8x8:
@   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above      above[-1] is the top-left pixel
@   r3  left
@   r12 scratch    address of above[-1]
@ Scratch: q0, q1, q3, q8, q9, q10, d30.

vpx_tm_predictor_8x8_neon:
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]           @ every lane of d0 = above[-1]
    vld1.8              {d30}, [r3]             @ d30 = left[0..7]
    vld1.64             {d2}, [r2]              @ d2  = above[0..7]
    vmovl.u8            q10, d30                @ q10 = left widened to 16 bit
    vsubl.u8            q3, d2, d0              @ q3  = above - top_left (16 bit)

    @ d20 holds left[0..3] and d21 holds left[4..7]; each half yields four
    @ rows: splat one left pixel, add the q3 row, narrow with unsigned
    @ saturation, store 8 bytes.
    .irp half, d20, d21
    vdup.16             q0, \half[0]
    vdup.16             q1, \half[1]
    vadd.s16            q0, q3, q0
    vadd.s16            q1, q3, q1

    vdup.16             q8, \half[2]
    vdup.16             q9, \half[3]
    vadd.s16            q8, q3, q8
    vadd.s16            q9, q3, q9

    vqmovun.s16         d0, q0
    vqmovun.s16         d1, q1
    vqmovun.s16         d2, q8
    vqmovun.s16         d3, q9

    vst1.64             {d0}, [r0], r1
    vst1.64             {d1}, [r0], r1
    vst1.64             {d2}, [r0], r1
    vst1.64             {d3}, [r0], r1
    .endr

    bx                  lr
.size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon
423
@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                  const uint8_t *above,
@                                  const uint8_t *left)
@
@ TrueMotion prediction, 16x16:
@   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above      above[-1] is the top-left pixel; reused as pass counter
@   r3  left       exactly left[0..15] is read, nothing beyond it
@   r12 scratch    address of above[-1]
@ Scratch: q0-q3, q8-q11, flags.
@
@ Register roles inside the loop:
@   q2, q3   above[0..7] - top_left, above[8..15] - top_left (16 bit)
@   q9       the 16 left pixels as bytes (d18 = rows 0..7, d19 = rows 8..15)
@   q10      the 8 left pixels of the current pass, widened to 16 bit
@   q0, q8   left pixel of row A / row B splatted to all lanes

    @ Finishes two rows from the splatted left pixels in q0 (row A) and q8
    @ (row B): add both halves of (above - top_left), then narrow with
    @ unsigned saturation into d2:d3 (row A) and d22:d23 (row B).
    .macro TM16_SUM_NARROW
    vadd.s16            q1, q0, q2
    vadd.s16            q0, q0, q3
    vadd.s16            q11, q8, q2
    vadd.s16            q8, q8, q3
    vqmovun.s16         d2, q1
    vqmovun.s16         d3, q0
    vqmovun.s16         d22, q11
    vqmovun.s16         d23, q8
    .endm

    @ Stores the two rows produced by TM16_SUM_NARROW.
    .macro TM16_STORE
    vst1.64             {d2,d3}, [r0], r1
    vst1.64             {d22,d23}, [r0], r1
    .endm

vpx_tm_predictor_16x16_neon:
    @ Load ytop_left = above[-1] into every lane of d0.
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ Load the 16 above pixels.
    vld1.8              {q1}, [r2]

    @ Load all 16 left pixels in one go. Fetching them 8 at a time with a
    @ preload at the bottom of the loop would read 8 bytes past the end of
    @ left[] on the final pass.
    vld1.8              {q9}, [r3]

    @ Compute above - ytop_left.
    vsubl.u8            q2, d2, d0
    vsubl.u8            q3, d3, d0

    @ Widen left[0..7] for the first pass.
    vmovl.u8            q10, d18

    @ 8 rows per pass, 2 passes.
    mov                 r2, #2

.Ltm16_next_8_rows:
    @ Rows 0 and 1 of this pass.
    vdup.16             q0, d20[0]
    vdup.16             q8, d20[1]
    TM16_SUM_NARROW
    vdup.16             q0, d20[2]              @ preload rows 2 and 3
    vdup.16             q8, d20[3]
    TM16_STORE

    @ Rows 2 and 3.
    TM16_SUM_NARROW
    vdup.16             q0, d21[0]              @ preload rows 4 and 5
    vdup.16             q8, d21[1]
    TM16_STORE

    @ Rows 4 and 5.
    TM16_SUM_NARROW
    vdup.16             q0, d21[2]              @ preload rows 6 and 7
    vdup.16             q8, d21[3]
    TM16_STORE

    @ Rows 6 and 7.
    TM16_SUM_NARROW
    @ Widen left[8..15] for the second pass. On the final pass this is a
    @ redundant register-only operation; it touches no memory.
    vmovl.u8            q10, d19
    TM16_STORE

    subs                r2, r2, #1
    bgt                 .Ltm16_next_8_rows

    bx                  lr
.size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon
    .purgem TM16_SUM_NARROW
    .purgem TM16_STORE
516
@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                  const uint8_t *above,
@                                  const uint8_t *left)
@
@ TrueMotion prediction, 32x32:
@   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
@
@   r0  dst        advanced by y_stride after every row
@   r1  y_stride
@   r2  above      above[-1] is the top-left pixel; reused as pass counter
@   r3  left       advanced by 8 per pass; exactly left[0..31] is read
@   r12 scratch    address of above[-1]
@ Scratch: q0-q3, q8-q15, flags.
@
@ Register roles inside the loop:
@   q8-q11   above[0..31] - top_left, 8 columns per register (16 bit)
@   q3       the 8 left pixels of the current pass, widened to 16 bit
@   q0, q2   left pixel of row A / row B splatted to all lanes
@   q12-q15  16-bit sums, then (as d24-d27) the narrowed row B

    @ Finishes two rows from the splatted left pixels in q0 (row A) and q2
    @ (row B). Row A is narrowed into d0-d3 and stored; row B is left
    @ narrowed in d24-d27 for the caller to store, so the caller can splat
    @ the next pair of left pixels into q0/q2 first.
    .macro TM32_TWO_ROWS
    vadd.s16            q12, q0, q8
    vadd.s16            q13, q0, q9
    vadd.s16            q14, q0, q10
    vadd.s16            q15, q0, q11
    vqmovun.s16         d0, q12
    vqmovun.s16         d1, q13
    vadd.s16            q12, q2, q8
    vadd.s16            q13, q2, q9
    vqmovun.s16         d2, q14
    vqmovun.s16         d3, q15
    vadd.s16            q14, q2, q10
    vadd.s16            q15, q2, q11
    vst1.64             {d0-d3}, [r0], r1
    vqmovun.s16         d24, q12
    vqmovun.s16         d25, q13
    vqmovun.s16         d26, q14
    vqmovun.s16         d27, q15
    .endm

vpx_tm_predictor_32x32_neon:
    @ Load ytop_left = above[-1] into every lane of d0.
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ Load the 32 above pixels.
    vld1.8              {q1}, [r2]!
    vld1.8              {q2}, [r2]

    @ Compute above - ytop_left.
    vsubl.u8            q8, d2, d0
    vsubl.u8            q9, d3, d0
    vsubl.u8            q10, d4, d0
    vsubl.u8            q11, d5, d0

    @ 8 rows per pass, 4 passes.
    mov                 r2, #4

.Ltm32_next_8_rows:
    @ Fetch the 8 left pixels of this pass at the top of the loop, so that
    @ exactly 4 x 8 = 32 bytes of left[] are read. A preload at the bottom
    @ of the loop would read 8 bytes past the end of left[] on the final
    @ pass.
    vld1.8              {d6}, [r3]!
    vmovl.u8            q3, d6                  @ d6 = rows 0..3, d7 = rows 4..7

    @ Rows 0 and 1 of this pass.
    vdup.16             q0, d6[0]
    vdup.16             q2, d6[1]
    TM32_TWO_ROWS
    vdup.16             q0, d6[2]               @ preload rows 2 and 3
    vdup.16             q2, d6[3]
    vst1.64             {d24-d27}, [r0], r1

    @ Rows 2 and 3.
    TM32_TWO_ROWS
    vdup.16             q0, d7[0]               @ preload rows 4 and 5
    vdup.16             q2, d7[1]
    vst1.64             {d24-d27}, [r0], r1

    @ Rows 4 and 5.
    TM32_TWO_ROWS
    vdup.16             q0, d7[2]               @ preload rows 6 and 7
    vdup.16             q2, d7[3]
    vst1.64             {d24-d27}, [r0], r1

    @ Rows 6 and 7.
    TM32_TWO_ROWS
    vst1.64             {d24-d27}, [r0], r1

    subs                r2, r2, #1
    bgt                 .Ltm32_next_8_rows

    bx                  lr
.size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon
    .purgem TM32_TWO_ROWS
645
646    .section .note.GNU-stack,"",%progbits    @ empty, flag-less note section: by GNU toolchain convention this marks the object as not needing an executable stack
647