/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


#ifndef MEMMOVE
# define MEMMOVE		memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
	.globl alias; \
	.equ alias, original
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);		\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

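/* memmove(dst = %rdi, src = %rsi, len = %rdx) for x86-64, using SSE2
	loads and stores.  The original destination pointer is returned in
	%rax.  %rbx is used as a scratch/loop-bound register, so ENTRANCE and
	RETURN save and restore it (callee-saved in the System V AMD64 ABI).  */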
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
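/* dst == src: nothing to copy.  dst > src: copy backward, from high to
	low addresses, so that an overlapping source is read before it is
	overwritten.  Otherwise fall through and copy forward.  */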

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address.  */
/* Save the first 64 (possibly unaligned) bytes of the source, loaded
	before any stores in case the regions overlap.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8  /* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */
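/* From here on the source is addressed relative to the destination:
	%rsi holds src - dst, so (%r8, %rsi) is the source byte matching the
	destination byte at (%r8), and only %r8 needs advancing below.  */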

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)
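/* %rbx is the 64-byte aligned end of the destination.  The main loop
	stores whole aligned 64-byte blocks until %r8 reaches it, leaving
	fewer than 64 bytes for L(mm_copy_remaining_forward).  */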

	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja	L(mm_overlapping_check_forward)
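/* Copies larger than half of the shared (last-level) cache are candidates
	for non-temporal stores, decided at L(mm_overlapping_check_forward);
	smaller copies fall through to the ordinary cached loop.  */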

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to the %r8 position in the dst has been copied.
	%rdx now holds the number of bytes left to copy.
	Compute in %r9 the source address that corresponds to %r8.  */
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

L(mm_len_0_16_bytes_forward):
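/* %rdx is at most 16 here.  testb $24, %dl is nonzero when bit 3 or 4 of
	the length is set, i.e. for lengths 8..16, which are handled with two
	(possibly overlapping) 8-byte moves; the later bit tests pick out the
	4..7, 2..3, 1 and 0 byte cases the same way.  */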
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address. We need to save the last 64 bytes of
	the source (loaded before any stores below) so that they are not
	overwritten when the regions overlap.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja	L(mm_overlapping_check_backward)
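/* Mirror image of the forward path: %r9 walks down from the 64-byte
	aligned end of the destination, %rbx is the first 64-byte boundary
	above its start, and %r8 holds src - dst.  As in the forward case,
	copies larger than half of the shared cache may use non-temporal
	stores, decided at L(mm_overlapping_check_backward).  */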

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
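/* Same bit-test dispatch as the forward [0..16] path; the 9..16 byte case
	copies the last 8 bytes, shrinks %rdx by 8 and jumps back here for the
	remaining head.  */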
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

/* Big length copy forward part.  */

	.p2align 4

L(mm_overlapping_check_forward):
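/* Decide between cached and non-temporal stores for a large forward copy.
	%rsi still holds src - dst here, so %r9 becomes (src - dst) + len; if
	that fits within the shared cache size, the ordinary cached loop is
	used, otherwise the streaming loop below.  */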
	mov	%rsi, %r9
	add	%rdx, %r9
	cmp	__x86_shared_cache_size(%rip), %r9
	jbe	L(mm_main_loop_forward)

L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
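/* movntdq stores are weakly ordered and bypass the cache; the sfence
	orders them before the ordinary stores that copy the remaining
	bytes.  */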
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4

L(mm_overlapping_check_backward):
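/* Same decision for the backward copy: %r11 = (dst - src) + len; if it
	fits within the shared cache size, use the ordinary cached loop,
	otherwise the streaming loop below, again followed by an sfence.  */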
	mov	%rdi, %r11
	sub	%rsi, %r11 /* r11 = dst - src, diff */
	add	%rdx, %r11
	cmp	__x86_shared_cache_size(%rip), %r11
	jbe	L(mm_main_loop_backward)

L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)

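/* memcpy is provided as an alias of memmove, so memcpy also tolerates
	overlapping buffers in this implementation.  */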
ALIAS_SYMBOL(memcpy, MEMMOVE)