// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.
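//
// Small and medium copies need no overlap check: every byte is loaded into
// registers before any store is issued, so overlapping ranges are safe.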

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
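	// Nothing to do for a zero-length copy.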
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
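	// Load the first and last 16 bytes; for lengths under 32 the two
	// stores overlap, which is harmless since both loads happen first.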
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4          // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
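	// 8..16 bytes: copy the first and last 8 bytes, which may overlap.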
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
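	// Lengths 4..7 have bit 2 set: copy the first and last 4 bytes
	// (possibly overlapping). Lengths 1..3 branch to copy3.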
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
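	// Lengths 2..3 have bit 1 set: copy the first and last 2 bytes.
	// A length of 1 branches to copy1.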
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
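	// The length is exactly 1.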
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

	// Medium copies: 33..128 bytes.
copy32_128:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
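	// 33..64 bytes: the 32 bytes from the start and the 32 bytes from the
	// end cover the whole buffer, overlapping in the middle if needed.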
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
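	// 97..128 bytes: R1-R4 are no longer needed past this point, so reuse
	// them for the 32 bytes in the middle.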
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy more than 128 bytes.
copy_long:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8
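	// R7/R8 stay zero for copies below 1024 bytes, so the realignment
	// below computes a zero offset and the copy proceeds unaligned.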

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// the srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
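	// R14 = dst - src (unsigned). If that distance is smaller than the
	// count, the destination starts inside the source range and a forward
	// copy would clobber bytes that have not been read yet.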
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
	LDP	(R1), (R12, R13)     // Load  A
	AND	$15, R7, R14         // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3          // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)     // Load   B
	STP	(R12, R13), (R0)     // Store A
	LDP	32(R1), (R8, R9)     // Load    C
	LDP	48(R1), (R10, R11)   // Load     D
	LDP.W	64(R1), (R12, R13)   // Load      E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

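	// Software pipelined: each iteration stores the 64 bytes loaded by
	// the previous iteration while loading the next 64 bytes.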
loop64:
	STP	(R6, R7), 16(R3)     // Store  B
	LDP	16(R1), (R6, R7)     // Load   B (next iteration)
	STP	(R8, R9), 32(R3)     // Store   C
	LDP	32(R1), (R8, R9)     // Load    C
	STP	(R10, R11), 48(R3)   // Store    D
	LDP	48(R1), (R10, R11)   // Load     D
	STP.W	(R12, R13), 64(R3)   // Store     E
	LDP.W	64(R1), (R12, R13)   // Load      E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15)  // Load       F
	STP	(R6, R7), 16(R3)     // Store  B
	LDP	-48(R4), (R6, R7)    // Load        G
	STP	(R8, R9), 32(R3)     // Store   C
	LDP	-32(R4), (R8, R9)    // Load         H
	STP	(R10, R11), 48(R3)   // Store    D
	LDP	-16(R4), (R10, R11)  // Load          I
	STP	(R12, R13), 64(R3)   // Store     E
	STP	(R14, R15), -64(R5)  // Store      F
	STP	(R6, R7), -48(R5)    // Store       G
	STP	(R8, R9), -32(R5)    // Store        H
	STP	(R10, R11), -16(R5)  // Store         I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
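	// This mirrors the forward path: the same software pipelined loop,
	// walking from the end of the buffers toward the start.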
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
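	// R2 (count) and R3 (dst) are free by now, so they hold the bytes at
	// offset 48 from the start.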
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET