// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// void runtime·memmove(void*, void*, uintptr)
10TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
11	// X10 = to
12	// X11 = from
13	// X12 = n
14	BEQ	X10, X11, done
15	BEQZ	X12, done
16
17	// If the destination is ahead of the source, start at the end of the
18	// buffer and go backward.
19	BGTU	X10, X11, backward
20
21	// If less than 8 bytes, do single byte copies.
22	MOV	$8, X9
23	BLT	X12, X9, f_loop4_check
24
25	// Check alignment - if alignment differs we have to do one byte at a time.
26	AND	$7, X10, X5
27	AND	$7, X11, X6
28	BNE	X5, X6, f_loop8_unaligned_check
29	BEQZ	X5, f_loop_check
30
31	// Move one byte at a time until we reach 8 byte alignment.
32	SUB	X5, X9, X5
33	SUB	X5, X12, X12
34f_align:
35	SUB	$1, X5
36	MOVB	0(X11), X14
37	MOVB	X14, 0(X10)
38	ADD	$1, X10
39	ADD	$1, X11
40	BNEZ	X5, f_align
41
42f_loop_check:
43	MOV	$16, X9
44	BLT	X12, X9, f_loop8_check
45	MOV	$32, X9
46	BLT	X12, X9, f_loop16_check
47	MOV	$64, X9
48	BLT	X12, X9, f_loop32_check
49f_loop64:
50	MOV	0(X11), X14
51	MOV	8(X11), X15
52	MOV	16(X11), X16
53	MOV	24(X11), X17
54	MOV	32(X11), X18
55	MOV	40(X11), X19
56	MOV	48(X11), X20
57	MOV	56(X11), X21
58	MOV	X14, 0(X10)
59	MOV	X15, 8(X10)
60	MOV	X16, 16(X10)
61	MOV	X17, 24(X10)
62	MOV	X18, 32(X10)
63	MOV	X19, 40(X10)
64	MOV	X20, 48(X10)
65	MOV	X21, 56(X10)
66	ADD	$64, X10
67	ADD	$64, X11
68	SUB	$64, X12
69	BGE	X12, X9, f_loop64
70	BEQZ	X12, done
71
72f_loop32_check:
73	MOV	$32, X9
74	BLT	X12, X9, f_loop16_check
75f_loop32:
76	MOV	0(X11), X14
77	MOV	8(X11), X15
78	MOV	16(X11), X16
79	MOV	24(X11), X17
80	MOV	X14, 0(X10)
81	MOV	X15, 8(X10)
82	MOV	X16, 16(X10)
83	MOV	X17, 24(X10)
84	ADD	$32, X10
85	ADD	$32, X11
86	SUB	$32, X12
87	BGE	X12, X9, f_loop32
88	BEQZ	X12, done
89
90f_loop16_check:
91	MOV	$16, X9
92	BLT	X12, X9, f_loop8_check
93f_loop16:
94	MOV	0(X11), X14
95	MOV	8(X11), X15
96	MOV	X14, 0(X10)
97	MOV	X15, 8(X10)
98	ADD	$16, X10
99	ADD	$16, X11
100	SUB	$16, X12
101	BGE	X12, X9, f_loop16
102	BEQZ	X12, done
103
104f_loop8_check:
105	MOV	$8, X9
106	BLT	X12, X9, f_loop4_check
107f_loop8:
108	MOV	0(X11), X14
109	MOV	X14, 0(X10)
110	ADD	$8, X10
111	ADD	$8, X11
112	SUB	$8, X12
113	BGE	X12, X9, f_loop8
114	BEQZ	X12, done
115	JMP	f_loop4_check
116
117f_loop8_unaligned_check:
118	MOV	$8, X9
119	BLT	X12, X9, f_loop4_check
120f_loop8_unaligned:
121	MOVB	0(X11), X14
122	MOVB	1(X11), X15
123	MOVB	2(X11), X16
124	MOVB	3(X11), X17
125	MOVB	4(X11), X18
126	MOVB	5(X11), X19
127	MOVB	6(X11), X20
128	MOVB	7(X11), X21
129	MOVB	X14, 0(X10)
130	MOVB	X15, 1(X10)
131	MOVB	X16, 2(X10)
132	MOVB	X17, 3(X10)
133	MOVB	X18, 4(X10)
134	MOVB	X19, 5(X10)
135	MOVB	X20, 6(X10)
136	MOVB	X21, 7(X10)
137	ADD	$8, X10
138	ADD	$8, X11
139	SUB	$8, X12
140	BGE	X12, X9, f_loop8_unaligned
141
142f_loop4_check:
143	MOV	$4, X9
144	BLT	X12, X9, f_loop1
145f_loop4:
146	MOVB	0(X11), X14
147	MOVB	1(X11), X15
148	MOVB	2(X11), X16
149	MOVB	3(X11), X17
150	MOVB	X14, 0(X10)
151	MOVB	X15, 1(X10)
152	MOVB	X16, 2(X10)
153	MOVB	X17, 3(X10)
154	ADD	$4, X10
155	ADD	$4, X11
156	SUB	$4, X12
157	BGE	X12, X9, f_loop4
158
159f_loop1:
160	BEQZ	X12, done
161	MOVB	0(X11), X14
162	MOVB	X14, 0(X10)
163	ADD	$1, X10
164	ADD	$1, X11
165	SUB	$1, X12
166	JMP	f_loop1
167
168backward:
169	ADD	X10, X12, X10
170	ADD	X11, X12, X11
171
172	// If less than 8 bytes, do single byte copies.
173	MOV	$8, X9
174	BLT	X12, X9, b_loop4_check
175
176	// Check alignment - if alignment differs we have to do one byte at a time.
177	AND	$7, X10, X5
178	AND	$7, X11, X6
179	BNE	X5, X6, b_loop8_unaligned_check
180	BEQZ	X5, b_loop_check
181
182	// Move one byte at a time until we reach 8 byte alignment.
183	SUB	X5, X12, X12
184b_align:
185	SUB	$1, X5
186	SUB	$1, X10
187	SUB	$1, X11
188	MOVB	0(X11), X14
189	MOVB	X14, 0(X10)
190	BNEZ	X5, b_align
191
192b_loop_check:
193	MOV	$16, X9
194	BLT	X12, X9, b_loop8_check
195	MOV	$32, X9
196	BLT	X12, X9, b_loop16_check
197	MOV	$64, X9
198	BLT	X12, X9, b_loop32_check
199b_loop64:
200	SUB	$64, X10
201	SUB	$64, X11
202	MOV	0(X11), X14
203	MOV	8(X11), X15
204	MOV	16(X11), X16
205	MOV	24(X11), X17
206	MOV	32(X11), X18
207	MOV	40(X11), X19
208	MOV	48(X11), X20
209	MOV	56(X11), X21
210	MOV	X14, 0(X10)
211	MOV	X15, 8(X10)
212	MOV	X16, 16(X10)
213	MOV	X17, 24(X10)
214	MOV	X18, 32(X10)
215	MOV	X19, 40(X10)
216	MOV	X20, 48(X10)
217	MOV	X21, 56(X10)
218	SUB	$64, X12
219	BGE	X12, X9, b_loop64
220	BEQZ	X12, done
221
222b_loop32_check:
223	MOV	$32, X9
224	BLT	X12, X9, b_loop16_check
225b_loop32:
226	SUB	$32, X10
227	SUB	$32, X11
228	MOV	0(X11), X14
229	MOV	8(X11), X15
230	MOV	16(X11), X16
231	MOV	24(X11), X17
232	MOV	X14, 0(X10)
233	MOV	X15, 8(X10)
234	MOV	X16, 16(X10)
235	MOV	X17, 24(X10)
236	SUB	$32, X12
237	BGE	X12, X9, b_loop32
238	BEQZ	X12, done
239
240b_loop16_check:
241	MOV	$16, X9
242	BLT	X12, X9, b_loop8_check
243b_loop16:
244	SUB	$16, X10
245	SUB	$16, X11
246	MOV	0(X11), X14
247	MOV	8(X11), X15
248	MOV	X14, 0(X10)
249	MOV	X15, 8(X10)
250	SUB	$16, X12
251	BGE	X12, X9, b_loop16
252	BEQZ	X12, done
253
254b_loop8_check:
255	MOV	$8, X9
256	BLT	X12, X9, b_loop4_check
257b_loop8:
258	SUB	$8, X10
259	SUB	$8, X11
260	MOV	0(X11), X14
261	MOV	X14, 0(X10)
262	SUB	$8, X12
263	BGE	X12, X9, b_loop8
264	BEQZ	X12, done
265	JMP	b_loop4_check
266
267b_loop8_unaligned_check:
268	MOV	$8, X9
269	BLT	X12, X9, b_loop4_check
270b_loop8_unaligned:
271	SUB	$8, X10
272	SUB	$8, X11
273	MOVB	0(X11), X14
274	MOVB	1(X11), X15
275	MOVB	2(X11), X16
276	MOVB	3(X11), X17
277	MOVB	4(X11), X18
278	MOVB	5(X11), X19
279	MOVB	6(X11), X20
280	MOVB	7(X11), X21
281	MOVB	X14, 0(X10)
282	MOVB	X15, 1(X10)
283	MOVB	X16, 2(X10)
284	MOVB	X17, 3(X10)
285	MOVB	X18, 4(X10)
286	MOVB	X19, 5(X10)
287	MOVB	X20, 6(X10)
288	MOVB	X21, 7(X10)
289	SUB	$8, X12
290	BGE	X12, X9, b_loop8_unaligned
291
292b_loop4_check:
293	MOV	$4, X9
294	BLT	X12, X9, b_loop1
295b_loop4:
296	SUB	$4, X10
297	SUB	$4, X11
298	MOVB	0(X11), X14
299	MOVB	1(X11), X15
300	MOVB	2(X11), X16
301	MOVB	3(X11), X17
302	MOVB	X14, 0(X10)
303	MOVB	X15, 1(X10)
304	MOVB	X16, 2(X10)
305	MOVB	X17, 3(X10)
306	SUB	$4, X12
307	BGE	X12, X9, b_loop4
308
309b_loop1:
310	BEQZ	X12, done
311	SUB	$1, X10
312	SUB	$1, X11
313	MOVB	0(X11), X14
314	MOVB	X14, 0(X10)
315	SUB	$1, X12
316	JMP	b_loop1
317
318done:
319	RET
320