1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go
6
7#include "textflag.h"
8
9// This file provides fast assembly versions for the elementary
10// arithmetic operations on vectors implemented in arith.go.
11
// Register map used by the addVV/subVV implementations in this file
// (NOTE(review): the left-hand names look like the registers of another port's
// version of this code -- confirm):
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func addVV(z, x, y []Word) (c Word)
//
// addVV is a trampoline: it loads the 8-byte code address currently stored in
// addvectorfacility and branches to it. No stack frame is used ($0), so the
// target sees the caller's arguments at the same FP offsets.

TEXT ·addVV(SB), NOSPLIT, $0
	MOVD addvectorfacility+0x00(SB), R1 // R1 = current implementation address
	BR   (R1)                           // tail-branch to it
18
// addVV_check selects the addVV implementation. It reads the byte ·hasVX:
// if it equals 1 the address of ·addVV_vec is stored into addvectorfacility,
// otherwise the address of ·addVV_novec is stored. In both cases it then
// branches to the selected routine so the current call is still serviced.
// Later calls through ·addVV load the stored address directly.
TEXT ·addVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
	MOVD   $addvectorfacility+0x00(SB), R1 // R1 = &addvectorfacility
	MOVD   $·addVV_novec(SB), R2           // R2 = address of the scalar routine
	MOVD   R2, 0(R1)                       // addvectorfacility = R2

	// The address is staged through R2; the one-instruction form is kept
	// below for reference only.
	// MOVD	$·addVV_novec(SB), 0(R1)
	BR ·addVV_novec(SB)

vectorimpl:
	MOVD $addvectorfacility+0x00(SB), R1 // R1 = &addvectorfacility
	MOVD $·addVV_vec(SB), R2             // R2 = address of the vector routine
	MOVD R2, 0(R1)                       // addvectorfacility = R2

	// MOVD	$·addVV_vec(SB), 0(R1)
	BR ·addVV_vec(SB)
36
// addvectorfacility is an 8-byte, pointer-free (NOPTR) cell holding the code
// address that ·addVV branches to. It is initialised to ·addVV_check.
GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
39
// addVV_vec is the addVV implementation that uses vector instructions.
// It adds x and y word by word with carry propagation, stores the sums in z,
// and returns the final carry in c. Work is done in three stages:
//   UU1: 16 words (128 bytes) per iteration using V registers,
//   U1:  4 words per iteration using ADDC/ADDE,
//   L1:  1 word per iteration.
//
// Register roles:
//   R2 = z base, R8 = x base, R9 = y base, R10 = byte offset i*8
//   R3 = remaining word count, kept biased (see the SUB/ADD pairs)
//   R4 = carry between iterations, held as 0 or -1 in the scalar loops so
//        that ADDC R4, R4 regenerates the carry flag
//   R0 = 0
//   R5/R6/R7 = running x/y/z pointers in UU1; scratch (with R1, R11) afterwards
TEXT ·addVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 (fewer than 4 words) goto v1
	SUB $12, R3 // n -= 12 more, i.e. n is now biased by -16
	BLT A1      // if n < 0 goto A1

	// Separate running pointers for the vector loop; R8/R9/R2 stay at the bases.
	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0 // c = 0

UU1:
	VLM  0(R5), V1, V4    // 64-bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12      // 64-bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	// Each VACCCQ/VACQ pair takes the same three inputs (two operands and the
	// incoming carry vector): VACCCQ produces the carry passed to the next
	// pair, VACQ produces the sum that is stored to z.
	VACCCQ V1, V9, V0, V25
	VACQ   V1, V9, V0, V17
	VACCCQ V2, V10, V25, V26
	VACQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V5..V6
	VLM 0(R6), V13, V14 // 32-bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VACCCQ V3, V11, V26, V27
	VACQ   V3, V11, V26, V19
	VACCCQ V4, V12, V27, V28
	VACQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V7..V8
	VLM 0(R6), V15, V16 // 32-bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VACCCQ V5, V13, V28, V29
	VACQ   V5, V13, V28, V21
	VACCCQ V6, V14, V29, V30
	VACQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VACCCQ V7, V15, V30, V31
	VACQ   V7, V15, V30, V23
	VACCCQ V8, V16, V31, V0  // V0 has carry-over
	VACQ   V8, V16, V31, V24

	// Flip the eight sum vectors back before storing them.
	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10           // i += 16 (byte offset advances by 128)
	SUB   $16, R3             // n -= 16
	BGE   UU1                 // if n >= 0 goto UU1
	VLGVG $1, V0, R4          // put cf into R4
	NEG   R4, R4              // save cf as 0 or -1 for the ADDC R4, R4 below

A1:
	ADD $12, R3 // undo the extra -12; n is biased by -4 again

	// s/JL/JMP/ below to disable the unrolled loop
	BLT v1 // if n < 0 goto v1

U1:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4             // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4             // save CF (R4 = 0 + 0 + carry)
	NEG  R4, R4             // keep it as 0 or -1
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4 (remove the bias)
	BLE E1     // if n <= 0 goto E1

L1:  // n > 0
	ADDC R4, R4            // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4            // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4       // convert the stored 0/-1 back to 0/1
	MOVD R4, c+72(FP) // return c
	RET
191
// addVV_novec is the addVV implementation that uses only general-purpose
// registers. It adds x and y word by word with carry propagation, stores the
// sums in z, and returns the final carry in c.
//
// Register roles:
//   R2 = z base, R8 = x base, R9 = y base, R10 = byte offset i*8
//   R3 = remaining word count (biased by -4 while in U1n)
//   R4 = carry between iterations, held as 0 or -1 so that ADDC R4, R4
//        regenerates the carry flag
//   R0 = 0; R1, R5, R6, R7, R11 = scratch
TEXT ·addVV_novec(SB), NOSPLIT, $0
novec:
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1n    // if n < 0 goto v1n

U1n:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4             // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4             // save CF (R4 = 0 + 0 + carry)
	NEG  R4, R4             // keep it as 0 or -1
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1n      // if n >= 0 goto U1n

v1n:
	ADD $4, R3 // n += 4 (remove the bias)
	BLE E1n    // if n <= 0 goto E1n

L1n:  // n > 0
	ADDC R4, R4            // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4            // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1n     // if n > 0 goto L1n

E1n:
	NEG  R4, R4       // convert the stored 0/-1 back to 0/1
	MOVD R4, c+72(FP) // return c
	RET
256
// subVV is a trampoline: it loads the 8-byte code address currently stored in
// subvectorfacility and branches to it. No stack frame is used ($0), so the
// target sees the caller's arguments at the same FP offsets.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD subvectorfacility+0x00(SB), R1 // R1 = current implementation address
	BR   (R1)                           // tail-branch to it
260
// subVV_check selects the subVV implementation. It reads the byte ·hasVX:
// if it equals 1 the address of ·subVV_vec is stored into subvectorfacility,
// otherwise the address of ·subVV_novec is stored. In both cases it then
// branches to the selected routine so the current call is still serviced.
// Later calls through ·subVV load the stored address directly.
TEXT ·subVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
	MOVD   $subvectorfacility+0x00(SB), R1 // R1 = &subvectorfacility
	MOVD   $·subVV_novec(SB), R2           // R2 = address of the scalar routine
	MOVD   R2, 0(R1)                       // subvectorfacility = R2

	// The address is staged through R2; the one-instruction form is kept
	// below for reference only.
	// MOVD	$·subVV_novec(SB), 0(R1)
	BR ·subVV_novec(SB)

vectorimpl:
	MOVD $subvectorfacility+0x00(SB), R1 // R1 = &subvectorfacility
	MOVD $·subVV_vec(SB), R2             // R2 = address of the vector routine
	MOVD R2, 0(R1)                       // subvectorfacility = R2

	// MOVD	$·subVV_vec(SB), 0(R1)
	BR ·subVV_vec(SB)
278
// subvectorfacility is an 8-byte, pointer-free (NOPTR) cell holding the code
// address that ·subVV branches to. It is initialised to ·subVV_check.
GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)
281
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same structure as addVV_vec, with SUBC/SUBE and VSBCBIQ/VSBIQ instead of
// ADDC/ADDE and VACCCQ/VACQ)
//
// subVV_vec is the subVV implementation that uses vector instructions.
// It subtracts y from x word by word with borrow propagation, stores the
// differences in z, and returns the final borrow in c. Work is done in three
// stages: UU1 (16 words per iteration, V registers), U1 (4 words per
// iteration), L1 (1 word per iteration).
//
// Register roles:
//   R2 = z base, R8 = x base, R9 = y base, R10 = byte offset i*8
//   R3 = remaining word count, kept biased (see the SUB/ADD pairs)
//   R4 = borrow between iterations, held as 0 (no borrow) or -1 (borrow) in
//        the scalar loops; SUBC R4, R11 with R11 = 0 regenerates the flag
//   R0 = 0
//   R5/R6/R7 = running x/y/z pointers in UU1; scratch (with R1, R11) afterwards
TEXT ·subVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2
	MOVD $0, R4          // c = 0
	MOVD $0, R0          // make sure it's zero
	MOVD $0, R10         // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 12 more, i.e. n is now biased by -16
	BLT A1      // if n < 0 goto A1

	// Separate running pointers for the vector loop; R8/R9/R2 stay at the bases.
	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0         // cf = 0
	MOVD  $1, R4     // for 390 subtraction cf starts as 1 (no borrow)
	VLVGG $1, R4, V0 // put carry into V0

UU1:
	VLM  0(R5), V1, V4    // 64-bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12      // 64-bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	// Each VSBCBIQ/VSBIQ pair takes the same three inputs (two operands and
	// the incoming borrow-indication vector): VSBCBIQ produces the indication
	// passed to the next pair, VSBIQ produces the difference stored to z.
	VSBCBIQ V1, V9, V0, V25
	VSBIQ   V1, V9, V0, V17
	VSBCBIQ V2, V10, V25, V26
	VSBIQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V5..V6
	VLM 0(R6), V13, V14 // 32-bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VSBCBIQ V3, V11, V26, V27
	VSBIQ   V3, V11, V26, V19
	VSBCBIQ V4, V12, V27, V28
	VSBIQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V7..V8
	VLM 0(R6), V15, V16 // 32-bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VSBCBIQ V5, V13, V28, V29
	VSBIQ   V5, V13, V28, V21
	VSBCBIQ V6, V14, V29, V30
	VSBIQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VSBCBIQ V7, V15, V30, V31
	VSBIQ   V7, V15, V30, V23
	VSBCBIQ V8, V16, V31, V0  // V0 has carry-over
	VSBIQ   V8, V16, V31, V24

	// Flip the eight difference vectors back before storing them.
	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10           // i += 16 (byte offset advances by 128)
	SUB   $16, R3             // n -= 16
	BGE   UU1                 // if n >= 0 goto UU1
	VLGVG $1, V0, R4          // put cf into R4
	SUB   $1, R4              // save cf: 1 (no borrow) -> 0, 0 (borrow) -> -1

A1:
	ADD $12, R3 // undo the extra -12; n is biased by -4 again
	BLT v1      // if n < 0 goto v1

U1:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11            // restore CF (computes 0 - R4; result discarded)
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4             // save CF (R4 = 0 - 0 - borrow, i.e. 0 or -1)
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4 (remove the bias)
	BLE E1     // if n <= 0 goto E1

L1:  // n > 0
	MOVD R0, R11
	SUBC R4, R11           // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4            // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4       // convert the stored 0/-1 back to 0/1
	MOVD R4, c+72(FP) // return c
	RET
435
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same structure as addVV_novec, with SUBC/SUBE instead of ADDC/ADDE)
//
// subVV_novec is the subVV implementation that uses only general-purpose
// registers. It subtracts y from x word by word with borrow propagation,
// stores the differences in z, and returns the final borrow in c.
//
// Register roles:
//   R2 = z base, R8 = x base, R9 = y base, R10 = byte offset i*8
//   R3 = remaining word count (biased by -4 while in U1)
//   R4 = borrow between iterations, held as 0 (no borrow) or -1 (borrow);
//        SUBC R4, R11 with R11 = 0 regenerates the flag
//   R0 = 0; R1, R5, R6, R7, R11 = scratch
TEXT ·subVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1     // if n < 0 goto v1

U1:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11            // restore CF (computes 0 - R4; result discarded)
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4             // save CF (R4 = 0 - 0 - borrow, i.e. 0 or -1)
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4 (remove the bias)
	BLE E1     // if n <= 0 goto E1

L1:  // n > 0
	MOVD R0, R11
	SUBC R4, R11           // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4            // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4       // convert the stored 0/-1 back to 0/1
	MOVD R4, c+72(FP) // return c
	RET
502
// addVW adds the single word y to the vector x, stores the result in z and
// returns the final carry in c (arguments at z+0, z_len+8, x+24, y+48; result
// at c+56). If len(z) == 0 it returns y unchanged.
//
// The first two words are added explicitly. After that, each remaining word is
// incremented by 1 for as long as a carry is pending; as soon as no carry is
// pending, the rest of x is copied to z with MVC and 0 is returned.
//
// Register roles: R5 = remaining word count, R6 = x, R7 = y / first sum,
// R8 = z, R9 = scratch, R12 = byte offset i*8.
//
// NOTE(review): the code depends on the condition code set by ADDC/ADDE still
// being live after the MOVD, CMPBEQ and BRCTG instructions that follow them.
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5 // length of z
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7    // c = y
	MOVD z+0(FP), R8

	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return

	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
	ADDC   0(R6), R7            // R7 = x[0] + y
	MOVD   R7, 0(R8)            // z[0] = R7
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD   $0, R9
	ADDE   8(R6), R9            // R9 = x[1] + carry
	MOVD   R9, 8(R8)            // z[1] = R9
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $12, copySetup // carry = 0, copy the rest
	MOVD $1, R9

	// Originally we used the carry flag generated in the previous iteration
	// (i.e: ADDE could be used here to do the addition).  However, since we
	// already know carry is 1 (otherwise we will go to copy section), we can use
	// ADDC here so the current iteration does not depend on the carry flag
	// generated in the previous iteration. This could be useful when branch prediction happens.
	ADDC 0(R6)(R12*1), R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// Return the current carry value
returnResult:
	MOVD $0, R0
	ADDE R0, R0       // R0 = 0 + 0 + carry
	MOVD R0, c+56(FP)
	RET

// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time.  Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6 // R6 = &x[i]
	ADD R12, R8 // R8 = &z[i]

	CMPBGE R5, $4, mediumLoop

smallLoop:  // does a loop unrolling to copy word when n < 4
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as carry
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop:  // Copying 256 bytes at a time.
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5       // n -= 32 words
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody:  // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5            // n -= 4 words
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP) // len(z) == 0: return y as the carry
	RET
592
// subVW subtracts the single word y from the vector x, stores the result in z
// and returns the final borrow in c (arguments at z+0, z_len+8, x+24, y+48;
// result at c+56). If len(z) == 0 it returns y unchanged.
//
// The first two words are subtracted explicitly. After that, each remaining
// word is decremented by 1 for as long as a borrow is pending; as soon as no
// borrow is pending, the rest of x is copied to z with MVC and 0 is returned.
//
// Register roles: R5 = remaining word count, R6 = x, R7 = y, R8 = z,
// R9 = scratch, R12 = byte offset i*8, R0 = 0 until returnResult overwrites it.
//
// NOTE(review): the code depends on the condition code set by SUBC/SUBE still
// being live after the MOVD, CMPBEQ and BRCTG instructions that follow them.
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7    // The borrow bit passed in
	MOVD z+0(FP), R8
	MOVD $0, R0          // R0 is a temporary variable used during computation. Ensure it has zero in it.

	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return

	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
	MOVD   0(R6), R9
	SUBC   R7, R9               // R9 = x[0] - y
	MOVD   R9, 0(R8)            // z[0] = R9
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD   8(R6), R9
	SUBE   R0, R9               // R9 = x[1] - 0 - borrow
	MOVD   R9, 8(R8)            // z[1] = R9
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $3, copySetup    // no borrow, copy the rest
	MOVD 0(R6)(R12*1), R9

	// Originally we used the borrow flag generated in the previous iteration
	// (i.e: SUBE could be used here to do the subtraction). However, since we
	// already know borrow is 1 (otherwise we will go to copy section), we can
	// use SUBC here so the current iteration does not depend on the borrow flag
	// generated in the previous iteration. This could be useful when branch prediction happens.
	SUBC $1, R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// return the current borrow value
returnResult:
	SUBE R0, R0       // R0 = 0 - 0 - borrow, i.e. 0 or -1
	NEG  R0, R0       // convert to 0 or 1
	MOVD R0, c+56(FP)
	RET

// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6 // R6 = &x[i]
	ADD R12, R8 // R8 = &z[i]

	CMPBGE R5, $4, mediumLoop

smallLoop:  // does a loop unrolling to copy word when n < 4
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as borrow
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop:  // Copying 256 bytes at a time
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5       // n -= 32 words
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody:  // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5            // n -= 4 words
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP) // len(z) == 0: return y as the borrow
	RET
684
// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU has no assembly implementation here: it branches straight to
// ·shlVU_g, which is defined outside this file, with the caller's arguments
// untouched.
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)
688
// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU has no assembly implementation here: it branches straight to
// ·shrVU_g, which is defined outside this file, with the caller's arguments
// untouched.
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)
692
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// For each i in [0, len(z)): z[i] = low word of (x[i]*y + c) and c = high word,
// starting with c = r. The final c is returned.
//
// Register roles: R2 = z, R8 = x, R9 = y, R4 = running carry c,
// R5 = len(z), R1 = byte offset i*8, R7 = word index i, R0 = 0,
// R6 = x[i] then high product word, R11 = low product word.
//
// NOTE(review): R11 is read after MULHDU without being written explicitly in
// this routine; presumably MULHDU leaves the low 64 bits of the product
// there -- confirm against the assembler before touching R11 or reordering.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4    // c = r
	MOVD z_len+8(FP), R5
	MOVD $0, R1          // i*8 = 0 (byte offset)
	MOVD $0, R7          // i = 0 (word index)
	MOVD $0, R0          // make sure it's zero
	BR   E5

L5:
	MOVD   (R8)(R1*1), R6  // R6 = x[i]
	MULHDU R9, R6          // R6 = high word of x[i]*y
	ADDC   R4, R11         // add to low order bits
	ADDE   R0, R6          // propagate the carry into the high word
	MOVD   R11, (R2)(R1*1) // z[i] = low word
	MOVD   R6, R4          // c = high word
	ADD    $8, R1          // i*8 + 8
	ADD    $1, R7          // i++

E5:
	CMPBLT R7, R5, L5 // i < n

	MOVD R4, c+64(FP)
	RET
721
// func addMulVVW(z, x []Word, y Word) (c Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
//
// For each i in [0, len(z)): z[i] = low word of (x[i]*y + z[i] + c) and
// c = high word, starting with c = 0. The final c is returned.
// A6 handles two words per iteration up to the largest even index bound
// (R12 = len(z) &^ 1); L6 handles whatever remains one word at a time.
//
// Register roles: R2 = z, R8 = x, R9 = y, R4 = running carry c,
// R5 = len(z), R12 = len(z) rounded down to even, R1 = byte offset i*8,
// R7 = word index i, R0 = 0, R10 = z[i], R6 = x[i] then high product word,
// R11 = low product word.
//
// NOTE(review): R11 is read after MULHDU without being written explicitly in
// this routine; presumably MULHDU leaves the low 64 bits of the product
// there -- confirm against the assembler before touching R11 or reordering.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R5

	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD   R5, R12
	AND    $-2, R12    // R12 = n with the low bit cleared
	CMPBGE R5, $2, A6  // at least two words: use the unrolled loop
	BR     E6

A6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11         // add the incoming carry c
	ADDE   R0, R6
	MOVD   R6, R4          // c = high word
	MOVD   R11, (R2)(R1*1)

	MOVD   (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (8)(R2)(R1*1), R10
	ADDC   R10, R11           // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11            // add the incoming carry c
	ADDE   R0, R6
	MOVD   R6, R4             // c = high word
	MOVD   R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 + 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR     E6

L6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11         // add the incoming carry c
	ADDE   R0, R6
	MOVD   R6, R4          // c = high word
	MOVD   R11, (R2)(R1*1)

	ADD $8, R1 // i*8 + 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+56(FP)
	RET
786
787