// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.

// func addVV(z, x, y []Word) (c Word)
//
// addVV stores x[i] + y[i] + carry into z[i] for every i in [0, len(z)),
// passing the carry from word to word through the processor carry flag, and
// returns the carry out of the last word (0 or 1) in c.
//
// Only len(z) is loaded; x and y are walked for that many words.
// NOTE(review): presumably callers guarantee that x and y hold at least
// len(z) words - confirm against arith.go.
//
// Register usage:
//	R0        number of words still to process
//	R8, R9    read cursors into x and y
//	R10       write cursor into z
//	R11-R14   words of x, overwritten with the sums
//	R15-R19   words of y (R18 is not used)
//
// The length is consumed as one optional single word (bit 0 of the count),
// one optional pair (bit 1) and then groups of four words. The carry flag has
// to survive from one ADCS to the next, so the bookkeeping in between uses
// only SUB, TBZ, CBZ and B, none of which write the flags.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	ADDS	$0, R0		// clear carry flag
	TBZ	$0, R0, two	// even count: no leading single word
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	ADCS	R15, R11	// R11 = x[i] + y[i] + carry
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 clear: no leading pair
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	ADCS	R15, R11
	ADCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	// R0 is now a multiple of 4; each pass handles four words.
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)
	LDP	-16(R8), (R13, R14)	// upper two words of the group just stepped over
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	HS, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET


// func subVV(z, x, y []Word) (c Word)
//
// subVV stores x[i] - y[i] - borrow into z[i] for every i in [0, len(z)) and
// returns the borrow out of the last word (0 or 1) in c.
//
// The borrow travels in the processor carry flag with inverted sense: carry
// set means "no borrow". That is why the flag is set before the first SBCS
// and why the result is extracted with the LO (carry clear) condition.
//
// Only len(z) is loaded; x and y are walked for that many words.
// NOTE(review): presumably callers guarantee that x and y hold at least
// len(z) words - confirm against arith.go.
//
// Register usage:
//	R0        number of words still to process
//	R8, R9    read cursors into x and y
//	R10       write cursor into z
//	R11-R14   words of x, overwritten with the differences
//	R15-R19   words of y (R18 is not used)
//
// The length is consumed as one optional single word (bit 0 of the count),
// one optional pair (bit 1) and then groups of four words. The bookkeeping
// between SBCS instructions uses only SUB, TBZ, CBZ and B, none of which
// write the flags.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	CMP	R0, R0		// set carry flag
	TBZ	$0, R0, two	// even count: no leading single word
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	SBCS	R15, R11	// R11 = x[i] - y[i] - borrow
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 clear: no leading pair
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	SBCS	R15, R11
	SBCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	// R0 is now a multiple of 4; each pass handles four words.
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)
	LDP	-16(R8), (R13, R14)	// upper two words of the group just stepped over
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	LO, R0		// extract carry flag: carry clear means a borrow
	MOVD	R0, c+72(FP)
	RET

// vwOneOp processes one word for addVW/subVW: it loads the next word through
// the source cursor R1, applies "instr op1, R4" to it and stores the result
// through the destination cursor R3. Both cursors advance by 8 bytes and R4
// is clobbered. Whether the flags are written depends on instr.
#define vwOneOp(instr, op1)				\
	MOVD.P	8(R1), R4;				\
	instr	op1, R4;				\
	MOVD.P	R4, 8(R3);

// handle the first 1~4 elements before starting iteration in addVW/subVW
//
// instr1 is applied to the first word with R2 as its operand, instr2 to every
// following word with $0 as its operand. counter is decremented after each of
// the first three words and the macro branches to target as soon as it
// reaches zero. Starting from 1, 2 or 3 it therefore handles exactly that
// many words and jumps to target. Starting from 0 the decrements never hit
// zero, so all four words are handled and control falls out of the bottom of
// the macro instead of branching; the code placed after the macro has to
// account for those four words itself. counter is clobbered, and
// SUB and CBZ leave the flags produced by instr1/instr2 intact.
#define vwPreIter(instr1, instr2, counter, target)	\
	vwOneOp(instr1, R2);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);

// do one iteration of add or sub in addVW/subVW
//
// Branches to exit when counter is zero. Otherwise loads four words through
// the source cursor R1, applies "instr $0, src, dst" to each of them in order
// (so the carry flag chains through all four), stores the four results through
// the destination cursor R3 and subtracts 4 from counter. Both cursors
// advance by 32 bytes. R4-R7 hold the inputs and R8-R11 the results. CBZ and
// SUB do not write the flags, so the carry left by the last instr is still
// valid after the macro.
#define vwOneIter(instr, counter, exit)	\
	CBZ	counter, exit;		\	// careful not to touch the carry flag
	LDP.P	32(R1), (R4, R5);	\
	LDP	-16(R1), (R6, R7);	\
	instr	$0, R4, R8;		\
	instr	$0, R5, R9;		\
	instr	$0, R6, R10;		\
	instr	$0, R7, R11;		\
	STP.P	(R8, R9), 32(R3);	\
	STP	(R10, R11), -16(R3);	\
	SUB	$4, counter;

// do one iteration of copy in addVW/subVW
//
// Branches to exit when counter is zero. Otherwise moves four words unchanged
// from the source cursor R1 to the destination cursor R3, advances both
// cursors by 32 bytes and subtracts 4 from counter. R4-R7 are clobbered.
#define vwOneIterCopy(counter, exit)			\
	CBZ	counter, exit;				\
	LDP.P	32(R1), (R4, R5);			\
	LDP	-16(R1), (R6, R7);			\
	STP.P	(R4, R5), 32(R3);			\
	STP	(R6, R7), -16(R3);			\
	SUB	$4, counter;

// func addVW(z, x []Word, y Word) (c Word)
//
// addVW stores x + y into z, where y is a single word added to x[0] and the
// carry ripples upwards through the remaining len(z) words. It returns the
// carry out of the last word in c. For an empty z no word is touched and c
// is y itself.
//
// The 'large' branch handles large 'z'. It checks the carry flag on every
// iteration and switches to plain copying once there is no carry left to
// propagate. The copying is skipped as well if 'x' and 'z' happen to share
// the same underlying storage.
// The overhead of the checking and branching is visible when 'z' is small
// (~5%), so a threshold of 32 words is used and the small-sized path is left
// entirely free of those checks.
//
// Register usage:
//	R0        number of words still to process
//	R1        read cursor into x
//	R2        y on entry; reused for the returned carry
//	R3        write cursor into z
//	R4-R11    data words (see vwOneOp / vwOneIter)
//	R10       len(z) & 3 on the large path (before vwOneIter reuses it)
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	ADDS	R2, R4		// z[0] = x[0] + y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two
	MOVD.P	8(R1), R4	// do it once
	ADCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
	ADCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(ADCS, R0, len1)
	B	loop
len1:
	CSET	HS, R2		// extract carry flag
len0:
	// Reached directly for an empty z, in which case R2 still holds y.
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len(z) % 4
	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(ADDS, ADCS, R10, add4)
	SUB	$4, R0
add4:
	// Carry clear: the remaining words of z are just the words of x.
	BCC	copy
	vwOneIter(ADCS, R0, len1)
	B	add4
copy:
	MOVD	ZR, c+56(FP)
	// Both cursors have advanced by the same amount, so they are equal
	// exactly when z and x start at the same address; nothing to copy then.
	CMP	R1, R3
	BEQ	done
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func subVW(z, x []Word, y Word) (c Word)
//
// subVW stores x - y into z, where y is a single word subtracted from x[0]
// and the borrow ripples upwards through the remaining len(z) words. It
// returns the borrow out of the last word in c. For an empty z no word is
// touched and c is y itself.
//
// The borrow lives in the carry flag with inverted sense: carry set means
// "no borrow".
//
// The 'large' branch handles large 'z'. It checks the carry flag on every
// iteration and switches to plain copying once there is no borrow left to
// propagate. The copying is skipped as well if 'x' and 'z' happen to share
// the same underlying storage.
// The overhead of the checking and branching is visible when 'z' is small
// (~5%), so a threshold of 32 words is used and the small-sized path is left
// entirely free of those checks.
//
// Register usage:
//	R0        number of words still to process
//	R1        read cursor into x
//	R2        y on entry; reused for the returned borrow
//	R3        write cursor into z
//	R4-R11    data words (see vwOneOp / vwOneIter)
//	R10       len(z) & 3 on the large path (before vwOneIter reuses it)
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	SUBS	R2, R4		// z[0] = x[0] - y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two	// do it once
	MOVD.P	8(R1), R4
	SBCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
	SBCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(SBCS, R0, len1)
	B	loop
len1:
	CSET	LO, R2		// extract carry flag: carry clear means a borrow
len0:
	// Reached directly for an empty z, in which case R2 still holds y.
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len(z) % 4
	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(SUBS, SBCS, R10, sub4)
	SUB	$4, R0
sub4:
	// Carry set: no borrow pending, the remaining words of z are just x.
	BCS	copy
	vwOneIter(SBCS, R0, len1)
	B	sub4
copy:
	MOVD	ZR, c+56(FP)
	// Both cursors have advanced by the same amount, so they are equal
	// exactly when z and x start at the same address; nothing to copy then.
	CMP	R1, R3
	BEQ	done
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU stores x << s into z, treating the len(z) words as one long number
// with word 0 least significant, and returns in c the bits shifted out of the
// top word, x[n-1] >> (64 - s). For an empty z, or for s == 0, c is 0; with
// s == 0 the words of x are copied to z unchanged (and not even copied when
// z and x already start at the same address).
// NOTE(review): presumably callers keep s below 64 - confirm against arith.go.
//
// This implementation works from the high word down to the low word, which
// gives wrong results if the low words of x overlap the high words of z.
// Callers that invoke this function directly need to rule that case out.
//
// Register usage:
//	R0       write cursor into z, starts at &z[n] and moves down
//	R1       number of words still to process
//	R2       read cursor into x, starts at &x[n] and moves down
//	R3       s
//	R4       64 - s
//	R5       return value
//	R8       x[i] << s of the last word read, still waiting for the top
//	         bits of the next lower word
//	others   data words
TEXT ·shlVU(SB),NOSPLIT,$0
	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	ADD	R1<<3, R0	// R0 = &z[n]
	ADD	R1<<3, R2	// R2 = &x[n]
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
	MOVD	$64, R4
	SUB	R3, R4
	// handling the most significant element x[n-1]
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R5	// return value
	LSL	R3, R6, R8	// x[i] << s
	SUB	$1, R1
	// The remaining n-1 words are consumed as one optional single word,
	// one optional pair and then groups of four.
one:	TBZ	$0, R1, two
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R7
	ORR	R8, R7		// complete the word above with the top bits of this one
	LSL	R3, R6, R8
	SUB	$1, R1
	MOVD.W	R7, -8(R0)
two:
	TBZ	$1, R1, loop
	LDP.W	-16(R2), (R6, R7)	// R7 is the higher of the two words
	LSR	R4, R7, R10
	ORR	R8, R10
	LSL	R3, R7
	LSR	R4, R6, R9
	ORR	R7, R9
	LSL	R3, R6, R8
	SUB	$2, R1
	STP.W	(R9, R10), -16(R0)
loop:
	CBZ	R1, done
	LDP.W	-32(R2), (R10, R11)
	LDP	16(R2), (R12, R13)	// R13 is the highest of the four words
	LSR	R4, R13, R23
	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
	LSL	R3, R13
	LSR	R4, R12, R22
	ORR	R13, R22
	LSL	R3, R12
	LSR	R4, R11, R21
	ORR	R12, R21
	LSL	R3, R11
	LSR	R4, R10, R20
	ORR	R11, R20
	LSL	R3, R10, R8
	STP.W	(R20, R21), -32(R0)
	STP	(R22, R23), 16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD.W	R8, -8(R0)	// the first element x[0]
	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
	RET
copy:
	// s == 0. R0 and R2 point one past the end of z and x; if they are equal
	// the two slices share storage and there is nothing to move.
	CMP	R0, R2
	BEQ	len0
	TBZ	$0, R1, ctwo
	MOVD.W	-8(R2), R4
	MOVD.W	R4, -8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.W	-16(R2), (R4, R5)
	STP.W	(R4, R5), -16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.W	-32(R2), (R4, R5)
	LDP	16(R2), (R6, R7)
	STP.W	(R4, R5), -32(R0)
	STP	(R6, R7), 16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET

// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU stores x >> s into z, treating the len(z) words as one long number
// with word 0 least significant, and returns in c the bits shifted out of the
// bottom word, x[0] << (64 - s). For an empty z, or for s == 0, c is 0; with
// s == 0 the words of x are copied to z unchanged (and not even copied when
// z and x already start at the same address).
// NOTE(review): presumably callers keep s below 64 - confirm against arith.go.
//
// This implementation works from the low word up to the high word, which
// gives wrong results if the high words of x overlap the low words of z.
// Callers that invoke this function directly need to rule that case out.
//
// Register usage:
//	R0       write cursor into z, moves up
//	R1       number of words still to process
//	R2       read cursor into x, moves up
//	R3       s
//	R4       64 - s
//	R8       x[i] >> s of the last word read, still waiting for the low
//	         bits of the next higher word
//	others   data words
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVD	z+0(FP), R0
	MOVD	z_len+8(FP), R1
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	MOVD	$0, R8
	MOVD	$64, R4
	SUB	R3, R4
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z

	MOVD.P	8(R2), R20
	LSR	R3, R20, R8
	LSL	R4, R20
	MOVD	R20, c+56(FP)	// deal with the first element
	SUB	$1, R1

	// The remaining n-1 words are consumed as one optional single word,
	// one optional pair and then groups of four.
	TBZ	$0, R1, two
	MOVD.P	8(R2), R6
	LSL	R4, R6, R20
	ORR	R8, R20		// complete the word below with the low bits of this one
	LSR	R3, R6, R8
	MOVD.P	R20, 8(R0)
	SUB	$1, R1
two:
	TBZ	$1, R1, loop
	LDP.P	16(R2), (R6, R7)	// R6 is the lower of the two words
	LSL	R4, R6, R20
	LSR	R3, R6
	ORR	R8, R20
	LSL	R4, R7, R21
	LSR	R3, R7, R8
	ORR	R6, R21
	STP.P	(R20, R21), 16(R0)
	SUB	$2, R1
loop:
	CBZ	R1, done
	LDP.P	32(R2), (R10, R11)
	LDP	-16(R2), (R12, R13)	// upper two words of the group just stepped over
	LSL	R4, R10, R20
	LSR	R3, R10
	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
	LSL	R4, R11, R21
	LSR	R3, R11
	ORR	R10, R21
	LSL	R4, R12, R22
	LSR	R3, R12
	ORR	R11, R22
	LSL	R4, R13, R23
	LSR	R3, R13, R8
	ORR	R12, R23
	STP.P	(R20, R21), 32(R0)
	STP	(R22, R23), -16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD	R8, (R0)	// deal with the last element
	RET
copy:
	// s == 0. If z and x start at the same address there is nothing to move.
	CMP	R0, R2
	BEQ	len0
	TBZ	$0, R1, ctwo
	MOVD.P	8(R2), R3
	MOVD.P	R3, 8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.P	16(R2), (R4, R5)
	STP.P	(R4, R5), 16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.P	32(R2), (R4, R5)
	LDP	-16(R2), (R6, R7)
	STP.P	(R4, R5), 32(R0)
	STP	(R6, R7), -16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW stores x*y + r into z: for each of the len(z) words, the low
// 64 bits of x[i]*y plus the incoming carry word go to z[i] and the high
// 64 bits (plus any carry from that addition) become the carry word for
// the next position. The carry word starts out as r and its final value is
// returned in c; for an empty z that is r unchanged.
//
// Register usage:
//	R0       number of words still to process
//	R1       write cursor into z
//	R2       read cursor into x
//	R3       y
//	R4       carry word between positions (initially r)
//	others   data words and partial products
//
// The length is consumed as one optional single word (bit 0 of the count),
// one optional pair (bit 1) and then groups of four words.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	r+56(FP), R4
	// c, z = x * y + r
	TBZ	$0, R0, two
	MOVD.P	8(R2), R5
	MUL	R3, R5, R7	// low half of x[i] * y
	UMULH	R3, R5, R8	// high half of x[i] * y
	ADDS	R4, R7
	ADC	$0, R8, R4	// c, z[i] = x[i] * y +  r
	MOVD.P	R7, 8(R1)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop
	LDP.P	16(R2), (R5, R6)
	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R12, R11	// high half of word 0 + low half of word 1 + carry
	ADC	$0, R13, R4

	STP.P	(R10, R11), 16(R1)
	SUB	$2, R0
loop:
	CBZ	R0, done
	LDP.P	32(R2), (R5, R6)
	LDP	-16(R2), (R7, R8)

	// One carry chain across the group: each result word is the low half of
	// its own product plus the high half of the product below it.
	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R11, R12

	MUL	R3, R7, R14
	UMULH	R3, R7, R15
	ADCS	R13, R14
	MUL	R3, R8, R16
	UMULH	R3, R8, R17
	ADCS	R15, R16
	ADC	$0, R17, R4

	STP.P	(R10, R12), 32(R1)
	STP	(R14, R16), -16(R1)
	SUB	$4, R0
	B	loop
done:
	MOVD	R4, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW adds x*y to z in place: for each of the len(z) words, z[i] is
// replaced by the low 64 bits of z[i] + x[i]*y + carry, and the overflow
// becomes the carry word for the next position. The carry word starts at 0
// and its final value is returned in c (0 for an empty z).
//
// Register usage:
//	R0       number of words still to process
//	R1       read/write cursor into z
//	R2       read cursor into x
//	R3       y
//	R4       carry word between positions
//	others   data words and partial products
//
// The length is consumed as one optional single word (bit 0 of the count),
// one optional pair (bit 1) and then groups of four words.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	$0, R4

	TBZ	$0, R0, two

	MOVD.P	8(R2), R5
	MOVD	(R1), R6

	MUL	R5, R3, R7	// low half of x[i] * y
	UMULH	R5, R3, R8	// high half of x[i] * y

	ADDS	R7, R6
	ADC	$0, R8, R4

	MOVD.P	R6, 8(R1)
	SUB	$1, R0

two:
	TBZ	$1, R0, loop

	LDP.P	16(R2), (R5, R10)
	LDP	(R1), (R6, R11)

	MUL	R10, R3, R13
	UMULH	R10, R3, R12

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	// First chain: incoming carry and the low half of the upper product.
	ADDS	R4, R6
	ADCS	R13, R11
	ADC	$0, R12

	// Second chain: both halves of the lower product.
	ADDS	R7, R6
	ADCS	R8, R11
	ADC	$0, R12, R4

	STP.P	(R6, R11), 16(R1)
	SUB	$2, R0

// The main loop of this code operates on a block of 4 words every iteration
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
// The sum is built with two carry chains: the first adds the incoming carry
// and the low halves of the three upper products, the second adds the lowest
// product and the high halves of the two middle ones. R20 collects the
// high half of the top product plus the carry out of each chain.
loop:
	CBZ	R0, done

	LDP.P	16(R2), (R5, R6)
	LDP.P	16(R2), (R7, R8)

	LDP	(R1), (R9, R10)
	ADDS	R4, R9
	MUL	R6, R3, R14
	ADCS	R14, R10
	MUL	R7, R3, R15
	LDP	16(R1), (R11, R12)
	ADCS	R15, R11
	MUL	R8, R3, R16
	ADCS	R16, R12
	UMULH	R8, R3, R20
	ADC	$0, R20

	MUL	R5, R3, R13
	ADDS	R13, R9
	UMULH	R5, R3, R17
	ADCS	R17, R10
	UMULH	R6, R3, R21
	STP.P	(R9, R10), 16(R1)
	ADCS	R21, R11
	UMULH	R7, R3, R19
	ADCS	R19, R12
	STP.P	(R11, R12), 16(R1)
	ADC	$0, R20, R4

	SUB	$4, R0
	B	loop

done:
	MOVD	R4, c+56(FP)
	RET


