1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go
6
7#include "textflag.h"
8
9// This file provides fast assembly versions for the elementary
10// arithmetic operations on vectors implemented in arith.go.
11
// func addVV(z, x, y []Word) (c Word)
//
// Word-wise vector addition with carry propagation:
//	z[i] = x[i] + y[i] + carry   for i = 0 .. len(z)-1
// The carry out of the last word (0 or 1) is returned in c.
// Only len(z) is read; x and y are indexed with the same count.
//
// Register roles:
//	DI = &z[0]   SI = &x[0]   CX = &y[0]
//	BP = n       BX = i       AX = current word
//	DX = carry, kept as 0 or -1 while the loop is running
TEXT ·addVV(SB),NOSPLIT,$0
	MOVL z_base+0(FP), DI
	MOVL x_base+12(FP), SI
	MOVL y_base+24(FP), CX
	MOVL z_len+4(FP), BP		// n = len(z)
	XORL BX, BX			// i = 0
	XORL DX, DX			// carry = 0
	TESTL BP, BP
	JLE addvv_done			// n <= 0: no words to process

addvv_loop:
	MOVL (SI)(BX*4), AX		// AX = x[i]
	ADDL DX, DX			// DX is 0 or -1: doubling it reloads CF with the saved carry
	ADCL (CX)(BX*4), AX		// AX = x[i] + y[i] + CF
	SBBL DX, DX			// DX = -CF: park the carry before the loop control clobbers flags
	MOVL AX, (DI)(BX*4)		// z[i] = AX
	INCL BX				// i++
	CMPL BX, BP
	JLT addvv_loop			// repeat while i < n

addvv_done:
	NEGL DX				// turn 0/-1 into 0/1
	MOVL DX, c+36(FP)
	RET
35
36
// func subVV(z, x, y []Word) (c Word)
//
// Word-wise vector subtraction with borrow propagation:
//	z[i] = x[i] - y[i] - borrow   for i = 0 .. len(z)-1
// The borrow out of the last word (0 or 1) is returned in c.
// Only len(z) is read; x and y are indexed with the same count.
//
// Register roles:
//	DI = &z[0]   SI = &x[0]   CX = &y[0]
//	BP = n       BX = i       AX = current word
//	DX = borrow, kept as 0 or -1 while the loop is running
TEXT ·subVV(SB),NOSPLIT,$0
	MOVL z_base+0(FP), DI
	MOVL x_base+12(FP), SI
	MOVL y_base+24(FP), CX
	MOVL z_len+4(FP), BP		// n = len(z)
	XORL BX, BX			// i = 0
	XORL DX, DX			// borrow = 0
	TESTL BP, BP
	JLE subvv_done			// n <= 0: no words to process

subvv_loop:
	MOVL (SI)(BX*4), AX		// AX = x[i]
	ADDL DX, DX			// DX is 0 or -1: doubling it reloads CF with the saved borrow
	SBBL (CX)(BX*4), AX		// AX = x[i] - y[i] - CF
	SBBL DX, DX			// DX = -CF: park the borrow before the loop control clobbers flags
	MOVL AX, (DI)(BX*4)		// z[i] = AX
	INCL BX				// i++
	CMPL BX, BP
	JLT subvv_loop			// repeat while i < n

subvv_done:
	NEGL DX				// turn 0/-1 into 0/1
	MOVL DX, c+36(FP)
	RET
61
62
// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y to the vector x, storing the result in z:
// y is added to x[0] and each carry out is added to the next word.
// The carry out of the last word is returned in c; when len(z) <= 0
// no word is touched and c is simply y.
//
// Register roles:
//	DI = &z[0]   SI = &x[0]   BP = n   BX = i
//	AX = running carry (starts as y, afterwards 0 or 1)
TEXT ·addVW(SB),NOSPLIT,$0
	MOVL z_base+0(FP), DI
	MOVL x_base+12(FP), SI
	MOVL z_len+4(FP), BP		// n = len(z)
	MOVL y+24(FP), AX		// carry = y
	XORL BX, BX			// i = 0
	TESTL BP, BP
	JLE addvw_done			// n <= 0: return y unchanged

addvw_loop:
	ADDL (SI)(BX*4), AX		// AX = x[i] + carry, CF = carry out
	MOVL AX, (DI)(BX*4)		// z[i] = AX (the store leaves CF intact)
	SBBL AX, AX			// AX = -CF
	NEGL AX				// AX = CF, the carry into the next word
	INCL BX				// i++
	CMPL BX, BP
	JLT addvw_loop			// repeat while i < n

addvw_done:
	MOVL AX, c+28(FP)
	RET
83
84
// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from the vector x, storing the result in z:
// y is subtracted from x[0] and each borrow is subtracted from the next word.
// The borrow out of the last word is returned in c; when len(z) <= 0
// no word is touched and c is simply y.
//
// Register roles:
//	DI = &z[0]   SI = &x[0]   BP = n   BX = i
//	AX = running borrow (starts as y, afterwards 0 or 1)
//	DX = current word
TEXT ·subVW(SB),NOSPLIT,$0
	MOVL z_base+0(FP), DI
	MOVL x_base+12(FP), SI
	MOVL z_len+4(FP), BP		// n = len(z)
	MOVL y+24(FP), AX		// borrow = y
	XORL BX, BX			// i = 0
	TESTL BP, BP
	JLE subvw_done			// n <= 0: return y unchanged

subvw_loop:
	MOVL (SI)(BX*4), DX		// DX = x[i]
	SUBL AX, DX			// DX = x[i] - borrow, CF = borrow out
	MOVL DX, (DI)(BX*4)		// z[i] = DX (the store leaves CF intact)
	SBBL AX, AX			// AX = -CF
	NEGL AX				// AX = CF, the borrow into the next word
	INCL BX				// i++
	CMPL BX, BP
	JLT subvw_loop			// repeat while i < n

subvw_done:
	MOVL AX, c+28(FP)
	RET
106
107
// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the len(z)-word vector x left by s bits into z and returns in c
// the bits shifted out of the top word x[n-1]. The vector is walked from
// the highest index down to 0. When len(z) <= 0 nothing is stored and
// c = 0.
//
// SHLL CX, AX, DX is the double-word shift: DX is shifted left by CX and
// the vacated low bits are filled from the high bits of AX.
//
// Register roles:
//	DI = &z[0]   SI = &x[0]   CX = s   BX = i
//	AX = w1 (the lower neighbour word)   DX = w (word being assembled)
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVL z_len+4(FP), BX		// i = n
	DECL BX				// i = n-1
	JGE shl_nonempty		// n > 0

	// n <= 0: empty vector
	MOVL $0, c+28(FP)
	RET

shl_nonempty:
	MOVL s+24(FP), CX
	MOVL x_base+12(FP), SI
	MOVL z_base+0(FP), DI
	MOVL (SI)(BX*4), AX		// w1 = x[n-1]
	XORL DX, DX
	SHLL CX, AX, DX			// DX = bits of w1 that fall off the top
	MOVL DX, c+28(FP)

	TESTL BX, BX
	JLE shl_last			// i <= 0: only z[0] is left to write

shl_loop:
	MOVL AX, DX			// w = w1
	MOVL -4(SI)(BX*4), AX		// w1 = x[i-1]
	SHLL CX, AX, DX			// w = w<<s | high bits of w1
	MOVL DX, (DI)(BX*4)		// z[i] = w
	DECL BX				// i--
	JGT shl_loop			// repeat while i > 0

shl_last:
	SHLL CX, AX			// lowest word has no neighbour: zeros shift in
	MOVL AX, (DI)			// z[0] = w1<<s
	RET
141
142
// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the len(z)-word vector x right by s bits into z and returns in c
// the bits shifted out of the bottom word x[0], placed at the top of the
// result word. The vector is walked from index 0 upward. When
// len(z) <= 0 nothing is stored and c = 0.
//
// SHRL CX, AX, DX is the double-word shift: DX is shifted right by CX and
// the vacated high bits are filled from the low bits of AX.
//
// Register roles:
//	DI = &z[0]   SI = &x[0]   CX = s   BX = i   BP = n-1
//	AX = w1 (the upper neighbour word)   DX = w (word being assembled)
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVL z_len+4(FP), BP
	DECL BP				// BP = n-1
	JGE shr_nonempty		// n > 0

	// n <= 0: empty vector
	MOVL $0, c+28(FP)
	RET

shr_nonempty:
	MOVL s+24(FP), CX
	MOVL x_base+12(FP), SI
	MOVL z_base+0(FP), DI
	MOVL (SI), AX			// w1 = x[0]
	XORL DX, DX
	SHRL CX, AX, DX			// DX = bits of w1 that fall off the bottom
	MOVL DX, c+28(FP)

	XORL BX, BX			// i = 0
	TESTL BP, BP
	JLE shr_last			// n-1 <= 0: only z[n-1] is left to write

shr_loop:
	MOVL AX, DX			// w = w1
	MOVL 4(SI)(BX*4), AX		// w1 = x[i+1]
	SHRL CX, AX, DX			// w = w>>s | low bits of w1
	MOVL DX, (DI)(BX*4)		// z[i] = w
	INCL BX				// i++
	CMPL BX, BP
	JLT shr_loop			// repeat while i < n-1

shr_last:
	SHRL CX, AX			// highest word has no neighbour: zeros shift in
	MOVL AX, (DI)(BP*4)		// z[n-1] = w1>>s
	RET
178
179
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Multiplies the vector x by the single word y and adds r:
//	z[i] = low word of (x[i]*y + carry),  carry = high word
// with the carry starting at r. The final carry is returned in c; when
// len(z) <= 0 no word is touched and c is simply r.
//
// Both base pointers are advanced to one past the last element so that a
// negative index counting up to zero serves as index and loop counter.
//
// Register roles:
//	DI = &z[n]   SI = &x[n]   BP = y   BX = i (from -n up to 0)
//	CX = carry   DX:AX = 64-bit product of the current step
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVL y+24(FP), BP
	MOVL r+28(FP), CX		// carry = r
	MOVL z_base+0(FP), DI
	MOVL x_base+12(FP), SI
	MOVL z_len+4(FP), BX		// n = len(z)
	LEAL (DI)(BX*4), DI		// DI = &z[n]
	LEAL (SI)(BX*4), SI		// SI = &x[n]
	NEGL BX				// i = -n
	TESTL BX, BX
	JGE muladd_done			// i >= 0: no words to process

muladd_loop:
	MOVL (SI)(BX*4), AX		// AX = x[n+i]
	MULL BP				// DX:AX = x[n+i] * y
	ADDL CX, AX			// low word += carry
	ADCL $0, DX			// propagate the overflow into the high word
	MOVL DX, CX			// carry = high word
	MOVL AX, (DI)(BX*4)		// z[n+i] = low word
	INCL BX				// i++
	TESTL BX, BX
	JLT muladd_loop			// repeat while i < 0

muladd_done:
	MOVL CX, c+32(FP)
	RET
205
206
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Multiply-accumulate of a vector by a single word, in place on z:
//	z[i] += low word of (x[i]*y + carry),  carry = high word plus overflow
// with the carry starting at 0. The final carry is returned in c; when
// len(z) <= 0 no word is touched and c = 0.
//
// Both base pointers are advanced to one past the last element so that a
// negative index counting up to zero serves as index and loop counter.
//
// Register roles:
//	DI = &z[n]   SI = &x[n]   BP = y   BX = i (from -n up to 0)
//	CX = carry   DX:AX = 64-bit product of the current step
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVL y+24(FP), BP
	MOVL z_base+0(FP), DI
	MOVL x_base+12(FP), SI
	MOVL z_len+4(FP), BX		// n = len(z)
	LEAL (DI)(BX*4), DI		// DI = &z[n]
	LEAL (SI)(BX*4), SI		// SI = &x[n]
	XORL CX, CX			// carry = 0
	NEGL BX				// i = -n
	TESTL BX, BX
	JGE addmul_done			// i >= 0: no words to process

addmul_loop:
	MOVL (SI)(BX*4), AX		// AX = x[n+i]
	MULL BP				// DX:AX = x[n+i] * y
	ADDL CX, AX			// low word += carry
	ADCL $0, DX			// propagate that overflow into the high word
	ADDL AX, (DI)(BX*4)		// z[n+i] += low word
	ADCL $0, DX			// propagate the accumulate overflow as well
	MOVL DX, CX			// carry = high word
	INCL BX				// i++
	TESTL BX, BX
	JLT addmul_loop			// repeat while i < 0

addmul_done:
	MOVL CX, c+28(FP)
	RET
233
234
235
236