1// Copyright 2013 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build !math_big_pure_go 6 7#include "textflag.h" 8 9// This file provides fast assembly versions for the elementary 10// arithmetic operations on vectors implemented in arith.go. 11 12// TODO: Consider re-implementing using Advanced SIMD 13// once the assembler supports those instructions. 14 15// func addVV(z, x, y []Word) (c Word) 16TEXT ·addVV(SB),NOSPLIT,$0 17 MOVD z_len+8(FP), R0 18 MOVD x+24(FP), R8 19 MOVD y+48(FP), R9 20 MOVD z+0(FP), R10 21 ADDS $0, R0 // clear carry flag 22 TBZ $0, R0, two 23 MOVD.P 8(R8), R11 24 MOVD.P 8(R9), R15 25 ADCS R15, R11 26 MOVD.P R11, 8(R10) 27 SUB $1, R0 28two: 29 TBZ $1, R0, loop 30 LDP.P 16(R8), (R11, R12) 31 LDP.P 16(R9), (R15, R16) 32 ADCS R15, R11 33 ADCS R16, R12 34 STP.P (R11, R12), 16(R10) 35 SUB $2, R0 36loop: 37 CBZ R0, done // careful not to touch the carry flag 38 LDP.P 32(R8), (R11, R12) 39 LDP -16(R8), (R13, R14) 40 LDP.P 32(R9), (R15, R16) 41 LDP -16(R9), (R17, R19) 42 ADCS R15, R11 43 ADCS R16, R12 44 ADCS R17, R13 45 ADCS R19, R14 46 STP.P (R11, R12), 32(R10) 47 STP (R13, R14), -16(R10) 48 SUB $4, R0 49 B loop 50done: 51 CSET HS, R0 // extract carry flag 52 MOVD R0, c+72(FP) 53 RET 54 55 56// func subVV(z, x, y []Word) (c Word) 57TEXT ·subVV(SB),NOSPLIT,$0 58 MOVD z_len+8(FP), R0 59 MOVD x+24(FP), R8 60 MOVD y+48(FP), R9 61 MOVD z+0(FP), R10 62 CMP R0, R0 // set carry flag 63 TBZ $0, R0, two 64 MOVD.P 8(R8), R11 65 MOVD.P 8(R9), R15 66 SBCS R15, R11 67 MOVD.P R11, 8(R10) 68 SUB $1, R0 69two: 70 TBZ $1, R0, loop 71 LDP.P 16(R8), (R11, R12) 72 LDP.P 16(R9), (R15, R16) 73 SBCS R15, R11 74 SBCS R16, R12 75 STP.P (R11, R12), 16(R10) 76 SUB $2, R0 77loop: 78 CBZ R0, done // careful not to touch the carry flag 79 LDP.P 32(R8), (R11, R12) 80 LDP -16(R8), (R13, R14) 81 LDP.P 32(R9), (R15, R16) 82 LDP -16(R9), (R17, R19) 83 SBCS R15, R11 84 SBCS R16, R12 85 SBCS R17, R13 86 SBCS R19, R14 87 STP.P (R11, R12), 32(R10) 88 STP (R13, R14), -16(R10) 89 SUB $4, R0 90 B loop 91done: 92 CSET LO, R0 // extract carry flag 93 MOVD R0, c+72(FP) 94 RET 95 96#define vwOneOp(instr, op1) \ 97 MOVD.P 8(R1), R4; \ 98 instr op1, R4; \ 99 MOVD.P R4, 8(R3); 100 101// handle the first 1~4 elements before starting iteration in addVW/subVW 102#define vwPreIter(instr1, instr2, counter, target) \ 103 vwOneOp(instr1, R2); \ 104 SUB $1, counter; \ 105 CBZ counter, target; \ 106 vwOneOp(instr2, $0); \ 107 SUB $1, counter; \ 108 CBZ counter, target; \ 109 vwOneOp(instr2, $0); \ 110 SUB $1, counter; \ 111 CBZ counter, target; \ 112 vwOneOp(instr2, $0); 113 114// do one iteration of add or sub in addVW/subVW 115#define vwOneIter(instr, counter, exit) \ 116 CBZ counter, exit; \ // careful not to touch the carry flag 117 LDP.P 32(R1), (R4, R5); \ 118 LDP -16(R1), (R6, R7); \ 119 instr $0, R4, R8; \ 120 instr $0, R5, R9; \ 121 instr $0, R6, R10; \ 122 instr $0, R7, R11; \ 123 STP.P (R8, R9), 32(R3); \ 124 STP (R10, R11), -16(R3); \ 125 SUB $4, counter; 126 127// do one iteration of copy in addVW/subVW 128#define vwOneIterCopy(counter, exit) \ 129 CBZ counter, exit; \ 130 LDP.P 32(R1), (R4, R5); \ 131 LDP -16(R1), (R6, R7); \ 132 STP.P (R4, R5), 32(R3); \ 133 STP (R6, R7), -16(R3); \ 134 SUB $4, counter; 135 136// func addVW(z, x []Word, y Word) (c Word) 137// The 'large' branch handles large 'z'. It checks the carry flag on every iteration 138// and switches to copy if we are done with carries. The copying is skipped as well 139// if 'x' and 'z' happen to share the same underlying storage. 140// The overhead of the checking and branching is visible when 'z' are small (~5%), 141// so set a threshold of 32, and remain the small-sized part entirely untouched. 142TEXT ·addVW(SB),NOSPLIT,$0 143 MOVD z+0(FP), R3 144 MOVD z_len+8(FP), R0 145 MOVD x+24(FP), R1 146 MOVD y+48(FP), R2 147 CMP $32, R0 148 BGE large // large-sized 'z' and 'x' 149 CBZ R0, len0 // the length of z is 0 150 MOVD.P 8(R1), R4 151 ADDS R2, R4 // z[0] = x[0] + y, set carry 152 MOVD.P R4, 8(R3) 153 SUB $1, R0 154 CBZ R0, len1 // the length of z is 1 155 TBZ $0, R0, two 156 MOVD.P 8(R1), R4 // do it once 157 ADCS $0, R4 158 MOVD.P R4, 8(R3) 159 SUB $1, R0 160two: // do it twice 161 TBZ $1, R0, loop 162 LDP.P 16(R1), (R4, R5) 163 ADCS $0, R4, R8 // c, z[i] = x[i] + c 164 ADCS $0, R5, R9 165 STP.P (R8, R9), 16(R3) 166 SUB $2, R0 167loop: // do four times per round 168 vwOneIter(ADCS, R0, len1) 169 B loop 170len1: 171 CSET HS, R2 // extract carry flag 172len0: 173 MOVD R2, c+56(FP) 174done: 175 RET 176large: 177 AND $0x3, R0, R10 178 AND $~0x3, R0 179 // unrolling for the first 1~4 elements to avoid saving the carry 180 // flag in each step, adjust $R0 if we unrolled 4 elements 181 vwPreIter(ADDS, ADCS, R10, add4) 182 SUB $4, R0 183add4: 184 BCC copy 185 vwOneIter(ADCS, R0, len1) 186 B add4 187copy: 188 MOVD ZR, c+56(FP) 189 CMP R1, R3 190 BEQ done 191copy_4: // no carry flag, copy the rest 192 vwOneIterCopy(R0, done) 193 B copy_4 194 195// func subVW(z, x []Word, y Word) (c Word) 196// The 'large' branch handles large 'z'. It checks the carry flag on every iteration 197// and switches to copy if we are done with carries. The copying is skipped as well 198// if 'x' and 'z' happen to share the same underlying storage. 199// The overhead of the checking and branching is visible when 'z' are small (~5%), 200// so set a threshold of 32, and remain the small-sized part entirely untouched. 201TEXT ·subVW(SB),NOSPLIT,$0 202 MOVD z+0(FP), R3 203 MOVD z_len+8(FP), R0 204 MOVD x+24(FP), R1 205 MOVD y+48(FP), R2 206 CMP $32, R0 207 BGE large // large-sized 'z' and 'x' 208 CBZ R0, len0 // the length of z is 0 209 MOVD.P 8(R1), R4 210 SUBS R2, R4 // z[0] = x[0] - y, set carry 211 MOVD.P R4, 8(R3) 212 SUB $1, R0 213 CBZ R0, len1 // the length of z is 1 214 TBZ $0, R0, two // do it once 215 MOVD.P 8(R1), R4 216 SBCS $0, R4 217 MOVD.P R4, 8(R3) 218 SUB $1, R0 219two: // do it twice 220 TBZ $1, R0, loop 221 LDP.P 16(R1), (R4, R5) 222 SBCS $0, R4, R8 // c, z[i] = x[i] + c 223 SBCS $0, R5, R9 224 STP.P (R8, R9), 16(R3) 225 SUB $2, R0 226loop: // do four times per round 227 vwOneIter(SBCS, R0, len1) 228 B loop 229len1: 230 CSET LO, R2 // extract carry flag 231len0: 232 MOVD R2, c+56(FP) 233done: 234 RET 235large: 236 AND $0x3, R0, R10 237 AND $~0x3, R0 238 // unrolling for the first 1~4 elements to avoid saving the carry 239 // flag in each step, adjust $R0 if we unrolled 4 elements 240 vwPreIter(SUBS, SBCS, R10, sub4) 241 SUB $4, R0 242sub4: 243 BCS copy 244 vwOneIter(SBCS, R0, len1) 245 B sub4 246copy: 247 MOVD ZR, c+56(FP) 248 CMP R1, R3 249 BEQ done 250copy_4: // no carry flag, copy the rest 251 vwOneIterCopy(R0, done) 252 B copy_4 253 254// func shlVU(z, x []Word, s uint) (c Word) 255// This implementation handles the shift operation from the high word to the low word, 256// which may be an error for the case where the low word of x overlaps with the high 257// word of z. When calling this function directly, you need to pay attention to this 258// situation. 259TEXT ·shlVU(SB),NOSPLIT,$0 260 LDP z+0(FP), (R0, R1) // R0 = z.ptr, R1 = len(z) 261 MOVD x+24(FP), R2 262 MOVD s+48(FP), R3 263 ADD R1<<3, R0 // R0 = &z[n] 264 ADD R1<<3, R2 // R2 = &x[n] 265 CBZ R1, len0 266 CBZ R3, copy // if the number of shift is 0, just copy x to z 267 MOVD $64, R4 268 SUB R3, R4 269 // handling the most significant element x[n-1] 270 MOVD.W -8(R2), R6 271 LSR R4, R6, R5 // return value 272 LSL R3, R6, R8 // x[i] << s 273 SUB $1, R1 274one: TBZ $0, R1, two 275 MOVD.W -8(R2), R6 276 LSR R4, R6, R7 277 ORR R8, R7 278 LSL R3, R6, R8 279 SUB $1, R1 280 MOVD.W R7, -8(R0) 281two: 282 TBZ $1, R1, loop 283 LDP.W -16(R2), (R6, R7) 284 LSR R4, R7, R10 285 ORR R8, R10 286 LSL R3, R7 287 LSR R4, R6, R9 288 ORR R7, R9 289 LSL R3, R6, R8 290 SUB $2, R1 291 STP.W (R9, R10), -16(R0) 292loop: 293 CBZ R1, done 294 LDP.W -32(R2), (R10, R11) 295 LDP 16(R2), (R12, R13) 296 LSR R4, R13, R23 297 ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s)) 298 LSL R3, R13 299 LSR R4, R12, R22 300 ORR R13, R22 301 LSL R3, R12 302 LSR R4, R11, R21 303 ORR R12, R21 304 LSL R3, R11 305 LSR R4, R10, R20 306 ORR R11, R20 307 LSL R3, R10, R8 308 STP.W (R20, R21), -32(R0) 309 STP (R22, R23), 16(R0) 310 SUB $4, R1 311 B loop 312done: 313 MOVD.W R8, -8(R0) // the first element x[0] 314 MOVD R5, c+56(FP) // the part moved out from x[n-1] 315 RET 316copy: 317 CMP R0, R2 318 BEQ len0 319 TBZ $0, R1, ctwo 320 MOVD.W -8(R2), R4 321 MOVD.W R4, -8(R0) 322 SUB $1, R1 323ctwo: 324 TBZ $1, R1, cloop 325 LDP.W -16(R2), (R4, R5) 326 STP.W (R4, R5), -16(R0) 327 SUB $2, R1 328cloop: 329 CBZ R1, len0 330 LDP.W -32(R2), (R4, R5) 331 LDP 16(R2), (R6, R7) 332 STP.W (R4, R5), -32(R0) 333 STP (R6, R7), 16(R0) 334 SUB $4, R1 335 B cloop 336len0: 337 MOVD $0, c+56(FP) 338 RET 339 340// func shrVU(z, x []Word, s uint) (c Word) 341// This implementation handles the shift operation from the low word to the high word, 342// which may be an error for the case where the high word of x overlaps with the low 343// word of z. When calling this function directly, you need to pay attention to this 344// situation. 345TEXT ·shrVU(SB),NOSPLIT,$0 346 MOVD z+0(FP), R0 347 MOVD z_len+8(FP), R1 348 MOVD x+24(FP), R2 349 MOVD s+48(FP), R3 350 MOVD $0, R8 351 MOVD $64, R4 352 SUB R3, R4 353 CBZ R1, len0 354 CBZ R3, copy // if the number of shift is 0, just copy x to z 355 356 MOVD.P 8(R2), R20 357 LSR R3, R20, R8 358 LSL R4, R20 359 MOVD R20, c+56(FP) // deal with the first element 360 SUB $1, R1 361 362 TBZ $0, R1, two 363 MOVD.P 8(R2), R6 364 LSL R4, R6, R20 365 ORR R8, R20 366 LSR R3, R6, R8 367 MOVD.P R20, 8(R0) 368 SUB $1, R1 369two: 370 TBZ $1, R1, loop 371 LDP.P 16(R2), (R6, R7) 372 LSL R4, R6, R20 373 LSR R3, R6 374 ORR R8, R20 375 LSL R4, R7, R21 376 LSR R3, R7, R8 377 ORR R6, R21 378 STP.P (R20, R21), 16(R0) 379 SUB $2, R1 380loop: 381 CBZ R1, done 382 LDP.P 32(R2), (R10, R11) 383 LDP -16(R2), (R12, R13) 384 LSL R4, R10, R20 385 LSR R3, R10 386 ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s)) 387 LSL R4, R11, R21 388 LSR R3, R11 389 ORR R10, R21 390 LSL R4, R12, R22 391 LSR R3, R12 392 ORR R11, R22 393 LSL R4, R13, R23 394 LSR R3, R13, R8 395 ORR R12, R23 396 STP.P (R20, R21), 32(R0) 397 STP (R22, R23), -16(R0) 398 SUB $4, R1 399 B loop 400done: 401 MOVD R8, (R0) // deal with the last element 402 RET 403copy: 404 CMP R0, R2 405 BEQ len0 406 TBZ $0, R1, ctwo 407 MOVD.P 8(R2), R3 408 MOVD.P R3, 8(R0) 409 SUB $1, R1 410ctwo: 411 TBZ $1, R1, cloop 412 LDP.P 16(R2), (R4, R5) 413 STP.P (R4, R5), 16(R0) 414 SUB $2, R1 415cloop: 416 CBZ R1, len0 417 LDP.P 32(R2), (R4, R5) 418 LDP -16(R2), (R6, R7) 419 STP.P (R4, R5), 32(R0) 420 STP (R6, R7), -16(R0) 421 SUB $4, R1 422 B cloop 423len0: 424 MOVD $0, c+56(FP) 425 RET 426 427 428// func mulAddVWW(z, x []Word, y, r Word) (c Word) 429TEXT ·mulAddVWW(SB),NOSPLIT,$0 430 MOVD z+0(FP), R1 431 MOVD z_len+8(FP), R0 432 MOVD x+24(FP), R2 433 MOVD y+48(FP), R3 434 MOVD r+56(FP), R4 435 // c, z = x * y + r 436 TBZ $0, R0, two 437 MOVD.P 8(R2), R5 438 MUL R3, R5, R7 439 UMULH R3, R5, R8 440 ADDS R4, R7 441 ADC $0, R8, R4 // c, z[i] = x[i] * y + r 442 MOVD.P R7, 8(R1) 443 SUB $1, R0 444two: 445 TBZ $1, R0, loop 446 LDP.P 16(R2), (R5, R6) 447 MUL R3, R5, R10 448 UMULH R3, R5, R11 449 ADDS R4, R10 450 MUL R3, R6, R12 451 UMULH R3, R6, R13 452 ADCS R12, R11 453 ADC $0, R13, R4 454 455 STP.P (R10, R11), 16(R1) 456 SUB $2, R0 457loop: 458 CBZ R0, done 459 LDP.P 32(R2), (R5, R6) 460 LDP -16(R2), (R7, R8) 461 462 MUL R3, R5, R10 463 UMULH R3, R5, R11 464 ADDS R4, R10 465 MUL R3, R6, R12 466 UMULH R3, R6, R13 467 ADCS R11, R12 468 469 MUL R3, R7, R14 470 UMULH R3, R7, R15 471 ADCS R13, R14 472 MUL R3, R8, R16 473 UMULH R3, R8, R17 474 ADCS R15, R16 475 ADC $0, R17, R4 476 477 STP.P (R10, R12), 32(R1) 478 STP (R14, R16), -16(R1) 479 SUB $4, R0 480 B loop 481done: 482 MOVD R4, c+64(FP) 483 RET 484 485 486// func addMulVVW(z, x []Word, y Word) (c Word) 487TEXT ·addMulVVW(SB),NOSPLIT,$0 488 MOVD z+0(FP), R1 489 MOVD z_len+8(FP), R0 490 MOVD x+24(FP), R2 491 MOVD y+48(FP), R3 492 MOVD $0, R4 493 494 TBZ $0, R0, two 495 496 MOVD.P 8(R2), R5 497 MOVD (R1), R6 498 499 MUL R5, R3, R7 500 UMULH R5, R3, R8 501 502 ADDS R7, R6 503 ADC $0, R8, R4 504 505 MOVD.P R6, 8(R1) 506 SUB $1, R0 507 508two: 509 TBZ $1, R0, loop 510 511 LDP.P 16(R2), (R5, R10) 512 LDP (R1), (R6, R11) 513 514 MUL R10, R3, R13 515 UMULH R10, R3, R12 516 517 MUL R5, R3, R7 518 UMULH R5, R3, R8 519 520 ADDS R4, R6 521 ADCS R13, R11 522 ADC $0, R12 523 524 ADDS R7, R6 525 ADCS R8, R11 526 ADC $0, R12, R4 527 528 STP.P (R6, R11), 16(R1) 529 SUB $2, R0 530 531// The main loop of this code operates on a block of 4 words every iteration 532// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9] 533// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next 534// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z. 535loop: 536 CBZ R0, done 537 538 LDP.P 16(R2), (R5, R6) 539 LDP.P 16(R2), (R7, R8) 540 541 LDP (R1), (R9, R10) 542 ADDS R4, R9 543 MUL R6, R3, R14 544 ADCS R14, R10 545 MUL R7, R3, R15 546 LDP 16(R1), (R11, R12) 547 ADCS R15, R11 548 MUL R8, R3, R16 549 ADCS R16, R12 550 UMULH R8, R3, R20 551 ADC $0, R20 552 553 MUL R5, R3, R13 554 ADDS R13, R9 555 UMULH R5, R3, R17 556 ADCS R17, R10 557 UMULH R6, R3, R21 558 STP.P (R9, R10), 16(R1) 559 ADCS R21, R11 560 UMULH R7, R3, R19 561 ADCS R19, R12 562 STP.P (R11, R12), 16(R1) 563 ADC $0, R20, R4 564 565 SUB $4, R0 566 B loop 567 568done: 569 MOVD R4, c+56(FP) 570 RET 571 572 573