130 lines
2.2 KiB
ArmAsm
130 lines
2.2 KiB
ArmAsm
|
// +build amd64,!generic
|
||
|
|
||
|
// storeBlock(a0,a1,a2,a3, r) writes the four 64-bit values a0..a3 to the
// 32 bytes of memory addressed by r, at byte offsets 0, 8, 16 and 24.
// r must be a memory operand that accepts a leading constant offset
// (for example 0(DI) or 32(SP)), because the offset is pasted in front
// of it textually. Only MOVQ is used, so the macro itself does not write
// the flags.
#define storeBlock(a0,a1,a2,a3, r) \
	MOVQ a0,  0+r \
	MOVQ a1,  8+r \
	MOVQ a2, 16+r \
	MOVQ a3, 24+r
|
||
|
|
||
|
// loadBlock(r, a0,a1,a2,a3) reads four consecutive 64-bit words from the
// memory addressed by r (byte offsets 0, 8, 16 and 24) into a0..a3.
// It is the mirror image of storeBlock: r must be a memory operand that
// accepts a leading constant offset (for example 0(DI)). Only MOVQ is
// used, so the macro itself does not write the flags.
#define loadBlock(r, a0,a1,a2,a3) \
	MOVQ  0+r, a0 \
	MOVQ  8+r, a1 \
	MOVQ 16+r, a2 \
	MOVQ 24+r, a3
|
||
|
|
||
|
// gfpCarry(a0..a4, b0..b4) performs one conditional subtraction of the
// four-word constant ·p2 (referred to as p below) from the five-word
// value a, where a0 is the least significant word and a4 the most
// significant.
//
//   b = a - p            (borrow propagated through all five words)
//   if the subtraction did not borrow: a0..a3 = b0..b3
//   otherwise:                         a0..a3 are left unchanged
//
// Outputs:  a0..a3 (a4 is read but never written back).
// Clobbers: b0..b4 and the flags.
// The CMOVQCC group consumes the carry flag produced by the final SBBQ,
// so no flag-writing instruction may be inserted between them.
#define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \
	\ // b = a-p
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	MOVQ a4, b4 \
	\
	SUBQ ·p2+0(SB), b0 \
	SBBQ ·p2+8(SB), b1 \
	SBBQ ·p2+16(SB), b2 \
	SBBQ ·p2+24(SB), b3 \
	SBBQ $0, b4 \
	\
	\ // if b is negative then return a
	\ // else return b
	CMOVQCC b0, a0 \
	CMOVQCC b1, a1 \
	CMOVQCC b2, a2 \
	CMOVQCC b3, a3
|
||
|
|
||
|
#include "mul_amd64.h"
|
||
|
#include "mul_bmi2_amd64.h"
|
||
|
|
||
|
// gfpNeg computes c = p - a, followed by one conditional subtraction of p,
// where p is the four-word constant ·p2.
//
// Arguments (16 bytes, no local frame):
//   c+0(FP)  pointer to the four-word result
//   a+8(FP)  pointer to the four-word operand
// Words are stored least significant first (the borrow chain starts at
// offset 0).
//
// NOTE(review): the borrow out of p - a is discarded, so this presumably
// expects a <= p on entry; confirm against the callers.
//
// Registers used: R8-R11 (result), R12-R15, AX, BX (scratch), DI (pointer).
TEXT ·gfpNeg(SB),0,$0-16
	// R8..R11 = p
	MOVQ ·p2+0(SB), R8
	MOVQ ·p2+8(SB), R9
	MOVQ ·p2+16(SB), R10
	MOVQ ·p2+24(SB), R11

	// R8..R11 = p - a
	MOVQ a+8(FP), DI
	SUBQ 0(DI), R8
	SBBQ 8(DI), R9
	SBBQ 16(DI), R10
	SBBQ 24(DI), R11

	// Treat the difference as a five-word value with a zero top word and
	// subtract p once more if that does not borrow (this maps p - 0 = p
	// back to 0).
	MOVQ $0, AX
	gfpCarry(R8,R9,R10,R11,AX, R12,R13,R14,R15,BX)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET
|
||
|
|
||
|
// gfpAdd computes c = a + b, followed by one conditional subtraction of
// the four-word constant ·p2 (done by gfpCarry).
//
// Arguments (24 bytes, no local frame):
//   c+0(FP)   pointer to the four-word result
//   a+8(FP)   pointer to the first four-word operand
//   b+16(FP)  pointer to the second four-word operand
// Words are stored least significant first (the carry chain starts at
// offset 0).
//
// Registers used: R8-R11 (sum), R12 (carry-out word), R13-R15, AX, BX
// (scratch for gfpCarry), DI, SI (pointers).
TEXT ·gfpAdd(SB),0,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	// R8..R11 = a; R12 is cleared before the add chain so that it can
	// collect the final carry.
	loadBlock(0(DI), R8,R9,R10,R11)
	MOVQ $0, R12

	// R12:R11:R10:R9:R8 = a + b (five words including the carry)
	ADDQ  0(SI), R8
	ADCQ  8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ $0, R12

	// Subtract the modulus once if the five-word sum is at least as large.
	gfpCarry(R8,R9,R10,R11,R12, R13,R14,R15,AX,BX)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET
|
||
|
|
||
|
// gfpSub computes c = a - b and, if that subtraction borrowed, adds the
// four-word constant ·p2 back in. The correction is selected with
// conditional moves rather than a branch.
//
// Arguments (24 bytes, no local frame):
//   c+0(FP)   pointer to the four-word result
//   a+8(FP)   pointer to the minuend (four words)
//   b+16(FP)  pointer to the subtrahend (four words)
// Words are stored least significant first (the borrow chain starts at
// offset 0).
//
// Registers used: R8-R11 (difference), R12-R15 (p or 0), AX (zero),
// DI, SI (pointers).
TEXT ·gfpSub(SB),0,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11)

	// Prepare the correction term (p) and a zero source for the
	// conditional moves. All of this happens before the subtraction so
	// that nothing sits between the SBBQ chain and the CMOVQCC group that
	// reads its carry flag.
	MOVQ ·p2+0(SB), R12
	MOVQ ·p2+8(SB), R13
	MOVQ ·p2+16(SB), R14
	MOVQ ·p2+24(SB), R15
	MOVQ $0, AX

	// R8..R11 = a - b; the carry flag is set if the subtraction borrowed.
	SUBQ  0(SI), R8
	SBBQ  8(SI), R9
	SBBQ 16(SI), R10
	SBBQ 24(SI), R11

	// No borrow: replace the correction term with zero.
	CMOVQCC AX, R12
	CMOVQCC AX, R13
	CMOVQCC AX, R14
	CMOVQCC AX, R15

	// Add p (after a borrow) or 0 (otherwise). The carry out of this
	// addition is discarded.
	ADDQ R12, R8
	ADCQ R13, R9
	ADCQ R14, R10
	ADCQ R15, R11

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET
|
||
|
|
||
|
// gfpMul multiplies a by b and reduces the product, storing four words
// to c. Two code paths exist, chosen at run time from the byte ·hasBMI2:
// one built from the mulBMI2/gfpReduceBMI2 macros and one built from the
// mul/gfpReduce macros.
//
// Arguments (24 bytes):
//   c+0(FP)   pointer to the four-word result
//   a+8(FP)   pointer to the first four-word operand
//   b+16(FP)  pointer to the second four-word operand
// Frame: 160 bytes of locals; 0(SP) onwards is used as scratch for the
// unreduced product.
//
// NOTE(review): mul, mulBMI2, gfpReduce and gfpReduceBMI2 are defined in
// the included headers (mul_amd64.h, mul_bmi2_amd64.h), not here. Both
// paths are expected to leave the reduced result in R12..R15, since that
// is what gets stored at the end; confirm against the headers, along with
// the registers and stack slots those macros clobber.
TEXT ·gfpMul(SB),0,$160-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	// Jump to a slightly different implementation if MULX isn't supported.
	CMPB ·hasBMI2(SB), $0
	JE   nobmi2Mul

	// BMI2 path: spill R8..R15 to 0(SP)..56(SP) after the multiply, then
	// reduce. Presumably R8..R15 hold the eight-word product at this
	// point; verify in mul_bmi2_amd64.h.
	mulBMI2(0(DI),8(DI),16(DI),24(DI), 0(SI))
	storeBlock( R8, R9,R10,R11,  0(SP))
	storeBlock(R12,R13,R14,R15, 32(SP))
	gfpReduceBMI2()
	JMP end

nobmi2Mul:
	// Fallback path: the multiply macro is handed 0(SP) as its output
	// location, and the reduction reads from the same place.
	mul(0(DI),8(DI),16(DI),24(DI), 0(SI), 0(SP))
	gfpReduce(0(SP))

end:
	MOVQ c+0(FP), DI
	storeBlock(R12,R13,R14,R15, 0(DI))
	RET
|