optimize sm2 p256 amd64 implementation

This commit is contained in:
Emman 2021-12-24 13:13:11 +08:00
parent 3a701fe2d8
commit 4ff0c4547f
2 changed files with 386 additions and 444 deletions

View File

@ -246,94 +246,66 @@ sqrLoop:
ADCQ DX, t1
MOVQ t1, x_ptr
// First reduction step
MOVQ p256p<>+0x08(SB), AX
MULQ acc0
MOVQ acc0, AX
MOVQ acc0, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc0, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc0
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
SBBQ DX, acc0
// Second reduction step
MOVQ p256p<>+0x08(SB), AX
MULQ acc1
MOVQ acc1, AX
MOVQ acc1, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc1
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc1
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, acc1
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc0
SBBQ DX, acc1
// Third reduction step
MOVQ p256p<>+0x08(SB), AX
MULQ acc2
MOVQ acc2, AX
MOVQ acc2, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc2, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc2
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc2
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, acc2
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ AX, acc3
SBBQ DX, acc0
SBBQ AX, acc1
SBBQ DX, acc2
// Last reduction step
XORQ t0, t0
MOVQ p256p<>+0x08(SB), AX
MULQ acc3
MOVQ acc3, AX
MOVQ acc3, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc3, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc3
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc3
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ AX, acc0
SBBQ DX, acc1
SBBQ AX, acc2
SBBQ DX, acc3
// Add bits [511:256] of the sqr result
ADCQ acc4, acc0
@ -400,27 +372,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
MOVQ DX, acc4
XORQ acc5, acc5
// First reduction step
MOVQ p256p<>+0x08(SB), AX
MULQ acc0
MOVQ acc0, AX
MOVQ acc0, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc0, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ acc0, acc4
ADCQ $0, acc5
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
SBBQ DX, acc4
SBBQ $0, acc5
XORQ acc0, acc0
// x * y[1]
@ -456,27 +423,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ DX, acc5
ADCQ $0, acc0
// Second reduction step
MOVQ p256p<>+0x08(SB), AX
MULQ acc1
MOVQ acc1, AX
MOVQ acc1, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc1
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc1
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc3
ADCQ $0, acc4
ADCQ acc1, acc5
ADCQ $0, acc0
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
SBBQ DX, acc5
SBBQ $0, acc0
XORQ acc1, acc1
// x * y[2]
@ -512,27 +474,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ DX, acc0
ADCQ $0, acc1
// Third reduction step
MOVQ p256p<>+0x08(SB), AX
MULQ acc2
MOVQ acc2, AX
MOVQ acc2, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc2, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc2
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc2
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc4
ADCQ $0, acc5
ADCQ acc2, acc0
ADCQ $0, acc1
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
SBBQ DX, acc0
SBBQ $0, acc1
XORQ acc2, acc2
// x * y[3]
MOVQ (8*3)(y_ptr), t0
@ -567,27 +524,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ DX, acc1
ADCQ $0, acc2
// Last reduction step
MOVQ p256p<>+0x08(SB), AX
MULQ acc3
MOVQ acc3, AX
MOVQ acc3, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc3, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc3
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc3
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc5
ADCQ $0, acc0
ADCQ acc3, acc1
ADCQ $0, acc2
SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
// Copy result [255:0]
MOVQ acc4, x_ptr
MOVQ acc5, acc3
@ -625,93 +577,69 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
// Only reduce, no multiplications are needed
// First stage
MOVQ p256p<>+0x08(SB), AX
MULQ acc0
MOVQ acc0, AX
MOVQ acc0, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc0, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ acc0, acc4
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
SBBQ DX, acc4
XORQ acc5, acc5
// Second stage
MOVQ p256p<>+0x08(SB), AX
MULQ acc1
MOVQ acc1, AX
MOVQ acc1, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc1
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc1
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc3
ADCQ $0, acc4
ADCQ acc1, acc5
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
SBBQ DX, acc5
XORQ acc0, acc0
// Third stage
MOVQ p256p<>+0x08(SB), AX
MULQ acc2
MOVQ acc2, AX
MOVQ acc2, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc2, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc2
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc2
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc4
ADCQ $0, acc5
ADCQ acc2, acc0
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
SBBQ DX, acc0
XORQ acc1, acc1
// Last stage
MOVQ p256p<>+0x08(SB), AX
MULQ acc3
MOVQ acc3, AX
MOVQ acc3, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc3, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc3
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc3
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc5
ADCQ $0, acc0
ADCQ acc3, acc1
SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
SBBQ DX, acc1
MOVQ acc4, x_ptr
MOVQ acc5, acc3
@ -1249,38 +1177,33 @@ ordSqrLoop:
// First reduction step
MOVQ acc0, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
ADCQ $0, DX // DX = carry1 + H(t0 * ord0)
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
MOVQ t0, acc0
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
ADCQ $0, DX // DX = carry2 + H(t0*ord1)
MOVQ t0, t1
ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
ADCQ DX, acc2
ADCQ $0, t1
SUBQ t0, acc2
SBBQ $0, t1
ADCQ $0, acc3
ADCQ $0, acc0
MOVQ t0, AX
MOVQ t0, DX
MOVQ t0, acc0
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc3
ADCQ $0, acc0
SUBQ AX, acc3
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc0
SUBQ t0, acc3
SBBQ $0, acc0
// Second reduction step
MOVQ acc1, AX
MULQ p256ordK0<>(SB)
@ -1291,31 +1214,26 @@ ordSqrLoop:
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ t0, acc1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
MOVQ t0, t1
ADDQ AX, acc2
ADCQ DX, acc3
ADCQ $0, t1
SUBQ t0, acc3
SBBQ $0, t1
ADCQ $0, acc0
ADCQ $0, acc1
MOVQ t0, AX
MOVQ t0, DX
MOVQ t0, acc1
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc0
ADCQ $0, acc1
SUBQ AX, acc0
SUBQ t0, acc3
SBBQ AX, acc0
SBBQ DX, acc1
SUBQ t0, acc0
SBBQ $0, acc1
// Third reduction step
MOVQ acc2, AX
MULQ p256ordK0<>(SB)
@ -1326,31 +1244,26 @@ ordSqrLoop:
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ t0, acc2
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
MOVQ t0, t1
ADDQ AX, acc3
ADCQ DX, acc0
ADCQ $0, t1
SUBQ t0, acc0
SBBQ $0, t1
ADCQ $0, acc1
ADCQ $0, acc2
MOVQ t0, AX
MOVQ t0, DX
MOVQ t0, acc2
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc1
ADCQ $0, acc2
SUBQ AX, acc1
SUBQ t0, acc0
SBBQ AX, acc1
SBBQ DX, acc2
SUBQ t0, acc1
SBBQ $0, acc2
// Last reduction step
MOVQ acc3, AX
MULQ p256ordK0<>(SB)
@ -1361,33 +1274,27 @@ ordSqrLoop:
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ t0, acc3
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
MOVQ t0, t1
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, t1
SUBQ t0, acc1
SBBQ $0, t1
ADCQ $0, acc2
ADCQ $0, acc3
MOVQ t0, AX
MOVQ t0, DX
MOVQ t0, acc3
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc2
ADCQ $0, acc3
SUBQ AX, acc2
SUBQ t0, acc1
SBBQ AX, acc2
SBBQ DX, acc3
SUBQ t0, acc2
SBBQ $0, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
ADCQ acc4, acc0
@ -1591,93 +1498,65 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$0
ADCQ $0, mul1
MOVQ mul1, acc7
// First reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc0
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc0
ADDQ hlp, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc0
ADDQ hlp, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc0
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc1
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc1
ADDQ hlp, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc1
ADDQ hlp, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, acc1
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc2
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc2
ADDQ hlp, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc2
ADDQ hlp, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc2
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc3
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc3
ADDQ hlp, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc3
ADDQ hlp, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
MOVQ $0, BP
// Add bits [511:256] of the result
ADCQ acc0, acc4
@ -1777,93 +1656,65 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$0
ADCQ mul0, t2
ADCQ DX, t3
// First reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc0
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc0
ADDQ hlp, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc0
ADDQ hlp, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc0
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc1
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc1
ADDQ hlp, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc1
ADDQ hlp, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, acc1
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc2
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc2
ADDQ hlp, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc2
ADDQ hlp, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc2
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc3
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc3
ADDQ hlp, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc3
ADDQ hlp, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
MOVQ $0, BP
// Add bits [511:256] of the result
ADCQ acc0, t0

View File

@ -7,8 +7,10 @@ import (
"crypto/rand"
"encoding/hex"
"fmt"
"io"
"math/big"
"testing"
"time"
)
func toBigInt(in []uint64) *big.Int {
@ -115,6 +117,95 @@ func Test_p256Mul(t *testing.T) {
}
}
// p256SqrTest checks a single p256Sqr call against a math/big reference.
//
// x is first mapped to x*r mod p (r is 2^256 in the callers visible in this
// file, i.e. presumably the Montgomery radix), passed through p256Sqr and
// p256FromMont, and the result is compared with x*x mod p. On a mismatch the
// test is aborted with the operand, the value obtained and the value expected,
// so a randomly generated failing input can be reproduced.
func p256SqrTest(t *testing.T, x, p, r *big.Int) {
	t.Helper()

	// Operand in the x*r mod p representation expected by p256Sqr.
	xMont := new(big.Int).Mul(x, r)
	xMont.Mod(xMont, p)

	in := make([]uint64, 4)
	sqrMont := make([]uint64, 4)
	out := make([]uint64, 4)
	fromBig(in, xMont)

	// NOTE(review): the third argument is presumably the number of
	// successive squarings; confirm against the p256Sqr declaration.
	p256Sqr(sqrMont, in, 1)
	p256FromMont(out, sqrMont)
	got := toBigInt(out)

	// Reference value computed with arbitrary-precision arithmetic.
	want := new(big.Int).Mul(x, x)
	want.Mod(want, p)

	if got.Cmp(want) != 0 {
		t.Fatalf("p256Sqr mismatch for x=%x: got %x, want %x", x, got, want)
	}
}
// TestFuzzyP256Sqr feeds random 256-bit values to p256SqrTest until a time
// budget expires: 10ms when -short is set, 2s otherwise.
func TestFuzzyP256Sqr(t *testing.T) {
	p, ok := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
	if !ok {
		t.Fatal("invalid hex constant for p")
	}
	r, ok := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
	if !ok {
		t.Fatal("invalid hex constant for r")
	}

	budget := 2 * time.Second
	if testing.Short() {
		budget = 10 * time.Millisecond
	}
	timeout := time.NewTimer(budget)
	defer timeout.Stop()

	var scalar1 [32]byte
	for {
		// Non-blocking check: stop as soon as the budget is used up.
		select {
		case <-timeout.C:
			return
		default:
		}

		// A failed read would leave stale bytes in scalar1 and silently
		// re-test the previous input, so treat it as a test failure.
		if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
			t.Fatalf("reading random operand: %v", err)
		}
		x := new(big.Int).SetBytes(scalar1[:])
		p256SqrTest(t, x, p, r)
	}
}
// p256MulTest checks a single p256Mul call against a math/big reference.
//
// x and y are first mapped to x*r mod p and y*r mod p (r is 2^256 in the
// callers visible in this file, i.e. presumably the Montgomery radix), passed
// through p256Mul and p256FromMont, and the result is compared with
// x*y mod p. On a mismatch the test is aborted with both operands, the value
// obtained and the value expected, so a randomly generated failing input can
// be reproduced.
func p256MulTest(t *testing.T, x, y, p, r *big.Int) {
	t.Helper()

	// Operands in the v*r mod p representation expected by p256Mul.
	xMont := new(big.Int).Mul(x, r)
	xMont.Mod(xMont, p)
	yMont := new(big.Int).Mul(y, r)
	yMont.Mod(yMont, p)

	inX := make([]uint64, 4)
	inY := make([]uint64, 4)
	prodMont := make([]uint64, 4)
	out := make([]uint64, 4)
	fromBig(inX, xMont)
	fromBig(inY, yMont)

	p256Mul(prodMont, inX, inY)
	p256FromMont(out, prodMont)
	got := toBigInt(out)

	// Reference value computed with arbitrary-precision arithmetic.
	want := new(big.Int).Mul(x, y)
	want.Mod(want, p)

	if got.Cmp(want) != 0 {
		t.Fatalf("p256Mul mismatch for x=%x y=%x: got %x, want %x", x, y, got, want)
	}
}
// TestFuzzyP256Mul feeds pairs of random 256-bit values to p256MulTest until
// a time budget expires: 10ms when -short is set, 2s otherwise.
func TestFuzzyP256Mul(t *testing.T) {
	p, ok := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
	if !ok {
		t.Fatal("invalid hex constant for p")
	}
	r, ok := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
	if !ok {
		t.Fatal("invalid hex constant for r")
	}

	budget := 2 * time.Second
	if testing.Short() {
		budget = 10 * time.Millisecond
	}
	timeout := time.NewTimer(budget)
	defer timeout.Stop()

	var scalar1 [32]byte
	var scalar2 [32]byte
	for {
		// Non-blocking check: stop as soon as the budget is used up.
		select {
		case <-timeout.C:
			return
		default:
		}

		// A failed read would leave stale bytes in the buffers and silently
		// re-test a previous input, so treat it as a test failure.
		if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
			t.Fatalf("reading first random operand: %v", err)
		}
		if _, err := io.ReadFull(rand.Reader, scalar2[:]); err != nil {
			t.Fatalf("reading second random operand: %v", err)
		}
		x := new(big.Int).SetBytes(scalar1[:])
		y := new(big.Int).SetBytes(scalar2[:])
		p256MulTest(t, x, y, p, r)
	}
}
func Test_p256MulSqr(t *testing.T) {
r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)