From 371ea2f4b38f392e858abe03ef37a7e627e1124a Mon Sep 17 00:00:00 2001
From: emmansun
Date: Fri, 12 Feb 2021 14:54:21 +0800
Subject: [PATCH] MAGIC - optimize p256Mul p256FromMont

---
 sm2/p256_asm_amd64.s | 290 ++++++++++++++++++++-----------------------
 1 file changed, 137 insertions(+), 153 deletions(-)

diff --git a/sm2/p256_asm_amd64.s b/sm2/p256_asm_amd64.s
index 8ff9bf7..5d1cb06 100644
--- a/sm2/p256_asm_amd64.s
+++ b/sm2/p256_asm_amd64.s
@@ -388,26 +388,24 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	MOVQ DX, acc4
 	XORQ acc5, acc5
 	// First reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc0
+	MOVQ acc0, AX
+	MOVQ acc0, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
 	ADDQ acc0, acc1
-	ADCQ $0, DX
-	ADDQ AX, acc1
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc0
-	ADDQ t1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc0
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ DX, acc4
+	ADCQ $0, acc2
+	SUBQ AX, acc1
+	SBBQ DX, acc2
+
+	MOVQ acc0, t1
+	ADDQ acc0, acc3
+	ADCQ $0, t1
+	SUBQ AX, acc3
+	SBBQ DX, t1
+	SUBQ acc0, acc3
+	SBBQ $0, t1
+	ADDQ t1, acc4
 	ADCQ $0, acc5
 	XORQ acc0, acc0
 	// x * y[1]
@@ -444,26 +442,24 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	ADCQ DX, acc5
 	ADCQ $0, acc0
 	// Second reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc1
+	MOVQ acc1, AX
+	MOVQ acc1, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
 	ADDQ acc1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc1
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc1
-	ADDQ t1, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ DX, acc5
+	ADCQ $0, acc3
+	SUBQ AX, acc2
+	SBBQ DX, acc3
+
+	MOVQ acc1, t1
+	ADDQ acc1, acc4
+	ADCQ $0, t1
+	SUBQ AX, acc4
+	SBBQ DX, t1
+	SUBQ acc1, acc4
+	SBBQ $0, t1
+	ADDQ t1, acc5
 	ADCQ $0, acc0
 	XORQ acc1, acc1
 	// x * y[2]
@@ -500,26 +496,24 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	ADCQ DX, acc0
 	ADCQ $0, acc1
 	// Third reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc2
+	MOVQ acc2, AX
+	MOVQ acc2, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
 	ADDQ acc2, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc2
-	ADDQ t1, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc2
-	ADDQ t1, acc5
-	ADCQ $0, DX
-	ADDQ AX, acc5
-	ADCQ DX, acc0
+	ADCQ $0, acc4
+	SUBQ AX, acc3
+	SBBQ DX, acc4
+
+	MOVQ acc2, t1
+	ADDQ acc2, acc5
+	ADCQ $0, t1
+	SUBQ AX, acc5
+	SBBQ DX, t1
+	SUBQ acc2, acc5
+	SBBQ $0, t1
+	ADDQ t1, acc0
 	ADCQ $0, acc1
 	XORQ acc2, acc2
 	// x * y[3]
@@ -555,26 +549,24 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	ADCQ DX, acc1
 	ADCQ $0, acc2
 	// Last reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc3
+	MOVQ acc3, AX
+	MOVQ acc3, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
 	ADDQ acc3, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc3
-	ADDQ t1, acc5
-	ADCQ $0, DX
-	ADDQ AX, acc5
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc3
-	ADDQ t1, acc0
-	ADCQ $0, DX
-	ADDQ AX, acc0
-	ADCQ DX, acc1
+	ADCQ $0, acc5
+	SUBQ AX, acc4
+	SBBQ DX, acc5
+
+	MOVQ acc3, t1
+	ADDQ acc3, acc0
+	ADCQ $0, t1
+	SUBQ AX, acc0
+	SBBQ DX, t1
+	SUBQ acc3, acc0
+	SBBQ $0, t1
+	ADDQ t1, acc1
 	ADCQ $0, acc2
 	// Copy result [255:0]
 	MOVQ acc4, x_ptr
@@ -613,94 +605,86 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
 	// Only reduce, no multiplications are needed
 	// First stage
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc0
+	MOVQ acc0, AX
+	MOVQ acc0, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
 	ADDQ acc0, acc1
-	ADCQ $0, DX
-	ADDQ AX, acc1
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc0
-	ADDQ t1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc0
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ DX, acc4
+	ADCQ $0, acc2
+	SUBQ AX, acc1
+	SBBQ DX, acc2
+
+	MOVQ acc0, t1
+	ADDQ acc0, acc3
+	ADCQ $0, t1
+	SUBQ AX, acc3
+	SBBQ DX, t1
+	SUBQ acc0, acc3
+	SBBQ $0, t1
+	ADDQ t1, acc4
 	XORQ acc5, acc5
 
 	// Second stage
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc1
+	MOVQ acc1, AX
+	MOVQ acc1, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
 	ADDQ acc1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc1
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc1
-	ADDQ t1, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ DX, acc5
+	ADCQ $0, acc3
+	SUBQ AX, acc2
+	SBBQ DX, acc3
+
+	MOVQ acc1, t1
+	ADDQ acc1, acc4
+	ADCQ $0, t1
+	SUBQ AX, acc4
+	SBBQ DX, t1
+	SUBQ acc1, acc4
+	SBBQ $0, t1
+	ADDQ t1, acc5
 	XORQ acc0, acc0
 
 	// Third stage
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc2
+	MOVQ acc2, AX
+	MOVQ acc2, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
 	ADDQ acc2, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc2
-	ADDQ t1, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc2
-	ADDQ t1, acc5
-	ADCQ $0, DX
-	ADDQ AX, acc5
-	ADCQ DX, acc0
+	ADCQ $0, acc4
+	SUBQ AX, acc3
+	SBBQ DX, acc4
+
+	MOVQ acc2, t1
+	ADDQ acc2, acc5
+	ADCQ $0, t1
+	SUBQ AX, acc5
+	SBBQ DX, t1
+	SUBQ acc2, acc5
+	SBBQ $0, t1
+	ADDQ t1, acc0
 	XORQ acc1, acc1
 
 	// Last stage
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc3
-	ADDQ acc3, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc3
-	ADDQ t1, acc5
-	ADCQ $0, DX
-	ADDQ AX, acc5
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc3
-	ADDQ t1, acc0
-	ADCQ $0, DX
-	ADDQ AX, acc0
-	ADCQ DX, acc1
+	MOVQ acc3, AX
+	MOVQ acc3, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+	ADDQ acc3, acc4
+	ADCQ $0, acc5
+	SUBQ AX, acc4
+	SBBQ DX, acc5
+
+	MOVQ acc3, t1
+	ADDQ acc3, acc0
+	ADCQ $0, t1
+	SUBQ AX, acc0
+	SBBQ DX, t1
+	SUBQ acc3, acc0
+	SBBQ $0, t1
+	ADDQ t1, acc1
+
 	MOVQ acc4, x_ptr
 	MOVQ acc5, acc3
 	MOVQ acc0, t0
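
Note on why the shifts work: the SM2 prime is p = 2^256 - 2^224 - 2^96 + 2^64 - 1, so its lowest limb is 2^64 - 1 and -p^-1 mod 2^64 = 1. Each Montgomery reduction step can therefore use the low accumulator limb itself as the multiplier m, and m*p is just a signed combination of shifted copies of m: the SHLQ/SHRQ pair builds m<<32 and m>>32 once, and add/subtract chains replace the three MULQs (and the p256p<> memory loads) of the removed code. The Go program below is a minimal reference model of this reduction as p256FromMont uses it (a 512-bit input whose high four limbs are zero), not a transcription of the hand-scheduled assembly; the names montReduce and limbsToBig and the self-check in main are illustrative assumptions.

package main

import (
	"fmt"
	"math/big"
	"math/bits"
)

// p is the SM2 prime 2^256 - 2^224 - 2^96 + 2^64 - 1 in little-endian
// 64-bit limbs, the same constants the removed code loaded from p256p<>.
var p = [4]uint64{
	0xffffffffffffffff,
	0xffffffff00000000,
	0xffffffffffffffff,
	0xfffffffeffffffff,
}

// montReduce returns t * 2^-256 mod p for a little-endian 512-bit t whose
// high four limbs are zero (the p256FromMont case). Since p[0] = 2^64 - 1,
// -p^-1 mod 2^64 = 1, so each step's Montgomery multiplier is simply the
// current low limb m = t[i].
func montReduce(t [8]uint64) [4]uint64 {
	for i := 0; i < 4; i++ {
		m := t[i]

		// mp = m * p as a five-limb value (schoolbook, fully carried).
		var mp [5]uint64
		var carry uint64
		for j := 0; j < 4; j++ {
			hi, lo := bits.Mul64(m, p[j])
			var c uint64
			mp[j], c = bits.Add64(lo, carry, 0)
			carry = hi + c // hi <= 2^64-2, so hi+c cannot overflow
		}
		mp[4] = carry

		// t += mp << (64*i); this zeroes t[i] by construction, and the
		// final carry out of t[7] is provably zero for inputs < 2^256.
		carry = 0
		for j := 0; j < 5; j++ {
			t[i+j], carry = bits.Add64(t[i+j], mp[j], carry)
		}
		for j := i + 5; j < 8; j++ {
			t[j], carry = bits.Add64(t[j], 0, carry)
		}
	}

	// t[4..7] is now at most p; one conditional subtraction normalizes it.
	var r, s [4]uint64
	copy(r[:], t[4:])
	var borrow uint64
	for j := 0; j < 4; j++ {
		s[j], borrow = bits.Sub64(r[j], p[j], borrow)
	}
	if borrow == 0 { // r >= p
		r = s
	}
	return r
}

// limbsToBig converts little-endian limbs to a big.Int for cross-checking.
func limbsToBig(l []uint64) *big.Int {
	x := new(big.Int)
	for i := len(l) - 1; i >= 0; i-- {
		x.Lsh(x, 64)
		x.Or(x, new(big.Int).SetUint64(l[i]))
	}
	return x
}

func main() {
	// Reduce the plain value 2 and compare against 2 * 2^-256 mod p
	// computed with math/big.
	var t [8]uint64
	t[0] = 2
	r := montReduce(t)

	P := limbsToBig(p[:])
	rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), P)
	want := new(big.Int).Mod(new(big.Int).Mul(big.NewInt(2), rInv), P)

	fmt.Println("matches big.Int model:", limbsToBig(r[:]).Cmp(want) == 0)
}

The assembly reaches the same sums with fewer operations by folding the bookkeeping into the flags chain: working through one step shows that t1 leaves each ADDQ/ADCQ/SUBQ/SBBQ sequence holding m - (m>>32) - borrow, which is exactly the word the model above carries into the fifth limb before the final ADDQ t1, accN.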