From 48589f08760c6906e10c4399857851b742826fa1 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Mon, 4 Mar 2024 17:50:35 +0800 Subject: [PATCH] internal/sm2ec: amd64, optimize select SIMD --- internal/sm2ec/p256_asm_amd64.s | 14 ++++---------- internal/sm2ec/p256_common_amd64.s | 19 +++++++------------ internal/sm2ec/p256_plugin_amd64.s | 14 ++++---------- 3 files changed, 15 insertions(+), 32 deletions(-) diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s index 315e35e..fafcb53 100644 --- a/internal/sm2ec/p256_asm_amd64.s +++ b/internal/sm2ec/p256_asm_amd64.s @@ -483,8 +483,6 @@ internalSqrBMI2: ST (yout) \ \// Load stored values from stack MOVQ rptr, AX \ - MOVL sel_save, BX \ - MOVL zero_save, CX \ // func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int) TEXT ·p256PointAddAffineAsm(SB),0,$512-48 @@ -528,8 +526,8 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48 MOVOU zout(16*0), X4 MOVOU zout(16*1), X5 - MOVL BX, X6 // sel - MOVL CX, X7 // zero + MOVL sel_save, X6 // sel + MOVL zero_save, X7 // zero PXOR X8, X8 // X8's bits are all 0 PCMPEQL X9, X9 // X9's bits are all 1 @@ -626,13 +624,9 @@ pointaddaffine_avx2: p256PointAddAffineInline() // The result is not valid if (sel == 0), conditional choose - MOVL BX, X6 // sel - MOVL CX, X7 // zero - VPXOR Y8, Y8, Y8 // Y8's bits are all 0 - - VPBROADCASTD X6, Y6 - VPBROADCASTD X7, Y7 + VPBROADCASTD sel_save, Y6 // sel + VPBROADCASTD zero_save, Y7 // zero VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0 VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0 diff --git a/internal/sm2ec/p256_common_amd64.s b/internal/sm2ec/p256_common_amd64.s index 513c0e6..7eb13c8 100644 --- a/internal/sm2ec/p256_common_amd64.s +++ b/internal/sm2ec/p256_common_amd64.s @@ -705,9 +705,8 @@ loop_select: select_avx2: VPXOR Y15, Y15, Y15 VPCMPEQD Y14, Y14, Y14 - VPSUBD Y14, Y15, Y15 - MOVL idx+16(FP), X14 // x14 = idx - VPBROADCASTD X14, Y14 + VPSUBD Y14, Y15, Y15 // Y15 = 1 + VPBROADCASTD idx+16(FP), Y14 MOVQ limit+24(FP),AX VMOVDQU Y15, Y13 @@ -717,9 +716,8 @@ select_avx2: VPXOR Y2, Y2, Y2 loop_select_avx2: - VMOVDQU Y13, Y12 + VPCMPEQD Y14, Y13, Y12 VPADDD Y15, Y13, Y13 - VPCMPEQD Y14, Y12, Y12 VPAND (32*0)(DI), Y12, Y3 VPAND (32*1)(DI), Y12, Y4 @@ -753,7 +751,7 @@ TEXT ·p256SelectAffine(SB),NOSPLIT,$0 PXOR X15, X15 // X15 = 0 PCMPEQL X14, X14 // X14 = -1 PSUBL X14, X15 // X15 = 1 - MOVL AX, X14 // x14 = idx + MOVL idx+16(FP), X14 // x14 = idx PSHUFD $0, X14, X14 MOVQ $16, AX @@ -820,8 +818,7 @@ select_base_avx2: VPXOR Y15, Y15, Y15 VPCMPEQD Y14, Y14, Y14 VPSUBD Y14, Y15, Y15 - MOVL AX, X14 // x14 = idx - VPBROADCASTD X14, Y14 + VPBROADCASTD idx+16(FP), Y14 MOVQ $16, AX VMOVDQU Y15, Y13 @@ -829,16 +826,14 @@ select_base_avx2: VPXOR Y1, Y1, Y1 loop_select_base_avx2: - VMOVDQU Y13, Y12 + VPCMPEQD Y14, Y13, Y12 VPADDD Y15, Y13, Y13 - VPCMPEQD Y14, Y12, Y12 VPAND (32*0)(DI), Y12, Y2 VPAND (32*1)(DI), Y12, Y3 - VMOVDQU Y13, Y12 + VPCMPEQD Y14, Y13, Y12 VPADDD Y15, Y13, Y13 - VPCMPEQD Y14, Y12, Y12 VPAND (32*2)(DI), Y12, Y4 VPAND (32*3)(DI), Y12, Y5 diff --git a/internal/sm2ec/p256_plugin_amd64.s b/internal/sm2ec/p256_plugin_amd64.s index 336ec26..00cd6f0 100644 --- a/internal/sm2ec/p256_plugin_amd64.s +++ b/internal/sm2ec/p256_plugin_amd64.s @@ -500,8 +500,6 @@ internalSqrBMI2: ST (yout) \ \// Load stored values from stack MOVQ rptr, AX \ - MOVL sel_save, BX \ - MOVL zero_save, CX \ // func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int) TEXT ·p256PointAddAffineAsm(SB),0,$512-48 @@ -545,8 +543,8 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48 MOVOU zout(16*0), X4 MOVOU zout(16*1), X5 - MOVL BX, X6 // sel - MOVL CX, X7 // zero + MOVL sel_save, X6 // sel + MOVL zero_save, X7 // zero PXOR X8, X8 // X8's bits are all 0 PCMPEQL X9, X9 // X9's bits are all 1 @@ -643,13 +641,9 @@ pointaddaffine_avx2: p256PointAddAffineInline() // The result is not valid if (sel == 0), conditional choose - MOVL BX, X6 // sel - MOVL CX, X7 // zero - VPXOR Y8, Y8, Y8 // Y8's bits are all 0 - - VPBROADCASTD X6, Y6 - VPBROADCASTD X7, Y7 + VPBROADCASTD sel_save, Y6 // sel + VPBROADCASTD zero_save, Y7 // zero VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0 VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0