internal/sm2ec: s390x uses VLM/VSTM batch 3

This commit is contained in:
Sun Yimin 2024-08-27 08:25:47 +08:00 committed by GitHub
parent 260c84eeb4
commit 189dcd4ca0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -875,8 +875,8 @@ TEXT sm2p256OrdSqrInternal<>(SB), NOFRAME|NOSPLIT, $0
#define X1 V1 #define X1 V1
#define Y0 V2 #define Y0 V2
#define Y1 V3 #define Y1 V3
#define M0 V4 #define M0 V5
#define M1 V5 #define M1 V4
#define T0 V6 #define T0 V6
#define T1 V7 #define T1 V7
#define K0 V31 #define K0 V31
@ -892,24 +892,19 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0
//BYTE $0x38 //BYTE $0x38
//BYTE $0x03 //BYTE $0x03
MOVD $p256ord<>+0x00(SB), R4 MOVD $p256ord<>+0x00(SB), R4
VL 16(R4), M0 VLM (R4), M1, M0
VL 0(R4), M1
VL (0*16)(x_ptr), X0 VLM (x_ptr), X0, X1
VPDI $0x4, X0, X0, X0 VPDI $0x4, X0, X0, X0
VL (1*16)(x_ptr), X1
VPDI $0x4, X1, X1, X1 VPDI $0x4, X1, X1, X1
VL (0*16)(y_ptr), Y0 VLM (y_ptr), Y0, Y1
VPDI $0x4, Y0, Y0, Y0 VPDI $0x4, Y0, Y0, Y0
VL (1*16)(y_ptr), Y1
VPDI $0x4, Y1, Y1, Y1 VPDI $0x4, Y1, Y1, Y1
CALL sm2p256OrdMulInternal<>(SB) CALL sm2p256OrdMulInternal<>(SB)
VPDI $0x4, T0, T0, T0 VPDI $0x4, T0, T0, T0
VST T0, (0*16)(res_ptr)
VPDI $0x4, T1, T1, T1 VPDI $0x4, T1, T1, T1
VST T1, (1*16)(res_ptr) VSTM T0, T1, (res_ptr)
RET RET
@ -934,8 +929,8 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0
#define N R6 #define N R6
#define X0 V0 #define X0 V0
#define X1 V1 #define X1 V1
#define M0 V4 #define M0 V5
#define M1 V5 #define M1 V4
#define T0 V6 #define T0 V6
#define T1 V7 #define T1 V7
#define K0 V31 #define K0 V31
@ -953,12 +948,10 @@ TEXT ·p256OrdSqr(SB), NOSPLIT, $0
//BYTE $0x38 //BYTE $0x38
//BYTE $0x03 //BYTE $0x03
MOVD $p256ord<>+0x00(SB), R4 MOVD $p256ord<>+0x00(SB), R4
VL 16(R4), M0 VLM (R4), M1, M0
VL 0(R4), M1
VL (0*16)(x_ptr), X0 VLM (x_ptr), X0, X1
VPDI $0x4, X0, X0, X0 VPDI $0x4, X0, X0, X0
VL (1*16)(x_ptr), X1
VPDI $0x4, X1, X1, X1 VPDI $0x4, X1, X1, X1
loop: loop:
@ -970,9 +963,8 @@ loop:
BLT loop BLT loop
VPDI $0x4, T0, T0, T0 VPDI $0x4, T0, T0, T0
VST T0, (0*16)(res_ptr)
VPDI $0x4, T1, T1, T1 VPDI $0x4, T1, T1, T1
VST T1, (1*16)(res_ptr) VSTM T0, T1, (res_ptr)
RET RET