From 761746de78c74f443dd642a0d57cfe0fca1110e1 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Thu, 19 Sep 2024 17:52:12 +0800 Subject: [PATCH] sm4: amd64, a bit change for single block --- sm4/aesni_macros_amd64.s | 9 ++------- sm4/asm_amd64.s | 26 +++++++++++++++----------- sm4/cipher_asm_test.go | 17 +++++++++++++++-- sm4/gcm_amd64.s | 19 ++++++++++++------- 4 files changed, 44 insertions(+), 27 deletions(-) diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s index 19bbce4..4fd1b40 100644 --- a/sm4/aesni_macros_amd64.s +++ b/sm4/aesni_macros_amd64.s @@ -158,20 +158,15 @@ GLOBL fk_mask<>(SB), 8, $16 // SM4 single round function, handle 16 bytes data // t0 ^= tao_l1(t1^t2^t3^xk) -// used R19 as temp 32/64 bits register // parameters: -// - index: round key index immediate number -// - RK: round key register -// - IND: round key index base register -// - x: 128 bits temp register +// - x: 128 bits temp register (on entry, holds the round key loaded by the caller) // - y: 128 bits temp register // - z: 128 bits temp register // - t0: 128 bits register for data as result // - t1: 128 bits register for data // - t2: 128 bits register for data // - t3: 128 bits register for data -#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ - MOVL (index * 4)(RK)(IND*1), x; \ +#define SM4_SINGLE_ROUND(x, y, z, t0, t1, t2, t3) \ PXOR t1, x; \ PXOR t2, x; \ PXOR t3, x; \ diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s index ea4f9f9..974315c 100644 --- a/sm4/asm_amd64.s +++ b/sm4/asm_amd64.s @@ -327,7 +327,7 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 MOVQ dst+8(FP), BX MOVQ src+16(FP), DX - MOVOU (DX), t0 + MOVUPS (DX), t0 PSHUFB flip_mask<>(SB), t0 PSHUFD $1, t0, t1 PSHUFD $2, t0, t2 @@ -336,21 +336,25 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 XORL CX, CX loop: - SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3) - SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0) - SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1) - SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2) 
+ MOVUPS (AX)(CX*1), XTMP7 + MOVOU XTMP7, x + SM4_SINGLE_ROUND(x, y, XTMP6, t0, t1, t2, t3) + PSHUFD $1, XTMP7, x + SM4_SINGLE_ROUND(x, y, XTMP6, t1, t2, t3, t0) + PSHUFD $2, XTMP7, x + SM4_SINGLE_ROUND(x, y, XTMP6, t2, t3, t0, t1) + PSHUFD $3, XTMP7, x + SM4_SINGLE_ROUND(x, y, XTMP6, t3, t0, t1, t2) ADDL $16, CX CMPL CX, $4*32 JB loop - PALIGNR $4, t3, t3 - PALIGNR $4, t3, t2 - PALIGNR $4, t2, t1 - PALIGNR $4, t1, t0 - PSHUFB flip_mask<>(SB), t0 - MOVOU t0, (BX) + PUNPCKLLQ t2, t3 + PUNPCKLLQ t0, t1 + PUNPCKLQDQ t1, t3 + PSHUFB flip_mask<>(SB), t3 + MOVUPS t3, (BX) done_sm4: RET diff --git a/sm4/cipher_asm_test.go b/sm4/cipher_asm_test.go index 75e3052..329a289 100644 --- a/sm4/cipher_asm_test.go +++ b/sm4/cipher_asm_test.go @@ -131,10 +131,23 @@ func TestEncryptBlocksDoubleWithAESNI(t *testing.T) { } } -func BenchmarkExpand(b *testing.B) { +func BenchmarkExpandAESNI(b *testing.B) { c := &sm4Cipher{} b.ResetTimer() for i := 0; i < b.N; i++ { - expandKey(encryptTests[0].key, c.enc[:], c.dec[:]) + expandKeyAsm(&encryptTests[0].key[0], &ck[0], &c.enc[0], &c.dec[0], INST_AES) + } +} + +func BenchmarkEncryptAsm(b *testing.B) { + src := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} + encRes2 := make([]uint32, 32) + decRes2 := make([]uint32, 32) + expandKeyAsm(&src[0], &ck[0], &encRes2[0], &decRes2[0], 0) + dst := make([]byte, 16) + b.SetBytes(int64(len(src))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + encryptBlockAsm(&encRes2[0], &dst[0], &src[0], 0) } } diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index a6b6a10..3fdf1bc 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -166,14 +166,19 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0 XORL CX, CX sm4InitEncLoop: - SM4_SINGLE_ROUND(0, RK, CX, T0, T1, T2, B3, B2, B1, B0) - SM4_SINGLE_ROUND(1, RK, CX, T0, T1, T2, B2, B1, B0, B3) - SM4_SINGLE_ROUND(2, RK, CX, T0, T1, T2, B1, B0, B3, B2) - SM4_SINGLE_ROUND(3, RK, CX, T0, T1, T2, B0, B3, B2, B1) + MOVUPS (RK)(CX*1), B4 + 
MOVOU B4, T0 + SM4_SINGLE_ROUND(T0, T1, T2, B3, B2, B1, B0) + PSHUFD $1, B4, T0 + SM4_SINGLE_ROUND(T0, T1, T2, B2, B1, B0, B3) + PSHUFD $2, B4, T0 + SM4_SINGLE_ROUND(T0, T1, T2, B1, B0, B3, B2) + PSHUFD $3, B4, T0 + SM4_SINGLE_ROUND(T0, T1, T2, B0, B3, B2, B1) - ADDL $16, CX - CMPL CX, $4*32 - JB sm4InitEncLoop + ADDL $16, CX + CMPL CX, $4*32 + JB sm4InitEncLoop PALIGNR $4, B3, B3 PALIGNR $4, B3, B2