sm4: optimize TAO L1 #168

Sun Yimin authored on 2023-09-28 10:11:31 +08:00, committed by GitHub
parent cc441bed27
commit 53e121c2b5
8 changed files with 42 additions and 91 deletions


@@ -41,14 +41,6 @@ DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), 8, $16
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), 8, $16
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), 8, $16
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), 8, $16
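The r16_mask and r24_mask tables deleted above are redundant: the byte permutation encoded by r08_mask (a rotate-left-by-8 of each 32-bit word) composed with itself gives the rotate-by-16 pattern, and composed once more gives the rotate-by-24 pattern, so every shuffle can reuse r08_mask. A minimal Go sketch of that composition for one word (the rot8 table and compose helper are illustrative, not code from this repository):

package main

import "fmt"

// rot8 is the per-word shuffle index pattern stored in r08_mask above:
// destination byte i takes source byte rot8[i], which rotates a 32-bit
// little-endian word left by 8 bits.
var rot8 = [4]byte{3, 0, 1, 2}

// compose returns the index table equivalent to shuffling with p first and
// then with q: out[i] = p[q[i]].
func compose(p, q [4]byte) [4]byte {
	var r [4]byte
	for i := range r {
		r[i] = p[q[i]]
	}
	return r
}

func main() {
	rot16 := compose(rot8, rot8)  // [2 3 0 1], the pattern of the removed r16_mask
	rot24 := compose(rot16, rot8) // [1 2 3 0], the pattern of the removed r24_mask
	fmt.Println(rot16, rot24)
}

The same reasoning frees the R16_MASK and R24_MASK vector registers in the arm64 files below, since VTBL consumes the identical index table.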
@@ -66,18 +58,6 @@ DATA r08_mask256<>+0x10(SB)/8, $0x0605040702010003
DATA r08_mask256<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask256<>(SB), 8, $32
DATA r16_mask256<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask256<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA r16_mask256<>+0x10(SB)/8, $0x0504070601000302
DATA r16_mask256<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask256<>(SB), 8, $32
DATA r24_mask256<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask256<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
DATA r24_mask256<>+0x10(SB)/8, $0x0407060500030201
DATA r24_mask256<>+0x18(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask256<>(SB), 8, $32
// Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance!
// input: from high to low
// r0 = [w3, w2, w1, w0]
@@ -164,19 +144,18 @@ GLOBL r24_mask256<>(SB), 8, $32
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \
PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
MOVOU x, z; \
PSHUFB r16_mask<>(SB), z; \
PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
PSHUFB r08_mask<>(SB), y; \ //y = x <<< 8
MOVOU y, z; \
PSHUFB r08_mask<>(SB), z; \ //z = x <<< 16
PXOR x, y; \ //y = x ^ (x <<< 8)
PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16)
PSHUFB r08_mask<>(SB), z; \ //z = x <<< 24
PXOR z, x; \ //x = x ^ (x <<< 24)
MOVOU y, z; \
PSLLL $2, z; \
PSRLL $30, y; \
POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
MOVOU x, z; \
PSHUFB r24_mask<>(SB), z; \
PXOR y, x; \ //x = x xor y
PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
POR z, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
PXOR y, x
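Written this way, the macro computes the SM4 L1 linear transform L(x) = x ^ (x <<< 2) ^ (x <<< 10) ^ (x <<< 18) ^ (x <<< 24) as (x ^ (x <<< 24)) ^ ((x ^ (x <<< 8) ^ (x <<< 16)) <<< 2), i.e. three byte shuffles by the same r08 mask plus one shift/or pair. A scalar Go cross-check of that identity (a sketch for illustration only; the function names are not from the repository):

package main

import (
	"fmt"
	"math/bits"
)

// l1 is the SM4 L1 linear transform in its textbook form.
func l1(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
		bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
}

// l1Scheduled follows the instruction order of the macro above: three
// rotate-by-8 shuffles, one word rotate by 2, and a few XORs.
func l1Scheduled(x uint32) uint32 {
	y := x ^ bits.RotateLeft32(x, 8) ^ bits.RotateLeft32(x, 16) // PSHUFB r08 twice + PXORs
	y = bits.RotateLeft32(y, 2)                                 // PSLLL $2 / PSRLL $30 / POR
	return x ^ bits.RotateLeft32(x, 24) ^ y                     // last PSHUFB r08 + PXORs
}

func main() {
	for _, x := range []uint32{0, 1, 0xdeadbeef, 0xffffffff} {
		fmt.Println(l1(x) == l1Scheduled(x)) // prints true for every input
	}
}

The AVX, AVX2 and arm64 NEON variants further down apply exactly the same decomposition with their respective shuffle and shift instructions.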
// SM4 single round function, handle 16 bytes data
// t0 ^= tao_l1(t1^t2^t3^xk)
@@ -239,6 +218,7 @@ GLOBL r24_mask256<>(SB), 8, $32
PSHUFD $0xFF, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \
// Requires: SSSE3
#define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \
PSHUFD $1, t0, t1; \
@@ -388,16 +368,16 @@ GLOBL r24_mask256<>(SB), 8, $32
// - tmp: 128 bits temp register
#define AVX_SM4_TAO_L1(x, y, tmp) \
AVX_SM4_SBOX(x, y, tmp); \
VPSHUFB r08_mask<>(SB), x, y; \
VPXOR x, y, y; \
VPSHUFB r16_mask<>(SB), x, tmp; \
VPXOR tmp, y, y; \
VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16
VPXOR x, y, y; \ // y = x ^ (x <<< 8)
VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
VPSHUFB r08_mask<>(SB), tmp, tmp; \ // tmp = x <<< 24
VPXOR x, tmp, x; \ // x = x ^ (x <<< 24)
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPOR tmp, y, y; \
VPSHUFB r24_mask<>(SB), x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x
VPOR tmp, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
VPXOR y, x, x
// transpose matrix function, AVX/AVX2 version
// parameters:
@@ -433,7 +413,7 @@ GLOBL r24_mask256<>(SB), 8, $32
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, tmp); \
VPXOR x, t0, t0
@@ -591,16 +571,16 @@ GLOBL r24_mask256<>(SB), 8, $32
// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
VPSHUFB r08_mask256<>(SB), x, y; \
VPXOR x, y, y; \
VPSHUFB r16_mask256<>(SB), x, z; \
VPXOR z, y, y; \
VPSHUFB r08_mask256<>(SB), x, y; \ // y = x <<< 8
VPSHUFB r08_mask256<>(SB), y, z; \ // z = x <<< 16
VPXOR x, y, y; \ // y = x ^ (x <<< 8)
VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
VPSHUFB r08_mask256<>(SB), z, z; \ // z = x <<< 24
VPXOR x, z, x; \ // x = x ^ (x <<< 24)
VPSLLD $2, y, z; \
VPSRLD $30, y, y; \
VPOR z, y, y; \
VPSHUFB r24_mask256<>(SB), x, z; \
VPXOR y, x, x; \
VPXOR x, z, x
VPOR z, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
VPXOR y, x, x
// SM4 round function, AVX2 version, handle 256 bits
// t0 ^= tao_l1(t1^t2^t3^xk)


@@ -31,14 +31,6 @@ DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (16+8), $16
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), (16+8), $16
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), (16+8), $16
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (16+8), $16
@@ -64,13 +56,7 @@ GLOBL fk_mask<>(SB), (16+8), $16
VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
LDP r08_mask<>(SB), (R20, R21) \
VMOV R20, R08_MASK.D[0] \
VMOV R21, R08_MASK.D[1] \
LDP r16_mask<>(SB), (R20, R21) \
VMOV R20, R16_MASK.D[0] \
VMOV R21, R16_MASK.D[1] \
LDP r24_mask<>(SB), (R20, R21) \
VMOV R20, R24_MASK.D[0] \
VMOV R21, R24_MASK.D[1]
VMOV R21, R08_MASK.D[1]
// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
@@ -141,15 +127,15 @@ GLOBL fk_mask<>(SB), (16+8), $16
// - z: 128 bits temp register
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
VTBL R08_MASK.B16, [x.B16], y.B16; \
VEOR y.B16, x.B16, y.B16; \
VTBL R16_MASK.B16, [x.B16], z.B16; \
VEOR z.B16, y.B16, z.B16; \
VSHL $2, z.S4, y.S4; \
VSRI $30, z.S4, y.S4; \
VTBL R24_MASK.B16, [x.B16], z.B16; \
VEOR z.B16, x.B16, x.B16; \
VEOR y.B16, x.B16, x.B16
VTBL R08_MASK.B16, [x.B16], y.B16; \ // y = x <<< 8
VTBL R08_MASK.B16, [y.B16], z.B16; \ // z = x <<< 16
VEOR x.B16, y.B16, y.B16; \ // y = x ^ (x <<< 8)
VEOR z.B16, y.B16, y.B16; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
VTBL R08_MASK.B16, [z.B16], z.B16; \ // z = x <<< 24
VEOR z.B16, x.B16, x.B16; \ // x = x ^ (x <<< 24)
VSHL $2, y.S4, z.S4; \
VSRI $30, y.S4, z.S4; \ // z = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
VEOR z.B16, x.B16, x.B16
// SM4 round function
// t0 ^= tao_l1(t1^t2^t3^xk)


@@ -322,6 +322,7 @@ avx2_sm4_done:
RET
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
// Requires: SSSE3
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX


@@ -21,9 +21,7 @@
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define R16_MASK V27
#define R24_MASK V28
#define FK_MASK V29
#define FK_MASK V27
#define XTMP6 V6
#define XTMP7 V7
@@ -78,13 +76,7 @@
load_global_data_1() \
LDP r08_mask<>(SB), (R0, R1) \
VMOV R0, R08_MASK.D[0] \
VMOV R1, R08_MASK.D[1] \
LDP r16_mask<>(SB), (R0, R1) \
VMOV R0, R16_MASK.D[0] \
VMOV R1, R16_MASK.D[1] \
LDP r24_mask<>(SB), (R0, R1) \
VMOV R0, R24_MASK.D[0] \
VMOV R1, R24_MASK.D[1]
VMOV R1, R08_MASK.D[1]
#define SM4EKEY_EXPORT_KEYS() \
VMOV V9.S[3], V10.S[0] \


@@ -17,9 +17,7 @@
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define R16_MASK V27
#define R24_MASK V28
#define FK_MASK V29
#define FK_MASK V27
#define XTMP6 V6
#define IV V7


@@ -17,9 +17,7 @@
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define R16_MASK V27
#define R24_MASK V28
#define FK_MASK V29
#define FK_MASK V27
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10


@@ -37,8 +37,6 @@
#define M2L V27
#define M2H V28
#define R08_MASK V29
#define R16_MASK V30
#define R24_MASK V31
#define reduce() \
VEOR ACC0.B16, ACCM.B16, ACCM.B16 \


@@ -37,8 +37,6 @@
#define M2L V27
#define M2H V28
#define R08_MASK V29
#define R16_MASK V30
#define R24_MASK V31
#include "aesni_macros_arm64.s"
#include "xts_macros_arm64.s"