sm4: optimize TAO L1 #168

parent cc441bed27
commit 53e121c2b5
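The rewrite below leans on one algebraic fact about SM4's L1 transform, L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24): rotation distributes over XOR, so with y = B ^ (B <<< 8) ^ (B <<< 16) we get y <<< 2 = (B <<< 2) ^ (B <<< 10) ^ (B <<< 18), and therefore L(B) = B ^ (B <<< 24) ^ (y <<< 2). Every rotation needed is then a multiple of 8 (one byte-shuffle mask applied repeatedly) plus a single shift pair, which is why the r16/r24 masks can be deleted. A scalar Go sketch of the identity, for illustration only (names are ours, not part of the commit):

package main

import (
	"fmt"
	"math/bits"
)

// l1 is the textbook SM4 L1 linear transform.
func l1(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// l1Rewritten mirrors the new assembly: three applications of the same
// byte-rotate-8 shuffle stand in for <<< 8, <<< 16 and <<< 24.
func l1Rewritten(b uint32) uint32 {
	r8 := bits.RotateLeft32(b, 8)    // PSHUFB r08_mask
	r16 := bits.RotateLeft32(r8, 8)  // PSHUFB r08_mask, again
	r24 := bits.RotateLeft32(r16, 8) // and again
	y := b ^ r8 ^ r16
	return b ^ r24 ^ bits.RotateLeft32(y, 2) // the only non-byte rotation left
}

func main() {
	for _, b := range []uint32{0, 1, 0x01234567, 0xdeadbeef, 0xffffffff} {
		if l1(b) != l1Rewritten(b) {
			panic(fmt.Sprintf("mismatch at %#08x", b))
		}
	}
	fmt.Println("ok: rewritten TAO L1 equals the textbook transform")
}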
@@ -41,14 +41,6 @@ DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
 DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
 GLOBL r08_mask<>(SB), 8, $16
 
-DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
-DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-GLOBL r16_mask<>(SB), 8, $16
-
-DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
-DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
-GLOBL r24_mask<>(SB), 8, $16
-
 DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
 GLOBL fk_mask<>(SB), 8, $16
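Aside on the GLOBL flag operand, which differs between the amd64 and arm64 files in this commit: the values come from Go's runtime/textflag.h, so the plain 8 above is RODATA, while the arm64 data section later uses (16+8), i.e. NOPTR|RODATA:

// from $GOROOT/src/runtime/textflag.h
#define RODATA 8  // (for DATA/GLOBL) put this data in a read-only section
#define NOPTR 16  // (for DATA/GLOBL) this data contains no pointers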
@@ -66,18 +58,6 @@ DATA r08_mask256<>+0x10(SB)/8, $0x0605040702010003
 DATA r08_mask256<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
 GLOBL r08_mask256<>(SB), 8, $32
 
-DATA r16_mask256<>+0x00(SB)/8, $0x0504070601000302
-DATA r16_mask256<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-DATA r16_mask256<>+0x10(SB)/8, $0x0504070601000302
-DATA r16_mask256<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
-GLOBL r16_mask256<>(SB), 8, $32
-
-DATA r24_mask256<>+0x00(SB)/8, $0x0407060500030201
-DATA r24_mask256<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
-DATA r24_mask256<>+0x10(SB)/8, $0x0407060500030201
-DATA r24_mask256<>+0x18(SB)/8, $0x0C0F0E0D080B0A09
-GLOBL r24_mask256<>(SB), 8, $32
-
 // Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance!
 // input: from high to low
 // r0 = [w3, w2, w1, w0]
@@ -164,19 +144,18 @@ GLOBL r24_mask256<>(SB), 8, $32
 	SM4_SBOX(x, y, z); \
 	; \ //#################### 4 parallel L1 linear transforms ##################//
 	MOVOU x, y; \
-	PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
-	PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
-	MOVOU x, z; \
-	PSHUFB r16_mask<>(SB), z; \
-	PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
+	PSHUFB r08_mask<>(SB), y; \ //y = x <<< 8
+	MOVOU y, z; \
+	PSHUFB r08_mask<>(SB), z; \ //z = x <<< 16
+	PXOR x, y; \ //y = x ^ (x <<< 8)
+	PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16)
+	PSHUFB r08_mask<>(SB), z; \ //z = x <<< 24
+	PXOR z, x; \ //x = x ^ (x <<< 24)
 	MOVOU y, z; \
 	PSLLL $2, z; \
 	PSRLL $30, y; \
-	POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
-	MOVOU x, z; \
-	PSHUFB r24_mask<>(SB), z; \
-	PXOR y, x; \ //x = x xor y
-	PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
+	POR z, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
+	PXOR y, x
 
 // SM4 single round function, handle 16 bytes data
 // t0 ^= tao_l1(t1^t2^t3^xk)
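Why a single mask is enough here: r08_mask is a per-32-bit-lane byte rotation, and byte rotations compose, so applying it k times yields <<< 8k. Below is a small Go model of PSHUFB over one 16-byte lane, checked against bits.RotateLeft32 (illustrative only; the real instruction also zeroes bytes whose mask high bit is set, which these masks never do):

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// pshufb models the byte shuffle on one 128-bit lane: dst[i] = src[mask[i]&15].
func pshufb(src, mask [16]byte) [16]byte {
	var dst [16]byte
	for i := range dst {
		dst[i] = src[mask[i]&15]
	}
	return dst
}

func main() {
	// r08_mask as stored by the DATA directives above (two little-endian
	// 64-bit words): [3 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14].
	var r08 [16]byte
	binary.LittleEndian.PutUint64(r08[0:], 0x0605040702010003)
	binary.LittleEndian.PutUint64(r08[8:], 0x0E0D0C0F0A09080B)

	words := [4]uint32{0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff}
	var src [16]byte
	for i, w := range words {
		binary.LittleEndian.PutUint32(src[4*i:], w)
	}

	x := src
	for k := 1; k <= 3; k++ { // one, two, three applications of the mask
		x = pshufb(x, r08)
		for i, w := range words {
			if binary.LittleEndian.Uint32(x[4*i:]) != bits.RotateLeft32(w, 8*k) {
				panic(fmt.Sprintf("lane %d: <<< %d mismatch", i, 8*k))
			}
		}
	}
	fmt.Println("ok: r08_mask applied k times rotates every 32-bit lane left by 8k")
}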
@@ -239,6 +218,7 @@ GLOBL r24_mask256<>(SB), 8, $32
 	PSHUFD $0xFF, rk128, x; \
 	SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \
 
+// Requires: SSSE3
 #define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \
 	PSHUFB flip_mask<>(SB), t0; \
 	PSHUFD $1, t0, t1; \
@@ -388,16 +368,16 @@ GLOBL r24_mask256<>(SB), 8, $32
 // - tmp: 128 bits temp register
 #define AVX_SM4_TAO_L1(x, y, tmp) \
 	AVX_SM4_SBOX(x, y, tmp); \
-	VPSHUFB r08_mask<>(SB), x, y; \
-	VPXOR x, y, y; \
-	VPSHUFB r16_mask<>(SB), x, tmp; \
-	VPXOR tmp, y, y; \
+	VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
+	VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16
+	VPXOR x, y, y; \ // y = x ^ (x <<< 8)
+	VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
+	VPSHUFB r08_mask<>(SB), tmp, tmp; \ // tmp = x <<< 24
+	VPXOR x, tmp, x; \ // x = x ^ (x <<< 24)
 	VPSLLD $2, y, tmp; \
 	VPSRLD $30, y, y; \
-	VPOR tmp, y, y; \
-	VPSHUFB r24_mask<>(SB), x, tmp; \
-	VPXOR y, x, x; \
-	VPXOR x, tmp, x
+	VPOR tmp, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
+	VPXOR y, x, x
 
 // transpose matrix function, AVX/AVX2 version
 // parameters:
@@ -433,7 +413,7 @@ GLOBL r24_mask256<>(SB), 8, $32
 	VPXOR t1, x, x; \
 	VPXOR t2, x, x; \
 	VPXOR t3, x, x; \
-	AVX_SM4_TAO_L1(x, y, tmp); \
+	AVX_SM4_TAO_L1(x, y, tmp); \
 	VPXOR x, t0, t0
 
 
@@ -591,16 +571,16 @@ GLOBL r24_mask256<>(SB), 8, $32
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 	AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
-	VPSHUFB r08_mask256<>(SB), x, y; \
-	VPXOR x, y, y; \
-	VPSHUFB r16_mask256<>(SB), x, z; \
-	VPXOR z, y, y; \
+	VPSHUFB r08_mask256<>(SB), x, y; \ // y = x <<< 8
+	VPSHUFB r08_mask256<>(SB), y, z; \ // z = x <<< 16
+	VPXOR x, y, y; \ // y = x ^ (x <<< 8)
+	VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
+	VPSHUFB r08_mask256<>(SB), z, z; \ // z = x <<< 24
+	VPXOR x, z, x; \ // x = x ^ (x <<< 24)
 	VPSLLD $2, y, z; \
 	VPSRLD $30, y, y; \
-	VPOR z, y, y; \
-	VPSHUFB r24_mask256<>(SB), x, z; \
-	VPXOR y, x, x; \
-	VPXOR x, z, x
+	VPOR z, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
+	VPXOR y, x, x
 
 // SM4 round function, AVX2 version, handle 256 bits
 // t0 ^= tao_l1(t1^t2^t3^xk)

@@ -31,14 +31,6 @@ DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
 DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
 GLOBL r08_mask<>(SB), (16+8), $16
 
-DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
-DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-GLOBL r16_mask<>(SB), (16+8), $16
-
-DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
-DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
-GLOBL r24_mask<>(SB), (16+8), $16
-
 DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
 GLOBL fk_mask<>(SB), (16+8), $16
@@ -64,13 +56,7 @@ GLOBL fk_mask<>(SB), (16+8), $16
 	VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
 	LDP r08_mask<>(SB), (R20, R21) \
 	VMOV R20, R08_MASK.D[0] \
-	VMOV R21, R08_MASK.D[1] \
-	LDP r16_mask<>(SB), (R20, R21) \
-	VMOV R20, R16_MASK.D[0] \
-	VMOV R21, R16_MASK.D[1] \
-	LDP r24_mask<>(SB), (R20, R21) \
-	VMOV R20, R24_MASK.D[0] \
-	VMOV R21, R24_MASK.D[1]
+	VMOV R21, R08_MASK.D[1]
 
 // input: from high to low
 // t0 = t0.S3, t0.S2, t0.S1, t0.S0
@@ -141,15 +127,15 @@ GLOBL fk_mask<>(SB), (16+8), $16
 // - z: 128 bits temp register
 #define SM4_TAO_L1(x, y, z) \
 	SM4_SBOX(x, y, z); \
-	VTBL R08_MASK.B16, [x.B16], y.B16; \
-	VEOR y.B16, x.B16, y.B16; \
-	VTBL R16_MASK.B16, [x.B16], z.B16; \
-	VEOR z.B16, y.B16, z.B16; \
-	VSHL $2, z.S4, y.S4; \
-	VSRI $30, z.S4, y.S4; \
-	VTBL R24_MASK.B16, [x.B16], z.B16; \
-	VEOR z.B16, x.B16, x.B16; \
-	VEOR y.B16, x.B16, x.B16
+	VTBL R08_MASK.B16, [x.B16], y.B16; \ // y = x <<< 8
+	VTBL R08_MASK.B16, [y.B16], z.B16; \ // z = x <<< 16
+	VEOR x.B16, y.B16, y.B16; \ // y = x ^ (x <<< 8)
+	VEOR z.B16, y.B16, y.B16; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
+	VTBL R08_MASK.B16, [z.B16], z.B16; \ // z = x <<< 24
+	VEOR z.B16, x.B16, x.B16; \ // x = x ^ (x <<< 24)
+	VSHL $2, y.S4, z.S4; \
+	VSRI $30, y.S4, z.S4; \ // z = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
+	VEOR z.B16, x.B16, x.B16
 
 // SM4 round function
 // t0 ^= tao_l1(t1^t2^t3^xk)

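NEON has no 32-bit element rotate either, so the new arm64 tail builds y <<< 2 from a shift-left plus a shift-right-insert. A scalar Go model of that VSHL/VSRI pair (illustration only, names ours):

package main

import "fmt"

// rotl2 models the VSHL/VSRI pair at the end of SM4_TAO_L1 above.
func rotl2(y uint32) uint32 {
	z := y << 2              // VSHL $2, y.S4, z.S4: low two bits now zero
	z = (z &^ 3) | (y >> 30) // VSRI $30, y.S4, z.S4: insert y's top two bits
	return z                 // into the cleared low bits; high 30 bits kept
}

func main() {
	fmt.Printf("%#08x\n", rotl2(0x80000001)) // 0x00000006 == 0x80000001 <<< 2
}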
@@ -322,6 +322,7 @@ avx2_sm4_done:
 	RET
 
 // func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
+// Requires: SSSE3
 TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
 	MOVQ xk+0(FP), AX
 	MOVQ dst+8(FP), BX

@@ -21,9 +21,7 @@
 #define M2L V24
 #define M2H V25
 #define R08_MASK V26
-#define R16_MASK V27
-#define R24_MASK V28
-#define FK_MASK V29
+#define FK_MASK V27
 #define XTMP6 V6
 #define XTMP7 V7
 
@@ -78,13 +76,7 @@
 	load_global_data_1() \
 	LDP r08_mask<>(SB), (R0, R1) \
 	VMOV R0, R08_MASK.D[0] \
-	VMOV R1, R08_MASK.D[1] \
-	LDP r16_mask<>(SB), (R0, R1) \
-	VMOV R0, R16_MASK.D[0] \
-	VMOV R1, R16_MASK.D[1] \
-	LDP r24_mask<>(SB), (R0, R1) \
-	VMOV R0, R24_MASK.D[0] \
-	VMOV R1, R24_MASK.D[1]
+	VMOV R1, R08_MASK.D[1]
 
 #define SM4EKEY_EXPORT_KEYS() \
 	VMOV V9.S[3], V10.S[0] \

@@ -17,9 +17,7 @@
 #define M2L V24
 #define M2H V25
 #define R08_MASK V26
-#define R16_MASK V27
-#define R24_MASK V28
-#define FK_MASK V29
+#define FK_MASK V27
 #define XTMP6 V6
 #define IV V7
 

@@ -17,9 +17,7 @@
 #define M2L V24
 #define M2H V25
 #define R08_MASK V26
-#define R16_MASK V27
-#define R24_MASK V28
-#define FK_MASK V29
+#define FK_MASK V27
 #define XTMP6 V6
 #define XTMP7 V7
 #define t4 V10

@@ -37,8 +37,6 @@
 #define M2L V27
 #define M2H V28
 #define R08_MASK V29
-#define R16_MASK V30
-#define R24_MASK V31
 
 #define reduce() \
 	VEOR ACC0.B16, ACCM.B16, ACCM.B16 \

@@ -37,8 +37,6 @@
 #define M2L V27
 #define M2H V28
 #define R08_MASK V29
-#define R16_MASK V30
-#define R24_MASK V31
 
 #include "aesni_macros_arm64.s"
 #include "xts_macros_arm64.s"