Mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-28 13:16:19 +08:00)

sm4: reduce duplicated asm code

parent 572bf6574e
commit 8ddf1bc68f

sm4/aesni_amd64.h (new file, 255 lines)
@@ -0,0 +1,255 @@
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16

// shuffle byte and word order
DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
GLOBL bswap_mask<>(SB), RODATA, $16

// nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), RODATA, $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), RODATA, $16

// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), RODATA, $16

DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), RODATA, $16

// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), RODATA, $16

DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), RODATA, $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16

DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), RODATA, $16

DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), RODATA, $16

DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), RODATA, $16

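The three rotation masks above are fed to PSHUFB/VPSHUFB (VTBL on arm64) so that every 32-bit word of a vector is rotated left by 8, 16 or 24 bits in one instruction. A minimal Go sketch of that equivalence, not part of the diff, using only the first four index bytes of each mask:

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// shuffleWord applies a PSHUFB-style byte shuffle to one little-endian
// 32-bit lane: out[i] = in[idx[i]].
func shuffleWord(w uint32, idx [4]byte) uint32 {
	var in, out [4]byte
	binary.LittleEndian.PutUint32(in[:], w)
	for i, j := range idx {
		out[i] = in[j]
	}
	return binary.LittleEndian.Uint32(out[:])
}

func main() {
	w := uint32(0x11223344)
	// First four index bytes of r08_mask, r16_mask and r24_mask.
	r08 := [4]byte{3, 0, 1, 2}
	r16 := [4]byte{2, 3, 0, 1}
	r24 := [4]byte{1, 2, 3, 0}
	fmt.Println(shuffleWord(w, r08) == bits.RotateLeft32(w, 8))  // true
	fmt.Println(shuffleWord(w, r16) == bits.RotateLeft32(w, 16)) // true
	fmt.Println(shuffleWord(w, r24) == bits.RotateLeft32(w, 24)) // true
}

The byte-shuffle form is what lets a single instruction rotate four (or eight) words at once.
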
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
// MOVOU r2, tmp1;
// PUNPCKLDQ r3, tmp1;
// PUNPCKHDQ r3, r2;
// MOVOU r0, r1;
// PUNPCKHQDQ tmp1, r1;
// PUNPCKLQDQ tmp1, r0;
// MOVOU tmp2, r3;
// PUNPCKHQDQ r2, r3;
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
PEXTRD $2, r0, r; \
PINSRD $0, r, tmp2; \
PEXTRD $2, r1, r; \
PINSRD $1, r, tmp2; \
; \
PEXTRD $3, r0, r; \
PINSRD $2, r, tmp2; \
PEXTRD $3, r1, r; \
PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
; \
PEXTRD $1, r0, r; \
PINSRD $2, r, r0; \
PEXTRD $0, r1, r; \
PINSRD $1, r, r0; \
PEXTRD $1, r1, r; \
PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
; \
PEXTRD $0, r2, r; \
PINSRD $0, r, tmp1; \
PEXTRD $0, r3, r; \
PINSRD $1, r, tmp1; \
PEXTRD $1, r2, r; \
PINSRD $2, r, tmp1; \
PEXTRD $1, r3, r; \
PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
; \
PEXTRD $2, r2, r; \
PINSRD $0, r, r2; \
PEXTRD $2, r3, r; \
PINSRD $1, r, r2; \
PEXTRD $3, r2, r; \
PINSRD $2, r, r2; \
PEXTRD $3, r3, r; \
PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
; \
MOVOU r0, r1; \
PEXTRQ $1, r1, r; \
PINSRQ $0, r, r1; \
PEXTRQ $1, tmp1, r; \
PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
; \
PEXTRQ $0, tmp1, r; \
PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
; \
MOVOU tmp2, r3; \
PEXTRQ $1, r3, r; \
PINSRQ $0, r, r3; \
PEXTRQ $1, r2, r; \
PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
; \
PEXTRQ $0, r2, r; \
PINSRQ $1, r, r2; \
PEXTRQ $0, tmp2, r; \
PINSRQ $0, r, r2

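SSE_TRANSPOSE_MATRIX (like the AVX and arm64 TRANSPOSE macros that follow) reorders four blocks of four words so that register i ends up holding word i of every block, as the bracketed comments record. A scalar Go sketch of that data movement, not part of the diff and ignoring the lane order inside a register; the function name is illustrative:

package sm4ref

// transpose4x4 mirrors the effect of the TRANSPOSE macros: on input,
// m[i] holds the four words of block i; on output, t[i] holds word i
// of every block (t[0] = {w0, w4, w8, w12}, matching r0 = [w12, w8, w4, w0]
// in the macro comments, which list lanes high-to-low).
func transpose4x4(m [4][4]uint32) [4][4]uint32 {
	var t [4][4]uint32
	for i := 0; i < 4; i++ {
		for j := 0; j < 4; j++ {
			t[i][j] = m[j][i]
		}
	}
	return t
}
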
#define SM4_SBOX(x, y, z) \
; \ //############################# inner affine ############################//
MOVOU x, z; \
PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
MOVOU m1_low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m1_high<>(SB), z; \
PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
; \ // inverse ShiftRows
PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
; \ //############################# outer affine ############################//
MOVOU x, z; \
PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
MOVOU m2_low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m2_high<>(SB), z; \
PSHUFB x, z; \
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;

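SM4_SBOX evaluates its two affine transforms with 16-byte table lookups: PSHUFB against m1_low/m1_high (and later m2_low/m2_high) looks up the low and high nibble separately and XORs the results, while the inverse ShiftRows shuffle plus AESENCLAST supplies the AES S-box core in between. The split-table trick works because the transforms are GF(2)-linear (an affine constant can be folded into one of the two tables). A self-contained Go sketch of that identity, not part of the diff, with an arbitrary linear map standing in for the real constants:

package main

import "fmt"

// f is an arbitrary GF(2)-linear byte map used only for illustration;
// the real m1/m2 constants encode the SM4-specific affine transforms.
func f(x byte) byte { return x ^ x<<1 ^ x>>3 }

func main() {
	var lo, hi [16]byte
	for i := 0; i < 16; i++ {
		lo[i] = f(byte(i))      // contribution of the low nibble
		hi[i] = f(byte(i) << 4) // contribution of the high nibble
	}
	ok := true
	for x := 0; x < 256; x++ {
		b := byte(x)
		// The PSHUFB split-table identity: f(b) = hi[b>>4] ^ lo[b&0x0f].
		if hi[b>>4]^lo[b&0x0f] != f(b) {
			ok = false
		}
	}
	fmt.Println(ok) // true
}
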
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \
PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
MOVOU x, z; \
PSHUFB r16_mask<>(SB), z; \
PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
MOVOU y, z; \
PSLLL $2, z; \
PSRLL $30, y; \
POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
MOVOU x, z; \
PSHUFB r24_mask<>(SB), z; \
PXOR y, x; \ //x = x xor y
PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);

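SM4_TAO_L1 computes the SM4 L1 linear transform L(B) = B xor (B<<<2) xor (B<<<10) xor (B<<<18) xor (B<<<24), but factored so that only the byte-granular rotations (the r08/r16/r24 shuffles) and a single 2-bit rotate are needed. A Go sketch, not part of the diff, checking that the macro's factorization matches the textbook form:

package main

import (
	"fmt"
	"math/bits"
)

// l1Direct is the SM4 encryption linear transform as usually written.
func l1Direct(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
		bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
}

// l1Macro follows SM4_TAO_L1 step by step: build x ^ rol8(x) ^ rol16(x),
// rotate that sum left by 2, then fold in x and rol24(x).
func l1Macro(x uint32) uint32 {
	y := x ^ bits.RotateLeft32(x, 8) ^ bits.RotateLeft32(x, 16)
	y = bits.RotateLeft32(y, 2)
	return x ^ y ^ bits.RotateLeft32(x, 24)
}

func main() {
	for _, x := range []uint32{0, 1, 0xdeadbeef, 0x01234567} {
		fmt.Println(l1Direct(x) == l1Macro(x)) // true
	}
}

The AVX, AVX2 and arm64 variants below use the same factorization.
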
#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
VPAND X_NIBBLE_MASK, x, tmp; \
VMOVDQU m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VMOVDQU inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VAESENCLAST X_NIBBLE_MASK, x, x; \
VPANDN X_NIBBLE_MASK, x, tmp; \
VMOVDQU m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x

#define AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp) \
AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp); \
VMOVDQU r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VMOVDQU r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VMOVDQU r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x

#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]

#define AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
VPAND yNibbleMask, x, tmp; \
VBROADCASTI128 m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VEXTRACTI128 $1, x, yw \
VAESENCLAST xNibbleMask, xw, xw; \
VAESENCLAST xNibbleMask, yw, yw; \
VINSERTI128 $1, yw, x, x; \
VPANDN yNibbleMask, x, tmp; \
VBROADCASTI128 m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x

#define AVX2_SM4_TAO_L1(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp); \
VBROADCASTI128 r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VBROADCASTI128 r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x

sm4/aesni_arm64.h (new file, 111 lines)
@@ -0,0 +1,111 @@
// nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16

// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), (NOPTR+RODATA), $16

DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), (NOPTR+RODATA), $16

// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), (NOPTR+RODATA), $16

DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), (NOPTR+RODATA), $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16

DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16

DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16

DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16

#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
VMOV t0.B16, K.B16 \
VMOV t1.S[0], t0.S[1] \
VMOV t2.S[0], t0.S[2] \
VMOV t3.S[0], t0.S[3] \
VMOV K.S[1], t1.S[0] \
VMOV K.S[2], t2.S[0] \
VMOV K.S[3], t3.S[0] \
VMOV t1.D[1], K.D[1] \
VMOV t2.S[1], t1.S[2] \
VMOV t3.S[1], t1.S[3] \
VMOV K.S[2], t2.S[1] \
VMOV K.S[3], t3.S[1] \
VMOV t2.S[3], K.S[3] \
VMOV t3.S[2], t2.S[3] \
VMOV K.S[3], t3.S[2]

#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
VMOV t0.B16, K.B16 \
VMOV t3.S[0], t0.S[0] \
VMOV t2.S[0], t0.S[1] \
VMOV t1.S[0], t0.S[2] \
VMOV K0.S[0], t0.S[3] \
VMOV t3.S[1], t1.S[0] \
VMOV t3.S[2], t2.S[0] \
VMOV t3.S[3], t3.S[0] \
VMOV t2.S[3], t3.S[1] \
VMOV t1.S[3], t3.S[2] \
VMOV K.S[3], t3.S[3] \
VMOV K.S[2], t2.S[3] \
VMOV K.S[1], t1.S[3] \
VMOV t1.B16, K.B16 \
VMOV t2.S[1], t1.S[1] \
VMOV K.S[1], t1.S[2] \
VMOV t2.S[2], t2.S[1] \
VMOV K.S[2], t2.S[2]

#define SM4_SBOX(x, y, z) \
; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16; \
VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
AESE ZERO.B16, x.B16; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16

#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
VTBL R08_MASK.B16, [x.B16], y.B16; \
VEOR y.B16, x.B16, y.B16; \
VTBL R16_MASK.B16, [x.B16], z.B16; \
VEOR z.B16, y.B16, y.B16; \
VSHL $2, y.S4, z.S4; \
VUSHR $30, y.S4, y.S4; \
VORR y.B16, z.B16, y.B16; \
VTBL R24_MASK.B16, [x.B16], z.B16; \
VEOR z.B16, x.B16, x.B16; \
VEOR y.B16, x.B16, x.B16

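Both header files expose the same building blocks, which the .s files combine into the standard SM4 round: xor three state words with the round key, push the result through the S-box and the L1 mix, then xor into the remaining word (see the SM4_ROUND / SM4_SINGLE_ROUND macros further down). A scalar Go reference of that round, not part of the diff, with the 256-byte S-box table elided:

package sm4ref

import "math/bits"

// sbox is the standard SM4 S-box; its 256 byte values are omitted here.
var sbox [256]byte

// tau applies the S-box to each byte of a 32-bit word.
func tau(x uint32) uint32 {
	return uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
		uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
}

// l1 is the encryption linear transform mirrored by SM4_TAO_L1.
func l1(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
		bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
}

// round is one SM4 round as the assembly round macros compute it:
// t0 ^= L1(tau(rk ^ t1 ^ t2 ^ t3)).
func round(rk, t0, t1, t2, t3 uint32) uint32 {
	return t0 ^ l1(tau(rk^t1^t2^t3))
}
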
sm4/asm_amd64.s (211 lines changed)
@@ -14,108 +14,10 @@
|
||||
#define XTMP6 X6
|
||||
#define XTMP7 X7
|
||||
|
||||
// shuffle byte order from LE to BE
|
||||
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
|
||||
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
|
||||
GLOBL flip_mask<>(SB), RODATA, $16
|
||||
|
||||
// shuffle byte and word order
|
||||
DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
|
||||
DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
|
||||
GLOBL bswap_mask<>(SB), RODATA, $16
|
||||
|
||||
//nibble mask
|
||||
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
GLOBL nibble_mask<>(SB), RODATA, $16
|
||||
|
||||
// inverse shift rows
|
||||
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
|
||||
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
|
||||
GLOBL inverse_shift_rows<>(SB), RODATA, $16
|
||||
|
||||
// Affine transform 1 (low and high hibbles)
|
||||
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
|
||||
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
|
||||
GLOBL m1_low<>(SB), RODATA, $16
|
||||
|
||||
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
|
||||
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
|
||||
GLOBL m1_high<>(SB), RODATA, $16
|
||||
|
||||
// Affine transform 2 (low and high hibbles)
|
||||
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
|
||||
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
|
||||
GLOBL m2_low<>(SB), RODATA, $16
|
||||
|
||||
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
|
||||
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
|
||||
GLOBL m2_high<>(SB), RODATA, $16
|
||||
|
||||
// left rotations of 32-bit words by 8-bit increments
|
||||
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
GLOBL r08_mask<>(SB), RODATA, $16
|
||||
|
||||
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
GLOBL r16_mask<>(SB), RODATA, $16
|
||||
|
||||
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
|
||||
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
|
||||
GLOBL r24_mask<>(SB), RODATA, $16
|
||||
|
||||
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
|
||||
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
|
||||
GLOBL fk_mask<>(SB), RODATA, $16
|
||||
|
||||
#define SM4_SBOX(x, y) \
|
||||
; \ //############################# inner affine ############################//
|
||||
MOVOU x, XTMP6; \
|
||||
PAND nibble_mask<>(SB), XTMP6; \ //y = _mm_and_si128(x, c0f);
|
||||
MOVOU m1_low<>(SB), y; \
|
||||
PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m1l, y);
|
||||
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
|
||||
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
|
||||
MOVOU m1_high<>(SB), XTMP6; \
|
||||
PSHUFB x, XTMP6; \ //x = _mm_shuffle_epi8(m1h, x);
|
||||
MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m1h, x);
|
||||
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
|
||||
; \ // inverse ShiftRows
|
||||
PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
|
||||
AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
|
||||
; \ //############################# outer affine ############################//
|
||||
MOVOU x, XTMP6; \
|
||||
PANDN nibble_mask<>(SB), XTMP6; \ //XTMP6 = _mm_andnot_si128(x, c0f);
|
||||
MOVOU m2_low<>(SB), y; \
|
||||
PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m2l, XTMP6)
|
||||
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
|
||||
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
|
||||
MOVOU m2_high<>(SB), XTMP6; \
|
||||
PSHUFB x, XTMP6; \
|
||||
MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m2h, x)
|
||||
PXOR y, x; \ //x = _mm_shuffle_epi8(m2h, x) ^ y;
|
||||
|
||||
#define SM4_TAO_L1(x, y) \
|
||||
SM4_SBOX(x, y); \
|
||||
; \ //#################### 4 parallel L1 linear transforms ##################//
|
||||
MOVOU x, y; \
|
||||
PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
|
||||
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
|
||||
MOVOU x, XTMP6; \
|
||||
PSHUFB r16_mask<>(SB), XTMP6; \
|
||||
PXOR XTMP6, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
|
||||
MOVOU y, XTMP6; \
|
||||
PSLLL $2, XTMP6; \
|
||||
PSRLL $30, y; \
|
||||
POR XTMP6, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
|
||||
MOVOU x, XTMP7; \
|
||||
PSHUFB r24_mask<>(SB), XTMP7; \
|
||||
PXOR y, x; \ //x = x xor y
|
||||
PXOR XTMP7, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
|
||||
#include "aesni_amd64.h"
|
||||
|
||||
#define SM4_TAO_L2(x, y) \
|
||||
SM4_SBOX(x, y); \
|
||||
SM4_SBOX(x, y, XTMP6); \
|
||||
; \ //#################### 4 parallel L2 linear transforms ##################//
|
||||
MOVOU x, y; \
|
||||
MOVOU x, XTMP6; \
|
||||
@ -135,7 +37,7 @@ GLOBL fk_mask<>(SB), RODATA, $16
|
||||
PXOR t1, x; \
|
||||
PXOR t2, x; \
|
||||
PXOR t3, x; \
|
||||
SM4_TAO_L1(x, y); \
|
||||
SM4_TAO_L1(x, y, XTMP6); \
|
||||
PXOR x, t0
|
||||
|
||||
#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \
|
||||
@ -143,7 +45,7 @@ GLOBL fk_mask<>(SB), RODATA, $16
|
||||
PXOR t1, x; \
|
||||
PXOR t2, x; \
|
||||
PXOR t3, x; \
|
||||
SM4_TAO_L1(x, y); \
|
||||
SM4_TAO_L1(x, y, XTMP6); \
|
||||
PXOR x, t0
|
||||
|
||||
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
|
||||
@ -187,110 +89,20 @@ GLOBL fk_mask<>(SB), RODATA, $16
|
||||
#define XWORD X8
|
||||
#define YWORD X9
|
||||
|
||||
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
|
||||
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
|
||||
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
|
||||
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
|
||||
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
|
||||
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
|
||||
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
|
||||
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
|
||||
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
|
||||
|
||||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
|
||||
#define AVX2_SM4_SBOX(x, y) \
|
||||
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK; \
|
||||
VPAND NIBBLE_MASK, x, XDWTMP1; \
|
||||
VBROADCASTI128 m1_low<>(SB), y; \
|
||||
VPSHUFB XDWTMP1, y, y; \
|
||||
VPSRLQ $4, x, x; \
|
||||
VPAND NIBBLE_MASK, x, x; \
|
||||
VBROADCASTI128 m1_high<>(SB), XDWTMP1; \
|
||||
VPSHUFB x, XDWTMP1, x; \
|
||||
VPXOR y, x, x; \
|
||||
VBROADCASTI128 inverse_shift_rows<>(SB), XDWTMP1;\
|
||||
VPSHUFB XDWTMP1, x, x; \
|
||||
VEXTRACTI128 $1, x, YWORD \
|
||||
VAESENCLAST X_NIBBLE_MASK, XWORD, XWORD; \
|
||||
VAESENCLAST X_NIBBLE_MASK, YWORD, YWORD; \
|
||||
VINSERTI128 $1, YWORD, x, x; \
|
||||
VPANDN NIBBLE_MASK, x, XDWTMP1; \
|
||||
VBROADCASTI128 m2_low<>(SB), y; \
|
||||
VPSHUFB XDWTMP1, y, y; \
|
||||
VPSRLQ $4, x, x; \
|
||||
VPAND NIBBLE_MASK, x, x; \
|
||||
VBROADCASTI128 m2_high<>(SB), XDWTMP1; \
|
||||
VPSHUFB x, XDWTMP1, x; \
|
||||
VPXOR y, x, x
|
||||
|
||||
#define AVX2_SM4_TAO_L1(x, y) \
|
||||
AVX2_SM4_SBOX(x, y); \
|
||||
VBROADCASTI128 r08_mask<>(SB), XDWTMP0; \
|
||||
VPSHUFB XDWTMP0, x, y; \
|
||||
VPXOR x, y, y; \
|
||||
VBROADCASTI128 r16_mask<>(SB), XDWTMP0; \
|
||||
VPSHUFB XDWTMP0, x, XDWTMP0; \
|
||||
VPXOR XDWTMP0, y, y; \
|
||||
VPSLLD $2, y, XDWTMP1; \
|
||||
VPSRLD $30, y, y; \
|
||||
VPXOR XDWTMP1, y, y; \
|
||||
VBROADCASTI128 r24_mask<>(SB), XDWTMP0; \
|
||||
VPSHUFB XDWTMP0, x, XDWTMP0; \
|
||||
VPXOR y, x, x; \
|
||||
VPXOR x, XDWTMP0, x
|
||||
|
||||
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
|
||||
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX2_SM4_TAO_L1(x, y); \
|
||||
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
|
||||
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX2_SM4_TAO_L1(x, y, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK, XDWTMP0); \
|
||||
VPXOR x, t0, t0
|
||||
|
||||
#define AVX_SM4_SBOX(x, y) \
|
||||
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK; \
|
||||
VPAND X_NIBBLE_MASK, x, XWTMP1; \
|
||||
VMOVDQU m1_low<>(SB), y; \
|
||||
VPSHUFB XWTMP1, y, y; \
|
||||
VPSRLQ $4, x, x; \
|
||||
VPAND X_NIBBLE_MASK, x, x; \
|
||||
VMOVDQU m1_high<>(SB), XWTMP1; \
|
||||
VPSHUFB x, XWTMP1, x; \
|
||||
VPXOR y, x, x; \
|
||||
VMOVDQU inverse_shift_rows<>(SB), XWTMP1; \
|
||||
VPSHUFB XWTMP1, x, x; \
|
||||
VAESENCLAST X_NIBBLE_MASK, x, x; \
|
||||
VPANDN X_NIBBLE_MASK, x, XWTMP1; \
|
||||
VMOVDQU m2_low<>(SB), y; \
|
||||
VPSHUFB XWTMP1, y, y; \
|
||||
VPSRLQ $4, x, x; \
|
||||
VPAND X_NIBBLE_MASK, x, x; \
|
||||
VMOVDQU m2_high<>(SB), XWTMP1; \
|
||||
VPSHUFB x, XWTMP1, x; \
|
||||
VPXOR y, x, x
|
||||
|
||||
#define AVX_SM4_TAO_L1(x, y) \
|
||||
AVX_SM4_SBOX(x, y); \
|
||||
VMOVDQU r08_mask<>(SB), XWTMP0; \
|
||||
VPSHUFB XWTMP0, x, y; \
|
||||
VPXOR x, y, y; \
|
||||
VMOVDQU r16_mask<>(SB), XWTMP0; \
|
||||
VPSHUFB XWTMP0, x, XWTMP0; \
|
||||
VPXOR XWTMP0, y, y; \
|
||||
VPSLLD $2, y, XWTMP1; \
|
||||
VPSRLD $30, y, y; \
|
||||
VPXOR XWTMP1, y, y; \
|
||||
VMOVDQU r24_mask<>(SB), XWTMP0; \
|
||||
VPSHUFB XWTMP0, x, XWTMP0; \
|
||||
VPXOR y, x, x; \
|
||||
VPXOR x, XWTMP0, x
|
||||
|
||||
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
|
||||
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX_SM4_TAO_L1(x, y); \
|
||||
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \
|
||||
VPXOR x, t0, t0
|
||||
|
||||
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
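For reference, an assembly entry point declared with this comment is paired on the Go side with a prototype roughly like the following (a sketch; the exact file, build tags, and compiler directives in the package may differ):

//go:noescape
func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
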
|
||||
@ -408,6 +220,7 @@ done_sm4:
|
||||
RET
|
||||
|
||||
avx2:
|
||||
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
|
||||
CMPQ DI, $64
|
||||
JBE avx2_4blocks
|
||||
|
||||
|
sm4/asm_arm64.s
@@ -23,84 +23,10 @@
|
||||
#define XTMP6 V6
|
||||
#define XTMP7 V7
|
||||
|
||||
//nibble mask
|
||||
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
// inverse shift rows
|
||||
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
|
||||
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
|
||||
GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
// Affine transform 1 (low and high hibbles)
|
||||
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
|
||||
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
|
||||
GLOBL m1_low<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
|
||||
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
|
||||
GLOBL m1_high<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
// Affine transform 2 (low and high hibbles)
|
||||
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
|
||||
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
|
||||
GLOBL m2_low<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
|
||||
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
|
||||
GLOBL m2_high<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
// left rotations of 32-bit words by 8-bit increments
|
||||
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
|
||||
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
|
||||
GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
|
||||
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
|
||||
GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
#define SM4_SBOX(x, y) \
|
||||
; \ //############################# inner affine ############################//
|
||||
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
|
||||
VTBL XTMP7.B16, [M1L.B16], y.B16; \
|
||||
VUSHR $4, x.D2, x.D2; \
|
||||
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
|
||||
VTBL XTMP7.B16, [M1H.B16], XTMP7.B16; \
|
||||
VEOR y.B16, XTMP7.B16, x.B16; \
|
||||
VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
|
||||
AESE ZERO.B16, x.B16; \
|
||||
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
|
||||
VTBL XTMP7.B16, [M2L.B16], y.B16; \
|
||||
VUSHR $4, x.D2, x.D2; \
|
||||
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
|
||||
VTBL XTMP7.B16, [M2H.B16], XTMP7.B16; \
|
||||
VEOR y.B16, XTMP7.B16, x.B16
|
||||
|
||||
#define SM4_TAO_L1(x, y) \
|
||||
SM4_SBOX(x, y); \
|
||||
; \ //#################### 4 parallel L1 linear transforms ##################//
|
||||
VTBL R08_MASK.B16, [x.B16], y.B16; \
|
||||
VEOR y.B16, x.B16, y.B16; \
|
||||
VTBL R16_MASK.B16, [x.B16], XTMP7.B16; \
|
||||
VEOR XTMP7.B16, y.B16, y.B16; \
|
||||
VSHL $2, y.S4, XTMP7.S4; \
|
||||
VUSHR $30, y.S4, y.S4; \
|
||||
VORR y.B16, XTMP7.B16, y.B16; \
|
||||
VTBL R24_MASK.B16, [x.B16], XTMP7.B16; \
|
||||
VEOR XTMP7.B16, x.B16, x.B16; \
|
||||
VEOR y.B16, x.B16, x.B16
|
||||
#include "aesni_arm64.h"
|
||||
|
||||
#define SM4_TAO_L2(x, y) \
|
||||
SM4_SBOX(x, y); \
|
||||
SM4_SBOX(x, y, XTMP6); \
|
||||
; \ //#################### 4 parallel L2 linear transforms ##################//
|
||||
VSHL $13, x.S4, XTMP6.S4; \
|
||||
VUSHR $19, x.S4, y.S4; \
|
||||
@ -117,7 +43,7 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
|
||||
VEOR t1.B16, x.B16, x.B16; \
|
||||
VEOR t2.B16, x.B16, x.B16; \
|
||||
VEOR t3.B16, x.B16, x.B16; \
|
||||
SM4_TAO_L1(x, y); \
|
||||
SM4_TAO_L1(x, y, XTMP6); \
|
||||
VEOR x.B16, t0.B16, t0.B16
|
||||
|
||||
#define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \
|
||||
|
sm4/gcm_amd64.s (296 lines changed)
@@ -41,44 +41,6 @@
|
||||
#define NIBBLE_MASK Y11
|
||||
#define X_NIBBLE_MASK X11
|
||||
|
||||
// shuffle byte order from LE to BE
|
||||
DATA flipMask<>+0x00(SB)/8, $0x0405060700010203
|
||||
DATA flipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
|
||||
|
||||
//nibble mask
|
||||
DATA nibbleMask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
DATA nibbleMask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
|
||||
// inverse shift rows
|
||||
DATA inverseShiftRows<>+0x00(SB)/8, $0x0B0E0104070A0D00
|
||||
DATA inverseShiftRows<>+0x08(SB)/8, $0x0306090C0F020508
|
||||
|
||||
// Affine transform 1 (low and high hibbles)
|
||||
DATA m1Low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
|
||||
DATA m1Low<>+0x08(SB)/8, $0x3045F98CEF9A2653
|
||||
|
||||
DATA m1High<>+0x00(SB)/8, $0xC35BF46CAF379800
|
||||
DATA m1High<>+0x08(SB)/8, $0x68F05FC7049C33AB
|
||||
|
||||
// Affine transform 2 (low and high hibbles)
|
||||
DATA m2Low<>+0x00(SB)/8, $0x9A950A05FEF16E61
|
||||
DATA m2Low<>+0x08(SB)/8, $0x0E019E916A65FAF5
|
||||
|
||||
DATA m2High<>+0x00(SB)/8, $0x892D69CD44E0A400
|
||||
DATA m2High<>+0x08(SB)/8, $0x2C88CC68E14501A5
|
||||
|
||||
// left rotations of 32-bit words by 8-bit increments
|
||||
DATA r08Mask<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA r08Mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
|
||||
DATA r16Mask<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA r16Mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
|
||||
DATA r24Mask<>+0x00(SB)/8, $0x0407060500030201
|
||||
DATA r24Mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
|
||||
|
||||
DATA fkMask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
|
||||
DATA fkMask<>+0x08(SB)/8, $0xb27022dc677d9197
|
||||
|
||||
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
|
||||
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
|
||||
@ -117,21 +79,12 @@ DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
|
||||
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
|
||||
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
|
||||
|
||||
GLOBL flipMask<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL nibbleMask<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL inverseShiftRows<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL m1Low<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL m1High<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL m2Low<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL m2High<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL r08Mask<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL r16Mask<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL r24Mask<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL fkMask<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
|
||||
GLOBL andMask<>(SB), (NOPTR+RODATA), $240
|
||||
|
||||
#include "aesni_amd64.h"
|
||||
|
||||
// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
|
||||
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
|
||||
#define pTbl DI
|
||||
@ -202,51 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
|
||||
#undef plen
|
||||
#undef dlen
|
||||
|
||||
#define SM4_SBOX(x, y, z) \
|
||||
; \ //############################# inner affine ############################//
|
||||
MOVOU x, z; \
|
||||
PAND nibbleMask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
|
||||
MOVOU m1Low<>(SB), y; \
|
||||
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
|
||||
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
|
||||
PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
|
||||
MOVOU m1High<>(SB), z; \
|
||||
PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
|
||||
MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
|
||||
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
|
||||
; \ // inverse ShiftRows
|
||||
PSHUFB inverseShiftRows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
|
||||
AESENCLAST nibbleMask<>(SB), x; \ // AESNI instruction
|
||||
; \ //############################# outer affine ############################//
|
||||
MOVOU x, z; \
|
||||
PANDN nibbleMask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
|
||||
MOVOU m2Low<>(SB), y; \
|
||||
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
|
||||
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
|
||||
PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
|
||||
MOVOU m2High<>(SB), z; \
|
||||
PSHUFB x, z; \
|
||||
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
|
||||
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
|
||||
|
||||
#define SM4_TAO_L1(x, y, z) \
|
||||
SM4_SBOX(x, y, z); \
|
||||
; \ //#################### 4 parallel L1 linear transforms ##################//
|
||||
MOVOU x, y; \
|
||||
PSHUFB r08Mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
|
||||
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
|
||||
MOVOU x, z; \
|
||||
PSHUFB r16Mask<>(SB), z; \
|
||||
PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
|
||||
MOVOU y, z; \
|
||||
PSLLL $2, z; \
|
||||
PSRLL $30, y; \
|
||||
POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
|
||||
MOVOU x, z; \
|
||||
PSHUFB r24Mask<>(SB), z; \
|
||||
PXOR y, x; \ //x = x xor y
|
||||
PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
|
||||
|
||||
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
|
||||
PINSRD $0, (index * 4)(RK)(IND*1), x; \
|
||||
PXOR t1, x; \
|
||||
@ -264,80 +172,11 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
|
||||
SM4_TAO_L1(x, y, z); \
|
||||
PXOR x, t0
|
||||
|
||||
// MOVOU r0, tmp2;
|
||||
// PUNPCKHDQ r1, tmp2;
|
||||
// PUNPCKLDQ r1, r0;
|
||||
// MOVOU r2, tmp1;
|
||||
// PUNPCKLDQ r3, tmp1;
|
||||
// PUNPCKHDQ r3, r2;
|
||||
// MOVOU r0, r1;
|
||||
// PUNPCKHQDQ tmp1, r1;
|
||||
// PUNPCKLQDQ tmp1, r0;
|
||||
// MOVOU tmp2, r3;
|
||||
// PUNPCKHQDQ r2, r3;
|
||||
// PUNPCKLQDQ r2, tmp2;
|
||||
// MOVOU tmp2, r2
|
||||
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
|
||||
PEXTRD $2, r0, r; \
|
||||
PINSRD $0, r, tmp2; \
|
||||
PEXTRD $2, r1, r; \
|
||||
PINSRD $1, r, tmp2; \
|
||||
; \
|
||||
PEXTRD $3, r0, r; \
|
||||
PINSRD $2, r, tmp2; \
|
||||
PEXTRD $3, r1, r; \
|
||||
PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
|
||||
; \
|
||||
PEXTRD $1, r0, r; \
|
||||
PINSRD $2, r, r0; \
|
||||
PEXTRD $0, r1, r; \
|
||||
PINSRD $1, r, r0; \
|
||||
PEXTRD $1, r1, r; \
|
||||
PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
|
||||
; \
|
||||
PEXTRD $0, r2, r; \
|
||||
PINSRD $0, r, tmp1; \
|
||||
PEXTRD $0, r3, r; \
|
||||
PINSRD $1, r, tmp1; \
|
||||
PEXTRD $1, r2, r; \
|
||||
PINSRD $2, r, tmp1; \
|
||||
PEXTRD $1, r3, r; \
|
||||
PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
|
||||
; \
|
||||
PEXTRD $2, r2, r; \
|
||||
PINSRD $0, r, r2; \
|
||||
PEXTRD $2, r3, r; \
|
||||
PINSRD $1, r, r2; \
|
||||
PEXTRD $3, r2, r; \
|
||||
PINSRD $2, r, r2; \
|
||||
PEXTRD $3, r3, r; \
|
||||
PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
|
||||
; \
|
||||
MOVOU r0, r1; \
|
||||
PEXTRQ $1, r1, r; \
|
||||
PINSRQ $0, r, r1; \
|
||||
PEXTRQ $1, tmp1, r; \
|
||||
PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
|
||||
; \
|
||||
PEXTRQ $0, tmp1, r; \
|
||||
PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
|
||||
; \
|
||||
MOVOU tmp2, r3; \
|
||||
PEXTRQ $1, r3, r; \
|
||||
PINSRQ $0, r, r3; \
|
||||
PEXTRQ $1, r2, r; \
|
||||
PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
|
||||
; \
|
||||
PEXTRQ $0, r2, r; \
|
||||
PINSRQ $1, r, r2; \
|
||||
PEXTRQ $0, tmp2, r; \
|
||||
PINSRQ $0, r, r2
|
||||
|
||||
#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
|
||||
PSHUFB flipMask<>(SB), t0; \
|
||||
PSHUFB flipMask<>(SB), t1; \
|
||||
PSHUFB flipMask<>(SB), t2; \
|
||||
PSHUFB flipMask<>(SB), t3; \
|
||||
PSHUFB flip_mask<>(SB), t0; \
|
||||
PSHUFB flip_mask<>(SB), t1; \
|
||||
PSHUFB flip_mask<>(SB), t2; \
|
||||
PSHUFB flip_mask<>(SB), t3; \
|
||||
SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \
|
||||
XORL IND, IND; \
|
||||
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
|
||||
@ -385,107 +224,20 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
|
||||
PSHUFB BSWAP, t1; \
|
||||
PSHUFB BSWAP, t0
|
||||
|
||||
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
|
||||
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
|
||||
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
|
||||
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
|
||||
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
|
||||
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
|
||||
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
|
||||
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
|
||||
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
|
||||
|
||||
#define AVX2_SM4_SBOX(x, y, xw, yw, tmp) \
|
||||
VPAND NIBBLE_MASK, x, tmp; \
|
||||
VBROADCASTI128 m1Low<>(SB), y; \
|
||||
VPSHUFB tmp, y, y; \
|
||||
VPSRLQ $4, x, x; \
|
||||
VPAND NIBBLE_MASK, x, x; \
|
||||
VBROADCASTI128 m1High<>(SB), tmp; \
|
||||
VPSHUFB x, tmp, x; \
|
||||
VPXOR y, x, x; \
|
||||
VBROADCASTI128 inverseShiftRows<>(SB), tmp; \
|
||||
VPSHUFB tmp, x, x; \
|
||||
VEXTRACTI128 $1, x, yw \
|
||||
VAESENCLAST X_NIBBLE_MASK, xw, xw; \
|
||||
VAESENCLAST X_NIBBLE_MASK, yw, yw; \
|
||||
VINSERTI128 $1, yw, x, x; \
|
||||
VPANDN NIBBLE_MASK, x, tmp; \
|
||||
VBROADCASTI128 m2Low<>(SB), y; \
|
||||
VPSHUFB tmp, y, y; \
|
||||
VPSRLQ $4, x, x; \
|
||||
VPAND NIBBLE_MASK, x, x; \
|
||||
VBROADCASTI128 m2High<>(SB), tmp; \
|
||||
VPSHUFB x, tmp, x; \
|
||||
VPXOR y, x, x
|
||||
|
||||
#define AVX2_SM4_TAO_L1(x, y, xw, yw, tmp) \
|
||||
AVX2_SM4_SBOX(x, y, xw, yw, tmp); \
|
||||
VBROADCASTI128 r08Mask<>(SB), tmp; \
|
||||
VPSHUFB tmp, x, y; \
|
||||
VPXOR x, y, y; \
|
||||
VBROADCASTI128 r16Mask<>(SB), tmp; \
|
||||
VPSHUFB tmp, x, tmp; \
|
||||
VPXOR tmp, y, y; \
|
||||
VPSLLD $2, y, tmp; \
|
||||
VPSRLD $30, y, y; \
|
||||
VPXOR tmp, y, y; \
|
||||
VBROADCASTI128 r24Mask<>(SB), tmp; \
|
||||
VPSHUFB tmp, x, tmp; \
|
||||
VPXOR y, x, x; \
|
||||
VPXOR x, tmp, x
|
||||
|
||||
#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
|
||||
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX2_SM4_TAO_L1(x, y, xw, yw, tmp); \
|
||||
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX2_SM4_TAO_L1(x, y, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK, tmp); \
|
||||
VPXOR x, t0, t0
|
||||
|
||||
#define AVX_SM4_SBOX(x, y, tmp) \
|
||||
VPAND X_NIBBLE_MASK, x, tmp; \
|
||||
VMOVDQU m1Low<>(SB), y; \
|
||||
VPSHUFB tmp, y, y; \
|
||||
VPSRLQ $4, x, x; \
|
||||
VPAND X_NIBBLE_MASK, x, x; \
|
||||
VMOVDQU m1High<>(SB), tmp; \
|
||||
VPSHUFB x, tmp, x; \
|
||||
VPXOR y, x, x; \
|
||||
VMOVDQU inverseShiftRows<>(SB), tmp; \
|
||||
VPSHUFB tmp, x, x; \
|
||||
VAESENCLAST X_NIBBLE_MASK, x, x; \
|
||||
VPANDN X_NIBBLE_MASK, x, tmp; \
|
||||
VMOVDQU m2Low<>(SB), y; \
|
||||
VPSHUFB tmp, y, y; \
|
||||
VPSRLQ $4, x, x; \
|
||||
VPAND X_NIBBLE_MASK, x, x; \
|
||||
VMOVDQU m2High<>(SB), tmp; \
|
||||
VPSHUFB x, tmp, x; \
|
||||
VPXOR y, x, x
|
||||
|
||||
#define AVX_SM4_TAO_L1(x, y, tmp) \
|
||||
AVX_SM4_SBOX(x, y, tmp); \
|
||||
VMOVDQU r08Mask<>(SB), tmp; \
|
||||
VPSHUFB tmp, x, y; \
|
||||
VPXOR x, y, y; \
|
||||
VMOVDQU r16Mask<>(SB), tmp; \
|
||||
VPSHUFB tmp, x, tmp; \
|
||||
VPXOR tmp, y, y; \
|
||||
VPSLLD $2, y, tmp; \
|
||||
VPSRLD $30, y, y; \
|
||||
VPXOR tmp, y, y; \
|
||||
VMOVDQU r24Mask<>(SB), tmp; \
|
||||
VPSHUFB tmp, x, tmp; \
|
||||
VPXOR y, x, x; \
|
||||
VPXOR x, tmp, x
|
||||
|
||||
#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
|
||||
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX_SM4_TAO_L1(x, y, tmp); \
|
||||
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
|
||||
VPXOR x, t0, t0
|
||||
|
||||
// func gcmSm4Init(productTable *[256]byte, rk []uint32)
|
||||
@ -1206,7 +958,7 @@ avx2GcmSm4Enc:
|
||||
VMOVDQU (4*32 + 2*32)(SP), DWB2
|
||||
VMOVDQU (4*32 + 3*32)(SP), DWB3
|
||||
|
||||
VBROADCASTI128 flipMask<>(SB), XDWTMP0
|
||||
VBROADCASTI128 flip_mask<>(SB), XDWTMP0
|
||||
// Apply Byte Flip Mask: LE -> BE
|
||||
VPSHUFB XDWTMP0, DWB0, DWB0
|
||||
VPSHUFB XDWTMP0, DWB1, DWB1
|
||||
@ -1216,7 +968,7 @@ avx2GcmSm4Enc:
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
|
||||
XORL BX, BX
|
||||
VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
|
||||
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
|
||||
|
||||
avx2GcmSm4Enc8Loop1:
|
||||
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
|
||||
@ -1289,7 +1041,7 @@ avx2GcmSm4EncOctetsLoop:
|
||||
VMOVDQU (4*32 + 2*32)(SP), DWB2
|
||||
VMOVDQU (4*32 + 3*32)(SP), DWB3
|
||||
|
||||
VBROADCASTI128 flipMask<>(SB), XDWTMP0
|
||||
VBROADCASTI128 flip_mask<>(SB), XDWTMP0
|
||||
// Apply Byte Flip Mask: LE -> BE
|
||||
VPSHUFB XDWTMP0, DWB0, DWB0
|
||||
VPSHUFB XDWTMP0, DWB1, DWB1
|
||||
@ -1311,7 +1063,7 @@ avx2GcmSm4EncOctetsLoop:
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
|
||||
XORL BX, BX
|
||||
VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
|
||||
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
|
||||
|
||||
avx2GcmSm4Enc8Loop2:
|
||||
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
|
||||
@ -1430,7 +1182,7 @@ avx2GcmSm4EncOctetsEnd:
|
||||
SUBQ $4, aluCTR
|
||||
|
||||
avx2GcmSm4EncNibbles:
|
||||
VMOVDQU flipMask<>(SB), B7
|
||||
VMOVDQU flip_mask<>(SB), B7
|
||||
CMPQ ptxLen, $64
|
||||
JBE avx2GcmSm4EncSingles
|
||||
SUBQ $64, ptxLen
|
||||
@ -1447,7 +1199,7 @@ avx2GcmSm4EncNibbles:
|
||||
|
||||
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
|
||||
XORL BX, BX
|
||||
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
|
||||
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
|
||||
|
||||
avx2GcmSm4Enc4Loop2:
|
||||
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
|
||||
@ -1509,7 +1261,7 @@ avx2GcmSm4EncSingles:
|
||||
|
||||
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
|
||||
XORL BX, BX
|
||||
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
|
||||
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
|
||||
|
||||
avx2GcmSm4Enc4Loop1:
|
||||
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
|
||||
@ -1937,7 +1689,7 @@ avx2GcmSm4DecOctetsLoop:
|
||||
VMOVDQU (2*32)(SP), DWB2
|
||||
VMOVDQU (3*32)(SP), DWB3
|
||||
|
||||
VBROADCASTI128 flipMask<>(SB), XDWTMP0
|
||||
VBROADCASTI128 flip_mask<>(SB), XDWTMP0
|
||||
// Apply Byte Flip Mask: LE -> BE
|
||||
VPSHUFB XDWTMP0, DWB0, DWB0
|
||||
VPSHUFB XDWTMP0, DWB1, DWB1
|
||||
@ -1962,7 +1714,7 @@ avx2GcmSm4DecOctetsLoop:
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
|
||||
XORL BX, BX
|
||||
VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
|
||||
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
|
||||
|
||||
avx2GcmSm4Dec8Loop2:
|
||||
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
|
||||
@ -2047,7 +1799,7 @@ avx2GcmSm4DecEndOctets:
|
||||
SUBQ $4, aluCTR
|
||||
|
||||
avx2GcmSm4DecNibbles:
|
||||
VMOVDQU flipMask<>(SB), B7 // DO NOT CHANGE B7
|
||||
VMOVDQU flip_mask<>(SB), B7 // DO NOT CHANGE B7
|
||||
CMPQ ptxLen, $64
|
||||
JBE avx2GcmSm4DecSingles
|
||||
SUBQ $64, ptxLen
|
||||
@ -2064,7 +1816,7 @@ avx2GcmSm4DecNibbles:
|
||||
|
||||
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
|
||||
XORL BX, BX
|
||||
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
|
||||
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
|
||||
|
||||
avx2GcmSm4Dec4Loop2:
|
||||
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
|
||||
@ -2130,7 +1882,7 @@ avx2GcmSm4DecSingles:
|
||||
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
|
||||
|
||||
XORL BX, BX
|
||||
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
|
||||
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
|
||||
|
||||
avx2GcmSm4Dec4Loop1:
|
||||
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
|
||||
|
sm4/gcm_arm64.s (108 lines changed)
@@ -3,47 +3,6 @@
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
//nibble mask
|
||||
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
// inverse shift rows
|
||||
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
|
||||
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
|
||||
GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
// Affine transform 1 (low and high hibbles)
|
||||
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
|
||||
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
|
||||
GLOBL m1_low<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
|
||||
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
|
||||
GLOBL m1_high<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
// Affine transform 2 (low and high hibbles)
|
||||
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
|
||||
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
|
||||
GLOBL m2_low<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
|
||||
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
|
||||
GLOBL m2_high<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
// left rotations of 32-bit words by 8-bit increments
|
||||
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
|
||||
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
|
||||
GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
#define B0 V0
|
||||
#define B1 V1
|
||||
#define B2 V2
|
||||
@ -150,42 +109,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
|
||||
#undef plen
|
||||
#undef dlen
|
||||
|
||||
#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
|
||||
VMOV t0.B16, K.B16 \
|
||||
VMOV t1.S[0], t0.S[1] \
|
||||
VMOV t2.S[0], t0.S[2] \
|
||||
VMOV t3.S[0], t0.S[3] \
|
||||
VMOV K.S[1], t1.S[0] \
|
||||
VMOV K.S[2], t2.S[0] \
|
||||
VMOV K.S[3], t3.S[0] \
|
||||
VMOV t1.D[1], K.D[1] \
|
||||
VMOV t2.S[1], t1.S[2] \
|
||||
VMOV t3.S[1], t1.S[3] \
|
||||
VMOV K.S[2], t2.S[1] \
|
||||
VMOV K.S[3], t3.S[1] \
|
||||
VMOV t2.S[3], K.S[3] \
|
||||
VMOV t3.S[2], t2.S[3] \
|
||||
VMOV K.S[3], t3.S[2]
|
||||
|
||||
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
|
||||
VMOV t0.B16, K.B16 \
|
||||
VMOV t3.S[0], t0.S[0] \
|
||||
VMOV t2.S[0], t0.S[1] \
|
||||
VMOV t1.S[0], t0.S[2] \
|
||||
VMOV K0.S[0], t0.S[3] \
|
||||
VMOV t3.S[1], t1.S[0] \
|
||||
VMOV t3.S[2], t2.S[0] \
|
||||
VMOV t3.S[3], t3.S[0] \
|
||||
VMOV t2.S[3], t3.S[1] \
|
||||
VMOV t1.S[3], t3.S[2] \
|
||||
VMOV K.S[3], t3.S[3] \
|
||||
VMOV K.S[2], t2.S[3] \
|
||||
VMOV K.S[1], t1.S[3] \
|
||||
VMOV t1.B16, K.B16 \
|
||||
VMOV t2.S[1], t1.S[1] \
|
||||
VMOV K.S[1], t1.S[2] \
|
||||
VMOV t2.S[2], t2.S[1] \
|
||||
VMOV K.S[2], t2.S[2]
|
||||
#include "aesni_arm64.h"
|
||||
|
||||
#define LOAD_SM4_AESNI_CONSTS() \
|
||||
LDP nibble_mask<>(SB), (R20, R21) \
|
||||
@ -216,36 +140,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
|
||||
VMOV R20, R24_MASK.D[0] \
|
||||
VMOV R21, R24_MASK.D[1]
|
||||
|
||||
#define SM4_SBOX(x, y, z) \
|
||||
; \
|
||||
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
|
||||
VTBL z.B16, [M1L.B16], y.B16; \
|
||||
VUSHR $4, x.D2, x.D2; \
|
||||
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
|
||||
VTBL z.B16, [M1H.B16], z.B16; \
|
||||
VEOR y.B16, z.B16, x.B16; \
|
||||
VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
|
||||
AESE ZERO.B16, x.B16; \
|
||||
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
|
||||
VTBL z.B16, [M2L.B16], y.B16; \
|
||||
VUSHR $4, x.D2, x.D2; \
|
||||
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
|
||||
VTBL z.B16, [M2H.B16], z.B16; \
|
||||
VEOR y.B16, z.B16, x.B16
|
||||
|
||||
#define SM4_TAO_L1(x, y, z) \
|
||||
SM4_SBOX(x, y, z); \
|
||||
VTBL R08_MASK.B16, [x.B16], y.B16; \
|
||||
VEOR y.B16, x.B16, y.B16; \
|
||||
VTBL R16_MASK.B16, [x.B16], z.B16; \
|
||||
VEOR z.B16, y.B16, y.B16; \
|
||||
VSHL $2, y.S4, z.S4; \
|
||||
VUSHR $30, y.S4, y.S4; \
|
||||
VORR y.B16, z.B16, y.B16; \
|
||||
VTBL R24_MASK.B16, [x.B16], z.B16; \
|
||||
VEOR z.B16, x.B16, x.B16; \
|
||||
VEOR y.B16, x.B16, x.B16
|
||||
|
||||
#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
|
||||
MOVW.P 4(RK), R19; \
|
||||
VMOV R19, x.S4; \
|
||||
|