sm4: reduce duplicated asm code

Sun Yimin 2022-07-21 13:41:56 +08:00 committed by GitHub
parent 572bf6574e
commit 8ddf1bc68f
6 changed files with 406 additions and 655 deletions

sm4/aesni_amd64.h (new file, +255)

@@ -0,0 +1,255 @@
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16
// shuffle byte and word order
DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
GLOBL bswap_mask<>(SB), RODATA, $16
//nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), RODATA, $16
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), RODATA, $16
// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), RODATA, $16
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), RODATA, $16
// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), RODATA, $16
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), RODATA, $16
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), RODATA, $16
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), RODATA, $16
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), RODATA, $16
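For reference (not part of the commit): these rotation masks let PSHUFB stand in for per-word rotates, since shuffling a register's bytes with r08_mask rotates every little-endian 32-bit lane left by 8 bits (r16/r24 likewise, by 16 and 24). A minimal Go sketch cross-checking that equivalence; the helper names are illustrative only.

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// r08 repeats the byte indices stored in r08_mask<> above.
var r08 = [16]byte{3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14}

// shuffleBytes mimics PSHUFB for in-range indices: dst[i] = src[mask[i]].
func shuffleBytes(src, mask [16]byte) (dst [16]byte) {
	for i, m := range mask {
		dst[i] = src[m&0x0f]
	}
	return
}

func main() {
	var block [16]byte
	for i := range block {
		block[i] = byte(i * 17) // arbitrary test pattern
	}
	shuffled := shuffleBytes(block, r08)
	for lane := 0; lane < 4; lane++ {
		w := binary.LittleEndian.Uint32(block[lane*4:])
		got := binary.LittleEndian.Uint32(shuffled[lane*4:])
		fmt.Println(got == bits.RotateLeft32(w, 8)) // prints true for all four lanes
	}
}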
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
// MOVOU r2, tmp1;
// PUNPCKLDQ r3, tmp1;
// PUNPCKHDQ r3, r2;
// MOVOU r0, r1;
// PUNPCKHQDQ tmp1, r1;
// PUNPCKLQDQ tmp1, r0;
// MOVOU tmp2, r3;
// PUNPCKHQDQ r2, r3;
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
PEXTRD $2, r0, r; \
PINSRD $0, r, tmp2; \
PEXTRD $2, r1, r; \
PINSRD $1, r, tmp2; \
; \
PEXTRD $3, r0, r; \
PINSRD $2, r, tmp2; \
PEXTRD $3, r1, r; \
PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
; \
PEXTRD $1, r0, r; \
PINSRD $2, r, r0; \
PEXTRD $0, r1, r; \
PINSRD $1, r, r0; \
PEXTRD $1, r1, r; \
PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
; \
PEXTRD $0, r2, r; \
PINSRD $0, r, tmp1; \
PEXTRD $0, r3, r; \
PINSRD $1, r, tmp1; \
PEXTRD $1, r2, r; \
PINSRD $2, r, tmp1; \
PEXTRD $1, r3, r; \
PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
; \
PEXTRD $2, r2, r; \
PINSRD $0, r, r2; \
PEXTRD $2, r3, r; \
PINSRD $1, r, r2; \
PEXTRD $3, r2, r; \
PINSRD $2, r, r2; \
PEXTRD $3, r3, r; \
PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
; \
MOVOU r0, r1; \
PEXTRQ $1, r1, r; \
PINSRQ $0, r, r1; \
PEXTRQ $1, tmp1, r; \
PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
; \
PEXTRQ $0, tmp1, r; \
PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
; \
MOVOU tmp2, r3; \
PEXTRQ $1, r3, r; \
PINSRQ $0, r, r3; \
PEXTRQ $1, r2, r; \
PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
; \
PEXTRQ $0, r2, r; \
PINSRQ $1, r, r2; \
PEXTRQ $0, tmp2, r; \
PINSRQ $0, r, r2
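SSE_TRANSPOSE_MATRIX transposes a 4x4 matrix of 32-bit words spread over four XMM registers, so that four loaded blocks become four registers of like-positioned words. A scalar Go sketch of the same reordering, for illustration only:

// transpose4x4 reorders sixteen 32-bit words the way the macro does:
// rows [w0..w3], [w4..w7], [w8..w11], [w12..w15] become
// [w0,w4,w8,w12], [w1,w5,w9,w13], [w2,w6,w10,w14], [w3,w7,w11,w15],
// matching the register comments above (which list the high lane first).
func transpose4x4(m *[4][4]uint32) {
	for i := 0; i < 4; i++ {
		for j := i + 1; j < 4; j++ {
			m[i][j], m[j][i] = m[j][i], m[i][j]
		}
	}
}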
#define SM4_SBOX(x, y, z) \
; \ //############################# inner affine ############################//
MOVOU x, z; \
PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
MOVOU m1_low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m1_high<>(SB), z; \
PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
; \ // inverse ShiftRows
PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
; \ //############################# outer affine ############################//
MOVOU x, z; \
PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
MOVOU m2_low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m2_high<>(SB), z; \
PSHUFB x, z; \
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \
PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
MOVOU x, z; \
PSHUFB r16_mask<>(SB), z; \
PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
MOVOU y, z; \
PSLLL $2, z; \
PSRLL $30, y; \
POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
MOVOU x, z; \
PSHUFB r24_mask<>(SB), z; \
PXOR y, x; \ //x = x xor y
PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
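SM4_TAO_L1 is the S-box followed by the SM4 linear transform L, assembled here from the r08/r16/r24 shuffles plus one 2-bit rotate. Per 32-bit word the linear part is L(b) = b ^ (b<<<2) ^ (b<<<10) ^ (b<<<18) ^ (b<<<24); a scalar Go sketch of that layer:

import "math/bits"

// l1 models the linear layer applied by SM4_TAO_L1 after the S-box:
// L(b) = b ^ (b<<<2) ^ (b<<<10) ^ (b<<<18) ^ (b<<<24).
func l1(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}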
#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
VPAND X_NIBBLE_MASK, x, tmp; \
VMOVDQU m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VMOVDQU inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VAESENCLAST X_NIBBLE_MASK, x, x; \
VPANDN X_NIBBLE_MASK, x, tmp; \
VMOVDQU m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
#define AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp) \
AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp); \
VMOVDQU r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VMOVDQU r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VMOVDQU r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w23, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
#define AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
VPAND yNibbleMask, x, tmp; \
VBROADCASTI128 m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VEXTRACTI128 $1, x, yw \
VAESENCLAST xNibbleMask, xw, xw; \
VAESENCLAST xNibbleMask, yw, yw; \
VINSERTI128 $1, yw, x, x; \
VPANDN yNibbleMask, x, tmp; \
VBROADCASTI128 m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
#define AVX2_SM4_TAO_L1(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp); \
VBROADCASTI128 r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VBROADCASTI128 r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x

sm4/aesni_arm64.h (new file, +111)

@@ -0,0 +1,111 @@
//nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16
// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), (NOPTR+RODATA), $16
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), (NOPTR+RODATA), $16
// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), (NOPTR+RODATA), $16
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), (NOPTR+RODATA), $16
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
VMOV t0.B16, K.B16 \
VMOV t1.S[0], t0.S[1] \
VMOV t2.S[0], t0.S[2] \
VMOV t3.S[0], t0.S[3] \
VMOV K.S[1], t1.S[0] \
VMOV K.S[2], t2.S[0] \
VMOV K.S[3], t3.S[0] \
VMOV t1.D[1], K.D[1] \
VMOV t2.S[1], t1.S[2] \
VMOV t3.S[1], t1.S[3] \
VMOV K.S[2], t2.S[1] \
VMOV K.S[3], t3.S[1] \
VMOV t2.S[3], K.S[3] \
VMOV t3.S[2], t2.S[3] \
VMOV K.S[3], t3.S[2]
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
VMOV t0.B16, K.B16 \
VMOV t3.S[0], t0.S[0] \
VMOV t2.S[0], t0.S[1] \
VMOV t1.S[0], t0.S[2] \
VMOV K0.S[0], t0.S[3] \
VMOV t3.S[1], t1.S[0] \
VMOV t3.S[2], t2.S[0] \
VMOV t3.S[3], t3.S[0] \
VMOV t2.S[3], t3.S[1] \
VMOV t1.S[3], t3.S[2] \
VMOV K.S[3], t3.S[3] \
VMOV K.S[2], t2.S[3] \
VMOV K.S[1], t1.S[3] \
VMOV t1.B16, K.B16 \
VMOV t2.S[1], t1.S[1] \
VMOV K.S[1], t1.S[2] \
VMOV t2.S[2], t2.S[1] \
VMOV K.S[2], t2.S[2]
#define SM4_SBOX(x, y, z) \
; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16; \
VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
AESE ZERO.B16, x.B16; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
VTBL R08_MASK.B16, [x.B16], y.B16; \
VEOR y.B16, x.B16, y.B16; \
VTBL R16_MASK.B16, [x.B16], z.B16; \
VEOR z.B16, y.B16, y.B16; \
VSHL $2, y.S4, z.S4; \
VUSHR $30, y.S4, y.S4; \
VORR y.B16, z.B16, y.B16; \
VTBL R24_MASK.B16, [x.B16], z.B16; \
VEOR z.B16, x.B16, x.B16; \
VEOR y.B16, x.B16, x.B16
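The VTBL-based S-box above relies on the usual nibble-split trick: because the two affine maps are GF(2)-linear, a byte-wise transform can be evaluated as a 16-entry lookup on the low nibble XORed with one on the high nibble (m1_low/m1_high before the AES core, m2_low/m2_high after). A scalar Go sketch of one such stage, with the tables taken as parameters:

// affineLookup models one VTBL/PSHUFB stage of SM4_SBOX: two 16-entry
// tables (built from the m1_*/m2_* constants above) cover all 256 byte
// values via lo[b&0x0f] ^ hi[b>>4].
func affineLookup(lo, hi *[16]byte, b byte) byte {
	return lo[b&0x0f] ^ hi[b>>4]
}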

(changed file)

@@ -14,108 +14,10 @@
#define XTMP6 X6
#define XTMP7 X7
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16
// shuffle byte and word order
DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
GLOBL bswap_mask<>(SB), RODATA, $16
//nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), RODATA, $16
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), RODATA, $16
// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), RODATA, $16
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), RODATA, $16
// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), RODATA, $16
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), RODATA, $16
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), RODATA, $16
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), RODATA, $16
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), RODATA, $16
#define SM4_SBOX(x, y) \
; \ //############################# inner affine ############################//
MOVOU x, XTMP6; \
PAND nibble_mask<>(SB), XTMP6; \ //y = _mm_and_si128(x, c0f);
MOVOU m1_low<>(SB), y; \
PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m1l, y);
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m1_high<>(SB), XTMP6; \
PSHUFB x, XTMP6; \ //x = _mm_shuffle_epi8(m1h, x);
MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m1h, x);
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
; \ // inverse ShiftRows
PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
; \ //############################# outer affine ############################//
MOVOU x, XTMP6; \
PANDN nibble_mask<>(SB), XTMP6; \ //XTMP6 = _mm_andnot_si128(x, c0f);
MOVOU m2_low<>(SB), y; \
PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m2l, XTMP6)
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m2_high<>(SB), XTMP6; \
PSHUFB x, XTMP6; \
MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x; \ //x = _mm_shuffle_epi8(m2h, x) ^ y;
#define SM4_TAO_L1(x, y) \
SM4_SBOX(x, y); \
; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \
PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
MOVOU x, XTMP6; \
PSHUFB r16_mask<>(SB), XTMP6; \
PXOR XTMP6, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
MOVOU y, XTMP6; \
PSLLL $2, XTMP6; \
PSRLL $30, y; \
POR XTMP6, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
MOVOU x, XTMP7; \
PSHUFB r24_mask<>(SB), XTMP7; \
PXOR y, x; \ //x = x xor y
PXOR XTMP7, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
#include "aesni_amd64.h"
#define SM4_TAO_L2(x, y) \
SM4_SBOX(x, y); \
SM4_SBOX(x, y, XTMP6); \
; \ //#################### 4 parallel L2 linear transforms ##################//
MOVOU x, y; \
MOVOU x, XTMP6; \
@@ -135,7 +37,7 @@ GLOBL fk_mask<>(SB), RODATA, $16
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y); \
SM4_TAO_L1(x, y, XTMP6); \
PXOR x, t0
#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \
@@ -143,7 +45,7 @@ GLOBL fk_mask<>(SB), RODATA, $16
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y); \
SM4_TAO_L1(x, y, XTMP6); \
PXOR x, t0
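Both round bodies shown here compute the SM4 round function: the working word t0 becomes t0 ^ T(t1 ^ t2 ^ t3 ^ rk), where T is the byte-wise S-box followed by the linear layer (SM4_TAO_L1). A scalar Go sketch, with T passed in as an assumed reference transform:

// sm4Round models SM4_ROUND / SM4_SINGLE_ROUND for one word slot.
// T is assumed to be a reference implementation of the composite
// transform (S-box on each byte, then the linear layer L).
func sm4Round(rk, t0, t1, t2, t3 uint32, T func(uint32) uint32) uint32 {
	return t0 ^ T(rk^t1^t2^t3)
}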
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
@@ -187,110 +89,20 @@ GLOBL fk_mask<>(SB), RODATA, $16
#define XWORD X8
#define YWORD X9
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w23, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
#define AVX2_SM4_SBOX(x, y) \
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK; \
VPAND NIBBLE_MASK, x, XDWTMP1; \
VBROADCASTI128 m1_low<>(SB), y; \
VPSHUFB XDWTMP1, y, y; \
VPSRLQ $4, x, x; \
VPAND NIBBLE_MASK, x, x; \
VBROADCASTI128 m1_high<>(SB), XDWTMP1; \
VPSHUFB x, XDWTMP1, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverse_shift_rows<>(SB), XDWTMP1;\
VPSHUFB XDWTMP1, x, x; \
VEXTRACTI128 $1, x, YWORD \
VAESENCLAST X_NIBBLE_MASK, XWORD, XWORD; \
VAESENCLAST X_NIBBLE_MASK, YWORD, YWORD; \
VINSERTI128 $1, YWORD, x, x; \
VPANDN NIBBLE_MASK, x, XDWTMP1; \
VBROADCASTI128 m2_low<>(SB), y; \
VPSHUFB XDWTMP1, y, y; \
VPSRLQ $4, x, x; \
VPAND NIBBLE_MASK, x, x; \
VBROADCASTI128 m2_high<>(SB), XDWTMP1; \
VPSHUFB x, XDWTMP1, x; \
VPXOR y, x, x
#define AVX2_SM4_TAO_L1(x, y) \
AVX2_SM4_SBOX(x, y); \
VBROADCASTI128 r08_mask<>(SB), XDWTMP0; \
VPSHUFB XDWTMP0, x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16_mask<>(SB), XDWTMP0; \
VPSHUFB XDWTMP0, x, XDWTMP0; \
VPXOR XDWTMP0, y, y; \
VPSLLD $2, y, XDWTMP1; \
VPSRLD $30, y, y; \
VPXOR XDWTMP1, y, y; \
VBROADCASTI128 r24_mask<>(SB), XDWTMP0; \
VPSHUFB XDWTMP0, x, XDWTMP0; \
VPXOR y, x, x; \
VPXOR x, XDWTMP0, x
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y); \
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK, XDWTMP0); \
VPXOR x, t0, t0
#define AVX_SM4_SBOX(x, y) \
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK; \
VPAND X_NIBBLE_MASK, x, XWTMP1; \
VMOVDQU m1_low<>(SB), y; \
VPSHUFB XWTMP1, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m1_high<>(SB), XWTMP1; \
VPSHUFB x, XWTMP1, x; \
VPXOR y, x, x; \
VMOVDQU inverse_shift_rows<>(SB), XWTMP1; \
VPSHUFB XWTMP1, x, x; \
VAESENCLAST X_NIBBLE_MASK, x, x; \
VPANDN X_NIBBLE_MASK, x, XWTMP1; \
VMOVDQU m2_low<>(SB), y; \
VPSHUFB XWTMP1, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m2_high<>(SB), XWTMP1; \
VPSHUFB x, XWTMP1, x; \
VPXOR y, x, x
#define AVX_SM4_TAO_L1(x, y) \
AVX_SM4_SBOX(x, y); \
VMOVDQU r08_mask<>(SB), XWTMP0; \
VPSHUFB XWTMP0, x, y; \
VPXOR x, y, y; \
VMOVDQU r16_mask<>(SB), XWTMP0; \
VPSHUFB XWTMP0, x, XWTMP0; \
VPXOR XWTMP0, y, y; \
VPSLLD $2, y, XWTMP1; \
VPSRLD $30, y, y; \
VPXOR XWTMP1, y, y; \
VMOVDQU r24_mask<>(SB), XWTMP0; \
VPSHUFB XWTMP0, x, XWTMP0; \
VPXOR y, x, x; \
VPXOR x, XWTMP0, x
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y); \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \
VPXOR x, t0, t0
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
@@ -408,6 +220,7 @@ done_sm4:
RET
avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
CMPQ DI, $64
JBE avx2_4blocks

(changed file)

@@ -23,84 +23,10 @@
#define XTMP6 V6
#define XTMP7 V7
//nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16
// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), (NOPTR+RODATA), $16
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), (NOPTR+RODATA), $16
// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), (NOPTR+RODATA), $16
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), (NOPTR+RODATA), $16
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
#define SM4_SBOX(x, y) \
; \ //############################# inner affine ############################//
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
VTBL XTMP7.B16, [M1L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
VTBL XTMP7.B16, [M1H.B16], XTMP7.B16; \
VEOR y.B16, XTMP7.B16, x.B16; \
VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
AESE ZERO.B16, x.B16; \
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
VTBL XTMP7.B16, [M2L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
VTBL XTMP7.B16, [M2H.B16], XTMP7.B16; \
VEOR y.B16, XTMP7.B16, x.B16
#define SM4_TAO_L1(x, y) \
SM4_SBOX(x, y); \
; \ //#################### 4 parallel L1 linear transforms ##################//
VTBL R08_MASK.B16, [x.B16], y.B16; \
VEOR y.B16, x.B16, y.B16; \
VTBL R16_MASK.B16, [x.B16], XTMP7.B16; \
VEOR XTMP7.B16, y.B16, y.B16; \
VSHL $2, y.S4, XTMP7.S4; \
VUSHR $30, y.S4, y.S4; \
VORR y.B16, XTMP7.B16, y.B16; \
VTBL R24_MASK.B16, [x.B16], XTMP7.B16; \
VEOR XTMP7.B16, x.B16, x.B16; \
VEOR y.B16, x.B16, x.B16
#include "aesni_arm64.h"
#define SM4_TAO_L2(x, y) \
SM4_SBOX(x, y); \
SM4_SBOX(x, y, XTMP6); \
; \ //#################### 4 parallel L2 linear transforms ##################//
VSHL $13, x.S4, XTMP6.S4; \
VUSHR $19, x.S4, y.S4; \
@@ -117,7 +43,7 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
VEOR t1.B16, x.B16, x.B16; \
VEOR t2.B16, x.B16, x.B16; \
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y); \
SM4_TAO_L1(x, y, XTMP6); \
VEOR x.B16, t0.B16, t0.B16
#define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \

(changed file)

@@ -41,44 +41,6 @@
#define NIBBLE_MASK Y11
#define X_NIBBLE_MASK X11
// shuffle byte order from LE to BE
DATA flipMask<>+0x00(SB)/8, $0x0405060700010203
DATA flipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
//nibble mask
DATA nibbleMask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibbleMask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
// inverse shift rows
DATA inverseShiftRows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverseShiftRows<>+0x08(SB)/8, $0x0306090C0F020508
// Affine transform 1 (low and high nibbles)
DATA m1Low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1Low<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1High<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1High<>+0x08(SB)/8, $0x68F05FC7049C33AB
// Affine transform 2 (low and high nibbles)
DATA m2Low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2Low<>+0x08(SB)/8, $0x0E019E916A65FAF5
DATA m2High<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2High<>+0x08(SB)/8, $0x2C88CC68E14501A5
// left rotations of 32-bit words by 8-bit increments
DATA r08Mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08Mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r16Mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16Mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA r24Mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24Mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
DATA fkMask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fkMask<>+0x08(SB)/8, $0xb27022dc677d9197
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
@@ -117,21 +79,12 @@ DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
GLOBL flipMask<>(SB), (NOPTR+RODATA), $16
GLOBL nibbleMask<>(SB), (NOPTR+RODATA), $16
GLOBL inverseShiftRows<>(SB), (NOPTR+RODATA), $16
GLOBL m1Low<>(SB), (NOPTR+RODATA), $16
GLOBL m1High<>(SB), (NOPTR+RODATA), $16
GLOBL m2Low<>(SB), (NOPTR+RODATA), $16
GLOBL m2High<>(SB), (NOPTR+RODATA), $16
GLOBL r08Mask<>(SB), (NOPTR+RODATA), $16
GLOBL r16Mask<>(SB), (NOPTR+RODATA), $16
GLOBL r24Mask<>(SB), (NOPTR+RODATA), $16
GLOBL fkMask<>(SB), (NOPTR+RODATA), $16
GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240
#include "aesni_amd64.h"
// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#define pTbl DI
@@ -202,51 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#undef plen
#undef dlen
#define SM4_SBOX(x, y, z) \
; \ //############################# inner affine ############################//
MOVOU x, z; \
PAND nibbleMask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
MOVOU m1Low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m1High<>(SB), z; \
PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
; \ // inverse ShiftRows
PSHUFB inverseShiftRows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST nibbleMask<>(SB), x; \ // AESNI instruction
; \ //############################# outer affine ############################//
MOVOU x, z; \
PANDN nibbleMask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
MOVOU m2Low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m2High<>(SB), z; \
PSHUFB x, z; \
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \
PSHUFB r08Mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
MOVOU x, z; \
PSHUFB r16Mask<>(SB), z; \
PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
MOVOU y, z; \
PSLLL $2, z; \
PSRLL $30, y; \
POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
MOVOU x, z; \
PSHUFB r24Mask<>(SB), z; \
PXOR y, x; \ //x = x xor y
PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PXOR t1, x; \
@@ -264,80 +172,11 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
SM4_TAO_L1(x, y, z); \
PXOR x, t0
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
// MOVOU r2, tmp1;
// PUNPCKLDQ r3, tmp1;
// PUNPCKHDQ r3, r2;
// MOVOU r0, r1;
// PUNPCKHQDQ tmp1, r1;
// PUNPCKLQDQ tmp1, r0;
// MOVOU tmp2, r3;
// PUNPCKHQDQ r2, r3;
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
PEXTRD $2, r0, r; \
PINSRD $0, r, tmp2; \
PEXTRD $2, r1, r; \
PINSRD $1, r, tmp2; \
; \
PEXTRD $3, r0, r; \
PINSRD $2, r, tmp2; \
PEXTRD $3, r1, r; \
PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
; \
PEXTRD $1, r0, r; \
PINSRD $2, r, r0; \
PEXTRD $0, r1, r; \
PINSRD $1, r, r0; \
PEXTRD $1, r1, r; \
PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
; \
PEXTRD $0, r2, r; \
PINSRD $0, r, tmp1; \
PEXTRD $0, r3, r; \
PINSRD $1, r, tmp1; \
PEXTRD $1, r2, r; \
PINSRD $2, r, tmp1; \
PEXTRD $1, r3, r; \
PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
; \
PEXTRD $2, r2, r; \
PINSRD $0, r, r2; \
PEXTRD $2, r3, r; \
PINSRD $1, r, r2; \
PEXTRD $3, r2, r; \
PINSRD $2, r, r2; \
PEXTRD $3, r3, r; \
PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
; \
MOVOU r0, r1; \
PEXTRQ $1, r1, r; \
PINSRQ $0, r, r1; \
PEXTRQ $1, tmp1, r; \
PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
; \
PEXTRQ $0, tmp1, r; \
PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
; \
MOVOU tmp2, r3; \
PEXTRQ $1, r3, r; \
PINSRQ $0, r, r3; \
PEXTRQ $1, r2, r; \
PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
; \
PEXTRQ $0, r2, r; \
PINSRQ $1, r, r2; \
PEXTRQ $0, tmp2, r; \
PINSRQ $0, r, r2
#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
PSHUFB flipMask<>(SB), t0; \
PSHUFB flipMask<>(SB), t1; \
PSHUFB flipMask<>(SB), t2; \
PSHUFB flipMask<>(SB), t3; \
PSHUFB flip_mask<>(SB), t0; \
PSHUFB flip_mask<>(SB), t1; \
PSHUFB flip_mask<>(SB), t2; \
PSHUFB flip_mask<>(SB), t3; \
SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \
XORL IND, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
@@ -385,107 +224,20 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
PSHUFB BSWAP, t1; \
PSHUFB BSWAP, t0
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w23, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
#define AVX2_SM4_SBOX(x, y, xw, yw, tmp) \
VPAND NIBBLE_MASK, x, tmp; \
VBROADCASTI128 m1Low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND NIBBLE_MASK, x, x; \
VBROADCASTI128 m1High<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverseShiftRows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VEXTRACTI128 $1, x, yw \
VAESENCLAST X_NIBBLE_MASK, xw, xw; \
VAESENCLAST X_NIBBLE_MASK, yw, yw; \
VINSERTI128 $1, yw, x, x; \
VPANDN NIBBLE_MASK, x, tmp; \
VBROADCASTI128 m2Low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND NIBBLE_MASK, x, x; \
VBROADCASTI128 m2High<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
#define AVX2_SM4_TAO_L1(x, y, xw, yw, tmp) \
AVX2_SM4_SBOX(x, y, xw, yw, tmp); \
VBROADCASTI128 r08Mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16Mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VBROADCASTI128 r24Mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x
#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, xw, yw, tmp); \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK, tmp); \
VPXOR x, t0, t0
#define AVX_SM4_SBOX(x, y, tmp) \
VPAND X_NIBBLE_MASK, x, tmp; \
VMOVDQU m1Low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m1High<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VMOVDQU inverseShiftRows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VAESENCLAST X_NIBBLE_MASK, x, x; \
VPANDN X_NIBBLE_MASK, x, tmp; \
VMOVDQU m2Low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m2High<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
#define AVX_SM4_TAO_L1(x, y, tmp) \
AVX_SM4_SBOX(x, y, tmp); \
VMOVDQU r08Mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VMOVDQU r16Mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VMOVDQU r24Mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x
#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, tmp); \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
VPXOR x, t0, t0
// func gcmSm4Init(productTable *[256]byte, rk []uint32)
@@ -1206,7 +958,7 @@ avx2GcmSm4Enc:
VMOVDQU (4*32 + 2*32)(SP), DWB2
VMOVDQU (4*32 + 3*32)(SP), DWB3
VBROADCASTI128 flipMask<>(SB), XDWTMP0
VBROADCASTI128 flip_mask<>(SB), XDWTMP0
// Apply Byte Flip Mask: LE -> BE
VPSHUFB XDWTMP0, DWB0, DWB0
VPSHUFB XDWTMP0, DWB1, DWB1
@@ -1216,7 +968,7 @@ avx2GcmSm4Enc:
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
XORL BX, BX
VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
avx2GcmSm4Enc8Loop1:
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
@@ -1289,7 +1041,7 @@ avx2GcmSm4EncOctetsLoop:
VMOVDQU (4*32 + 2*32)(SP), DWB2
VMOVDQU (4*32 + 3*32)(SP), DWB3
VBROADCASTI128 flipMask<>(SB), XDWTMP0
VBROADCASTI128 flip_mask<>(SB), XDWTMP0
// Apply Byte Flip Mask: LE -> BE
VPSHUFB XDWTMP0, DWB0, DWB0
VPSHUFB XDWTMP0, DWB1, DWB1
@@ -1311,7 +1063,7 @@ avx2GcmSm4EncOctetsLoop:
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
XORL BX, BX
VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
avx2GcmSm4Enc8Loop2:
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
@@ -1430,7 +1182,7 @@ avx2GcmSm4EncOctetsEnd:
SUBQ $4, aluCTR
avx2GcmSm4EncNibbles:
VMOVDQU flipMask<>(SB), B7
VMOVDQU flip_mask<>(SB), B7
CMPQ ptxLen, $64
JBE avx2GcmSm4EncSingles
SUBQ $64, ptxLen
@@ -1447,7 +1199,7 @@ avx2GcmSm4EncNibbles:
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Enc4Loop2:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
@@ -1509,7 +1261,7 @@ avx2GcmSm4EncSingles:
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Enc4Loop1:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
@@ -1937,7 +1689,7 @@ avx2GcmSm4DecOctetsLoop:
VMOVDQU (2*32)(SP), DWB2
VMOVDQU (3*32)(SP), DWB3
VBROADCASTI128 flipMask<>(SB), XDWTMP0
VBROADCASTI128 flip_mask<>(SB), XDWTMP0
// Apply Byte Flip Mask: LE -> BE
VPSHUFB XDWTMP0, DWB0, DWB0
VPSHUFB XDWTMP0, DWB1, DWB1
@@ -1962,7 +1714,7 @@ avx2GcmSm4DecOctetsLoop:
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
XORL BX, BX
VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
avx2GcmSm4Dec8Loop2:
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
@@ -2047,7 +1799,7 @@ avx2GcmSm4DecEndOctets:
SUBQ $4, aluCTR
avx2GcmSm4DecNibbles:
VMOVDQU flipMask<>(SB), B7 // DO NOT CHANGE B7
VMOVDQU flip_mask<>(SB), B7 // DO NOT CHANGE B7
CMPQ ptxLen, $64
JBE avx2GcmSm4DecSingles
SUBQ $64, ptxLen
@@ -2064,7 +1816,7 @@ avx2GcmSm4DecNibbles:
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Dec4Loop2:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
@@ -2130,7 +1882,7 @@ avx2GcmSm4DecSingles:
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Dec4Loop1:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)

(changed file)

@@ -3,47 +3,6 @@
#include "textflag.h"
//nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16
// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), (NOPTR+RODATA), $16
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), (NOPTR+RODATA), $16
// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), (NOPTR+RODATA), $16
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), (NOPTR+RODATA), $16
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16
#define B0 V0
#define B1 V1
#define B2 V2
@@ -150,42 +109,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#undef plen
#undef dlen
#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
VMOV t0.B16, K.B16 \
VMOV t1.S[0], t0.S[1] \
VMOV t2.S[0], t0.S[2] \
VMOV t3.S[0], t0.S[3] \
VMOV K.S[1], t1.S[0] \
VMOV K.S[2], t2.S[0] \
VMOV K.S[3], t3.S[0] \
VMOV t1.D[1], K.D[1] \
VMOV t2.S[1], t1.S[2] \
VMOV t3.S[1], t1.S[3] \
VMOV K.S[2], t2.S[1] \
VMOV K.S[3], t3.S[1] \
VMOV t2.S[3], K.S[3] \
VMOV t3.S[2], t2.S[3] \
VMOV K.S[3], t3.S[2]
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
VMOV t0.B16, K.B16 \
VMOV t3.S[0], t0.S[0] \
VMOV t2.S[0], t0.S[1] \
VMOV t1.S[0], t0.S[2] \
VMOV K0.S[0], t0.S[3] \
VMOV t3.S[1], t1.S[0] \
VMOV t3.S[2], t2.S[0] \
VMOV t3.S[3], t3.S[0] \
VMOV t2.S[3], t3.S[1] \
VMOV t1.S[3], t3.S[2] \
VMOV K.S[3], t3.S[3] \
VMOV K.S[2], t2.S[3] \
VMOV K.S[1], t1.S[3] \
VMOV t1.B16, K.B16 \
VMOV t2.S[1], t1.S[1] \
VMOV K.S[1], t1.S[2] \
VMOV t2.S[2], t2.S[1] \
VMOV K.S[2], t2.S[2]
#include "aesni_arm64.h"
#define LOAD_SM4_AESNI_CONSTS() \
LDP nibble_mask<>(SB), (R20, R21) \
@@ -216,36 +140,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
VMOV R20, R24_MASK.D[0] \
VMOV R21, R24_MASK.D[1]
#define SM4_SBOX(x, y, z) \
; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16; \
VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
AESE ZERO.B16, x.B16; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
VTBL R08_MASK.B16, [x.B16], y.B16; \
VEOR y.B16, x.B16, y.B16; \
VTBL R16_MASK.B16, [x.B16], z.B16; \
VEOR z.B16, y.B16, y.B16; \
VSHL $2, y.S4, z.S4; \
VUSHR $30, y.S4, y.S4; \
VORR y.B16, z.B16, y.B16; \
VTBL R24_MASK.B16, [x.B16], z.B16; \
VEOR z.B16, x.B16, x.B16; \
VEOR y.B16, x.B16, x.B16
#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
MOVW.P 4(RK), R19; \
VMOV R19, x.S4; \