From b0889c34328c28ad54d866638af2a3dd4a1f652f Mon Sep 17 00:00:00 2001
From: Emman
Date: Thu, 18 Mar 2021 13:01:24 +0800
Subject: [PATCH] MAGIC - use AES-NI

---
 sm4/asm_amd64.s       | 214 ++++++++++++++++++++++++++++++++++++++++++
 sm4/cipher.go         |   2 +-
 sm4/cipher_asm.go     |  65 +++++++++++++
 sm4/cipher_generic.go |  14 +++
 4 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 sm4/asm_amd64.s
 create mode 100644 sm4/cipher_asm.go
 create mode 100644 sm4/cipher_generic.go

diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s
new file mode 100644
index 0000000..7016d2e
--- /dev/null
+++ b/sm4/asm_amd64.s
@@ -0,0 +1,214 @@
#include "textflag.h"

#define x X0
#define y X1
#define t0 X2
#define t1 X3
#define t2 X4
#define t3 X5

#define XTMP6 X6
#define XTMP7 X7

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16

// nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), RODATA, $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), RODATA, $16

// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x9197E2E474720701
DATA m1_low<>+0x08(SB)/8, $0xC7C1B4B222245157
GLOBL m1_low<>(SB), RODATA, $16

DATA m1_high<>+0x00(SB)/8, $0xE240AB09EB49A200
DATA m1_high<>+0x08(SB)/8, $0xF052B91BF95BB012
GLOBL m1_high<>(SB), RODATA, $16

// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x5B67F2CEA19D0834
DATA m2_low<>+0x08(SB)/8, $0xEDD14478172BBE82
GLOBL m2_low<>(SB), RODATA, $16

DATA m2_high<>+0x00(SB)/8, $0xAE7201DD73AFDC00
DATA m2_high<>+0x08(SB)/8, $0x11CDBE62CC1063BF
GLOBL m2_high<>(SB), RODATA, $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16

DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), RODATA, $16

DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), RODATA, $16

// SM4_TAO_L1 computes the SM4 T transform (S-box, then L linear
// transform) on four 32-bit words at once; a pure-Go reference
// sketch follows the patch.
#define SM4_TAO_L1(x, y) \
	; \ //############################# inner affine ############################//
	MOVOU x, XTMP6; \
	PAND nibble_mask<>(SB), XTMP6; \ //XTMP6 = _mm_and_si128(x, c0f);
	MOVOU m1_low<>(SB), y; \
	PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m1l, XTMP6);
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m1_high<>(SB), XTMP6; \
	PSHUFB x, XTMP6; \ //XTMP6 = _mm_shuffle_epi8(m1h, x);
	MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m1h, x);
	PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
	; \ // inverse ShiftRows, cancelling the ShiftRows step inside AESENCLAST
	PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
	AESENCLAST nibble_mask<>(SB), x; \ // AES-NI: SubBytes, then XOR with nibble_mask as the round key
	; \ //############################# outer affine ############################//
	MOVOU x, XTMP6; \
	PANDN nibble_mask<>(SB), XTMP6; \ //XTMP6 = _mm_andnot_si128(x, c0f);
	MOVOU m2_low<>(SB), y; \
	PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m2l, XTMP6)
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m2_high<>(SB), XTMP6; \
	PSHUFB x, XTMP6; \
	MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m2h, x)
	PXOR y, x; \ //x = _mm_shuffle_epi8(m2h, x) ^ y;
	; \ //#################### 4 parallel L1 linear transforms ##################//
	MOVOU x, y; \
	PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
	PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
	MOVOU x, XTMP6; \
	PSHUFB r16_mask<>(SB), XTMP6; \
	PXOR XTMP6, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
	MOVOU y, XTMP6; \
	PSLLL $2, XTMP6; \
	PSRLL $30, y; \
	PXOR XTMP6, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30); i.e. rotl(y, 2)
	MOVOU x, XTMP7; \
	PSHUFB r24_mask<>(SB), XTMP7; \
	PXOR y, x; \ //x = x xor y
	PXOR XTMP7, x //x = x xor y xor _mm_shuffle_epi8(x, r24);

// func encryptBlocksAsm(xk *uint32, dst, src *byte)
// Encrypts four 16-byte blocks in parallel: src and dst are 64 bytes
// each, xk points at the 32 expanded round keys.
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+16(FP), DX

	// Gather word i of each of the four blocks into ti and
	// byte-swap to big-endian: lane b of ti = word i of block b.
	PINSRD $0, 0(DX), t0
	PINSRD $1, 16(DX), t0
	PINSRD $2, 32(DX), t0
	PINSRD $3, 48(DX), t0
	PSHUFB flip_mask<>(SB), t0

	PINSRD $0, 4(DX), t1
	PINSRD $1, 20(DX), t1
	PINSRD $2, 36(DX), t1
	PINSRD $3, 52(DX), t1
	PSHUFB flip_mask<>(SB), t1

	PINSRD $0, 8(DX), t2
	PINSRD $1, 24(DX), t2
	PINSRD $2, 40(DX), t2
	PINSRD $3, 56(DX), t2
	PSHUFB flip_mask<>(SB), t2

	PINSRD $0, 12(DX), t3
	PINSRD $1, 28(DX), t3
	PINSRD $2, 44(DX), t3
	PINSRD $3, 60(DX), t3
	PSHUFB flip_mask<>(SB), t3

	XORL CX, CX

// Four rounds per iteration; CX indexes the round keys and advances
// 16 bytes (four keys) per pass, 32 rounds in total.
loop:
	// broadcast the current round key into all four lanes of x
	PINSRD $0, 0(AX)(CX*1), x
	PINSRD $1, 0(AX)(CX*1), x
	PINSRD $2, 0(AX)(CX*1), x
	PINSRD $3, 0(AX)(CX*1), x
	PXOR t1, x
	PXOR t2, x
	PXOR t3, x

	SM4_TAO_L1(x, y)
	PXOR x, t0

	PINSRD $0, 4(AX)(CX*1), x
	PINSRD $1, 4(AX)(CX*1), x
	PINSRD $2, 4(AX)(CX*1), x
	PINSRD $3, 4(AX)(CX*1), x
	PXOR t0, x
	PXOR t2, x
	PXOR t3, x
	SM4_TAO_L1(x, y)
	PXOR x, t1

	PINSRD $0, 8(AX)(CX*1), x
	PINSRD $1, 8(AX)(CX*1), x
	PINSRD $2, 8(AX)(CX*1), x
	PINSRD $3, 8(AX)(CX*1), x
	PXOR t0, x
	PXOR t1, x
	PXOR t3, x
	SM4_TAO_L1(x, y)
	PXOR x, t2

	PINSRD $0, 12(AX)(CX*1), x
	PINSRD $1, 12(AX)(CX*1), x
	PINSRD $2, 12(AX)(CX*1), x
	PINSRD $3, 12(AX)(CX*1), x
	PXOR t0, x
	PXOR t1, x
	PXOR t2, x
	SM4_TAO_L1(x, y)
	PXOR x, t3

	ADDL $16, CX
	CMPL CX, $4*32
	JB loop

	// After 32 rounds t0..t3 hold X32..X35; SM4 outputs the reversed
	// order (X35, X34, X33, X32), so restore byte order and store
	// t3..t0 lane-major.
	PSHUFB flip_mask<>(SB), t3
	PSHUFB flip_mask<>(SB), t2
	PSHUFB flip_mask<>(SB), t1
	PSHUFB flip_mask<>(SB), t0
	MOVUPS t3, 0(BX)
	MOVUPS t2, 16(BX)
	MOVUPS t1, 32(BX)
	MOVUPS t0, 48(BX)

	// Transpose the 4x4 matrix of 32-bit words in place so that each
	// 16-byte output block is contiguous: swap the word pairs (1,4),
	// (2,8), (3,12), (6,9), (7,13) and (11,14).
	MOVL 4(BX), R8
	MOVL 8(BX), R9
	MOVL 12(BX), R10
	MOVL 16(BX), R11
	MOVL 32(BX), R12
	MOVL 48(BX), R13
	MOVL R11, 4(BX)
	MOVL R12, 8(BX)
	MOVL R13, 12(BX)
	MOVL R8, 16(BX)
	MOVL R9, 32(BX)
	MOVL R10, 48(BX)
	MOVL 24(BX), R8
	MOVL 28(BX), R9
	MOVL 36(BX), R10
	MOVL 52(BX), R11
	MOVL R10, 24(BX)
	MOVL R11, 28(BX)
	MOVL R8, 36(BX)
	MOVL R9, 52(BX)
	MOVL 44(BX), R8
	MOVL 56(BX), R9
	MOVL R9, 44(BX)
	MOVL R8, 56(BX)

done_sm4:
	VZEROUPPER
	RET
diff --git a/sm4/cipher.go b/sm4/cipher.go
index 85af4c9..9f62120
--- a/sm4/cipher.go
+++ b/sm4/cipher.go
@@ -32,7 +32,7 @@ func NewCipher(key []byte) (cipher.Block, error) {
 
 // newCipher creates and returns a new cipher.Block
 // implemented in pure Go.
-func newCipher(key []byte) (cipher.Block, error) {
+func newCipherGeneric(key []byte) (cipher.Block, error) {
 	c := sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}
 	expandKeyGo(key, c.enc, c.dec)
 	return &c, nil
diff --git a/sm4/cipher_asm.go b/sm4/cipher_asm.go
new file mode 100644
index 0000000..146c0d8
--- /dev/null
+++ b/sm4/cipher_asm.go
@@ -0,0 +1,65 @@
// +build amd64

package sm4

import (
	"crypto/cipher"

	"golang.org/x/sys/cpu"
)

//go:noescape
func encryptBlocksAsm(xk *uint32, dst, src *byte)

type sm4CipherAsm struct {
	sm4Cipher
}

var supportsAES = cpu.X86.HasAES

// newCipher returns the AES-NI-accelerated implementation when the
// CPU supports it and falls back to the generic one otherwise.
func newCipher(key []byte) (cipher.Block, error) {
	if !supportsAES {
		return newCipherGeneric(key)
	}
	c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}}
	expandKeyGo(key, c.enc, c.dec)
	return &c, nil
}

// FourBlocksSize is the number of bytes the assembly kernel processes
// per call: four 16-byte SM4 blocks in parallel.
const FourBlocksSize = 64

func (c *sm4CipherAsm) BlockSize() int { return BlockSize }

func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
	if len(src) < BlockSize {
		panic("sm4: input not full block")
	}
	if len(dst) < BlockSize {
		panic("sm4: output not full block")
	}
	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
		panic("sm4: invalid buffer overlap")
	}
	// The kernel always works on four blocks, so pad the single input
	// block into a zero-filled four-block buffer and keep only the
	// first output block.
	src64 := make([]byte, FourBlocksSize)
	dst64 := make([]byte, FourBlocksSize)
	copy(src64, src)
	encryptBlocksAsm(&c.enc[0], &dst64[0], &src64[0])
	copy(dst, dst64[:BlockSize])
}

// Decrypt mirrors Encrypt with the decryption key schedule.
func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
	if len(src) < BlockSize {
		panic("sm4: input not full block")
	}
	if len(dst) < BlockSize {
		panic("sm4: output not full block")
	}
	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
		panic("sm4: invalid buffer overlap")
	}
	src64 := make([]byte, FourBlocksSize)
	dst64 := make([]byte, FourBlocksSize)
	copy(src64, src)
	encryptBlocksAsm(&c.dec[0], &dst64[0], &src64[0])
	copy(dst, dst64[:BlockSize])
}
diff --git a/sm4/cipher_generic.go b/sm4/cipher_generic.go
new file mode 100644
index 0000000..f547cd1
--- /dev/null
+++ b/sm4/cipher_generic.go
@@ -0,0 +1,14 @@
// +build !amd64

package sm4

import "crypto/cipher"

// newCipher calls the newCipherGeneric function
// directly. Platforms with hardware-accelerated
// implementations of SM4 should implement their
// own version of newCipher (which may then call
// newCipherGeneric if needed).
func newCipher(key []byte) (cipher.Block, error) {
	return newCipherGeneric(key)
}
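
For reviewers: SM4_TAO_L1 computes the SM4 T transform, i.e. the S-box
(done here with AESENCLAST between two nibble-wise affine transforms,
m1_low/m1_high and m2_low/m2_high) followed by the L linear transform
built from the r08/r16/r24 byte-rotation masks and a 2-bit word
rotation. Below is a minimal pure-Go sketch of the same per-word
transform, useful for cross-checking the assembly against test
vectors. The names tauRef/lRef/roundRef are illustrative only, and
sbox is assumed to be the standard SM4 S-box table that the generic
code in this package already defines:

	package sm4

	import "math/bits"

	// tauRef applies the SM4 S-box to each byte of x.
	func tauRef(x uint32, sbox *[256]byte) uint32 {
		return uint32(sbox[x>>24])<<24 | uint32(sbox[(x>>16)&0xff])<<16 |
			uint32(sbox[(x>>8)&0xff])<<8 | uint32(sbox[x&0xff])
	}

	// lRef is the linear transform that the "4 parallel L1 linear
	// transforms" section of SM4_TAO_L1 computes with byte shuffles
	// (rotations by 8, 16, 24) and a PSLLL/PSRLL pair (rotation by 2).
	func lRef(b uint32) uint32 {
		return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
			bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
	}

	// roundRef mirrors one round of the assembly loop for a single
	// block: the new t0 is t0 ^ L(tau(rk ^ t1 ^ t2 ^ t3)).
	func roundRef(rk, t0, t1, t2, t3 uint32, sbox *[256]byte) uint32 {
		return t0 ^ lRef(tauRef(rk^t1^t2^t3, sbox))
	}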
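
Usage is unchanged for callers: NewCipher transparently selects the
AES-NI path on amd64 CPUs that report AES support and falls back to
the pure-Go implementation elsewhere. A minimal usage sketch, assuming
the gmsm module import path:

	package main

	import (
		"fmt"

		"github.com/emmansun/gmsm/sm4"
	)

	func main() {
		key := []byte("0123456789abcdef") // 16-byte SM4 key
		c, err := sm4.NewCipher(key)
		if err != nil {
			panic(err)
		}
		src := []byte("fedcba9876543210") // one 16-byte block
		dst := make([]byte, sm4.BlockSize)
		c.Encrypt(dst, src)
		fmt.Printf("%x\n", dst)
	}

Note that encryptBlocksAsm always processes FourBlocksSize (64) bytes,
so a single-block Encrypt or Decrypt pays for a four-block kernel plus
two buffer copies; callers that can batch four blocks at a time would
see the full speedup.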