mirror of https://github.com/emmansun/gmsm.git

MAGIC - use AES-NI

parent 9513b9c5d9
commit b0889c3432

sm4/asm_amd64.s (new file, 214 lines)
@@ -0,0 +1,214 @@
#include "textflag.h"

#define x X0
#define y X1
#define t0 X2
#define t1 X3
#define t2 X4
#define t3 X5

#define XTMP6 X6
#define XTMP7 X7

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16
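flip_mask is a PSHUFB control that reverses the bytes inside every 32-bit lane, so the little-endian dwords gathered with PINSRD become the big-endian words SM4 is defined over. A minimal pure-Go sketch of the same per-word byte swap (package and function names here are illustrative, not part of this commit):

package sm4ref

import "math/bits"

// flipWords mirrors PSHUFB with flip_mask: byte-swap each 32-bit word of a lane.
func flipWords(lane [4]uint32) [4]uint32 {
	for i, w := range lane {
		lane[i] = bits.ReverseBytes32(w)
	}
	return lane
}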

// nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), RODATA, $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), RODATA, $16

// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x9197E2E474720701
DATA m1_low<>+0x08(SB)/8, $0xC7C1B4B222245157
GLOBL m1_low<>(SB), RODATA, $16

DATA m1_high<>+0x00(SB)/8, $0xE240AB09EB49A200
DATA m1_high<>+0x08(SB)/8, $0xF052B91BF95BB012
GLOBL m1_high<>(SB), RODATA, $16

// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x5B67F2CEA19D0834
DATA m2_low<>+0x08(SB)/8, $0xEDD14478172BBE82
GLOBL m2_low<>(SB), RODATA, $16

DATA m2_high<>+0x00(SB)/8, $0xAE7201DD73AFDC00
DATA m2_high<>+0x08(SB)/8, $0x11CDBE62CC1063BF
GLOBL m2_high<>(SB), RODATA, $16
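m1 and m2 are each split into a low-nibble and a high-nibble table because PSHUFB can only index 16 entries; the SM4_TAO_L1 macro below extracts each nibble, looks it up, and XORs the two results. A hedged Go sketch of that per-byte lookup pattern (table contents omitted; names are illustrative):

package sm4ref

// affineLookup mimics the PAND/PSRLQ/PSHUFB pattern used with m1_low/m1_high
// (and m2_low/m2_high): split the byte into nibbles, look each one up in a
// 16-entry table, and XOR the two results.
func affineLookup(mLow, mHigh [16]byte, b byte) byte {
	return mLow[b&0x0f] ^ mHigh[b>>4]
}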

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16

DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), RODATA, $16

DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), RODATA, $16
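Each r*_mask is a byte shuffle that, applied with PSHUFB, rotates every 32-bit word of a register left by 8, 16 or 24 bits; they are used for the linear transform at the end of the macro. The equivalent scalar operation, as a sketch (illustrative names):

package sm4ref

import "math/bits"

// rotateWords mirrors PSHUFB with r08_mask/r16_mask/r24_mask: rotate every
// 32-bit word left by n bits, where n is a multiple of 8.
func rotateWords(lane [4]uint32, n int) [4]uint32 {
	for i, w := range lane {
		lane[i] = bits.RotateLeft32(w, n)
	}
	return lane
}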

#define SM4_TAO_L1(x, y) \
	; \ //############################# inner affine ############################//
	MOVOU x, XTMP6; \
	PAND nibble_mask<>(SB), XTMP6; \ //y = _mm_and_si128(x, c0f);
	MOVOU m1_low<>(SB), y; \
	PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m1l, y);
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m1_high<>(SB), XTMP6; \
	PSHUFB x, XTMP6; \ //x = _mm_shuffle_epi8(m1h, x);
	MOVOU XTMP6, x; \
	PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
	; \ // undo ShiftRows, then apply the AES S-box via AESENCLAST
	PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
	AESENCLAST nibble_mask<>(SB), x; \ // AES-NI: its ShiftRows is cancelled by the pre-shuffle above
	; \ //############################# outer affine ############################//
	MOVOU x, XTMP6; \
	PANDN nibble_mask<>(SB), XTMP6; \ //XTMP6 = _mm_andnot_si128(x, c0f);
	MOVOU m2_low<>(SB), y; \
	PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m2l, XTMP6)
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m2_high<>(SB), XTMP6; \
	PSHUFB x, XTMP6; \
	MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m2h, x)
	PXOR y, x; \ //x = _mm_shuffle_epi8(m2h, x) ^ y;
	; \ //#################### 4 parallel L1 linear transforms ##################//
	MOVOU x, y; \
	PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
	PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
	MOVOU x, XTMP6; \
	PSHUFB r16_mask<>(SB), XTMP6; \
	PXOR XTMP6, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
	MOVOU y, XTMP6; \
	PSLLL $2, XTMP6; \
	PSRLL $30, y; \
	PXOR XTMP6, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
	MOVOU x, XTMP7; \
	PSHUFB r24_mask<>(SB), XTMP7; \
	PXOR y, x; \ //x = x xor y
	PXOR XTMP7, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
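SM4_TAO_L1 evaluates SM4's composite round transform T(x) = L(tau(x)) on four 32-bit words at once: the two nibble-wise affine transforms sandwich AESENCLAST so that the AES S-box (with its ShiftRows cancelled by the pre-shuffle) stands in for the SM4 S-box, and the shuffle/shift sequence at the end is the linear transform L. A scalar reference sketch of what the macro computes, assuming the standard 256-byte SM4 S-box is supplied (package and names are illustrative):

package sm4ref

import "math/bits"

// sbox must hold the SM4 S-box from the standard; left empty here for brevity.
var sbox [256]byte

// tau applies the S-box to each byte of a word.
func tau(x uint32) uint32 {
	return uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
		uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
}

// l is the linear transform used in the round function:
// L(b) = b ^ (b<<<2) ^ (b<<<10) ^ (b<<<18) ^ (b<<<24).
func l(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// t is the transform SM4_TAO_L1 applies to each of the four lanes.
func t(x uint32) uint32 { return l(tau(x)) }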

// func encryptBlocksAsm(xk *uint32, dst, src *byte)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+16(FP), DX

	// Load the four input blocks in transposed form: t_i gathers state
	// word i of every block, then flip_mask byte-swaps each word to BE.
	PINSRD $0, 0(DX), t0
	PINSRD $1, 16(DX), t0
	PINSRD $2, 32(DX), t0
	PINSRD $3, 48(DX), t0
	PSHUFB flip_mask<>(SB), t0

	PINSRD $0, 4(DX), t1
	PINSRD $1, 20(DX), t1
	PINSRD $2, 36(DX), t1
	PINSRD $3, 52(DX), t1
	PSHUFB flip_mask<>(SB), t1

	PINSRD $0, 8(DX), t2
	PINSRD $1, 24(DX), t2
	PINSRD $2, 40(DX), t2
	PINSRD $3, 56(DX), t2
	PSHUFB flip_mask<>(SB), t2

	PINSRD $0, 12(DX), t3
	PINSRD $1, 28(DX), t3
	PINSRD $2, 44(DX), t3
	PINSRD $3, 60(DX), t3
	PSHUFB flip_mask<>(SB), t3
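What the PINSRD/PSHUFB groups above compute, expressed in Go: a 4x4 gather that leaves word i of blocks 0 to 3 in t_i, already byte-swapped to big-endian (a sketch with illustrative names):

package sm4ref

import "encoding/binary"

// gatherWords mirrors the PINSRD loads plus the flip_mask shuffle: t[i][j]
// receives state word i of block j, read as a big-endian uint32.
func gatherWords(src []byte) (t [4][4]uint32) {
	for i := 0; i < 4; i++ { // state word index
		for j := 0; j < 4; j++ { // block index
			off := j*16 + i*4
			t[i][j] = binary.BigEndian.Uint32(src[off : off+4])
		}
	}
	return
}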

	XORL CX, CX

loop:
	// Each pass consumes four round keys (16 bytes at AX+CX); every
	// PINSRD quadruplet broadcasts one round-key word to all four lanes.
	PINSRD $0, 0(AX)(CX*1), x
	PINSRD $1, 0(AX)(CX*1), x
	PINSRD $2, 0(AX)(CX*1), x
	PINSRD $3, 0(AX)(CX*1), x
	PXOR t1, x
	PXOR t2, x
	PXOR t3, x
	SM4_TAO_L1(x, y)
	PXOR x, t0

	PINSRD $0, 4(AX)(CX*1), x
	PINSRD $1, 4(AX)(CX*1), x
	PINSRD $2, 4(AX)(CX*1), x
	PINSRD $3, 4(AX)(CX*1), x
	PXOR t0, x
	PXOR t2, x
	PXOR t3, x
	SM4_TAO_L1(x, y)
	PXOR x, t1

	PINSRD $0, 8(AX)(CX*1), x
	PINSRD $1, 8(AX)(CX*1), x
	PINSRD $2, 8(AX)(CX*1), x
	PINSRD $3, 8(AX)(CX*1), x
	PXOR t0, x
	PXOR t1, x
	PXOR t3, x
	SM4_TAO_L1(x, y)
	PXOR x, t2

	PINSRD $0, 12(AX)(CX*1), x
	PINSRD $1, 12(AX)(CX*1), x
	PINSRD $2, 12(AX)(CX*1), x
	PINSRD $3, 12(AX)(CX*1), x
	PXOR t0, x
	PXOR t1, x
	PXOR t2, x
	SM4_TAO_L1(x, y)
	PXOR x, t3

	ADDL $16, CX
	CMPL CX, $4*32
	JB loop
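CX advances by 16 bytes per pass and the loop exits at 4*32 = 128, so the body runs eight times and consumes all 32 round keys, four per pass. For a single block the same schedule looks like this in Go (a sketch reusing the illustrative t helper from the SM4_TAO_L1 note above; the assembly does the identical thing on four blocks in parallel):

package sm4ref

// encryptBlockRef mirrors the round loop for one block: rk is the 32-word
// encryption key schedule, b the four big-endian state words.
func encryptBlockRef(rk [32]uint32, b [4]uint32) [4]uint32 {
	for i := 0; i < 32; i += 4 {
		b[0] ^= t(rk[i] ^ b[1] ^ b[2] ^ b[3])
		b[1] ^= t(rk[i+1] ^ b[0] ^ b[2] ^ b[3])
		b[2] ^= t(rk[i+2] ^ b[0] ^ b[1] ^ b[3])
		b[3] ^= t(rk[i+3] ^ b[0] ^ b[1] ^ b[2])
	}
	// SM4 reverses the word order on output; the assembly achieves this by
	// storing t3, t2, t1, t0 before the final transpose.
	return [4]uint32{b[3], b[2], b[1], b[0]}
}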

	PSHUFB flip_mask<>(SB), t3
	PSHUFB flip_mask<>(SB), t2
	PSHUFB flip_mask<>(SB), t1
	PSHUFB flip_mask<>(SB), t0

	// Store in reversed word order (t3, t2, t1, t0), per SM4's final reverse
	// transform, then transpose the 4x4 matrix of 32-bit words in place so
	// each 16-byte output block becomes contiguous.
	MOVUPS t3, 0(BX)
	MOVUPS t2, 16(BX)
	MOVUPS t1, 32(BX)
	MOVUPS t0, 48(BX)
	MOVL 4(BX), R8
	MOVL 8(BX), R9
	MOVL 12(BX), R10
	MOVL 16(BX), R11
	MOVL 32(BX), R12
	MOVL 48(BX), R13
	MOVL R11, 4(BX)
	MOVL R12, 8(BX)
	MOVL R13, 12(BX)
	MOVL R8, 16(BX)
	MOVL R9, 32(BX)
	MOVL R10, 48(BX)
	MOVL 24(BX), R8
	MOVL 28(BX), R9
	MOVL 36(BX), R10
	MOVL 52(BX), R11
	MOVL R10, 24(BX)
	MOVL R11, 28(BX)
	MOVL R8, 36(BX)
	MOVL R9, 52(BX)
	MOVL 44(BX), R8
	MOVL 56(BX), R9
	MOVL R9, 44(BX)
	MOVL R8, 56(BX)

done_sm4:
	VZEROUPPER
	RET

@@ -32,7 +32,7 @@ func NewCipher(key []byte) (cipher.Block, error) {
 
 // newCipher creates and returns a new cipher.Block
 // implemented in pure Go.
-func newCipher(key []byte) (cipher.Block, error) {
+func newCipherGeneric(key []byte) (cipher.Block, error) {
 	c := sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}
 	expandKeyGo(key, c.enc, c.dec)
 	return &c, nil

sm4/cipher_asm.go (new file, 65 lines)
@@ -0,0 +1,65 @@
// +build amd64

package sm4

import (
	"crypto/cipher"

	"golang.org/x/sys/cpu"
)

//go:noescape
func encryptBlocksAsm(xk *uint32, dst, src *byte)

type sm4CipherAsm struct {
	sm4Cipher
}

var supportsAES = cpu.X86.HasAES

func newCipher(key []byte) (cipher.Block, error) {
	if !supportsAES {
		return newCipherGeneric(key)
	}
	c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}}
	expandKeyGo(key, c.enc, c.dec)
	return &c, nil
}

const FourBlocksSize = 64

func (c *sm4CipherAsm) BlockSize() int { return BlockSize }

func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
	if len(src) < BlockSize {
		panic("sm4: input not full block")
	}
	if len(dst) < BlockSize {
		panic("sm4: output not full block")
	}
	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
		panic("sm4: invalid buffer overlap")
	}
	// The assembly routine always processes four blocks at a time, so stage
	// the single block through a 64-byte scratch buffer.
	src64 := make([]byte, FourBlocksSize)
	dst64 := make([]byte, FourBlocksSize)
	copy(src64, src)
	encryptBlocksAsm(&c.enc[0], &dst64[0], &src64[0])
	copy(dst, dst64[:BlockSize])
}

func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
	if len(src) < BlockSize {
		panic("sm4: input not full block")
	}
	if len(dst) < BlockSize {
		panic("sm4: output not full block")
	}
	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
		panic("sm4: invalid buffer overlap")
	}
	// Decryption reuses the same routine, just with the decryption key schedule.
	src64 := make([]byte, FourBlocksSize)
	dst64 := make([]byte, FourBlocksSize)
	copy(src64, src)
	encryptBlocksAsm(&c.dec[0], &dst64[0], &src64[0])
	copy(dst, dst64[:BlockSize])
}
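With both constructors in place, the AES-NI path is selected automatically behind the exported sm4.NewCipher shown in the earlier hunk, so callers only see the cipher.Block interface. A short usage sketch (key and plaintext bytes are illustrative; the import path follows the repository URL above):

package main

import (
	"encoding/hex"
	"fmt"

	"github.com/emmansun/gmsm/sm4"
)

func main() {
	key := []byte("0123456789abcdef") // 16-byte key, illustrative only
	src := []byte("fedcba9876543210") // one 16-byte block, illustrative only

	block, err := sm4.NewCipher(key) // uses the AES-NI path when cpu.X86.HasAES
	if err != nil {
		panic(err)
	}
	dst := make([]byte, block.BlockSize())
	block.Encrypt(dst, src)
	fmt.Println(hex.EncodeToString(dst))
}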

sm4/cipher_generic.go (new file, 14 lines)
@@ -0,0 +1,14 @@
// +build !amd64

package sm4

import "crypto/cipher"

// newCipher calls the newCipherGeneric function
// directly. Platforms with hardware accelerated
// implementations of SM4 should implement their
// own version of newCipher (which may then call
// newCipherGeneric if needed).
func newCipher(key []byte) (cipher.Block, error) {
	return newCipherGeneric(key)
}