sm4: improve throughput #146
This commit is contained in:
parent fe84641340
commit 4bc3c5d27b
@@ -25,6 +25,12 @@ func BenchmarkSM4EBCEncrypt1K(b *testing.B) {
	benchmarkEBCEncrypt1K(b, c)
}

func BenchmarkAES128EBCEncrypt1K(b *testing.B) {
	var key [16]byte
	c, _ := aes.NewCipher(key[:])
	benchmarkEBCEncrypt1K(b, c)
}

func benchmarkCBCEncrypt1K(b *testing.B, block cipher.Block) {
	buf := make([]byte, 1024)
	b.SetBytes(int64(len(buf)))
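The shared helper benchmarkEBCEncrypt1K is not part of this hunk. A minimal sketch of what such a 1 KiB ECB throughput benchmark could look like, assuming the file imports crypto/cipher as cipher and github.com/emmansun/gmsm/cipher as smcipher; the real helper may differ:

	// Sketch only: measures multi-block ECB throughput on a fixed 1 KiB buffer,
	// i.e. 64 blocks per CryptBlocks call.
	func benchmarkEBCEncrypt1K(b *testing.B, block cipher.Block) {
		buf := make([]byte, 1024)
		encrypter := smcipher.NewECBEncrypter(block)
		b.SetBytes(int64(len(buf)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			encrypter.CryptBlocks(buf, buf)
		}
	}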
@@ -2,6 +2,8 @@ package cipher_test

import (
	"bytes"
	"crypto/rand"
	"io"
	"testing"

	"github.com/emmansun/gmsm/cipher"
@@ -63,6 +65,11 @@ var ecbSM4Tests = []struct {
[]byte("0123456789ABCDEF"),
[]byte("exampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintext"),
},
{
"18 same blocks",
[]byte("0123456789ABCDEF"),
[]byte("exampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintext"),
},
}

func TestECBBasic(t *testing.T) {
@@ -80,11 +87,30 @@ func TestECBBasic(t *testing.T) {
		decrypter := cipher.NewECBDecrypter(c)
		decrypter.CryptBlocks(plaintext, ciphertext)
		if !bytes.Equal(test.in, plaintext) {
			t.Errorf("%s: ECB encrypt/decrypt failed", test.name)
			t.Errorf("%s: ECB encrypt/decrypt failed, %s", test.name, string(plaintext))
		}
	}
}

func TestECBRandom(t *testing.T) {
	key := []byte("0123456789ABCDEF")
	plaintext := make([]byte, 448)
	ciphertext := make([]byte, 448)
	io.ReadFull(rand.Reader, plaintext)
	c, err := sm4.NewCipher(key)
	if err != nil {
		t.Fatal(err)
	}
	encrypter := cipher.NewECBEncrypter(c)
	encrypter.CryptBlocks(ciphertext, plaintext)
	result := make([]byte, 448)
	decrypter := cipher.NewECBDecrypter(c)
	decrypter.CryptBlocks(result, ciphertext)
	if !bytes.Equal(result, plaintext) {
		t.Error("ECB encrypt/decrypt failed")
	}
}

func shouldPanic(t *testing.T, f func()) {
	t.Helper()
	defer func() { _ = recover() }()
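For reference, the ECB helpers exercised by these tests are used the same way outside the test suite; a minimal sketch, assuming the plaintext length is a multiple of the SM4 block size:

	key := []byte("0123456789ABCDEF")
	plaintext := bytes.Repeat([]byte("exampleplaintext"), 16) // 16 blocks, hits the multi-block path
	c, _ := sm4.NewCipher(key)
	ciphertext := make([]byte, len(plaintext))
	cipher.NewECBEncrypter(c).CryptBlocks(ciphertext, plaintext)
	recovered := make([]byte, len(ciphertext))
	cipher.NewECBDecrypter(c).CryptBlocks(recovered, ciphertext)
	// bytes.Equal(recovered, plaintext) == true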
@@ -15,7 +15,7 @@ GLOBL nibble_mask<>(SB), 8, $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), 8, $16

// Affine transform 1 (low and high nibbles)
@@ -24,7 +24,7 @@ DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), 8, $16

DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), 8, $16

// Affine transform 2 (low and high nibbles)
@@ -38,21 +38,46 @@ GLOBL m2_high<>(SB), 8, $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), 8, $16

DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), 8, $16

DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), 8, $16

DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), 8, $16

// inverse shift rows
DATA inverse_shift_rows256<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows256<>+0x08(SB)/8, $0x0306090C0F020508
DATA inverse_shift_rows256<>+0x10(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows256<>+0x18(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows256<>(SB), 8, $32

DATA r08_mask256<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask256<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r08_mask256<>+0x10(SB)/8, $0x0605040702010003
DATA r08_mask256<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask256<>(SB), 8, $32

DATA r16_mask256<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask256<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA r16_mask256<>+0x10(SB)/8, $0x0504070601000302
DATA r16_mask256<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask256<>(SB), 8, $32

DATA r24_mask256<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask256<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
DATA r24_mask256<>+0x10(SB)/8, $0x0407060500030201
DATA r24_mask256<>+0x18(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask256<>(SB), 8, $32
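The r08/r16/r24 masks above implement 32-bit left rotations as byte shuffles: within every 4-byte lane the mask lists little-endian source byte indices, so a PSHUFB/VPSHUFB with them rotates each word left by 8, 16 or 24 bits, and the *256 variants simply repeat the pattern in both 128-bit lanes. A plain-Go sketch of that equivalence, for illustration only:

	func rotl32(x uint32, k uint) uint32 { return x<<k | x>>(32-k) }

	// shuffleWord applies one 4-byte lane of such a mask to a little-endian word;
	// the index pattern is {3,0,1,2} for r08_mask, {2,3,0,1} for r16_mask and
	// {1,2,3,0} for r24_mask, i.e. exactly rotl32 by 8, 16 and 24.
	func shuffleWord(x uint32, idx [4]int) uint32 {
		var src [4]byte
		for i := 0; i < 4; i++ {
			src[i] = byte(x >> (8 * i))
		}
		var out uint32
		for i := 0; i < 4; i++ {
			out |= uint32(src[idx[i]]) << (8 * i)
		}
		return out
	}

	// shuffleWord(x, [4]int{3, 0, 1, 2}) == rotl32(x, 8) for every x.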
// Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance!
// input: from high to low
// r0 = [w3, w2, w1, w0]
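For context, the transpose macros referenced here treat four 128-bit registers as a 4x4 matrix of 32-bit words and transpose it, so that afterwards each register holds the same word position of four different blocks. A minimal Go sketch of the effect (the assembly achieves it with PUNPCK*/VPUNPCK* or, as the comment notes, slower shuffles):

	// transpose4x4 swaps rows and columns: afterwards m[i][j] holds the value
	// that was previously in m[j][i].
	func transpose4x4(m *[4][4]uint32) {
		for i := 0; i < 4; i++ {
			for j := i + 1; j < 4; j++ {
				m[i][j], m[j][i] = m[j][i], m[i][j]
			}
		}
	}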
@@ -189,7 +214,7 @@ GLOBL fk_mask<>(SB), 8, $16
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
	PINSRD $0, (index * 4)(RK)(IND*1), x; \
	MOVL (index * 4)(RK)(IND*1), x; \
	PSHUFD $0, x, x; \
	PXOR t1, x; \
	PXOR t2, x; \
@@ -197,29 +222,128 @@ GLOBL fk_mask<>(SB), 8, $16
	SM4_TAO_L1(x, y, z); \
	PXOR x, t0

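The change above swaps PINSRD for a plain MOVL load of the round-key word ahead of the existing PSHUFD broadcast, and the block macros that follow go further: one 128-bit MOVOU fetches four consecutive round keys, PSHUFD broadcasts each in turn, and the roles of t0..t3 rotate so no register moves are needed. A scalar sketch of that four-rounds-per-key-load schedule (here t holds the four state words of a single block; in the macros each tN is a vector carrying that word for 4 or 8 blocks, and taoL1 stands for the composed S-box plus linear transform):

	// fourRounds mirrors SM4_4BLOCKS_4ROUNDS: rk are four consecutive round keys
	// and the destination word rotates t0 -> t1 -> t2 -> t3 across the rounds.
	func fourRounds(t *[4]uint32, rk [4]uint32, taoL1 func(uint32) uint32) {
		for i := 0; i < 4; i++ {
			t[i] ^= taoL1(t[(i+1)%4] ^ t[(i+2)%4] ^ t[(i+3)%4] ^ rk[i])
		}
	}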
#define SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3) \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L1(x, y, z); \
	PXOR x, t0 \

#define SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3) \
	PSHUFD $0, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \
	PSHUFD $0x55, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \
	PSHUFD $0xAA, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \
	PSHUFD $0xFF, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \

#define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
	PSHUFB flip_mask<>(SB), t0; \
	PSHUFB flip_mask<>(SB), t1; \
	PSHUFB flip_mask<>(SB), t2; \
	PSHUFB flip_mask<>(SB), t3; \
	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
	MOVOU (0*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (1*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (2*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (3*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (4*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (5*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (6*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (7*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
	PSHUFB bswap_mask<>(SB), t3; \
	PSHUFB bswap_mask<>(SB), t2; \
	PSHUFB bswap_mask<>(SB), t1; \
	PSHUFB bswap_mask<>(SB), t0

#define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
	PSHUFD $0, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \
	PSHUFD $0, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t4, t5, t6, t7); \
	PSHUFD $0x55, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \
	PSHUFD $0x55, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t5, t6, t7, t4); \
	PSHUFD $0xAA, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \
	PSHUFD $0xAA, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t6, t7, t4, t5); \
	PSHUFD $0xFF, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \
	PSHUFD $0xFF, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \

#define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
	PSHUFB flip_mask<>(SB), t0; \
	PSHUFB flip_mask<>(SB), t1; \
	PSHUFB flip_mask<>(SB), t2; \
	PSHUFB flip_mask<>(SB), t3; \
	PSHUFB flip_mask<>(SB), t4; \
	PSHUFB flip_mask<>(SB), t5; \
	PSHUFB flip_mask<>(SB), t6; \
	PSHUFB flip_mask<>(SB), t7; \
	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
	SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
	MOVOU (0*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (1*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (2*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (3*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (4*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (5*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (6*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (7*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
	SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
	PSHUFB bswap_mask<>(SB), t3; \
	PSHUFB bswap_mask<>(SB), t2; \
	PSHUFB bswap_mask<>(SB), t1; \
	PSHUFB bswap_mask<>(SB), t0; \
	PSHUFB bswap_mask<>(SB), t7; \
	PSHUFB bswap_mask<>(SB), t6; \
	PSHUFB bswap_mask<>(SB), t5; \
	PSHUFB bswap_mask<>(SB), t4

// SM4 sbox function, AVX version
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - X_NIBBLE_MASK: 128 bits register stored nibble mask, should be loaded earlier.
// - tmp: 128 bits temp register
#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
	VPAND X_NIBBLE_MASK, x, tmp; \
#define AVX_SM4_SBOX(x, y, tmp) \
	VPAND nibble_mask<>(SB), x, tmp; \
	VMOVDQU m1_low<>(SB), y; \
	VPSHUFB tmp, y, y; \
	VPSRLQ $4, x, x; \
	VPAND X_NIBBLE_MASK, x, x; \
	VPAND nibble_mask<>(SB), x, x; \
	VMOVDQU m1_high<>(SB), tmp; \
	VPSHUFB x, tmp, x; \
	VPXOR y, x, x; \
	VMOVDQU inverse_shift_rows<>(SB), tmp; \
	VPSHUFB tmp, x, x; \
	VAESENCLAST X_NIBBLE_MASK, x, x; \
	VPANDN X_NIBBLE_MASK, x, tmp; \
	VPSHUFB inverse_shift_rows<>(SB), x, x; \
	VAESENCLAST nibble_mask<>(SB), x, x; \
	VPANDN nibble_mask<>(SB), x, tmp; \
	VMOVDQU m2_low<>(SB), y; \
	VPSHUFB tmp, y, y; \
	VPSRLQ $4, x, x; \
	VPAND X_NIBBLE_MASK, x, x; \
	VPAND nibble_mask<>(SB), x, x; \
	VMOVDQU m2_high<>(SB), tmp; \
	VPSHUFB x, tmp, x; \
	VPXOR y, x, x
@@ -228,21 +352,17 @@ GLOBL fk_mask<>(SB), 8, $16
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier.
// - tmp: 128 bits temp register
#define AVX_SM4_TAO_L1(x, y, xNibbleMask, tmp) \
	AVX_SM4_SBOX(x, y, xNibbleMask, tmp); \
	VMOVDQU r08_mask<>(SB), tmp; \
	VPSHUFB tmp, x, y; \
#define AVX_SM4_TAO_L1(x, y, tmp) \
	AVX_SM4_SBOX(x, y, tmp); \
	VPSHUFB r08_mask<>(SB), x, y; \
	VPXOR x, y, y; \
	VMOVDQU r16_mask<>(SB), tmp; \
	VPSHUFB tmp, x, tmp; \
	VPSHUFB r16_mask<>(SB), x, tmp; \
	VPXOR tmp, y, y; \
	VPSLLD $2, y, tmp; \
	VPSRLD $30, y, y; \
	VPXOR tmp, y, y; \
	VMOVDQU r24_mask<>(SB), tmp; \
	VPSHUFB tmp, x, tmp; \
	VPSHUFB r24_mask<>(SB), x, tmp; \
	VPXOR y, x, x; \
	VPXOR x, tmp, x

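The shuffle/shift sequence above is SM4's linear transform L(B) = B xor (B <<< 2) xor (B <<< 10) xor (B <<< 18) xor (B <<< 24): r08_mask and r16_mask supply the rotations by 8 and 16, the VPSLLD/VPSRLD pair rotates their XOR left by 2 (yielding the <<<2, <<<10 and <<<18 terms), and r24_mask supplies the <<<24 term. A scalar sketch of that decomposition:

	func rotl(x uint32, k uint) uint32 { return x<<k | x>>(32-k) }

	// l1 follows the macro: y = x ^ rol8(x) ^ rol16(x); y = rol2(y);
	// result = x ^ y ^ rol24(x), which equals the textbook L(B).
	func l1(x uint32) uint32 {
		y := x ^ rotl(x, 8) ^ rotl(x, 16)
		y = rotl(y, 2)
		return x ^ y ^ rotl(x, 24)
	}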
@ -280,9 +400,115 @@ GLOBL fk_mask<>(SB), 8, $16
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
|
||||
AVX_SM4_TAO_L1(x, y, tmp); \
|
||||
VPXOR x, t0, t0
|
||||
|
||||
|
||||
#define SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3) \
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX_SM4_TAO_L1(x, y, z); \
|
||||
VPXOR x, t0, t0 \
|
||||
|
||||
#define SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3) \
|
||||
VPSHUFD $0, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \
|
||||
VPSHUFD $0x55, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \
|
||||
VPSHUFD $0xAA, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \
|
||||
VPSHUFD $0xFF, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
|
||||
|
||||
#define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
|
||||
VPSHUFB flip_mask<>(SB), t0, t0 \
|
||||
VPSHUFB flip_mask<>(SB), t1, t1 \
|
||||
VPSHUFB flip_mask<>(SB), t2, t2 \
|
||||
VPSHUFB flip_mask<>(SB), t3, t3 \
|
||||
; \
|
||||
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
|
||||
VMOVDQU (0*16)(RK), rk128; \
|
||||
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
|
||||
VMOVDQU (1*16)(RK), rk128; \
|
||||
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
|
||||
VMOVDQU (2*16)(RK), rk128; \
|
||||
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
|
||||
VMOVDQU (3*16)(RK), rk128; \
|
||||
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
|
||||
VMOVDQU (4*16)(RK), rk128; \
|
||||
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
|
||||
VMOVDQU (5*16)(RK), rk128; \
|
||||
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
|
||||
VMOVDQU (6*16)(RK), rk128; \
|
||||
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
|
||||
VMOVDQU (7*16)(RK), rk128; \
|
||||
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
|
||||
; \ // Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
|
||||
VPSHUFB bswap_mask<>(SB), t0, t0 \
|
||||
VPSHUFB bswap_mask<>(SB), t1, t1 \
|
||||
VPSHUFB bswap_mask<>(SB), t2, t2 \
|
||||
VPSHUFB bswap_mask<>(SB), t3, t3 \
|
||||
|
||||
#define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
|
||||
VPSHUFD $0, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \
|
||||
VPSHUFD $0, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t4, t5, t6, t7); \
|
||||
VPSHUFD $0x55, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \
|
||||
VPSHUFD $0x55, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t5, t6, t7, t4); \
|
||||
VPSHUFD $0xAA, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \
|
||||
VPSHUFD $0xAA, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t6, t7, t4, t5); \
|
||||
VPSHUFD $0xFF, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
|
||||
VPSHUFD $0xFF, rk128, x; \
|
||||
SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \
|
||||
|
||||
#define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
|
||||
VPSHUFB flip_mask<>(SB), t0, t0 \
|
||||
VPSHUFB flip_mask<>(SB), t1, t1 \
|
||||
VPSHUFB flip_mask<>(SB), t2, t2 \
|
||||
VPSHUFB flip_mask<>(SB), t3, t3 \
|
||||
VPSHUFB flip_mask<>(SB), t4, t4 \
|
||||
VPSHUFB flip_mask<>(SB), t5, t5 \
|
||||
VPSHUFB flip_mask<>(SB), t6, t6 \
|
||||
VPSHUFB flip_mask<>(SB), t7, t7 \
|
||||
; \
|
||||
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
|
||||
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
|
||||
VMOVDQU (0*16)(RK), rk128; \
|
||||
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
VMOVDQU (1*16)(RK), rk128; \
|
||||
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
VMOVDQU (2*16)(RK), rk128; \
|
||||
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
VMOVDQU (3*16)(RK), rk128; \
|
||||
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
VMOVDQU (4*16)(RK), rk128; \
|
||||
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
VMOVDQU (5*16)(RK), rk128; \
|
||||
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
VMOVDQU (6*16)(RK), rk128; \
|
||||
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
VMOVDQU (7*16)(RK), rk128; \
|
||||
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
; \ // Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
|
||||
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
|
||||
VPSHUFB bswap_mask<>(SB), t0, t0 \
|
||||
VPSHUFB bswap_mask<>(SB), t1, t1 \
|
||||
VPSHUFB bswap_mask<>(SB), t2, t2 \
|
||||
VPSHUFB bswap_mask<>(SB), t3, t3 \
|
||||
VPSHUFB bswap_mask<>(SB), t4, t4 \
|
||||
VPSHUFB bswap_mask<>(SB), t5, t5 \
|
||||
VPSHUFB bswap_mask<>(SB), t6, t6 \
|
||||
VPSHUFB bswap_mask<>(SB), t7, t7 \
|
||||
|
||||
// SM4 sbox function, AVX2 version
|
||||
// parameters:
|
||||
// - x: 256 bits register as sbox input/output data
|
||||
@ -301,8 +527,7 @@ GLOBL fk_mask<>(SB), 8, $16
|
||||
VBROADCASTI128 m1_high<>(SB), z; \
|
||||
VPSHUFB x, z, x; \
|
||||
VPXOR y, x, x; \
|
||||
VBROADCASTI128 inverse_shift_rows<>(SB), z; \
|
||||
VPSHUFB z, x, x; \
|
||||
VPSHUFB inverse_shift_rows256<>(SB), x, x; \
|
||||
VEXTRACTI128 $1, x, yw \
|
||||
VAESENCLAST xNibbleMask, xw, xw; \
|
||||
VAESENCLAST xNibbleMask, yw, yw; \
|
||||
@ -327,17 +552,14 @@ GLOBL fk_mask<>(SB), 8, $16
|
||||
// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
|
||||
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
|
||||
AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
|
||||
VBROADCASTI128 r08_mask<>(SB), z; \
|
||||
VPSHUFB z, x, y; \
|
||||
VPSHUFB r08_mask256<>(SB), x, y; \
|
||||
VPXOR x, y, y; \
|
||||
VBROADCASTI128 r16_mask<>(SB), z; \
|
||||
VPSHUFB z, x, z; \
|
||||
VPSHUFB r16_mask256<>(SB), x, z; \
|
||||
VPXOR z, y, y; \
|
||||
VPSLLD $2, y, z; \
|
||||
VPSRLD $30, y, y; \
|
||||
VPXOR z, y, y; \
|
||||
VBROADCASTI128 r24_mask<>(SB), z; \
|
||||
VPSHUFB z, x, z; \
|
||||
VPSHUFB r24_mask256<>(SB), x, z; \
|
||||
VPXOR y, x, x; \
|
||||
VPXOR x, z, x
|
||||
|
||||
@ -359,6 +581,24 @@ GLOBL fk_mask<>(SB), 8, $16
|
||||
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
|
||||
VPXOR x, t0, t0
|
||||
|
||||
// SM4 round function, AVX2 version, handle 256 bits
|
||||
// t0 ^= tao_l1(t1^t2^t3^xk)
|
||||
// parameters:
|
||||
// - index: round key index immediate number
|
||||
// - x: 256 bits temp register, MUST use XDWORD!
|
||||
// - y: 256 bits temp register, MUST use YDWORD!
|
||||
// - t0: 256 bits register for data as result
|
||||
// - t1: 256 bits register for data
|
||||
// - t2: 256 bits register for data
|
||||
// - t3: 256 bits register for data
|
||||
#define AVX2_SM4_ROUND2(index, RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
|
||||
VPBROADCASTD (index * 4)(RK), x; \
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
|
||||
VPXOR x, t0, t0
|
||||
|
||||
// SM4 round function, AVX version, handle 128 bits
|
||||
// t0 ^= tao_l1(t1^t2^t3^xk)
|
||||
// parameters:
|
||||
@ -371,9 +611,100 @@ GLOBL fk_mask<>(SB), 8, $16
|
||||
// - t3: 128 bits register for data
|
||||
#define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
|
||||
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
|
||||
VPSHUFD $0, x, x; \
|
||||
VPXOR t1, x, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
|
||||
AVX_SM4_TAO_L1(x, y, tmp); \
|
||||
VPXOR x, t0, t0
|
||||
|
||||
#define AVX2_SM4_8BLOCKS(RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
|
||||
AVX2_SM4_ROUND2(0, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
|
||||
AVX2_SM4_ROUND2(1, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
|
||||
AVX2_SM4_ROUND2(2, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
|
||||
AVX2_SM4_ROUND2(3, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
|
||||
AVX2_SM4_ROUND2(4, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
|
||||
AVX2_SM4_ROUND2(5, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
|
||||
AVX2_SM4_ROUND2(6, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
|
||||
AVX2_SM4_ROUND2(7, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
|
||||
AVX2_SM4_ROUND2(8, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
|
||||
AVX2_SM4_ROUND2(9, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
|
||||
AVX2_SM4_ROUND2(10, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
|
||||
AVX2_SM4_ROUND2(11, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
|
||||
AVX2_SM4_ROUND2(12, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
|
||||
AVX2_SM4_ROUND2(13, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
|
||||
AVX2_SM4_ROUND2(14, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
|
||||
AVX2_SM4_ROUND2(15, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
|
||||
AVX2_SM4_ROUND2(16, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
|
||||
AVX2_SM4_ROUND2(17, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
|
||||
AVX2_SM4_ROUND2(18, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
|
||||
AVX2_SM4_ROUND2(19, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
|
||||
AVX2_SM4_ROUND2(20, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
|
||||
AVX2_SM4_ROUND2(21, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
|
||||
AVX2_SM4_ROUND2(22, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
|
||||
AVX2_SM4_ROUND2(23, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
|
||||
AVX2_SM4_ROUND2(24, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
|
||||
AVX2_SM4_ROUND2(25, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
|
||||
AVX2_SM4_ROUND2(26, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
|
||||
AVX2_SM4_ROUND2(27, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
|
||||
AVX2_SM4_ROUND2(28, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
|
||||
AVX2_SM4_ROUND2(29, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
|
||||
AVX2_SM4_ROUND2(30, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
|
||||
AVX2_SM4_ROUND2(31, RK, x, y, xw, yw, tmp, t3, t0, t1, t2)
|
||||
|
||||
// SM4 round function, AVX2 version, handle 256 bits
|
||||
// t0 ^= tao_l1(t1^t2^t3^xk)
|
||||
// parameters:
|
||||
// - index: round key index immediate number
|
||||
// - x: 256 bits temp register, MUST use XDWORD!
|
||||
// - y: 256 bits temp register, MUST use YDWORD!
|
||||
// - t0: 256 bits register for data as result
|
||||
// - t1: 256 bits register for data
|
||||
// - t2: 256 bits register for data
|
||||
// - t3: 256 bits register for data
|
||||
#define AVX2_SM4_16BLOCKS_ROUND(index, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
|
||||
VPBROADCASTD (index * 4)(RK), tmp1; \
|
||||
VPXOR t1, tmp1, x; \
|
||||
VPXOR t2, x, x; \
|
||||
VPXOR t3, x, x; \
|
||||
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
|
||||
VPXOR x, t0, t0; \
|
||||
;\
|
||||
VPXOR t5, tmp1, x; \
|
||||
VPXOR t6, x, x; \
|
||||
VPXOR t7, x, x; \
|
||||
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
|
||||
VPXOR x, t4, t4; \
|
||||
|
||||
#define AVX2_SM4_16BLOCKS(RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
|
||||
AVX2_SM4_16BLOCKS_ROUND(0, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(1, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(2, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(3, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(4, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(5, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(6, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(7, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(8, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(9, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(10, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(11, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(12, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(13, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(14, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(15, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(16, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(17, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(18, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(19, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(20, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(21, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(22, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(23, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(24, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(25, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(26, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(27, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(28, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(29, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(30, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
|
||||
AVX2_SM4_16BLOCKS_ROUND(31, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6)
|
||||
|
@@ -171,3 +171,30 @@ GLOBL fk_mask<>(SB), (16+8), $16
	VEOR t3.B16, x.B16, x.B16; \
	SM4_TAO_L1(x, y, z); \
	VEOR x.B16, t0.B16, t0.B16

// SM4 round function
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - RK: round key register
// - tmp32: temp 32/64 bits register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - z: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_8BLOCKS_ROUND(RK, tmp32, x, y, z, tmp, t0, t1, t2, t3, t4, t5, t6, t7) \
	MOVW.P 4(RK), tmp32; \
	VMOV tmp32, tmp.S4; \
	VEOR t1.B16, tmp.B16, x.B16; \
	VEOR t2.B16, x.B16, x.B16; \
	VEOR t3.B16, x.B16, x.B16; \
	SM4_TAO_L1(x, y, z); \
	VEOR x.B16, t0.B16, t0.B16; \
	; \
	VEOR t5.B16, tmp.B16, x.B16; \
	VEOR t6.B16, x.B16, x.B16; \
	VEOR t7.B16, x.B16, x.B16; \
	SM4_TAO_L1(x, y, z); \
	VEOR x.B16, t4.B16, t4.B16

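On arm64 the new macro loads a single round-key word with MOVW.P, broadcasts it, and applies the same round to two independent groups of four state registers (t0..t3 and t4..t7), so eight blocks advance per key load. A scalar sketch of one such double round (taoL1 again stands for the composed non-linear and linear transform; in the macro each register value is really a vector of block words):

	// doubleRound mirrors SM4_8BLOCKS_ROUND for one round key rk: the same key
	// drives both four-word groups, halving the number of key loads.
	func doubleRound(a, b *[4]uint32, rk uint32, taoL1 func(uint32) uint32) {
		a[0] ^= taoL1(a[1] ^ a[2] ^ a[3] ^ rk)
		b[0] ^= taoL1(b[1] ^ b[2] ^ b[3] ^ rk)
	}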
245 sm4/asm_amd64.s
@@ -4,15 +4,15 @@

#include "textflag.h"

#define x X0
#define y X1
#define t0 X2
#define t1 X3
#define t2 X4
#define t3 X5
#define t0 X0
#define t1 X1
#define t2 X2
#define t3 X3

#define XTMP6 X6
#define XTMP7 X7
#define x X8
#define y X9
#define XTMP6 X10
#define XTMP7 X11

#include "aesni_macros_amd64.s"

@ -48,7 +48,7 @@
|
||||
// - t2: 128 bits register for data
|
||||
// - t3: 128 bits register for data
|
||||
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
|
||||
PINSRD $0, (index * 4)(BX)(CX*1), x; \
|
||||
MOVL (index * 4)(BX)(CX*1), x; \
|
||||
PXOR t1, x; \
|
||||
PXOR t2, x; \
|
||||
PXOR t3, x; \
|
||||
@ -68,6 +68,16 @@
|
||||
#define XWORD2 X6
|
||||
#define XWORD3 X7
|
||||
|
||||
#define XDWORD4 Y10
|
||||
#define XDWORD5 Y11
|
||||
#define XDWORD6 Y12
|
||||
#define XDWORD7 Y14
|
||||
|
||||
#define XWORD4 X10
|
||||
#define XWORD5 X11
|
||||
#define XWORD6 X12
|
||||
#define XWORD7 X14
|
||||
|
||||
#define XDWTMP0 Y0
|
||||
#define XDWTMP1 Y1
|
||||
#define XDWTMP2 Y2
|
||||
@ -133,91 +143,93 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
|
||||
JE avx
|
||||
|
||||
non_avx2_start:
|
||||
MOVOU 0(DX), t0
|
||||
MOVOU 16(DX), t1
|
||||
MOVOU 32(DX), t2
|
||||
MOVOU 48(DX), t3
|
||||
PSHUFB flip_mask<>(SB), t0
|
||||
PSHUFB flip_mask<>(SB), t1
|
||||
PSHUFB flip_mask<>(SB), t2
|
||||
PSHUFB flip_mask<>(SB), t3
|
||||
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)
|
||||
CMPQ DI, $128
|
||||
JEQ sse_8blocks
|
||||
|
||||
XORL CX, CX
|
||||
MOVOU 0(DX), XWORD0
|
||||
MOVOU 16(DX), XWORD1
|
||||
MOVOU 32(DX), XWORD2
|
||||
MOVOU 48(DX), XWORD3
|
||||
|
||||
loop:
|
||||
SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
|
||||
SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
|
||||
SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
|
||||
SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
|
||||
|
||||
ADDL $16, CX
|
||||
CMPL CX, $4*32
|
||||
JB loop
|
||||
|
||||
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);
|
||||
PSHUFB bswap_mask<>(SB), t3
|
||||
PSHUFB bswap_mask<>(SB), t2
|
||||
PSHUFB bswap_mask<>(SB), t1
|
||||
PSHUFB bswap_mask<>(SB), t0
|
||||
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
|
||||
MOVOU t0, 0(BX)
|
||||
MOVOU t1, 16(BX)
|
||||
MOVOU t2, 32(BX)
|
||||
MOVOU t3, 48(BX)
|
||||
MOVOU XWORD0, 0(BX)
|
||||
MOVOU XWORD1, 16(BX)
|
||||
MOVOU XWORD2, 32(BX)
|
||||
MOVOU XWORD3, 48(BX)
|
||||
|
||||
RET
|
||||
|
||||
sse_8blocks:
|
||||
MOVOU 0(DX), XWORD0
|
||||
MOVOU 16(DX), XWORD1
|
||||
MOVOU 32(DX), XWORD2
|
||||
MOVOU 48(DX), XWORD3
|
||||
MOVOU 64(DX), XWORD4
|
||||
MOVOU 80(DX), XWORD5
|
||||
MOVOU 96(DX), XWORD6
|
||||
MOVOU 112(DX), XWORD7
|
||||
|
||||
SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
|
||||
|
||||
MOVOU XWORD0, 0(BX)
|
||||
MOVOU XWORD1, 16(BX)
|
||||
MOVOU XWORD2, 32(BX)
|
||||
MOVOU XWORD3, 48(BX)
|
||||
MOVOU XWORD4, 64(BX)
|
||||
MOVOU XWORD5, 80(BX)
|
||||
MOVOU XWORD6, 96(BX)
|
||||
MOVOU XWORD7, 112(BX)
|
||||
done_sm4:
|
||||
RET
|
||||
|
||||
avx:
|
||||
CMPQ DI, $128
|
||||
JEQ avx_8blocks
|
||||
|
||||
VMOVDQU 0(DX), XWORD0
|
||||
VMOVDQU 16(DX), XWORD1
|
||||
VMOVDQU 32(DX), XWORD2
|
||||
VMOVDQU 48(DX), XWORD3
|
||||
|
||||
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
|
||||
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
|
||||
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
|
||||
|
||||
XORL CX, CX
|
||||
|
||||
avx_loop:
|
||||
AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
|
||||
ADDL $16, CX
|
||||
CMPL CX, $4*32
|
||||
JB avx_loop
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
|
||||
|
||||
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
|
||||
VMOVDQU XWORD0, 0(BX)
|
||||
VMOVDQU XWORD1, 16(BX)
|
||||
VMOVDQU XWORD2, 32(BX)
|
||||
VMOVDQU XWORD3, 48(BX)
|
||||
|
||||
RET
|
||||
|
||||
avx_8blocks:
|
||||
VMOVDQU 0(DX), XWORD0
|
||||
VMOVDQU 16(DX), XWORD1
|
||||
VMOVDQU 32(DX), XWORD2
|
||||
VMOVDQU 48(DX), XWORD3
|
||||
VMOVDQU 64(DX), XWORD4
|
||||
VMOVDQU 80(DX), XWORD5
|
||||
VMOVDQU 96(DX), XWORD6
|
||||
VMOVDQU 112(DX), XWORD7
|
||||
|
||||
AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
|
||||
|
||||
VMOVDQU XWORD0, 0(BX)
|
||||
VMOVDQU XWORD1, 16(BX)
|
||||
VMOVDQU XWORD2, 32(BX)
|
||||
VMOVDQU XWORD3, 48(BX)
|
||||
VMOVDQU XWORD4, 64(BX)
|
||||
VMOVDQU XWORD5, 80(BX)
|
||||
VMOVDQU XWORD6, 96(BX)
|
||||
VMOVDQU XWORD7, 112(BX)
|
||||
|
||||
avx_done_sm4:
|
||||
RET
|
||||
|
||||
avx2:
|
||||
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
|
||||
CMPQ DI, $64
|
||||
JBE avx2_4blocks
|
||||
|
||||
CMPQ DI, $256
|
||||
JEQ avx2_16blocks
|
||||
|
||||
avx2_8blocks:
|
||||
VMOVDQU 0(DX), XDWORD0
|
||||
@ -235,17 +247,7 @@ avx2_8blocks:
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
|
||||
|
||||
XORL CX, CX
|
||||
|
||||
avx2_loop:
|
||||
AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
|
||||
ADDL $16, CX
|
||||
CMPL CX, $4*32
|
||||
JB avx2_loop
|
||||
AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
|
||||
@ -260,49 +262,60 @@ avx2_loop:
|
||||
VMOVDQU XDWORD1, 32(BX)
|
||||
VMOVDQU XDWORD2, 64(BX)
|
||||
VMOVDQU XDWORD3, 96(BX)
|
||||
JMP avx2_sm4_done
|
||||
|
||||
avx2_4blocks:
|
||||
VMOVDQU 0(DX), XWORD0
|
||||
VMOVDQU 16(DX), XWORD1
|
||||
VMOVDQU 32(DX), XWORD2
|
||||
VMOVDQU 48(DX), XWORD3
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
||||
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
|
||||
avx2_16blocks:
|
||||
VMOVDQU 0(DX), XDWORD0
|
||||
VMOVDQU 32(DX), XDWORD1
|
||||
VMOVDQU 64(DX), XDWORD2
|
||||
VMOVDQU 96(DX), XDWORD3
|
||||
VMOVDQU 128(DX), XDWORD4
|
||||
VMOVDQU 160(DX), XDWORD5
|
||||
VMOVDQU 192(DX), XDWORD6
|
||||
VMOVDQU 224(DX), XDWORD7
|
||||
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
||||
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
|
||||
|
||||
// Apply Byte Flip Mask: LE -> BE
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
|
||||
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
|
||||
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
|
||||
|
||||
XORL CX, CX
|
||||
|
||||
avx2_4blocks_loop:
|
||||
AVX2_SM4_ROUND_4BLOCKS(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
AVX2_SM4_ROUND_4BLOCKS(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
AVX2_SM4_ROUND_4BLOCKS(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
AVX2_SM4_ROUND_4BLOCKS(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
|
||||
ADDL $16, CX
|
||||
CMPL CX, $4*32
|
||||
JB avx2_4blocks_loop
|
||||
AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
|
||||
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
|
||||
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
|
||||
|
||||
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
||||
|
||||
VMOVDQU XWORD0, 0(BX)
|
||||
VMOVDQU XWORD1, 16(BX)
|
||||
VMOVDQU XWORD2, 32(BX)
|
||||
VMOVDQU XWORD3, 48(BX)
|
||||
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
|
||||
|
||||
VMOVDQU XDWORD0, 0(BX)
|
||||
VMOVDQU XDWORD1, 32(BX)
|
||||
VMOVDQU XDWORD2, 64(BX)
|
||||
VMOVDQU XDWORD3, 96(BX)
|
||||
VMOVDQU XDWORD4, 128(BX)
|
||||
VMOVDQU XDWORD5, 160(BX)
|
||||
VMOVDQU XDWORD6, 192(BX)
|
||||
VMOVDQU XDWORD7, 224(BX)
|
||||
|
||||
avx2_sm4_done:
|
||||
VZEROUPPER
|
||||
|
@@ -9,6 +9,10 @@
#define t1 V3
#define t2 V4
#define t3 V5
#define t4 V8
#define t5 V9
#define t6 V10
#define t7 V11
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
@ -184,6 +188,9 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
|
||||
CMP $1, R11
|
||||
BEQ sm4niblocks
|
||||
|
||||
CMP $128, R12
|
||||
BEQ double_enc
|
||||
|
||||
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
|
||||
VREV32 t0.B16, t0.B16
|
||||
VREV32 t1.B16, t1.B16
|
||||
@ -215,6 +222,51 @@ encryptBlocksLoop:
|
||||
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
|
||||
RET
|
||||
|
||||
double_enc:
|
||||
VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
|
||||
VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
|
||||
VREV32 t0.B16, t0.B16
|
||||
VREV32 t1.B16, t1.B16
|
||||
VREV32 t2.B16, t2.B16
|
||||
VREV32 t3.B16, t3.B16
|
||||
VREV32 t4.B16, t4.B16
|
||||
VREV32 t5.B16, t5.B16
|
||||
VREV32 t6.B16, t6.B16
|
||||
VREV32 t7.B16, t7.B16
|
||||
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
|
||||
PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
|
||||
|
||||
load_global_data_2()
|
||||
|
||||
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
||||
EOR R0, R0
|
||||
|
||||
encrypt8BlocksLoop:
|
||||
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
|
||||
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
|
||||
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
|
||||
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
|
||||
|
||||
ADD $16, R0
|
||||
CMP $128, R0
|
||||
BNE encrypt8BlocksLoop
|
||||
|
||||
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
|
||||
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
|
||||
VREV32 t0.B16, t0.B16
|
||||
VREV32 t1.B16, t1.B16
|
||||
VREV32 t2.B16, t2.B16
|
||||
VREV32 t3.B16, t3.B16
|
||||
VREV32 t4.B16, t4.B16
|
||||
VREV32 t5.B16, t5.B16
|
||||
VREV32 t6.B16, t6.B16
|
||||
VREV32 t7.B16, t7.B16
|
||||
|
||||
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
|
||||
VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)
|
||||
|
||||
RET
|
||||
|
||||
sm4niblocks:
|
||||
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
|
||||
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
|
||||
|
@@ -74,11 +74,18 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
	// Copy the last block of ciphertext in preparation as the new iv.
	copy(x.tmp, src[end-BlockSize:end])

	start := end - x.b.blocksSize
	var temp []byte = make([]byte, x.b.blocksSize)
	var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)

	decKeyPtr := &x.b.dec[0]

	start := end - 2*x.b.blocksSize
	for start > 0 {
		decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
		end = start
		start -= 2*x.b.blocksSize
	}

	start = end - x.b.blocksSize
	for start > 0 {
		decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
		end = start
@@ -86,6 +93,8 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
	}

	// Handle remaining first blocks
	var temp []byte = make([]byte, x.b.blocksSize)
	var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
	copy(batchSrc, x.iv)
	copy(batchSrc[BlockSize:], src[:end])
	decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0])
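Because CBC decryption has no chaining dependency between output blocks (each plaintext is D(c_i) xor c_{i-1}), the tail of the buffer can be decrypted in wide batches working backwards: first 2*blocksSize chunks, which the amd64/arm64 kernels in this commit special-case as the 8-block path, then blocksSize chunks, and only the leading partial batch needs the saved IV. A condensed sketch of the control flow with illustrative names (decryptBatch stands in for decryptBlocksChain):

	// cbcDecryptTail is a sketch, not the real implementation. decryptBatch(dst,
	// src, prev) decrypts a whole number of blocks, chaining against prev, the
	// ciphertext block immediately preceding src.
	func cbcDecryptTail(dst, src []byte, blockSize, batch int, decryptBatch func(dst, src, prev []byte)) {
		end := len(src)
		for _, width := range []int{2 * batch, batch} { // widest batches first
			for end-width > 0 {
				start := end - width
				// Walking backwards keeps src[start-blockSize:start] untouched
				// ciphertext when it is needed as the chain value, even if dst
				// aliases src.
				decryptBatch(dst[start:end], src[start:end], src[start-blockSize:start])
				end = start
			}
		}
		// The first 'end' bytes still depend on the IV and are handled separately.
	}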
@ -85,6 +85,11 @@ done_sm4:
|
||||
#define XDWORD2 Y6
|
||||
#define XDWORD3 Y7
|
||||
|
||||
#define XDWORD4 Y10
|
||||
#define XDWORD5 Y11
|
||||
#define XDWORD6 Y12
|
||||
#define XDWORD7 Y14
|
||||
|
||||
#define XWTMP0 X0
|
||||
#define XWTMP1 X1
|
||||
#define XWTMP2 X2
|
||||
@ -94,6 +99,11 @@ done_sm4:
|
||||
#define XWORD2 X6
|
||||
#define XWORD3 X7
|
||||
|
||||
#define XWORD4 X10
|
||||
#define XWORD5 X11
|
||||
#define XWORD6 X12
|
||||
#define XWORD7 X14
|
||||
|
||||
#define NIBBLE_MASK Y3
|
||||
#define X_NIBBLE_MASK X3
|
||||
|
||||
@ -111,6 +121,7 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
|
||||
MOVQ xk+0(FP), AX
|
||||
MOVQ dst+8(FP), BX
|
||||
MOVQ src+32(FP), DX
|
||||
MOVQ src_len+40(FP), DI
|
||||
MOVQ iv+56(FP), SI
|
||||
|
||||
CMPB ·useAVX2(SB), $1
|
||||
@ -120,84 +131,71 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
|
||||
JE avx
|
||||
|
||||
non_avx2_start:
|
||||
MOVOU 0(DX), t0
|
||||
MOVOU 16(DX), t1
|
||||
MOVOU 32(DX), t2
|
||||
MOVOU 48(DX), t3
|
||||
PSHUFB flip_mask<>(SB), t0
|
||||
PSHUFB flip_mask<>(SB), t1
|
||||
PSHUFB flip_mask<>(SB), t2
|
||||
PSHUFB flip_mask<>(SB), t3
|
||||
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)
|
||||
CMPQ DI, $128
|
||||
JEQ sse_8blocks
|
||||
|
||||
XORL CX, CX
|
||||
MOVOU 0(DX), XWORD0
|
||||
MOVOU 16(DX), XWORD1
|
||||
MOVOU 32(DX), XWORD2
|
||||
MOVOU 48(DX), XWORD3
|
||||
|
||||
loop:
|
||||
SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
|
||||
SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
|
||||
SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
|
||||
SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
|
||||
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
|
||||
ADDL $16, CX
|
||||
CMPL CX, $4*32
|
||||
JB loop
|
||||
PXOR 0(SI), XWORD0
|
||||
PXOR 16(SI), XWORD1
|
||||
PXOR 32(SI), XWORD2
|
||||
PXOR 48(SI), XWORD3
|
||||
|
||||
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);
|
||||
PSHUFB bswap_mask<>(SB), t3
|
||||
PSHUFB bswap_mask<>(SB), t2
|
||||
PSHUFB bswap_mask<>(SB), t1
|
||||
PSHUFB bswap_mask<>(SB), t0
|
||||
MOVUPS XWORD0, 0(BX)
|
||||
MOVUPS XWORD1, 16(BX)
|
||||
MOVUPS XWORD2, 32(BX)
|
||||
MOVUPS XWORD3, 48(BX)
|
||||
|
||||
PXOR 0(SI), t0
|
||||
PXOR 16(SI), t1
|
||||
PXOR 32(SI), t2
|
||||
PXOR 48(SI), t3
|
||||
RET
|
||||
|
||||
MOVUPS t0, 0(BX)
|
||||
MOVUPS t1, 16(BX)
|
||||
MOVUPS t2, 32(BX)
|
||||
MOVUPS t3, 48(BX)
|
||||
sse_8blocks:
|
||||
MOVOU 0(DX), XWORD0
|
||||
MOVOU 16(DX), XWORD1
|
||||
MOVOU 32(DX), XWORD2
|
||||
MOVOU 48(DX), XWORD3
|
||||
MOVOU 64(DX), XWORD4
|
||||
MOVOU 80(DX), XWORD5
|
||||
MOVOU 96(DX), XWORD6
|
||||
MOVOU 112(DX), XWORD7
|
||||
|
||||
SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
|
||||
|
||||
PXOR 0(SI), XWORD0
|
||||
PXOR 16(SI), XWORD1
|
||||
PXOR 32(SI), XWORD2
|
||||
PXOR 48(SI), XWORD3
|
||||
PXOR 64(SI), XWORD4
|
||||
PXOR 80(SI), XWORD5
|
||||
PXOR 96(SI), XWORD6
|
||||
PXOR 112(SI), XWORD7
|
||||
|
||||
MOVOU XWORD0, 0(BX)
|
||||
MOVOU XWORD1, 16(BX)
|
||||
MOVOU XWORD2, 32(BX)
|
||||
MOVOU XWORD3, 48(BX)
|
||||
MOVOU XWORD4, 64(BX)
|
||||
MOVOU XWORD5, 80(BX)
|
||||
MOVOU XWORD6, 96(BX)
|
||||
MOVOU XWORD7, 112(BX)
|
||||
|
||||
done_sm4:
|
||||
RET
|
||||
|
||||
avx:
|
||||
CMPQ DI, $128
|
||||
JEQ avx_8blocks
|
||||
|
||||
VMOVDQU 0(DX), XWORD0
|
||||
VMOVDQU 16(DX), XWORD1
|
||||
VMOVDQU 32(DX), XWORD2
|
||||
VMOVDQU 48(DX), XWORD3
|
||||
|
||||
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
|
||||
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
|
||||
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
|
||||
|
||||
XORL CX, CX
|
||||
|
||||
avx_loop:
|
||||
AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
|
||||
ADDL $16, CX
|
||||
CMPL CX, $4*32
|
||||
JB avx_loop
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
|
||||
|
||||
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
|
||||
VPXOR 0(SI), XWORD0, XWORD0
|
||||
VPXOR 16(SI), XWORD1, XWORD1
|
||||
@ -208,11 +206,45 @@ avx_loop:
|
||||
VMOVDQU XWORD1, 16(BX)
|
||||
VMOVDQU XWORD2, 32(BX)
|
||||
VMOVDQU XWORD3, 48(BX)
|
||||
RET
|
||||
|
||||
avx_8blocks:
|
||||
VMOVDQU 0(DX), XWORD0
|
||||
VMOVDQU 16(DX), XWORD1
|
||||
VMOVDQU 32(DX), XWORD2
|
||||
VMOVDQU 48(DX), XWORD3
|
||||
VMOVDQU 64(DX), XWORD4
|
||||
VMOVDQU 80(DX), XWORD5
|
||||
VMOVDQU 96(DX), XWORD6
|
||||
VMOVDQU 112(DX), XWORD7
|
||||
|
||||
AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
|
||||
|
||||
VPXOR 0(SI), XWORD0, XWORD0
|
||||
VPXOR 16(SI), XWORD1, XWORD1
|
||||
VPXOR 32(SI), XWORD2, XWORD2
|
||||
VPXOR 48(SI), XWORD3, XWORD3
|
||||
VPXOR 64(SI), XWORD4, XWORD4
|
||||
VPXOR 80(SI), XWORD5, XWORD5
|
||||
VPXOR 96(SI), XWORD6, XWORD6
|
||||
VPXOR 112(SI), XWORD7, XWORD7
|
||||
|
||||
VMOVDQU XWORD0, 0(BX)
|
||||
VMOVDQU XWORD1, 16(BX)
|
||||
VMOVDQU XWORD2, 32(BX)
|
||||
VMOVDQU XWORD3, 48(BX)
|
||||
VMOVDQU XWORD4, 64(BX)
|
||||
VMOVDQU XWORD5, 80(BX)
|
||||
VMOVDQU XWORD6, 96(BX)
|
||||
VMOVDQU XWORD7, 112(BX)
|
||||
|
||||
avx_sm4_done:
|
||||
RET
|
||||
|
||||
avx2:
|
||||
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
|
||||
CMPQ DI, $256
|
||||
JEQ avx2_16blocks
|
||||
|
||||
avx2_8blocks:
|
||||
VMOVDQU 0(DX), XDWORD0
|
||||
@ -230,17 +262,7 @@ avx2_8blocks:
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
|
||||
|
||||
XORL CX, CX
|
||||
|
||||
avx2_loop:
|
||||
AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
|
||||
ADDL $16, CX
|
||||
CMPL CX, $4*32
|
||||
JB avx2_loop
|
||||
AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
|
||||
@ -261,6 +283,68 @@ avx2_loop:
|
||||
VMOVDQU XDWORD2, 64(BX)
|
||||
VMOVDQU XDWORD3, 96(BX)
|
||||
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
||||
avx2_16blocks:
|
||||
VMOVDQU 0(DX), XDWORD0
|
||||
VMOVDQU 32(DX), XDWORD1
|
||||
VMOVDQU 64(DX), XDWORD2
|
||||
VMOVDQU 96(DX), XDWORD3
|
||||
VMOVDQU 128(DX), XDWORD4
|
||||
VMOVDQU 160(DX), XDWORD5
|
||||
VMOVDQU 192(DX), XDWORD6
|
||||
VMOVDQU 224(DX), XDWORD7
|
||||
|
||||
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
|
||||
|
||||
// Apply Byte Flip Mask: LE -> BE
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
|
||||
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
|
||||
|
||||
AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
|
||||
|
||||
// Transpose matrix 4 x 4 32bits word
|
||||
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
|
||||
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
|
||||
|
||||
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
|
||||
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
|
||||
|
||||
VPXOR 0(SI), XDWORD0, XDWORD0
|
||||
VPXOR 32(SI), XDWORD1, XDWORD1
|
||||
VPXOR 64(SI), XDWORD2, XDWORD2
|
||||
VPXOR 96(SI), XDWORD3, XDWORD3
|
||||
VPXOR 128(SI), XDWORD4, XDWORD4
|
||||
VPXOR 160(SI), XDWORD5, XDWORD5
|
||||
VPXOR 192(SI), XDWORD6, XDWORD6
|
||||
VPXOR 224(SI), XDWORD7, XDWORD7
|
||||
|
||||
VMOVDQU XDWORD0, 0(BX)
|
||||
VMOVDQU XDWORD1, 32(BX)
|
||||
VMOVDQU XDWORD2, 64(BX)
|
||||
VMOVDQU XDWORD3, 96(BX)
|
||||
VMOVDQU XDWORD4, 128(BX)
|
||||
VMOVDQU XDWORD5, 160(BX)
|
||||
VMOVDQU XDWORD6, 192(BX)
|
||||
VMOVDQU XDWORD7, 224(BX)
|
||||
|
||||
avx2_sm4_done:
|
||||
VZEROUPPER
|
||||
|
@ -88,6 +88,10 @@ done_sm4:
|
||||
#undef rkSave
|
||||
|
||||
#define XTMP7 V7
|
||||
#define t4 V10
|
||||
#define t5 V11
|
||||
#define t6 V12
|
||||
#define t7 V13
|
||||
|
||||
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
|
||||
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
|
||||
@ -99,6 +103,8 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
|
||||
MOVD src_len+40(FP), R12
|
||||
MOVD iv+56(FP), R11
|
||||
|
||||
CMP $128, R12
|
||||
BEQ double_dec
|
||||
|
||||
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
|
||||
VREV32 t0.B16, t0.B16
|
||||
@ -135,3 +141,57 @@ encryptBlocksLoop:
|
||||
|
||||
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
|
||||
RET
|
||||
|
||||
double_dec:
|
||||
VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
|
||||
VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
|
||||
VREV32 t0.B16, t0.B16
|
||||
VREV32 t1.B16, t1.B16
|
||||
VREV32 t2.B16, t2.B16
|
||||
VREV32 t3.B16, t3.B16
|
||||
VREV32 t4.B16, t4.B16
|
||||
VREV32 t5.B16, t5.B16
|
||||
VREV32 t6.B16, t6.B16
|
||||
VREV32 t7.B16, t7.B16
|
||||
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
|
||||
PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
|
||||
|
||||
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
||||
EOR R0, R0
|
||||
|
||||
decrypt8BlocksLoop:
|
||||
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
|
||||
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
|
||||
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
|
||||
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
|
||||
|
||||
ADD $16, R0
|
||||
CMP $128, R0
|
||||
BNE decrypt8BlocksLoop
|
||||
|
||||
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
|
||||
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
|
||||
VREV32 t0.B16, t0.B16
|
||||
VREV32 t1.B16, t1.B16
|
||||
VREV32 t2.B16, t2.B16
|
||||
VREV32 t3.B16, t3.B16
|
||||
VREV32 t4.B16, t4.B16
|
||||
VREV32 t5.B16, t5.B16
|
||||
VREV32 t6.B16, t6.B16
|
||||
VREV32 t7.B16, t7.B16
|
||||
|
||||
VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
|
||||
VEOR V6.B16, t0.B16, t0.B16
|
||||
VEOR V7.B16, t1.B16, t1.B16
|
||||
VEOR V8.B16, t2.B16, t2.B16
|
||||
VEOR V9.B16, t3.B16, t3.B16
|
||||
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
|
||||
|
||||
VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
|
||||
VEOR V6.B16, t4.B16, t4.B16
|
||||
VEOR V7.B16, t5.B16, t5.B16
|
||||
VEOR V8.B16, t6.B16, t6.B16
|
||||
VEOR V9.B16, t7.B16, t7.B16
|
||||
VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)
|
||||
|
||||
RET
|
||||
|
@@ -54,6 +54,15 @@ func (x *ecb) CryptBlocks(dst, src []byte) {
	if len(src) == 0 {
		return
	}
	for len(src) >= 2*x.b.blocksSize {
		if x.enc == ecbEncrypt {
			x.b.EncryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
		} else {
			x.b.DecryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
		}
		src = src[2*x.b.blocksSize:]
		dst = dst[2*x.b.blocksSize:]
	}
	for len(src) >= x.b.blocksSize {
		if x.enc == ecbEncrypt {
			x.b.EncryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize])
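The added loop is the generic double-width batching pattern: consume as many 2*blocksSize chunks as possible (a length the amd64/arm64 kernels special-case via their comparisons against $128), then fall back to single batches, and let the existing per-block tail handle the rest. A condensed sketch of the same pattern with illustrative names:

	// processBatches is a sketch of the pattern used in CryptBlocks above:
	// widest batches first, so the assembly kernels see their preferred lengths.
	func processBatches(dst, src []byte, batch int, crypt func(dst, src []byte)) {
		for _, width := range []int{2 * batch, batch} {
			for len(src) >= width {
				crypt(dst[:width], src[:width])
				dst, src = dst[width:], src[width:]
			}
		}
		// any remaining full blocks are processed one block at a time by the caller
	}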
378 sm4/gcm_amd64.s
@ -155,114 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#undef plen
#undef dlen

#define AVX_SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
    VMOVDQU flip_mask<>(SB), x \
    VPSHUFB x, t0, t0 \
    VPSHUFB x, t1, t1 \
    VPSHUFB x, t2, t2 \
    VPSHUFB x, t3, t3 \
    ; \
    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
    XORL IND, IND \
    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK \
    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ; \ // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
    VPSHUFB BSWAP, t0, t0 \
    VPSHUFB BSWAP, t1, t1 \
    VPSHUFB BSWAP, t2, t2 \
    VPSHUFB BSWAP, t3, t3 \

#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
    PSHUFB flip_mask<>(SB), t0; \
    PSHUFB flip_mask<>(SB), t1; \
    PSHUFB flip_mask<>(SB), t2; \
    PSHUFB flip_mask<>(SB), t3; \
    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
    XORL IND, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
    PSHUFB BSWAP, t3; \
    PSHUFB BSWAP, t2; \
    PSHUFB BSWAP, t1; \
    PSHUFB BSWAP, t0

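Both macros above unroll the full SM4 schedule the same way: eight groups of four rounds (32 rounds in total), with the destination register rotating t0 -> t1 -> t2 -> t3 so each round key updates a fresh word while the other three feed the round function. A structural Go sketch of that scheduling; the S-box table is left empty here, so it illustrates the register rotation only, not a working cipher:

package sm4sketch

// rotl32 is the 32-bit left rotation used by SM4's linear layer.
func rotl32(x uint32, n uint) uint32 { return x<<n | x>>(32-n) }

// sbox is left zeroed in this sketch; a real implementation fills in the SM4 S-box.
var sbox [256]byte

// tau applies the S-box to each byte of a word.
func tau(a uint32) uint32 {
    return uint32(sbox[a>>24])<<24 | uint32(sbox[a>>16&0xff])<<16 |
        uint32(sbox[a>>8&0xff])<<8 | uint32(sbox[a&0xff])
}

// l is the encryption linear transform L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24).
func l(b uint32) uint32 {
    return b ^ rotl32(b, 2) ^ rotl32(b, 10) ^ rotl32(b, 18) ^ rotl32(b, 24)
}

// rounds32 runs the 32 rounds with the same rotating destination pattern the
// assembly macros unroll: four rounds per group, eight groups.
func rounds32(rk *[32]uint32, t0, t1, t2, t3 uint32) (uint32, uint32, uint32, uint32) {
    for i := 0; i < 32; i += 4 {
        t0 ^= l(tau(t1 ^ t2 ^ t3 ^ rk[i]))
        t1 ^= l(tau(t2 ^ t3 ^ t0 ^ rk[i+1]))
        t2 ^= l(tau(t3 ^ t0 ^ t1 ^ rk[i+2]))
        t3 ^= l(tau(t0 ^ t1 ^ t2 ^ rk[i+3]))
    }
    // SM4 emits the state in reverse word order after the last round.
    return t3, t2, t1, t0
}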
// func gcmSm4Init(productTable *[256]byte, rk []uint32)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define dst DI
@ -676,12 +568,12 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
    MOVOU (8*16 + 6*16)(SP), B6
    MOVOU (8*16 + 7*16)(SP), B7

    SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
    SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
    PXOR ACC1, ACC1
    increment(0)
    increment(1)
    increment(2)
    increment(3)
    SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
    increment(4)
    increment(5)
    increment(6)
@ -762,7 +654,6 @@ gcmSm4EncOctetsLoop:
    PCLMULQDQ $0x00, T0, ACC0
    PCLMULQDQ $0x11, T0, ACC1

    SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
    mulRound(1)
    increment(0)
    mulRound(2)
@ -771,7 +662,6 @@ gcmSm4EncOctetsLoop:
    increment(2)
    mulRound(4)
    increment(3)
    SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
    mulRound(5)
    increment(4)
    mulRound(6)
@ -791,6 +681,8 @@ gcmSm4EncOctetsLoop:
    reduceRound(ACC0)
    PXOR ACC1, ACC0

    SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

    MOVOU (16*0)(ptx), T0
    PXOR T0, B0
    MOVOU (16*1)(ptx), T0
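In the widened loop above, one SM4_8BLOCKS call turns eight counter blocks into keystream that is then XORed with the plaintext, where the old code needed two SM4_4BLOCKS passes. A hedged Go sketch of that counter-mode batching, with a hypothetical encryptBlocks callback standing in for the assembly routine:

package ctrsketch

// encrypt8Ctr handles one 8-block (128-byte) batch of GCM's CTR stream:
// the counter blocks prepared on the stack are encrypted in a single
// batched call, then XORed into the plaintext to produce ciphertext.
func encrypt8Ctr(encryptBlocks func(dst, src []byte), counters, plaintext, ciphertext []byte) {
    keystream := make([]byte, len(counters)) // 8 * 16 bytes per trip
    encryptBlocks(keystream, counters)       // one 8-block call instead of two 4-block calls
    for i := range keystream {
        ciphertext[i] = plaintext[i] ^ keystream[i]
    }
}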
@ -886,7 +778,7 @@ gcmSm4EncNibbles:
    MOVOU (8*16 + 2*16)(SP), B2
    MOVOU (8*16 + 3*16)(SP), B3

    SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
    SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
    MOVOU (16*0)(ptx), T0
    PXOR T0, B0
    MOVOU (16*1)(ptx), T0
@ -922,7 +814,7 @@ gcmSm4EncSingles:
    MOVOU (8*16 + 2*16)(SP), B2
    MOVOU (8*16 + 3*16)(SP), B3

    SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
    SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
    MOVOU B0, (16*0)(SP)
    MOVOU B1, (16*1)(SP)
    MOVOU B2, (16*2)(SP)
@ -1014,17 +906,30 @@ avxGcmSm4Enc:
    VMOVDQU (8*16 + 1*16)(SP), B1
    VMOVDQU (8*16 + 2*16)(SP), B2
    VMOVDQU (8*16 + 3*16)(SP), B3
    VMOVDQU (8*16 + 4*16)(SP), B4
    VMOVDQU (8*16 + 5*16)(SP), B5
    VMOVDQU (8*16 + 6*16)(SP), B6
    VMOVDQU (8*16 + 7*16)(SP), B7

    AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
    AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
    VPXOR ACC1, ACC1, ACC1 // clean ACC1
    increment(0)
    increment(1)
    increment(2)
    increment(3)
    increment(4)
    increment(5)
    increment(6)
    increment(7)
    // XOR plaintext
    VPXOR (16*0)(ptx), B0, B0
    VPXOR (16*1)(ptx), B1, B1
    VPXOR (16*2)(ptx), B2, B2
    VPXOR (16*3)(ptx), B3, B3
    VPXOR (16*4)(ptx), B4, B4
    VPXOR (16*5)(ptx), B5, B5
    VPXOR (16*6)(ptx), B6, B6
    VPXOR (16*7)(ptx), B7, B7
    // Store ciphertext
    VMOVDQU B0, (16*0)(ctx)
    VPSHUFB BSWAP, B0, B0
@ -1034,31 +939,6 @@ avxGcmSm4Enc:
    VPSHUFB BSWAP, B2, B2
    VMOVDQU B3, (16*3)(ctx)
    VPSHUFB BSWAP, B3, B3
    VPXOR ACC0, B0, B0

    VMOVDQU B0, (16*0)(SP)
    VMOVDQU B1, (16*1)(SP)
    VMOVDQU B2, (16*2)(SP)
    VMOVDQU B3, (16*3)(SP)

    // load 4 ctrs for encryption
    VMOVDQU (8*16 + 4*16)(SP), B4
    VMOVDQU (8*16 + 5*16)(SP), B5
    VMOVDQU (8*16 + 6*16)(SP), B6
    VMOVDQU (8*16 + 7*16)(SP), B7
    AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
    increment(4)
    increment(5)
    increment(6)
    increment(7)

    // XOR plaintext
    VPXOR (16*4)(ptx), B4, B4
    VPXOR (16*5)(ptx), B5, B5
    VPXOR (16*6)(ptx), B6, B6
    VPXOR (16*7)(ptx), B7, B7

    // Store ciphertext
    VMOVDQU B4, (16*4)(ctx)
    VPSHUFB BSWAP, B4, B4
    VMOVDQU B5, (16*5)(ctx)
@ -1068,6 +948,12 @@ avxGcmSm4Enc:
    VMOVDQU B7, (16*7)(ctx)
    VPSHUFB BSWAP, B7, B7

    VPXOR ACC0, B0, B0

    VMOVDQU B0, (16*0)(SP)
    VMOVDQU B1, (16*1)(SP)
    VMOVDQU B2, (16*2)(SP)
    VMOVDQU B3, (16*3)(SP)
    VMOVDQU B4, (16*4)(SP)
    VMOVDQU B5, (16*5)(SP)
    VMOVDQU B6, (16*6)(SP)
@ -1129,12 +1015,16 @@ avxGcmSm4EncOctetsLoop:
    avxReduceRound(ACC0)
    VPXOR ACC1, ACC0, ACC0

    AVX_SM4_4BLOCKS(rk, BX, ACC1, T1, T2, B0, B1, B2, B3)
    AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
    // XOR plaintext
    VPXOR (16*0)(ptx), B0, B0
    VPXOR (16*1)(ptx), B1, B1
    VPXOR (16*2)(ptx), B2, B2
    VPXOR (16*3)(ptx), B3, B3
    VPXOR (16*4)(ptx), B4, B4
    VPXOR (16*5)(ptx), B5, B5
    VPXOR (16*6)(ptx), B6, B6
    VPXOR (16*7)(ptx), B7, B7

    // Store ciphertext
    VMOVDQU B0, (16*0)(ctx)
@ -1145,21 +1035,6 @@ avxGcmSm4EncOctetsLoop:
    VPSHUFB BSWAP, B2, B2
    VMOVDQU B3, (16*3)(ctx)
    VPSHUFB BSWAP, B3, B3

    VPXOR ACC0, B0, B0
    VMOVDQU B0, (16*0)(SP)
    VMOVDQU B1, (16*1)(SP)
    VMOVDQU B2, (16*2)(SP)
    VMOVDQU B3, (16*3)(SP)

    AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
    // XOR plaintext
    VPXOR (16*4)(ptx), B4, B4
    VPXOR (16*5)(ptx), B5, B5
    VPXOR (16*6)(ptx), B6, B6
    VPXOR (16*7)(ptx), B7, B7

    // Store ciphertext
    VMOVDQU B4, (16*4)(ctx)
    VPSHUFB BSWAP, B4, B4
    VMOVDQU B5, (16*5)(ctx)
@ -1169,6 +1044,11 @@ avxGcmSm4EncOctetsLoop:
    VMOVDQU B7, (16*7)(ctx)
    VPSHUFB BSWAP, B7, B7

    VPXOR ACC0, B0, B0
    VMOVDQU B0, (16*0)(SP)
    VMOVDQU B1, (16*1)(SP)
    VMOVDQU B2, (16*2)(SP)
    VMOVDQU B3, (16*3)(SP)
    VMOVDQU B4, (16*4)(SP)
    VMOVDQU B5, (16*5)(SP)
    VMOVDQU B6, (16*6)(SP)
@ -1226,7 +1106,7 @@ avxGcmSm4EncNibbles:
    VMOVDQU (8*16 + 2*16)(SP), B2
    VMOVDQU (8*16 + 3*16)(SP), B3

    AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
    AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
    // XOR plaintext
    VPXOR (16*0)(ptx), B0, B0
    VPXOR (16*1)(ptx), B1, B1
@ -1261,7 +1141,7 @@ avxGcmSm4EncSingles:
    VMOVDQU (8*16 + 2*16)(SP), B2
    VMOVDQU (8*16 + 3*16)(SP), B3

    AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
    AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
    VMOVDQU B0, (16*0)(SP)
    VMOVDQU B1, (16*1)(SP)
    VMOVDQU B2, (16*2)(SP)
@ -1364,18 +1244,9 @@ avx2GcmSm4Enc:

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
    XORL BX, BX
    VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

avx2GcmSm4Enc8Loop1:
    AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
    AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
    AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
    AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Enc8Loop1
    AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
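Here the AVX2 path drops the four-iteration round-key loop (avx2GcmSm4Enc8Loop1 with its XORL/ADDL/CMPL/JB bookkeeping) in favour of the fully unrolled AVX2_SM4_8BLOCKS macro. A small Go sketch of the shape of that change, with a hypothetical round helper standing in for one vectorised SM4 round:

package unrollsketch

// round is a stand-in for applying one SM4 round key to the whole block group.
type round func(k uint32)

// loopedRounds is the old shape: eight trips, each applying four round keys
// while maintaining a loop index and a backward branch.
func loopedRounds(rk [32]uint32, r round) {
    for i := 0; i < 32; i += 4 {
        r(rk[i])
        r(rk[i+1])
        r(rk[i+2])
        r(rk[i+3])
    }
}

// unrolledFirstGroup shows the new shape on the first key group only: the
// real macro emits all 32 applications inline, with no index or branch.
func unrolledFirstGroup(rk [32]uint32, r round) {
    r(rk[0])
    r(rk[1])
    r(rk[2])
    r(rk[3])
    // ...repeated for rk[4] through rk[31] in the generated code.
}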
@ -1458,18 +1329,9 @@ avx2GcmSm4EncOctetsLoop:

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
    XORL BX, BX
    VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

avx2GcmSm4Enc8Loop2:
    AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
    AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
    AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
    AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Enc8Loop2
    AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@ -1578,7 +1440,6 @@ avx2GcmSm4EncOctetsEnd:
    SUBQ $4, aluCTR

avx2GcmSm4EncNibbles:
    VMOVDQU flip_mask<>(SB), B7
    CMPQ ptxLen, $64
    JBE avx2GcmSm4EncSingles
    SUBQ $64, ptxLen
@ -1588,31 +1449,7 @@ avx2GcmSm4EncNibbles:
    VMOVDQU (8*16 + 2*16)(SP), B2
    VMOVDQU (8*16 + 3*16)(SP), B3

    VPSHUFB B7, B0, B0
    VPSHUFB B7, B1, B1
    VPSHUFB B7, B2, B2
    VPSHUFB B7, B3, B3

    TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
    XORL BX, BX
    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Enc4Loop2:
    AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
    AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
    AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
    AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Enc4Loop2

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
    VPSHUFB BSWAP, B0, B0
    VPSHUFB BSWAP, B1, B1
    VPSHUFB BSWAP, B2, B2
    VPSHUFB BSWAP, B3, B3
    AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)

    VMOVDQU (16*0)(ptx), T0
    VPXOR T0, B0, B0
@ -1650,31 +1487,7 @@ avx2GcmSm4EncSingles:
    VMOVDQU (8*16 + 2*16)(SP), B2
    VMOVDQU (8*16 + 3*16)(SP), B3

    VPSHUFB B7, B0, B0
    VPSHUFB B7, B1, B1
    VPSHUFB B7, B2, B2
    VPSHUFB B7, B3, B3

    TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
    XORL BX, BX
    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Enc4Loop1:
    AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
    AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
    AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
    AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Enc4Loop1

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
    VPSHUFB BSWAP, B0, B0
    VPSHUFB BSWAP, B1, B1
    VPSHUFB BSWAP, B2, B2
    VPSHUFB BSWAP, B3, B3
    AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)

    VMOVDQU B0, (16*0)(SP)
    VMOVDQU B1, (16*1)(SP)
@ -1890,7 +1703,6 @@ gcmSm4DecOctetsLoop:
    PCLMULQDQ $0x00, T0, ACC0
    PCLMULQDQ $0x11, T0, ACC1

    SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
    decMulRound(1)
    increment(0)
    decMulRound(2)
@ -1899,7 +1711,6 @@ gcmSm4DecOctetsLoop:
    increment(2)
    decMulRound(4)
    increment(3)
    SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
    decMulRound(5)
    increment(4)
    decMulRound(6)
@ -1920,6 +1731,8 @@ gcmSm4DecOctetsLoop:
    reduceRound(ACC0)
    PXOR ACC1, ACC0

    SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

    MOVOU (16*0)(ctx), T0
    PXOR T0, B0
    MOVOU (16*1)(ctx), T0
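The decryption loop mirrors this: the same eight counter blocks go through one SM4_8BLOCKS call, and the keystream is XORed with ciphertext loaded from ctx instead of plaintext. A companion sketch under the same assumptions as the encryption sketch above:

package ctrsketch

// decrypt8Ctr recovers plaintext from an 8-block batch; in CTR mode the
// block cipher is only ever run in the encrypt direction.
func decrypt8Ctr(encryptBlocks func(dst, src []byte), counters, ciphertext, plaintext []byte) {
    keystream := make([]byte, len(counters))
    encryptBlocks(keystream, counters)
    for i := range keystream {
        plaintext[i] = ciphertext[i] ^ keystream[i]
    }
}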
@ -1964,7 +1777,7 @@ gcmSm4DecNibbles:
    MOVOU (2*16)(SP), B6
    MOVOU (3*16)(SP), B7

    SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
    SM4_4BLOCKS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
    MOVOU (16*14)(pTbl), T2
    MOVOU (16*0)(ctx), T0
    PXOR T0, B4
@ -2000,7 +1813,7 @@ gcmSm4DecSingles:
    MOVOU (2*16)(SP), B2
    MOVOU (3*16)(SP), B3

    SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
    SM4_4BLOCKS(rk, B4, T0, T1, T2, B0, B1, B2, B3)
    MOVOU B0, (16*4)(SP)
    MOVOU B1, (16*5)(SP)
    MOVOU B2, (16*6)(SP)
@ -2145,25 +1958,21 @@ avxGcmSm4DecOctetsLoop:
    avxReduceRound(ACC0)
    VPXOR ACC1, ACC0, ACC0

    AVX_SM4_4BLOCKS(rk, BX, ACC1, T1, T2, B0, B1, B2, B3)
    AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

    VPXOR (16*0)(ctx), B0, B0
    VPXOR (16*1)(ctx), B1, B1
    VPXOR (16*2)(ctx), B2, B2
    VPXOR (16*3)(ctx), B3, B3

    VMOVDQU B0, (16*0)(ptx)
    VMOVDQU B1, (16*1)(ptx)
    VMOVDQU B2, (16*2)(ptx)
    VMOVDQU B3, (16*3)(ptx)

    AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)

    VPXOR (16*4)(ctx), B4, B4
    VPXOR (16*5)(ctx), B5, B5
    VPXOR (16*6)(ctx), B6, B6
    VPXOR (16*7)(ctx), B7, B7

    VMOVDQU B0, (16*0)(ptx)
    VMOVDQU B1, (16*1)(ptx)
    VMOVDQU B2, (16*2)(ptx)
    VMOVDQU B3, (16*3)(ptx)
    VMOVDQU B4, (16*4)(ptx)
    VMOVDQU B5, (16*5)(ptx)
    VMOVDQU B6, (16*6)(ptx)
@ -2187,7 +1996,7 @@ avxGcmSm4DecNibbles:
    VMOVDQU (2*16)(SP), B6
    VMOVDQU (3*16)(SP), B7

    AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
    AVX_SM4_4BLOCKS(rk, B0, B1, T1, T2, B4, B5, B6, B7)

    VMOVDQU (16*14)(pTbl), T2
    VMOVDQU (16*0)(ctx), B0
@ -2227,7 +2036,7 @@ avxGcmSm4DecSingles:
    VMOVDQU (2*16)(SP), B2
    VMOVDQU (3*16)(SP), B3

    AVX_SM4_4BLOCKS(rk, BX, B7, B6, B5, B0, B1, B2, B3)
    AVX_SM4_4BLOCKS(rk, B7, B6, B5, B4, B0, B1, B2, B3)
    VMOVDQU B0, (16*4)(SP)
    VMOVDQU B1, (16*5)(SP)
    VMOVDQU B2, (16*6)(SP)
@ -2328,13 +2137,6 @@ avx2GcmSm4DecOctetsLoop:
    VMOVDQU (2*32)(SP), DWB2
    VMOVDQU (3*32)(SP), DWB3

    VBROADCASTI128 flip_mask<>(SB), XDWTMP0
    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB XDWTMP0, DWB0, DWB0
    VPSHUFB XDWTMP0, DWB1, DWB1
    VPSHUFB XDWTMP0, DWB2, DWB2
    VPSHUFB XDWTMP0, DWB3, DWB3

    VMOVDQU (16*0)(ctx), T0
    VPSHUFB BSWAP, T0, T0
    VPXOR ACC0, T0, T0
@ -2348,20 +2150,18 @@ avx2GcmSm4DecOctetsLoop:
    VPCLMULQDQ $0x00, T0, ACC1, ACC0
    VPCLMULQDQ $0x11, T0, ACC1, ACC1

    VBROADCASTI128 flip_mask<>(SB), XDWTMP0
    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB XDWTMP0, DWB0, DWB0
    VPSHUFB XDWTMP0, DWB1, DWB1
    VPSHUFB XDWTMP0, DWB2, DWB2
    VPSHUFB XDWTMP0, DWB3, DWB3

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
    XORL BX, BX
    VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

avx2GcmSm4Dec8Loop2:
    AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
    AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
    AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
    AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Dec8Loop2
    AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@ -2374,8 +2174,8 @@ avx2GcmSm4Dec8Loop2:

    VMOVDQU (32*0)(ctx), XDWTMP0
    VPXOR XDWTMP0, DWB0, DWB0
    VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
    VEXTRACTI128 $1, XDWTMP0, T0
    VPSHUFB BSWAP, T0, T0
    internalAvxDecMulRound(1)
    increment(0)

@ -2436,7 +2236,6 @@ avx2GcmSm4DecEndOctets:
    SUBQ $4, aluCTR

avx2GcmSm4DecNibbles:
    VMOVDQU flip_mask<>(SB), B7 // DO NOT CHANGE B7
    CMPQ ptxLen, $64
    JBE avx2GcmSm4DecSingles
    SUBQ $64, ptxLen
@ -2446,31 +2245,7 @@ avx2GcmSm4DecNibbles:
    VMOVDQU (2*16)(SP), B2
    VMOVDQU (3*16)(SP), B3

    VPSHUFB B7, B0, B0
    VPSHUFB B7, B1, B1
    VPSHUFB B7, B2, B2
    VPSHUFB B7, B3, B3

    TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
    XORL BX, BX
    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Dec4Loop2:
    AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
    AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
    AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
    AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Dec4Loop2

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
    VPSHUFB BSWAP, B0, B4
    VPSHUFB BSWAP, B1, B1
    VPSHUFB BSWAP, B2, B2
    VPSHUFB BSWAP, B3, B3
    AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)

    VMOVDQU (16*14)(pTbl), T2
    VMOVDQU (16*0)(ctx), B0
@ -2511,32 +2286,7 @@ avx2GcmSm4DecSingles:
    VMOVDQU (2*16)(SP), B2
    VMOVDQU (3*16)(SP), B3

    VPSHUFB B7, B0, B0
    VPSHUFB B7, B1, B1
    VPSHUFB B7, B2, B2
    VPSHUFB B7, B3, B3

    TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)

    XORL BX, BX
    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Dec4Loop1:
    AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
    AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
    AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
    AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Dec4Loop1

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
    VPSHUFB BSWAP, B0, B0
    VPSHUFB BSWAP, B1, B1
    VPSHUFB BSWAP, B2, B2
    VPSHUFB BSWAP, B3, B3
    AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)

    VMOVDQU B0, (16*4)(SP)
    VMOVDQU B1, (16*5)(SP)
@ -449,36 +449,24 @@ encOctetsLoop:

    // encryption first 4 blocks
    PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
    EOR R13, R13
    MOVD rkSave, rk

encOctetsEnc4Blocks1:
    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
encOctetsEnc8Blocks:
    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $8, R13
    BNE encOctetsEnc4Blocks1
    BNE encOctetsEnc8Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
    // encryption second 4 blocks
    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
    MOVD rkSave, rk

encOctetsEnc4Blocks2:
    SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
    BNE encOctetsEnc4Blocks2
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
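On arm64 the widening happens at the round level: the two encOctetsEnc4Blocks loops collapse into one encOctetsEnc8Blocks loop in which every SM4_8BLOCKS_ROUND advances eight block states per round key, so the key schedule is walked once instead of twice. An illustrative Go sketch of that change in scheduling, with hypothetical helper types rather than the package's API:

package widensketch

// roundFunc applies one SM4 round with key k to a 4-word block state.
type roundFunc func(k uint32, state *[4]uint32)

// rounds8Wide walks the 32 round keys once, updating all eight block states
// at each step, which is the effect of the 8-block round macro on the NEON
// register file.
func rounds8Wide(rk [32]uint32, states *[8][4]uint32, round roundFunc) {
    for _, k := range rk {
        for i := range states {
            round(k, &states[i]) // independent states: more work per round-key load
        }
    }
}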
@ -741,41 +729,28 @@ decOctetsLoop:

    // encryption first 4 blocks
    PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
    EOR R13, R13
    MOVD rkSave, rk

decOctetsEnc4Blocks1:
    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
decOctetsEnc8Blocks:
    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $8, R13
    BNE decOctetsEnc4Blocks1
    BNE decOctetsEnc8Blocks
    VREV32 B0.B16, T1.B16
    VREV32 B1.B16, T2.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(T1, T2, B2, B3, K0, K1, K2, K3)

    // encryption second 4 blocks
    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
    MOVD rkSave, rk

decOctetsEnc4Blocks2:
    SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
    BNE decOctetsEnc4Blocks2
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
    VREV32 B7.B16, B7.B16
    VREV32 B7.B16, B7.B16
    TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]