sm4: improve throughput #146

Sun Yimin authored on 2023-08-03 15:17:01 +08:00, committed by GitHub
parent fe84641340
commit 4bc3c5d27b
12 changed files with 924 additions and 582 deletions

View File

@ -25,6 +25,12 @@ func BenchmarkSM4EBCEncrypt1K(b *testing.B) {
benchmarkEBCEncrypt1K(b, c)
}
func BenchmarkAES128EBCEncrypt1K(b *testing.B) {
var key [16]byte
c, _ := aes.NewCipher(key[:])
benchmarkEBCEncrypt1K(b, c)
}
func benchmarkCBCEncrypt1K(b *testing.B, block cipher.Block) {
buf := make([]byte, 1024)
b.SetBytes(int64(len(buf)))

View File

@ -2,6 +2,8 @@ package cipher_test
import (
"bytes"
"crypto/rand"
"io"
"testing"
"github.com/emmansun/gmsm/cipher"
@ -63,6 +65,11 @@ var ecbSM4Tests = []struct {
[]byte("0123456789ABCDEF"),
[]byte("exampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintext"),
},
{
"18 same blocks",
[]byte("0123456789ABCDEF"),
[]byte("exampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintext"),
},
}
func TestECBBasic(t *testing.T) {
@ -80,11 +87,30 @@ func TestECBBasic(t *testing.T) {
decrypter := cipher.NewECBDecrypter(c)
decrypter.CryptBlocks(plaintext, ciphertext)
if !bytes.Equal(test.in, plaintext) {
t.Errorf("%s: ECB encrypt/decrypt failed", test.name)
t.Errorf("%s: ECB encrypt/decrypt failed, %s", test.name, string(plaintext))
}
}
}
func TestECBRandom(t *testing.T) {
key := []byte("0123456789ABCDEF")
plaintext := make([]byte, 448)
ciphertext := make([]byte, 448)
io.ReadFull(rand.Reader, plaintext)
c, err := sm4.NewCipher(key)
if err != nil {
t.Fatal(err)
}
encrypter := cipher.NewECBEncrypter(c)
encrypter.CryptBlocks(ciphertext, plaintext)
result := make([]byte, 448)
decrypter := cipher.NewECBDecrypter(c)
decrypter.CryptBlocks(result, ciphertext)
if !bytes.Equal(result, plaintext) {
t.Error("ECB encrypt/decrypt failed")
}
}
func shouldPanic(t *testing.T, f func()) {
t.Helper()
defer func() { _ = recover() }()

View File

@ -15,7 +15,7 @@ GLOBL nibble_mask<>(SB), 8, $16
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), 8, $16
// Affine transform 1 (low and high nibbles)
@ -24,7 +24,7 @@ DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), 8, $16
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), 8, $16
// Affine transform 2 (low and high nibbles)
@ -38,21 +38,46 @@ GLOBL m2_high<>(SB), 8, $16
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), 8, $16
DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), 8, $16
DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), 8, $16
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), 8, $16
// inverse shift rows
DATA inverse_shift_rows256<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows256<>+0x08(SB)/8, $0x0306090C0F020508
DATA inverse_shift_rows256<>+0x10(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows256<>+0x18(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows256<>(SB), 8, $32
DATA r08_mask256<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask256<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r08_mask256<>+0x10(SB)/8, $0x0605040702010003
DATA r08_mask256<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask256<>(SB), 8, $32
DATA r16_mask256<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask256<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA r16_mask256<>+0x10(SB)/8, $0x0504070601000302
DATA r16_mask256<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask256<>(SB), 8, $32
DATA r24_mask256<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask256<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
DATA r24_mask256<>+0x10(SB)/8, $0x0407060500030201
DATA r24_mask256<>+0x18(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask256<>(SB), 8, $32
// Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance!
// input: from high to low
// r0 = [w3, w2, w1, w0]
@ -189,7 +214,7 @@ GLOBL fk_mask<>(SB), 8, $16
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
MOVL (index * 4)(RK)(IND*1), x; \
PSHUFD $0, x, x; \
PXOR t1, x; \
PXOR t2, x; \
@ -197,29 +222,128 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_TAO_L1(x, y, z); \
PXOR x, t0
#define SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3) \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0 \
#define SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3) \
PSHUFD $0, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \
PSHUFD $0x55, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \
PSHUFD $0xAA, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \
PSHUFD $0xFF, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \
#define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \
PSHUFB flip_mask<>(SB), t1; \
PSHUFB flip_mask<>(SB), t2; \
PSHUFB flip_mask<>(SB), t3; \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
MOVOU (0*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
MOVOU (1*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
MOVOU (2*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
MOVOU (3*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
MOVOU (4*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
MOVOU (5*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
MOVOU (6*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
MOVOU (7*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
PSHUFB bswap_mask<>(SB), t3; \
PSHUFB bswap_mask<>(SB), t2; \
PSHUFB bswap_mask<>(SB), t1; \
PSHUFB bswap_mask<>(SB), t0
#define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
PSHUFD $0, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \
PSHUFD $0, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t4, t5, t6, t7); \
PSHUFD $0x55, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \
PSHUFD $0x55, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t5, t6, t7, t4); \
PSHUFD $0xAA, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \
PSHUFD $0xAA, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t6, t7, t4, t5); \
PSHUFD $0xFF, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \
PSHUFD $0xFF, rk128, x; \
SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \
#define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
PSHUFB flip_mask<>(SB), t0; \
PSHUFB flip_mask<>(SB), t1; \
PSHUFB flip_mask<>(SB), t2; \
PSHUFB flip_mask<>(SB), t3; \
PSHUFB flip_mask<>(SB), t4; \
PSHUFB flip_mask<>(SB), t5; \
PSHUFB flip_mask<>(SB), t6; \
PSHUFB flip_mask<>(SB), t7; \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
MOVOU (0*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
MOVOU (1*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
MOVOU (2*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
MOVOU (3*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
MOVOU (4*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
MOVOU (5*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
MOVOU (6*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
MOVOU (7*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
PSHUFB bswap_mask<>(SB), t3; \
PSHUFB bswap_mask<>(SB), t2; \
PSHUFB bswap_mask<>(SB), t1; \
PSHUFB bswap_mask<>(SB), t0; \
PSHUFB bswap_mask<>(SB), t7; \
PSHUFB bswap_mask<>(SB), t6; \
PSHUFB bswap_mask<>(SB), t5; \
PSHUFB bswap_mask<>(SB), t4
// SM4 sbox function, AVX version
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - X_NIBBLE_MASK: 128 bits register stored nibble mask, should be loaded earlier.
// - tmp: 128 bits temp register
#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
VPAND X_NIBBLE_MASK, x, tmp; \
#define AVX_SM4_SBOX(x, y, tmp) \
VPAND nibble_mask<>(SB), x, tmp; \
VMOVDQU m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VPAND nibble_mask<>(SB), x, x; \
VMOVDQU m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VMOVDQU inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VAESENCLAST X_NIBBLE_MASK, x, x; \
VPANDN X_NIBBLE_MASK, x, tmp; \
VPSHUFB inverse_shift_rows<>(SB), x, x; \
VAESENCLAST nibble_mask<>(SB), x, x; \
VPANDN nibble_mask<>(SB), x, tmp; \
VMOVDQU m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VPAND nibble_mask<>(SB), x, x; \
VMOVDQU m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
@ -228,21 +352,17 @@ GLOBL fk_mask<>(SB), 8, $16
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier.
// - tmp: 128 bits temp register
#define AVX_SM4_TAO_L1(x, y, xNibbleMask, tmp) \
AVX_SM4_SBOX(x, y, xNibbleMask, tmp); \
VMOVDQU r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
#define AVX_SM4_TAO_L1(x, y, tmp) \
AVX_SM4_SBOX(x, y, tmp); \
VPSHUFB r08_mask<>(SB), x, y; \
VPXOR x, y, y; \
VMOVDQU r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPSHUFB r16_mask<>(SB), x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VMOVDQU r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPSHUFB r24_mask<>(SB), x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x
@ -280,9 +400,115 @@ GLOBL fk_mask<>(SB), 8, $16
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
AVX_SM4_TAO_L1(x, y, tmp); \
VPXOR x, t0, t0
#define SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3) \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, z); \
VPXOR x, t0, t0 \
#define SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3) \
VPSHUFD $0, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \
VPSHUFD $0x55, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \
VPSHUFD $0xAA, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \
VPSHUFD $0xFF, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
#define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
VPSHUFB flip_mask<>(SB), t0, t0 \
VPSHUFB flip_mask<>(SB), t1, t1 \
VPSHUFB flip_mask<>(SB), t2, t2 \
VPSHUFB flip_mask<>(SB), t3, t3 \
; \
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
VMOVDQU (0*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
VMOVDQU (1*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
VMOVDQU (2*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
VMOVDQU (3*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
VMOVDQU (4*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
VMOVDQU (5*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
VMOVDQU (6*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
VMOVDQU (7*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
; \ // Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
VPSHUFB bswap_mask<>(SB), t0, t0 \
VPSHUFB bswap_mask<>(SB), t1, t1 \
VPSHUFB bswap_mask<>(SB), t2, t2 \
VPSHUFB bswap_mask<>(SB), t3, t3 \
#define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
VPSHUFD $0, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \
VPSHUFD $0, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t4, t5, t6, t7); \
VPSHUFD $0x55, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \
VPSHUFD $0x55, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t5, t6, t7, t4); \
VPSHUFD $0xAA, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \
VPSHUFD $0xAA, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t6, t7, t4, t5); \
VPSHUFD $0xFF, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
VPSHUFD $0xFF, rk128, x; \
SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \
#define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
VPSHUFB flip_mask<>(SB), t0, t0 \
VPSHUFB flip_mask<>(SB), t1, t1 \
VPSHUFB flip_mask<>(SB), t2, t2 \
VPSHUFB flip_mask<>(SB), t3, t3 \
VPSHUFB flip_mask<>(SB), t4, t4 \
VPSHUFB flip_mask<>(SB), t5, t5 \
VPSHUFB flip_mask<>(SB), t6, t6 \
VPSHUFB flip_mask<>(SB), t7, t7 \
; \
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
VMOVDQU (0*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
VMOVDQU (1*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
VMOVDQU (2*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
VMOVDQU (3*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
VMOVDQU (4*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
VMOVDQU (5*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
VMOVDQU (6*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
VMOVDQU (7*16)(RK), rk128; \
SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
; \ // Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
VPSHUFB bswap_mask<>(SB), t0, t0 \
VPSHUFB bswap_mask<>(SB), t1, t1 \
VPSHUFB bswap_mask<>(SB), t2, t2 \
VPSHUFB bswap_mask<>(SB), t3, t3 \
VPSHUFB bswap_mask<>(SB), t4, t4 \
VPSHUFB bswap_mask<>(SB), t5, t5 \
VPSHUFB bswap_mask<>(SB), t6, t6 \
VPSHUFB bswap_mask<>(SB), t7, t7 \
// SM4 sbox function, AVX2 version
// parameters:
// - x: 256 bits register as sbox input/output data
@ -301,8 +527,7 @@ GLOBL fk_mask<>(SB), 8, $16
VBROADCASTI128 m1_high<>(SB), z; \
VPSHUFB x, z, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverse_shift_rows<>(SB), z; \
VPSHUFB z, x, x; \
VPSHUFB inverse_shift_rows256<>(SB), x, x; \
VEXTRACTI128 $1, x, yw \
VAESENCLAST xNibbleMask, xw, xw; \
VAESENCLAST xNibbleMask, yw, yw; \
@ -327,17 +552,14 @@ GLOBL fk_mask<>(SB), 8, $16
// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
VBROADCASTI128 r08_mask<>(SB), z; \
VPSHUFB z, x, y; \
VPSHUFB r08_mask256<>(SB), x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16_mask<>(SB), z; \
VPSHUFB z, x, z; \
VPSHUFB r16_mask256<>(SB), x, z; \
VPXOR z, y, y; \
VPSLLD $2, y, z; \
VPSRLD $30, y, y; \
VPXOR z, y, y; \
VBROADCASTI128 r24_mask<>(SB), z; \
VPSHUFB z, x, z; \
VPSHUFB r24_mask256<>(SB), x, z; \
VPXOR y, x, x; \
VPXOR x, z, x
@ -359,6 +581,24 @@ GLOBL fk_mask<>(SB), 8, $16
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0
// SM4 round function, AVX2 version, handles 256 bits (8 blocks)
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - RK: round key register
// - x: 256 bits temp register, MUST use XDWORD!
// - y: 256 bits temp register, MUST use YDWORD!
// - xw, yw: 128 bits registers aliasing the low 128 bits of x and y
// - tmp: 256 bits temp register
// - t0: 256 bits register for data as result
// - t1: 256 bits register for data
// - t2: 256 bits register for data
// - t3: 256 bits register for data
#define AVX2_SM4_ROUND2(index, RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0
// SM4 round function, AVX version, handle 128 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
@ -371,9 +611,100 @@ GLOBL fk_mask<>(SB), 8, $16
// - t3: 128 bits register for data
#define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPSHUFD $0, x, x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
AVX_SM4_TAO_L1(x, y, tmp); \
VPXOR x, t0, t0
#define AVX2_SM4_8BLOCKS(RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
AVX2_SM4_ROUND2(0, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
AVX2_SM4_ROUND2(1, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
AVX2_SM4_ROUND2(2, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
AVX2_SM4_ROUND2(3, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
AVX2_SM4_ROUND2(4, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
AVX2_SM4_ROUND2(5, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
AVX2_SM4_ROUND2(6, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
AVX2_SM4_ROUND2(7, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
AVX2_SM4_ROUND2(8, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
AVX2_SM4_ROUND2(9, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
AVX2_SM4_ROUND2(10, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
AVX2_SM4_ROUND2(11, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
AVX2_SM4_ROUND2(12, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
AVX2_SM4_ROUND2(13, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
AVX2_SM4_ROUND2(14, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
AVX2_SM4_ROUND2(15, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
AVX2_SM4_ROUND2(16, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
AVX2_SM4_ROUND2(17, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
AVX2_SM4_ROUND2(18, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
AVX2_SM4_ROUND2(19, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
AVX2_SM4_ROUND2(20, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
AVX2_SM4_ROUND2(21, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
AVX2_SM4_ROUND2(22, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
AVX2_SM4_ROUND2(23, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
AVX2_SM4_ROUND2(24, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
AVX2_SM4_ROUND2(25, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
AVX2_SM4_ROUND2(26, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
AVX2_SM4_ROUND2(27, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
AVX2_SM4_ROUND2(28, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
AVX2_SM4_ROUND2(29, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
AVX2_SM4_ROUND2(30, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
AVX2_SM4_ROUND2(31, RK, x, y, xw, yw, tmp, t3, t0, t1, t2)
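The 32 unrolled invocations above encode a fixed rotation of register roles. In scalar Go the same schedule would look like the sketch below; round is a hypothetical stand-in for the per-round operation t0 ^= tao_l1(t1^t2^t3^xk) (a scalar version of it is sketched after the arm64 round macro further down).
// applyRounds shows the role rotation the unrolled macros encode: each round
// writes into the register that will serve as t0 again four rounds later, so
// after every group of four rounds the roles return to (t0, t1, t2, t3).
func applyRounds(rk [32]uint32, t0, t1, t2, t3 uint32,
	round func(rk, a, b, c, d uint32) uint32) (uint32, uint32, uint32, uint32) {
	for i := 0; i < 32; i += 4 {
		t0 = round(rk[i+0], t0, t1, t2, t3)
		t1 = round(rk[i+1], t1, t2, t3, t0)
		t2 = round(rk[i+2], t2, t3, t0, t1)
		t3 = round(rk[i+3], t3, t0, t1, t2)
	}
	return t0, t1, t2, t3
}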
// SM4 round function, AVX2 version, handles 2 x 256 bits (16 blocks)
// t0 ^= tao_l1(t1^t2^t3^xk); t4 ^= tao_l1(t5^t6^t7^xk)
// parameters:
// - index: round key index immediate number
// - RK: round key register
// - x: 256 bits temp register, MUST use XDWORD!
// - y: 256 bits temp register, MUST use YDWORD!
// - xw, yw: 128 bits registers aliasing the low 128 bits of x and y
// - tmp: 256 bits temp register
// - tmp1: 256 bits temp register holding the broadcast round key
// - t0: 256 bits register for data as result of the first 8 blocks
// - t1, t2, t3: 256 bits registers for data of the first 8 blocks
// - t4: 256 bits register for data as result of the second 8 blocks
// - t5, t6, t7: 256 bits registers for data of the second 8 blocks
#define AVX2_SM4_16BLOCKS_ROUND(index, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
VPBROADCASTD (index * 4)(RK), tmp1; \
VPXOR t1, tmp1, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0; \
;\
VPXOR t5, tmp1, x; \
VPXOR t6, x, x; \
VPXOR t7, x, x; \
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t4, t4; \
#define AVX2_SM4_16BLOCKS(RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
AVX2_SM4_16BLOCKS_ROUND(0, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
AVX2_SM4_16BLOCKS_ROUND(1, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
AVX2_SM4_16BLOCKS_ROUND(2, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
AVX2_SM4_16BLOCKS_ROUND(3, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
AVX2_SM4_16BLOCKS_ROUND(4, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
AVX2_SM4_16BLOCKS_ROUND(5, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
AVX2_SM4_16BLOCKS_ROUND(6, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
AVX2_SM4_16BLOCKS_ROUND(7, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
AVX2_SM4_16BLOCKS_ROUND(8, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
AVX2_SM4_16BLOCKS_ROUND(9, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
AVX2_SM4_16BLOCKS_ROUND(10, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
AVX2_SM4_16BLOCKS_ROUND(11, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
AVX2_SM4_16BLOCKS_ROUND(12, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
AVX2_SM4_16BLOCKS_ROUND(13, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
AVX2_SM4_16BLOCKS_ROUND(14, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
AVX2_SM4_16BLOCKS_ROUND(15, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
AVX2_SM4_16BLOCKS_ROUND(16, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
AVX2_SM4_16BLOCKS_ROUND(17, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
AVX2_SM4_16BLOCKS_ROUND(18, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
AVX2_SM4_16BLOCKS_ROUND(19, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
AVX2_SM4_16BLOCKS_ROUND(20, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
AVX2_SM4_16BLOCKS_ROUND(21, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
AVX2_SM4_16BLOCKS_ROUND(22, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
AVX2_SM4_16BLOCKS_ROUND(23, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
AVX2_SM4_16BLOCKS_ROUND(24, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
AVX2_SM4_16BLOCKS_ROUND(25, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
AVX2_SM4_16BLOCKS_ROUND(26, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
AVX2_SM4_16BLOCKS_ROUND(27, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
AVX2_SM4_16BLOCKS_ROUND(28, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
AVX2_SM4_16BLOCKS_ROUND(29, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
AVX2_SM4_16BLOCKS_ROUND(30, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
AVX2_SM4_16BLOCKS_ROUND(31, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6)

View File

@ -171,3 +171,30 @@ GLOBL fk_mask<>(SB), (16+8), $16
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y, z); \
VEOR x.B16, t0.B16, t0.B16
// SM4 round function, handles 8 blocks (two groups of 4)
// t0 ^= tao_l1(t1^t2^t3^xk); t4 ^= tao_l1(t5^t6^t7^xk)
// parameters:
// - RK: round key register
// - tmp32: temp 32/64 bits register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - z: 128 bits temp register
// - tmp: 128 bits temp register holding the broadcast round key
// - t0: 128 bits register for data as result of the first group
// - t1, t2, t3: 128 bits registers for data of the first group
// - t4: 128 bits register for data as result of the second group
// - t5, t6, t7: 128 bits registers for data of the second group
#define SM4_8BLOCKS_ROUND(RK, tmp32, x, y, z, tmp, t0, t1, t2, t3, t4, t5, t6, t7) \
MOVW.P 4(RK), tmp32; \
VMOV tmp32, tmp.S4; \
VEOR t1.B16, tmp.B16, x.B16; \
VEOR t2.B16, x.B16, x.B16; \
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y, z); \
VEOR x.B16, t0.B16, t0.B16; \
; \
VEOR t5.B16, tmp.B16, x.B16; \
VEOR t6.B16, x.B16, x.B16; \
VEOR t7.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y, z); \
VEOR x.B16, t4.B16, t4.B16
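For reference, the relation t0 ^= tao_l1(t1^t2^t3^xk) that all of these round macros implement is the standard SM4 round: a byte-wise S-box substitution followed by a fixed-rotation linear mix. A minimal scalar Go sketch is below; the sbox table is left as a placeholder (assumed to be the standard SM4 S-box) rather than reproduced here.
package sm4sketch

import "math/bits"

// sbox would hold the 256 standard SM4 S-box values; they are elided in this sketch.
var sbox [256]byte

// oneRound computes t0 ^= L(tau(t1 ^ t2 ^ t3 ^ rk)) for a single 32-bit lane,
// the scalar equivalent of what the vector macros do across 4/8/16 lanes at once.
func oneRound(rk, t0, t1, t2, t3 uint32) uint32 {
	x := t1 ^ t2 ^ t3 ^ rk
	// tau: byte-wise S-box substitution.
	x = uint32(sbox[x>>24])<<24 | uint32(sbox[(x>>16)&0xff])<<16 |
		uint32(sbox[(x>>8)&0xff])<<8 | uint32(sbox[x&0xff])
	// L: linear diffusion via fixed left rotations.
	x ^= bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
		bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
	return t0 ^ x
}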

View File

@ -4,15 +4,15 @@
#include "textflag.h"
#define x X0
#define y X1
#define t0 X2
#define t1 X3
#define t2 X4
#define t3 X5
#define t0 X0
#define t1 X1
#define t2 X2
#define t3 X3
#define XTMP6 X6
#define XTMP7 X7
#define x X8
#define y X9
#define XTMP6 X10
#define XTMP7 X11
#include "aesni_macros_amd64.s"
@ -48,7 +48,7 @@
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(BX)(CX*1), x; \
MOVL (index * 4)(BX)(CX*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
@ -68,6 +68,16 @@
#define XWORD2 X6
#define XWORD3 X7
#define XDWORD4 Y10
#define XDWORD5 Y11
#define XDWORD6 Y12
#define XDWORD7 Y14
#define XWORD4 X10
#define XWORD5 X11
#define XWORD6 X12
#define XWORD7 X14
#define XDWTMP0 Y0
#define XDWTMP1 Y1
#define XDWTMP2 Y2
@ -133,91 +143,93 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
JE avx
non_avx2_start:
MOVOU 0(DX), t0
MOVOU 16(DX), t1
MOVOU 32(DX), t2
MOVOU 48(DX), t3
PSHUFB flip_mask<>(SB), t0
PSHUFB flip_mask<>(SB), t1
PSHUFB flip_mask<>(SB), t2
PSHUFB flip_mask<>(SB), t3
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)
CMPQ DI, $128
JEQ sse_8blocks
XORL CX, CX
MOVOU 0(DX), XWORD0
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
MOVOU 48(DX), XWORD3
loop:
SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
ADDL $16, CX
CMPL CX, $4*32
JB loop
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);
PSHUFB bswap_mask<>(SB), t3
PSHUFB bswap_mask<>(SB), t2
PSHUFB bswap_mask<>(SB), t1
PSHUFB bswap_mask<>(SB), t0
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
MOVOU t0, 0(BX)
MOVOU t1, 16(BX)
MOVOU t2, 32(BX)
MOVOU t3, 48(BX)
MOVOU XWORD0, 0(BX)
MOVOU XWORD1, 16(BX)
MOVOU XWORD2, 32(BX)
MOVOU XWORD3, 48(BX)
RET
sse_8blocks:
MOVOU 0(DX), XWORD0
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
MOVOU 48(DX), XWORD3
MOVOU 64(DX), XWORD4
MOVOU 80(DX), XWORD5
MOVOU 96(DX), XWORD6
MOVOU 112(DX), XWORD7
SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
MOVOU XWORD0, 0(BX)
MOVOU XWORD1, 16(BX)
MOVOU XWORD2, 32(BX)
MOVOU XWORD3, 48(BX)
MOVOU XWORD4, 64(BX)
MOVOU XWORD5, 80(BX)
MOVOU XWORD6, 96(BX)
MOVOU XWORD7, 112(BX)
done_sm4:
RET
avx:
CMPQ DI, $128
JEQ avx_8blocks
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
XORL CX, CX
avx_loop:
AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
ADDL $16, CX
CMPL CX, $4*32
JB avx_loop
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
RET
avx_8blocks:
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
VMOVDQU 64(DX), XWORD4
VMOVDQU 80(DX), XWORD5
VMOVDQU 96(DX), XWORD6
VMOVDQU 112(DX), XWORD7
AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
VMOVDQU XWORD4, 64(BX)
VMOVDQU XWORD5, 80(BX)
VMOVDQU XWORD6, 96(BX)
VMOVDQU XWORD7, 112(BX)
avx_done_sm4:
RET
avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
CMPQ DI, $64
JBE avx2_4blocks
CMPQ DI, $256
JEQ avx2_16blocks
avx2_8blocks:
VMOVDQU 0(DX), XDWORD0
@ -235,17 +247,7 @@ avx2_8blocks:
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
XORL CX, CX
avx2_loop:
AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
ADDL $16, CX
CMPL CX, $4*32
JB avx2_loop
AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
@ -260,49 +262,60 @@ avx2_loop:
VMOVDQU XDWORD1, 32(BX)
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
JMP avx2_sm4_done
avx2_4blocks:
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
VZEROUPPER
RET
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
avx2_16blocks:
VMOVDQU 0(DX), XDWORD0
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
VMOVDQU 128(DX), XDWORD4
VMOVDQU 160(DX), XDWORD5
VMOVDQU 192(DX), XDWORD6
VMOVDQU 224(DX), XDWORD7
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
XORL CX, CX
avx2_4blocks_loop:
AVX2_SM4_ROUND_4BLOCKS(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
AVX2_SM4_ROUND_4BLOCKS(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
AVX2_SM4_ROUND_4BLOCKS(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
AVX2_SM4_ROUND_4BLOCKS(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
ADDL $16, CX
CMPL CX, $4*32
JB avx2_4blocks_loop
AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
VMOVDQU XDWORD0, 0(BX)
VMOVDQU XDWORD1, 32(BX)
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
VMOVDQU XDWORD4, 128(BX)
VMOVDQU XDWORD5, 160(BX)
VMOVDQU XDWORD6, 192(BX)
VMOVDQU XDWORD7, 224(BX)
avx2_sm4_done:
VZEROUPPER

View File

@ -9,6 +9,10 @@
#define t1 V3
#define t2 V4
#define t3 V5
#define t4 V8
#define t5 V9
#define t6 V10
#define t7 V11
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
@ -184,6 +188,9 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
CMP $1, R11
BEQ sm4niblocks
CMP $128, R12
BEQ double_enc
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
@ -215,6 +222,51 @@ encryptBlocksLoop:
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
RET
double_enc:
VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VREV32 t4.B16, t4.B16
VREV32 t5.B16, t5.B16
VREV32 t6.B16, t6.B16
VREV32 t7.B16, t7.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
load_global_data_2()
VEOR ZERO.B16, ZERO.B16, ZERO.B16
EOR R0, R0
encrypt8BlocksLoop:
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
ADD $16, R0
CMP $128, R0
BNE encrypt8BlocksLoop
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VREV32 t4.B16, t4.B16
VREV32 t5.B16, t5.B16
VREV32 t6.B16, t6.B16
VREV32 t7.B16, t7.B16
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)
RET
sm4niblocks:
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]

View File

@ -74,11 +74,18 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
// Copy the last block of ciphertext in preparation as the new iv.
copy(x.tmp, src[end-BlockSize:end])
start := end - x.b.blocksSize
var temp []byte = make([]byte, x.b.blocksSize)
var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
decKeyPtr := &x.b.dec[0]
start := end - 2*x.b.blocksSize
for start > 0 {
decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
end = start
start -= 2*x.b.blocksSize
}
start = end - x.b.blocksSize
for start > 0 {
decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
end = start
@ -86,6 +93,8 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
}
// Handle the remaining leading blocks
var temp []byte = make([]byte, x.b.blocksSize)
var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
copy(batchSrc, x.iv)
copy(batchSrc[BlockSize:], src[:end])
decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0])
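The traversal order of the new CBC decrypt path is easier to see in isolation: walk the ciphertext backwards in double-width batches while two full batches still fit, then in single-width batches, and finally decrypt the leading remainder using the IV as its previous block. The sketch below captures only that ordering; decryptBatch, blockSize and batch are illustrative names, and the real code additionally pads the first call to a full batch (the temp/batchSrc buffers above) because the assembly routine expects fixed-size inputs.
// decryptCBCBatched walks src from the end toward the front so that
// src[start-blockSize:start] is always available as the previous-ciphertext
// block needed by CBC, taking the widest batches first.
func decryptCBCBatched(dst, src, iv []byte, blockSize, batch int,
	decryptBatch func(dst, src, prev []byte)) {
	end := len(src)
	// Double-width batches while at least one block remains in front of them.
	for start := end - 2*batch; start > 0; start -= 2 * batch {
		decryptBatch(dst[start:end], src[start:end], src[start-blockSize:start])
		end = start
	}
	// Single-width batches for what is left, except the leading remainder.
	for start := end - batch; start > 0; start -= batch {
		decryptBatch(dst[start:end], src[start:end], src[start-blockSize:start])
		end = start
	}
	// Leading remainder: no previous ciphertext block exists, so use the IV.
	decryptBatch(dst[:end], src[:end], iv)
}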

View File

@ -85,6 +85,11 @@ done_sm4:
#define XDWORD2 Y6
#define XDWORD3 Y7
#define XDWORD4 Y10
#define XDWORD5 Y11
#define XDWORD6 Y12
#define XDWORD7 Y14
#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2
@ -94,6 +99,11 @@ done_sm4:
#define XWORD2 X6
#define XWORD3 X7
#define XWORD4 X10
#define XWORD5 X11
#define XWORD6 X12
#define XWORD7 X14
#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3
@ -111,6 +121,7 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX
MOVQ src+32(FP), DX
MOVQ src_len+40(FP), DI
MOVQ iv+56(FP), SI
CMPB ·useAVX2(SB), $1
@ -120,84 +131,71 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
JE avx
non_avx2_start:
MOVOU 0(DX), t0
MOVOU 16(DX), t1
MOVOU 32(DX), t2
MOVOU 48(DX), t3
PSHUFB flip_mask<>(SB), t0
PSHUFB flip_mask<>(SB), t1
PSHUFB flip_mask<>(SB), t2
PSHUFB flip_mask<>(SB), t3
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)
CMPQ DI, $128
JEQ sse_8blocks
XORL CX, CX
MOVOU 0(DX), XWORD0
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
MOVOU 48(DX), XWORD3
loop:
SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
ADDL $16, CX
CMPL CX, $4*32
JB loop
PXOR 0(SI), XWORD0
PXOR 16(SI), XWORD1
PXOR 32(SI), XWORD2
PXOR 48(SI), XWORD3
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);
PSHUFB bswap_mask<>(SB), t3
PSHUFB bswap_mask<>(SB), t2
PSHUFB bswap_mask<>(SB), t1
PSHUFB bswap_mask<>(SB), t0
MOVUPS XWORD0, 0(BX)
MOVUPS XWORD1, 16(BX)
MOVUPS XWORD2, 32(BX)
MOVUPS XWORD3, 48(BX)
PXOR 0(SI), t0
PXOR 16(SI), t1
PXOR 32(SI), t2
PXOR 48(SI), t3
RET
MOVUPS t0, 0(BX)
MOVUPS t1, 16(BX)
MOVUPS t2, 32(BX)
MOVUPS t3, 48(BX)
sse_8blocks:
MOVOU 0(DX), XWORD0
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
MOVOU 48(DX), XWORD3
MOVOU 64(DX), XWORD4
MOVOU 80(DX), XWORD5
MOVOU 96(DX), XWORD6
MOVOU 112(DX), XWORD7
SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
PXOR 0(SI), XWORD0
PXOR 16(SI), XWORD1
PXOR 32(SI), XWORD2
PXOR 48(SI), XWORD3
PXOR 64(SI), XWORD4
PXOR 80(SI), XWORD5
PXOR 96(SI), XWORD6
PXOR 112(SI), XWORD7
MOVOU XWORD0, 0(BX)
MOVOU XWORD1, 16(BX)
MOVOU XWORD2, 32(BX)
MOVOU XWORD3, 48(BX)
MOVOU XWORD4, 64(BX)
MOVOU XWORD5, 80(BX)
MOVOU XWORD6, 96(BX)
MOVOU XWORD7, 112(BX)
done_sm4:
RET
avx:
CMPQ DI, $128
JEQ avx_8blocks
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
XORL CX, CX
avx_loop:
AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
ADDL $16, CX
CMPL CX, $4*32
JB avx_loop
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR 16(SI), XWORD1, XWORD1
@ -208,11 +206,45 @@ avx_loop:
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
RET
avx_8blocks:
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
VMOVDQU 64(DX), XWORD4
VMOVDQU 80(DX), XWORD5
VMOVDQU 96(DX), XWORD6
VMOVDQU 112(DX), XWORD7
AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR 16(SI), XWORD1, XWORD1
VPXOR 32(SI), XWORD2, XWORD2
VPXOR 48(SI), XWORD3, XWORD3
VPXOR 64(SI), XWORD4, XWORD4
VPXOR 80(SI), XWORD5, XWORD5
VPXOR 96(SI), XWORD6, XWORD6
VPXOR 112(SI), XWORD7, XWORD7
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
VMOVDQU XWORD4, 64(BX)
VMOVDQU XWORD5, 80(BX)
VMOVDQU XWORD6, 96(BX)
VMOVDQU XWORD7, 112(BX)
avx_sm4_done:
RET
avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
CMPQ DI, $256
JEQ avx2_16blocks
avx2_8blocks:
VMOVDQU 0(DX), XDWORD0
@ -230,17 +262,7 @@ avx2_8blocks:
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
XORL CX, CX
avx2_loop:
AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
ADDL $16, CX
CMPL CX, $4*32
JB avx2_loop
AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
@ -261,6 +283,68 @@ avx2_loop:
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
VZEROUPPER
RET
avx2_16blocks:
VMOVDQU 0(DX), XDWORD0
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
VMOVDQU 128(DX), XDWORD4
VMOVDQU 160(DX), XDWORD5
VMOVDQU 192(DX), XDWORD6
VMOVDQU 224(DX), XDWORD7
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
VPXOR 0(SI), XDWORD0, XDWORD0
VPXOR 32(SI), XDWORD1, XDWORD1
VPXOR 64(SI), XDWORD2, XDWORD2
VPXOR 96(SI), XDWORD3, XDWORD3
VPXOR 128(SI), XDWORD4, XDWORD4
VPXOR 160(SI), XDWORD5, XDWORD5
VPXOR 192(SI), XDWORD6, XDWORD6
VPXOR 224(SI), XDWORD7, XDWORD7
VMOVDQU XDWORD0, 0(BX)
VMOVDQU XDWORD1, 32(BX)
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
VMOVDQU XDWORD4, 128(BX)
VMOVDQU XDWORD5, 160(BX)
VMOVDQU XDWORD6, 192(BX)
VMOVDQU XDWORD7, 224(BX)
avx2_sm4_done:
VZEROUPPER

View File

@ -88,6 +88,10 @@ done_sm4:
#undef rkSave
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
@ -99,6 +103,8 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
MOVD src_len+40(FP), R12
MOVD iv+56(FP), R11
CMP $128, R12
BEQ double_dec
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
@ -135,3 +141,57 @@ encryptBlocksLoop:
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
RET
double_dec:
VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VREV32 t4.B16, t4.B16
VREV32 t5.B16, t5.B16
VREV32 t6.B16, t6.B16
VREV32 t7.B16, t7.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
VEOR ZERO.B16, ZERO.B16, ZERO.B16
EOR R0, R0
decrypt8BlocksLoop:
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
ADD $16, R0
CMP $128, R0
BNE decrypt8BlocksLoop
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VREV32 t4.B16, t4.B16
VREV32 t5.B16, t5.B16
VREV32 t6.B16, t6.B16
VREV32 t7.B16, t7.B16
VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
VEOR V6.B16, t0.B16, t0.B16
VEOR V7.B16, t1.B16, t1.B16
VEOR V8.B16, t2.B16, t2.B16
VEOR V9.B16, t3.B16, t3.B16
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
VEOR V6.B16, t4.B16, t4.B16
VEOR V7.B16, t5.B16, t5.B16
VEOR V8.B16, t6.B16, t6.B16
VEOR V9.B16, t7.B16, t7.B16
VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)
RET

View File

@ -54,6 +54,15 @@ func (x *ecb) CryptBlocks(dst, src []byte) {
if len(src) == 0 {
return
}
for len(src) >= 2*x.b.blocksSize {
if x.enc == ecbEncrypt {
x.b.EncryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
} else {
x.b.DecryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
}
src = src[2*x.b.blocksSize:]
dst = dst[2*x.b.blocksSize:]
}
for len(src) >= x.b.blocksSize {
if x.enc == ecbEncrypt {
x.b.EncryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize])
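With the double-width fast path above, callers get the best throughput when each CryptBlocks call covers a multiple of 2*blocksSize (e.g. 256 bytes for the 16-block amd64 path; the exact batch size is architecture-dependent, so treat that number as an assumption). A small usage sketch based on the constructors exercised by the tests earlier in this commit:
package main

import (
	"bytes"
	"crypto/rand"
	"fmt"
	"io"

	"github.com/emmansun/gmsm/cipher"
	"github.com/emmansun/gmsm/sm4"
)

func main() {
	key := []byte("0123456789ABCDEF")
	block, err := sm4.NewCipher(key)
	if err != nil {
		panic(err)
	}
	// 256 bytes = 16 SM4 blocks, enough to keep the widest batch busy per call.
	plaintext := make([]byte, 256)
	if _, err := io.ReadFull(rand.Reader, plaintext); err != nil {
		panic(err)
	}
	ciphertext := make([]byte, len(plaintext))
	cipher.NewECBEncrypter(block).CryptBlocks(ciphertext, plaintext)

	recovered := make([]byte, len(plaintext))
	cipher.NewECBDecrypter(block).CryptBlocks(recovered, ciphertext)
	fmt.Println(bytes.Equal(recovered, plaintext)) // true
}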

View File

@ -155,114 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#undef plen
#undef dlen
#define AVX_SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
VMOVDQU flip_mask<>(SB), x \
VPSHUFB x, t0, t0 \
VPSHUFB x, t1, t1 \
VPSHUFB x, t2, t2 \
VPSHUFB x, t3, t3 \
; \
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
XORL IND, IND \
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK \
AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
; \ // Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
VPSHUFB BSWAP, t0, t0 \
VPSHUFB BSWAP, t1, t1 \
VPSHUFB BSWAP, t2, t2 \
VPSHUFB BSWAP, t3, t3 \
#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \
PSHUFB flip_mask<>(SB), t1; \
PSHUFB flip_mask<>(SB), t2; \
PSHUFB flip_mask<>(SB), t3; \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
XORL IND, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
PSHUFB BSWAP, t3; \
PSHUFB BSWAP, t2; \
PSHUFB BSWAP, t1; \
PSHUFB BSWAP, t0
// func gcmSm4Init(productTable *[256]byte, rk []uint32)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define dst DI
@ -676,12 +568,12 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
MOVOU (8*16 + 6*16)(SP), B6
MOVOU (8*16 + 7*16)(SP), B7
SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
PXOR ACC1, ACC1
increment(0)
increment(1)
increment(2)
increment(3)
SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
increment(4)
increment(5)
increment(6)
@ -762,7 +654,6 @@ gcmSm4EncOctetsLoop:
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
mulRound(1)
increment(0)
mulRound(2)
@ -771,7 +662,6 @@ gcmSm4EncOctetsLoop:
increment(2)
mulRound(4)
increment(3)
SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
mulRound(5)
increment(4)
mulRound(6)
@ -791,6 +681,8 @@ gcmSm4EncOctetsLoop:
reduceRound(ACC0)
PXOR ACC1, ACC0
SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
@ -886,7 +778,7 @@ gcmSm4EncNibbles:
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
@ -922,7 +814,7 @@ gcmSm4EncSingles:
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
MOVOU B0, (16*0)(SP)
MOVOU B1, (16*1)(SP)
MOVOU B2, (16*2)(SP)
@ -1014,17 +906,30 @@ avxGcmSm4Enc:
VMOVDQU (8*16 + 1*16)(SP), B1
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
VMOVDQU (8*16 + 4*16)(SP), B4
VMOVDQU (8*16 + 5*16)(SP), B5
VMOVDQU (8*16 + 6*16)(SP), B6
VMOVDQU (8*16 + 7*16)(SP), B7
AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
VPXOR ACC1, ACC1, ACC1 // clean ACC1
increment(0)
increment(1)
increment(2)
increment(3)
increment(4)
increment(5)
increment(6)
increment(7)
// XOR plaintext
VPXOR (16*0)(ptx), B0, B0
VPXOR (16*1)(ptx), B1, B1
VPXOR (16*2)(ptx), B2, B2
VPXOR (16*3)(ptx), B3, B3
VPXOR (16*4)(ptx), B4, B4
VPXOR (16*5)(ptx), B5, B5
VPXOR (16*6)(ptx), B6, B6
VPXOR (16*7)(ptx), B7, B7
// Store ciphertext
VMOVDQU B0, (16*0)(ctx)
VPSHUFB BSWAP, B0, B0
@ -1034,31 +939,6 @@ avxGcmSm4Enc:
VPSHUFB BSWAP, B2, B2
VMOVDQU B3, (16*3)(ctx)
VPSHUFB BSWAP, B3, B3
VPXOR ACC0, B0, B0
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
VMOVDQU B3, (16*3)(SP)
// load 4 ctrs for encryption
VMOVDQU (8*16 + 4*16)(SP), B4
VMOVDQU (8*16 + 5*16)(SP), B5
VMOVDQU (8*16 + 6*16)(SP), B6
VMOVDQU (8*16 + 7*16)(SP), B7
AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
increment(4)
increment(5)
increment(6)
increment(7)
// XOR plaintext
VPXOR (16*4)(ptx), B4, B4
VPXOR (16*5)(ptx), B5, B5
VPXOR (16*6)(ptx), B6, B6
VPXOR (16*7)(ptx), B7, B7
// Store ciphertext
VMOVDQU B4, (16*4)(ctx)
VPSHUFB BSWAP, B4, B4
VMOVDQU B5, (16*5)(ctx)
@ -1068,6 +948,12 @@ avxGcmSm4Enc:
VMOVDQU B7, (16*7)(ctx)
VPSHUFB BSWAP, B7, B7
VPXOR ACC0, B0, B0
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
VMOVDQU B3, (16*3)(SP)
VMOVDQU B4, (16*4)(SP)
VMOVDQU B5, (16*5)(SP)
VMOVDQU B6, (16*6)(SP)
@ -1129,12 +1015,16 @@ avxGcmSm4EncOctetsLoop:
avxReduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
AVX_SM4_4BLOCKS(rk, BX, ACC1, T1, T2, B0, B1, B2, B3)
AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
// XOR plaintext
VPXOR (16*0)(ptx), B0, B0
VPXOR (16*1)(ptx), B1, B1
VPXOR (16*2)(ptx), B2, B2
VPXOR (16*3)(ptx), B3, B3
VPXOR (16*4)(ptx), B4, B4
VPXOR (16*5)(ptx), B5, B5
VPXOR (16*6)(ptx), B6, B6
VPXOR (16*7)(ptx), B7, B7
// Store ciphertext
VMOVDQU B0, (16*0)(ctx)
@ -1145,21 +1035,6 @@ avxGcmSm4EncOctetsLoop:
VPSHUFB BSWAP, B2, B2
VMOVDQU B3, (16*3)(ctx)
VPSHUFB BSWAP, B3, B3
VPXOR ACC0, B0, B0
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
VMOVDQU B3, (16*3)(SP)
AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
// XOR plaintext
VPXOR (16*4)(ptx), B4, B4
VPXOR (16*5)(ptx), B5, B5
VPXOR (16*6)(ptx), B6, B6
VPXOR (16*7)(ptx), B7, B7
// Store ciphertext
VMOVDQU B4, (16*4)(ctx)
VPSHUFB BSWAP, B4, B4
VMOVDQU B5, (16*5)(ctx)
@ -1169,6 +1044,11 @@ avxGcmSm4EncOctetsLoop:
VMOVDQU B7, (16*7)(ctx)
VPSHUFB BSWAP, B7, B7
VPXOR ACC0, B0, B0
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
VMOVDQU B3, (16*3)(SP)
VMOVDQU B4, (16*4)(SP)
VMOVDQU B5, (16*5)(SP)
VMOVDQU B6, (16*6)(SP)
@ -1226,7 +1106,7 @@ avxGcmSm4EncNibbles:
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
// XOR plaintext
VPXOR (16*0)(ptx), B0, B0
VPXOR (16*1)(ptx), B1, B1
@ -1261,7 +1141,7 @@ avxGcmSm4EncSingles:
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
@ -1364,18 +1244,9 @@ avx2GcmSm4Enc:
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
XORL BX, BX
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
avx2GcmSm4Enc8Loop1:
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Enc8Loop1
AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@ -1458,18 +1329,9 @@ avx2GcmSm4EncOctetsLoop:
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
XORL BX, BX
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
avx2GcmSm4Enc8Loop2:
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Enc8Loop2
AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@ -1578,7 +1440,6 @@ avx2GcmSm4EncOctetsEnd:
SUBQ $4, aluCTR
avx2GcmSm4EncNibbles:
VMOVDQU flip_mask<>(SB), B7
CMPQ ptxLen, $64
JBE avx2GcmSm4EncSingles
SUBQ $64, ptxLen
@ -1588,31 +1449,7 @@ avx2GcmSm4EncNibbles:
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
VPSHUFB B7, B0, B0
VPSHUFB B7, B1, B1
VPSHUFB B7, B2, B2
VPSHUFB B7, B3, B3
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Enc4Loop2:
AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Enc4Loop2
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
VPSHUFB BSWAP, B0, B0
VPSHUFB BSWAP, B1, B1
VPSHUFB BSWAP, B2, B2
VPSHUFB BSWAP, B3, B3
AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
VMOVDQU (16*0)(ptx), T0
VPXOR T0, B0, B0
@ -1650,31 +1487,7 @@ avx2GcmSm4EncSingles:
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
VPSHUFB B7, B0, B0
VPSHUFB B7, B1, B1
VPSHUFB B7, B2, B2
VPSHUFB B7, B3, B3
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Enc4Loop1:
AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Enc4Loop1
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
VPSHUFB BSWAP, B0, B0
VPSHUFB BSWAP, B1, B1
VPSHUFB BSWAP, B2, B2
VPSHUFB BSWAP, B3, B3
AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
@ -1890,7 +1703,6 @@ gcmSm4DecOctetsLoop:
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
decMulRound(1)
increment(0)
decMulRound(2)
@ -1899,7 +1711,6 @@ gcmSm4DecOctetsLoop:
increment(2)
decMulRound(4)
increment(3)
SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
decMulRound(5)
increment(4)
decMulRound(6)
@ -1920,6 +1731,8 @@ gcmSm4DecOctetsLoop:
reduceRound(ACC0)
PXOR ACC1, ACC0
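// SM4-encrypt the 8 counter blocks B0-B7; the resulting keystream stays in B0-B7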
SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
MOVOU (16*0)(ctx), T0
PXOR T0, B0
MOVOU (16*1)(ctx), T0
@ -1964,7 +1777,7 @@ gcmSm4DecNibbles:
MOVOU (2*16)(SP), B6
MOVOU (3*16)(SP), B7
SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
SM4_4BLOCKS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
MOVOU (16*14)(pTbl), T2
MOVOU (16*0)(ctx), T0
PXOR T0, B4
@ -2000,7 +1813,7 @@ gcmSm4DecSingles:
MOVOU (2*16)(SP), B2
MOVOU (3*16)(SP), B3
SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
SM4_4BLOCKS(rk, B4, T0, T1, T2, B0, B1, B2, B3)
MOVOU B0, (16*4)(SP)
MOVOU B1, (16*5)(SP)
MOVOU B2, (16*6)(SP)
@ -2145,25 +1958,21 @@ avxGcmSm4DecOctetsLoop:
avxReduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
AVX_SM4_4BLOCKS(rk, BX, ACC1, T1, T2, B0, B1, B2, B3)
AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
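// B0-B7 now hold the SM4 keystream for the next 8 blocks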
VPXOR (16*0)(ctx), B0, B0
VPXOR (16*1)(ctx), B1, B1
VPXOR (16*2)(ctx), B2, B2
VPXOR (16*3)(ctx), B3, B3
VMOVDQU B0, (16*0)(ptx)
VMOVDQU B1, (16*1)(ptx)
VMOVDQU B2, (16*2)(ptx)
VMOVDQU B3, (16*3)(ptx)
AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
VPXOR (16*4)(ctx), B4, B4
VPXOR (16*5)(ctx), B5, B5
VPXOR (16*6)(ctx), B6, B6
VPXOR (16*7)(ctx), B7, B7
VMOVDQU B0, (16*0)(ptx)
VMOVDQU B1, (16*1)(ptx)
VMOVDQU B2, (16*2)(ptx)
VMOVDQU B3, (16*3)(ptx)
VMOVDQU B4, (16*4)(ptx)
VMOVDQU B5, (16*5)(ptx)
VMOVDQU B6, (16*6)(ptx)
@ -2187,7 +1996,7 @@ avxGcmSm4DecNibbles:
VMOVDQU (2*16)(SP), B6
VMOVDQU (3*16)(SP), B7
AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
AVX_SM4_4BLOCKS(rk, B0, B1, T1, T2, B4, B5, B6, B7)
VMOVDQU (16*14)(pTbl), T2
VMOVDQU (16*0)(ctx), B0
@ -2227,7 +2036,7 @@ avxGcmSm4DecSingles:
VMOVDQU (2*16)(SP), B2
VMOVDQU (3*16)(SP), B3
AVX_SM4_4BLOCKS(rk, BX, B7, B6, B5, B0, B1, B2, B3)
AVX_SM4_4BLOCKS(rk, B7, B6, B5, B4, B0, B1, B2, B3)
VMOVDQU B0, (16*4)(SP)
VMOVDQU B1, (16*5)(SP)
VMOVDQU B2, (16*6)(SP)
@ -2328,13 +2137,6 @@ avx2GcmSm4DecOctetsLoop:
VMOVDQU (2*32)(SP), DWB2
VMOVDQU (3*32)(SP), DWB3
VBROADCASTI128 flip_mask<>(SB), XDWTMP0
// Apply Byte Flip Mask: LE -> BE
VPSHUFB XDWTMP0, DWB0, DWB0
VPSHUFB XDWTMP0, DWB1, DWB1
VPSHUFB XDWTMP0, DWB2, DWB2
VPSHUFB XDWTMP0, DWB3, DWB3
VMOVDQU (16*0)(ctx), T0
VPSHUFB BSWAP, T0, T0
VPXOR ACC0, T0, T0
@ -2348,20 +2150,18 @@ avx2GcmSm4DecOctetsLoop:
VPCLMULQDQ $0x00, T0, ACC1, ACC0
VPCLMULQDQ $0x11, T0, ACC1, ACC1
VBROADCASTI128 flip_mask<>(SB), XDWTMP0
// Apply Byte Flip Mask: LE -> BE
VPSHUFB XDWTMP0, DWB0, DWB0
VPSHUFB XDWTMP0, DWB1, DWB1
VPSHUFB XDWTMP0, DWB2, DWB2
VPSHUFB XDWTMP0, DWB3, DWB3
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
XORL BX, BX
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
avx2GcmSm4Dec8Loop2:
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Dec8Loop2
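// full 32 SM4 rounds for the 8 blocks packed in DWB0-DWB3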
AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@ -2374,8 +2174,8 @@ avx2GcmSm4Dec8Loop2:
VMOVDQU (32*0)(ctx), XDWTMP0
VPXOR XDWTMP0, DWB0, DWB0
VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
VEXTRACTI128 $1, XDWTMP0, T0
VPSHUFB BSWAP, T0, T0
internalAvxDecMulRound(1)
increment(0)
@ -2436,7 +2236,6 @@ avx2GcmSm4DecEndOctets:
SUBQ $4, aluCTR
avx2GcmSm4DecNibbles:
VMOVDQU flip_mask<>(SB), B7 // DO NOT CHANGE B7
CMPQ ptxLen, $64
JBE avx2GcmSm4DecSingles
SUBQ $64, ptxLen
@ -2446,31 +2245,7 @@ avx2GcmSm4DecNibbles:
VMOVDQU (2*16)(SP), B2
VMOVDQU (3*16)(SP), B3
VPSHUFB B7, B0, B0
VPSHUFB B7, B1, B1
VPSHUFB B7, B2, B2
VPSHUFB B7, B3, B3
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Dec4Loop2:
AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Dec4Loop2
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
VPSHUFB BSWAP, B0, B4
VPSHUFB BSWAP, B1, B1
VPSHUFB BSWAP, B2, B2
VPSHUFB BSWAP, B3, B3
AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
VMOVDQU (16*14)(pTbl), T2
VMOVDQU (16*0)(ctx), B0
@ -2511,32 +2286,7 @@ avx2GcmSm4DecSingles:
VMOVDQU (2*16)(SP), B2
VMOVDQU (3*16)(SP), B3
VPSHUFB B7, B0, B0
VPSHUFB B7, B1, B1
VPSHUFB B7, B2, B2
VPSHUFB B7, B3, B3
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Dec4Loop1:
AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Dec4Loop1
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
VPSHUFB BSWAP, B0, B0
VPSHUFB BSWAP, B1, B1
VPSHUFB BSWAP, B2, B2
VPSHUFB BSWAP, B3, B3
AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
VMOVDQU B0, (16*4)(SP)
VMOVDQU B1, (16*5)(SP)

View File

@ -449,36 +449,24 @@ encOctetsLoop:
// encryption of all 8 blocks
PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
EOR R13, R13
MOVD rkSave, rk
encOctetsEnc4Blocks1:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
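// each SM4_8BLOCKS_ROUND applies one round to both 4-block groups; 8 iterations of 4 calls give the full 32 rounds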
encOctetsEnc8Blocks:
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
ADD $1, R13
CMP $8, R13
BNE encOctetsEnc4Blocks1
BNE encOctetsEnc8Blocks
VREV32 B0.B16, B0.B16
VREV32 B1.B16, B1.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
// encryption second 4 blocks
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
MOVD rkSave, rk
encOctetsEnc4Blocks2:
SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)
ADD $1, R13
CMP $16, R13
BNE encOctetsEnc4Blocks2
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
@ -741,41 +729,28 @@ decOctetsLoop:
// encryption of all 8 blocks
PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
EOR R13, R13
MOVD rkSave, rk
decOctetsEnc4Blocks1:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
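// same 8-block round loop as encryption: 8 iterations x 4 rounds = 32 SM4 rounds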
decOctetsEnc8Blocks:
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
ADD $1, R13
CMP $8, R13
BNE decOctetsEnc4Blocks1
BNE decOctetsEnc8Blocks
VREV32 B0.B16, T1.B16
VREV32 B1.B16, T2.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
TRANSPOSE_MATRIX(T1, T2, B2, B3, K0, K1, K2, K3)
// encryption second 4 blocks
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
MOVD rkSave, rk
decOctetsEnc4Blocks2:
SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)
ADD $1, R13
CMP $16, R13
BNE decOctetsEnc4Blocks2
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
VREV32 B7.B16, B7.B16
TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]