From dddebb8c1d8c2afc1aef0ae9e38efc9c3c60bd1e Mon Sep 17 00:00:00 2001
From: Emman
Date: Thu, 18 Mar 2021 17:54:10 +0800
Subject: [PATCH] MAGIC - optimize cbc decryption

---
 sm4/asm_amd64.s          | 49 +++++++++++++++++++++++++++++++
 sm4/cbc_amd64.go         | 62 ++++++++++++++++++++++++++++++++++++++++
 sm4/modes.go             | 10 +++++++
 sm4/xor_amd64.go         | 23 +++++++++++++++
 sm4_test/cbc_sm4_test.go | 25 ++++++++++++++++
 5 files changed, 169 insertions(+)
 create mode 100644 sm4/cbc_amd64.go
 create mode 100644 sm4/modes.go
 create mode 100644 sm4/xor_amd64.go

diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s
index 7016d2e..22afc1d 100644
--- a/sm4/asm_amd64.s
+++ b/sm4/asm_amd64.s
@@ -212,3 +212,52 @@ loop:
 done_sm4:
 	VZEROUPPER
 	RET
+
+// func xorBytesSSE2(dst, a, b *byte, n int)
+TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
+	MOVQ dst+0(FP), BX
+	MOVQ a+8(FP), SI
+	MOVQ b+16(FP), CX
+	MOVQ n+24(FP), DX
+	TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
+	JNZ not_aligned
+
+aligned:
+	MOVQ $0, AX // position in slices
+
+loop16b:
+	MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
+	MOVOU (CX)(AX*1), X1
+	PXOR X1, X0
+	MOVOU X0, (BX)(AX*1)
+	ADDQ $16, AX
+	CMPQ DX, AX
+	JNE loop16b
+	RET
+
+loop_1b:
+	SUBQ $1, DX // XOR 1byte backwards.
+	MOVB (SI)(DX*1), DI
+	MOVB (CX)(DX*1), AX
+	XORB AX, DI
+	MOVB DI, (BX)(DX*1)
+	TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
+	JNZ loop_1b
+	CMPQ DX, $0 // if len is 0, ret.
+	JE ret
+	TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
+	JZ aligned
+
+not_aligned:
+	TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
+	JNE loop_1b
+	SUBQ $8, DX // XOR 8bytes backwards.
+	MOVQ (SI)(DX*1), DI
+	MOVQ (CX)(DX*1), AX
+	XORQ AX, DI
+	MOVQ DI, (BX)(DX*1)
+	CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
+	JGE aligned
+
+ret:
+	RET
diff --git a/sm4/cbc_amd64.go b/sm4/cbc_amd64.go
new file mode 100644
index 0000000..006d9bf
--- /dev/null
+++ b/sm4/cbc_amd64.go
@@ -0,0 +1,62 @@
+package sm4
+
+import "crypto/cipher"
+
+type cbc struct {
+	b  *sm4CipherAsm
+	iv [BlockSize]byte
+}
+
+func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
+	var c cbc
+	c.b = b
+	copy(c.iv[:], iv)
+	return &c
+}
+
+func (x *cbc) BlockSize() int { return BlockSize }
+
+func (x *cbc) CryptBlocks(dst, src []byte) {
+	if len(src)%BlockSize != 0 {
+		panic("crypto/cipher: input not full blocks")
+	}
+	if len(dst) < len(src) {
+		panic("crypto/cipher: output smaller than input")
+	}
+	if InexactOverlap(dst[:len(src)], src) {
+		panic("crypto/cipher: invalid buffer overlap")
+	}
+	if len(src) == 0 {
+		return
+	}
+	end := len(src)
+	start := end - FourBlocksSize
+	var temp []byte = make([]byte, FourBlocksSize)
+	var src64 []byte = make([]byte, FourBlocksSize)
+	for start > 0 {
+		encryptBlocksAsm(&x.b.dec[0], &temp[0], &src[start:end][0])
+		xorBytes(dst[end-BlockSize:end], temp[FourBlocksSize-BlockSize:FourBlocksSize], src[end-2*BlockSize:end-BlockSize])
+		xorBytes(dst[end-2*BlockSize:end-BlockSize], temp[FourBlocksSize-2*BlockSize:FourBlocksSize-BlockSize], src[end-3*BlockSize:end-2*BlockSize])
+		xorBytes(dst[end-3*BlockSize:end-2*BlockSize], temp[FourBlocksSize-3*BlockSize:FourBlocksSize-2*BlockSize], src[end-4*BlockSize:end-3*BlockSize])
+		xorBytes(dst[end-4*BlockSize:end-3*BlockSize], temp[:BlockSize], src[end-5*BlockSize:end-4*BlockSize])
+
+		end = start
+		start -= FourBlocksSize
+	}
+
+	copy(src64, src[:end])
+	encryptBlocksAsm(&x.b.dec[0], &temp[0], &src64[0])
+	count := end / BlockSize
+	for i := count; i > 1; i-- {
+		xorBytes(dst[end-BlockSize:end], temp[end-BlockSize:end], src[end-2*BlockSize:end-BlockSize])
+		end -= BlockSize
+	}
+	xorBytes(dst[0:end], temp[0:end], x.iv[:])
+}
+
+func (x *cbc) SetIV(iv []byte) {
+	if len(iv) != BlockSize {
+		panic("cipher: incorrect length IV")
+	}
+	copy(x.iv[:], iv)
+}
diff --git a/sm4/modes.go b/sm4/modes.go
new file mode 100644
index 0000000..77c4b85
--- /dev/null
+++ b/sm4/modes.go
@@ -0,0 +1,10 @@
+package sm4
+
+import "crypto/cipher"
+
+// cbcDecAble is implemented by cipher.Blocks that can provide an optimized
+// implementation of CBC decryption through the cipher.BlockMode interface.
+// See crypto/cipher/cbc.go.
+type cbcDecAble interface {
+	NewCBCDecrypter(iv []byte) cipher.BlockMode
+}
diff --git a/sm4/xor_amd64.go b/sm4/xor_amd64.go
new file mode 100644
index 0000000..ca0ec52
--- /dev/null
+++ b/sm4/xor_amd64.go
@@ -0,0 +1,23 @@
+package sm4
+
+// xorBytes xors the bytes in a and b. The destination should have enough
+// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
+func xorBytes(dst, a, b []byte) int {
+	n := len(a)
+	if len(b) < n {
+		n = len(b)
+	}
+	if n == 0 {
+		return 0
+	}
+	_ = dst[n-1]
+	xorBytesSSE2(&dst[0], &a[0], &b[0], n) // amd64 must have SSE2
+	return n
+}
+
+func xorWords(dst, a, b []byte) {
+	xorBytes(dst, a, b)
+}
+
+//go:noescape
+func xorBytesSSE2(dst, a, b *byte, n int)
\ No newline at end of file
diff --git a/sm4_test/cbc_sm4_test.go b/sm4_test/cbc_sm4_test.go
index 9b4d021..72ba303 100644
--- a/sm4_test/cbc_sm4_test.go
+++ b/sm4_test/cbc_sm4_test.go
@@ -50,6 +50,31 @@ var cbcSM4Tests = []struct {
 		[]byte("Hello World"),
 		[]byte{0x0a, 0x67, 0x06, 0x2f, 0x0c, 0xd2, 0xdc, 0xe2, 0x6a, 0x7b, 0x97, 0x8e, 0xbf, 0x21, 0x34, 0xf9},
 	},
+	{
+		"Three blocks",
+		[]byte("0123456789ABCDEF"),
+		[]byte("0123456789ABCDEF"),
+		[]byte("Hello World Hello World Hello World Hello Worldd"),
+		[]byte{
+			0xd3, 0x1e, 0x36, 0x83, 0xe4, 0xfc, 0x9b, 0x51, 0x6a, 0x2c, 0x0f, 0x98, 0x36, 0x76, 0xa9, 0xeb,
+			0x1f, 0xdc, 0xc3, 0x2a, 0xf3, 0x84, 0x08, 0x97, 0x81, 0x57, 0xa2, 0x06, 0x5d, 0xe3, 0x4c, 0x6a,
+			0x06, 0x8d, 0x0f, 0xef, 0x4e, 0x2b, 0xfa, 0xb4, 0xbc, 0xab, 0xa6, 0x64, 0x41, 0xfd, 0xe0, 0xfe,
+			0x92, 0xc1, 0x64, 0xec, 0xa1, 0x70, 0x24, 0x75, 0x72, 0xde, 0x12, 0x02, 0x95, 0x2e, 0xc7, 0x27,
+		},
+	},
+	{
+		"Four blocks",
+		[]byte("0123456789ABCDEF"),
+		[]byte("0123456789ABCDEF"),
+		[]byte("Hello World Hello World Hello World Hello World Hello World Hell"),
+		[]byte{
+			0xd3, 0x1e, 0x36, 0x83, 0xe4, 0xfc, 0x9b, 0x51, 0x6a, 0x2c, 0x0f, 0x98, 0x36, 0x76, 0xa9, 0xeb,
+			0x1f, 0xdc, 0xc3, 0x2a, 0xf3, 0x84, 0x08, 0x97, 0x81, 0x57, 0xa2, 0x06, 0x5d, 0xe3, 0x4c, 0x6a,
+			0xe0, 0x02, 0xd6, 0xe4, 0xf5, 0x66, 0x87, 0xc4, 0xcc, 0x54, 0x1d, 0x1f, 0x1c, 0xc4, 0x2f, 0xe6,
+			0xe5, 0x1d, 0xea, 0x52, 0xb8, 0x0c, 0xc8, 0xbe, 0xae, 0xcc, 0x44, 0xa8, 0x51, 0x81, 0x08, 0x60,
+			0x34, 0x6e, 0x9d, 0xad, 0xe1, 0x8a, 0xf4, 0xa1, 0x83, 0x69, 0x57, 0xb9, 0x37, 0x26, 0x7e, 0x03,
+		},
+	},
 	{
 		"A.1",
 		[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10},
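
For reference, a minimal usage sketch of how this fast path is expected to be reached: crypto/cipher.NewCBCDecrypter type-asserts the block cipher against an interface with the same method set as cbcDecAble and, when the assertion succeeds, delegates to the cipher's own NewCBCDecrypter, so callers get the optimized CryptBlocks transparently. The module import path and the exported sm4.NewCipher/sm4.BlockSize names are assumptions about this package's public API, and the ciphertext below is a placeholder rather than real data.

// Sketch: decrypting SM4-CBC data through crypto/cipher (assumed public API).
package main

import (
	"crypto/cipher"
	"fmt"

	"github.com/emmansun/gmsm/sm4" // assumed module path for this package
)

func main() {
	key := []byte("0123456789ABCDEF") // 16-byte SM4 key, as in the tests above
	iv := []byte("0123456789ABCDEF")  // 16-byte IV

	block, err := sm4.NewCipher(key) // assumed constructor; returns the asm-backed cipher on amd64
	if err != nil {
		panic(err)
	}

	// Placeholder ciphertext: CryptBlocks requires a multiple of the 16-byte block size.
	ciphertext := make([]byte, 4*sm4.BlockSize)
	plaintext := make([]byte, len(ciphertext))

	// NewCBCDecrypter detects the cbcDecAble method set and, when present,
	// returns the BlockMode from cbc_amd64.go instead of the generic one.
	mode := cipher.NewCBCDecrypter(block, iv)
	mode.CryptBlocks(plaintext, ciphertext)

	fmt.Printf("%x\n", plaintext)
}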