MAGIC - optimize cbc decryption

This commit is contained in:
Emman 2021-03-18 17:54:10 +08:00
parent b0889c3432
commit dddebb8c1d
5 changed files with 169 additions and 0 deletions

View File

@ -212,3 +212,52 @@ loop:
done_sm4:
VZEROUPPER
RET
// func xorBytesSSE2(dst, a, b *byte, n int)
TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
MOVQ dst+0(FP), BX
MOVQ a+8(FP), SI
MOVQ b+16(FP), CX
MOVQ n+24(FP), DX
TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
JNZ not_aligned
aligned:
MOVQ $0, AX // position in slices
loop16b:
MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
MOVOU (CX)(AX*1), X1
PXOR X1, X0
MOVOU X0, (BX)(AX*1)
ADDQ $16, AX
CMPQ DX, AX
JNE loop16b
RET
loop_1b:
SUBQ $1, DX // XOR 1byte backwards.
MOVB (SI)(DX*1), DI
MOVB (CX)(DX*1), AX
XORB AX, DI
MOVB DI, (BX)(DX*1)
TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
JNZ loop_1b
CMPQ DX, $0 // if len is 0, ret.
JE ret
TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
JZ aligned
not_aligned:
TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
JNE loop_1b
SUBQ $8, DX // XOR 8bytes backwards.
MOVQ (SI)(DX*1), DI
MOVQ (CX)(DX*1), AX
XORQ AX, DI
MOVQ DI, (BX)(DX*1)
CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
JGE aligned
ret:
RET

62
sm4/cbc_amd64.go Normal file
View File

@ -0,0 +1,62 @@
package sm4
import "crypto/cipher"
type cbc struct {
b *sm4CipherAsm
iv [BlockSize]byte
}
func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
var c cbc
c.b = b
copy(c.iv[:], iv)
return &c
}
func (x *cbc) BlockSize() int { return BlockSize }
func (x *cbc) CryptBlocks(dst, src []byte) {
if len(src)%BlockSize != 0 {
panic("crypto/cipher: input not full blocks")
}
if len(dst) < len(src) {
panic("crypto/cipher: output smaller than input")
}
if InexactOverlap(dst[:len(src)], src) {
panic("crypto/cipher: invalid buffer overlap")
}
if len(src) == 0 {
return
}
end := len(src)
start := end - FourBlocksSize
var temp []byte = make([]byte, FourBlocksSize)
var src64 []byte = make([]byte, FourBlocksSize)
for start > 0 {
encryptBlocksAsm(&x.b.dec[0], &temp[0], &src[start:end][0])
xorBytes(dst[end-BlockSize:end], temp[FourBlocksSize-BlockSize:FourBlocksSize], src[end-2*BlockSize:end-BlockSize])
xorBytes(dst[end-2*BlockSize:end-BlockSize], temp[FourBlocksSize-2*BlockSize:FourBlocksSize-BlockSize], src[end-3*BlockSize:end-2*BlockSize])
xorBytes(dst[end-3*BlockSize:end-2*BlockSize], temp[FourBlocksSize-3*BlockSize:FourBlocksSize-2*BlockSize], src[end-4*BlockSize:end-3*BlockSize])
xorBytes(dst[end-4*BlockSize:end-3*BlockSize], temp[:BlockSize], src[end-5*BlockSize:end-4*BlockSize])
end = start
start -= FourBlocksSize
}
copy(src64, src[:end])
encryptBlocksAsm(&x.b.dec[0], &temp[0], &src[:end][0])
count := end / BlockSize
for i := count; i > 1; i-- {
xorBytes(dst[end-BlockSize:end], temp[end-BlockSize:end], src[end-2*BlockSize:end-BlockSize])
end -= BlockSize
}
xorBytes(dst[0:end], temp[0:end], x.iv[:])
}
func (x *cbc) SetIV(iv []byte) {
if len(iv) != BlockSize {
panic("cipher: incorrect length IV")
}
copy(x.iv[:], iv)
}

10
sm4/modes.go Normal file
View File

@ -0,0 +1,10 @@
package sm4
import "crypto/cipher"
// cbcDecAble is implemented by cipher.Blocks that can provide an optimized
// implementation of CBC decryption through the cipher.BlockMode interface.
// See crypto/cipher/cbc.go.
type cbcDecAble interface {
NewCBCDecrypter(iv []byte) cipher.BlockMode
}

23
sm4/xor_amd64.go Normal file
View File

@ -0,0 +1,23 @@
package sm4
// xorBytes xors the bytes in a and b. The destination should have enough
// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
func xorBytes(dst, a, b []byte) int {
n := len(a)
if len(b) < n {
n = len(b)
}
if n == 0 {
return 0
}
_ = dst[n-1]
xorBytesSSE2(&dst[0], &a[0], &b[0], n) // amd64 must have SSE2
return n
}
func xorWords(dst, a, b []byte) {
xorBytes(dst, a, b)
}
//go:noescape
func xorBytesSSE2(dst, a, b *byte, n int)

View File

@ -50,6 +50,31 @@ var cbcSM4Tests = []struct {
[]byte("Hello World"),
[]byte{0x0a, 0x67, 0x06, 0x2f, 0x0c, 0xd2, 0xdc, 0xe2, 0x6a, 0x7b, 0x97, 0x8e, 0xbf, 0x21, 0x34, 0xf9},
},
{
"Three blocks",
[]byte("0123456789ABCDEF"),
[]byte("0123456789ABCDEF"),
[]byte("Hello World Hello World Hello World Hello Worldd"),
[]byte{
0xd3, 0x1e, 0x36, 0x83, 0xe4, 0xfc, 0x9b, 0x51, 0x6a, 0x2c, 0x0f, 0x98, 0x36, 0x76, 0xa9, 0xeb,
0x1f, 0xdc, 0xc3, 0x2a, 0xf3, 0x84, 0x08, 0x97, 0x81, 0x57, 0xa2, 0x06, 0x5d, 0xe3, 0x4c, 0x6a,
0x06, 0x8d, 0x0f, 0xef, 0x4e, 0x2b, 0xfa, 0xb4, 0xbc, 0xab, 0xa6, 0x64, 0x41, 0xfd, 0xe0, 0xfe,
0x92, 0xc1, 0x64, 0xec, 0xa1, 0x70, 0x24, 0x75, 0x72, 0xde, 0x12, 0x02, 0x95, 0x2e, 0xc7, 0x27,
},
},
{
"Four blocks",
[]byte("0123456789ABCDEF"),
[]byte("0123456789ABCDEF"),
[]byte("Hello World Hello World Hello World Hello World Hello World Hell"),
[]byte{
0xd3, 0x1e, 0x36, 0x83, 0xe4, 0xfc, 0x9b, 0x51, 0x6a, 0x2c, 0x0f, 0x98, 0x36, 0x76, 0xa9, 0xeb,
0x1f, 0xdc, 0xc3, 0x2a, 0xf3, 0x84, 0x08, 0x97, 0x81, 0x57, 0xa2, 0x06, 0x5d, 0xe3, 0x4c, 0x6a,
0xe0, 0x02, 0xd6, 0xe4, 0xf5, 0x66, 0x87, 0xc4, 0xcc, 0x54, 0x1d, 0x1f, 0x1c, 0xc4, 0x2f, 0xe6,
0xe5, 0x1d, 0xea, 0x52, 0xb8, 0x0c, 0xc8, 0xbe, 0xae, 0xcc, 0x44, 0xa8, 0x51, 0x81, 0x08, 0x60,
0x34, 0x6e, 0x9d, 0xad, 0xe1, 0x8a, 0xf4, 0xa1, 0x83, 0x69, 0x57, 0xb9, 0x37, 0x26, 0x7e, 0x03,
},
},
{
"A.1",
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10},