mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-28 05:06:18 +08:00
MAGIC - optimize cbc decryption
This commit is contained in:
parent
b0889c3432
commit
dddebb8c1d
@ -212,3 +212,52 @@ loop:
|
|||||||
done_sm4:
|
done_sm4:
|
||||||
VZEROUPPER
|
VZEROUPPER
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
// xorBytesSSE2 writes a[i] XOR b[i] to dst[i] for every i in [0, n).
//
// Register use: BX = dst, SI = a, CX = b, DX = n (remaining length),
// AX = forward offset in loop16b and scratch elsewhere, DI = scratch.
//
// When n is not a multiple of 16, bytes are peeled off the END of the buffers
// (one byte at a time until the remaining length is a multiple of 8, then one
// 8-byte word if needed) so that the remaining prefix is a multiple of 16 and
// can be handled by the 16-byte loop using unaligned loads and stores.
//
// n must be greater than zero: loop16b checks its exit condition only after
// processing a 16-byte chunk. The Go wrapper xorBytes returns early for n == 0.
// func xorBytesSSE2(dst, a, b *byte, n int)
|
||||||
|
TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
|
||||||
|
MOVQ dst+0(FP), BX
|
||||||
|
MOVQ a+8(FP), SI
|
||||||
|
MOVQ b+16(FP), CX
|
||||||
|
MOVQ n+24(FP), DX
|
||||||
|
TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
|
||||||
|
JNZ not_aligned
|
||||||
|
|
||||||
|
// aligned: DX (remaining length) is a non-zero multiple of 16 here.
aligned:
|
||||||
|
MOVQ $0, AX // position in slices
|
||||||
|
|
||||||
|
// loop16b: XOR bytes [AX, AX+16) and advance until AX reaches DX.
loop16b:
|
||||||
|
MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
|
||||||
|
MOVOU (CX)(AX*1), X1
|
||||||
|
PXOR X1, X0
|
||||||
|
MOVOU X0, (BX)(AX*1)
|
||||||
|
ADDQ $16, AX
|
||||||
|
CMPQ DX, AX
|
||||||
|
JNE loop16b
|
||||||
|
RET
|
||||||
|
|
||||||
|
// loop_1b: XOR the last remaining byte and shrink DX by one, repeating until
// DX is a multiple of 8; then return, go to aligned, or fall through to
// not_aligned depending on what is left.
loop_1b:
|
||||||
|
SUBQ $1, DX // XOR 1byte backwards.
|
||||||
|
MOVB (SI)(DX*1), DI
|
||||||
|
MOVB (CX)(DX*1), AX
|
||||||
|
XORB AX, DI
|
||||||
|
MOVB DI, (BX)(DX*1)
|
||||||
|
TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
|
||||||
|
JNZ loop_1b
|
||||||
|
CMPQ DX, $0 // if len is 0, ret.
|
||||||
|
JE ret
|
||||||
|
TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
|
||||||
|
JZ aligned
|
||||||
|
|
||||||
|
// not_aligned: DX is not a multiple of 16. If it is not a multiple of 8 either,
// peel single bytes first; otherwise XOR the last 8 bytes as one word, which
// leaves DX a multiple of 16 (possibly zero).
not_aligned:
|
||||||
|
TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
|
||||||
|
JNE loop_1b
|
||||||
|
SUBQ $8, DX // XOR 8bytes backwards.
|
||||||
|
MOVQ (SI)(DX*1), DI
|
||||||
|
MOVQ (CX)(DX*1), AX
|
||||||
|
XORQ AX, DI
|
||||||
|
MOVQ DI, (BX)(DX*1)
|
||||||
|
CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
|
||||||
|
JGE aligned
|
||||||
|
|
||||||
|
ret:
|
||||||
|
RET
|
||||||
|
62
sm4/cbc_amd64.go
Normal file
62
sm4/cbc_amd64.go
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
package sm4
|
||||||
|
|
||||||
|
import "crypto/cipher"
|
||||||
|
|
||||||
|
// cbc is the CBC decryption state returned by (*sm4CipherAsm).NewCBCDecrypter.
type cbc struct {
|
||||||
|
// b is the underlying SM4 cipher; CryptBlocks passes its dec round keys
// to encryptBlocksAsm.
b *sm4CipherAsm
|
||||||
|
// iv is the initialization vector; CryptBlocks XORs it into the first
// output block, and SetIV replaces it.
iv [BlockSize]byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
|
||||||
|
var c cbc
|
||||||
|
c.b = b
|
||||||
|
copy(c.iv[:], iv)
|
||||||
|
return &c
|
||||||
|
}
|
||||||
|
|
||||||
|
// BlockSize returns the block size of the mode in bytes, which is the
// package-level BlockSize constant.
func (x *cbc) BlockSize() int {
	return BlockSize
}
|
||||||
|
|
||||||
|
func (x *cbc) CryptBlocks(dst, src []byte) {
|
||||||
|
if len(src)%BlockSize != 0 {
|
||||||
|
panic("crypto/cipher: input not full blocks")
|
||||||
|
}
|
||||||
|
if len(dst) < len(src) {
|
||||||
|
panic("crypto/cipher: output smaller than input")
|
||||||
|
}
|
||||||
|
if InexactOverlap(dst[:len(src)], src) {
|
||||||
|
panic("crypto/cipher: invalid buffer overlap")
|
||||||
|
}
|
||||||
|
if len(src) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
end := len(src)
|
||||||
|
start := end - FourBlocksSize
|
||||||
|
var temp []byte = make([]byte, FourBlocksSize)
|
||||||
|
var src64 []byte = make([]byte, FourBlocksSize)
|
||||||
|
for start > 0 {
|
||||||
|
encryptBlocksAsm(&x.b.dec[0], &temp[0], &src[start:end][0])
|
||||||
|
xorBytes(dst[end-BlockSize:end], temp[FourBlocksSize-BlockSize:FourBlocksSize], src[end-2*BlockSize:end-BlockSize])
|
||||||
|
xorBytes(dst[end-2*BlockSize:end-BlockSize], temp[FourBlocksSize-2*BlockSize:FourBlocksSize-BlockSize], src[end-3*BlockSize:end-2*BlockSize])
|
||||||
|
xorBytes(dst[end-3*BlockSize:end-2*BlockSize], temp[FourBlocksSize-3*BlockSize:FourBlocksSize-2*BlockSize], src[end-4*BlockSize:end-3*BlockSize])
|
||||||
|
xorBytes(dst[end-4*BlockSize:end-3*BlockSize], temp[:BlockSize], src[end-5*BlockSize:end-4*BlockSize])
|
||||||
|
|
||||||
|
end = start
|
||||||
|
start -= FourBlocksSize
|
||||||
|
}
|
||||||
|
|
||||||
|
copy(src64, src[:end])
|
||||||
|
encryptBlocksAsm(&x.b.dec[0], &temp[0], &src[:end][0])
|
||||||
|
count := end / BlockSize
|
||||||
|
for i := count; i > 1; i-- {
|
||||||
|
xorBytes(dst[end-BlockSize:end], temp[end-BlockSize:end], src[end-2*BlockSize:end-BlockSize])
|
||||||
|
end -= BlockSize
|
||||||
|
}
|
||||||
|
xorBytes(dst[0:end], temp[0:end], x.iv[:])
|
||||||
|
}
|
||||||
|
|
||||||
|
func (x *cbc) SetIV(iv []byte) {
|
||||||
|
if len(iv) != BlockSize {
|
||||||
|
panic("cipher: incorrect length IV")
|
||||||
|
}
|
||||||
|
copy(x.iv[:], iv)
|
||||||
|
}
|
10
sm4/modes.go
Normal file
10
sm4/modes.go
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
package sm4
|
||||||
|
|
||||||
|
import "crypto/cipher"
|
||||||
|
|
||||||
|
// cbcDecAble is implemented by cipher.Blocks that can provide an optimized
|
||||||
|
// implementation of CBC decryption through the cipher.BlockMode interface.
|
||||||
|
// See crypto/cipher/cbc.go.
|
||||||
|
type cbcDecAble interface {
|
||||||
|
// NewCBCDecrypter returns a cipher.BlockMode that performs CBC decryption
// starting from the given iv.
NewCBCDecrypter(iv []byte) cipher.BlockMode
|
||||||
|
}
|
23
sm4/xor_amd64.go
Normal file
23
sm4/xor_amd64.go
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
package sm4
|
||||||
|
|
||||||
|
// xorBytes xors the bytes in a and b. The destination should have enough
|
||||||
|
// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
|
||||||
|
func xorBytes(dst, a, b []byte) int {
|
||||||
|
n := len(a)
|
||||||
|
if len(b) < n {
|
||||||
|
n = len(b)
|
||||||
|
}
|
||||||
|
if n == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
_ = dst[n-1]
|
||||||
|
xorBytesSSE2(&dst[0], &a[0], &b[0], n) // amd64 must have SSE2
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
|
func xorWords(dst, a, b []byte) {
|
||||||
|
xorBytes(dst, a, b)
|
||||||
|
}
|
||||||
|
|
||||||
|
// xorBytesSSE2 is declared without a Go body; the TEXT ·xorBytesSSE2 assembly
// routine provides it. xorBytes calls it with pointers to the first elements
// of dst, a and b and a non-zero byte count n.
//go:noescape
|
||||||
|
func xorBytesSSE2(dst, a, b *byte, n int)
|
@ -50,6 +50,31 @@ var cbcSM4Tests = []struct {
|
|||||||
[]byte("Hello World"),
|
[]byte("Hello World"),
|
||||||
[]byte{0x0a, 0x67, 0x06, 0x2f, 0x0c, 0xd2, 0xdc, 0xe2, 0x6a, 0x7b, 0x97, 0x8e, 0xbf, 0x21, 0x34, 0xf9},
|
[]byte{0x0a, 0x67, 0x06, 0x2f, 0x0c, 0xd2, 0xdc, 0xe2, 0x6a, 0x7b, 0x97, 0x8e, 0xbf, 0x21, 0x34, 0xf9},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"Three blocks",
|
||||||
|
[]byte("0123456789ABCDEF"),
|
||||||
|
[]byte("0123456789ABCDEF"),
|
||||||
|
[]byte("Hello World Hello World Hello World Hello Worldd"),
|
||||||
|
[]byte{
|
||||||
|
0xd3, 0x1e, 0x36, 0x83, 0xe4, 0xfc, 0x9b, 0x51, 0x6a, 0x2c, 0x0f, 0x98, 0x36, 0x76, 0xa9, 0xeb,
|
||||||
|
0x1f, 0xdc, 0xc3, 0x2a, 0xf3, 0x84, 0x08, 0x97, 0x81, 0x57, 0xa2, 0x06, 0x5d, 0xe3, 0x4c, 0x6a,
|
||||||
|
0x06, 0x8d, 0x0f, 0xef, 0x4e, 0x2b, 0xfa, 0xb4, 0xbc, 0xab, 0xa6, 0x64, 0x41, 0xfd, 0xe0, 0xfe,
|
||||||
|
0x92, 0xc1, 0x64, 0xec, 0xa1, 0x70, 0x24, 0x75, 0x72, 0xde, 0x12, 0x02, 0x95, 0x2e, 0xc7, 0x27,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Four blocks",
|
||||||
|
[]byte("0123456789ABCDEF"),
|
||||||
|
[]byte("0123456789ABCDEF"),
|
||||||
|
[]byte("Hello World Hello World Hello World Hello World Hello World Hell"),
|
||||||
|
[]byte{
|
||||||
|
0xd3, 0x1e, 0x36, 0x83, 0xe4, 0xfc, 0x9b, 0x51, 0x6a, 0x2c, 0x0f, 0x98, 0x36, 0x76, 0xa9, 0xeb,
|
||||||
|
0x1f, 0xdc, 0xc3, 0x2a, 0xf3, 0x84, 0x08, 0x97, 0x81, 0x57, 0xa2, 0x06, 0x5d, 0xe3, 0x4c, 0x6a,
|
||||||
|
0xe0, 0x02, 0xd6, 0xe4, 0xf5, 0x66, 0x87, 0xc4, 0xcc, 0x54, 0x1d, 0x1f, 0x1c, 0xc4, 0x2f, 0xe6,
|
||||||
|
0xe5, 0x1d, 0xea, 0x52, 0xb8, 0x0c, 0xc8, 0xbe, 0xae, 0xcc, 0x44, 0xa8, 0x51, 0x81, 0x08, 0x60,
|
||||||
|
0x34, 0x6e, 0x9d, 0xad, 0xe1, 0x8a, 0xf4, 0xa1, 0x83, 0x69, 0x57, 0xb9, 0x37, 0x26, 0x7e, 0x03,
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"A.1",
|
"A.1",
|
||||||
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10},
|
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10},
|
||||||
|
Loading…
x
Reference in New Issue
Block a user