mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-27 04:36:19 +08:00
zuc: minor optimization
This commit is contained in:
parent
a33c2ae118
commit
b721bed0cc
@ -17,10 +17,6 @@ DATA Low_nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
|
|||||||
DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
|
DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||||
GLOBL Low_nibble_mask<>(SB), RODATA, $16
|
GLOBL Low_nibble_mask<>(SB), RODATA, $16
|
||||||
|
|
||||||
DATA High_nibble_mask<>+0x00(SB)/8, $0xF0F0F0F0F0F0F0F0
|
|
||||||
DATA High_nibble_mask<>+0x08(SB)/8, $0xF0F0F0F0F0F0F0F0
|
|
||||||
GLOBL High_nibble_mask<>(SB), RODATA, $16
|
|
||||||
|
|
||||||
DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
|
DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
|
||||||
DATA P1<>+0x08(SB)/8, $0x090305070C000400
|
DATA P1<>+0x08(SB)/8, $0x090305070C000400
|
||||||
GLOBL P1<>(SB), RODATA, $16
|
GLOBL P1<>(SB), RODATA, $16
|
||||||
@ -99,10 +95,9 @@ GLOBL flip_mask<>(SB), RODATA, $16
|
|||||||
#define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
|
#define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
|
||||||
MOVOU IN_OUT, XTMP1 \
|
MOVOU IN_OUT, XTMP1 \
|
||||||
\
|
\
|
||||||
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
|
|
||||||
\
|
|
||||||
PAND High_nibble_mask<>(SB), XTMP1 \
|
|
||||||
PSRLQ $4, XTMP1 \ // x1
|
PSRLQ $4, XTMP1 \ // x1
|
||||||
|
PAND Low_nibble_mask<>(SB), XTMP1 \
|
||||||
|
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
|
||||||
\
|
\
|
||||||
MOVOU P1<>(SB), XTMP2 \
|
MOVOU P1<>(SB), XTMP2 \
|
||||||
PSHUFB IN_OUT, XTMP2 \ // P1[x2]
|
PSHUFB IN_OUT, XTMP2 \ // P1[x2]
|
||||||
@ -124,16 +119,15 @@ GLOBL flip_mask<>(SB), RODATA, $16
|
|||||||
// for high and low nibble of each input byte, SSE version.
|
// for high and low nibble of each input byte, SSE version.
|
||||||
#define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
|
#define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
|
||||||
\ // Get low nibble of input data
|
\ // Get low nibble of input data
|
||||||
MOVOU Low_nibble_mask<>(SB), XTMP \
|
MOVOU XIN, XTMP \
|
||||||
PAND XIN, XTMP \
|
PAND Low_nibble_mask<>(SB), XTMP \
|
||||||
\ // Get low nibble of output
|
\ // Get low nibble of output
|
||||||
PSHUFB XTMP, XLO \
|
PSHUFB XTMP, XLO \
|
||||||
\ // Get high nibble of input data
|
\ // Get high nibble of input data
|
||||||
MOVOU High_nibble_mask<>(SB), XTMP \
|
PSRLQ $4, XIN \
|
||||||
PAND XIN, XTMP \
|
PAND Low_nibble_mask<>(SB), XIN \
|
||||||
PSRLQ $4, XTMP \
|
|
||||||
\ // Get high nibble of output
|
\ // Get high nibble of output
|
||||||
PSHUFB XTMP, XHI_OUT \
|
PSHUFB XIN, XHI_OUT \
|
||||||
\ // XOR high and low nibbles to get full bytes
|
\ // XOR high and low nibbles to get full bytes
|
||||||
PXOR XLO, XHI_OUT
|
PXOR XLO, XHI_OUT
|
||||||
|
|
||||||
@ -146,8 +140,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
|
|||||||
PSHUFB Shuf_mask<>(SB), XTMP2 \
|
PSHUFB Shuf_mask<>(SB), XTMP2 \
|
||||||
AESENCLAST Cancel_aes<>(SB), XTMP2 \
|
AESENCLAST Cancel_aes<>(SB), XTMP2 \
|
||||||
\
|
\
|
||||||
MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \
|
MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \
|
||||||
MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
|
MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
|
||||||
MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3)
|
MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3)
|
||||||
|
|
||||||
// Rotate left 5 bits in each byte, within an XMM register, AVX version.
|
// Rotate left 5 bits in each byte, within an XMM register, AVX version.
|
||||||
@ -160,9 +154,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
|
|||||||
|
|
||||||
// Compute 16 S0 box values from 16 bytes, AVX version.
|
// Compute 16 S0 box values from 16 bytes, AVX version.
|
||||||
#define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
|
#define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
|
||||||
VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \
|
VPSRLQ $4, IN_OUT, XTMP1 \ // x1
|
||||||
VPSRLQ $4, XTMP1, XTMP1 \ // x1
|
VPAND Low_nibble_mask<>(SB), XTMP1, XTMP1 \
|
||||||
\
|
|
||||||
VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2
|
VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2
|
||||||
\
|
\
|
||||||
VMOVDQU P1<>(SB), XTMP2 \
|
VMOVDQU P1<>(SB), XTMP2 \
|
||||||
@ -189,8 +182,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
|
|||||||
\ // Get low nibble of output
|
\ // Get low nibble of output
|
||||||
VPSHUFB XTMP, XLO, XLO \
|
VPSHUFB XTMP, XLO, XLO \
|
||||||
\ // Get high nibble of input data
|
\ // Get high nibble of input data
|
||||||
VPAND High_nibble_mask<>(SB), XIN, XTMP \
|
VPSRLQ $4, XIN, XTMP \
|
||||||
VPSRLQ $4, XTMP, XTMP \
|
VPAND Low_nibble_mask<>(SB), XTMP, XTMP \
|
||||||
\ // Get high nibble of output
|
\ // Get high nibble of output
|
||||||
VPSHUFB XTMP, XHI_OUT, XHI_OUT \
|
VPSHUFB XTMP, XHI_OUT, XHI_OUT \
|
||||||
\ // XOR high and low nibbles to get full bytes
|
\ // XOR high and low nibbles to get full bytes
|
||||||
|
@ -94,9 +94,7 @@ GLOBL mask_S01<>(SB), RODATA, $32
|
|||||||
VORR XTMP0.B16, XDATA.B16, XDATA.B16
|
VORR XTMP0.B16, XDATA.B16, XDATA.B16
|
||||||
|
|
||||||
#define S0_comput(IN_OUT, XTMP1, XTMP2) \
|
#define S0_comput(IN_OUT, XTMP1, XTMP2) \
|
||||||
VUSHR $4, IN_OUT.S4, XTMP1.S4 \
|
VUSHR $4, IN_OUT.B16, XTMP1.B16 \
|
||||||
VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
|
|
||||||
\
|
|
||||||
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
|
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
|
||||||
\
|
\
|
||||||
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \
|
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \
|
||||||
|
@ -65,9 +65,8 @@ GLOBL rcon<>(SB), RODATA, $160
|
|||||||
LXVD2X (R4)(R5), S1_MASK
|
LXVD2X (R4)(R5), S1_MASK
|
||||||
|
|
||||||
#define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \
|
#define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \
|
||||||
VSRW IN_OUT, V_FOUR, XTMP1; \
|
VSRB IN_OUT, V_FOUR, XTMP1; \ // XTMP1 = hi 4 bits of IN_OUT
|
||||||
VAND XTMP1, NIBBLE_MASK, XTMP1; \
|
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \ // low 4 bits of IN_OUT
|
||||||
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \
|
|
||||||
VPERM P1, P1, IN_OUT, XTMP2; \
|
VPERM P1, P1, IN_OUT, XTMP2; \
|
||||||
VXOR XTMP1, XTMP2, XTMP2; \
|
VXOR XTMP1, XTMP2, XTMP2; \
|
||||||
VPERM P2, P2, XTMP2, XTMP1; \
|
VPERM P2, P2, XTMP2, XTMP1; \
|
||||||
@ -87,8 +86,6 @@ GLOBL rcon<>(SB), RODATA, $160
|
|||||||
// zuc sbox function
|
// zuc sbox function
|
||||||
// parameters:
|
// parameters:
|
||||||
// - x: 128 bits register as sbox input/output data
|
// - x: 128 bits register as sbox input/output data
|
||||||
// - y: 128 bits temp register
|
|
||||||
// - z: 128 bits temp register
|
|
||||||
#define S1_comput(x, y, z) \
|
#define S1_comput(x, y, z) \
|
||||||
VPERMXOR M1H, M1L, x, x; \
|
VPERMXOR M1H, M1L, x, x; \
|
||||||
VSBOX x, x; \
|
VSBOX x, x; \
|
||||||
@ -213,7 +210,7 @@ GLOBL rcon<>(SB), RODATA, $160
|
|||||||
\ // LFSR_S16 = (LFSR_S15++) = W
|
\ // LFSR_S16 = (LFSR_S15++) = W
|
||||||
MOVW W, (((0 + idx) % 16)*4)(addr)
|
MOVW W, (((0 + idx) % 16)*4)(addr)
|
||||||
|
|
||||||
#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
|
#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
|
||||||
MOVWZ (addr), tmpR1 \
|
MOVWZ (addr), tmpR1 \
|
||||||
MOVD $4, tmpR4 \
|
MOVD $4, tmpR4 \
|
||||||
LXVD2X (tmpR4)(addr), V0 \
|
LXVD2X (tmpR4)(addr), V0 \
|
||||||
@ -232,7 +229,7 @@ GLOBL rcon<>(SB), RODATA, $160
|
|||||||
MOVW tmpR3, 56(addr) \
|
MOVW tmpR3, 56(addr) \
|
||||||
MOVW tmpR1, 60(addr)
|
MOVW tmpR1, 60(addr)
|
||||||
|
|
||||||
#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
|
#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
|
||||||
MOVD (addr), tmpR1 \
|
MOVD (addr), tmpR1 \
|
||||||
MOVD $8, tmpR2 \
|
MOVD $8, tmpR2 \
|
||||||
LXVD2X (tmpR2)(addr), V0 \
|
LXVD2X (tmpR2)(addr), V0 \
|
||||||
@ -250,7 +247,7 @@ GLOBL rcon<>(SB), RODATA, $160
|
|||||||
MOVD tmpR3, 48(addr) \
|
MOVD tmpR3, 48(addr) \
|
||||||
MOVD tmpR1, 56(addr)
|
MOVD tmpR1, 56(addr)
|
||||||
|
|
||||||
#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
|
#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
|
||||||
LXVD2X (addr), V0 \
|
LXVD2X (addr), V0 \
|
||||||
MOVD $16, tmpR1 \
|
MOVD $16, tmpR1 \
|
||||||
LXVD2X (tmpR1)(addr), V1 \
|
LXVD2X (tmpR1)(addr), V1 \
|
||||||
@ -264,7 +261,7 @@ GLOBL rcon<>(SB), RODATA, $160
|
|||||||
STXVD2X V3, (tmpR2)(addr) \
|
STXVD2X V3, (tmpR2)(addr) \
|
||||||
STXVD2X V0, (tmpR3)(addr)
|
STXVD2X V0, (tmpR3)(addr)
|
||||||
|
|
||||||
#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
|
#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
|
||||||
LXVD2X (addr), V0 \
|
LXVD2X (addr), V0 \
|
||||||
MOVD $16, tmpR1 \
|
MOVD $16, tmpR1 \
|
||||||
LXVD2X (tmpR1)(addr), V1 \
|
LXVD2X (tmpR1)(addr), V1 \
|
||||||
|
@ -2,6 +2,11 @@
|
|||||||
|
|
||||||
package zuc
|
package zuc
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/emmansun/gmsm/internal/cpuid"
|
||||||
|
"golang.org/x/sys/cpu"
|
||||||
|
)
|
||||||
|
|
||||||
// Generate single keyword, 4 bytes.
|
// Generate single keyword, 4 bytes.
|
||||||
//
|
//
|
||||||
//go:noescape
|
//go:noescape
|
||||||
@ -12,6 +17,9 @@ func genKeywordAsm(s *zucState32) uint32
|
|||||||
//go:noescape
|
//go:noescape
|
||||||
func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
|
func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
|
||||||
|
|
||||||
|
var supportsAES = cpuid.HasAES
|
||||||
|
var useAVX = cpu.X86.HasAVX
|
||||||
|
|
||||||
func genKeyStream(keyStream []uint32, pState *zucState32) {
|
func genKeyStream(keyStream []uint32, pState *zucState32) {
|
||||||
if supportsAES {
|
if supportsAES {
|
||||||
genKeyStreamAsm(keyStream, pState)
|
genKeyStreamAsm(keyStream, pState)
|
||||||
|
@ -42,8 +42,7 @@ TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
|
|||||||
// Reverse data bytes
|
// Reverse data bytes
|
||||||
VLD1 (CX), [XDATA.B16]
|
VLD1 (CX), [XDATA.B16]
|
||||||
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
|
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
|
||||||
VUSHR $4, XDATA.S4, XTMP1.S4
|
VUSHR $4, XDATA.B16, XTMP1.B16
|
||||||
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
|
|
||||||
|
|
||||||
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
|
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
|
||||||
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
|
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
|
||||||
@ -115,8 +114,7 @@ TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
|
|||||||
// Reverse data bytes
|
// Reverse data bytes
|
||||||
VLD1 (CX), [XDATA.B16]
|
VLD1 (CX), [XDATA.B16]
|
||||||
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
|
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
|
||||||
VUSHR $4, XDATA.S4, XTMP1.S4
|
VUSHR $4, XDATA.B16, XTMP1.B16
|
||||||
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
|
|
||||||
|
|
||||||
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
|
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
|
||||||
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
|
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
|
||||||
|
@ -4,12 +4,9 @@ package zuc
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/emmansun/gmsm/internal/cpuid"
|
"github.com/emmansun/gmsm/internal/cpuid"
|
||||||
"golang.org/x/sys/cpu"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var supportsAES = cpuid.HasAES
|
|
||||||
var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD
|
var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD
|
||||||
var useAVX = cpu.X86.HasAVX
|
|
||||||
|
|
||||||
//go:noescape
|
//go:noescape
|
||||||
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
|
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
|
||||||
|
@ -53,8 +53,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
|
|||||||
// Reverse data bytes
|
// Reverse data bytes
|
||||||
VLD1 (CX), [XDATA.B16]
|
VLD1 (CX), [XDATA.B16]
|
||||||
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
|
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
|
||||||
VUSHR $4, XDATA.S4, XTMP1.S4
|
VUSHR $4, XDATA.B16, XTMP1.B16
|
||||||
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
|
|
||||||
|
|
||||||
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
|
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
|
||||||
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
|
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
|
||||||
|
Loading…
x
Reference in New Issue
Block a user