zuc: minor optimization

This commit is contained in:
Sun Yimin 2024-11-08 11:03:43 +08:00 committed by GitHub
parent a33c2ae118
commit b721bed0cc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 31 additions and 41 deletions

View File

@ -17,10 +17,6 @@ DATA Low_nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL Low_nibble_mask<>(SB), RODATA, $16
DATA High_nibble_mask<>+0x00(SB)/8, $0xF0F0F0F0F0F0F0F0
DATA High_nibble_mask<>+0x08(SB)/8, $0xF0F0F0F0F0F0F0F0
GLOBL High_nibble_mask<>(SB), RODATA, $16
DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
DATA P1<>+0x08(SB)/8, $0x090305070C000400
GLOBL P1<>(SB), RODATA, $16
@ -99,10 +95,9 @@ GLOBL flip_mask<>(SB), RODATA, $16
#define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
MOVOU IN_OUT, XTMP1 \
\
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
\
PAND High_nibble_mask<>(SB), XTMP1 \
PSRLQ $4, XTMP1 \ // x1
PAND Low_nibble_mask<>(SB), XTMP1 \
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
\
MOVOU P1<>(SB), XTMP2 \
PSHUFB IN_OUT, XTMP2 \ // P1[x2]
@ -124,16 +119,15 @@ GLOBL flip_mask<>(SB), RODATA, $16
// for high and low nibble of each input byte, SSE version.
#define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
\ // Get low nibble of input data
MOVOU Low_nibble_mask<>(SB), XTMP \
PAND XIN, XTMP \
MOVOU XIN, XTMP \
PAND Low_nibble_mask<>(SB), XTMP \
\ // Get low nibble of output
PSHUFB XTMP, XLO \
\ // Get high nibble of input data
MOVOU High_nibble_mask<>(SB), XTMP \
PAND XIN, XTMP \
PSRLQ $4, XTMP \
PSRLQ $4, XIN \
PAND Low_nibble_mask<>(SB), XIN \
\ // Get high nibble of output
PSHUFB XTMP, XHI_OUT \
PSHUFB XIN, XHI_OUT \
\ // XOR high and low nibbles to get full bytes
PXOR XLO, XHI_OUT
@ -146,8 +140,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
PSHUFB Shuf_mask<>(SB), XTMP2 \
AESENCLAST Cancel_aes<>(SB), XTMP2 \
\
MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \
MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \
MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3)
// Rotate left 5 bits in each byte, within an XMM register, AVX version.
@ -160,9 +154,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
// Compute 16 S0 box values from 16 bytes, AVX version.
#define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \
VPSRLQ $4, XTMP1, XTMP1 \ // x1
\
VPSRLQ $4, IN_OUT, XTMP1 \ // x1
VPAND Low_nibble_mask<>(SB), XTMP1, XTMP1 \
VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2
\
VMOVDQU P1<>(SB), XTMP2 \
@ -189,8 +182,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
\ // Get low nibble of output
VPSHUFB XTMP, XLO, XLO \
\ // Get high nibble of input data
VPAND High_nibble_mask<>(SB), XIN, XTMP \
VPSRLQ $4, XTMP, XTMP \
VPSRLQ $4, XIN, XTMP \
VPAND Low_nibble_mask<>(SB), XTMP, XTMP \
\ // Get high nibble of output
VPSHUFB XTMP, XHI_OUT, XHI_OUT \
\ // XOR high and low nibbles to get full bytes

View File

@ -94,9 +94,7 @@ GLOBL mask_S01<>(SB), RODATA, $32
VORR XTMP0.B16, XDATA.B16, XDATA.B16
#define S0_comput(IN_OUT, XTMP1, XTMP2) \
VUSHR $4, IN_OUT.S4, XTMP1.S4 \
VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
\
VUSHR $4, IN_OUT.B16, XTMP1.B16 \
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
\
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \

View File

@ -65,9 +65,8 @@ GLOBL rcon<>(SB), RODATA, $160
LXVD2X (R4)(R5), S1_MASK
#define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \
VSRW IN_OUT, V_FOUR, XTMP1; \
VAND XTMP1, NIBBLE_MASK, XTMP1; \
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \
VSRB IN_OUT, V_FOUR, XTMP1; \ // XTMP1 = hi 4 bits of IN_OUT
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \ // low 4 bits of IN_OUT
VPERM P1, P1, IN_OUT, XTMP2; \
VXOR XTMP1, XTMP2, XTMP2; \
VPERM P2, P2, XTMP2, XTMP1; \
@ -87,8 +86,6 @@ GLOBL rcon<>(SB), RODATA, $160
// zuc sbox function
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define S1_comput(x, y, z) \
VPERMXOR M1H, M1L, x, x; \
VSBOX x, x; \
@ -213,7 +210,7 @@ GLOBL rcon<>(SB), RODATA, $160
\ // LFSR_S16 = (LFSR_S15++) = W
MOVW W, (((0 + idx) % 16)*4)(addr)
#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
MOVWZ (addr), tmpR1 \
MOVD $4, tmpR4 \
LXVD2X (tmpR4)(addr), V0 \
@ -232,7 +229,7 @@ GLOBL rcon<>(SB), RODATA, $160
MOVW tmpR3, 56(addr) \
MOVW tmpR1, 60(addr)
#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
MOVD (addr), tmpR1 \
MOVD $8, tmpR2 \
LXVD2X (tmpR2)(addr), V0 \
@ -250,7 +247,7 @@ GLOBL rcon<>(SB), RODATA, $160
MOVD tmpR3, 48(addr) \
MOVD tmpR1, 56(addr)
#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
LXVD2X (addr), V0 \
MOVD $16, tmpR1 \
LXVD2X (tmpR1)(addr), V1 \
@ -264,7 +261,7 @@ GLOBL rcon<>(SB), RODATA, $160
STXVD2X V3, (tmpR2)(addr) \
STXVD2X V0, (tmpR3)(addr)
#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
LXVD2X (addr), V0 \
MOVD $16, tmpR1 \
LXVD2X (tmpR1)(addr), V1 \

View File

@ -2,6 +2,11 @@
package zuc
import (
"github.com/emmansun/gmsm/internal/cpuid"
"golang.org/x/sys/cpu"
)
// Generate single keyword, 4 bytes.
//
//go:noescape
@ -12,6 +17,9 @@ func genKeywordAsm(s *zucState32) uint32
//go:noescape
func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
var supportsAES = cpuid.HasAES
var useAVX = cpu.X86.HasAVX
func genKeyStream(keyStream []uint32, pState *zucState32) {
if supportsAES {
genKeyStreamAsm(keyStream, pState)

View File

@ -42,8 +42,7 @@ TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
// Reverse data bytes
VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VUSHR $4, XDATA.B16, XTMP1.B16
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
@ -115,8 +114,7 @@ TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
// Reverse data bytes
VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VUSHR $4, XDATA.B16, XTMP1.B16
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16

View File

@ -4,12 +4,9 @@ package zuc
import (
"github.com/emmansun/gmsm/internal/cpuid"
"golang.org/x/sys/cpu"
)
var supportsAES = cpuid.HasAES
var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD
var useAVX = cpu.X86.HasAVX
//go:noescape
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)

View File

@ -53,8 +53,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
// Reverse data bytes
VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VUSHR $4, XDATA.B16, XTMP1.B16
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16