zuc: minor optimization

This commit is contained in:
Sun Yimin 2024-11-08 11:03:43 +08:00 committed by GitHub
parent a33c2ae118
commit b721bed0cc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 31 additions and 41 deletions

View File

@ -17,10 +17,6 @@ DATA Low_nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL Low_nibble_mask<>(SB), RODATA, $16 GLOBL Low_nibble_mask<>(SB), RODATA, $16
DATA High_nibble_mask<>+0x00(SB)/8, $0xF0F0F0F0F0F0F0F0
DATA High_nibble_mask<>+0x08(SB)/8, $0xF0F0F0F0F0F0F0F0
GLOBL High_nibble_mask<>(SB), RODATA, $16
DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09 DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
DATA P1<>+0x08(SB)/8, $0x090305070C000400 DATA P1<>+0x08(SB)/8, $0x090305070C000400
GLOBL P1<>(SB), RODATA, $16 GLOBL P1<>(SB), RODATA, $16
@ -99,10 +95,9 @@ GLOBL flip_mask<>(SB), RODATA, $16
#define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \ #define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
MOVOU IN_OUT, XTMP1 \ MOVOU IN_OUT, XTMP1 \
\ \
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
\
PAND High_nibble_mask<>(SB), XTMP1 \
PSRLQ $4, XTMP1 \ // x1 PSRLQ $4, XTMP1 \ // x1
PAND Low_nibble_mask<>(SB), XTMP1 \
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
\ \
MOVOU P1<>(SB), XTMP2 \ MOVOU P1<>(SB), XTMP2 \
PSHUFB IN_OUT, XTMP2 \ // P1[x2] PSHUFB IN_OUT, XTMP2 \ // P1[x2]
@ -124,16 +119,15 @@ GLOBL flip_mask<>(SB), RODATA, $16
// for high and low nibble of each input byte, SSE version. // for high and low nibble of each input byte, SSE version.
#define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \ #define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
\ // Get low nibble of input data \ // Get low nibble of input data
MOVOU Low_nibble_mask<>(SB), XTMP \ MOVOU XIN, XTMP \
PAND XIN, XTMP \ PAND Low_nibble_mask<>(SB), XTMP \
\ // Get low nibble of output \ // Get low nibble of output
PSHUFB XTMP, XLO \ PSHUFB XTMP, XLO \
\ // Get high nibble of input data \ // Get high nibble of input data
MOVOU High_nibble_mask<>(SB), XTMP \ PSRLQ $4, XIN \
PAND XIN, XTMP \ PAND Low_nibble_mask<>(SB), XIN \
PSRLQ $4, XTMP \
\ // Get high nibble of output \ // Get high nibble of output
PSHUFB XTMP, XHI_OUT \ PSHUFB XIN, XHI_OUT \
\ // XOR high and low nibbles to get full bytes \ // XOR high and low nibbles to get full bytes
PXOR XLO, XHI_OUT PXOR XLO, XHI_OUT
@ -146,8 +140,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
PSHUFB Shuf_mask<>(SB), XTMP2 \ PSHUFB Shuf_mask<>(SB), XTMP2 \
AESENCLAST Cancel_aes<>(SB), XTMP2 \ AESENCLAST Cancel_aes<>(SB), XTMP2 \
\ \
MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \ MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \
MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \ MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3) MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3)
// Rotate left 5 bits in each byte, within an XMM register, AVX version. // Rotate left 5 bits in each byte, within an XMM register, AVX version.
@ -160,9 +154,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
// Compute 16 S0 box values from 16 bytes, AVX version. // Compute 16 S0 box values from 16 bytes, AVX version.
#define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \ #define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \ VPSRLQ $4, IN_OUT, XTMP1 \ // x1
VPSRLQ $4, XTMP1, XTMP1 \ // x1 VPAND Low_nibble_mask<>(SB), XTMP1, XTMP1 \
\
VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2 VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2
\ \
VMOVDQU P1<>(SB), XTMP2 \ VMOVDQU P1<>(SB), XTMP2 \
@ -189,8 +182,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
\ // Get low nibble of output \ // Get low nibble of output
VPSHUFB XTMP, XLO, XLO \ VPSHUFB XTMP, XLO, XLO \
\ // Get high nibble of input data \ // Get high nibble of input data
VPAND High_nibble_mask<>(SB), XIN, XTMP \ VPSRLQ $4, XIN, XTMP \
VPSRLQ $4, XTMP, XTMP \ VPAND Low_nibble_mask<>(SB), XTMP, XTMP \
\ // Get high nibble of output \ // Get high nibble of output
VPSHUFB XTMP, XHI_OUT, XHI_OUT \ VPSHUFB XTMP, XHI_OUT, XHI_OUT \
\ // XOR high and low nibbles to get full bytes \ // XOR high and low nibbles to get full bytes

View File

@ -94,9 +94,7 @@ GLOBL mask_S01<>(SB), RODATA, $32
VORR XTMP0.B16, XDATA.B16, XDATA.B16 VORR XTMP0.B16, XDATA.B16, XDATA.B16
#define S0_comput(IN_OUT, XTMP1, XTMP2) \ #define S0_comput(IN_OUT, XTMP1, XTMP2) \
VUSHR $4, IN_OUT.S4, XTMP1.S4 \ VUSHR $4, IN_OUT.B16, XTMP1.B16 \
VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
\
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \ VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
\ \
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \ VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \

View File

@ -65,9 +65,8 @@ GLOBL rcon<>(SB), RODATA, $160
LXVD2X (R4)(R5), S1_MASK LXVD2X (R4)(R5), S1_MASK
#define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \ #define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \
VSRW IN_OUT, V_FOUR, XTMP1; \ VSRB IN_OUT, V_FOUR, XTMP1; \ // XTMP1 = hi 4 bits of IN_OUT
VAND XTMP1, NIBBLE_MASK, XTMP1; \ VAND IN_OUT, NIBBLE_MASK, IN_OUT; \ // low 4 bits of IN_OUT
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \
VPERM P1, P1, IN_OUT, XTMP2; \ VPERM P1, P1, IN_OUT, XTMP2; \
VXOR XTMP1, XTMP2, XTMP2; \ VXOR XTMP1, XTMP2, XTMP2; \
VPERM P2, P2, XTMP2, XTMP1; \ VPERM P2, P2, XTMP2, XTMP1; \
@ -87,8 +86,6 @@ GLOBL rcon<>(SB), RODATA, $160
// zuc sbox function // zuc sbox function
// parameters: // parameters:
// - x: 128 bits register as sbox input/output data // - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define S1_comput(x, y, z) \ #define S1_comput(x, y, z) \
VPERMXOR M1H, M1L, x, x; \ VPERMXOR M1H, M1L, x, x; \
VSBOX x, x; \ VSBOX x, x; \
@ -213,7 +210,7 @@ GLOBL rcon<>(SB), RODATA, $160
\ // LFSR_S16 = (LFSR_S15++) = W \ // LFSR_S16 = (LFSR_S15++) = W
MOVW W, (((0 + idx) % 16)*4)(addr) MOVW W, (((0 + idx) % 16)*4)(addr)
#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \ #define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
MOVWZ (addr), tmpR1 \ MOVWZ (addr), tmpR1 \
MOVD $4, tmpR4 \ MOVD $4, tmpR4 \
LXVD2X (tmpR4)(addr), V0 \ LXVD2X (tmpR4)(addr), V0 \
@ -232,7 +229,7 @@ GLOBL rcon<>(SB), RODATA, $160
MOVW tmpR3, 56(addr) \ MOVW tmpR3, 56(addr) \
MOVW tmpR1, 60(addr) MOVW tmpR1, 60(addr)
#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \ #define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
MOVD (addr), tmpR1 \ MOVD (addr), tmpR1 \
MOVD $8, tmpR2 \ MOVD $8, tmpR2 \
LXVD2X (tmpR2)(addr), V0 \ LXVD2X (tmpR2)(addr), V0 \
@ -250,7 +247,7 @@ GLOBL rcon<>(SB), RODATA, $160
MOVD tmpR3, 48(addr) \ MOVD tmpR3, 48(addr) \
MOVD tmpR1, 56(addr) MOVD tmpR1, 56(addr)
#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \ #define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
LXVD2X (addr), V0 \ LXVD2X (addr), V0 \
MOVD $16, tmpR1 \ MOVD $16, tmpR1 \
LXVD2X (tmpR1)(addr), V1 \ LXVD2X (tmpR1)(addr), V1 \
@ -264,7 +261,7 @@ GLOBL rcon<>(SB), RODATA, $160
STXVD2X V3, (tmpR2)(addr) \ STXVD2X V3, (tmpR2)(addr) \
STXVD2X V0, (tmpR3)(addr) STXVD2X V0, (tmpR3)(addr)
#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \ #define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
LXVD2X (addr), V0 \ LXVD2X (addr), V0 \
MOVD $16, tmpR1 \ MOVD $16, tmpR1 \
LXVD2X (tmpR1)(addr), V1 \ LXVD2X (tmpR1)(addr), V1 \

View File

@ -2,6 +2,11 @@
package zuc package zuc
import (
"github.com/emmansun/gmsm/internal/cpuid"
"golang.org/x/sys/cpu"
)
// Generate single keyword, 4 bytes. // Generate single keyword, 4 bytes.
// //
//go:noescape //go:noescape
@ -12,6 +17,9 @@ func genKeywordAsm(s *zucState32) uint32
//go:noescape //go:noescape
func genKeyStreamAsm(keyStream []uint32, pState *zucState32) func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
var supportsAES = cpuid.HasAES
var useAVX = cpu.X86.HasAVX
func genKeyStream(keyStream []uint32, pState *zucState32) { func genKeyStream(keyStream []uint32, pState *zucState32) {
if supportsAES { if supportsAES {
genKeyStreamAsm(keyStream, pState) genKeyStreamAsm(keyStream, pState)

View File

@ -42,8 +42,7 @@ TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
// Reverse data bytes // Reverse data bytes
VLD1 (CX), [XDATA.B16] VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16 VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4 VUSHR $4, XDATA.B16, XTMP1.B16
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16 VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16 VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
@ -115,8 +114,7 @@ TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
// Reverse data bytes // Reverse data bytes
VLD1 (CX), [XDATA.B16] VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16 VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4 VUSHR $4, XDATA.B16, XTMP1.B16
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16 VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16 VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16

View File

@ -4,12 +4,9 @@ package zuc
import ( import (
"github.com/emmansun/gmsm/internal/cpuid" "github.com/emmansun/gmsm/internal/cpuid"
"golang.org/x/sys/cpu"
) )
var supportsAES = cpuid.HasAES
var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD
var useAVX = cpu.X86.HasAVX
//go:noescape //go:noescape
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int) func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)

View File

@ -53,8 +53,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
// Reverse data bytes // Reverse data bytes
VLD1 (CX), [XDATA.B16] VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16 VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4 VUSHR $4, XDATA.B16, XTMP1.B16
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16 VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16 VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16