diff --git a/zuc/asm_amd64.s b/zuc/asm_amd64.s
index 8b7c0f0..6f1c269 100644
--- a/zuc/asm_amd64.s
+++ b/zuc/asm_amd64.s
@@ -17,10 +17,6 @@ DATA Low_nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
 DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
 GLOBL Low_nibble_mask<>(SB), RODATA, $16
 
-DATA High_nibble_mask<>+0x00(SB)/8, $0xF0F0F0F0F0F0F0F0
-DATA High_nibble_mask<>+0x08(SB)/8, $0xF0F0F0F0F0F0F0F0
-GLOBL High_nibble_mask<>(SB), RODATA, $16
-
 DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
 DATA P1<>+0x08(SB)/8, $0x090305070C000400
 GLOBL P1<>(SB), RODATA, $16
@@ -99,10 +95,9 @@ GLOBL flip_mask<>(SB), RODATA, $16
 #define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
 	MOVOU IN_OUT, XTMP1                \
 	\
-	PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
-	\
-	PAND High_nibble_mask<>(SB), XTMP1 \
 	PSRLQ $4, XTMP1                    \ // x1
+	PAND Low_nibble_mask<>(SB), XTMP1  \
+	PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
 	\
 	MOVOU P1<>(SB), XTMP2              \
 	PSHUFB IN_OUT, XTMP2               \ // P1[x2]
@@ -124,16 +119,15 @@ GLOBL flip_mask<>(SB), RODATA, $16
 // for high and low nibble of each input byte, SSE version.
 #define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
 	\ // Get low nibble of input data
-	MOVOU Low_nibble_mask<>(SB), XTMP  \
-	PAND XIN, XTMP                     \
+	MOVOU XIN, XTMP                    \
+	PAND Low_nibble_mask<>(SB), XTMP   \
 	\ // Get low nibble of output
 	PSHUFB XTMP, XLO                   \
 	\ // Get high nibble of input data
-	MOVOU High_nibble_mask<>(SB), XTMP \
-	PAND XIN, XTMP                     \
-	PSRLQ $4, XTMP                     \
+	PSRLQ $4, XIN                      \
+	PAND Low_nibble_mask<>(SB), XIN    \
 	\ // Get high nibble of output
-	PSHUFB XTMP, XHI_OUT               \
+	PSHUFB XIN, XHI_OUT                \
 	\ // XOR high and low nibbles to get full bytes
 	PXOR XLO, XHI_OUT
 
@@ -146,8 +140,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
 	PSHUFB Shuf_mask<>(SB), XTMP2      \
 	AESENCLAST Cancel_aes<>(SB), XTMP2 \
 	\
-	MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \
-	MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
+	MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1    \
+	MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
 	MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3)
 
 // Rotate left 5 bits in each byte, within an XMM register, AVX version.
@@ -160,9 +154,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
 
 // Compute 16 S0 box values from 16 bytes, AVX version.
 #define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
-	VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \
-	VPSRLQ $4, XTMP1, XTMP1                     \ // x1
-	\
+	VPSRLQ $4, IN_OUT, XTMP1                    \ // x1
+	VPAND Low_nibble_mask<>(SB), XTMP1, XTMP1   \
 	VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2
 	\
 	VMOVDQU P1<>(SB), XTMP2                     \
@@ -189,8 +182,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
 	\ // Get low nibble of output
 	VPSHUFB XTMP, XLO, XLO                  \
 	\ // Get high nibble of input data
-	VPAND High_nibble_mask<>(SB), XIN, XTMP \
-	VPSRLQ $4, XTMP, XTMP                   \
+	VPSRLQ $4, XIN, XTMP                    \
+	VPAND Low_nibble_mask<>(SB), XTMP, XTMP \
 	\ // Get high nibble of output
 	VPSHUFB XTMP, XHI_OUT, XHI_OUT          \
 	\ // XOR high and low nibbles to get full bytes
diff --git a/zuc/asm_arm64.s b/zuc/asm_arm64.s
index b48b387..b7a2a8a 100644
--- a/zuc/asm_arm64.s
+++ b/zuc/asm_arm64.s
@@ -94,9 +94,7 @@ GLOBL mask_S01<>(SB), RODATA, $32
 	VORR XTMP0.B16, XDATA.B16, XDATA.B16
 
 #define S0_comput(IN_OUT, XTMP1, XTMP2) \
-	VUSHR $4, IN_OUT.S4, XTMP1.S4                \
-	VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16   \
-	\
+	VUSHR $4, IN_OUT.B16, XTMP1.B16              \
 	VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
 	\
 	VTBL IN_OUT.B16, [P1.B16], XTMP2.B16         \
diff --git a/zuc/asm_ppc64x.s b/zuc/asm_ppc64x.s
index ff252ff..188d64a 100644
--- a/zuc/asm_ppc64x.s
+++ b/zuc/asm_ppc64x.s
@@ -65,9 +65,8 @@ GLOBL rcon<>(SB), RODATA, $160
 	LXVD2X (R4)(R5), S1_MASK
 
 #define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \
-	VSRW IN_OUT, V_FOUR, XTMP1;       \
-	VAND XTMP1, NIBBLE_MASK, XTMP1;   \
-	VAND IN_OUT, NIBBLE_MASK, IN_OUT; \
+	VSRB IN_OUT, V_FOUR, XTMP1;       \ // XTMP1 = hi 4 bits of IN_OUT
+	VAND IN_OUT, NIBBLE_MASK, IN_OUT; \ // low 4 bits of IN_OUT
 	VPERM P1, P1, IN_OUT, XTMP2;      \
 	VXOR XTMP1, XTMP2, XTMP2;         \
 	VPERM P2, P2, XTMP2, XTMP1;       \
@@ -87,8 +86,6 @@ GLOBL rcon<>(SB), RODATA, $160
 // zuc sbox function
 // parameters:
 // - x: 128 bits register as sbox input/output data
-// - y: 128 bits temp register
-// - z: 128 bits temp register
 #define S1_comput(x, y, z) \
 	VPERMXOR M1H, M1L, x, x; \
 	VSBOX x, x;              \
@@ -213,7 +210,7 @@ GLOBL rcon<>(SB), RODATA, $160
 	\ // LFSR_S16 = (LFSR_S15++) = W
 	MOVW W, (((0 + idx) % 16)*4)(addr)
 
-#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
+#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
 	MOVWZ (addr), tmpR1      \
 	MOVD $4, tmpR4           \
 	LXVD2X (tmpR4)(addr), V0 \
@@ -232,7 +229,7 @@ GLOBL rcon<>(SB), RODATA, $160
 	MOVW tmpR3, 56(addr) \
 	MOVW tmpR1, 60(addr)
 
-#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
+#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
 	MOVD (addr), tmpR1       \
 	MOVD $8, tmpR2           \
 	LXVD2X (tmpR2)(addr), V0 \
@@ -250,7 +247,7 @@ GLOBL rcon<>(SB), RODATA, $160
 	MOVD tmpR3, 48(addr) \
 	MOVD tmpR1, 56(addr)
 
-#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
+#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
 	LXVD2X (addr), V0        \
 	MOVD $16, tmpR1          \
 	LXVD2X (tmpR1)(addr), V1 \
@@ -264,7 +261,7 @@ GLOBL rcon<>(SB), RODATA, $160
 	STXVD2X V3, (tmpR2)(addr) \
 	STXVD2X V0, (tmpR3)(addr)
 
-#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
+#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
 	LXVD2X (addr), V0        \
 	MOVD $16, tmpR1          \
 	LXVD2X (tmpR1)(addr), V1 \
diff --git a/zuc/core_asm.go b/zuc/core_asm.go
index ee42b5f..37b30aa 100644
--- a/zuc/core_asm.go
+++ b/zuc/core_asm.go
@@ -2,6 +2,11 @@
 package zuc
 
+import (
+	"github.com/emmansun/gmsm/internal/cpuid"
+	"golang.org/x/sys/cpu"
+)
+
 // Generate single keyword, 4 bytes.
 //
 //go:noescape
 func genKeywordAsm(s *zucState32) uint32
@@ -12,6 +17,9 @@
 //go:noescape
 func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
 
+var supportsAES = cpuid.HasAES
+var useAVX = cpu.X86.HasAVX
+
 func genKeyStream(keyStream []uint32, pState *zucState32) {
 	if supportsAES {
 		genKeyStreamAsm(keyStream, pState)
diff --git a/zuc/eia256_asm_arm64.s b/zuc/eia256_asm_arm64.s
index 592bb84..e67e9f3 100644
--- a/zuc/eia256_asm_arm64.s
+++ b/zuc/eia256_asm_arm64.s
@@ -42,8 +42,7 @@ TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
 	// Reverse data bytes
 	VLD1 (CX), [XDATA.B16]
 	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
-	VUSHR $4, XDATA.S4, XTMP1.S4
-	VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
+	VUSHR $4, XDATA.B16, XTMP1.B16
 
 	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
 	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
@@ -115,8 +114,7 @@ TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
 	// Reverse data bytes
 	VLD1 (CX), [XDATA.B16]
 	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
-	VUSHR $4, XDATA.S4, XTMP1.S4
-	VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
+	VUSHR $4, XDATA.B16, XTMP1.B16
 
 	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
 	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
diff --git a/zuc/eia_asm.go b/zuc/eia_asm.go
index 04733d1..df321dc 100644
--- a/zuc/eia_asm.go
+++ b/zuc/eia_asm.go
@@ -4,12 +4,9 @@ package zuc
 
 import (
 	"github.com/emmansun/gmsm/internal/cpuid"
-	"golang.org/x/sys/cpu"
 )
 
-var supportsAES = cpuid.HasAES
 var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD
-var useAVX = cpu.X86.HasAVX
 
 //go:noescape
 func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
diff --git a/zuc/eia_asm_arm64.s b/zuc/eia_asm_arm64.s
index 2e0d01a..23f8190 100644
--- a/zuc/eia_asm_arm64.s
+++ b/zuc/eia_asm_arm64.s
@@ -53,8 +53,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 	// Reverse data bytes
 	VLD1 (CX), [XDATA.B16]
 	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
-	VUSHR $4, XDATA.S4, XTMP1.S4
-	VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
+	VUSHR $4, XDATA.B16, XTMP1.B16
 
 	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
 	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
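
Note on the patch: the S-box and bit-reversal hunks all rely on the same nibble-extraction identity: for any byte x, `(x & 0xF0) >> 4 == (x >> 4) & 0x0F`, so shifting first lets the low-nibble mask be reused and the dedicated `High_nibble_mask` constant be deleted. On amd64 the `PAND Low_nibble_mask` after `PSRLQ $4` is still required, because PSRLQ shifts whole 64-bit lanes and bits leak across byte boundaries; on arm64 (`VUSHR` on `.B16` instead of `.S4`) and ppc64x (`VSRB` instead of `VSRW`) the shift is byte-granular, zeros shift in, and the mask instruction disappears entirely. The Go hunks simply move the `supportsAES`/`useAVX` feature flags from eia_asm.go into core_asm.go, next to the keystream code that consults them. Below is a minimal, self-contained Go sketch of the identity; the helper names are illustrative and not part of the zuc package:

```go
package main

import "fmt"

// Old amd64 sequence: mask the high nibble first, then shift
// (PAND High_nibble_mask + PSRLQ $4).
func highNibbleMaskThenShift(x byte) byte { return (x & 0xF0) >> 4 }

// New amd64 sequence: shift first, then reuse the low-nibble mask
// (PSRLQ $4 + PAND Low_nibble_mask). The mask is still needed because
// PSRLQ shifts 64-bit lanes, leaking bits across byte boundaries.
func highNibbleShiftThenMask(x byte) byte { return (x >> 4) & 0x0F }

// Byte-granular shift (VUSHR on .B16 for arm64, VSRB for ppc64x):
// zeros shift in, so no mask is needed at all.
func highNibbleByteShift(x byte) byte { return x >> 4 }

func main() {
	for i := 0; i < 256; i++ {
		x := byte(i)
		a, b, c := highNibbleMaskThenShift(x), highNibbleShiftThenMask(x), highNibbleByteShift(x)
		if a != b || a != c {
			fmt.Printf("mismatch at %#02x: %d %d %d\n", x, a, b, c)
			return
		}
	}
	fmt.Println("all three high-nibble extractions agree for every byte value")
}
```

Exhaustively checking all 256 byte values confirms the three sequences are interchangeable, which is why each hunk can drop either the high-nibble mask constant (amd64) or the masking instruction itself (arm64, ppc64x) without changing results.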