From f47051ed862fb7d07b2f0e49e8825badb7cfd9f4 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Tue, 24 Sep 2024 08:09:57 +0800 Subject: [PATCH] cipher: xts reduce duplicated code --- README.md | 2 +- cipher/xts_amd64.s | 84 ++++++++++++++++-------------------------- cipher/xts_arm64.s | 89 ++++++++++++++++++--------------------------- cipher/xts_ppc64x.s | 4 +- 4 files changed, 70 insertions(+), 109 deletions(-) diff --git a/README.md b/README.md index da54015..4d96be9 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Go语言商用密码软件,简称**GMSM**,一个安全、高性能、易于 - **SM3** - SM3密码杂凑算法实现。**amd64**下分别针对**AVX2+BMI2、AVX、SSE2+SSSE3**做了消息扩展部分的SIMD实现; **arm64**下使用NEON指令做了消息扩展部分的SIMD实现,同时也提供了基于**A64扩展密码指令**的汇编实现;**s390x**和**ppc64x**通过向量指令做了消息扩展部分的优化实现。您也可以参考[SM3性能优化](https://github.com/emmansun/gmsm/wiki/SM3%E6%80%A7%E8%83%BD%E4%BC%98%E5%8C%96)及相关Wiki和代码,以获得更多实现细节。 -- **SM4** - SM4分组密码算法实现。**amd64**下使用**AES**指令加上**AVX2、AVX、SSE2+SSSE3**实现了比较好的性能。**arm64**下使用**AES**指令加上NEON指令实现了比较好的性能,同时也提供了基于**A64扩展密码指令**的汇编实现。**ppc64x**下使用**VCIPHERLAST**指令加上向量指令进行了并行优化。针对**ECB/CBC/GCM/XTS**加密模式,做了和SM4分组密码算法的融合汇编优化实现。您也可以参考[SM4性能优化](https://github.com/emmansun/gmsm/wiki/SM4%E6%80%A7%E8%83%BD%E4%BC%98%E5%8C%96)及相关Wiki和代码,以获得更多实现细节。 +- **SM4** - SM4分组密码算法实现。**amd64**下使用**AES**指令加上**AVX2、AVX、SSE2+SSSE3**实现了比较好的性能。**arm64**下使用**AES**指令加上NEON指令实现了比较好的性能,同时也提供了基于**A64扩展密码指令**的汇编实现。**ppc64x**下使用**vsbox**指令加上向量指令进行了并行优化。针对**ECB/CBC/GCM/XTS**加密模式,做了和SM4分组密码算法的融合汇编优化实现。您也可以参考[SM4性能优化](https://github.com/emmansun/gmsm/wiki/SM4%E6%80%A7%E8%83%BD%E4%BC%98%E5%8C%96)及相关Wiki和代码,以获得更多实现细节。 - **SM9** - SM9标识密码算法实现。基础的素域、扩域、椭圆曲线运算以及双线性对运算位于[bn256](https://github.com/emmansun/gmsm/tree/main/sm9/bn256)包中,分别对**amd64**、**arm64**架构做了优化实现。您也可以参考[SM9实现及优化](https://github.com/emmansun/gmsm/wiki/SM9%E5%AE%9E%E7%8E%B0%E5%8F%8A%E4%BC%98%E5%8C%96)及相关讨论和代码,以获得更多实现细节。SM9包实现了SM9标识密码算法的密钥生成、数字签名算法、密钥封装机制和公钥加密算法、密钥交换协议。 diff --git a/cipher/xts_amd64.s b/cipher/xts_amd64.s index 0ef8b1f..e5707a1 100644 --- a/cipher/xts_amd64.s +++ b/cipher/xts_amd64.s @@ -22,6 +22,34 @@ GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16 #define T0 X3 #define T1 X4 +#define doubleTweak(B0, POLY, T0, T1) \ + \ // B0 * 2 + PSHUFD $0xff, B0, T0 \ + MOVOU B0, T1 \ + PSRAL $31, T0 \ // T0 for reduction + PAND POLY, T0 \ + PSRLL $31, T1 \ + PSLLDQ $4, T1 \ + PSLLL $1, B0 \ + PXOR T0, B0 \ + PXOR T1, B0 + +#define gbDoubleTweak(B0, BSWAP, POLY, T0, T1) \ + PSHUFB BSWAP, B0 \ + \ // B0 * 2 + MOVOU B0, T0 \ + PSHUFD $0, B0, T1 \ + PSRLQ $1, B0 \ + PSLLQ $63, T0 \ + PSRLDQ $8, T0 \ + POR T0, B0 \ + \ // reduction + PSLLL $31, T1 \ + PSRAL $31, T1 \ + PAND POLY, T1 \ + PXOR T1, B0 \ + PSHUFB BSWAP, B0 + // func mul2(tweak *[blockSize]byte, isGB bool) TEXT ·mul2(SB),NOSPLIT,$0 MOVQ tweak+0(FP), DI @@ -34,16 +62,7 @@ TEXT ·mul2(SB),NOSPLIT,$0 MOVOU gcmPoly<>(SB), POLY - // B0 * 2 - PSHUFD $0xff, B0, T0 - MOVOU B0, T1 - PSRAL $31, T0 // T0 for reduction - PAND POLY, T0 - PSRLL $31, T1 - PSLLDQ $4, T1 - PSLLL $1, B0 - PXOR T0, B0 - PXOR T1, B0 + doubleTweak(B0, POLY, T0, T1) MOVOU B0, (0*16)(DI) @@ -53,23 +72,8 @@ gb_alg: MOVOU bswapMask<>(SB), BSWAP MOVOU gbGcmPoly<>(SB), POLY - PSHUFB BSWAP, B0 + gbDoubleTweak(B0, BSWAP, POLY, T0, T1) - // B0 * 2 - MOVOU B0, T0 - PSHUFD $0, B0, T1 - PSRLQ $1, B0 - PSLLQ $63, T0 - PSRLDQ $8, T0 - POR T0, B0 - - // reduction - PSLLL $31, T1 - PSRAL $31, T1 - PAND POLY, T1 - PXOR T1, B0 - - PSHUFB BSWAP, B0 MOVOU B0, (0*16)(DI) RET @@ -94,16 +98,7 @@ loop: MOVOU B0, (0*16)(AX) LEAQ 16(AX), AX - // B0 * 2 - PSHUFD $0xff, B0, T0 - MOVOU B0, T1 - PSRAL $31, T0 // T0 
for reduction - PAND POLY, T0 - PSRLL $31, T1 - PSLLDQ $4, T1 - PSLLL $1, B0 - PXOR T0, B0 - PXOR T1, B0 + doubleTweak(B0, POLY, T0, T1) ADDQ $1, DX CMPQ DX, BX @@ -120,23 +115,8 @@ gb_loop: MOVOU B0, (0*16)(AX) LEAQ 16(AX), AX - PSHUFB BSWAP, B0 + gbDoubleTweak(B0, BSWAP, POLY, T0, T1) - // B0 * 2 - MOVOU B0, T0 - PSHUFD $0, B0, T1 - PSRLQ $1, B0 - PSLLQ $63, T0 - PSRLDQ $8, T0 - POR T0, B0 - - // reduction - PSLLL $31, T1 - PSRAL $31, T1 - PAND POLY, T1 - PXOR T1, B0 - - PSHUFB BSWAP, B0 ADDQ $1, DX CMPQ DX, BX JB gb_loop diff --git a/cipher/xts_arm64.s b/cipher/xts_arm64.s index 057f749..cb398e7 100644 --- a/cipher/xts_arm64.s +++ b/cipher/xts_arm64.s @@ -13,6 +13,37 @@ #define GB R1 #define I R2 +#define doubleTweak(B0, ZERO, POLY, I, T1, T2) \ + VMOV B0.D[1], I \ + ASR $63, I \ + VMOV I, T1.D2 \ + VAND POLY.B16, T1.B16, T1.B16 \ + \ + VUSHR $63, B0.D2, T2.D2 \ + VEXT $8, T2.B16, ZERO.B16, T2.B16 \ + VSHL $1, B0.D2, B0.D2 \ + VEOR T1.B16, B0.B16, B0.B16 \ + VEOR T2.B16, B0.B16, B0.B16 + +#define gbDoubleTweak(B0, ZERO, POLY, I, T1, T2) \ + VREV64 B0.B16, B0.B16 \ + VEXT $8, B0.B16, B0.B16, B0.B16 \ + \ + VMOV B0.D[0], I \ + LSL $63, I \ + ASR $63, I \ + VMOV I, T1.D2 \ + VAND POLY.B16, T1.B16, T1.B16 \ + \ + VSHL $63, B0.D2, T2.D2 \ + VEXT $8, ZERO.B16, T2.B16, T2.B16 \ + VUSHR $1, B0.D2, B0.D2 \ + VEOR T1.B16, B0.B16, B0.B16 \ + VEOR T2.B16, B0.B16, B0.B16 \ + \ + VEXT $8, B0.B16, B0.B16, B0.B16 \ + VREV64 B0.B16, B0.B16 + // func mul2(tweak *[blockSize]byte, isGB bool) TEXT ·mul2(SB),NOSPLIT,$0 MOVD tweak+0(FP), TW @@ -29,16 +60,7 @@ TEXT ·mul2(SB),NOSPLIT,$0 MOVD $0x87, I VMOV I, POLY.D[0] - VMOV B0.D[1], I - ASR $63, I - VMOV I, T1.D2 - VAND POLY.B16, T1.B16, T1.B16 - - VUSHR $63, B0.D2, T2.D2 - VEXT $8, T2.B16, ZERO.B16, T2.B16 - VSHL $1, B0.D2, B0.D2 - VEOR T1.B16, B0.B16, B0.B16 - VEOR T2.B16, B0.B16, B0.B16 + doubleTweak(B0, ZERO, POLY, I, T1, T2) VST1 [B0.B16], (TW) RET @@ -48,23 +70,7 @@ gb_alg: LSL $56, I VMOV I, POLY.D[1] - VREV64 B0.B16, B0.B16 - VEXT $8, B0.B16, B0.B16, B0.B16 - - VMOV B0.D[0], I - LSL $63, I - ASR $63, I - VMOV I, T1.D2 - VAND POLY.B16, T1.B16, T1.B16 - - VSHL $63, B0.D2, T2.D2 - VEXT $8, ZERO.B16, T2.B16, T2.B16 - VUSHR $1, B0.D2, B0.D2 - VEOR T1.B16, B0.B16, B0.B16 - VEOR T2.B16, B0.B16, B0.B16 - - VEXT $8, B0.B16, B0.B16, B0.B16 - VREV64 B0.B16, B0.B16 + gbDoubleTweak(B0, ZERO, POLY, I, T1, T2) VST1 [B0.B16], (TW) RET @@ -93,16 +99,7 @@ TEXT ·doubleTweaks(SB),NOSPLIT,$0 loop: VST1.P [B0.B16], 16(R3) - VMOV B0.D[1], I - ASR $63, I - VMOV I, T1.D2 - VAND POLY.B16, T1.B16, T1.B16 - - VUSHR $63, B0.D2, T2.D2 - VEXT $8, T2.B16, ZERO.B16, T2.B16 - VSHL $1, B0.D2, B0.D2 - VEOR T1.B16, B0.B16, B0.B16 - VEOR T2.B16, B0.B16, B0.B16 + doubleTweak(B0, ZERO, POLY, I, T1, T2) ADD $1, R5 CMP R4, R5 @@ -119,23 +116,7 @@ dt_gb_alg: gb_loop: VST1.P [B0.B16], 16(R3) - VREV64 B0.B16, B0.B16 - VEXT $8, B0.B16, B0.B16, B0.B16 - - VMOV B0.D[0], I - LSL $63, I - ASR $63, I - VMOV I, T1.D2 - VAND POLY.B16, T1.B16, T1.B16 - - VSHL $63, B0.D2, T2.D2 - VEXT $8, ZERO.B16, T2.B16, T2.B16 - VUSHR $1, B0.D2, B0.D2 - VEOR T1.B16, B0.B16, B0.B16 - VEOR T2.B16, B0.B16, B0.B16 - - VEXT $8, B0.B16, B0.B16, B0.B16 - VREV64 B0.B16, B0.B16 + gbDoubleTweak(B0, ZERO, POLY, I, T1, T2) ADD $1, R5 CMP R4, R5 diff --git a/cipher/xts_ppc64x.s b/cipher/xts_ppc64x.s index d5e522d..b0ec9eb 100644 --- a/cipher/xts_ppc64x.s +++ b/cipher/xts_ppc64x.s @@ -149,8 +149,8 @@ gb_alg: #endif gbLoop: - STXVD2X B0, (R4) - ADD $16, R4 + STXVD2X B0, (R4) + ADD $16, R4 #ifdef GOARCH_ppc64le VPERM B0, B0, ESPERM, B0
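
Note on the extracted macros: doubleTweak and gbDoubleTweak both multiply the
128-bit XTS tweak by x (i.e. double it) in GF(2^128). doubleTweak uses the
IEEE P1619 convention (128-bit left shift, reduce with 0x87 when the top bit
carries out), while gbDoubleTweak uses the GB/T 17964 convention (byte-reversed
order, 128-bit right shift, reduce with 0xE1 in byte 0 when the low bit carries
out). A minimal pure-Go sketch of the equivalent byte-wise computation follows;
it mirrors the signature of mul2 declared above, but the name mul2Ref is
illustrative only and is not part of this patch.

    package xtsref

    // mul2Ref doubles the 16-byte XTS tweak in place.
    // isGB == false: IEEE P1619 convention, as implemented by doubleTweak.
    // isGB == true:  GB/T 17964 convention, as implemented by gbDoubleTweak.
    func mul2Ref(tweak *[16]byte, isGB bool) {
        var carryIn byte
        if isGB {
            for i := range tweak {
                carryOut := (tweak[i] & 1) << 7 // low bit moves into the next byte's top bit
                tweak[i] = (tweak[i] >> 1) | carryIn
                carryIn = carryOut
            }
            if carryIn != 0 {
                tweak[0] ^= 0xE1 // reduction with gbGcmPoly
            }
        } else {
            for i := range tweak {
                carryOut := tweak[i] >> 7 // top bit moves into the next byte's low bit
                tweak[i] = (tweak[i] << 1) | carryIn
                carryIn = carryOut
            }
            if carryIn != 0 {
                tweak[0] ^= 0x87 // reduction with gcmPoly
            }
        }
    }

For example, with isGB == false, a tweak whose byte 15 is 0x80 and whose other
bytes are zero doubles to a tweak whose byte 0 is 0x87 and whose other bytes
are zero, matching the carry-and-reduce step performed by doubleTweak.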