From 8f45f4efcbcb17d972b38141949f1789a6c26cc2 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Wed, 13 Nov 2024 17:41:09 +0800 Subject: [PATCH] sm3: ppc64x change T handling method #267 --- sm3/sm3blocks_ppc64x.s | 168 +++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 81 deletions(-) diff --git a/sm3/sm3blocks_ppc64x.s b/sm3/sm3blocks_ppc64x.s index 2a622d7..05311fb 100644 --- a/sm3/sm3blocks_ppc64x.s +++ b/sm3/sm3blocks_ppc64x.s @@ -5,7 +5,6 @@ //go:build (ppc64 || ppc64le) && !purego #include "textflag.h" -#include "sm3_const_asm.s" #define a V0 #define e V1 @@ -15,6 +14,10 @@ #define g V5 #define d V6 #define h V7 +#define T0 V8 +#define T1 V9 +#define T2 V10 +#define ONE V11 #define TMP0 V12 #define TMP1 V13 #define TMP2 V14 @@ -25,6 +28,12 @@ // For instruction emulation #define ESPERMW V31 // Endian swapping permute into BE +DATA t_const<>+0x00(SB)/8, $0x79cc451979cc4519 +DATA t_const<>+0x08(SB)/8, $0x79cc451979cc4519 +DATA t_const<>+0x10(SB)/8, $0x9d8a7a879d8a7a87 +DATA t_const<>+0x18(SB)/8, $0x9d8a7a879d8a7a87 +GLOBL t_const<>(SB), RODATA, $32 + #define R_x08 R15 #define R_x10 R16 #define R_x18 R17 @@ -83,20 +92,11 @@ XXPERMDI TMP2, TMP3, $0, T1; \ XXPERMDI TMP2, TMP3, $3, T3 -// Load constant T, How to simlify it? -// Solution 1: big constant table like sha256block_ppc64x.s -// Solution 2: 2 constant T, rotate shift left one bit every time -// Solution 1's performance is better but it uses more memory. -#define LOAD_T(index, const, target) \ - MOVD $const, R_TMP \ - MTVSRWZ R_TMP, target \ - VSPLTW $1, target, target - -#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \ +#define ROUND_00_11(index, T, a, b, c, d, e, f, g, h) \ PROLD(a, TMP0, 12) \ VOR TMP0, TMP0, TMP1 \ - LOAD_T(index, const, TMP2) \ - VADDUWM TMP2, TMP0, TMP0 \ + VADDUWM T, TMP0, TMP0 \ + VRLW T, ONE, T \ VADDUWM e, TMP0, TMP0 \ PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 VXOR TMP2, TMP1, TMP0 \ // TMP0 = SS2 @@ -141,16 +141,16 @@ STXVW4X TMP1, (wordPtr) \ ADD $16, wordPtr \ -#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \ +#define ROUND_12_15(index, T, a, b, c, d, e, f, g, h) \ MESSAGE_SCHEDULE(index) \ - ROUND_00_11(index, const, a, b, c, d, e, f, g, h) + ROUND_00_11(index, T, a, b, c, d, e, f, g, h) -#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \ +#define ROUND_16_63(index, T, a, b, c, d, e, f, g, h) \ MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, Pls do not use it PROLD(a, TMP0, 12) \ VOR TMP0, TMP0, TMP4 \ - LOAD_T(index, const, TMP2) \ - VADDUWM TMP2, TMP0, TMP0 \ + VADDUWM T, TMP0, TMP0 \ + VRLW T, ONE, T \ VADDUWM e, TMP0, TMP0 \ PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 VXOR TMP2, TMP4, TMP0 \ // TMP0 = SS2 @@ -188,6 +188,10 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0 MOVD $·flip_mask(SB), R4 LVX (R4), ESPERMW #endif + VSPLTISW $1, ONE + MOVD $t_const<>(SB), R4 + LXVD2X (R0)(R4), T0 + LXVD2X (R_x10)(R4), T1 #define digPtr R11 #define srcPtrPtr R5 #define statePtr R4 @@ -247,72 +251,74 @@ loop: prepare4Words prepare4Words - ROUND_00_11(0, T0, a, b, c, d, e, f, g, h) - ROUND_00_11(1, T1, h, a, b, c, d, e, f, g) + XXLOR T0, T0, T2 + ROUND_00_11(0, T2, a, b, c, d, e, f, g, h) + ROUND_00_11(1, T2, h, a, b, c, d, e, f, g) ROUND_00_11(2, T2, g, h, a, b, c, d, e, f) - ROUND_00_11(3, T3, f, g, h, a, b, c, d, e) - ROUND_00_11(4, T4, e, f, g, h, a, b, c, d) - ROUND_00_11(5, T5, d, e, f, g, h, a, b, c) - ROUND_00_11(6, T6, c, d, e, f, g, h, a, b) - ROUND_00_11(7, T7, b, c, d, e, f, g, h, a) - ROUND_00_11(8, T8, a, b, c, d, e, f, g, h) - ROUND_00_11(9, T9, h, a, b, c, d, e, f, g) - ROUND_00_11(10, T10, g, h, a, b, c, d, e, f) - ROUND_00_11(11, T11, f, g, h, a, b, c, d, e) + ROUND_00_11(3, T2, f, g, h, a, b, c, d, e) + ROUND_00_11(4, T2, e, f, g, h, a, b, c, d) + ROUND_00_11(5, T2, d, e, f, g, h, a, b, c) + ROUND_00_11(6, T2, c, d, e, f, g, h, a, b) + ROUND_00_11(7, T2, b, c, d, e, f, g, h, a) + ROUND_00_11(8, T2, a, b, c, d, e, f, g, h) + ROUND_00_11(9, T2, h, a, b, c, d, e, f, g) + ROUND_00_11(10, T2, g, h, a, b, c, d, e, f) + ROUND_00_11(11, T2, f, g, h, a, b, c, d, e) - ROUND_12_15(12, T12, e, f, g, h, a, b, c, d) - ROUND_12_15(13, T13, d, e, f, g, h, a, b, c) - ROUND_12_15(14, T14, c, d, e, f, g, h, a, b) - ROUND_12_15(15, T15, b, c, d, e, f, g, h, a) + ROUND_12_15(12, T2, e, f, g, h, a, b, c, d) + ROUND_12_15(13, T2, d, e, f, g, h, a, b, c) + ROUND_12_15(14, T2, c, d, e, f, g, h, a, b) + ROUND_12_15(15, T2, b, c, d, e, f, g, h, a) - ROUND_16_63(16, T16, a, b, c, d, e, f, g, h) - ROUND_16_63(17, T17, h, a, b, c, d, e, f, g) - ROUND_16_63(18, T18, g, h, a, b, c, d, e, f) - ROUND_16_63(19, T19, f, g, h, a, b, c, d, e) - ROUND_16_63(20, T20, e, f, g, h, a, b, c, d) - ROUND_16_63(21, T21, d, e, f, g, h, a, b, c) - ROUND_16_63(22, T22, c, d, e, f, g, h, a, b) - ROUND_16_63(23, T23, b, c, d, e, f, g, h, a) - ROUND_16_63(24, T24, a, b, c, d, e, f, g, h) - ROUND_16_63(25, T25, h, a, b, c, d, e, f, g) - ROUND_16_63(26, T26, g, h, a, b, c, d, e, f) - ROUND_16_63(27, T27, f, g, h, a, b, c, d, e) - ROUND_16_63(28, T28, e, f, g, h, a, b, c, d) - ROUND_16_63(29, T29, d, e, f, g, h, a, b, c) - ROUND_16_63(30, T30, c, d, e, f, g, h, a, b) - ROUND_16_63(31, T31, b, c, d, e, f, g, h, a) - ROUND_16_63(32, T32, a, b, c, d, e, f, g, h) - ROUND_16_63(33, T33, h, a, b, c, d, e, f, g) - ROUND_16_63(34, T34, g, h, a, b, c, d, e, f) - ROUND_16_63(35, T35, f, g, h, a, b, c, d, e) - ROUND_16_63(36, T36, e, f, g, h, a, b, c, d) - ROUND_16_63(37, T37, d, e, f, g, h, a, b, c) - ROUND_16_63(38, T38, c, d, e, f, g, h, a, b) - ROUND_16_63(39, T39, b, c, d, e, f, g, h, a) - ROUND_16_63(40, T40, a, b, c, d, e, f, g, h) - ROUND_16_63(41, T41, h, a, b, c, d, e, f, g) - ROUND_16_63(42, T42, g, h, a, b, c, d, e, f) - ROUND_16_63(43, T43, f, g, h, a, b, c, d, e) - ROUND_16_63(44, T44, e, f, g, h, a, b, c, d) - ROUND_16_63(45, T45, d, e, f, g, h, a, b, c) - ROUND_16_63(46, T46, c, d, e, f, g, h, a, b) - ROUND_16_63(47, T47, b, c, d, e, f, g, h, a) - ROUND_16_63(48, T16, a, b, c, d, e, f, g, h) - ROUND_16_63(49, T17, h, a, b, c, d, e, f, g) - ROUND_16_63(50, T18, g, h, a, b, c, d, e, f) - ROUND_16_63(51, T19, f, g, h, a, b, c, d, e) - ROUND_16_63(52, T20, e, f, g, h, a, b, c, d) - ROUND_16_63(53, T21, d, e, f, g, h, a, b, c) - ROUND_16_63(54, T22, c, d, e, f, g, h, a, b) - ROUND_16_63(55, T23, b, c, d, e, f, g, h, a) - ROUND_16_63(56, T24, a, b, c, d, e, f, g, h) - ROUND_16_63(57, T25, h, a, b, c, d, e, f, g) - ROUND_16_63(58, T26, g, h, a, b, c, d, e, f) - ROUND_16_63(59, T27, f, g, h, a, b, c, d, e) - ROUND_16_63(60, T28, e, f, g, h, a, b, c, d) - ROUND_16_63(61, T29, d, e, f, g, h, a, b, c) - ROUND_16_63(62, T30, c, d, e, f, g, h, a, b) - ROUND_16_63(63, T31, b, c, d, e, f, g, h, a) + XXLOR T1, T1, T2 + ROUND_16_63(16, T2, a, b, c, d, e, f, g, h) + ROUND_16_63(17, T2, h, a, b, c, d, e, f, g) + ROUND_16_63(18, T2, g, h, a, b, c, d, e, f) + ROUND_16_63(19, T2, f, g, h, a, b, c, d, e) + ROUND_16_63(20, T2, e, f, g, h, a, b, c, d) + ROUND_16_63(21, T2, d, e, f, g, h, a, b, c) + ROUND_16_63(22, T2, c, d, e, f, g, h, a, b) + ROUND_16_63(23, T2, b, c, d, e, f, g, h, a) + ROUND_16_63(24, T2, a, b, c, d, e, f, g, h) + ROUND_16_63(25, T2, h, a, b, c, d, e, f, g) + ROUND_16_63(26, T2, g, h, a, b, c, d, e, f) + ROUND_16_63(27, T2, f, g, h, a, b, c, d, e) + ROUND_16_63(28, T2, e, f, g, h, a, b, c, d) + ROUND_16_63(29, T2, d, e, f, g, h, a, b, c) + ROUND_16_63(30, T2, c, d, e, f, g, h, a, b) + ROUND_16_63(31, T2, b, c, d, e, f, g, h, a) + ROUND_16_63(32, T2, a, b, c, d, e, f, g, h) + ROUND_16_63(33, T2, h, a, b, c, d, e, f, g) + ROUND_16_63(34, T2, g, h, a, b, c, d, e, f) + ROUND_16_63(35, T2, f, g, h, a, b, c, d, e) + ROUND_16_63(36, T2, e, f, g, h, a, b, c, d) + ROUND_16_63(37, T2, d, e, f, g, h, a, b, c) + ROUND_16_63(38, T2, c, d, e, f, g, h, a, b) + ROUND_16_63(39, T2, b, c, d, e, f, g, h, a) + ROUND_16_63(40, T2, a, b, c, d, e, f, g, h) + ROUND_16_63(41, T2, h, a, b, c, d, e, f, g) + ROUND_16_63(42, T2, g, h, a, b, c, d, e, f) + ROUND_16_63(43, T2, f, g, h, a, b, c, d, e) + ROUND_16_63(44, T2, e, f, g, h, a, b, c, d) + ROUND_16_63(45, T2, d, e, f, g, h, a, b, c) + ROUND_16_63(46, T2, c, d, e, f, g, h, a, b) + ROUND_16_63(47, T2, b, c, d, e, f, g, h, a) + ROUND_16_63(48, T2, a, b, c, d, e, f, g, h) + ROUND_16_63(49, T2, h, a, b, c, d, e, f, g) + ROUND_16_63(50, T2, g, h, a, b, c, d, e, f) + ROUND_16_63(51, T2, f, g, h, a, b, c, d, e) + ROUND_16_63(52, T2, e, f, g, h, a, b, c, d) + ROUND_16_63(53, T2, d, e, f, g, h, a, b, c) + ROUND_16_63(54, T2, c, d, e, f, g, h, a, b) + ROUND_16_63(55, T2, b, c, d, e, f, g, h, a) + ROUND_16_63(56, T2, a, b, c, d, e, f, g, h) + ROUND_16_63(57, T2, h, a, b, c, d, e, f, g) + ROUND_16_63(58, T2, g, h, a, b, c, d, e, f) + ROUND_16_63(59, T2, f, g, h, a, b, c, d, e) + ROUND_16_63(60, T2, e, f, g, h, a, b, c, d) + ROUND_16_63(61, T2, d, e, f, g, h, a, b, c) + ROUND_16_63(62, T2, c, d, e, f, g, h, a, b) + ROUND_16_63(63, T2, b, c, d, e, f, g, h, a) XXLXOR V0, VS24, V0 XXLXOR V1, VS25, V1