From ec3aff95f93923def18a375fa5781595c9fb43bf Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Mon, 9 Sep 2024 15:47:42 +0800
Subject: [PATCH] sm3: ppc64x block init #245

---
 sm3/sm3block_generic.go |   6 +-
 sm3/sm3block_ppc64x.go  |  15 ++
 sm3/sm3block_ppc64x.s   | 409 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 429 insertions(+), 1 deletion(-)
 create mode 100644 sm3/sm3block_ppc64x.go
 create mode 100644 sm3/sm3block_ppc64x.s

diff --git a/sm3/sm3block_generic.go b/sm3/sm3block_generic.go
index 4da5439..19d8b66 100644
--- a/sm3/sm3block_generic.go
+++ b/sm3/sm3block_generic.go
@@ -1,4 +1,8 @@
-//go:build purego || !(amd64 || arm64)
+// Copyright 2021 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+//go:build purego || !(amd64 || arm64 || ppc64 || ppc64le)
 
 package sm3
 
diff --git a/sm3/sm3block_ppc64x.go b/sm3/sm3block_ppc64x.go
new file mode 100644
index 0000000..7d6e73e
--- /dev/null
+++ b/sm3/sm3block_ppc64x.go
@@ -0,0 +1,15 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+//go:build (ppc64 || ppc64le) && !purego
+
+package sm3
+
+//go:noescape
+func blockASM(dig *digest, p []byte, buffer *uint32)
+
+func block(dig *digest, p []byte) {
+	var buffer [8]uint32 // 32-byte scratch buffer, so the asm code avoids any stack usage
+	blockASM(dig, p, &buffer[0])
+}
diff --git a/sm3/sm3block_ppc64x.s b/sm3/sm3block_ppc64x.s
new file mode 100644
index 0000000..376d96a
--- /dev/null
+++ b/sm3/sm3block_ppc64x.s
@@ -0,0 +1,409 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
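+//
+// The message schedule is computed four words at a time in VSX vector
+// registers, while the 64 compression rounds run on scalar registers.
+// Each group of four message words W and the matching W' = W xor W[+4]
+// values is staged through the caller-supplied 32-byte buffer, so the
+// routine needs no stack frame of its own.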
+
+//go:build (ppc64 || ppc64le) && !purego
+
+#include "textflag.h"
+#include "sm3_const_asm.s"
+
+#ifdef GOARCH_ppc64le
+#define NEEDS_PERMW
+
+#define PPC64X_LXVW4X(RA,RB,VT) \
+	LXVW4X	(RA+RB), VT \
+	VPERM	VT, VT, ESPERMW, VT
+
+#else
+#define PPC64X_LXVW4X(RA,RB,VT) LXVW4X (RA+RB), VT
+#endif // defined(GOARCH_ppc64le)
+
+#define a R7
+#define b R8
+#define c R9
+#define d R10
+#define e R11
+#define f R12
+#define g R14
+#define h R15
+
+#define CTX R3
+#define INP R4
+#define LEN R5
+#define BUFFER R16
+
+#define R_x000 R0
+#define R_x010 R17
+#define R_x020 R18
+#define R_x030 R19
+
+#define y0 R20
+#define y1 R21
+#define y2 R22
+#define TEMP R6
+
+#define XWORD0 V0
+#define XWORD1 V1
+#define XWORD2 V2
+#define XWORD3 V3
+
+#define XTMP0 V4
+#define XTMP1 V5
+#define XTMP2 V6
+#define XTMP3 V7
+#define XTMP4 V8
+
+#define XFER V9
+
+// Endian-swapping permute into BE (ppc64le only)
+#define ESPERMW V31
+
+// shuffle byte order from LE to BE
+DATA ·flip_mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
+DATA ·flip_mask+0x08(SB)/8, $0x0302010007060504
+
+GLOBL ·flip_mask(SB), RODATA, $16
+
+// ss1 = ((a <<< 12) + e + T) <<< 7, ss2 = ss1 xor (a <<< 12)
+#define SS12(a, e, const, ss1, ss2) \
+	ROTLW $12, a, ss2; \ // ss2 = a <<< 12
+	ADD $const, e, ss1; \
+	ADD ss2, ss1; \ // ss1 = (a <<< 12) + e + T
+	ROTLW $7, ss1; \ // ss1 = SS1
+	XOR ss1, ss2
+
+// out = P0(tt2) = tt2 xor (tt2 <<< 9) xor (tt2 <<< 17)
+#define P0(tt2, tmp, out) \
+	ROTLW $9, tt2, tmp; \
+	ROTLW $17, tt2, out; \
+	XOR tmp, out; \
+	XOR tt2, out
+
+// Load W[idx] from the buffer
+#define LOAD_WORD1(idx, dst) \
+	MOVWZ (idx*4)(BUFFER), dst
+
+// Load W'[idx] from the buffer
+#define LOAD_WORD2(idx, dst) \
+	MOVWZ (idx*4 + 16)(BUFFER), dst
+
+// For rounds [0 - 16)
+#define DO_ROUND_N_0(idx, const, a, b, c, d, e, f, g, h) \
+	; \ // ############################# RND N + idx ############################//
+	SS12(a, e, const, y2, y0); \
+	LOAD_WORD1(idx, y1); \
+	ADD y1, y2; \ // y2 = SS1 + W
+	ADD h, y2; \ // y2 = h + SS1 + W
+	LOAD_WORD2(idx, y1); \
+	ADD y1, y0; \ // y0 = SS2 + W'
+	ADD d, y0; \ // y0 = d + SS2 + W'
+	; \
+	XOR a, b, h; \
+	XOR c, h; \ // h = FF(a, b, c) = a xor b xor c
+	ADD y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
+	; \
+	XOR e, f, y1; \
+	XOR g, y1; \ // y1 = GG(e, f, g) = e xor f xor g
+	ADD y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
+	; \
+	ROTLW $9, b; \
+	ROTLW $19, f; \
+	; \
+	P0(y2, y0, d)
+
+// For rounds [16 - 64)
+#define DO_ROUND_N_1(idx, const, a, b, c, d, e, f, g, h) \
+	; \ // ############################# RND N + idx ############################//
+	SS12(a, e, const, y2, y0); \
+	LOAD_WORD1(idx, y1); \
+	ADD y1, y2; \ // y2 = SS1 + W
+	ADD h, y2; \ // y2 = h + SS1 + W
+	LOAD_WORD2(idx, y1); \
+	ADD y1, y0; \ // y0 = SS2 + W'
+	ADD d, y0; \ // y0 = d + SS2 + W'
+	; \
+	OR a, b, y1; \
+	AND a, b, h; \
+	AND c, y1; \
+	OR y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
+	ADD y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
+	; \
+	XOR f, g, y1; \
+	AND e, y1; \
+	XOR g, y1; \ // y1 = GG2(e, f, g) = ((f xor g) AND e) xor g
+	ADD y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
+	; \
+	ROTLW $9, b; \
+	ROTLW $19, f; \
+	; \
+	P0(y2, y0, d)
+
+// r = s <<< n, applied to each 32-bit lane.
+// Due to VSPLTISW's limitation, n MUST be in [0, 15]; for n > 15 the
+// macro has to be applied multiple times. VSPLTISW takes a 5-bit
+// immediate value as its operand, and there is no vector rotate
+// instruction that takes an immediate rotate count, so the count is
+// first splatted into a vector register for VRLW.
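+//
+// Usage sketch: PROLD(XTMP0, XTMP1, 7) sets XTMP1 = XTMP0 <<< 7 in every
+// 32-bit lane. A 23-bit rotate is composed as PROLD(x, t, 15) followed by
+// PROLD(t, r, 8), as in MESSAGE_SCHEDULE below. Note that PROLD clobbers
+// XFER.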
+#define PROLD(s, r, n) \
+	VSPLTISW $n, XFER \
+	VRLW s, XFER, r
+
+#define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \
+	VSLDOI $12, XWORD0, XWORD1, XTMP0; \ // XTMP0 = W[-13] = {w3, w4, w5, w6}
+	PROLD(XTMP0, XTMP1, 7); \ // XTMP1 = W[-13] rol 7
+	VSLDOI $8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w10, w11, w12, w13}
+	VXOR XTMP0, XTMP1, XTMP0; \ // XTMP0 = W[-6] xor (W[-13] rol 7)
+	; \
+	VSLDOI $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w7, w8, w9, w10}
+	VXOR XTMP1, XWORD0, XTMP1; \ // XTMP1 = W[-9] xor W[-16]
+	VSLDOI $4, XWORD3, XWORD2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w8}, last lane unused
+	PROLD(XTMP3, XTMP2, 15); \ // XTMP2 = W[-3] rol 15
+	VXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] xor W[-16] xor (W[-3] rol 15) {ABxx}
+	; \
+	PROLD(XTMP2, XTMP4, 15); \ // XTMP4 = XTMP2 rol 15 {ABxx}
+	PROLD(XTMP4, XTMP3, 8); \ // XTMP3 = XTMP2 rol 23 {ABxx}
+	VXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 xor (XTMP2 rol 15 {ABxx})
+	VXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 xor (XTMP2 rol 15 {ABxx}) xor (XTMP2 rol 23 {ABxx})
+	; \ // First 2 words message schedule result
+	VXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {w[0], w[1], ..., ...}
+	; \
+	VSLDOI $4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w0}
+	PROLD(XTMP3, XTMP4, 15); \ // XTMP4 = W[-3] rol 15
+	VXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] xor W[-16] xor (W[-3] rol 15) {ABCD}
+	; \
+	PROLD(XTMP4, XTMP3, 15); \ // XTMP3 = XTMP4 rol 15 {ABCD}
+	PROLD(XTMP3, XTMP1, 8); \ // XTMP1 = XTMP4 rol 23 {ABCD}
+	VXOR XTMP4, XTMP3, XTMP3; \ // XTMP3 = XTMP4 xor (XTMP4 rol 15 {ABCD})
+	VXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 xor (XTMP4 rol 15 {ABCD}) xor (XTMP4 rol 23 {ABCD})
+	; \ // 4 words message schedule result
+	VXOR XTMP1, XTMP0, XWORD0 // XWORD0 = {w[0], w[1], w[2], w[3]}
+
+// func blockASM(dig *digest, p []byte, buffer *uint32)
+TEXT ·blockASM(SB), NOSPLIT, $0-40
+#ifdef NEEDS_PERMW
+	MOVD $·flip_mask(SB), TEMP
+	LVX (TEMP), ESPERMW
+	ADD $0x10, TEMP
+#endif
+
+	MOVD dig+0(FP), CTX
+	MOVD p_base+8(FP), INP
+	MOVD p_len+16(FP), LEN
+	MOVD buffer+32(FP), BUFFER
+
+	// Block count; we assume p_len >= 64 and a multiple of 64
+	SRD $6, LEN
+	MOVD LEN, CTR
+
+	MOVD $16, R_x010
+	MOVD $32, R_x020
+	MOVD $48, R_x030
+
+	// Load initial digest
+	MOVWZ 0(CTX), a
+	MOVWZ 4(CTX), b
+	MOVWZ 8(CTX), c
+	MOVWZ 12(CTX), d
+	MOVWZ 16(CTX), e
+	MOVWZ 20(CTX), f
+	MOVWZ 24(CTX), g
+	MOVWZ 28(CTX), h
+
+loop:
+	PPC64X_LXVW4X(INP, R_x000, XWORD0)
+	PPC64X_LXVW4X(INP, R_x010, XWORD1)
+	PPC64X_LXVW4X(INP, R_x020, XWORD2)
+	PPC64X_LXVW4X(INP, R_x030, XWORD3)
+
+	ADD $64, INP
+
+schedule_compress: // for w0 - w47
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD0, (BUFFER)(R_x000)
+	VXOR XWORD0, XWORD1, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_0(0, T0, a, b, c, d, e, f, g, h)
+	DO_ROUND_N_0(1, T1, h, a, b, c, d, e, f, g)
+	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
+	DO_ROUND_N_0(2, T2, g, h, a, b, c, d, e, f)
+	DO_ROUND_N_0(3, T3, f, g, h, a, b, c, d, e)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD1, (BUFFER)(R_x000)
+	VXOR XWORD1, XWORD2, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_0(0, T4, e, f, g, h, a, b, c, d)
+	DO_ROUND_N_0(1, T5, d, e, f, g, h, a, b, c)
+	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
+	DO_ROUND_N_0(2, T6, c, d, e, f, g, h, a, b)
+	DO_ROUND_N_0(3, T7, b, c, d, e, f, g, h, a)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD2, (BUFFER)(R_x000)
+	VXOR XWORD2, XWORD3, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_0(0, T8, a, b, c, d, e, f, g, h)
+	DO_ROUND_N_0(1, T9, h, a, b, c, d, e, f, g)
+	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
+	DO_ROUND_N_0(2, T10, g, h, a, b, c, d, e, f)
+	DO_ROUND_N_0(3, T11, f, g, h, a, b, c, d, e)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD3, (BUFFER)(R_x000)
+	VXOR XWORD3, XWORD0, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_0(0, T12, e, f, g, h, a, b, c, d)
+	DO_ROUND_N_0(1, T13, d, e, f, g, h, a, b, c)
+	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
+	DO_ROUND_N_0(2, T14, c, d, e, f, g, h, a, b)
+	DO_ROUND_N_0(3, T15, b, c, d, e, f, g, h, a)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD0, (BUFFER)(R_x000)
+	VXOR XWORD0, XWORD1, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T16, a, b, c, d, e, f, g, h)
+	DO_ROUND_N_1(1, T17, h, a, b, c, d, e, f, g)
+	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
+	DO_ROUND_N_1(2, T18, g, h, a, b, c, d, e, f)
+	DO_ROUND_N_1(3, T19, f, g, h, a, b, c, d, e)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD1, (BUFFER)(R_x000)
+	VXOR XWORD1, XWORD2, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T20, e, f, g, h, a, b, c, d)
+	DO_ROUND_N_1(1, T21, d, e, f, g, h, a, b, c)
+	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
+	DO_ROUND_N_1(2, T22, c, d, e, f, g, h, a, b)
+	DO_ROUND_N_1(3, T23, b, c, d, e, f, g, h, a)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD2, (BUFFER)(R_x000)
+	VXOR XWORD2, XWORD3, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T24, a, b, c, d, e, f, g, h)
+	DO_ROUND_N_1(1, T25, h, a, b, c, d, e, f, g)
+	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
+	DO_ROUND_N_1(2, T26, g, h, a, b, c, d, e, f)
+	DO_ROUND_N_1(3, T27, f, g, h, a, b, c, d, e)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD3, (BUFFER)(R_x000)
+	VXOR XWORD3, XWORD0, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T28, e, f, g, h, a, b, c, d)
+	DO_ROUND_N_1(1, T29, d, e, f, g, h, a, b, c)
+	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
+	DO_ROUND_N_1(2, T30, c, d, e, f, g, h, a, b)
+	DO_ROUND_N_1(3, T31, b, c, d, e, f, g, h, a)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD0, (BUFFER)(R_x000)
+	VXOR XWORD0, XWORD1, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T32, a, b, c, d, e, f, g, h)
+	DO_ROUND_N_1(1, T33, h, a, b, c, d, e, f, g)
+	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
+	DO_ROUND_N_1(2, T34, g, h, a, b, c, d, e, f)
+	DO_ROUND_N_1(3, T35, f, g, h, a, b, c, d, e)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD1, (BUFFER)(R_x000)
+	VXOR XWORD1, XWORD2, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T36, e, f, g, h, a, b, c, d)
+	DO_ROUND_N_1(1, T37, d, e, f, g, h, a, b, c)
+	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
+	DO_ROUND_N_1(2, T38, c, d, e, f, g, h, a, b)
+	DO_ROUND_N_1(3, T39, b, c, d, e, f, g, h, a)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD2, (BUFFER)(R_x000)
+	VXOR XWORD2, XWORD3, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T40, a, b, c, d, e, f, g, h)
+	DO_ROUND_N_1(1, T41, h, a, b, c, d, e, f, g)
+	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
+	DO_ROUND_N_1(2, T42, g, h, a, b, c, d, e, f)
+	DO_ROUND_N_1(3, T43, f, g, h, a, b, c, d, e)
+
+	// Do 4 rounds and scheduling
+	STXVW4X XWORD3, (BUFFER)(R_x000)
+	VXOR XWORD3, XWORD0, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T44, e, f, g, h, a, b, c, d)
+	DO_ROUND_N_1(1, T45, d, e, f, g, h, a, b, c)
+	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
+	DO_ROUND_N_1(2, T46, c, d, e, f, g, h, a, b)
+	DO_ROUND_N_1(3, T47, b, c, d, e, f, g, h, a)
+
+	// w48 - w63 processed with only one more 4-word schedule (last 16 rounds)
+	// Do 4 rounds
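+	// In these rounds W'[j] = W[j] xor W[j+4] still reaches past w63:
+	// rounds 60 - 63 need w64 - w67, which is why a single
+	// MESSAGE_SCHEDULE call remains before the T60 block below.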
+
+	STXVW4X XWORD0, (BUFFER)(R_x000)
+	VXOR XWORD0, XWORD1, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T48, a, b, c, d, e, f, g, h)
+	DO_ROUND_N_1(1, T49, h, a, b, c, d, e, f, g)
+	DO_ROUND_N_1(2, T50, g, h, a, b, c, d, e, f)
+	DO_ROUND_N_1(3, T51, f, g, h, a, b, c, d, e)
+
+	STXVW4X XWORD1, (BUFFER)(R_x000)
+	VXOR XWORD1, XWORD2, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T52, e, f, g, h, a, b, c, d)
+	DO_ROUND_N_1(1, T53, d, e, f, g, h, a, b, c)
+	DO_ROUND_N_1(2, T54, c, d, e, f, g, h, a, b)
+	DO_ROUND_N_1(3, T55, b, c, d, e, f, g, h, a)
+
+	STXVW4X XWORD2, (BUFFER)(R_x000)
+	VXOR XWORD2, XWORD3, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	// Produce w64 - w67 in XWORD0 for the final W' values
+	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
+	DO_ROUND_N_1(0, T56, a, b, c, d, e, f, g, h)
+	DO_ROUND_N_1(1, T57, h, a, b, c, d, e, f, g)
+	DO_ROUND_N_1(2, T58, g, h, a, b, c, d, e, f)
+	DO_ROUND_N_1(3, T59, f, g, h, a, b, c, d, e)
+
+	STXVW4X XWORD3, (BUFFER)(R_x000)
+	VXOR XWORD3, XWORD0, XFER
+	STXVW4X XFER, (BUFFER)(R_x010)
+	DO_ROUND_N_1(0, T60, e, f, g, h, a, b, c, d)
+	DO_ROUND_N_1(1, T61, d, e, f, g, h, a, b, c)
+	DO_ROUND_N_1(2, T62, c, d, e, f, g, h, a, b)
+	DO_ROUND_N_1(3, T63, b, c, d, e, f, g, h, a)
+
+	// Update the digest: V(i+1) = ABCDEFGH xor V(i)
+	MOVWZ 0(CTX), TEMP
+	XOR TEMP, a
+	MOVWZ a, 0(CTX)
+
+	MOVWZ 4(CTX), TEMP
+	XOR TEMP, b
+	MOVWZ b, 4(CTX)
+
+	MOVWZ 8(CTX), TEMP
+	XOR TEMP, c
+	MOVWZ c, 8(CTX)
+
+	MOVWZ 12(CTX), TEMP
+	XOR TEMP, d
+	MOVWZ d, 12(CTX)
+
+	MOVWZ 16(CTX), TEMP
+	XOR TEMP, e
+	MOVWZ e, 16(CTX)
+
+	MOVWZ 20(CTX), TEMP
+	XOR TEMP, f
+	MOVWZ f, 20(CTX)
+
+	MOVWZ 24(CTX), TEMP
+	XOR TEMP, g
+	MOVWZ g, 24(CTX)
+
+	MOVWZ 28(CTX), TEMP
+	XOR TEMP, h
+	MOVWZ h, 28(CTX)
+
+	BDNZ loop
+
+end:
+	RET
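
Note: as a cross-check for the macros above, here is a minimal pure-Go sketch
of the round function that DO_ROUND_N_1 implements for rounds 16 - 63, written
in functional form. The helper names and the example values in main are
illustrative assumptions, not part of the patch.

	package main

	import (
		"fmt"
		"math/bits"
	)

	// p0 is the SM3 permutation P0(x) = x xor (x <<< 9) xor (x <<< 17).
	func p0(x uint32) uint32 {
		return x ^ bits.RotateLeft32(x, 9) ^ bits.RotateLeft32(x, 17)
	}

	// ff and gg are the SM3 boolean functions for rounds 16 - 63.
	func ff(a, b, c uint32) uint32 { return (a & b) | (a & c) | (b & c) }
	func gg(e, f, g uint32) uint32 { return ((f ^ g) & e) ^ g }

	// round performs one round j in [16, 64): w = W[j], w1 = W'[j] = W[j] xor
	// W[j+4], and t = T[j] rotated left by j mod 32 (the precomputed Tj
	// constants used by the assembly).
	func round(a, b, c, d, e, f, g, h, w, w1, t uint32) (uint32, uint32, uint32, uint32, uint32, uint32, uint32, uint32) {
		a12 := bits.RotateLeft32(a, 12)
		ss1 := bits.RotateLeft32(a12+e+t, 7)
		ss2 := ss1 ^ a12
		tt1 := ff(a, b, c) + d + ss2 + w1 // becomes the new a
		tt2 := gg(e, f, g) + h + ss1 + w  // P0(tt2) becomes the new e
		return tt1, a, bits.RotateLeft32(b, 9), c, p0(tt2), e, bits.RotateLeft32(f, 19), g
	}

	func main() {
		// Arbitrary demonstration values: the SM3 IV as state, made-up words.
		a, b, c, d, e, f, g, h := round(
			0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
			0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e,
			0x61626364, 0x61626364^0x65666768, bits.RotateLeft32(0x7a879d8a, 16))
		fmt.Printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", a, b, c, d, e, f, g, h)
	}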