diff --git a/sm3/sm3blocks_ppc64x.s b/sm3/sm3blocks_ppc64x.s
index b0854cc..c89bc1e 100644
--- a/sm3/sm3blocks_ppc64x.s
+++ b/sm3/sm3blocks_ppc64x.s
@@ -20,7 +20,9 @@
 DATA ·mask+0x30(SB)/8, $0x0001020304050607
 DATA ·mask+0x38(SB)/8, $0x1011121314151617
 DATA ·mask+0x40(SB)/8, $0x08090a0b0c0d0e0f
 DATA ·mask+0x48(SB)/8, $0x18191a1b1c1d1e1f
-GLOBL ·mask(SB), RODATA, $80
+DATA ·mask+0x50(SB)/8, $0x0b0a09080f0e0d0c // Permute for vector doubleword endian swap
+DATA ·mask+0x58(SB)/8, $0x0302010007060504
+GLOBL ·mask(SB), RODATA, $96
 #ifdef GOARCH_ppc64le
 #define NEEDS_ESPERM
@@ -72,23 +74,15 @@ TEXT ·transposeMatrix(SB),NOSPLIT,$0
 	MOVD (R0)(R3), R4
 	LXVW4X (R0)(R4), V0
 	LXVW4X (R6)(R4), V4
-	//LOADWORDS(R4, R0, V0)
-	//LOADWORDS(R4, R6, V4)
 	MOVD (R5)(R3), R4
 	LXVW4X (R0)(R4), V1
 	LXVW4X (R6)(R4), V5
-	//LOADWORDS(R4, R0, V1)
-	//LOADWORDS(R4, R6, V5)
 	MOVD (R6)(R3), R4
 	LXVW4X (R0)(R4), V2
 	LXVW4X (R6)(R4), V6
-	//LOADWORDS(R4, R0, V2)
-	//LOADWORDS(R4, R6, V6)
 	MOVD (R7)(R3), R4
 	LXVW4X (R0)(R4), V3
 	LXVW4X (R6)(R4), V7
-	//LOADWORDS(R4, R0, V3)
-	//LOADWORDS(R4, R6, V7)
 
 	TRANSPOSE_MATRIX(V0, V1, V2, V3, V8, V9, V10, V11, V12, V13, V14, V15)
 
@@ -97,60 +91,68 @@ TEXT ·transposeMatrix(SB),NOSPLIT,$0
 	MOVD (R0)(R3), R4
 	STXVW4X V0, (R0)(R4)
 	STXVW4X V4, (R6)(R4)
-	//STOREWORDS(V0, R4, R0)
-	//STOREWORDS(V4, R4, R6)
 	MOVD (R5)(R3), R4
 	STXVW4X V1, (R0)(R4)
 	STXVW4X V5, (R6)(R4)
-	//STOREWORDS(V1, R4, R0)
-	//STOREWORDS(V5, R4, R6)
 	MOVD (R6)(R3), R4
 	STXVW4X V2, (R0)(R4)
 	STXVW4X V6, (R6)(R4)
-	//STOREWORDS(V2, R4, R0)
-	//STOREWORDS(V6, R4, R6)
 	MOVD (R7)(R3), R4
 	STXVW4X V3, (R0)(R4)
 	STXVW4X V7, (R6)(R4)
-	//STOREWORDS(V3, R4, R0)
-	//STOREWORDS(V7, R4, R6)
 	RET
 
+#ifdef GOARCH_ppc64le
+#define PPC64X_STXVD2X(VS,RA,RB) \
+	VPERM VS, VS, ESPERMW, TMP2 \
+	STXVD2X TMP2, (RA+RB)
+
+#else
+#define PPC64X_STXVD2X(VS,RA,RB) STXVD2X VS, (RA+RB)
+#endif // defined(GOARCH_ppc64le)
+
 // func copyResultsBy4(dig *uint32, dst *byte)
 TEXT ·copyResultsBy4(SB),NOSPLIT,$0
 	MOVD dig+0(FP), R3
 	MOVD dst+8(FP), R4
-	
+
+#ifdef NEEDS_ESPERM
+	MOVD $·mask+0x50(SB), R5
+	LVX (R5), ESPERMW
+#endif
+
 	LXVD2X (R0)(R3), V0
-	STXVD2X V0, (R0)(R4)
+	PPC64X_STXVD2X(V0, R0, R4)
 
 	MOVD $16, R5
 	LXVD2X (R5)(R3), V0
-	STXVD2X V0, (R5)(R4)
+	PPC64X_STXVD2X(V0, R5, R4)
 
 	ADD $16, R5
 	LXVD2X (R5)(R3), V0
-	STXVD2X V0, (R5)(R4)
+	PPC64X_STXVD2X(V0, R5, R4)
 
 	ADD $16, R5
 	LXVD2X (R5)(R3), V0
-	STXVD2X V0, (R5)(R4)
+	PPC64X_STXVD2X(V0, R5, R4)
 
 	ADD $16, R5
 	LXVD2X (R5)(R3), V0
-	STXVD2X V0, (R5)(R4)
+	PPC64X_STXVD2X(V0, R5, R4)
 
 	ADD $16, R5
 	LXVD2X (R5)(R3), V0
-	STXVD2X V0, (R5)(R4)
+	PPC64X_STXVD2X(V0, R5, R4)
 
 	ADD $16, R5
 	LXVD2X (R5)(R3), V0
-	STXVD2X V0, (R5)(R4)
+	PPC64X_STXVD2X(V0, R5, R4)
 
 	ADD $16, R5
 	LXVD2X (R5)(R3), V0
-	STXVD2X V0, (R5)(R4)
+	PPC64X_STXVD2X(V0, R5, R4)
 
 	RET
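
For reference, the contract the new PPC64X_STXVD2X path has to satisfy: copyResultsBy4 writes the 32 transposed state words (8 words for each of the 4 lanes) to dst in big-endian byte order, 128 bytes in total. On big-endian ppc64 a plain STXVD2X already produces that layout; on ppc64le the VPERM through the new doubleword-swap mask entries performs the per-word byte reversal. Below is a minimal pure-Go sketch of that contract, not part of the patch; the name copyResultsBy4Ref and the fixed-size array views are illustrative assumptions.

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// copyResultsBy4Ref is a hypothetical pure-Go model of the assembly
	// routine: each of the 32 uint32 state words (8 words x 4 lanes,
	// already transposed into lane-major order) is written to dst in
	// big-endian byte order, 128 bytes in total.
	func copyResultsBy4Ref(dig *[32]uint32, dst *[128]byte) {
		for i, w := range dig {
			binary.BigEndian.PutUint32(dst[i*4:], w)
		}
	}

	func main() {
		var dig [32]uint32
		for i := range dig {
			dig[i] = uint32(i) * 0x01020304
		}
		var dst [128]byte
		copyResultsBy4Ref(&dig, &dst)
		fmt.Printf("% x\n", dst[:8]) // first two words, big-endian
	}

A model like this is handy as a test oracle: running the assembly and the Go version on the same input and comparing the 128-byte outputs on both ppc64 and ppc64le exercises exactly the endian-swap path this diff adds.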