diff --git a/sm3/sm3block_ppc64x.s b/sm3/sm3block_ppc64x.s
index 07c4c5b..3397dba 100644
--- a/sm3/sm3block_ppc64x.s
+++ b/sm3/sm3block_ppc64x.s
@@ -7,6 +7,11 @@
 #include "textflag.h"
 #include "sm3_const_asm.s"
 
+// We can also use MFVSRWZ to extract the first word from vector register
+// and then use VSLDOI to shift the vector register, thus we can avoid the usage of memory (buffer).
+// But due to we have no real phisical machine to test the performance difference,
+// we'll keep current implementation first.
+
 #ifdef GOARCH_ppc64le
 #define NEEDS_PERMW
 
@@ -77,17 +82,10 @@ GLOBL ·flip_mask(SB), RODATA, $16
 	XOR     tmp, out;                              \
 	XOR     tt2, out
 
-// Load w from buffer
-#define LOAD_WORD1(idx, dst) \
-	MOVWZ $(idx*4)(BUFFER),  dst
-
-// Load w' from buffer
-#define LOAD_WORD2(idx, dst)  \
-	MOVWZ $(idx*4 + 16)(BUFFER),  dst
-
 // For rounds [0 - 16)
+// addr1 for w, addr2 for w'
 #define DO_ROUND_N_0(addr1, addr2, const, a, b, c, d, e, f, g, h) \
-	;                                            \ // #############################  RND N + 0 ############################//
+	;                                            \
 	SS12(a, e, const, y2, y0);                   \
 	MOVWZ addr1, y1;                             \
 	ADD   y1, y2;                                \ // y2 = SS1 + W
@@ -110,8 +108,9 @@ GLOBL ·flip_mask(SB), RODATA, $16
 	P0(y2, y0, d)
 
 // For rounds [16 - 64)
+// addr1 for w, addr2 for w'
 #define DO_ROUND_N_1(addr1, addr2, const, a, b, c, d, e, f, g, h) \
-	;                                            \ // #############################  RND N + 0 ############################//
+	;                                            \
 	SS12(a, e, const, y2, y0);                   \
 	MOVWZ addr1, y1;                             \
 	ADD     y1, y2;                              \ // y2 = SS1 + W
@@ -150,24 +149,24 @@ GLOBL ·flip_mask(SB), RODATA, $16
 	PROLD(XTMP0, XTMP1, 7);            \ // XTMP1 = W[-13] rol 7
 	VSLDOI $8, XWORD2, XWORD3, XTMP0;  \ // XTMP0 = W[-6] = {w10, w11, w12, w13}
 	VXOR XTMP0, XTMP1, XTMP0;          \ // XTMP0 = W[-6] xor (W[-13] rol 7)
-	; \
+	; \ // Prepare P1 parameters
 	VSLDOI $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w7, w8, w9, w10}
 	VXOR XTMP1, XWORD0, XTMP1;         \ // XTMP1 = W[-9] xor W[-16]
 	VSLDOI $4, XWORD3, XWORD2, XTMP3;  \ // XTMP3 = W[-3] = {w13, w14, w15, w8}
 	PROLD(XTMP3, XTMP2, 15);           \ // XTMP2 = W[-3] rol 15
 	VXOR XTMP1, XTMP2, XTMP2;          \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABxx}
-	; \
+	; \ // P1
 	PROLD(XTMP2, XTMP4, 15);           \ // XTMP4 =  = XTMP2 rol 15 {ABxx}
 	PROLD(XTMP4, XTMP3, 8);            \ // XTMP3 = XTMP2 rol 23 {ABxx}
 	VXOR XTMP2, XTMP4, XTMP4;          \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx})
 	VXOR XTMP4, XTMP3, XTMP4;          \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) XOR (XTMP2 rol 23 {ABxx})
 	; \ // First 2 words message schedule result
 	VXOR XTMP4, XTMP0, XTMP2;          \ // XTMP2 = {w[0], w[1], ..., ...}
-	; \
+	; \ // Prepare P1 parameters
 	VSLDOI $4, XWORD3, XTMP2, XTMP3;   \ // XTMP3 = W[-3] = {w13, w14, w15, w0}
 	PROLD(XTMP3, XTMP4, 15);           \ // XTMP4 = W[-3] rol 15
 	VXOR XTMP1, XTMP4, XTMP4;		   \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABCD}
-	; \
+	; \ // P1
 	PROLD(XTMP4, XTMP3, 15);           \ // XTMP3 =  = XTMP4 rol 15 {ABCD}
 	PROLD(XTMP3, XTMP1, 8);            \ // XTMP1 = XTMP4 rol 23 {ABCD}
 	VXOR XTMP4, XTMP3, XTMP3;          \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {ABCD})
@@ -181,7 +180,6 @@ TEXT ·blockASM(SB), NOSPLIT, $0
 #ifdef NEEDS_PERMW
 	MOVD	$·flip_mask(SB), TEMP
 	LVX	(TEMP), ESPERMW
-	ADD	$0x10, TEMP
 #endif
 
 	MOVD	dig+0(FP), CTX