diff --git a/sm3/sm3blocks_ppc64x.s b/sm3/sm3blocks_ppc64x.s
index a190c02..569431f 100644
--- a/sm3/sm3blocks_ppc64x.s
+++ b/sm3/sm3blocks_ppc64x.s
@@ -60,7 +60,7 @@ GLOBL ·mask(SB), RODATA, $80
 
 // r = s <<< n
 #define PROLD(s, r, n) \
-	VSPLTISW $n, TMP5 \
+	XXSPLTIW $n, TMP5 \ // NOTE(review): XXSPLTIW is a Power ISA 3.1 (Power10) prefixed instruction - confirm the minimum target CPU
 	VRLW	s, TMP5, r
 
 #define loadWordByIndex(W, i) \
@@ -94,10 +94,19 @@ GLOBL ·mask(SB), RODATA, $80
 	VPERM TMP2, TMP3, M2, T2; \
 	VPERM TMP2, TMP3, M3, T3
 
+// LOAD_T broadcasts the 32-bit constant const into every word of target; it clobbers R19.
+// MTVSRWZ puts the value in word element 1 (doubleword 1 is undefined), so splat from 1.
+// index is unused. Open question: how to simplify this? Option 1: a big constant table;
+// option 2: two constants T rotated left by one bit each round. Which performs better?
+#define LOAD_T(index, const, target) \
+	MOVD $const, R19                 \
+	MTVSRWZ R19, target              \
+	VSPLTW $1, target, target
+
 #define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
 	PROLD(a, TMP0, 12)               \
 	VOR TMP0, TMP0, TMP1             \
-	VSPLTISW $const, TMP2			 \
+	LOAD_T(index, const, TMP2)       \
 	VADDUWM TMP2, TMP0, TMP0         \
 	VADDUWM e, TMP0, TMP0            \
 	PROLD(TMP0, TMP2, 7)             \ // TMP2 = SS1
@@ -152,7 +161,7 @@ GLOBL ·mask(SB), RODATA, $80
 	MESSAGE_SCHEDULE(index)          \ // TMP1 is Wt+4 now, Pls do not use it
 	PROLD(a, TMP0, 12)               \
 	VOR TMP0, TMP0, TMP4             \
-	VSPLTISW $const, TMP2            \
+	LOAD_T(index, const, TMP2)       \
 	VADDUWM TMP2, TMP0, TMP0         \
 	VADDUWM e, TMP0, TMP0            \
 	PROLD(TMP0, TMP2, 7)             \ // TMP2 = SS1