From 37493fe3dfd1de472b5efe12c878befbf9bd8ad2 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Fri, 13 Sep 2024 11:23:31 +0800 Subject: [PATCH] sm4: arm64 cbc, fix register usage issue --- sm4/cbc_arm64.s | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/sm4/cbc_arm64.s b/sm4/cbc_arm64.s index b7a1f49..746b2ec 100644 --- a/sm4/cbc_arm64.s +++ b/sm4/cbc_arm64.s @@ -123,6 +123,9 @@ cbcSm4Nibbles: ADD dstPtr, srcPtrLen, R12 VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4] + VMOV t0.B16, t5.B16 + VMOV t1.B16, t6.B16 + VMOV t2.B16, t7.B16 VREV32 t0.B16, t0.B16 VREV32 t1.B16, t1.B16 VREV32 t2.B16, t2.B16 @@ -147,11 +150,11 @@ cbc4BlocksLoop: VREV32 t2.B16, t2.B16 VREV32 t3.B16, t3.B16 - VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4] - VEOR V6.B16, t0.B16, t0.B16 - VEOR V7.B16, t1.B16, t1.B16 - VEOR V8.B16, t2.B16, t2.B16 - VEOR V9.B16, t3.B16, t3.B16 + VLD1 (R11), [t4.S4] + VEOR t4.B16, t0.B16, t0.B16 + VEOR t5.B16, t1.B16, t1.B16 + VEOR t6.B16, t2.B16, t2.B16 + VEOR t7.B16, t3.B16, t3.B16 VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R12) @@ -170,9 +173,9 @@ cbcSm4Single: // 4 blocks VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4] - VMOV t0.B16, V6.B16 - VMOV t1.B16, V7.B16 - VMOV t2.B16, V8.B16 + VMOV t0.B16, t4.B16 + VMOV t1.B16, t5.B16 + VMOV t2.B16, t6.B16 VREV32 t0.B16, t0.B16 VREV32 t1.B16, t1.B16 VREV32 t2.B16, t2.B16 @@ -196,9 +199,9 @@ cbc4BlocksLoop64: VREV32 t3.B16, t3.B16 VEOR IV.B16, t0.B16, t0.B16 - VEOR V6.B16, t1.B16, t1.B16 - VEOR V7.B16, t2.B16, t2.B16 - VEOR V8.B16, t3.B16, t3.B16 + VEOR t4.B16, t1.B16, t1.B16 + VEOR t5.B16, t2.B16, t2.B16 + VEOR t6.B16, t3.B16, t3.B16 VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr) @@ -234,7 +237,7 @@ cbc4BlocksLoop16: cbcSm4Single32: VLD1 (srcPtr), [t0.S4, t1.S4] - VMOV t0.B16, V6.B16 + VMOV t0.B16, t4.B16 VREV32 t0.B16, t0.B16 VREV32 t1.B16, t1.B16 PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) @@ -254,15 +257,15 @@ cbc4BlocksLoop32: VREV32 t1.B16, t1.B16 VEOR IV.B16, t0.B16, t0.B16 - VEOR V6.B16, t1.B16, t1.B16 + VEOR t4.B16, t1.B16, t1.B16 VST1 [t0.S4, t1.S4], (dstPtr) B cbcSm4Done cbcSm4Single48: VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4] - VMOV t0.B16, V6.B16 - VMOV t1.B16, V7.B16 + VMOV t0.B16, t4.B16 + VMOV t1.B16, t5.B16 VREV32 t0.B16, t0.B16 VREV32 t1.B16, t1.B16 VREV32 t2.B16, t2.B16 @@ -284,8 +287,8 @@ cbc4BlocksLoop48: VREV32 t2.B16, t2.B16 VEOR IV.B16, t0.B16, t0.B16 - VEOR V6.B16, t1.B16, t1.B16 - VEOR V7.B16, t2.B16, t2.B16 + VEOR t4.B16, t1.B16, t1.B16 + VEOR t5.B16, t2.B16, t2.B16 VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)