internal/subtle: remove xor related codes #315

2025-04-27 04:36:19 +08:00 · 2025-03-13 15:20:05 +08:00 · 2025-03-13 15:20:05 +08:00 · 7ec46d700d
commit 7ec46d700d
parent 7a5253bfb5
9 changed files with 0 additions and 823 deletions
--- a/internal/subtle/xor.go
+++ b/internal/subtle/xor.go
@ -1,32 +0,0 @@
 // Copyright 2022 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package subtle
 import "github.com/emmansun/gmsm/internal/alias"
 // XORBytes computes dst[i] = x[i] ^ y[i] for every i below
 // n = min(len(x), len(y)) and reports n, the count of bytes stored in dst.
 //
 // When dst is shorter than n the function panics before touching dst.
 //
 // dst may share memory with x or y only if the overlap is exact;
 // a partial overlap may cause a panic.
 func XORBytes(dst, x, y []byte) int {
 	// The number of bytes processed is bounded by the shorter input.
 	count := len(y)
 	if len(x) <= count {
 		count = len(x)
 	}
 	switch {
 	case count == 0:
 		return 0
 	case len(dst) < count:
 		panic("subtle.XORBytes: dst too short")
 	}
 	out := dst[:count]
 	if alias.InexactOverlap(out, x[:count]) {
 		panic("subtle.XORBytes: invalid overlap")
 	}
 	if alias.InexactOverlap(out, y[:count]) {
 		panic("subtle.XORBytes: invalid overlap")
 	}
 	// count > 0 here, so taking the address of element 0 is safe.
 	xorBytes(&out[0], &x[0], &y[0], count) // arch-specific
 	return count
 }
--- a/internal/subtle/xor_amd64.s
+++ b/internal/subtle/xor_amd64.s
@ -1,112 +0,0 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 //
 //go:build !purego
 #include "textflag.h"
 // func xorBytes(dst, a, b *byte, n int)
 //
 // xorBytes stores a[i] ^ b[i] into dst[i] for i in [0, n).
 //
 // Register use: BX = dst, SI = a, CX = b, DX = n (remaining length),
 // AX = forward index in the vector loops (also a byte/qword temp in the
 // tail code), DI = temp.
 //
 // Strategy: when n >= 32 and useAVX2 is set, the avx2 path is taken,
 // otherwise the 16-byte MOVOU/PXOR path. Both paths first consume the
 // tail of the buffers backwards (single bytes, then 8 bytes, and on the
 // avx2 path 16 bytes) until DX is a multiple of the vector width, and
 // then XOR forwards from offset 0 up to DX.
 //
 // NOTE: the forward loops compare for equality only after the first
 // iteration, so this routine relies on n > 0; XORBytes returns early
 // for n == 0 before calling it.
 TEXT ·xorBytes(SB), NOSPLIT, $0
 	MOVQ  dst+0(FP), BX
 	MOVQ  a+8(FP), SI
 	MOVQ  b+16(FP), CX
 	MOVQ  n+24(FP), DX
 	CMPQ  DX, $32         // if len less than 32, non avx2.
 	JL non_avx2
 	// Use the avx2 path only when the Go variable useAVX2 is true.
 	CMPB ·useAVX2(SB), $1
 	JE   avx2
 non_avx2:
 	TESTQ $15, DX            // AND 15 & len, if not zero jump to not_aligned.
 	JNZ   not_aligned
 aligned:
 	// DX is a non-zero multiple of 16 here.
 	MOVQ $0, AX // position in slices
 loop16b:
 	MOVOU (SI)(AX*1), X0   // XOR 16byte forwards.
 	MOVOU (CX)(AX*1), X1
 	PXOR  X1, X0
 	MOVOU X0, (BX)(AX*1)
 	ADDQ  $16, AX
 	CMPQ  DX, AX
 	JNE   loop16b
 	RET
 loop_1b:
 	SUBQ  $1, DX           // XOR 1byte backwards.
 	MOVB  (SI)(DX*1), DI
 	MOVB  (CX)(DX*1), AX
 	XORB  AX, DI
 	MOVB  DI, (BX)(DX*1)
 	TESTQ $7, DX           // AND 7 & len, if not zero jump to loop_1b.
 	JNZ   loop_1b
 	CMPQ  DX, $0           // if len is 0, ret.
 	JE    ret
 	TESTQ $15, DX          // AND 15 & len, if zero jump to aligned.
 	JZ    aligned
 not_aligned:
 	TESTQ $7, DX           // AND $7 & len, if not zero jump to loop_1b.
 	JNE   loop_1b
 	SUBQ  $8, DX           // XOR 8bytes backwards.
 	MOVQ  (SI)(DX*1), DI
 	MOVQ  (CX)(DX*1), AX
 	XORQ  AX, DI
 	MOVQ  DI, (BX)(DX*1)
 	CMPQ  DX, $16          // if len is greater or equal 16 here, it must be aligned.
 	JGE   aligned
 ret:
 	RET
 avx2:
 	TESTQ $31, DX          // AND 31 & len, if not zero jump to avx2_not_aligned.
 	JNZ   avx2_not_aligned
 avx2_aligned:              // input length = 16*n, where n is greater or equal 2.
 	TESTQ $16, DX          // AND 16 & len, if zero jump to loop32b_start.
 	JE loop32b_start
 	SUBQ  $16, DX          // XOR 16bytes backwards.
 	VMOVDQU (SI)(DX*1), X0
 	VPXOR  (CX)(DX*1), X0, X0
 	VMOVDQU X0, (BX)(DX*1)
 loop32b_start:
 	// DX is a non-zero multiple of 32 here.
 	MOVQ $0, AX            // position in slices
 loop32b:
 	VMOVDQU (SI)(AX*1), Y0   // XOR 32byte forwards.
 	VPXOR (CX)(AX*1), Y0, Y0
 	VMOVDQU Y0, (BX)(AX*1)
 	ADDQ  $32, AX
 	CMPQ  DX, AX
 	JNE   loop32b
 avx2_ret:	
 	// Clear the upper halves of the YMM registers before returning.
 	VZEROUPPER
 	RET
 avx2_loop_1b:
 	SUBQ  $1, DX           // XOR 1byte backwards.
 	MOVB  (SI)(DX*1), DI
 	MOVB  (CX)(DX*1), AX
 	XORB  AX, DI
 	MOVB  DI, (BX)(DX*1)
 	TESTQ $7, DX           // AND 7 & len, if not zero jump to avx2_loop_1b.
 	JNZ   avx2_loop_1b
 	TESTQ $15, DX          // AND 15 & len, if zero jump to avx2_aligned.
 	JZ    avx2_aligned
 avx2_not_aligned:
 	TESTQ $7, DX           // AND $7 & len, if not zero jump to avx2_loop_1b.
 	JNE   avx2_loop_1b
 	TESTQ $8, DX           // AND $8 & len, if zero jump to avx2_aligned.
 	JE   avx2_aligned
 	SUBQ  $8, DX           // XOR 8bytes backwards.
 	MOVQ  (SI)(DX*1), DI
 	MOVQ  (CX)(DX*1), AX
 	XORQ  AX, DI
 	MOVQ  DI, (BX)(DX*1)
 	JMP  avx2_aligned
--- a/internal/subtle/xor_arm64.s
+++ b/internal/subtle/xor_arm64.s
@ -1,69 +0,0 @@
 // Copyright 2020 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 //
 //go:build !purego
 #include "textflag.h"
 // func xorBytes(dst, a, b *byte, n int)
 //
 // xorBytes stores a[i] ^ b[i] into dst[i] for i in [0, n).
 //
 // Register use: R0 = dst, R1 = a, R2 = b, R3 = n (remaining length).
 // All loads and stores except the final single byte use post-increment
 // addressing (.P), so the three pointers advance as data is consumed.
 //
 // Strategy: XOR 64 bytes per iteration while at least 64 bytes remain.
 // The remainder is then below 64, so bits 5..0 of R3 each select one
 // optional step of 32, 16, 8, 4, 2 and 1 bytes respectively.
 TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
 	MOVD	dst+0(FP), R0
 	MOVD	a+8(FP), R1
 	MOVD	b+16(FP), R2
 	MOVD	n+24(FP), R3
 	CMP	$64, R3
 	BLT	tail
 loop_64:
 	// XOR 64 bytes using four 16-byte vector registers per operand.
 	VLD1.P	64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
 	VLD1.P	64(R2), [V4.B16, V5.B16, V6.B16, V7.B16]
 	VEOR	V0.B16, V4.B16, V4.B16
 	VEOR	V1.B16, V5.B16, V5.B16
 	VEOR	V2.B16, V6.B16, V6.B16
 	VEOR	V3.B16, V7.B16, V7.B16
 	VST1.P	[V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
 	SUBS	$64, R3
 	CMP	$64, R3
 	BGE	loop_64
 tail:
 	// quick end
 	CBZ	R3, end
 	// Bit 5 of the remaining length set: XOR 32 bytes.
 	TBZ	$5, R3, less_than32
 	VLD1.P	32(R1), [V0.B16, V1.B16]
 	VLD1.P	32(R2), [V2.B16, V3.B16]
 	VEOR	V0.B16, V2.B16, V2.B16
 	VEOR	V1.B16, V3.B16, V3.B16
 	VST1.P	[V2.B16, V3.B16], 32(R0)
 less_than32:
 	// Bit 4 set: XOR 16 bytes with a pair of 64-bit registers.
 	TBZ	$4, R3, less_than16
 	LDP.P	16(R1), (R11, R12)
 	LDP.P	16(R2), (R13, R14)
 	EOR	R11, R13, R13
 	EOR	R12, R14, R14
 	STP.P	(R13, R14), 16(R0)
 less_than16:
 	// Bit 3 set: XOR 8 bytes.
 	TBZ	$3, R3, less_than8
 	MOVD.P	8(R1), R11
 	MOVD.P	8(R2), R12
 	EOR	R11, R12, R12
 	MOVD.P	R12, 8(R0)
 less_than8:
 	// Bit 2 set: XOR 4 bytes.
 	TBZ	$2, R3, less_than4
 	MOVWU.P	4(R1), R13
 	MOVWU.P	4(R2), R14
 	EORW	R13, R14, R14
 	MOVWU.P	R14, 4(R0)
 less_than4:
 	// Bit 1 set: XOR 2 bytes.
 	TBZ	$1, R3, less_than2
 	MOVHU.P	2(R1), R15
 	MOVHU.P	2(R2), R16
 	EORW	R15, R16, R16
 	MOVHU.P	R16, 2(R0)
 less_than2:
 	// Bit 0 set: XOR the final byte (no pointer update needed).
 	TBZ	$0, R3, end
 	MOVBU	(R1), R17
 	MOVBU	(R2), R19
 	EORW	R17, R19, R19
 	MOVBU	R19, (R0)
 end:
 	RET
--- a/internal/subtle/xor_asm.go
+++ b/internal/subtle/xor_asm.go
@ -1,14 +0,0 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 //
 //go:build !purego && (amd64 || arm64 || ppc64 || ppc64le || riscv64 || s390x)
 package subtle
 import "github.com/emmansun/gmsm/internal/cpu"
 // useAVX2 records whether cpu.X86.HasAVX2 is set. The amd64 assembly
 // reads it (as ·useAVX2) to choose between its AVX2 and 16-byte paths.
 var useAVX2 = cpu.X86.HasAVX2
 // xorBytes stores a[i] ^ b[i] into dst[i] for i in [0, n). It has no Go
 // body here; the implementation is provided by the per-architecture
 // assembly files selected by this file's build constraint.
 //
 //go:noescape
 func xorBytes(dst, a, b *byte, n int)
--- a/internal/subtle/xor_generic.go
+++ b/internal/subtle/xor_generic.go
@ -1,64 +0,0 @@
 // Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 //
 //go:build purego || !(amd64 || arm64 || s390x || ppc64 || ppc64le || riscv64)
 package subtle
 import (
 	"runtime"
 	"unsafe"
 )
 const (
 	// wordSize is the size in bytes of a uintptr on the target.
 	wordSize = unsafe.Sizeof(uintptr(0))
 	// supportsUnaligned is true for the GOARCH values on which xorBytes
 	// takes the word-at-a-time path without checking pointer alignment.
 	supportsUnaligned = runtime.GOARCH == "386" ||
 		runtime.GOARCH == "amd64" ||
 		runtime.GOARCH == "ppc64" ||
 		runtime.GOARCH == "ppc64le" ||
 		runtime.GOARCH == "s390x"
 )
 // xorBytes is the portable implementation: it stores xb[i] ^ yb[i] into
 // dstb[i] for i in [0, n). The pointer-plus-length signature mirrors the
 // assembly versions, so the arguments are first turned back into slices.
 func xorBytes(dstb, xb, yb *byte, n int) {
 	dst, x, y := unsafe.Slice(dstb, n), unsafe.Slice(xb, n), unsafe.Slice(yb, n)
 	// Without unaligned access support and with at least one pointer not
 	// word-aligned, fall back to a plain byte-by-byte loop.
 	if !supportsUnaligned && !aligned(dstb, xb, yb) {
 		xorLoop(dst, x, y)
 		return
 	}
 	// Process all whole words first.
 	xorLoop(words(dst), words(x), words(y))
 	// Then finish any trailing partial word one byte at a time.
 	if tail := n &^ int(wordSize-1); tail != n {
 		xorLoop(dst[tail:], x[tail:], y[tail:])
 	}
 }
 // aligned reports whether all three pointers sit on a word boundary,
 // i.e. none of dst, x, or y has any of the low wordSize-1 address bits set.
 func aligned(dst, x, y *byte) bool {
 	const mask = wordSize - 1
 	// OR the addresses together so a single test covers all three.
 	bits := uintptr(unsafe.Pointer(dst)) | uintptr(unsafe.Pointer(x)) | uintptr(unsafe.Pointer(y))
 	return bits&mask == 0
 }
 // words reinterprets x as a []uintptr over the same memory, dropping
 // any trailing bytes that do not fill a complete word.
 func words(x []byte) []uintptr {
 	if count := uintptr(len(x)) / wordSize; count > 0 {
 		return unsafe.Slice((*uintptr)(unsafe.Pointer(&x[0])), count)
 	}
 	// Fewer bytes than one word: do not form a *uintptr to data smaller
 	// than a uintptr; see issue 59334.
 	return nil
 }
 // xorLoop stores x[i] ^ y[i] into dst[i] for every index of dst.
 // x and y are resliced to len(dst), which panics if either lacks the
 // capacity for it.
 func xorLoop[T byte | uintptr](dst, x, y []T) {
 	n := len(dst)
 	// Reslicing up front lets the compiler drop bounds checks in the loop.
 	x, y = x[:n], y[:n]
 	for i := 0; i < n; i++ {
 		dst[i] = x[i] ^ y[i]
 	}
 }
--- a/internal/subtle/xor_ppc64x.s
+++ b/internal/subtle/xor_ppc64x.s
@ -1,142 +0,0 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 //go:build (ppc64 || ppc64le) && !purego
 #include "textflag.h"
 // func xorBytes(dst, a, b *byte, n int)
 //
 // xorBytes stores a[i] ^ b[i] into dst[i] for i in [0, n).
 //
 // Register use: R3 = dst, R4 = a, R5 = b, R6 = n (remaining length),
 // R8 = running byte index into all three buffers.
 //
 // Strategy: 64 bytes per iteration with VSX loads/stores while n >= 64,
 // then optional 32- and 16-byte steps, then the final bytes either with
 // one length-controlled vector load/store (GOPPC64_power10) or with
 // 8/4/2/1-byte scalar steps.
 TEXT ·xorBytes(SB), NOSPLIT, $0
 	MOVD	dst+0(FP), R3	// R3 = dst
 	MOVD	a+8(FP), R4	// R4 = a
 	MOVD	b+16(FP), R5	// R5 = b
 	MOVD	n+24(FP), R6	// R6 = n
 	CMPU	R6, $64, CR7	// Check if n ≥ 64 bytes
 	// NOTE(review): R0 is presumably the always-zero register here, so
 	// this starts the index at 0 — confirm against the Go ppc64 conventions.
 	MOVD	R0, R8		// R8 = index
 	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 64 bytes
 	BLE	CR6, small	// <= 8
 	BLT	CR7, xor32	// Case for 32 ≤ n < 64 bytes
 	// Case for n ≥ 64 bytes
 preloop64:
 	SRD	$6, R6, R7	// Set up loop counter
 	MOVD	R7, CTR
 	// R10, R14, R15 are the offsets of the 2nd, 3rd and 4th vectors
 	// of each 64-byte chunk relative to R8.
 	MOVD	$16, R10
 	MOVD	$32, R14
 	MOVD	$48, R15
 	ANDCC	$63, R6, R9	// Check for tailing bytes for later
 	PCALIGN $16
 	// Case for >= 64 bytes
 	// Process 64 bytes per iteration
 	// Load 4 vectors of a and b
 	// XOR the corresponding vectors
 	// from a and b and store the result
 loop64:
 	LXVD2X	(R4)(R8), VS32
 	LXVD2X	(R4)(R10), VS34
 	LXVD2X	(R4)(R14), VS36
 	LXVD2X	(R4)(R15), VS38
 	LXVD2X	(R5)(R8), VS33
 	LXVD2X	(R5)(R10), VS35
 	LXVD2X	(R5)(R14), VS37
 	LXVD2X	(R5)(R15), VS39
 	XXLXOR	VS32, VS33, VS32
 	XXLXOR	VS34, VS35, VS34
 	XXLXOR	VS36, VS37, VS36
 	XXLXOR	VS38, VS39, VS38
 	STXVD2X	VS32, (R3)(R8)
 	STXVD2X	VS34, (R3)(R10)
 	STXVD2X	VS36, (R3)(R14)
 	STXVD2X	VS38, (R3)(R15)
 	ADD	$64, R8
 	ADD	$64, R10
 	ADD	$64, R14
 	ADD	$64, R15
 	BDNZ	loop64
 	// Return if the ANDCC above found no bytes beyond the 64-byte chunks.
 	BC	12,2,LR		// BEQLR
 	MOVD	R9, R6
 	CMP	R6, $8
 	BLE	small
 	// Case for 8 <= n < 64 bytes
 	// Process 32 bytes if available
 xor32:
 	CMP	R6, $32
 	BLT	xor16
 	ADD	$16, R8, R9
 	LXVD2X	(R4)(R8), VS32
 	LXVD2X	(R4)(R9), VS33
 	LXVD2X	(R5)(R8), VS34
 	LXVD2X	(R5)(R9), VS35
 	XXLXOR	VS32, VS34, VS32
 	XXLXOR	VS33, VS35, VS33
 	STXVD2X	VS32, (R3)(R8)
 	STXVD2X	VS33, (R3)(R9)
 	ADD	$32, R8
 	ADD	$-32, R6
 	CMP	R6, $8
 	BLE	small
 	// Case for 8 <= n < 32 bytes
 	// Process 16 bytes if available
 xor16:
 	CMP	R6, $16
 	BLT	xor8
 	LXVD2X	(R4)(R8), VS32
 	LXVD2X	(R5)(R8), VS33
 	XXLXOR	VS32, VS33, VS32
 	STXVD2X	VS32, (R3)(R8)
 	ADD	$16, R8
 	ADD	$-16, R6
 small:
 	// Nothing left to do when the remaining length is zero.
 	CMP	R6, $0
 	BC	12,2,LR		// BEQLR
 xor8:
 #ifdef GOPPC64_power10
 	// Handle the whole remainder with one length-controlled vector
 	// load/XOR/store. The remaining length is shifted into the top byte
 	// of R17, which is passed as the length operand of LXVL/STXVL.
 	SLD	$56,R6,R17
 	ADD	R4,R8,R18
 	ADD	R5,R8,R19
 	ADD	R3,R8,R20
 	LXVL	R18,R17,V0
 	LXVL	R19,R17,V1
 	VXOR	V0,V1,V1
 	STXVL	V1,R20,R17
 	RET
 #else
 	CMP	R6, $8
 	BLT	xor4
 	// Case for 8 ≤ n < 16 bytes
 	MOVD	(R4)(R8), R14   // R14 = a[i,...,i+7]
 	MOVD	(R5)(R8), R15   // R15 = b[i,...,i+7]
 	XOR	R14, R15, R16   // R16 = a[] ^ b[]
 	SUB	$8, R6          // n = n - 8
 	MOVD	R16, (R3)(R8)   // Store to dst
 	ADD	$8, R8
 xor4:
 	// XOR 4 bytes if at least 4 remain.
 	CMP	R6, $4
 	BLT	xor2
 	MOVWZ	(R4)(R8), R14
 	MOVWZ	(R5)(R8), R15
 	XOR	R14, R15, R16
 	MOVW	R16, (R3)(R8)
 	ADD	$4,R8
 	ADD	$-4,R6
 xor2:
 	// XOR 2 bytes if at least 2 remain.
 	CMP	R6, $2
 	BLT	xor1
 	MOVHZ	(R4)(R8), R14
 	MOVHZ	(R5)(R8), R15
 	XOR	R14, R15, R16
 	MOVH	R16, (R3)(R8)
 	ADD	$2,R8
 	ADD	$-2,R6
 xor1:
 	// XOR the last byte, if any.
 	CMP	R6, $0
 	BC	12,2,LR		// BEQLR
 	MOVBZ	(R4)(R8), R14	// R14 = a[i]
 	MOVBZ	(R5)(R8), R15	// R15 = b[i]
 	XOR	R14, R15, R16	// R16 = a[i] ^ b[i]
 	MOVB	R16, (R3)(R8)	// Store to dst
 #endif
 done:
 	RET
--- a/internal/subtle/xor_riscv64.s
+++ b/internal/subtle/xor_riscv64.s
@ -1,169 +0,0 @@
 // Copyright 2024 Sun Yimin. All rights reserved.
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.
 //go:build !purego
 #include "textflag.h"
 // func xorBytes(dst, a, b *byte, n int)
 //
 // xorBytes stores a[i] ^ b[i] into dst[i] for i in [0, n).
 //
 // Register use: X10 = dst, X11 = a, X12 = b, X13 = n (remaining length),
 // X15 = threshold for the current size comparison, X16-X23 = data temps.
 // The three pointers are advanced as data is consumed.
 //
 // Strategy: inputs shorter than 32 bytes, or whose three pointers do not
 // share the same offset within an 8-byte word, are handled entirely by
 // the byte loops (loop4 / loop1). Otherwise single bytes are processed
 // until the pointers are 8-byte aligned, followed by 64-, 32- and
 // 16-byte steps of 8-byte word operations, with the byte loops
 // finishing whatever remains.
 TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
 	MOV	dst+0(FP), X10
 	MOV	a+8(FP), X11
 	MOV	b+16(FP), X12
 	MOV	n+24(FP), X13
 	// Fewer than 32 bytes: go straight to the byte loops.
 	MOV	$32, X15
 	BLT	X13, X15, loop4_check
 	// Check alignment - if alignment differs we have to do one byte at a time.
 	AND	$7, X10, X5
 	AND	$7, X11, X6
 	AND	$7, X12, X7
 	BNE	X5, X6, loop4_check
 	BNE	X5, X7, loop4_check
 	BEQZ	X5, loop64_check
 	// Check one byte at a time until we reach 8 byte alignment.
 	// X8 = 8 - (address & 7) is the number of leading bytes to process;
 	// it is subtracted from the remaining length up front.
 	MOV	$8, X8
 	SUB	X5, X8
 	SUB	X8, X13
 align:
 	MOVBU	0(X11), X16
 	MOVBU	0(X12), X17
 	XOR	X16, X17
 	MOVB	X17, 0(X10)
 	ADD	$1, X10
 	ADD	$1, X11
 	ADD	$1, X12
 	SUB	$1, X8
 	BNEZ	X8, align
 loop64_check:
 	MOV	$64, X15
 	BLT	X13, X15, tail32_check
 	PCALIGN	$16
 loop64:
 	// XOR 64 bytes as eight 8-byte words, two words at a time.
 	MOV	0(X11), X16
 	MOV	0(X12), X17
 	MOV	8(X11), X18
 	MOV	8(X12), X19
 	XOR	X16, X17
 	XOR	X18, X19
 	MOV	X17, 0(X10)
 	MOV	X19, 8(X10)
 	MOV	16(X11), X20
 	MOV	16(X12), X21
 	MOV	24(X11), X22
 	MOV	24(X12), X23
 	XOR	X20, X21
 	XOR	X22, X23
 	MOV	X21, 16(X10)
 	MOV	X23, 24(X10)
 	MOV	32(X11), X16
 	MOV	32(X12), X17
 	MOV	40(X11), X18
 	MOV	40(X12), X19
 	XOR	X16, X17
 	XOR	X18, X19
 	MOV	X17, 32(X10)
 	MOV	X19, 40(X10)
 	MOV	48(X11), X20
 	MOV	48(X12), X21
 	MOV	56(X11), X22
 	MOV	56(X12), X23
 	XOR	X20, X21
 	XOR	X22, X23
 	MOV	X21, 48(X10)
 	MOV	X23, 56(X10)
 	ADD	$64, X10
 	ADD	$64, X11
 	ADD	$64, X12
 	SUB	$64, X13
 	// X15 still holds 64: keep looping while a full chunk remains.
 	BGE	X13, X15, loop64
 	BEQZ	X13, done
 tail32_check:
 	// XOR 32 bytes if at least 32 remain.
 	MOV	$32, X15
 	BLT	X13, X15, tail16_check
 	MOV	0(X11), X16
 	MOV	0(X12), X17
 	MOV	8(X11), X18
 	MOV	8(X12), X19
 	XOR	X16, X17
 	XOR	X18, X19
 	MOV	X17, 0(X10)
 	MOV	X19, 8(X10)
 	MOV	16(X11), X20
 	MOV	16(X12), X21
 	MOV	24(X11), X22
 	MOV	24(X12), X23
 	XOR	X20, X21
 	XOR	X22, X23
 	MOV	X21, 16(X10)
 	MOV	X23, 24(X10)
 	ADD	$32, X10
 	ADD	$32, X11
 	ADD	$32, X12
 	SUB	$32, X13
 	BEQZ	X13, done
 tail16_check:
 	// XOR 16 bytes if at least 16 remain.
 	MOV	$16, X15
 	BLT	X13, X15, loop4_check
 	MOV	0(X11), X16
 	MOV	0(X12), X17
 	MOV	8(X11), X18
 	MOV	8(X12), X19
 	XOR	X16, X17
 	XOR	X18, X19
 	MOV	X17, 0(X10)
 	MOV	X19, 8(X10)
 	ADD	$16, X10
 	ADD	$16, X11
 	ADD	$16, X12
 	SUB	$16, X13
 	BEQZ	X13, done
 loop4_check:
 	// Byte-wise processing, four bytes per iteration while >= 4 remain.
 	MOV	$4, X15
 	BLT	X13, X15, loop1
 	PCALIGN	$16
 loop4:
 	MOVBU	0(X11), X16
 	MOVBU	0(X12), X17
 	MOVBU	1(X11), X18
 	MOVBU	1(X12), X19
 	XOR	X16, X17
 	XOR	X18, X19
 	MOVB	X17, 0(X10)
 	MOVB	X19, 1(X10)
 	MOVBU	2(X11), X20
 	MOVBU	2(X12), X21
 	MOVBU	3(X11), X22
 	MOVBU	3(X12), X23
 	XOR	X20, X21
 	XOR	X22, X23
 	MOVB	X21, 2(X10)
 	MOVB	X23, 3(X10)
 	ADD	$4, X10
 	ADD	$4, X11
 	ADD	$4, X12
 	SUB	$4, X13
 	BGE	X13, X15, loop4
 	PCALIGN	$16
 loop1:
 	// One byte per iteration until nothing remains.
 	BEQZ	X13, done
 	MOVBU	0(X11), X16
 	MOVBU	0(X12), X17
 	XOR	X16, X17
 	MOVB	X17, 0(X10)
 	ADD	$1, X10
 	ADD	$1, X11
 	ADD	$1, X12
 	SUB	$1, X13
 	JMP	loop1
 done:
 	RET
--- a/internal/subtle/xor_s390x.s
+++ b/internal/subtle/xor_s390x.s
@ -1,98 +0,0 @@
 // Copyright 2024 Sun Yimin. All rights reserved.
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.
 //go:build !purego
 #include "textflag.h"
 // func xorBytes(dst, a, b *byte, n int)
 //
 // xorBytes stores a[i] ^ b[i] into dst[i] for i in [0, n).
 //
 // Register use: R1 = dst, R2 = a, R3 = b, R4 = n (remaining length),
 // R5 = running byte index into all three buffers, R7/R8 = scalar temps.
 //
 // Strategy: 64 bytes per iteration with vector loads/stores while at
 // least 64 bytes remain, then optional steps of 32 and 16 bytes (vector)
 // and 8, 4, 2 and 1 bytes (scalar).
 TEXT ·xorBytes(SB),NOSPLIT,$0-32
 	MOVD	dst+0(FP), R1
 	MOVD	a+8(FP), R2
 	MOVD	b+16(FP), R3
 	MOVD	n+24(FP), R4
 	MOVD	$0, R5
 	CMPBLT	R4, $64, tail
 loop_64:
 	// XOR 64 bytes using four 16-byte vector registers per operand.
 	VL 0(R2)(R5*1), V0
 	VL 16(R2)(R5*1), V1
 	VL 32(R2)(R5*1), V2
 	VL 48(R2)(R5*1), V3
 	VL 0(R3)(R5*1), V4
 	VL 16(R3)(R5*1), V5
 	VL 32(R3)(R5*1), V6
 	VL 48(R3)(R5*1), V7
 	VX V0, V4, V4
 	VX V1, V5, V5
 	VX V2, V6, V6
 	VX V3, V7, V7
 	VST V4, 0(R1)(R5*1)
 	VST V5, 16(R1)(R5*1)
 	VST V6, 32(R1)(R5*1)
 	VST V7, 48(R1)(R5*1)
 	// Advance the index by 64 and reduce the remaining length.
 	LAY	64(R5), R5
 	SUB	$64, R4
 	CMPBGE	R4, $64, loop_64
 tail:
 	CMPBEQ	R4, $0, done
 	// XOR 32 bytes if at least 32 remain.
 	CMPBLT	R4, $32, less_than32
 	VL 0(R2)(R5*1), V0
 	VL 16(R2)(R5*1), V1
 	VL 0(R3)(R5*1), V2
 	VL 16(R3)(R5*1), V3
 	VX V0, V2, V2
 	VX V1, V3, V3
 	VST V2, 0(R1)(R5*1)
 	VST V3, 16(R1)(R5*1)
 	LAY	32(R5), R5
 	SUB	$32, R4
 less_than32:
 	// XOR 16 bytes if at least 16 remain.
 	CMPBLT	R4, $16, less_than16
 	VL 0(R2)(R5*1), V0
 	VL 0(R3)(R5*1), V1
 	VX V0, V1, V1
 	VST V1, 0(R1)(R5*1)
 	LAY	16(R5), R5
 	SUB	$16, R4
 less_than16:	
 	// XOR 8 bytes if at least 8 remain.
 	CMPBLT	R4, $8, less_than8
 	MOVD	0(R2)(R5*1), R7
 	MOVD	0(R3)(R5*1), R8
 	XOR	R7, R8
 	MOVD	R8, 0(R1)(R5*1)
 	LAY	8(R5), R5
 	SUB	$8, R4
 less_than8:
 	// XOR 4 bytes if at least 4 remain.
 	CMPBLT	R4, $4, less_than4
 	MOVWZ	0(R2)(R5*1), R7
 	MOVWZ	0(R3)(R5*1), R8
 	XOR	R7, R8
 	MOVW	R8, 0(R1)(R5*1)
 	LAY	4(R5), R5
 	SUB	$4, R4
 less_than4:
 	// XOR 2 bytes if at least 2 remain.
 	CMPBLT	R4, $2, less_than2
 	MOVHZ	0(R2)(R5*1), R7
 	MOVHZ	0(R3)(R5*1), R8
 	XOR	R7, R8
 	MOVH	R8, 0(R1)(R5*1)
 	LAY	2(R5), R5
 	SUB	$2, R4
 less_than2:
 	// XOR the last byte, if any.
 	CMPBEQ	R4, $0, done
 	MOVB	0(R2)(R5*1), R7
 	MOVB	0(R3)(R5*1), R8
 	XOR	R7, R8
 	MOVB	R8, 0(R1)(R5*1)
 done:
 	RET
--- a/internal/subtle/xor_test.go
+++ b/internal/subtle/xor_test.go
@ -1,123 +0,0 @@
 package subtle_test
 import (
 	"bytes"
 	"crypto/rand"
 	"fmt"
 	"io"
 	"testing"
 	"github.com/emmansun/gmsm/internal/subtle"
 )
 // TestXORBytes checks subtle.XORBytes for every length from 1 to 1024
 // (a thinned-out set above 16 in -short mode) and for every combination
 // of offsets 0..7 of the two inputs and the destination. The destination
 // is pre-filled with 0xdd so that writes outside the target range show
 // up in the comparison.
 func TestXORBytes(t *testing.T) {
 	const (
 		maxAlign = 8  // offsets 0..maxAlign-1 are exercised
 		slack    = 10 // spare capacity behind every buffer
 	)
 	for n := 1; n <= 1024; n++ {
 		if testing.Short() && n > 16 {
 			n += n >> 3
 		}
 		for alignP := 0; alignP < maxAlign; alignP++ {
 			for alignQ := 0; alignQ < maxAlign; alignQ++ {
 				for alignD := 0; alignD < maxAlign; alignD++ {
 					p := make([]byte, alignP+n, alignP+n+slack)[alignP:]
 					q := make([]byte, alignQ+n, alignQ+n+slack)[alignQ:]
 					if n&1 != 0 {
 						p = p[:n]
 					} else {
 						q = q[:n]
 					}
 					// Fill p first, then q, with random data.
 					for _, buf := range [][]byte{p, q} {
 						if _, err := io.ReadFull(rand.Reader, buf); err != nil {
 							t.Fatal(err)
 						}
 					}
 					got := make([]byte, alignD+n, alignD+n+slack)
 					for i := 0; i < len(got); i++ {
 						got[i] = 0xdd
 					}
 					expect := make([]byte, len(got), cap(got))
 					copy(expect[:cap(expect)], got[:cap(got)])
 					for i, pb := range p {
 						expect[alignD+i] = pb ^ q[i]
 					}
 					subtle.XORBytes(got[alignD:], p, q)
 					if !bytes.Equal(got, expect) {
 						t.Fatalf("n=%d alignP=%d alignQ=%d alignD=%d:\n\tp = %x\n\tq = %x\n\td = %x\n\twant %x\n", n, alignP, alignQ, alignD, p, q, got, expect)
 					}
 				}
 			}
 		}
 	}
 }
 func TestXorBytesPanic(t *testing.T) {
 	mustPanic(t, "subtle.XORBytes: dst too short", func() {
 		subtle.XORBytes(nil, make([]byte, 1), make([]byte, 1))
 	})
 	mustPanic(t, "subtle.XORBytes: dst too short", func() {
 		subtle.XORBytes(make([]byte, 1), make([]byte, 2), make([]byte, 3))
 	})
 	mustPanic(t, "subtle.XORBytes: invalid overlap", func() {
 		x := make([]byte, 3)
 		subtle.XORBytes(x, x[1:], make([]byte, 2))
 	})
 	mustPanic(t, "subtle.XORBytes: invalid overlap", func() {
 		x := make([]byte, 3)
 		subtle.XORBytes(x, make([]byte, 2), x[1:])
 	})
 }
 // BenchmarkXORBytes measures subtle.XORBytes throughput for a range of
 // input sizes from 8 bytes to 32 KiB, one sub-benchmark per size.
 func BenchmarkXORBytes(b *testing.B) {
 	const maxSize = 1 << 15
 	var (
 		out = make([]byte, maxSize)
 		lhs = make([]byte, maxSize)
 		rhs = make([]byte, maxSize)
 	)
 	for _, size := range []int64{1 << 3, 1 << 4, 1 << 5, 1 << 7, 1 << 11, 1 << 13, 1 << 15} {
 		name := fmt.Sprintf("%dBytes", size)
 		b.Run(name, func(b *testing.B) {
 			x, y := lhs[:size], rhs[:size]
 			b.SetBytes(size)
 			for iter := 0; iter < b.N; iter++ {
 				subtle.XORBytes(out, x, y)
 			}
 		})
 	}
 }
 // BenchmarkXORBytesAlignment measures subtle.XORBytes for three sizes
 // with all three buffers starting at each offset from 0 to 7, one
 // sub-benchmark per size/offset pair.
 func BenchmarkXORBytesAlignment(b *testing.B) {
 	const bufLen = 8 + 1<<11 // largest size plus room for the offset
 	var (
 		out = make([]byte, bufLen)
 		lhs = make([]byte, bufLen)
 		rhs = make([]byte, bufLen)
 	)
 	for _, size := range []int64{1 << 3, 1 << 7, 1 << 11} {
 		for offset := int64(0); offset < 8; offset++ {
 			name := fmt.Sprintf("%dBytes%dOffset", size, offset)
 			b.Run(name, func(b *testing.B) {
 				end := offset + size
 				d, x, y := out[offset:end], lhs[offset:end], rhs[offset:end]
 				b.SetBytes(size)
 				for iter := 0; iter < b.N; iter++ {
 					subtle.XORBytes(d, x, y)
 				}
 			})
 		}
 	}
 }
 // mustPanic runs f and reports a test error unless f panics with a
 // string value equal to expected. Absence of a panic, a different
 // string, and a non-string panic value are each reported separately.
 func mustPanic(t *testing.T, expected string, f func()) {
 	t.Helper()
 	defer func() {
 		// recover must be called directly in the deferred function.
 		r := recover()
 		if r == nil {
 			t.Errorf("expected panic(%q), but did not panic", expected)
 			return
 		}
 		got, isString := r.(string)
 		if !isString {
 			t.Errorf("expected panic(%q), but got panic(%T%v)", expected, r, r)
 			return
 		}
 		if got != expected {
 			t.Errorf("expected panic(%q), but got panic(%q)", expected, got)
 		}
 	}()
 	f()
 }