diff --git a/internal/sm2ec/p256_asm_ord.go b/internal/sm2ec/p256_asm_ord.go index 1d27bcc..c154f27 100644 --- a/internal/sm2ec/p256_asm_ord.go +++ b/internal/sm2ec/p256_asm_ord.go @@ -1,4 +1,4 @@ -//go:build (amd64 || arm64) && !purego +//go:build (amd64 || arm64 || s390x) && !purego package sm2ec diff --git a/internal/sm2ec/p256_asm_ord_test.go b/internal/sm2ec/p256_asm_ord_test.go index 72d0c67..77b216e 100644 --- a/internal/sm2ec/p256_asm_ord_test.go +++ b/internal/sm2ec/p256_asm_ord_test.go @@ -1,4 +1,4 @@ -//go:build (amd64 && !purego) || (arm64 && !purego) +//go:build (amd64 || arm64 || s390x) && !purego package sm2ec diff --git a/internal/sm2ec/p256_asm_s390x.s b/internal/sm2ec/p256_asm_s390x.s index 4848c6a..e11d8bb 100644 --- a/internal/sm2ec/p256_asm_s390x.s +++ b/internal/sm2ec/p256_asm_s390x.s @@ -1109,7 +1109,7 @@ TEXT sm2p256MulInternal<>(SB), NOSPLIT, $0-0 VSBIQ RED2, RED3, CAR1, RED2 // Guaranteed not to underflow VSLDB $12, T1, T0, T0 - VSLDB $12, T2, T1, T1 + VSLDB $12, T2, T1, T1 // T2 Free VACCQ T0, ADD3H, CAR1 VAQ T0, ADD3H, T0 @@ -1187,7 +1187,7 @@ TEXT sm2p256MulInternal<>(SB), NOSPLIT, $0-0 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] VSLDB $12, ADD2, ADD1, T0 // ADD1 Free - VSLDB $12, T2, ADD2, T1 // ADD2 Free + VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free VACCQ T0, ADD3, CAR1 VAQ T0, ADD3, T0 @@ -1233,8 +1233,8 @@ TEXT sm2p256MulInternal<>(SB), NOSPLIT, $0-0 VL 32(CPOOL), SEL1 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] - VSLDB $12, ADD2, ADD1, T0 - VSLDB $12, T2, ADD2, T1 + VSLDB $12, ADD2, ADD1, T0 // ADD1 Free + VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free VACCQ T0, ADD3, CAR1 VAQ T0, ADD3, T0 @@ -1961,6 +1961,106 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 * Y3 = T1-Y3 */ +#define p256PointDoubleRound(P1ptr, P3ptr) \ + \ // X=Z1; Y=Z1; MUL; T- // T1 = Z1² + VL 80(P1ptr), X1 \ // Z1H + VPDI $0x4, X1, X1, X1 \ + VL 64(P1ptr), X0 \ // Z1L + VPDI $0x4, X0, X0, X0 \ + VLR X0, Y0 \ + VLR X1, Y1 \ + CALL sm2p256SqrInternal<>(SB) \ + \ + \ // SUB(X(SB) \ + \ + \ // ADD(T2(SB) \ + VPDI $0x4, T1, T1, TT1 \ + VST TT1, 80(P3ptr) \ + VPDI $0x4, T0, T0, TT0 \ + VST TT0, 64(P3ptr) \ + \ + \ // X- ; Y=X ; MUL; T- // Y3 = Y3² + VLR X0, Y0 \ + VLR X1, Y1 \ + CALL sm2p256SqrInternal<>(SB) \ + \ + \ // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 + VLR T0, X0 \ + VLR T1, X1 \ + VL 16(P1ptr), Y1 \ + VPDI $0x4, Y1, Y1, Y1 \ + VL 0(P1ptr), Y0 \ + VPDI $0x4, Y0, Y0, Y0 \ + CALL sm2p256MulInternal<>(SB) \ + VLR T0, T3L \ + VLR T1, T3H \ + \ + \ // X- ; Y=X ; MUL; T- // Y3 = Y3² + VLR X0, Y0 \ + VLR X1, Y1 \ + CALL sm2p256SqrInternal<>(SB) \ + \ + \ // HAL(Y3(SB) \ + \ + \ // ADD(T1(SB) \ + \ + \ // SUB(Y3(SB) - - // SUB(X(SB) - - // ADD(T2(SB) - VPDI $0x4, T1, T1, TT1 - VST TT1, 80(P3ptr) - VPDI $0x4, T0, T0, TT0 - VST TT0, 64(P3ptr) - - // X- ; Y=X ; MUL; T- // Y3 = Y3² - VLR X0, Y0 - VLR X1, Y1 - CALL sm2p256SqrInternal<>(SB) - - // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 - VLR T0, X0 - VLR T1, X1 - VL 16(P1ptr), Y1 - VPDI $0x4, Y1, Y1, Y1 - VL 0(P1ptr), Y0 - VPDI $0x4, Y0, Y0, Y0 - CALL sm2p256MulInternal<>(SB) - VLR T0, T3L - VLR T1, T3H - - // X- ; Y=X ; MUL; T- // Y3 = Y3² - VLR X0, Y0 - VLR X1, Y1 - CALL sm2p256SqrInternal<>(SB) - - // HAL(Y3(SB) - - // ADD(T1(SB) - - // SUB(Y3+0x00(SB), CPOOL + VL 16(CPOOL), PL + VL 0(CPOOL), PH + + p256PointDoubleRound(P1ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + RET #undef P3ptr diff --git a/internal/sm2ec/p256_asm_table_test.go b/internal/sm2ec/p256_asm_table_test.go index 9ce2247..a1ad62c 100644 --- a/internal/sm2ec/p256_asm_table_test.go +++ b/internal/sm2ec/p256_asm_table_test.go @@ -1,4 +1,4 @@ -//go:build !purego && (amd64 || arm64) +//go:build (amd64 || arm64 || s390x) && !purego package sm2ec diff --git a/internal/sm2ec/sm2p256.go b/internal/sm2ec/sm2p256.go index af2b93e..8c08944 100644 --- a/internal/sm2ec/sm2p256.go +++ b/internal/sm2ec/sm2p256.go @@ -4,7 +4,7 @@ // Code generated by generate.go. DO NOT EDIT. -//go:build purego || !(amd64 || arm64) +//go:build purego || !(amd64 || arm64 || s390x) package sm2ec diff --git a/internal/sm2ec/sm2p256_asm.go b/internal/sm2ec/sm2p256_asm.go index 83c6530..102533d 100644 --- a/internal/sm2ec/sm2p256_asm.go +++ b/internal/sm2ec/sm2p256_asm.go @@ -7,7 +7,7 @@ // 256-bit primes" // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x // https://eprint.iacr.org/2013/816.pdf -//go:build (amd64 || arm64) && !purego +//go:build (amd64 || arm64 || s390x) && !purego package sm2ec diff --git a/internal/sm2ec/sm2p256_asm_s390x.go b/internal/sm2ec/sm2p256_asm_s390x.go deleted file mode 100644 index bfbd4fe..0000000 --- a/internal/sm2ec/sm2p256_asm_s390x.go +++ /dev/null @@ -1,65 +0,0 @@ -//go:build !purego - -package sm2ec - - -// p256Element is a P-256 base field element in [0, P-1] in the Montgomery -// domain (with R 2²⁵⁶) as four limbs in little-endian order value. -type p256Element [4]uint64 - -// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the -// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order. -type p256OrdElement [4]uint64 - -// Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p. -// -//go:noescape -func p256Mul(res, in1, in2 *p256Element) - -// Montgomery square, repeated n times (n >= 1). -// -//go:noescape -func p256Sqr(res, in *p256Element, n int) - -// Montgomery multiplication by R⁻¹, or 1 outside the domain. -// Sets res = in * R⁻¹, bringing res out of the Montgomery domain. -// -//go:noescape -func p256FromMont(res, in *p256Element) - -// If cond is not 0, sets val = -val mod p. -// -//go:noescape -func p256NegCond(val *p256Element, cond int) - -// If cond is 0, sets res = b, otherwise sets res = a. -// -//go:noescape -func p256MovCond(res, a, b *SM2P256Point, cond int) - -//go:noescape -func p256BigToLittle(res *p256Element, in *[32]byte) - -//go:noescape -func p256LittleToBig(res *[32]byte, in *p256Element) - -//go:noescape -func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) - -//go:noescape -func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) - -// p256OrdReduce ensures s is in the range [0, ord(G)-1]. -// -//go:noescape -func p256OrdReduce(s *p256OrdElement) - -// Montgomery multiplication modulo org(G). Sets res = in1 * in2 * R⁻¹. -// -//go:noescape -func p256OrdMul(res, in1, in2 *p256OrdElement) - -// Montgomery square modulo org(G), repeated n times (n >= 1). -// -//go:noescape -func p256OrdSqr(res, in *p256OrdElement, n int) diff --git a/internal/sm2ec/sm2p256_asm_s390x_test.go b/internal/sm2ec/sm2p256_asm_s390x_test.go deleted file mode 100644 index f9513b0..0000000 --- a/internal/sm2ec/sm2p256_asm_s390x_test.go +++ /dev/null @@ -1,304 +0,0 @@ -//go:build s390x && !purego - -package sm2ec - -import ( - "crypto/rand" - "io" - "math/big" - "testing" - "time" -) - -var bigOne = big.NewInt(1) - -// fromBig converts a *big.Int into a format used by this code. -func fromBig(out *[4]uint64, big *big.Int) { - for i := range out { - out[i] = 0 - } - - for i, v := range big.Bits() { - out[i] = uint64(v) - } -} - -func montFromBig(out *[4]uint64, n *big.Int) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) - r := new(big.Int).Lsh(bigOne, 256) - // out = big * R mod P - outBig := new(big.Int).Mul(n, r) - outBig.Mod(outBig, p) - fromBig(out, outBig) -} - -func toBigInt(in *p256Element) *big.Int { - var valBytes [32]byte - p256LittleToBig(&valBytes, in) - return new(big.Int).SetBytes(valBytes[:]) -} - -func ordElmToBigInt(in *p256OrdElement) *big.Int { - var valBytes [32]byte - p256OrdLittleToBig(&valBytes, in) - return new(big.Int).SetBytes(valBytes[:]) -} - -func testP256FromMont(v *big.Int, t *testing.T) { - val := new(p256Element) - montFromBig((*[4]uint64)(val), v) - res := new(p256Element) - p256FromMont(res, val) - if toBigInt(res).Cmp(v) != 0 { - t.Errorf("p256FromMont failed for %x", v.Bytes()) - } -} - -func TestP256FromMont(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) - for i := 0; i < 20; i++ { - bigVal := big.NewInt(int64(i)) - testP256FromMont(bigVal, t) - if i != 0 { - bigVal = new(big.Int).Sub(p, big.NewInt(int64(i))) - testP256FromMont(bigVal, t) - } - } -} - -func testP256OrderReduce(v, expected *big.Int, t *testing.T) { - val := new(p256OrdElement) - fromBig((*[4]uint64)(val), v) - p256OrdReduce(val) - if ordElmToBigInt(val).Cmp(expected) != 0 { - t.Errorf("p256OrdReduce failed for %x, expected %x", v.Bytes(), expected.Bytes()) - } -} - -func TestP256OrderReduce(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) - for i := 0; i < 20; i++ { - bigVal := big.NewInt(int64(i)) - testP256OrderReduce(bigVal, bigVal, t) - bigVal = new(big.Int).Add(p, big.NewInt(int64(i))) - testP256OrderReduce(bigVal, big.NewInt(int64(i)), t) - } - testP256OrderReduce(p, big.NewInt(0), t) - for i := 1; i < 20; i++ { - bigVal := new(big.Int).Sub(p, big.NewInt(int64(i))) - testP256OrderReduce(bigVal, bigVal, t) - } -} - -func p256OrderFromMont(in *p256OrdElement) []byte { - // Montgomery multiplication by R⁻¹, or 1 outside the domain as R⁻¹×R = 1, - // converts a Montgomery value out of the domain. - one := &p256OrdElement{1} - p256OrdMul(in, in, one) - - var xOut [32]byte - p256OrdLittleToBig(&xOut, in) - return xOut[:] -} - -func p256OrdMulTest(t *testing.T, x, y, p, r *big.Int) { - x1 := new(big.Int).Mul(x, r) - x1 = x1.Mod(x1, p) - y1 := new(big.Int).Mul(y, r) - y1 = y1.Mod(y1, p) - ax := new(p256OrdElement) - ay := new(p256OrdElement) - res2 := new(p256OrdElement) - fromBig((*[4]uint64)(ax), x1) - fromBig((*[4]uint64)(ay), y1) - p256OrdMul(res2, ax, ay) - resInt := new(big.Int).SetBytes(p256OrderFromMont(res2)) - - expected := new(big.Int).Mul(x, y) - expected = expected.Mod(expected, p) - if resInt.Cmp(expected) != 0 { - t.FailNow() - } -} - -func TestP256OrdMulOrdMinus1(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) - r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) - pMinus1 := new(big.Int).Sub(p, big.NewInt(1)) - p256OrdMulTest(t, pMinus1, pMinus1, p, r) -} - -func TestFuzzyP256OrdMul(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) - r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) - var scalar1 [32]byte - var scalar2 [32]byte - var timeout *time.Timer - - if testing.Short() { - timeout = time.NewTimer(10 * time.Millisecond) - } else { - timeout = time.NewTimer(2 * time.Second) - } - for { - select { - case <-timeout.C: - return - default: - } - io.ReadFull(rand.Reader, scalar1[:]) - io.ReadFull(rand.Reader, scalar2[:]) - x := new(big.Int).SetBytes(scalar1[:]) - y := new(big.Int).SetBytes(scalar2[:]) - p256OrdMulTest(t, x, y, p, r) - } -} - -func p256OrderSqrTest(t *testing.T, x, p, r *big.Int) { - x1 := new(big.Int).Mul(x, r) - x1 = x1.Mod(x1, p) - ax := new(p256OrdElement) - res2 := new(p256OrdElement) - fromBig((*[4]uint64)(ax), x1) - p256OrdSqr(res2, ax, 1) - resInt := new(big.Int).SetBytes(p256OrderFromMont(res2)) - - expected := new(big.Int).Mul(x, x) - expected = expected.Mod(expected, p) - if resInt.Cmp(expected) != 0 { - t.FailNow() - } -} - -func TestP256OrdSqrOrdMinus1(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) - r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) - pMinus1 := new(big.Int).Sub(p, big.NewInt(1)) - p256OrderSqrTest(t, pMinus1, p, r) -} - -func TestFuzzyP256OrdSqr(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) - r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) - var scalar1 [32]byte - var timeout *time.Timer - - if testing.Short() { - timeout = time.NewTimer(10 * time.Millisecond) - } else { - timeout = time.NewTimer(2 * time.Second) - } - for { - select { - case <-timeout.C: - return - default: - } - io.ReadFull(rand.Reader, scalar1[:]) - x := new(big.Int).SetBytes(scalar1[:]) - p256OrderSqrTest(t, x, p, r) - } -} - -func p256MulTest(t *testing.T, x, y, p, r *big.Int) { - x1 := new(big.Int).Mul(x, r) - x1 = x1.Mod(x1, p) - y1 := new(big.Int).Mul(y, r) - y1 = y1.Mod(y1, p) - ax := new(p256Element) - ay := new(p256Element) - res := new(p256Element) - res2 := new(p256Element) - fromBig((*[4]uint64)(ax), x1) - fromBig((*[4]uint64)(ay), y1) - p256Mul(res2, ax, ay) - p256FromMont(res, res2) - resInt := toBigInt(res) - - expected := new(big.Int).Mul(x, y) - expected = expected.Mod(expected, p) - if resInt.Cmp(expected) != 0 { - t.FailNow() - } -} - -func TestP256MulPMinus1(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) - r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) - pMinus1 := new(big.Int).Sub(p, big.NewInt(1)) - p256MulTest(t, pMinus1, pMinus1, p, r) -} - -func TestFuzzyP256Mul(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) - r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) - var scalar1 [32]byte - var scalar2 [32]byte - var timeout *time.Timer - - if testing.Short() { - timeout = time.NewTimer(10 * time.Millisecond) - } else { - timeout = time.NewTimer(2 * time.Second) - } - for { - select { - case <-timeout.C: - return - default: - } - io.ReadFull(rand.Reader, scalar1[:]) - io.ReadFull(rand.Reader, scalar2[:]) - x := new(big.Int).SetBytes(scalar1[:]) - y := new(big.Int).SetBytes(scalar2[:]) - p256MulTest(t, x, y, p, r) - } -} - -func p256SqrTest(t *testing.T, x, p, r *big.Int) { - x1 := new(big.Int).Mul(x, r) - x1 = x1.Mod(x1, p) - ax := new(p256Element) - res := new(p256Element) - res2 := new(p256Element) - fromBig((*[4]uint64)(ax), x1) - p256Sqr(res2, ax, 1) - p256FromMont(res, res2) - resInt := toBigInt(res) - - expected := new(big.Int).Mul(x, x) - expected = expected.Mod(expected, p) - if resInt.Cmp(expected) != 0 { - t.FailNow() - } -} - -func TestP256SqrPMinus1(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) - r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) - pMinus1 := new(big.Int).Sub(p, big.NewInt(1)) - p256SqrTest(t, pMinus1, p, r) -} - -func TestFuzzyP256Sqr(t *testing.T) { - p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) - r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) - var scalar1 [32]byte - var timeout *time.Timer - - if testing.Short() { - timeout = time.NewTimer(10 * time.Millisecond) - } else { - timeout = time.NewTimer(2 * time.Second) - } - for { - select { - case <-timeout.C: - return - default: - } - io.ReadFull(rand.Reader, scalar1[:]) - x := new(big.Int).SetBytes(scalar1[:]) - p256SqrTest(t, x, p, r) - } -} diff --git a/internal/sm2ec/sm2p256_asm_test.go b/internal/sm2ec/sm2p256_asm_test.go index 59563ca..6554a9e 100644 --- a/internal/sm2ec/sm2p256_asm_test.go +++ b/internal/sm2ec/sm2p256_asm_test.go @@ -1,4 +1,4 @@ -//go:build (amd64 || arm64) && !purego +//go:build (amd64 || arm64 || s390x) && !purego package sm2ec diff --git a/internal/sm2ec/sm2p256_ord.go b/internal/sm2ec/sm2p256_ord.go index 76a917f..31df591 100644 --- a/internal/sm2ec/sm2p256_ord.go +++ b/internal/sm2ec/sm2p256_ord.go @@ -1,4 +1,4 @@ -//go:build purego || !(amd64 || arm64) +//go:build purego || !(amd64 || arm64 || s390x) package sm2ec