From 4de671cd8bd1c4003076b677750b79a755256763 Mon Sep 17 00:00:00 2001 From: Emman Date: Mon, 8 Feb 2021 08:53:07 +0800 Subject: [PATCH] MAGIC - init amd64 optimization --- sm2/p256.go | 28 +- sm2/p256_asm.go | 599 +++++++++++ sm2/p256_asm_amd64.s | 2353 ++++++++++++++++++++++++++++++++++++++++++ sm2/p256_generic.go | 12 + sm2/sm2.go | 13 +- 5 files changed, 2988 insertions(+), 17 deletions(-) create mode 100644 sm2/p256_asm.go create mode 100644 sm2/p256_asm_amd64.s create mode 100644 sm2/p256_generic.go diff --git a/sm2/p256.go b/sm2/p256.go index fb891c1..e4d6743 100644 --- a/sm2/p256.go +++ b/sm2/p256.go @@ -1,10 +1,11 @@ +// +build !amd64 + package sm2 import ( "crypto/elliptic" "fmt" "math/big" - "sync" ) // See https://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background. @@ -14,24 +15,22 @@ type p256Curve struct { } var ( - p256 p256Curve + p256Params *elliptic.CurveParams // RInverse contains 1/R mod p - the inverse of the Montgomery constant // (2**257). p256RInverse *big.Int - - initonce sync.Once ) func initP256() { - p256.CurveParams = &elliptic.CurveParams{Name: "sm2p256v1"} + p256Params = &elliptic.CurveParams{Name: "sm2p256v1"} // 2**256 - 2**224 - 2**96 + 2**64 - 1 - p256.P, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) - p256.N, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) - p256.B, _ = new(big.Int).SetString("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93", 16) - p256.Gx, _ = new(big.Int).SetString("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7", 16) - p256.Gy, _ = new(big.Int).SetString("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0", 16) - p256.BitSize = 256 + p256Params.P, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) + p256Params.N, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) + p256Params.B, _ = new(big.Int).SetString("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93", 16) + p256Params.Gx, _ = new(big.Int).SetString("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7", 16) + p256Params.Gy, _ = new(big.Int).SetString("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0", 16) + p256Params.BitSize = 256 // ModeInverse(2**257, P) // p256RInverse = big.NewInt(0) @@ -39,12 +38,9 @@ func initP256() { // p256RInverse.ModInverse(r, p256.P) // fmt.Printf("%s\n", hex.EncodeToString(p256RInverse.Bytes())) p256RInverse, _ = new(big.Int).SetString("7ffffffd80000002fffffffe000000017ffffffe800000037ffffffc80000002", 16) -} -// P256 init and return the singleton -func P256() elliptic.Curve { - initonce.Do(initP256) - return p256 + // Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation + initP256Arch() } func (curve p256Curve) Params() *elliptic.CurveParams { diff --git a/sm2/p256_asm.go b/sm2/p256_asm.go new file mode 100644 index 0000000..7457d03 --- /dev/null +++ b/sm2/p256_asm.go @@ -0,0 +1,599 @@ +// This file contains the Go wrapper for the constant-time, 64-bit assembly +// implementation of P256. 
The optimizations performed here are described in +// detail in: +// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with +// 256-bit primes" +// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x +// https://eprint.iacr.org/2013/816.pdf + +// +build amd64 + +package sm2 + +import ( + "crypto/elliptic" + "math/big" + "sync" +) + +type ( + p256Curve struct { + *elliptic.CurveParams + } + + p256Point struct { + xyz [12]uint64 + } +) + +var ( + p256 p256Curve + p256Precomputed *[43][32 * 8]uint64 + precomputeOnce sync.Once +) + +func initP256() { + // 2**256 - 2**224 - 2**96 + 2**64 - 1 + p256.CurveParams = &elliptic.CurveParams{Name: "sm2p256v1"} + p256.P, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) + p256.N, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) + p256.B, _ = new(big.Int).SetString("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93", 16) + p256.Gx, _ = new(big.Int).SetString("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7", 16) + p256.Gy, _ = new(big.Int).SetString("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0", 16) + p256.BitSize = 256 +} + +func (curve p256Curve) Params() *elliptic.CurveParams { + return curve.CurveParams +} + +// Functions implemented in p256_asm_*64.s +// Montgomery multiplication modulo P256 +//go:noescape +func p256Mul(res, in1, in2 []uint64) + +// Montgomery square modulo P256, repeated n times (n >= 1) +//go:noescape +func p256Sqr(res, in []uint64, n int) + +// Montgomery multiplication by 1 +//go:noescape +func p256FromMont(res, in []uint64) + +// iff cond == 1 val <- -val +//go:noescape +func p256NegCond(val []uint64, cond int) + +// if cond == 0 res <- b; else res <- a +//go:noescape +func p256MovCond(res, a, b []uint64, cond int) + +// Endianness swap +//go:noescape +func p256BigToLittle(res []uint64, in []byte) + +//go:noescape +func p256LittleToBig(res []byte, in []uint64) + +// Constant time table access +//go:noescape +func p256Select(point, table []uint64, idx int) + +//go:noescape +func p256SelectBase(point, table []uint64, idx int) + +// Montgomery multiplication modulo Ord(G) +//go:noescape +func p256OrdMul(res, in1, in2 []uint64) + +// Montgomery square modulo Ord(G), repeated n times +//go:noescape +func p256OrdSqr(res, in []uint64, n int) + +// Point add with in2 being affine point +// If sign == 1 -> in2 = -in2 +// If sel == 0 -> res = in1 +// if zero == 0 -> res = in2 +//go:noescape +func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) + +// Point add. Returns one if the two input points were equal and zero +// otherwise. (Note that, due to the way that the equations work out, some +// representations of ∞ are considered equal to everything by this function.) +//go:noescape +func p256PointAddAsm(res, in1, in2 []uint64) int + +// Point double +//go:noescape +func p256PointDoubleAsm(res, in []uint64) + +func (curve p256Curve) Inverse(k *big.Int) *big.Int { + if k.Sign() < 0 { + // This should never happen. + k = new(big.Int).Neg(k) + } + + if k.Cmp(p256.N) >= 0 { + // This should never happen. + k = new(big.Int).Mod(k, p256.N) + } + + // table will store precomputed powers of x. 
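+	// The slice names give the exponent in binary: _101 holds x^0b101 =
+	// x^5, _101111 holds x^0b101111 = x^47, and so on, all kept in the
+	// Montgomery domain.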
+	var table [4 * 9]uint64
+	var (
+		_1      = table[4*0 : 4*1]
+		_11     = table[4*1 : 4*2]
+		_101    = table[4*2 : 4*3]
+		_111    = table[4*3 : 4*4]
+		_1111   = table[4*4 : 4*5]
+		_10101  = table[4*5 : 4*6]
+		_101111 = table[4*6 : 4*7]
+		x       = table[4*7 : 4*8]
+		t       = table[4*8 : 4*9]
+	)
+
+	fromBig(x[:], k)
+	// This code operates in the Montgomery domain where R = 2^256 mod n
+	// and n is the order of the scalar field. (See initP256 for the
+	// value.) Elements in the Montgomery domain take the form a×R and
+	// multiplication of x and y in the domain calculates (x × y × R^-1)
+	// mod n. RR is R×R mod n, thus the Montgomery multiplication of x and
+	// RR gives x×R, i.e. converts x into the Montgomery domain.
+	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
+	RR := []uint64{0x901192af7c114f20, 0x3464504ade6fa2fa, 0x620fc84c3affe0d4, 0x1eb5e412a22b3d3b}
+
+	p256OrdMul(_1, x, RR)      // _1
+	p256OrdSqr(x, _1, 1)       // _10
+	p256OrdMul(_11, x, _1)     // _11
+	p256OrdMul(_101, x, _11)   // _101
+	p256OrdMul(_111, x, _101)  // _111
+	p256OrdSqr(x, _101, 1)     // _1010
+	p256OrdMul(_1111, _101, x) // _1111
+
+	p256OrdSqr(t, x, 1)          // _10100
+	p256OrdMul(_10101, t, _1)    // _10101
+	p256OrdSqr(x, _10101, 1)     // _101010
+	p256OrdMul(_101111, _101, x) // _101111
+	p256OrdMul(x, _10101, x)     // _111111 = x6
+	p256OrdSqr(t, x, 2)          // _11111100
+	p256OrdMul(t, t, _11)        // _11111111 = x8
+	p256OrdSqr(x, t, 8)          // _ff00
+	p256OrdMul(x, x, t)          // _ffff = x16
+	p256OrdSqr(t, x, 16)         // _ffff0000
+	p256OrdMul(t, t, x)          // _ffffffff = x32
+
+	p256OrdSqr(x, t, 64)
+	p256OrdMul(x, x, t)
+	p256OrdSqr(x, x, 32)
+	p256OrdMul(x, x, t)
+
+	sqrs := []uint8{
+		6, 5, 4, 5, 5,
+		4, 3, 3, 5, 9,
+		6, 2, 5, 6, 5,
+		4, 5, 5, 3, 10,
+		2, 5, 5, 3, 7, 6}
+	muls := [][]uint64{
+		_101111, _111, _11, _1111, _10101,
+		_101, _101, _101, _111, _101111,
+		_1111, _1, _1, _1111, _111,
+		_111, _111, _101, _11, _101111,
+		_11, _11, _11, _1, _10101, _1111}
+
+	for i, s := range sqrs {
+		p256OrdSqr(x, x, int(s))
+		p256OrdMul(x, x, muls[i])
+	}
+
+	// Multiplying by one in the Montgomery domain converts a Montgomery
+	// value out of the domain.
+	one := []uint64{1, 0, 0, 0}
+	p256OrdMul(x, x, one)
+
+	xOut := make([]byte, 32)
+	p256LittleToBig(xOut, x)
+	return new(big.Int).SetBytes(xOut)
+}
+
+// fromBig converts a *big.Int into a format used by this code.
+func fromBig(out []uint64, big *big.Int) {
+	for i := range out {
+		out[i] = 0
+	}
+
+	for i, v := range big.Bits() {
+		out[i] = uint64(v)
+	}
+}
+
+// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
+// to out. If the scalar is equal to or greater than the order of the group,
+// it's reduced modulo that order.
+func p256GetScalar(out []uint64, in []byte) {
+	n := new(big.Int).SetBytes(in)
+
+	if n.Cmp(p256.N) >= 0 {
+		n.Mod(n, p256.N)
+	}
+	fromBig(out, n)
+}
+
+// p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
+// prime of the curve's underlying field. (See initP256 for the value.) Thus rr
+// here is R×R mod p. See comment in Inverse about how this is used.
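+// Concretely rr = 2^512 mod p, so one Montgomery multiplication by rr maps x
+// to x × rr × R^-1 = x × R mod p, i.e. into the Montgomery domain.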
+var rr = []uint64{0x0000000000000003, 0x00000002ffffffff, 0x0000000100000001, 0x0000000400000002}
+
+func maybeReduceModP(in *big.Int) *big.Int {
+	if in.Cmp(p256.P) < 0 {
+		return in
+	}
+	return new(big.Int).Mod(in, p256.P)
+}
+
+func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	var r1, r2 p256Point
+	p256GetScalar(scalarReversed, baseScalar)
+	r1IsInfinity := scalarIsZero(scalarReversed)
+	r1.p256BaseMult(scalarReversed)
+
+	p256GetScalar(scalarReversed, scalar)
+	r2IsInfinity := scalarIsZero(scalarReversed)
+	fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
+	fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
+	p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
+	p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])
+
+	// This sets r2's Z value to 1, in the Montgomery domain.
+	r2.xyz[8] = 0x0000000000000001
+	r2.xyz[9] = 0x00000000ffffffff
+	r2.xyz[10] = 0x0000000000000000
+	r2.xyz[11] = 0x0000000100000000
+
+	r2.p256ScalarMult(scalarReversed)
+
+	var sum, double p256Point
+	pointsEqual := p256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
+	p256PointDoubleAsm(double.xyz[:], r1.xyz[:])
+	sum.CopyConditional(&double, pointsEqual)
+	sum.CopyConditional(&r1, r2IsInfinity)
+	sum.CopyConditional(&r2, r1IsInfinity)
+
+	return sum.p256PointToAffine()
+}
+
+func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	p256GetScalar(scalarReversed, scalar)
+
+	var r p256Point
+	r.p256BaseMult(scalarReversed)
+	return r.p256PointToAffine()
+}
+
+func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	p256GetScalar(scalarReversed, scalar)
+
+	var r p256Point
+	fromBig(r.xyz[0:4], maybeReduceModP(bigX))
+	fromBig(r.xyz[4:8], maybeReduceModP(bigY))
+	p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
+	p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
+	// This sets r's Z value to 1, in the Montgomery domain.
+	r.xyz[8] = 0x0000000000000001
+	r.xyz[9] = 0x00000000ffffffff
+	r.xyz[10] = 0x0000000000000000
+	r.xyz[11] = 0x0000000100000000
+
+	r.p256ScalarMult(scalarReversed)
+	return r.p256PointToAffine()
+}
+
+// uint64IsZero returns 1 if x is zero and zero otherwise.
+func uint64IsZero(x uint64) int {
+	x = ^x
+	x &= x >> 32
+	x &= x >> 16
+	x &= x >> 8
+	x &= x >> 4
+	x &= x >> 2
+	x &= x >> 1
+	return int(x & 1)
+}
+
+// scalarIsZero returns 1 if scalar represents the zero value, and zero
+// otherwise.
+func scalarIsZero(scalar []uint64) int {
+	return uint64IsZero(scalar[0] | scalar[1] | scalar[2] | scalar[3])
+}
+
+func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
+	zInv := make([]uint64, 4)
+	zInvSq := make([]uint64, 4)
+	p256Inverse(zInv, p.xyz[8:12])
+	p256Sqr(zInvSq, zInv, 1)
+	p256Mul(zInv, zInv, zInvSq)
+
+	p256Mul(zInvSq, p.xyz[0:4], zInvSq)
+	p256Mul(zInv, p.xyz[4:8], zInv)
+
+	p256FromMont(zInvSq, zInvSq)
+	p256FromMont(zInv, zInv)
+
+	xOut := make([]byte, 32)
+	yOut := make([]byte, 32)
+	p256LittleToBig(xOut, zInvSq)
+	p256LittleToBig(yOut, zInv)
+
+	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
+}
+
+// CopyConditional overwrites p with src if v == 1, and leaves p unchanged if
+// v == 0.
+func (p *p256Point) CopyConditional(src *p256Point, v int) {
+	pMask := uint64(v) - 1
+	srcMask := ^pMask
+
+	for i, n := range p.xyz {
+		p.xyz[i] = (n & pMask) | (src.xyz[i] & srcMask)
+	}
+}
+
+// p256Inverse sets out to in^-1 mod p.
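+// By Fermat's little theorem in^-1 = in^(p-2) mod p, and the fixed chain of
+// squarings and multiplications below evaluates that power in constant time.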
+func p256Inverse(out, in []uint64) {
+	var stack [9 * 4]uint64
+	p2 := stack[4*0 : 4*0+4]
+	p4 := stack[4*1 : 4*1+4]
+	p8 := stack[4*2 : 4*2+4]
+	p16 := stack[4*3 : 4*3+4]
+	p32 := stack[4*4 : 4*4+4]
+	p64 := stack[4*5 : 4*5+4]
+	p32m2 := stack[4*6 : 4*6+4]
+	ptmp := stack[4*7 : 4*7+4]
+
+	p256Sqr(out, in, 1)  // 2^1
+	p256Mul(p2, out, in) // 2^2 - 2^0
+
+	p256Sqr(out, p2, 2)    // 2^4 - 2^2
+	p256Mul(ptmp, out, in) // 2^4 - 2^1
+	p256Mul(p4, out, p2)   // 2^4 - 2^0
+
+	p256Sqr(out, p4, 4)      // 2^8 - 2^4
+	p256Mul(ptmp, out, ptmp) // 2^8 - 2^1
+	p256Mul(p8, out, p4)     // 2^8 - 2^0
+
+	p256Sqr(out, p8, 8)      // 2^16 - 2^8
+	p256Mul(ptmp, out, ptmp) // 2^16 - 2^1
+	p256Mul(p16, out, p8)    // 2^16 - 2^0
+
+	p256Sqr(out, p16, 16)     // 2^32 - 2^16
+	p256Mul(p32m2, out, ptmp) // 2^32 - 2^1
+	p256Mul(p32, out, p16)    // 2^32 - 2^0
+
+	p256Sqr(out, p32, 32)  // 2^64 - 2^32
+	p256Mul(p64, out, p32) // 2^64 - 2^0
+
+	p256Sqr(out, p64, 64)    // 2^128 - 2^64
+	p256Mul(out, out, p64)   // 2^128 - 2^0
+	p256Sqr(out, out, 32)    // 2^160 - 2^32
+	p256Mul(out, out, p32m2) // 2^160 - 2^1
+	p256Sqr(ptmp, out, 95)   // 2^255 - 2^96
+
+	p256Sqr(out, p32m2, 223) // 2^255 - 2^224
+	p256Mul(ptmp, ptmp, out) // 2^256 - 2^224 - 2^96
+
+	p256Sqr(out, p32, 16)  // 2^48 - 2^16
+	p256Mul(out, out, p16) // 2^48 - 2^0
+
+	p256Sqr(out, out, 8)  // 2^56 - 2^8
+	p256Mul(out, out, p8) // 2^56 - 2^0
+
+	p256Sqr(out, out, 4)  // 2^60 - 2^4
+	p256Mul(out, out, p4) // 2^60 - 2^0
+
+	p256Sqr(out, out, 2)  // 2^62 - 2^2
+	p256Mul(out, out, p2) // 2^62 - 2^0
+
+	p256Sqr(out, out, 2)  // 2^64 - 2^2
+	p256Mul(out, out, in) // 2^64 - 2^2 + 2^0
+
+	p256Mul(out, out, ptmp) // 2^256 - 2^224 - 2^96 + 2^64 - 3
+}
+
+func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) {
+	copy(r[index*12:], p.xyz[:])
+}
+
+func boothW5(in uint) (int, int) {
+	var s uint = ^((in >> 5) - 1)
+	var d uint = (1 << 6) - in - 1
+	d = (d & s) | (in & (^s))
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+func boothW6(in uint) (int, int) {
+	var s uint = ^((in >> 6) - 1)
+	var d uint = (1 << 7) - in - 1
+	d = (d & s) | (in & (^s))
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+func initTable() {
+	p256Precomputed = new([43][32 * 8]uint64)
+
+	basePoint := []uint64{
+		0x61328990f418029e, 0x3e7981eddca6c050, 0xd6a1ed99ac24c3c3, 0x91167a5ee1c13b05,
+		0xc1354e593c2d0ddd, 0xc1f5e5788d3295fa, 0x8d4cfb066e2a48f8, 0x63cd65d481d735bd,
+		0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000,
+	}
+	t1 := make([]uint64, 12)
+	t2 := make([]uint64, 12)
+	copy(t2, basePoint)
+
+	zInv := make([]uint64, 4)
+	zInvSq := make([]uint64, 4)
+	for j := 0; j < 32; j++ {
+		copy(t1, t2)
+		for i := 0; i < 43; i++ {
+			// The window size is 6 so we need to double 6 times.
+			if i != 0 {
+				for k := 0; k < 6; k++ {
+					p256PointDoubleAsm(t1, t1)
+				}
+			}
+			// Convert the point to affine form. (Its values are
+			// still in Montgomery form however.)
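+			// With Jacobian coordinates, affine x = X/Z^2 and
+			// affine y = Y/Z^3.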
+ p256Inverse(zInv, t1[8:12]) + p256Sqr(zInvSq, zInv, 1) + p256Mul(zInv, zInv, zInvSq) + + p256Mul(t1[:4], t1[:4], zInvSq) + p256Mul(t1[4:8], t1[4:8], zInv) + + copy(t1[8:12], basePoint[8:12]) + // Update the table entry + copy(p256Precomputed[i][j*8:], t1[:8]) + } + if j == 0 { + p256PointDoubleAsm(t2, basePoint) + } else { + p256PointAddAsm(t2, t2, basePoint) + } + } +} + +func (p *p256Point) p256BaseMult(scalar []uint64) { + precomputeOnce.Do(initTable) + + wvalue := (scalar[0] << 1) & 0x7f + sel, sign := boothW6(uint(wvalue)) + p256SelectBase(p.xyz[0:8], p256Precomputed[0][0:], sel) + p256NegCond(p.xyz[4:8], sign) + + // (This is one, in the Montgomery domain.) + p.xyz[8] = 0x0000000000000001 + p.xyz[9] = 0x00000000ffffffff + p.xyz[10] = 0x0000000000000000 + p.xyz[11] = 0x0000000100000000 + + var t0 p256Point + // (This is one, in the Montgomery domain.) + t0.xyz[8] = 0x0000000000000001 + t0.xyz[9] = 0x00000000ffffffff + t0.xyz[10] = 0x0000000000000000 + t0.xyz[11] = 0x0000000100000000 + + index := uint(5) + zero := sel + + for i := 1; i < 43; i++ { + if index < 192 { + wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f + } else { + wvalue = (scalar[index/64] >> (index % 64)) & 0x7f + } + index += 6 + sel, sign = boothW6(uint(wvalue)) + p256SelectBase(t0.xyz[0:8], p256Precomputed[i][0:], sel) + p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero) + zero |= sel + } +} + +func (p *p256Point) p256ScalarMult(scalar []uint64) { + // precomp is a table of precomputed points that stores powers of p + // from p^1 to p^16. + var precomp [16 * 4 * 3]uint64 + var t0, t1, t2, t3 p256Point + + // Prepare the table + p.p256StorePoint(&precomp, 0) // 1 + + p256PointDoubleAsm(t0.xyz[:], p.xyz[:]) + p256PointDoubleAsm(t1.xyz[:], t0.xyz[:]) + p256PointDoubleAsm(t2.xyz[:], t1.xyz[:]) + p256PointDoubleAsm(t3.xyz[:], t2.xyz[:]) + t0.p256StorePoint(&precomp, 1) // 2 + t1.p256StorePoint(&precomp, 3) // 4 + t2.p256StorePoint(&precomp, 7) // 8 + t3.p256StorePoint(&precomp, 15) // 16 + + p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:]) + p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:]) + p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:]) + t0.p256StorePoint(&precomp, 2) // 3 + t1.p256StorePoint(&precomp, 4) // 5 + t2.p256StorePoint(&precomp, 8) // 9 + + p256PointDoubleAsm(t0.xyz[:], t0.xyz[:]) + p256PointDoubleAsm(t1.xyz[:], t1.xyz[:]) + t0.p256StorePoint(&precomp, 5) // 6 + t1.p256StorePoint(&precomp, 9) // 10 + + p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:]) + p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:]) + t2.p256StorePoint(&precomp, 6) // 7 + t1.p256StorePoint(&precomp, 10) // 11 + + p256PointDoubleAsm(t0.xyz[:], t0.xyz[:]) + p256PointDoubleAsm(t2.xyz[:], t2.xyz[:]) + t0.p256StorePoint(&precomp, 11) // 12 + t2.p256StorePoint(&precomp, 13) // 14 + + p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:]) + p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:]) + t0.p256StorePoint(&precomp, 12) // 13 + t2.p256StorePoint(&precomp, 14) // 15 + + // Start scanning the window from top bit + index := uint(254) + var sel, sign int + + wvalue := (scalar[index/64] >> (index % 64)) & 0x3f + sel, _ = boothW5(uint(wvalue)) + + p256Select(p.xyz[0:12], precomp[0:], sel) + zero := sel + + for index > 4 { + index -= 5 + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + + if index < 192 { + wvalue = 
((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f + } else { + wvalue = (scalar[index/64] >> (index % 64)) & 0x3f + } + + sel, sign = boothW5(uint(wvalue)) + + p256Select(t0.xyz[0:], precomp[0:], sel) + p256NegCond(t0.xyz[4:8], sign) + p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:]) + p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel) + p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero) + zero |= sel + } + + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + + wvalue = (scalar[0] << 1) & 0x3f + sel, sign = boothW5(uint(wvalue)) + + p256Select(t0.xyz[0:], precomp[0:], sel) + p256NegCond(t0.xyz[4:8], sign) + p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:]) + p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel) + p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero) +} diff --git a/sm2/p256_asm_amd64.s b/sm2/p256_asm_amd64.s new file mode 100644 index 0000000..c76eda2 --- /dev/null +++ b/sm2/p256_asm_amd64.s @@ -0,0 +1,2353 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains constant-time, 64-bit assembly implementation of +// P256. The optimizations performed here are described in detail in: +// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with +// 256-bit primes" +// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x +// https://eprint.iacr.org/2013/816.pdf + +#include "textflag.h" + +#define res_ptr DI +#define x_ptr SI +#define y_ptr CX + +#define acc0 R8 +#define acc1 R9 +#define acc2 R10 +#define acc3 R11 +#define acc4 R12 +#define acc5 R13 +#define t0 R14 +#define t1 R15 + +DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff +DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 +DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff +DATA p256p<>+0x08(SB)/8, $0xffffffff00000000 +DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff +DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff +DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975 +DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123 +DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b +DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff +DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff +DATA p256one<>+0x00(SB)/8, $0x0000000000000001 +DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff +DATA p256one<>+0x10(SB)/8, $0x0000000000000000 +DATA p256one<>+0x18(SB)/8, $0x0000000100000000 +GLOBL p256const0<>(SB), 8, $8 +GLOBL p256const1<>(SB), 8, $8 +GLOBL p256p<>(SB), RODATA, $32 +GLOBL p256ordK0<>(SB), RODATA, $8 +GLOBL p256ord<>(SB), RODATA, $32 +GLOBL p256one<>(SB), RODATA, $32 + +/* ---------------------------------------*/ +// func p256LittleToBig(res []byte, in []uint64) +TEXT ·p256LittleToBig(SB),NOSPLIT,$0 + JMP ·p256BigToLittle(SB) +/* ---------------------------------------*/ +// func p256BigToLittle(res []uint64, in []byte) +TEXT ·p256BigToLittle(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + + BSWAPQ acc0 + BSWAPQ acc1 + BSWAPQ acc2 + BSWAPQ acc3 + + MOVQ acc3, (8*0)(res_ptr) + MOVQ acc2, (8*1)(res_ptr) + MOVQ acc1, (8*2)(res_ptr) + MOVQ acc0, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256MovCond(res, a, b []uint64, cond int) +// If cond 
== 0 res=b, else res=a +TEXT ·p256MovCond(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ a+24(FP), x_ptr + MOVQ b+48(FP), y_ptr + MOVQ cond+72(FP), X12 + + PXOR X13, X13 + PSHUFD $0, X12, X12 + PCMPEQL X13, X12 + + MOVOU X12, X0 + MOVOU (16*0)(x_ptr), X6 + PANDN X6, X0 + MOVOU X12, X1 + MOVOU (16*1)(x_ptr), X7 + PANDN X7, X1 + MOVOU X12, X2 + MOVOU (16*2)(x_ptr), X8 + PANDN X8, X2 + MOVOU X12, X3 + MOVOU (16*3)(x_ptr), X9 + PANDN X9, X3 + MOVOU X12, X4 + MOVOU (16*4)(x_ptr), X10 + PANDN X10, X4 + MOVOU X12, X5 + MOVOU (16*5)(x_ptr), X11 + PANDN X11, X5 + + MOVOU (16*0)(y_ptr), X6 + MOVOU (16*1)(y_ptr), X7 + MOVOU (16*2)(y_ptr), X8 + MOVOU (16*3)(y_ptr), X9 + MOVOU (16*4)(y_ptr), X10 + MOVOU (16*5)(y_ptr), X11 + + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + + MOVOU X0, (16*0)(res_ptr) + MOVOU X1, (16*1)(res_ptr) + MOVOU X2, (16*2)(res_ptr) + MOVOU X3, (16*3)(res_ptr) + MOVOU X4, (16*4)(res_ptr) + MOVOU X5, (16*5)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256NegCond(val []uint64, cond int) +TEXT ·p256NegCond(SB),NOSPLIT,$0 + MOVQ val+0(FP), res_ptr + MOVQ cond+24(FP), t0 + // acc = poly + MOVQ $-1, acc0 + MOVQ p256const0<>(SB), acc1 + MOVQ $0, acc2 + MOVQ p256const1<>(SB), acc3 + // Load the original value + MOVQ (8*0)(res_ptr), acc5 + MOVQ (8*1)(res_ptr), x_ptr + MOVQ (8*2)(res_ptr), y_ptr + MOVQ (8*3)(res_ptr), t1 + // Speculatively subtract + SUBQ acc5, acc0 + SBBQ x_ptr, acc1 + SBBQ y_ptr, acc2 + SBBQ t1, acc3 + // If condition is 0, keep original value + TESTQ t0, t0 + CMOVQEQ acc5, acc0 + CMOVQEQ x_ptr, acc1 + CMOVQEQ y_ptr, acc2 + CMOVQEQ t1, acc3 + // Store result + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256Sqr(res, in []uint64, n int) +TEXT ·p256Sqr(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + MOVQ n+48(FP), BX + +sqrLoop: + + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ t1, t1 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, t1 + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, t1 + MOVQ t1, x_ptr + // First reduction step + MOVQ acc0, AX + MOVQ acc0, t1 + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc0, acc1 + ADCQ t1, acc2 + ADCQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc0 + // Second reduction step + MOVQ acc1, AX + MOVQ acc1, t1 + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, t1 + 
ADDQ acc1, acc2 + ADCQ t1, acc3 + ADCQ AX, acc0 + ADCQ $0, DX + MOVQ DX, acc1 + // Third reduction step + MOVQ acc2, AX + MOVQ acc2, t1 + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc2, acc3 + ADCQ t1, acc0 + ADCQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + // Last reduction step + XORQ t0, t0 + MOVQ acc3, AX + MOVQ acc3, t1 + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc3, acc0 + ADCQ t1, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + // Add bits [511:256] of the sqr result + ADCQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + MOVQ acc0, acc4 + MOVQ acc1, acc5 + MOVQ acc2, y_ptr + MOVQ acc3, t1 + // Subtract p256 + SUBQ $-1, acc0 + SBBQ p256const0<>(SB) ,acc1 + SBBQ $0, acc2 + SBBQ p256const1<>(SB), acc3 + SBBQ $0, t0 + + CMOVQCS acc4, acc0 + CMOVQCS acc5, acc1 + CMOVQCS y_ptr, acc2 + CMOVQCS t1, acc3 + + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + MOVQ res_ptr, x_ptr + DECQ BX + JNE sqrLoop + + RET +/* ---------------------------------------*/ +// func p256Mul(res, in1, in2 []uint64) +TEXT ·p256Mul(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in1+24(FP), x_ptr + MOVQ in2+48(FP), y_ptr + // x * y[0] + MOVQ (8*0)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MOVQ acc0, t1 + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc0, acc1 + ADCQ t1, acc2 + ADCQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, acc5 + XORQ acc0, acc0 + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MOVQ acc1, t1 + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc1, acc2 + ADCQ t1, acc3 + ADCQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + XORQ acc1, acc1 + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MOVQ acc2, t1 + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc2, acc3 + ADCQ t1, acc4 + ADCQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + XORQ acc2, acc2 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ 
t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MOVQ acc3, t1 + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc3, acc4 + ADCQ t1, acc5 + ADCQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Copy result [255:0] + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256const0<>(SB) ,acc5 + SBBQ $0, acc0 + SBBQ p256const1<>(SB), acc1 + SBBQ $0, acc2 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256FromMont(res, in []uint64) +TEXT ·p256FromMont(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + XORQ acc4, acc4 + + // Only reduce, no multiplications are needed + // First stage + MOVQ acc0, AX + MOVQ acc0, t1 + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc0, acc1 + ADCQ t1, acc2 + ADCQ AX, acc3 + ADCQ DX, acc4 + XORQ acc5, acc5 + // Second stage + MOVQ acc1, AX + MOVQ acc1, t1 + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc1, acc2 + ADCQ t1, acc3 + ADCQ AX, acc4 + ADCQ DX, acc5 + XORQ acc0, acc0 + // Third stage + MOVQ acc2, AX + MOVQ acc2, t1 + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc2, acc3 + ADCQ t1, acc4 + ADCQ AX, acc5 + ADCQ DX, acc0 + XORQ acc1, acc1 + // Last stage + MOVQ acc3, AX + MOVQ acc3, t1 + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc3, acc4 + ADCQ t1, acc5 + ADCQ AX, acc0 + ADCQ DX, acc1 + + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + + SUBQ $-1, acc4 + SBBQ p256const0<>(SB), acc5 + SBBQ $0, acc0 + SBBQ p256const1<>(SB), acc1 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// Constant time point access to arbitrary point table. +// Indexed from 1 to 15, with -1 offset +// (index 0 is implicitly point at infinity) +// func p256Select(point, table []uint64, idx int) +TEXT ·p256Select(SB),NOSPLIT,$0 + MOVQ idx+48(FP),AX + MOVQ table+24(FP),DI + MOVQ point+0(FP),DX + + PXOR X15, X15 // X15 = 0 + PCMPEQL X14, X14 // X14 = -1 + PSUBL X14, X15 // X15 = 1 + MOVL AX, X14 + PSHUFD $0, X14, X14 + + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + PXOR X4, X4 + PXOR X5, X5 + MOVQ $16, AX + + MOVOU X15, X13 + +loop_select: + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + MOVOU (16*0)(DI), X6 + MOVOU (16*1)(DI), X7 + MOVOU (16*2)(DI), X8 + MOVOU (16*3)(DI), X9 + MOVOU (16*4)(DI), X10 + MOVOU (16*5)(DI), X11 + ADDQ $(16*6), DI + + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + + DECQ AX + JNE loop_select + + MOVOU X0, (16*0)(DX) + MOVOU X1, (16*1)(DX) + MOVOU X2, (16*2)(DX) + MOVOU X3, (16*3)(DX) + MOVOU X4, (16*4)(DX) + MOVOU X5, (16*5)(DX) + + RET +/* ---------------------------------------*/ +// Constant time point access to base point table. 
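+// The table holds 32 affine points of 8 words (x, y) each; all 32 entries are
+// scanned and masked so the access pattern does not depend on idx.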
+// func p256SelectBase(point, table []uint64, idx int) +TEXT ·p256SelectBase(SB),NOSPLIT,$0 + MOVQ idx+48(FP),AX + MOVQ table+24(FP),DI + MOVQ point+0(FP),DX + + PXOR X15, X15 // X15 = 0 + PCMPEQL X14, X14 // X14 = -1 + PSUBL X14, X15 // X15 = 1 + MOVL AX, X14 + PSHUFD $0, X14, X14 + + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + MOVQ $16, AX + + MOVOU X15, X13 + +loop_select_base: + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + MOVOU (16*0)(DI), X4 + MOVOU (16*1)(DI), X5 + MOVOU (16*2)(DI), X6 + MOVOU (16*3)(DI), X7 + + MOVOU (16*4)(DI), X8 + MOVOU (16*5)(DI), X9 + MOVOU (16*6)(DI), X10 + MOVOU (16*7)(DI), X11 + + ADDQ $(16*8), DI + + PAND X12, X4 + PAND X12, X5 + PAND X12, X6 + PAND X12, X7 + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + + PXOR X8, X0 + PXOR X9, X1 + PXOR X10, X2 + PXOR X11, X3 + + DECQ AX + JNE loop_select_base + + MOVOU X0, (16*0)(DX) + MOVOU X1, (16*1)(DX) + MOVOU X2, (16*2)(DX) + MOVOU X3, (16*3)(DX) + + RET +/* ---------------------------------------*/ +// func p256OrdMul(res, in1, in2 []uint64) +TEXT ·p256OrdMul(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in1+24(FP), x_ptr + MOVQ in2+48(FP), y_ptr + // x * y[0] + MOVQ (8*0)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, acc5 + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ 
$0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Copy result [255:0] + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + // Subtract p256 + SUBQ p256ord<>+0x00(SB), acc4 + SBBQ p256ord<>+0x08(SB) ,acc5 + SBBQ p256ord<>+0x10(SB), acc0 + SBBQ p256ord<>+0x18(SB), acc1 + SBBQ $0, acc2 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256OrdSqr(res, in []uint64, n int) +TEXT ·p256OrdSqr(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + MOVQ n+48(FP), BX + +ordSqrLoop: + + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ t1, t1 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, t1 + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, t1 + MOVQ t1, x_ptr + // First reduction step + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ 
p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + + MOVQ t0, t1 + ADCQ DX, acc2 + ADCQ $0, t1 + SUBQ t0, acc2 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc0 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc3 + ADCQ $0, acc0 + SUBQ AX, acc3 + SBBQ DX, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + + MOVQ t0, t1 + ADCQ DX, acc3 + ADCQ $0, t1 + SUBQ t0, acc3 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc1 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc0 + ADCQ $0, acc1 + SUBQ AX, acc0 + SBBQ DX, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + + MOVQ t0, t1 + ADCQ DX, acc0 + ADCQ $0, t1 + SUBQ t0, acc0 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc2 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc1 + ADCQ $0, acc2 + SUBQ AX, acc1 + SBBQ DX, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ t0, t1 + ADCQ DX, acc1 + ADCQ $0, t1 + SUBQ t0, acc1 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc3 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc2 + ADCQ $0, acc3 + SUBQ AX, acc2 + SBBQ DX, acc3 + XORQ t0, t0 + // Add bits [511:256] of the sqr result + ADCQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + MOVQ acc0, acc4 + MOVQ acc1, acc5 + MOVQ acc2, y_ptr + MOVQ acc3, t1 + // Subtract p256 + SUBQ p256ord<>+0x00(SB), acc0 + SBBQ p256ord<>+0x08(SB) ,acc1 + SBBQ p256ord<>+0x10(SB), acc2 + SBBQ p256ord<>+0x18(SB), acc3 + SBBQ $0, t0 + + CMOVQCS acc4, acc0 + CMOVQCS acc5, acc1 + CMOVQCS y_ptr, acc2 + CMOVQCS t1, acc3 + + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + MOVQ res_ptr, x_ptr + DECQ BX + JNE ordSqrLoop + + RET +/* ---------------------------------------*/ +#undef res_ptr +#undef x_ptr +#undef y_ptr + +#undef acc0 +#undef acc1 +#undef acc2 +#undef acc3 +#undef acc4 +#undef acc5 +#undef t0 +#undef t1 +/* ---------------------------------------*/ +#define mul0 AX +#define mul1 DX +#define acc0 BX +#define acc1 CX +#define acc2 R8 +#define acc3 R9 +#define acc4 R10 +#define acc5 R11 +#define acc6 R12 +#define acc7 R13 +#define t0 R14 +#define t1 R15 +#define t2 DI +#define t3 SI +#define hlp BP +/* ---------------------------------------*/ +TEXT sm2P256SubInternal(SB),NOSPLIT,$0 + XORQ mul0, mul0 + SUBQ t0, acc4 + SBBQ t1, acc5 + SBBQ t2, acc6 + SBBQ t3, acc7 + SBBQ $0, mul0 + + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + + ADDQ $-1, acc4 + ADCQ p256const0<>(SB), acc5 + ADCQ $0, acc6 + ADCQ p256const1<>(SB), acc7 + ANDQ $1, mul0 + + CMOVQEQ acc0, acc4 + CMOVQEQ acc1, acc5 + CMOVQEQ acc2, acc6 + CMOVQEQ acc3, acc7 + + RET +/* ---------------------------------------*/ +TEXT sm2P256MulInternal(SB),NOSPLIT,$0 + MOVQ acc4, mul0 + MULQ t0 + MOVQ mul0, acc0 + MOVQ mul1, acc1 + + MOVQ acc4, mul0 + MULQ t1 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, 
acc2 + + MOVQ acc4, mul0 + MULQ t2 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ t3 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc4 + + MOVQ acc5, mul0 + MULQ t0 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t1 + ADDQ hlp, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t2 + ADDQ hlp, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t3 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ acc6, mul0 + MULQ t0 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t1 + ADDQ hlp, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t2 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t3 + ADDQ hlp, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, acc6 + + MOVQ acc7, mul0 + MULQ t0 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t1 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t2 + ADDQ hlp, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t3 + ADDQ hlp, acc6 + ADCQ $0, mul1 + ADDQ mul0, acc6 + ADCQ $0, mul1 + MOVQ mul1, acc7 + // First reduction step + MOVQ acc0, mul0 + MOVQ acc0, hlp + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc0, acc1 + ADCQ hlp, acc2 + ADCQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + // Second reduction step + MOVQ acc1, mul0 + MOVQ acc1, hlp + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc1, acc2 + ADCQ hlp, acc3 + ADCQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, acc1 + // Third reduction step + MOVQ acc2, mul0 + MOVQ acc2, hlp + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc2, acc3 + ADCQ hlp, acc0 + ADCQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc2 + // Last reduction step + MOVQ acc3, mul0 + MOVQ acc3, hlp + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc3, acc0 + ADCQ hlp, acc1 + ADCQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + MOVQ $0, BP + // Add bits [511:256] of the result + ADCQ acc0, acc4 + ADCQ acc1, acc5 + ADCQ acc2, acc6 + ADCQ acc3, acc7 + ADCQ $0, hlp + // Copy result + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256const0<>(SB) ,acc5 + SBBQ $0, acc6 + SBBQ p256const1<>(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS acc0, acc4 + CMOVQCS acc1, acc5 + CMOVQCS acc2, acc6 + CMOVQCS acc3, acc7 + + RET +/* ---------------------------------------*/ +TEXT sm2P256SqrInternal(SB),NOSPLIT,$0 + + MOVQ acc4, mul0 + MULQ acc5 + MOVQ mul0, acc1 + MOVQ mul1, acc2 + + MOVQ acc4, mul0 + MULQ acc6 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ acc7 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ acc5, mul0 + MULQ acc6 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ acc7 + ADDQ hlp, t0 + ADCQ $0, mul1 + ADDQ mul0, t0 + ADCQ $0, mul1 + MOVQ mul1, t1 + + MOVQ acc6, mul0 + MULQ acc7 + ADDQ mul0, t1 + ADCQ $0, mul1 + MOVQ mul1, t2 + XORQ t3, t3 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ t0, t0 + ADCQ t1, t1 + ADCQ t2, t2 + ADCQ $0, t3 + // Missing products + MOVQ acc4, mul0 + 
MULQ mul0 + MOVQ mul0, acc0 + MOVQ DX, acc4 + + MOVQ acc5, mul0 + MULQ mul0 + ADDQ acc4, acc1 + ADCQ mul0, acc2 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc6, mul0 + MULQ mul0 + ADDQ acc4, acc3 + ADCQ mul0, t0 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc7, mul0 + MULQ mul0 + ADDQ acc4, t1 + ADCQ mul0, t2 + ADCQ DX, t3 + // First reduction step + MOVQ acc0, mul0 + MOVQ acc0, hlp + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc0, acc1 + ADCQ hlp, acc2 + ADCQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + // Second reduction step + MOVQ acc1, mul0 + MOVQ acc1, hlp + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc1, acc2 + ADCQ hlp, acc3 + ADCQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, acc1 + // Third reduction step + MOVQ acc2, mul0 + MOVQ acc2, hlp + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc2, acc3 + ADCQ hlp, acc0 + ADCQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc2 + // Last reduction step + MOVQ acc3, mul0 + MOVQ acc3, hlp + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc3, acc0 + ADCQ hlp, acc1 + ADCQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + MOVQ $0, BP + // Add bits [511:256] of the result + ADCQ acc0, t0 + ADCQ acc1, t1 + ADCQ acc2, t2 + ADCQ acc3, t3 + ADCQ $0, hlp + // Copy result + MOVQ t0, acc4 + MOVQ t1, acc5 + MOVQ t2, acc6 + MOVQ t3, acc7 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256const0<>(SB) ,acc5 + SBBQ $0, acc6 + SBBQ p256const1<>(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS t0, acc4 + CMOVQCS t1, acc5 + CMOVQCS t2, acc6 + CMOVQCS t3, acc7 + + RET +/* ---------------------------------------*/ +#define p256MulBy2Inline\ + XORQ mul0, mul0;\ + ADDQ acc4, acc4;\ + ADCQ acc5, acc5;\ + ADCQ acc6, acc6;\ + ADCQ acc7, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ $-1, t0;\ + SBBQ p256const0<>(SB), t1;\ + SBBQ $0, t2;\ + SBBQ p256const1<>(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; +/* ---------------------------------------*/ +#define p256AddInline \ + XORQ mul0, mul0;\ + ADDQ t0, acc4;\ + ADCQ t1, acc5;\ + ADCQ t2, acc6;\ + ADCQ t3, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ $-1, t0;\ + SBBQ p256const0<>(SB), t1;\ + SBBQ $0, t2;\ + SBBQ p256const1<>(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; +/* ---------------------------------------*/ +#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 +#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 +#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) +#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) +#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 +#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 +/* ---------------------------------------*/ +#define x1in(off) (32*0 + off)(SP) +#define y1in(off) (32*1 + off)(SP) +#define z1in(off) (32*2 + off)(SP) +#define x2in(off) (32*3 + off)(SP) +#define y2in(off) (32*4 + off)(SP) +#define xout(off) (32*5 + off)(SP) +#define yout(off) (32*6 + off)(SP) +#define zout(off) (32*7 + off)(SP) +#define s2(off) (32*8 + off)(SP) +#define z1sqr(off) (32*9 + off)(SP) +#define h(off) (32*10 + 
off)(SP) +#define r(off) (32*11 + off)(SP) +#define hsqr(off) (32*12 + off)(SP) +#define rsqr(off) (32*13 + off)(SP) +#define hcub(off) (32*14 + off)(SP) +#define rptr (32*15)(SP) +#define sel_save (32*15 + 8)(SP) +#define zero_save (32*15 + 8 + 4)(SP) + +// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) +TEXT ·p256PointAddAffineAsm(SB),0,$512-96 + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in1+24(FP), BX + MOVQ in2+48(FP), CX + MOVQ sign+72(FP), DX + MOVQ sel+80(FP), t1 + MOVQ zero+88(FP), t2 + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + MOVOU (16*4)(BX), X4 + MOVOU (16*5)(BX), X5 + + MOVOU X0, x1in(16*0) + MOVOU X1, x1in(16*1) + MOVOU X2, y1in(16*0) + MOVOU X3, y1in(16*1) + MOVOU X4, z1in(16*0) + MOVOU X5, z1in(16*1) + + MOVOU (16*0)(CX), X0 + MOVOU (16*1)(CX), X1 + + MOVOU X0, x2in(16*0) + MOVOU X1, x2in(16*1) + // Store pointer to result + MOVQ mul0, rptr + MOVL t1, sel_save + MOVL t2, zero_save + // Negate y2in based on sign + MOVQ (16*2 + 8*0)(CX), acc4 + MOVQ (16*2 + 8*1)(CX), acc5 + MOVQ (16*2 + 8*2)(CX), acc6 + MOVQ (16*2 + 8*3)(CX), acc7 + MOVQ $-1, acc0 + MOVQ p256const0<>(SB), acc1 + MOVQ $0, acc2 + MOVQ p256const1<>(SB), acc3 + XORQ mul0, mul0 + // Speculatively subtract + SUBQ acc4, acc0 + SBBQ acc5, acc1 + SBBQ acc6, acc2 + SBBQ acc7, acc3 + SBBQ $0, mul0 + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + MOVQ acc3, t3 + // Add in case the operand was > p256 + ADDQ $-1, acc0 + ADCQ p256const0<>(SB), acc1 + ADCQ $0, acc2 + ADCQ p256const1<>(SB), acc3 + ADCQ $0, mul0 + CMOVQNE t0, acc0 + CMOVQNE t1, acc1 + CMOVQNE t2, acc2 + CMOVQNE t3, acc3 + // If condition is 0, keep original value + TESTQ DX, DX + CMOVQEQ acc4, acc0 + CMOVQEQ acc5, acc1 + CMOVQEQ acc6, acc2 + CMOVQEQ acc7, acc3 + // Store result + MOVQ acc0, y2in(8*0) + MOVQ acc1, y2in(8*1) + MOVQ acc2, y2in(8*2) + MOVQ acc3, y2in(8*3) + // Begin point add + LDacc (z1in) + CALL sm2P256SqrInternal(SB) // z1ˆ2 + ST (z1sqr) + + LDt (x2in) + CALL sm2P256MulInternal(SB) // x2 * z1ˆ2 + + LDt (x1in) + CALL sm2P256SubInternal(SB) // h = u2 - u1 + ST (h) + + LDt (z1in) + CALL sm2P256MulInternal(SB) // z3 = h * z1 + ST (zout) + + LDacc (z1sqr) + CALL sm2P256MulInternal(SB) // z1ˆ3 + + LDt (y2in) + CALL sm2P256MulInternal(SB) // s2 = y2 * z1ˆ3 + ST (s2) + + LDt (y1in) + CALL sm2P256SubInternal(SB) // r = s2 - s1 + ST (r) + + CALL sm2P256SqrInternal(SB) // rsqr = rˆ2 + ST (rsqr) + + LDacc (h) + CALL sm2P256SqrInternal(SB) // hsqr = hˆ2 + ST (hsqr) + + LDt (h) + CALL sm2P256MulInternal(SB) // hcub = hˆ3 + ST (hcub) + + LDt (y1in) + CALL sm2P256MulInternal(SB) // y1 * hˆ3 + ST (s2) + + LDacc (x1in) + LDt (hsqr) + CALL sm2P256MulInternal(SB) // u1 * hˆ2 + ST (h) + + p256MulBy2Inline // u1 * hˆ2 * 2, inline + LDacc (rsqr) + CALL sm2P256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 + + LDt (hcub) + CALL sm2P256SubInternal(SB) + ST (xout) + + MOVQ acc4, t0 + MOVQ acc5, t1 + MOVQ acc6, t2 + MOVQ acc7, t3 + LDacc (h) + CALL sm2P256SubInternal(SB) + + LDt (r) + CALL sm2P256MulInternal(SB) + + LDt (s2) + CALL sm2P256SubInternal(SB) + ST (yout) + // Load stored values from stack + MOVQ rptr, AX + MOVL sel_save, BX + MOVL zero_save, CX + // The result is not valid if (sel == 0), conditional choose + MOVOU xout(16*0), X0 + MOVOU xout(16*1), X1 + MOVOU yout(16*0), X2 + MOVOU yout(16*1), X3 + MOVOU zout(16*0), X4 + MOVOU zout(16*1), X5 + + MOVL BX, X6 + MOVL CX, X7 + + PXOR X8, X8 + PCMPEQL X9, X9 + + PSHUFD $0, X6, X6 + PSHUFD $0, X7, X7 + + PCMPEQL 
X8, X6 + PCMPEQL X8, X7 + + MOVOU X6, X15 + PANDN X9, X15 + + MOVOU x1in(16*0), X9 + MOVOU x1in(16*1), X10 + MOVOU y1in(16*0), X11 + MOVOU y1in(16*1), X12 + MOVOU z1in(16*0), X13 + MOVOU z1in(16*1), X14 + + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + + PAND X6, X9 + PAND X6, X10 + PAND X6, X11 + PAND X6, X12 + PAND X6, X13 + PAND X6, X14 + + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 + // Similarly if zero == 0 + PCMPEQL X9, X9 + MOVOU X7, X15 + PANDN X9, X15 + + MOVOU x2in(16*0), X9 + MOVOU x2in(16*1), X10 + MOVOU y2in(16*0), X11 + MOVOU y2in(16*1), X12 + MOVOU p256one<>+0x00(SB), X13 + MOVOU p256one<>+0x10(SB), X14 + + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + + PAND X7, X9 + PAND X7, X10 + PAND X7, X11 + PAND X7, X12 + PAND X7, X13 + PAND X7, X14 + + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 + // Finally output the result + MOVOU X0, (16*0)(AX) + MOVOU X1, (16*1)(AX) + MOVOU X2, (16*2)(AX) + MOVOU X3, (16*3)(AX) + MOVOU X4, (16*4)(AX) + MOVOU X5, (16*5)(AX) + MOVQ $0, rptr + + RET +#undef x1in +#undef y1in +#undef z1in +#undef x2in +#undef y2in +#undef xout +#undef yout +#undef zout +#undef s2 +#undef z1sqr +#undef h +#undef r +#undef hsqr +#undef rsqr +#undef hcub +#undef rptr +#undef sel_save +#undef zero_save + +// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero +// otherwise. It writes to [acc4..acc7], t0 and t1. +TEXT sm2P256IsZero(SB),NOSPLIT,$0 + // AX contains a flag that is set if the input is zero. + XORQ AX, AX + MOVQ $1, t1 + + // Check whether [acc4..acc7] are all zero. + MOVQ acc4, t0 + ORQ acc5, t0 + ORQ acc6, t0 + ORQ acc7, t0 + + // Set the zero flag if so. (CMOV of a constant to a register doesn't + // appear to be supported in Go. Thus t1 = 1.) + CMOVQEQ t1, AX + + // XOR [acc4..acc7] with P and compare with zero again. + XORQ $-1, acc4 + XORQ p256const0<>(SB), acc5 + XORQ p256const1<>(SB), acc7 + ORQ acc5, acc4 + ORQ acc6, acc4 + ORQ acc7, acc4 + + // Set the zero flag if so. 
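	// (A value that is zero mod p is represented either by 0 or by p
	// itself, hence the second comparison.)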
+/* ---------------------------------------*/
+#define x1in(off)	(32*0 + off)(SP)
+#define y1in(off)	(32*1 + off)(SP)
+#define z1in(off)	(32*2 + off)(SP)
+#define x2in(off)	(32*3 + off)(SP)
+#define y2in(off)	(32*4 + off)(SP)
+#define z2in(off)	(32*5 + off)(SP)
+
+#define xout(off)	(32*6 + off)(SP)
+#define yout(off)	(32*7 + off)(SP)
+#define zout(off)	(32*8 + off)(SP)
+
+#define u1(off)	(32*9 + off)(SP)
+#define u2(off)	(32*10 + off)(SP)
+#define s1(off)	(32*11 + off)(SP)
+#define s2(off)	(32*12 + off)(SP)
+#define z1sqr(off)	(32*13 + off)(SP)
+#define z2sqr(off)	(32*14 + off)(SP)
+#define h(off)	(32*15 + off)(SP)
+#define r(off)	(32*16 + off)(SP)
+#define hsqr(off)	(32*17 + off)(SP)
+#define rsqr(off)	(32*18 + off)(SP)
+#define hcub(off)	(32*19 + off)(SP)
+#define rptr	(32*20)(SP)
+#define points_eq	(32*20+8)(SP)
+
+// func p256PointAddAsm(res, in1, in2 []uint64) int
+TEXT ·p256PointAddAsm(SB),0,$680-80
+	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
+	// Move input to stack in order to free registers
+	MOVQ res+0(FP), AX
+	MOVQ in1+24(FP), BX
+	MOVQ in2+48(FP), CX
+
+	MOVOU (16*0)(BX), X0
+	MOVOU (16*1)(BX), X1
+	MOVOU (16*2)(BX), X2
+	MOVOU (16*3)(BX), X3
+	MOVOU (16*4)(BX), X4
+	MOVOU (16*5)(BX), X5
+
+	MOVOU X0, x1in(16*0)
+	MOVOU X1, x1in(16*1)
+	MOVOU X2, y1in(16*0)
+	MOVOU X3, y1in(16*1)
+	MOVOU X4, z1in(16*0)
+	MOVOU X5, z1in(16*1)
+
+	MOVOU (16*0)(CX), X0
+	MOVOU (16*1)(CX), X1
+	MOVOU (16*2)(CX), X2
+	MOVOU (16*3)(CX), X3
+	MOVOU (16*4)(CX), X4
+	MOVOU (16*5)(CX), X5
+
+	MOVOU X0, x2in(16*0)
+	MOVOU X1, x2in(16*1)
+	MOVOU X2, y2in(16*0)
+	MOVOU X3, y2in(16*1)
+	MOVOU X4, z2in(16*0)
+	MOVOU X5, z2in(16*1)
+	// Store pointer to result
+	MOVQ AX, rptr
+	// Begin point add
+	LDacc (z2in)
+	CALL sm2P256SqrInternal(SB)	// z2ˆ2
+	ST (z2sqr)
+	LDt (z2in)
+	CALL sm2P256MulInternal(SB)	// z2ˆ3
+	LDt (y1in)
+	CALL sm2P256MulInternal(SB)	// s1 = z2ˆ3 * y1
+	ST (s1)
+
+	LDacc (z1in)
+	CALL sm2P256SqrInternal(SB)	// z1ˆ2
+	ST (z1sqr)
+	LDt (z1in)
+	CALL sm2P256MulInternal(SB)	// z1ˆ3
+	LDt (y2in)
+	CALL sm2P256MulInternal(SB)	// s2 = z1ˆ3 * y2
+	ST (s2)
+
+	LDt (s1)
+	CALL sm2P256SubInternal(SB)	// r = s2 - s1
+	ST (r)
+	CALL sm2P256IsZero(SB)
+	MOVQ AX, points_eq
+
+	LDacc (z2sqr)
+	LDt (x1in)
+	CALL sm2P256MulInternal(SB)	// u1 = x1 * z2ˆ2
+	ST (u1)
+	LDacc (z1sqr)
+	LDt (x2in)
+	CALL sm2P256MulInternal(SB)	// u2 = x2 * z1ˆ2
+	ST (u2)
+
+	LDt (u1)
+	CALL sm2P256SubInternal(SB)	// h = u2 - u1
+	ST (h)
+	CALL sm2P256IsZero(SB)
+	ANDQ points_eq, AX
+	MOVQ AX, points_eq
+
+	LDacc (r)
+	CALL sm2P256SqrInternal(SB)	// rsqr = rˆ2
+	ST (rsqr)
+
+	LDacc (h)
+	CALL sm2P256SqrInternal(SB)	// hsqr = hˆ2
+	ST (hsqr)
+
+	LDt (h)
+	CALL sm2P256MulInternal(SB)	// hcub = hˆ3
+	ST (hcub)
+
+	LDt (s1)
+	CALL sm2P256MulInternal(SB)	// s1 * hˆ3
+	ST (s2)
+
+	LDacc (z1in)
+	LDt (z2in)
+	CALL sm2P256MulInternal(SB)	// z1 * z2
+	LDt (h)
+	CALL sm2P256MulInternal(SB)	// z3 = z1 * z2 * h
+	ST (zout)
+
+	LDacc (hsqr)
+	LDt (u1)
+	CALL sm2P256MulInternal(SB)	// u1 * hˆ2
+	ST (u2)
+
+	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
+	LDacc (rsqr)
+	CALL sm2P256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
+
+	LDt (hcub)
+	CALL sm2P256SubInternal(SB)	// x3 = rˆ2 - 2 * u1 * hˆ2 - hˆ3
+	ST (xout)
+
+	MOVQ acc4, t0
+	MOVQ acc5, t1
+	MOVQ acc6, t2
+	MOVQ acc7, t3
+	LDacc (u2)
+	CALL sm2P256SubInternal(SB)	// u1 * hˆ2 - x3
+
+	LDt (r)
+	CALL sm2P256MulInternal(SB)	// r * (u1 * hˆ2 - x3)
+
+	LDt (s2)
+	CALL sm2P256SubInternal(SB)	// y3 = r * (u1 * hˆ2 - x3) - s1 * hˆ3
+	ST (yout)
+
+	MOVOU xout(16*0), X0
+	MOVOU xout(16*1), X1
+	MOVOU yout(16*0), X2
+	MOVOU yout(16*1), X3
+	MOVOU zout(16*0), X4
+	MOVOU zout(16*1), X5
+	// Finally output the result
+	MOVQ rptr, AX
+	MOVQ $0, rptr
+	MOVOU X0, (16*0)(AX)
+	MOVOU X1, (16*1)(AX)
+	MOVOU X2, (16*2)(AX)
+	MOVOU X3, (16*3)(AX)
+	MOVOU X4, (16*4)(AX)
+	MOVOU X5, (16*5)(AX)
+
+	MOVQ points_eq, AX
+	MOVQ AX, ret+72(FP)
+
+	RET
+#undef x1in
+#undef y1in
+#undef z1in
+#undef x2in
+#undef y2in
+#undef z2in
+#undef xout
+#undef yout
+#undef zout
+#undef s1
+#undef s2
+#undef u1
+#undef u2
+#undef z1sqr
+#undef z2sqr
+#undef h
+#undef r
+#undef hsqr
+#undef rsqr
+#undef hcub
+#undef rptr
+#undef points_eq
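Because h and r both vanish when the two inputs coincide, p256PointAddAsm cannot compute P + P; it instead reports equality through its return value (the AND of the two sm2P256IsZero results), and the Go wrapper is expected to substitute a doubling. A simplified, hypothetical wrapper inside package sm2 — pointAddSafe is not part of this patch, but it uses only the stubs declared in p256_asm.go, and real callers must additionally handle the point at infinity:

// pointAddSafe adds a and b, falling back to doubling when a == b.
func pointAddSafe(res, a, b *p256Point) {
	var sum, dbl p256Point
	pointsEq := p256PointAddAsm(sum.xyz[:], a.xyz[:], b.xyz[:])
	p256PointDoubleAsm(dbl.xyz[:], a.xyz[:])
	// p256MovCond: if cond == 0 res <- b; else res <- a.
	// When the inputs were equal, take the doubling instead of the bogus sum.
	p256MovCond(res.xyz[:], dbl.xyz[:], sum.xyz[:], pointsEq)
}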
+/* ---------------------------------------*/
+#define x(off)	(32*0 + off)(SP)
+#define y(off)	(32*1 + off)(SP)
+#define z(off)	(32*2 + off)(SP)
+
+#define s(off)	(32*3 + off)(SP)
+#define m(off)	(32*4 + off)(SP)
+#define zsqr(off)	(32*5 + off)(SP)
+#define tmp(off)	(32*6 + off)(SP)
+#define rptr	(32*7)(SP)
+
+// func p256PointDoubleAsm(res, in []uint64)
+TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
+	// Move input to stack in order to free registers
+	MOVQ res+0(FP), AX
+	MOVQ in+24(FP), BX
+
+	MOVOU (16*0)(BX), X0
+	MOVOU (16*1)(BX), X1
+	MOVOU (16*2)(BX), X2
+	MOVOU (16*3)(BX), X3
+	MOVOU (16*4)(BX), X4
+	MOVOU (16*5)(BX), X5
+
+	MOVOU X0, x(16*0)
+	MOVOU X1, x(16*1)
+	MOVOU X2, y(16*0)
+	MOVOU X3, y(16*1)
+	MOVOU X4, z(16*0)
+	MOVOU X5, z(16*1)
+	// Store pointer to result
+	MOVQ AX, rptr
+	// Begin point double
+	LDacc (z)
+	CALL sm2P256SqrInternal(SB)	// zˆ2
+	ST (zsqr)
+
+	LDt (x)
+	p256AddInline	// x + zˆ2
+	STt (m)
+
+	LDacc (z)
+	LDt (y)
+	CALL sm2P256MulInternal(SB)	// y * z
+	p256MulBy2Inline	// z3 = 2 * y * z
+	MOVQ rptr, AX
+	// Store z
+	MOVQ t0, (16*4 + 8*0)(AX)
+	MOVQ t1, (16*4 + 8*1)(AX)
+	MOVQ t2, (16*4 + 8*2)(AX)
+	MOVQ t3, (16*4 + 8*3)(AX)
+
+	LDacc (x)
+	LDt (zsqr)
+	CALL sm2P256SubInternal(SB)	// x - zˆ2
+	LDt (m)
+	CALL sm2P256MulInternal(SB)	// (x - zˆ2) * (x + zˆ2)
+	ST (m)
+	// Multiply by 3
+	p256MulBy2Inline
+	LDacc (m)
+	p256AddInline	// m = 3 * (x - zˆ2) * (x + zˆ2)
+	STt (m)
+	////////////////////////
+	LDacc (y)
+	p256MulBy2Inline
+	t2acc
+	CALL sm2P256SqrInternal(SB)	// 4 * yˆ2
+	ST (s)
+	CALL sm2P256SqrInternal(SB)	// 16 * yˆ4
+	// Divide by 2
+	XORQ mul0, mul0
+	MOVQ acc4, t0
+	MOVQ acc5, t1
+	MOVQ acc6, t2
+	MOVQ acc7, t3
+
+	// Add p; limb 2 of the SM2 prime is all ones, unlike NIST P-256
+	ADDQ $-1, acc4
+	ADCQ p256const0<>(SB), acc5
+	ADCQ $-1, acc6
+	ADCQ p256const1<>(SB), acc7
+	ADCQ $0, mul0
+	TESTQ $1, t0
+
+	CMOVQEQ t0, acc4
+	CMOVQEQ t1, acc5
+	CMOVQEQ t2, acc6
+	CMOVQEQ t3, acc7
+	ANDQ t0, mul0
+
+	SHRQ $1, acc5, acc4
+	SHRQ $1, acc6, acc5
+	SHRQ $1, acc7, acc6
+	SHRQ $1, mul0, acc7
+	ST (y)	// 8 * yˆ4
+	/////////////////////////
+	LDacc (x)
+	LDt (s)
+	CALL sm2P256MulInternal(SB)	// s = 4 * x * yˆ2
+	ST (s)
+	p256MulBy2Inline	// 2 * s
+	STt (tmp)
+
+	LDacc (m)
+	CALL sm2P256SqrInternal(SB)	// mˆ2
+	LDt (tmp)
+	CALL sm2P256SubInternal(SB)	// x3 = mˆ2 - 2 * s
+
+	MOVQ rptr, AX
+	// Store x
+	MOVQ acc4, (16*0 + 8*0)(AX)
+	MOVQ acc5, (16*0 + 8*1)(AX)
+	MOVQ acc6, (16*0 + 8*2)(AX)
+	MOVQ acc7, (16*0 + 8*3)(AX)
+
+	acc2t
+	LDacc (s)
+	CALL sm2P256SubInternal(SB)	// s - x3
+
+	LDt (m)
+	CALL sm2P256MulInternal(SB)	// m * (s - x3)
+
+	LDt (y)
+	CALL sm2P256SubInternal(SB)	// y3 = m * (s - x3) - 8 * yˆ4
+	MOVQ rptr, AX
+	// Store y
+	MOVQ acc4, (16*2 + 8*0)(AX)
+	MOVQ acc5, (16*2 + 8*1)(AX)
+	MOVQ acc6, (16*2 + 8*2)(AX)
+	MOVQ acc7, (16*2 + 8*3)(AX)
+	///////////////////////
+	MOVQ $0, rptr
+
+	RET
+/* ---------------------------------------*/
+
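The "Divide by 2" block in the doubling routine uses a classic trick: since p is odd, y/2 mod p is simply y >> 1 when y is even, and (y + p) >> 1 (a 257-bit shift) when y is odd. A branch-free Go sketch of the same computation, with math/bits standing in for the ADC/SHRD instructions; the names here are illustrative, not from the patch:

package sketch

import "math/bits"

// sm2P holds the little-endian 64-bit limbs of the SM2 prime.
var sm2P = [4]uint64{
	0xffffffffffffffff, 0xffffffff00000000,
	0xffffffffffffffff, 0xfffffffeffffffff,
}

// halfModP returns y/2 mod p for 0 <= y < p without branching on y.
func halfModP(y [4]uint64) [4]uint64 {
	// t = y + p, keeping the carry-out as the 257th bit.
	var t [4]uint64
	var c uint64
	t[0], c = bits.Add64(y[0], sm2P[0], 0)
	t[1], c = bits.Add64(y[1], sm2P[1], c)
	t[2], c = bits.Add64(y[2], sm2P[2], c)
	t[3], c = bits.Add64(y[3], sm2P[3], c)
	// Keep y itself when it is even (the CMOVQEQ step in the assembly).
	mask := -(y[0] & 1) // all ones if y is odd, zero if even
	for i := range t {
		t[i] = (t[i] & mask) | (y[i] &^ mask)
	}
	c &= mask // the carry only counts when p was really added (ANDQ t0, mul0)
	// Shift the 257-bit value [c, t[3] .. t[0]] right by one (the SHRQ chain).
	t[0] = t[0]>>1 | t[1]<<63
	t[1] = t[1]>>1 | t[2]<<63
	t[2] = t[2]>>1 | t[3]<<63
	t[3] = t[3]>>1 | c<<63
	return t
}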
diff --git a/sm2/p256_generic.go b/sm2/p256_generic.go
new file mode 100644
index 0000000..b805078
--- /dev/null
+++ b/sm2/p256_generic.go
@@ -0,0 +1,12 @@
+// +build !amd64
+
+package sm2
+
+var (
+	p256 p256Curve
+)
+
+func initP256Arch() {
+	// Use the pure Go implementation.
+	p256 = p256Curve{p256Params}
+}
diff --git a/sm2/sm2.go b/sm2/sm2.go
index 4a7b450..069b460 100644
--- a/sm2/sm2.go
+++ b/sm2/sm2.go
@@ -14,6 +14,7 @@ import (
 	"io"
 	"math/big"
 	"strings"
+	"sync"
 
 	"github.com/emmansun/gmsm/sm3"
 )
@@ -58,8 +59,18 @@ func (priv *PrivateKey) Sign(rand io.Reader, digest []byte, opts crypto.SignerOp
 	return asn1.Marshal(ecdsaSignature{r, s})
 }
 
+var (
+	one      = new(big.Int).SetInt64(1)
+	initonce sync.Once
+)
+
+// P256 returns the sm2p256v1 curve singleton, initializing it on first use.
+func P256() elliptic.Curve {
+	initonce.Do(initP256)
+	return p256
+}
+
 ///////////////// below code ship from golan crypto/ecdsa ////////////////////
-var one = new(big.Int).SetInt64(1)
 
 // randFieldElement returns a random element of the field underlying the given
 // curve using the procedure given in [NSA] A.2.1.
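With initP256Arch dispatching per GOARCH, callers keep using the single P256 entry point and get the amd64-accelerated curve when available, the generic fallback otherwise. A minimal usage sketch, assuming this repository's module path; any elliptic.Curve consumer such as crypto/ecdsa works unchanged:

package main

import (
	"crypto/ecdsa"
	"crypto/rand"
	"fmt"

	"github.com/emmansun/gmsm/sm2"
)

func main() {
	curve := sm2.P256() // one-time initialization behind sync.Once
	priv, err := ecdsa.GenerateKey(curve, rand.Reader)
	if err != nil {
		panic(err)
	}
	fmt.Println(curve.Params().Name, priv.PublicKey.X.BitLen())
}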