From 4de671cd8bd1c4003076b677750b79a755256763 Mon Sep 17 00:00:00 2001 From: Emman Date: Mon, 8 Feb 2021 08:53:07 +0800 Subject: [PATCH] MAGIC - init amd64 optimization --- sm2/p256.go | 28 +- sm2/p256_asm.go | 599 +++++++++++ sm2/p256_asm_amd64.s | 2353 ++++++++++++++++++++++++++++++++++++++++++ sm2/p256_generic.go | 12 + sm2/sm2.go | 13 +- 5 files changed, 2988 insertions(+), 17 deletions(-) create mode 100644 sm2/p256_asm.go create mode 100644 sm2/p256_asm_amd64.s create mode 100644 sm2/p256_generic.go diff --git a/sm2/p256.go b/sm2/p256.go index fb891c1..e4d6743 100644 --- a/sm2/p256.go +++ b/sm2/p256.go @@ -1,10 +1,11 @@ +// +build !amd64 + package sm2 import ( "crypto/elliptic" "fmt" "math/big" - "sync" ) // See https://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background. @@ -14,24 +15,22 @@ type p256Curve struct { } var ( - p256 p256Curve + p256Params *elliptic.CurveParams // RInverse contains 1/R mod p - the inverse of the Montgomery constant // (2**257). p256RInverse *big.Int - - initonce sync.Once ) func initP256() { - p256.CurveParams = &elliptic.CurveParams{Name: "sm2p256v1"} + p256Params = &elliptic.CurveParams{Name: "sm2p256v1"} // 2**256 - 2**224 - 2**96 + 2**64 - 1 - p256.P, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) - p256.N, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) - p256.B, _ = new(big.Int).SetString("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93", 16) - p256.Gx, _ = new(big.Int).SetString("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7", 16) - p256.Gy, _ = new(big.Int).SetString("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0", 16) - p256.BitSize = 256 + p256Params.P, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) + p256Params.N, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) + p256Params.B, _ = new(big.Int).SetString("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93", 16) + p256Params.Gx, _ = new(big.Int).SetString("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7", 16) + p256Params.Gy, _ = new(big.Int).SetString("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0", 16) + p256Params.BitSize = 256 // ModeInverse(2**257, P) // p256RInverse = big.NewInt(0) @@ -39,12 +38,9 @@ func initP256() { // p256RInverse.ModInverse(r, p256.P) // fmt.Printf("%s\n", hex.EncodeToString(p256RInverse.Bytes())) p256RInverse, _ = new(big.Int).SetString("7ffffffd80000002fffffffe000000017ffffffe800000037ffffffc80000002", 16) -} -// P256 init and return the singleton -func P256() elliptic.Curve { - initonce.Do(initP256) - return p256 + // Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation + initP256Arch() } func (curve p256Curve) Params() *elliptic.CurveParams { diff --git a/sm2/p256_asm.go b/sm2/p256_asm.go new file mode 100644 index 0000000..7457d03 --- /dev/null +++ b/sm2/p256_asm.go @@ -0,0 +1,599 @@ +// This file contains the Go wrapper for the constant-time, 64-bit assembly +// implementation of P256. 
The optimizations performed here are described in +// detail in: +// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with +// 256-bit primes" +// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x +// https://eprint.iacr.org/2013/816.pdf + +// +build amd64 + +package sm2 + +import ( + "crypto/elliptic" + "math/big" + "sync" +) + +type ( + p256Curve struct { + *elliptic.CurveParams + } + + p256Point struct { + xyz [12]uint64 + } +) + +var ( + p256 p256Curve + p256Precomputed *[43][32 * 8]uint64 + precomputeOnce sync.Once +) + +func initP256() { + // 2**256 - 2**224 - 2**96 + 2**64 - 1 + p256.CurveParams = &elliptic.CurveParams{Name: "sm2p256v1"} + p256.P, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) + p256.N, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) + p256.B, _ = new(big.Int).SetString("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93", 16) + p256.Gx, _ = new(big.Int).SetString("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7", 16) + p256.Gy, _ = new(big.Int).SetString("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0", 16) + p256.BitSize = 256 +} + +func (curve p256Curve) Params() *elliptic.CurveParams { + return curve.CurveParams +} + +// Functions implemented in p256_asm_*64.s +// Montgomery multiplication modulo P256 +//go:noescape +func p256Mul(res, in1, in2 []uint64) + +// Montgomery square modulo P256, repeated n times (n >= 1) +//go:noescape +func p256Sqr(res, in []uint64, n int) + +// Montgomery multiplication by 1 +//go:noescape +func p256FromMont(res, in []uint64) + +// iff cond == 1 val <- -val +//go:noescape +func p256NegCond(val []uint64, cond int) + +// if cond == 0 res <- b; else res <- a +//go:noescape +func p256MovCond(res, a, b []uint64, cond int) + +// Endianness swap +//go:noescape +func p256BigToLittle(res []uint64, in []byte) + +//go:noescape +func p256LittleToBig(res []byte, in []uint64) + +// Constant time table access +//go:noescape +func p256Select(point, table []uint64, idx int) + +//go:noescape +func p256SelectBase(point, table []uint64, idx int) + +// Montgomery multiplication modulo Ord(G) +//go:noescape +func p256OrdMul(res, in1, in2 []uint64) + +// Montgomery square modulo Ord(G), repeated n times +//go:noescape +func p256OrdSqr(res, in []uint64, n int) + +// Point add with in2 being affine point +// If sign == 1 -> in2 = -in2 +// If sel == 0 -> res = in1 +// if zero == 0 -> res = in2 +//go:noescape +func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) + +// Point add. Returns one if the two input points were equal and zero +// otherwise. (Note that, due to the way that the equations work out, some +// representations of ∞ are considered equal to everything by this function.) +//go:noescape +func p256PointAddAsm(res, in1, in2 []uint64) int + +// Point double +//go:noescape +func p256PointDoubleAsm(res, in []uint64) + +func (curve p256Curve) Inverse(k *big.Int) *big.Int { + if k.Sign() < 0 { + // This should never happen. + k = new(big.Int).Neg(k) + } + + if k.Cmp(p256.N) >= 0 { + // This should never happen. + k = new(big.Int).Mod(k, p256.N) + } + + // table will store precomputed powers of x. 
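+	// The slice names give the exponent in binary: _101 holds x^0b101 =
+	// x^5, _101111 holds x^0b101111 = x^47, and so on, all kept in the
+	// Montgomery domain.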
+	var table [4 * 9]uint64
+	var (
+		_1      = table[4*0 : 4*1]
+		_11     = table[4*1 : 4*2]
+		_101    = table[4*2 : 4*3]
+		_111    = table[4*3 : 4*4]
+		_1111   = table[4*4 : 4*5]
+		_10101  = table[4*5 : 4*6]
+		_101111 = table[4*6 : 4*7]
+		x       = table[4*7 : 4*8]
+		t       = table[4*8 : 4*9]
+	)
+
+	fromBig(x[:], k)
+	// This code operates in the Montgomery domain where R = 2^256 mod n
+	// and n is the order of the scalar field. (See initP256 for the
+	// value.) Elements in the Montgomery domain take the form a×R and
+	// multiplication of x and y in the domain calculates (x × y × R^-1)
+	// mod n. RR is R×R mod n, thus the Montgomery multiplication of x and
+	// RR gives x×R, i.e. converts x into the Montgomery domain.
+	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
+	RR := []uint64{0x901192af7c114f20, 0x3464504ade6fa2fa, 0x620fc84c3affe0d4, 0x1eb5e412a22b3d3b}
+
+	p256OrdMul(_1, x, RR)      // _1
+	p256OrdSqr(x, _1, 1)       // _10
+	p256OrdMul(_11, x, _1)     // _11
+	p256OrdMul(_101, x, _11)   // _101
+	p256OrdMul(_111, x, _101)  // _111
+	p256OrdSqr(x, _101, 1)     // _1010
+	p256OrdMul(_1111, _101, x) // _1111
+
+	p256OrdSqr(t, x, 1)          // _10100
+	p256OrdMul(_10101, t, _1)    // _10101
+	p256OrdSqr(x, _10101, 1)     // _101010
+	p256OrdMul(_101111, _101, x) // _101111
+	p256OrdMul(x, _10101, x)     // _111111 = x6
+	p256OrdSqr(t, x, 2)          // _11111100
+	p256OrdMul(t, t, _11)        // _11111111 = x8
+	p256OrdSqr(x, t, 8)          // _ff00
+	p256OrdMul(x, x, t)          // _ffff = x16
+	p256OrdSqr(t, x, 16)         // _ffff0000
+	p256OrdMul(t, t, x)          // _ffffffff = x32
+
+	p256OrdSqr(x, t, 64)
+	p256OrdMul(x, x, t)
+	p256OrdSqr(x, x, 32)
+	p256OrdMul(x, x, t)
+
+	sqrs := []uint8{
+		6, 5, 4, 5, 5,
+		4, 3, 3, 5, 9,
+		6, 2, 5, 6, 5,
+		4, 5, 5, 3, 10,
+		2, 5, 5, 3, 7, 6}
+	muls := [][]uint64{
+		_101111, _111, _11, _1111, _10101,
+		_101, _101, _101, _111, _101111,
+		_1111, _1, _1, _1111, _111,
+		_111, _111, _101, _11, _101111,
+		_11, _11, _11, _1, _10101, _1111}
+
+	for i, s := range sqrs {
+		p256OrdSqr(x, x, int(s))
+		p256OrdMul(x, x, muls[i])
+	}
+
+	// Multiplying by one in the Montgomery domain converts a Montgomery
+	// value out of the domain.
+	one := []uint64{1, 0, 0, 0}
+	p256OrdMul(x, x, one)
+
+	xOut := make([]byte, 32)
+	p256LittleToBig(xOut, x)
+	return new(big.Int).SetBytes(xOut)
+}
+
+// fromBig converts a *big.Int into a format used by this code.
+func fromBig(out []uint64, big *big.Int) {
+	for i := range out {
+		out[i] = 0
+	}
+
+	for i, v := range big.Bits() {
+		out[i] = uint64(v)
+	}
+}
+
+// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
+// to out. If the scalar is equal to or greater than the order of the group,
+// it's reduced modulo that order.
+func p256GetScalar(out []uint64, in []byte) {
+	n := new(big.Int).SetBytes(in)
+
+	if n.Cmp(p256.N) >= 0 {
+		n.Mod(n, p256.N)
+	}
+	fromBig(out, n)
+}
+
+// p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
+// prime of the curve's underlying field. (See initP256 for the value.) Thus rr
+// here is R×R mod p. See comment in Inverse about how this is used.
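+// Concretely rr = 2^512 mod p, so one Montgomery multiplication by rr maps x
+// to x × rr × R^-1 = x × R mod p, i.e. into the Montgomery domain.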
+var rr = []uint64{0x0000000000000003, 0x00000002ffffffff, 0x0000000100000001, 0x0000000400000002}
+
+func maybeReduceModP(in *big.Int) *big.Int {
+	if in.Cmp(p256.P) < 0 {
+		return in
+	}
+	return new(big.Int).Mod(in, p256.P)
+}
+
+func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	var r1, r2 p256Point
+	p256GetScalar(scalarReversed, baseScalar)
+	r1IsInfinity := scalarIsZero(scalarReversed)
+	r1.p256BaseMult(scalarReversed)
+
+	p256GetScalar(scalarReversed, scalar)
+	r2IsInfinity := scalarIsZero(scalarReversed)
+	fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
+	fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
+	p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
+	p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])
+
+	// This sets r2's Z value to 1, in the Montgomery domain.
+	r2.xyz[8] = 0x0000000000000001
+	r2.xyz[9] = 0x00000000ffffffff
+	r2.xyz[10] = 0x0000000000000000
+	r2.xyz[11] = 0x0000000100000000
+
+	r2.p256ScalarMult(scalarReversed)
+
+	var sum, double p256Point
+	pointsEqual := p256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
+	p256PointDoubleAsm(double.xyz[:], r1.xyz[:])
+	sum.CopyConditional(&double, pointsEqual)
+	sum.CopyConditional(&r1, r2IsInfinity)
+	sum.CopyConditional(&r2, r1IsInfinity)
+
+	return sum.p256PointToAffine()
+}
+
+func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	p256GetScalar(scalarReversed, scalar)
+
+	var r p256Point
+	r.p256BaseMult(scalarReversed)
+	return r.p256PointToAffine()
+}
+
+func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	p256GetScalar(scalarReversed, scalar)
+
+	var r p256Point
+	fromBig(r.xyz[0:4], maybeReduceModP(bigX))
+	fromBig(r.xyz[4:8], maybeReduceModP(bigY))
+	p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
+	p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
+	// This sets r's Z value to 1, in the Montgomery domain.
+	r.xyz[8] = 0x0000000000000001
+	r.xyz[9] = 0x00000000ffffffff
+	r.xyz[10] = 0x0000000000000000
+	r.xyz[11] = 0x0000000100000000
+
+	r.p256ScalarMult(scalarReversed)
+	return r.p256PointToAffine()
+}
+
+// uint64IsZero returns 1 if x is zero and zero otherwise.
+func uint64IsZero(x uint64) int {
+	x = ^x
+	x &= x >> 32
+	x &= x >> 16
+	x &= x >> 8
+	x &= x >> 4
+	x &= x >> 2
+	x &= x >> 1
+	return int(x & 1)
+}
+
+// scalarIsZero returns 1 if scalar represents the zero value, and zero
+// otherwise.
+func scalarIsZero(scalar []uint64) int {
+	return uint64IsZero(scalar[0] | scalar[1] | scalar[2] | scalar[3])
+}
+
+func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
+	zInv := make([]uint64, 4)
+	zInvSq := make([]uint64, 4)
+	p256Inverse(zInv, p.xyz[8:12])
+	p256Sqr(zInvSq, zInv, 1)
+	p256Mul(zInv, zInv, zInvSq)
+
+	p256Mul(zInvSq, p.xyz[0:4], zInvSq)
+	p256Mul(zInv, p.xyz[4:8], zInv)
+
+	p256FromMont(zInvSq, zInvSq)
+	p256FromMont(zInv, zInv)
+
+	xOut := make([]byte, 32)
+	yOut := make([]byte, 32)
+	p256LittleToBig(xOut, zInvSq)
+	p256LittleToBig(yOut, zInv)
+
+	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
+}
+
+// CopyConditional overwrites p with src if v == 1, and leaves p unchanged if
+// v == 0.
+func (p *p256Point) CopyConditional(src *p256Point, v int) {
+	pMask := uint64(v) - 1
+	srcMask := ^pMask
+
+	for i, n := range p.xyz {
+		p.xyz[i] = (n & pMask) | (src.xyz[i] & srcMask)
+	}
+}
+
+// p256Inverse sets out to in^-1 mod p.
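+// By Fermat's little theorem in^-1 = in^(p-2) mod p, and the fixed chain of
+// squarings and multiplications below evaluates that power in constant time.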
+func p256Inverse(out, in []uint64) {
+	var stack [9 * 4]uint64
+	p2 := stack[4*0 : 4*0+4]
+	p4 := stack[4*1 : 4*1+4]
+	p8 := stack[4*2 : 4*2+4]
+	p16 := stack[4*3 : 4*3+4]
+	p32 := stack[4*4 : 4*4+4]
+	p64 := stack[4*5 : 4*5+4]
+	p32m2 := stack[4*6 : 4*6+4]
+	ptmp := stack[4*7 : 4*7+4]
+
+	p256Sqr(out, in, 1)  // 2^1
+	p256Mul(p2, out, in) // 2^2 - 2^0
+
+	p256Sqr(out, p2, 2)    // 2^4 - 2^2
+	p256Mul(ptmp, out, in) // 2^4 - 2^1
+	p256Mul(p4, out, p2)   // 2^4 - 2^0
+
+	p256Sqr(out, p4, 4)      // 2^8 - 2^4
+	p256Mul(ptmp, out, ptmp) // 2^8 - 2^1
+	p256Mul(p8, out, p4)     // 2^8 - 2^0
+
+	p256Sqr(out, p8, 8)      // 2^16 - 2^8
+	p256Mul(ptmp, out, ptmp) // 2^16 - 2^1
+	p256Mul(p16, out, p8)    // 2^16 - 2^0
+
+	p256Sqr(out, p16, 16)     // 2^32 - 2^16
+	p256Mul(p32m2, out, ptmp) // 2^32 - 2^1
+	p256Mul(p32, out, p16)    // 2^32 - 2^0
+
+	p256Sqr(out, p32, 32)  // 2^64 - 2^32
+	p256Mul(p64, out, p32) // 2^64 - 2^0
+
+	p256Sqr(out, p64, 64)    // 2^128 - 2^64
+	p256Mul(out, out, p64)   // 2^128 - 2^0
+	p256Sqr(out, out, 32)    // 2^160 - 2^32
+	p256Mul(out, out, p32m2) // 2^160 - 2^1
+	p256Sqr(ptmp, out, 95)   // 2^255 - 2^96
+
+	p256Sqr(out, p32m2, 223) // 2^255 - 2^224
+	p256Mul(ptmp, ptmp, out) // 2^256 - 2^224 - 2^96
+
+	p256Sqr(out, p32, 16)  // 2^48 - 2^16
+	p256Mul(out, out, p16) // 2^48 - 2^0
+
+	p256Sqr(out, out, 8)  // 2^56 - 2^8
+	p256Mul(out, out, p8) // 2^56 - 2^0
+
+	p256Sqr(out, out, 4)  // 2^60 - 2^4
+	p256Mul(out, out, p4) // 2^60 - 2^0
+
+	p256Sqr(out, out, 2)  // 2^62 - 2^2
+	p256Mul(out, out, p2) // 2^62 - 2^0
+
+	p256Sqr(out, out, 2)  // 2^64 - 2^2
+	p256Mul(out, out, in) // 2^64 - 2^2 + 2^0
+
+	p256Mul(out, out, ptmp) // 2^256 - 2^224 - 2^96 + 2^64 - 3
+}
+
+func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) {
+	copy(r[index*12:], p.xyz[:])
+}
+
+func boothW5(in uint) (int, int) {
+	var s uint = ^((in >> 5) - 1)
+	var d uint = (1 << 6) - in - 1
+	d = (d & s) | (in & (^s))
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+func boothW6(in uint) (int, int) {
+	var s uint = ^((in >> 6) - 1)
+	var d uint = (1 << 7) - in - 1
+	d = (d & s) | (in & (^s))
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+func initTable() {
+	p256Precomputed = new([43][32 * 8]uint64)
+
+	basePoint := []uint64{
+		0x61328990f418029e, 0x3e7981eddca6c050, 0xd6a1ed99ac24c3c3, 0x91167a5ee1c13b05,
+		0xc1354e593c2d0ddd, 0xc1f5e5788d3295fa, 0x8d4cfb066e2a48f8, 0x63cd65d481d735bd,
+		0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000,
+	}
+	t1 := make([]uint64, 12)
+	t2 := make([]uint64, 12)
+	copy(t2, basePoint)
+
+	zInv := make([]uint64, 4)
+	zInvSq := make([]uint64, 4)
+	for j := 0; j < 32; j++ {
+		copy(t1, t2)
+		for i := 0; i < 43; i++ {
+			// The window size is 6 so we need to double 6 times.
+			if i != 0 {
+				for k := 0; k < 6; k++ {
+					p256PointDoubleAsm(t1, t1)
+				}
+			}
+			// Convert the point to affine form. (Its values are
+			// still in Montgomery form however.)
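+			// With Jacobian coordinates, affine x = X/Z^2 and
+			// affine y = Y/Z^3.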
+ p256Inverse(zInv, t1[8:12]) + p256Sqr(zInvSq, zInv, 1) + p256Mul(zInv, zInv, zInvSq) + + p256Mul(t1[:4], t1[:4], zInvSq) + p256Mul(t1[4:8], t1[4:8], zInv) + + copy(t1[8:12], basePoint[8:12]) + // Update the table entry + copy(p256Precomputed[i][j*8:], t1[:8]) + } + if j == 0 { + p256PointDoubleAsm(t2, basePoint) + } else { + p256PointAddAsm(t2, t2, basePoint) + } + } +} + +func (p *p256Point) p256BaseMult(scalar []uint64) { + precomputeOnce.Do(initTable) + + wvalue := (scalar[0] << 1) & 0x7f + sel, sign := boothW6(uint(wvalue)) + p256SelectBase(p.xyz[0:8], p256Precomputed[0][0:], sel) + p256NegCond(p.xyz[4:8], sign) + + // (This is one, in the Montgomery domain.) + p.xyz[8] = 0x0000000000000001 + p.xyz[9] = 0x00000000ffffffff + p.xyz[10] = 0x0000000000000000 + p.xyz[11] = 0x0000000100000000 + + var t0 p256Point + // (This is one, in the Montgomery domain.) + t0.xyz[8] = 0x0000000000000001 + t0.xyz[9] = 0x00000000ffffffff + t0.xyz[10] = 0x0000000000000000 + t0.xyz[11] = 0x0000000100000000 + + index := uint(5) + zero := sel + + for i := 1; i < 43; i++ { + if index < 192 { + wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f + } else { + wvalue = (scalar[index/64] >> (index % 64)) & 0x7f + } + index += 6 + sel, sign = boothW6(uint(wvalue)) + p256SelectBase(t0.xyz[0:8], p256Precomputed[i][0:], sel) + p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero) + zero |= sel + } +} + +func (p *p256Point) p256ScalarMult(scalar []uint64) { + // precomp is a table of precomputed points that stores powers of p + // from p^1 to p^16. + var precomp [16 * 4 * 3]uint64 + var t0, t1, t2, t3 p256Point + + // Prepare the table + p.p256StorePoint(&precomp, 0) // 1 + + p256PointDoubleAsm(t0.xyz[:], p.xyz[:]) + p256PointDoubleAsm(t1.xyz[:], t0.xyz[:]) + p256PointDoubleAsm(t2.xyz[:], t1.xyz[:]) + p256PointDoubleAsm(t3.xyz[:], t2.xyz[:]) + t0.p256StorePoint(&precomp, 1) // 2 + t1.p256StorePoint(&precomp, 3) // 4 + t2.p256StorePoint(&precomp, 7) // 8 + t3.p256StorePoint(&precomp, 15) // 16 + + p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:]) + p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:]) + p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:]) + t0.p256StorePoint(&precomp, 2) // 3 + t1.p256StorePoint(&precomp, 4) // 5 + t2.p256StorePoint(&precomp, 8) // 9 + + p256PointDoubleAsm(t0.xyz[:], t0.xyz[:]) + p256PointDoubleAsm(t1.xyz[:], t1.xyz[:]) + t0.p256StorePoint(&precomp, 5) // 6 + t1.p256StorePoint(&precomp, 9) // 10 + + p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:]) + p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:]) + t2.p256StorePoint(&precomp, 6) // 7 + t1.p256StorePoint(&precomp, 10) // 11 + + p256PointDoubleAsm(t0.xyz[:], t0.xyz[:]) + p256PointDoubleAsm(t2.xyz[:], t2.xyz[:]) + t0.p256StorePoint(&precomp, 11) // 12 + t2.p256StorePoint(&precomp, 13) // 14 + + p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:]) + p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:]) + t0.p256StorePoint(&precomp, 12) // 13 + t2.p256StorePoint(&precomp, 14) // 15 + + // Start scanning the window from top bit + index := uint(254) + var sel, sign int + + wvalue := (scalar[index/64] >> (index % 64)) & 0x3f + sel, _ = boothW5(uint(wvalue)) + + p256Select(p.xyz[0:12], precomp[0:], sel) + zero := sel + + for index > 4 { + index -= 5 + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + + if index < 192 { + wvalue = 
((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f + } else { + wvalue = (scalar[index/64] >> (index % 64)) & 0x3f + } + + sel, sign = boothW5(uint(wvalue)) + + p256Select(t0.xyz[0:], precomp[0:], sel) + p256NegCond(t0.xyz[4:8], sign) + p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:]) + p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel) + p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero) + zero |= sel + } + + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + + wvalue = (scalar[0] << 1) & 0x3f + sel, sign = boothW5(uint(wvalue)) + + p256Select(t0.xyz[0:], precomp[0:], sel) + p256NegCond(t0.xyz[4:8], sign) + p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:]) + p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel) + p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero) +} diff --git a/sm2/p256_asm_amd64.s b/sm2/p256_asm_amd64.s new file mode 100644 index 0000000..c76eda2 --- /dev/null +++ b/sm2/p256_asm_amd64.s @@ -0,0 +1,2353 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains constant-time, 64-bit assembly implementation of +// P256. The optimizations performed here are described in detail in: +// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with +// 256-bit primes" +// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x +// https://eprint.iacr.org/2013/816.pdf + +#include "textflag.h" + +#define res_ptr DI +#define x_ptr SI +#define y_ptr CX + +#define acc0 R8 +#define acc1 R9 +#define acc2 R10 +#define acc3 R11 +#define acc4 R12 +#define acc5 R13 +#define t0 R14 +#define t1 R15 + +DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff +DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 +DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff +DATA p256p<>+0x08(SB)/8, $0xffffffff00000000 +DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff +DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff +DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975 +DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123 +DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b +DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff +DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff +DATA p256one<>+0x00(SB)/8, $0x0000000000000001 +DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff +DATA p256one<>+0x10(SB)/8, $0x0000000000000000 +DATA p256one<>+0x18(SB)/8, $0x0000000100000000 +GLOBL p256const0<>(SB), 8, $8 +GLOBL p256const1<>(SB), 8, $8 +GLOBL p256p<>(SB), RODATA, $32 +GLOBL p256ordK0<>(SB), RODATA, $8 +GLOBL p256ord<>(SB), RODATA, $32 +GLOBL p256one<>(SB), RODATA, $32 + +/* ---------------------------------------*/ +// func p256LittleToBig(res []byte, in []uint64) +TEXT ·p256LittleToBig(SB),NOSPLIT,$0 + JMP ·p256BigToLittle(SB) +/* ---------------------------------------*/ +// func p256BigToLittle(res []uint64, in []byte) +TEXT ·p256BigToLittle(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + + BSWAPQ acc0 + BSWAPQ acc1 + BSWAPQ acc2 + BSWAPQ acc3 + + MOVQ acc3, (8*0)(res_ptr) + MOVQ acc2, (8*1)(res_ptr) + MOVQ acc1, (8*2)(res_ptr) + MOVQ acc0, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256MovCond(res, a, b []uint64, cond int) +// If cond 
== 0 res=b, else res=a +TEXT ·p256MovCond(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ a+24(FP), x_ptr + MOVQ b+48(FP), y_ptr + MOVQ cond+72(FP), X12 + + PXOR X13, X13 + PSHUFD $0, X12, X12 + PCMPEQL X13, X12 + + MOVOU X12, X0 + MOVOU (16*0)(x_ptr), X6 + PANDN X6, X0 + MOVOU X12, X1 + MOVOU (16*1)(x_ptr), X7 + PANDN X7, X1 + MOVOU X12, X2 + MOVOU (16*2)(x_ptr), X8 + PANDN X8, X2 + MOVOU X12, X3 + MOVOU (16*3)(x_ptr), X9 + PANDN X9, X3 + MOVOU X12, X4 + MOVOU (16*4)(x_ptr), X10 + PANDN X10, X4 + MOVOU X12, X5 + MOVOU (16*5)(x_ptr), X11 + PANDN X11, X5 + + MOVOU (16*0)(y_ptr), X6 + MOVOU (16*1)(y_ptr), X7 + MOVOU (16*2)(y_ptr), X8 + MOVOU (16*3)(y_ptr), X9 + MOVOU (16*4)(y_ptr), X10 + MOVOU (16*5)(y_ptr), X11 + + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + + MOVOU X0, (16*0)(res_ptr) + MOVOU X1, (16*1)(res_ptr) + MOVOU X2, (16*2)(res_ptr) + MOVOU X3, (16*3)(res_ptr) + MOVOU X4, (16*4)(res_ptr) + MOVOU X5, (16*5)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256NegCond(val []uint64, cond int) +TEXT ·p256NegCond(SB),NOSPLIT,$0 + MOVQ val+0(FP), res_ptr + MOVQ cond+24(FP), t0 + // acc = poly + MOVQ $-1, acc0 + MOVQ p256const0<>(SB), acc1 + MOVQ $0, acc2 + MOVQ p256const1<>(SB), acc3 + // Load the original value + MOVQ (8*0)(res_ptr), acc5 + MOVQ (8*1)(res_ptr), x_ptr + MOVQ (8*2)(res_ptr), y_ptr + MOVQ (8*3)(res_ptr), t1 + // Speculatively subtract + SUBQ acc5, acc0 + SBBQ x_ptr, acc1 + SBBQ y_ptr, acc2 + SBBQ t1, acc3 + // If condition is 0, keep original value + TESTQ t0, t0 + CMOVQEQ acc5, acc0 + CMOVQEQ x_ptr, acc1 + CMOVQEQ y_ptr, acc2 + CMOVQEQ t1, acc3 + // Store result + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256Sqr(res, in []uint64, n int) +TEXT ·p256Sqr(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + MOVQ n+48(FP), BX + +sqrLoop: + + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ t1, t1 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, t1 + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, t1 + MOVQ t1, x_ptr + // First reduction step + MOVQ acc0, AX + MOVQ acc0, t1 + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc0, acc1 + ADCQ t1, acc2 + ADCQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc0 + // Second reduction step + MOVQ acc1, AX + MOVQ acc1, t1 + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, t1 + 
ADDQ acc1, acc2 + ADCQ t1, acc3 + ADCQ AX, acc0 + ADCQ $0, DX + MOVQ DX, acc1 + // Third reduction step + MOVQ acc2, AX + MOVQ acc2, t1 + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc2, acc3 + ADCQ t1, acc0 + ADCQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + // Last reduction step + XORQ t0, t0 + MOVQ acc3, AX + MOVQ acc3, t1 + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc3, acc0 + ADCQ t1, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + // Add bits [511:256] of the sqr result + ADCQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + MOVQ acc0, acc4 + MOVQ acc1, acc5 + MOVQ acc2, y_ptr + MOVQ acc3, t1 + // Subtract p256 + SUBQ $-1, acc0 + SBBQ p256const0<>(SB) ,acc1 + SBBQ $0, acc2 + SBBQ p256const1<>(SB), acc3 + SBBQ $0, t0 + + CMOVQCS acc4, acc0 + CMOVQCS acc5, acc1 + CMOVQCS y_ptr, acc2 + CMOVQCS t1, acc3 + + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + MOVQ res_ptr, x_ptr + DECQ BX + JNE sqrLoop + + RET +/* ---------------------------------------*/ +// func p256Mul(res, in1, in2 []uint64) +TEXT ·p256Mul(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in1+24(FP), x_ptr + MOVQ in2+48(FP), y_ptr + // x * y[0] + MOVQ (8*0)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MOVQ acc0, t1 + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc0, acc1 + ADCQ t1, acc2 + ADCQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, acc5 + XORQ acc0, acc0 + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MOVQ acc1, t1 + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc1, acc2 + ADCQ t1, acc3 + ADCQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + XORQ acc1, acc1 + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MOVQ acc2, t1 + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc2, acc3 + ADCQ t1, acc4 + ADCQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + XORQ acc2, acc2 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ 
t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MOVQ acc3, t1 + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc3, acc4 + ADCQ t1, acc5 + ADCQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Copy result [255:0] + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256const0<>(SB) ,acc5 + SBBQ $0, acc0 + SBBQ p256const1<>(SB), acc1 + SBBQ $0, acc2 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256FromMont(res, in []uint64) +TEXT ·p256FromMont(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + XORQ acc4, acc4 + + // Only reduce, no multiplications are needed + // First stage + MOVQ acc0, AX + MOVQ acc0, t1 + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc0, acc1 + ADCQ t1, acc2 + ADCQ AX, acc3 + ADCQ DX, acc4 + XORQ acc5, acc5 + // Second stage + MOVQ acc1, AX + MOVQ acc1, t1 + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc1, acc2 + ADCQ t1, acc3 + ADCQ AX, acc4 + ADCQ DX, acc5 + XORQ acc0, acc0 + // Third stage + MOVQ acc2, AX + MOVQ acc2, t1 + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc2, acc3 + ADCQ t1, acc4 + ADCQ AX, acc5 + ADCQ DX, acc0 + XORQ acc1, acc1 + // Last stage + MOVQ acc3, AX + MOVQ acc3, t1 + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc3, acc4 + ADCQ t1, acc5 + ADCQ AX, acc0 + ADCQ DX, acc1 + + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + + SUBQ $-1, acc4 + SBBQ p256const0<>(SB), acc5 + SBBQ $0, acc0 + SBBQ p256const1<>(SB), acc1 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// Constant time point access to arbitrary point table. +// Indexed from 1 to 15, with -1 offset +// (index 0 is implicitly point at infinity) +// func p256Select(point, table []uint64, idx int) +TEXT ·p256Select(SB),NOSPLIT,$0 + MOVQ idx+48(FP),AX + MOVQ table+24(FP),DI + MOVQ point+0(FP),DX + + PXOR X15, X15 // X15 = 0 + PCMPEQL X14, X14 // X14 = -1 + PSUBL X14, X15 // X15 = 1 + MOVL AX, X14 + PSHUFD $0, X14, X14 + + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + PXOR X4, X4 + PXOR X5, X5 + MOVQ $16, AX + + MOVOU X15, X13 + +loop_select: + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + MOVOU (16*0)(DI), X6 + MOVOU (16*1)(DI), X7 + MOVOU (16*2)(DI), X8 + MOVOU (16*3)(DI), X9 + MOVOU (16*4)(DI), X10 + MOVOU (16*5)(DI), X11 + ADDQ $(16*6), DI + + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + + DECQ AX + JNE loop_select + + MOVOU X0, (16*0)(DX) + MOVOU X1, (16*1)(DX) + MOVOU X2, (16*2)(DX) + MOVOU X3, (16*3)(DX) + MOVOU X4, (16*4)(DX) + MOVOU X5, (16*5)(DX) + + RET +/* ---------------------------------------*/ +// Constant time point access to base point table. 
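+// The table holds 32 affine points of 8 words (x, y) each; all 32 entries are
+// scanned and masked so the access pattern does not depend on idx.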
+// func p256SelectBase(point, table []uint64, idx int) +TEXT ·p256SelectBase(SB),NOSPLIT,$0 + MOVQ idx+48(FP),AX + MOVQ table+24(FP),DI + MOVQ point+0(FP),DX + + PXOR X15, X15 // X15 = 0 + PCMPEQL X14, X14 // X14 = -1 + PSUBL X14, X15 // X15 = 1 + MOVL AX, X14 + PSHUFD $0, X14, X14 + + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + MOVQ $16, AX + + MOVOU X15, X13 + +loop_select_base: + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + MOVOU (16*0)(DI), X4 + MOVOU (16*1)(DI), X5 + MOVOU (16*2)(DI), X6 + MOVOU (16*3)(DI), X7 + + MOVOU (16*4)(DI), X8 + MOVOU (16*5)(DI), X9 + MOVOU (16*6)(DI), X10 + MOVOU (16*7)(DI), X11 + + ADDQ $(16*8), DI + + PAND X12, X4 + PAND X12, X5 + PAND X12, X6 + PAND X12, X7 + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + + PXOR X8, X0 + PXOR X9, X1 + PXOR X10, X2 + PXOR X11, X3 + + DECQ AX + JNE loop_select_base + + MOVOU X0, (16*0)(DX) + MOVOU X1, (16*1)(DX) + MOVOU X2, (16*2)(DX) + MOVOU X3, (16*3)(DX) + + RET +/* ---------------------------------------*/ +// func p256OrdMul(res, in1, in2 []uint64) +TEXT ·p256OrdMul(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in1+24(FP), x_ptr + MOVQ in2+48(FP), y_ptr + // x * y[0] + MOVQ (8*0)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, acc5 + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ 
$0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Copy result [255:0] + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + // Subtract p256 + SUBQ p256ord<>+0x00(SB), acc4 + SBBQ p256ord<>+0x08(SB) ,acc5 + SBBQ p256ord<>+0x10(SB), acc0 + SBBQ p256ord<>+0x18(SB), acc1 + SBBQ $0, acc2 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256OrdSqr(res, in []uint64, n int) +TEXT ·p256OrdSqr(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + MOVQ n+48(FP), BX + +ordSqrLoop: + + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ t1, t1 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, t1 + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, t1 + MOVQ t1, x_ptr + // First reduction step + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ 
p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + + MOVQ t0, t1 + ADCQ DX, acc2 + ADCQ $0, t1 + SUBQ t0, acc2 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc0 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc3 + ADCQ $0, acc0 + SUBQ AX, acc3 + SBBQ DX, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + + MOVQ t0, t1 + ADCQ DX, acc3 + ADCQ $0, t1 + SUBQ t0, acc3 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc1 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc0 + ADCQ $0, acc1 + SUBQ AX, acc0 + SBBQ DX, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + + MOVQ t0, t1 + ADCQ DX, acc0 + ADCQ $0, t1 + SUBQ t0, acc0 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc2 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc1 + ADCQ $0, acc2 + SUBQ AX, acc1 + SBBQ DX, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ t0, t1 + ADCQ DX, acc1 + ADCQ $0, t1 + SUBQ t0, acc1 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc3 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc2 + ADCQ $0, acc3 + SUBQ AX, acc2 + SBBQ DX, acc3 + XORQ t0, t0 + // Add bits [511:256] of the sqr result + ADCQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + MOVQ acc0, acc4 + MOVQ acc1, acc5 + MOVQ acc2, y_ptr + MOVQ acc3, t1 + // Subtract p256 + SUBQ p256ord<>+0x00(SB), acc0 + SBBQ p256ord<>+0x08(SB) ,acc1 + SBBQ p256ord<>+0x10(SB), acc2 + SBBQ p256ord<>+0x18(SB), acc3 + SBBQ $0, t0 + + CMOVQCS acc4, acc0 + CMOVQCS acc5, acc1 + CMOVQCS y_ptr, acc2 + CMOVQCS t1, acc3 + + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + MOVQ res_ptr, x_ptr + DECQ BX + JNE ordSqrLoop + + RET +/* ---------------------------------------*/ +#undef res_ptr +#undef x_ptr +#undef y_ptr + +#undef acc0 +#undef acc1 +#undef acc2 +#undef acc3 +#undef acc4 +#undef acc5 +#undef t0 +#undef t1 +/* ---------------------------------------*/ +#define mul0 AX +#define mul1 DX +#define acc0 BX +#define acc1 CX +#define acc2 R8 +#define acc3 R9 +#define acc4 R10 +#define acc5 R11 +#define acc6 R12 +#define acc7 R13 +#define t0 R14 +#define t1 R15 +#define t2 DI +#define t3 SI +#define hlp BP +/* ---------------------------------------*/ +TEXT sm2P256SubInternal(SB),NOSPLIT,$0 + XORQ mul0, mul0 + SUBQ t0, acc4 + SBBQ t1, acc5 + SBBQ t2, acc6 + SBBQ t3, acc7 + SBBQ $0, mul0 + + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + + ADDQ $-1, acc4 + ADCQ p256const0<>(SB), acc5 + ADCQ $0, acc6 + ADCQ p256const1<>(SB), acc7 + ANDQ $1, mul0 + + CMOVQEQ acc0, acc4 + CMOVQEQ acc1, acc5 + CMOVQEQ acc2, acc6 + CMOVQEQ acc3, acc7 + + RET +/* ---------------------------------------*/ +TEXT sm2P256MulInternal(SB),NOSPLIT,$0 + MOVQ acc4, mul0 + MULQ t0 + MOVQ mul0, acc0 + MOVQ mul1, acc1 + + MOVQ acc4, mul0 + MULQ t1 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, 
acc2 + + MOVQ acc4, mul0 + MULQ t2 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ t3 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc4 + + MOVQ acc5, mul0 + MULQ t0 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t1 + ADDQ hlp, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t2 + ADDQ hlp, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t3 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ acc6, mul0 + MULQ t0 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t1 + ADDQ hlp, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t2 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t3 + ADDQ hlp, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, acc6 + + MOVQ acc7, mul0 + MULQ t0 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t1 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t2 + ADDQ hlp, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t3 + ADDQ hlp, acc6 + ADCQ $0, mul1 + ADDQ mul0, acc6 + ADCQ $0, mul1 + MOVQ mul1, acc7 + // First reduction step + MOVQ acc0, mul0 + MOVQ acc0, hlp + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc0, acc1 + ADCQ hlp, acc2 + ADCQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + // Second reduction step + MOVQ acc1, mul0 + MOVQ acc1, hlp + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc1, acc2 + ADCQ hlp, acc3 + ADCQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, acc1 + // Third reduction step + MOVQ acc2, mul0 + MOVQ acc2, hlp + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc2, acc3 + ADCQ hlp, acc0 + ADCQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc2 + // Last reduction step + MOVQ acc3, mul0 + MOVQ acc3, hlp + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc3, acc0 + ADCQ hlp, acc1 + ADCQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + MOVQ $0, BP + // Add bits [511:256] of the result + ADCQ acc0, acc4 + ADCQ acc1, acc5 + ADCQ acc2, acc6 + ADCQ acc3, acc7 + ADCQ $0, hlp + // Copy result + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256const0<>(SB) ,acc5 + SBBQ $0, acc6 + SBBQ p256const1<>(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS acc0, acc4 + CMOVQCS acc1, acc5 + CMOVQCS acc2, acc6 + CMOVQCS acc3, acc7 + + RET +/* ---------------------------------------*/ +TEXT sm2P256SqrInternal(SB),NOSPLIT,$0 + + MOVQ acc4, mul0 + MULQ acc5 + MOVQ mul0, acc1 + MOVQ mul1, acc2 + + MOVQ acc4, mul0 + MULQ acc6 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ acc7 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ acc5, mul0 + MULQ acc6 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ acc7 + ADDQ hlp, t0 + ADCQ $0, mul1 + ADDQ mul0, t0 + ADCQ $0, mul1 + MOVQ mul1, t1 + + MOVQ acc6, mul0 + MULQ acc7 + ADDQ mul0, t1 + ADCQ $0, mul1 + MOVQ mul1, t2 + XORQ t3, t3 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ t0, t0 + ADCQ t1, t1 + ADCQ t2, t2 + ADCQ $0, t3 + // Missing products + MOVQ acc4, mul0 + 
MULQ mul0 + MOVQ mul0, acc0 + MOVQ DX, acc4 + + MOVQ acc5, mul0 + MULQ mul0 + ADDQ acc4, acc1 + ADCQ mul0, acc2 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc6, mul0 + MULQ mul0 + ADDQ acc4, acc3 + ADCQ mul0, t0 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc7, mul0 + MULQ mul0 + ADDQ acc4, t1 + ADCQ mul0, t2 + ADCQ DX, t3 + // First reduction step + MOVQ acc0, mul0 + MOVQ acc0, hlp + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc0, acc1 + ADCQ hlp, acc2 + ADCQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + // Second reduction step + MOVQ acc1, mul0 + MOVQ acc1, hlp + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc1, acc2 + ADCQ hlp, acc3 + ADCQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, acc1 + // Third reduction step + MOVQ acc2, mul0 + MOVQ acc2, hlp + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc2, acc3 + ADCQ hlp, acc0 + ADCQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc2 + // Last reduction step + MOVQ acc3, mul0 + MOVQ acc3, hlp + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc3, acc0 + ADCQ hlp, acc1 + ADCQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + MOVQ $0, BP + // Add bits [511:256] of the result + ADCQ acc0, t0 + ADCQ acc1, t1 + ADCQ acc2, t2 + ADCQ acc3, t3 + ADCQ $0, hlp + // Copy result + MOVQ t0, acc4 + MOVQ t1, acc5 + MOVQ t2, acc6 + MOVQ t3, acc7 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256const0<>(SB) ,acc5 + SBBQ $0, acc6 + SBBQ p256const1<>(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS t0, acc4 + CMOVQCS t1, acc5 + CMOVQCS t2, acc6 + CMOVQCS t3, acc7 + + RET +/* ---------------------------------------*/ +#define p256MulBy2Inline\ + XORQ mul0, mul0;\ + ADDQ acc4, acc4;\ + ADCQ acc5, acc5;\ + ADCQ acc6, acc6;\ + ADCQ acc7, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ $-1, t0;\ + SBBQ p256const0<>(SB), t1;\ + SBBQ $0, t2;\ + SBBQ p256const1<>(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; +/* ---------------------------------------*/ +#define p256AddInline \ + XORQ mul0, mul0;\ + ADDQ t0, acc4;\ + ADCQ t1, acc5;\ + ADCQ t2, acc6;\ + ADCQ t3, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ $-1, t0;\ + SBBQ p256const0<>(SB), t1;\ + SBBQ $0, t2;\ + SBBQ p256const1<>(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; +/* ---------------------------------------*/ +#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 +#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 +#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) +#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) +#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 +#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 +/* ---------------------------------------*/ +#define x1in(off) (32*0 + off)(SP) +#define y1in(off) (32*1 + off)(SP) +#define z1in(off) (32*2 + off)(SP) +#define x2in(off) (32*3 + off)(SP) +#define y2in(off) (32*4 + off)(SP) +#define xout(off) (32*5 + off)(SP) +#define yout(off) (32*6 + off)(SP) +#define zout(off) (32*7 + off)(SP) +#define s2(off) (32*8 + off)(SP) +#define z1sqr(off) (32*9 + off)(SP) +#define h(off) (32*10 + 
off)(SP) +#define r(off) (32*11 + off)(SP) +#define hsqr(off) (32*12 + off)(SP) +#define rsqr(off) (32*13 + off)(SP) +#define hcub(off) (32*14 + off)(SP) +#define rptr (32*15)(SP) +#define sel_save (32*15 + 8)(SP) +#define zero_save (32*15 + 8 + 4)(SP) + +// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) +TEXT ·p256PointAddAffineAsm(SB),0,$512-96 + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in1+24(FP), BX + MOVQ in2+48(FP), CX + MOVQ sign+72(FP), DX + MOVQ sel+80(FP), t1 + MOVQ zero+88(FP), t2 + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + MOVOU (16*4)(BX), X4 + MOVOU (16*5)(BX), X5 + + MOVOU X0, x1in(16*0) + MOVOU X1, x1in(16*1) + MOVOU X2, y1in(16*0) + MOVOU X3, y1in(16*1) + MOVOU X4, z1in(16*0) + MOVOU X5, z1in(16*1) + + MOVOU (16*0)(CX), X0 + MOVOU (16*1)(CX), X1 + + MOVOU X0, x2in(16*0) + MOVOU X1, x2in(16*1) + // Store pointer to result + MOVQ mul0, rptr + MOVL t1, sel_save + MOVL t2, zero_save + // Negate y2in based on sign + MOVQ (16*2 + 8*0)(CX), acc4 + MOVQ (16*2 + 8*1)(CX), acc5 + MOVQ (16*2 + 8*2)(CX), acc6 + MOVQ (16*2 + 8*3)(CX), acc7 + MOVQ $-1, acc0 + MOVQ p256const0<>(SB), acc1 + MOVQ $0, acc2 + MOVQ p256const1<>(SB), acc3 + XORQ mul0, mul0 + // Speculatively subtract + SUBQ acc4, acc0 + SBBQ acc5, acc1 + SBBQ acc6, acc2 + SBBQ acc7, acc3 + SBBQ $0, mul0 + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + MOVQ acc3, t3 + // Add in case the operand was > p256 + ADDQ $-1, acc0 + ADCQ p256const0<>(SB), acc1 + ADCQ $0, acc2 + ADCQ p256const1<>(SB), acc3 + ADCQ $0, mul0 + CMOVQNE t0, acc0 + CMOVQNE t1, acc1 + CMOVQNE t2, acc2 + CMOVQNE t3, acc3 + // If condition is 0, keep original value + TESTQ DX, DX + CMOVQEQ acc4, acc0 + CMOVQEQ acc5, acc1 + CMOVQEQ acc6, acc2 + CMOVQEQ acc7, acc3 + // Store result + MOVQ acc0, y2in(8*0) + MOVQ acc1, y2in(8*1) + MOVQ acc2, y2in(8*2) + MOVQ acc3, y2in(8*3) + // Begin point add + LDacc (z1in) + CALL sm2P256SqrInternal(SB) // z1ˆ2 + ST (z1sqr) + + LDt (x2in) + CALL sm2P256MulInternal(SB) // x2 * z1ˆ2 + + LDt (x1in) + CALL sm2P256SubInternal(SB) // h = u2 - u1 + ST (h) + + LDt (z1in) + CALL sm2P256MulInternal(SB) // z3 = h * z1 + ST (zout) + + LDacc (z1sqr) + CALL sm2P256MulInternal(SB) // z1ˆ3 + + LDt (y2in) + CALL sm2P256MulInternal(SB) // s2 = y2 * z1ˆ3 + ST (s2) + + LDt (y1in) + CALL sm2P256SubInternal(SB) // r = s2 - s1 + ST (r) + + CALL sm2P256SqrInternal(SB) // rsqr = rˆ2 + ST (rsqr) + + LDacc (h) + CALL sm2P256SqrInternal(SB) // hsqr = hˆ2 + ST (hsqr) + + LDt (h) + CALL sm2P256MulInternal(SB) // hcub = hˆ3 + ST (hcub) + + LDt (y1in) + CALL sm2P256MulInternal(SB) // y1 * hˆ3 + ST (s2) + + LDacc (x1in) + LDt (hsqr) + CALL sm2P256MulInternal(SB) // u1 * hˆ2 + ST (h) + + p256MulBy2Inline // u1 * hˆ2 * 2, inline + LDacc (rsqr) + CALL sm2P256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 + + LDt (hcub) + CALL sm2P256SubInternal(SB) + ST (xout) + + MOVQ acc4, t0 + MOVQ acc5, t1 + MOVQ acc6, t2 + MOVQ acc7, t3 + LDacc (h) + CALL sm2P256SubInternal(SB) + + LDt (r) + CALL sm2P256MulInternal(SB) + + LDt (s2) + CALL sm2P256SubInternal(SB) + ST (yout) + // Load stored values from stack + MOVQ rptr, AX + MOVL sel_save, BX + MOVL zero_save, CX + // The result is not valid if (sel == 0), conditional choose + MOVOU xout(16*0), X0 + MOVOU xout(16*1), X1 + MOVOU yout(16*0), X2 + MOVOU yout(16*1), X3 + MOVOU zout(16*0), X4 + MOVOU zout(16*1), X5 + + MOVL BX, X6 + MOVL CX, X7 + + PXOR X8, X8 + PCMPEQL X9, X9 + + PSHUFD $0, X6, X6 + PSHUFD $0, X7, X7 + + PCMPEQL 
X8, X6 + PCMPEQL X8, X7 + + MOVOU X6, X15 + PANDN X9, X15 + + MOVOU x1in(16*0), X9 + MOVOU x1in(16*1), X10 + MOVOU y1in(16*0), X11 + MOVOU y1in(16*1), X12 + MOVOU z1in(16*0), X13 + MOVOU z1in(16*1), X14 + + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + + PAND X6, X9 + PAND X6, X10 + PAND X6, X11 + PAND X6, X12 + PAND X6, X13 + PAND X6, X14 + + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 + // Similarly if zero == 0 + PCMPEQL X9, X9 + MOVOU X7, X15 + PANDN X9, X15 + + MOVOU x2in(16*0), X9 + MOVOU x2in(16*1), X10 + MOVOU y2in(16*0), X11 + MOVOU y2in(16*1), X12 + MOVOU p256one<>+0x00(SB), X13 + MOVOU p256one<>+0x10(SB), X14 + + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + + PAND X7, X9 + PAND X7, X10 + PAND X7, X11 + PAND X7, X12 + PAND X7, X13 + PAND X7, X14 + + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 + // Finally output the result + MOVOU X0, (16*0)(AX) + MOVOU X1, (16*1)(AX) + MOVOU X2, (16*2)(AX) + MOVOU X3, (16*3)(AX) + MOVOU X4, (16*4)(AX) + MOVOU X5, (16*5)(AX) + MOVQ $0, rptr + + RET +#undef x1in +#undef y1in +#undef z1in +#undef x2in +#undef y2in +#undef xout +#undef yout +#undef zout +#undef s2 +#undef z1sqr +#undef h +#undef r +#undef hsqr +#undef rsqr +#undef hcub +#undef rptr +#undef sel_save +#undef zero_save + +// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero +// otherwise. It writes to [acc4..acc7], t0 and t1. +TEXT sm2P256IsZero(SB),NOSPLIT,$0 + // AX contains a flag that is set if the input is zero. + XORQ AX, AX + MOVQ $1, t1 + + // Check whether [acc4..acc7] are all zero. + MOVQ acc4, t0 + ORQ acc5, t0 + ORQ acc6, t0 + ORQ acc7, t0 + + // Set the zero flag if so. (CMOV of a constant to a register doesn't + // appear to be supported in Go. Thus t1 = 1.) + CMOVQEQ t1, AX + + // XOR [acc4..acc7] with P and compare with zero again. + XORQ $-1, acc4 + XORQ p256const0<>(SB), acc5 + XORQ p256const1<>(SB), acc7 + ORQ acc5, acc4 + ORQ acc6, acc4 + ORQ acc7, acc4 + + // Set the zero flag if so. 
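	// (A value that is zero mod p is represented either by 0 or by p
	// itself, hence the second comparison.)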
+/* ---------------------------------------*/
+#define x1in(off)	(32*0 + off)(SP)
+#define y1in(off)	(32*1 + off)(SP)
+#define z1in(off)	(32*2 + off)(SP)
+#define x2in(off)	(32*3 + off)(SP)
+#define y2in(off)	(32*4 + off)(SP)
+#define z2in(off)	(32*5 + off)(SP)
+
+#define xout(off)	(32*6 + off)(SP)
+#define yout(off)	(32*7 + off)(SP)
+#define zout(off)	(32*8 + off)(SP)
+
+#define u1(off)	(32*9 + off)(SP)
+#define u2(off)	(32*10 + off)(SP)
+#define s1(off)	(32*11 + off)(SP)
+#define s2(off)	(32*12 + off)(SP)
+#define z1sqr(off)	(32*13 + off)(SP)
+#define z2sqr(off)	(32*14 + off)(SP)
+#define h(off)	(32*15 + off)(SP)
+#define r(off)	(32*16 + off)(SP)
+#define hsqr(off)	(32*17 + off)(SP)
+#define rsqr(off)	(32*18 + off)(SP)
+#define hcub(off)	(32*19 + off)(SP)
+#define rptr	(32*20)(SP)
+#define points_eq	(32*20+8)(SP)
+
+// func p256PointAddAsm(res, in1, in2 []uint64) int
+TEXT ·p256PointAddAsm(SB),0,$680-80
+	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
+	// Move input to stack in order to free registers
+	MOVQ res+0(FP), AX
+	MOVQ in1+24(FP), BX
+	MOVQ in2+48(FP), CX
+
+	MOVOU (16*0)(BX), X0
+	MOVOU (16*1)(BX), X1
+	MOVOU (16*2)(BX), X2
+	MOVOU (16*3)(BX), X3
+	MOVOU (16*4)(BX), X4
+	MOVOU (16*5)(BX), X5
+
+	MOVOU X0, x1in(16*0)
+	MOVOU X1, x1in(16*1)
+	MOVOU X2, y1in(16*0)
+	MOVOU X3, y1in(16*1)
+	MOVOU X4, z1in(16*0)
+	MOVOU X5, z1in(16*1)
+
+	MOVOU (16*0)(CX), X0
+	MOVOU (16*1)(CX), X1
+	MOVOU (16*2)(CX), X2
+	MOVOU (16*3)(CX), X3
+	MOVOU (16*4)(CX), X4
+	MOVOU (16*5)(CX), X5
+
+	MOVOU X0, x2in(16*0)
+	MOVOU X1, x2in(16*1)
+	MOVOU X2, y2in(16*0)
+	MOVOU X3, y2in(16*1)
+	MOVOU X4, z2in(16*0)
+	MOVOU X5, z2in(16*1)
+	// Store pointer to result
+	MOVQ AX, rptr
+	// Begin point add
+	LDacc (z2in)
+	CALL sm2P256SqrInternal(SB)	// z2ˆ2
+	ST (z2sqr)
+	LDt (z2in)
+	CALL sm2P256MulInternal(SB)	// z2ˆ3
+	LDt (y1in)
+	CALL sm2P256MulInternal(SB)	// s1 = z2ˆ3 * y1
+	ST (s1)
+
+	LDacc (z1in)
+	CALL sm2P256SqrInternal(SB)	// z1ˆ2
+	ST (z1sqr)
+	LDt (z1in)
+	CALL sm2P256MulInternal(SB)	// z1ˆ3
+	LDt (y2in)
+	CALL sm2P256MulInternal(SB)	// s2 = z1ˆ3 * y2
+	ST (s2)
+
+	LDt (s1)
+	CALL sm2P256SubInternal(SB)	// r = s2 - s1
+	ST (r)
+	CALL sm2P256IsZero(SB)
+	MOVQ AX, points_eq
+
+	LDacc (z2sqr)
+	LDt (x1in)
+	CALL sm2P256MulInternal(SB)	// u1 = x1 * z2ˆ2
+	ST (u1)
+	LDacc (z1sqr)
+	LDt (x2in)
+	CALL sm2P256MulInternal(SB)	// u2 = x2 * z1ˆ2
+	ST (u2)
+
+	LDt (u1)
+	CALL sm2P256SubInternal(SB)	// h = u2 - u1
+	ST (h)
+	CALL sm2P256IsZero(SB)
+	ANDQ points_eq, AX
+	MOVQ AX, points_eq
+
+	LDacc (r)
+	CALL sm2P256SqrInternal(SB)	// rsqr = rˆ2
+	ST (rsqr)
+
+	LDacc (h)
+	CALL sm2P256SqrInternal(SB)	// hsqr = hˆ2
+	ST (hsqr)
+
+	LDt (h)
+	CALL sm2P256MulInternal(SB)	// hcub = hˆ3
+	ST (hcub)
+
+	LDt (s1)
+	CALL sm2P256MulInternal(SB)	// s1 * hˆ3
+	ST (s2)
+
+	LDacc (z1in)
+	LDt (z2in)
+	CALL sm2P256MulInternal(SB)	// z1 * z2
+	LDt (h)
+	CALL sm2P256MulInternal(SB)	// z3 = z1 * z2 * h
+	ST (zout)
+
+	LDacc (hsqr)
+	LDt (u1)
+	CALL sm2P256MulInternal(SB)	// u1 * hˆ2
+	ST (u2)
+
+	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
+	LDacc (rsqr)
+	CALL sm2P256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
+
+	LDt (hcub)
+	CALL sm2P256SubInternal(SB)	// x3 = rˆ2 - 2 * u1 * hˆ2 - hˆ3
+	ST (xout)
+
+	MOVQ acc4, t0
+	MOVQ acc5, t1
+	MOVQ acc6, t2
+	MOVQ acc7, t3
+	LDacc (u2)
+	CALL sm2P256SubInternal(SB)	// u1 * hˆ2 - x3
+
+	LDt (r)
+	CALL sm2P256MulInternal(SB)	// r * (u1 * hˆ2 - x3)
+
+	LDt (s2)
+	CALL sm2P256SubInternal(SB)	// y3 = r * (u1 * hˆ2 - x3) - s1 * hˆ3
+	ST (yout)
+
+	MOVOU xout(16*0), X0
+	MOVOU xout(16*1), X1
+	MOVOU yout(16*0), X2
+	MOVOU yout(16*1), X3
+	MOVOU zout(16*0), X4
+	MOVOU zout(16*1), X5
+	// Finally output the result
+	MOVQ rptr, AX
+	MOVQ $0, rptr
+	MOVOU X0, (16*0)(AX)
+	MOVOU X1, (16*1)(AX)
+	MOVOU X2, (16*2)(AX)
+	MOVOU X3, (16*3)(AX)
+	MOVOU X4, (16*4)(AX)
+	MOVOU X5, (16*5)(AX)
+
+	MOVQ points_eq, AX
+	MOVQ AX, ret+72(FP)
+
+	RET
+#undef x1in
+#undef y1in
+#undef z1in
+#undef x2in
+#undef y2in
+#undef z2in
+#undef xout
+#undef yout
+#undef zout
+#undef s1
+#undef s2
+#undef u1
+#undef u2
+#undef z1sqr
+#undef z2sqr
+#undef h
+#undef r
+#undef hsqr
+#undef rsqr
+#undef hcub
+#undef rptr
+#undef points_eq
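Because h and r both vanish when the two inputs coincide, p256PointAddAsm cannot compute P + P; it instead reports equality through its return value (the AND of the two sm2P256IsZero results), and the Go wrapper is expected to substitute a doubling. A simplified, hypothetical wrapper inside package sm2 — pointAddSafe is not part of this patch, but it uses only the stubs declared in p256_asm.go, and real callers must additionally handle the point at infinity:

// pointAddSafe adds a and b, falling back to doubling when a == b.
func pointAddSafe(res, a, b *p256Point) {
	var sum, dbl p256Point
	pointsEq := p256PointAddAsm(sum.xyz[:], a.xyz[:], b.xyz[:])
	p256PointDoubleAsm(dbl.xyz[:], a.xyz[:])
	// p256MovCond: if cond == 0 res <- b; else res <- a.
	// When the inputs were equal, take the doubling instead of the bogus sum.
	p256MovCond(res.xyz[:], dbl.xyz[:], sum.xyz[:], pointsEq)
}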
+/* ---------------------------------------*/
+#define x(off)	(32*0 + off)(SP)
+#define y(off)	(32*1 + off)(SP)
+#define z(off)	(32*2 + off)(SP)
+
+#define s(off)	(32*3 + off)(SP)
+#define m(off)	(32*4 + off)(SP)
+#define zsqr(off)	(32*5 + off)(SP)
+#define tmp(off)	(32*6 + off)(SP)
+#define rptr	(32*7)(SP)
+
+// func p256PointDoubleAsm(res, in []uint64)
+TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
+	// Move input to stack in order to free registers
+	MOVQ res+0(FP), AX
+	MOVQ in+24(FP), BX
+
+	MOVOU (16*0)(BX), X0
+	MOVOU (16*1)(BX), X1
+	MOVOU (16*2)(BX), X2
+	MOVOU (16*3)(BX), X3
+	MOVOU (16*4)(BX), X4
+	MOVOU (16*5)(BX), X5
+
+	MOVOU X0, x(16*0)
+	MOVOU X1, x(16*1)
+	MOVOU X2, y(16*0)
+	MOVOU X3, y(16*1)
+	MOVOU X4, z(16*0)
+	MOVOU X5, z(16*1)
+	// Store pointer to result
+	MOVQ AX, rptr
+	// Begin point double
+	LDacc (z)
+	CALL sm2P256SqrInternal(SB)	// zˆ2
+	ST (zsqr)
+
+	LDt (x)
+	p256AddInline	// x + zˆ2
+	STt (m)
+
+	LDacc (z)
+	LDt (y)
+	CALL sm2P256MulInternal(SB)	// y * z
+	p256MulBy2Inline	// z3 = 2 * y * z
+	MOVQ rptr, AX
+	// Store z
+	MOVQ t0, (16*4 + 8*0)(AX)
+	MOVQ t1, (16*4 + 8*1)(AX)
+	MOVQ t2, (16*4 + 8*2)(AX)
+	MOVQ t3, (16*4 + 8*3)(AX)
+
+	LDacc (x)
+	LDt (zsqr)
+	CALL sm2P256SubInternal(SB)	// x - zˆ2
+	LDt (m)
+	CALL sm2P256MulInternal(SB)	// (x - zˆ2) * (x + zˆ2)
+	ST (m)
+	// Multiply by 3
+	p256MulBy2Inline
+	LDacc (m)
+	p256AddInline	// m = 3 * (x - zˆ2) * (x + zˆ2)
+	STt (m)
+	////////////////////////
+	LDacc (y)
+	p256MulBy2Inline
+	t2acc
+	CALL sm2P256SqrInternal(SB)	// 4 * yˆ2
+	ST (s)
+	CALL sm2P256SqrInternal(SB)	// 16 * yˆ4
+	// Divide by 2
+	XORQ mul0, mul0
+	MOVQ acc4, t0
+	MOVQ acc5, t1
+	MOVQ acc6, t2
+	MOVQ acc7, t3
+
+	// Add p; limb 2 of the SM2 prime is all ones, unlike NIST P-256
+	ADDQ $-1, acc4
+	ADCQ p256const0<>(SB), acc5
+	ADCQ $-1, acc6
+	ADCQ p256const1<>(SB), acc7
+	ADCQ $0, mul0
+	TESTQ $1, t0
+
+	CMOVQEQ t0, acc4
+	CMOVQEQ t1, acc5
+	CMOVQEQ t2, acc6
+	CMOVQEQ t3, acc7
+	ANDQ t0, mul0
+
+	SHRQ $1, acc5, acc4
+	SHRQ $1, acc6, acc5
+	SHRQ $1, acc7, acc6
+	SHRQ $1, mul0, acc7
+	ST (y)	// 8 * yˆ4
+	/////////////////////////
+	LDacc (x)
+	LDt (s)
+	CALL sm2P256MulInternal(SB)	// s = 4 * x * yˆ2
+	ST (s)
+	p256MulBy2Inline	// 2 * s
+	STt (tmp)
+
+	LDacc (m)
+	CALL sm2P256SqrInternal(SB)	// mˆ2
+	LDt (tmp)
+	CALL sm2P256SubInternal(SB)	// x3 = mˆ2 - 2 * s
+
+	MOVQ rptr, AX
+	// Store x
+	MOVQ acc4, (16*0 + 8*0)(AX)
+	MOVQ acc5, (16*0 + 8*1)(AX)
+	MOVQ acc6, (16*0 + 8*2)(AX)
+	MOVQ acc7, (16*0 + 8*3)(AX)
+
+	acc2t
+	LDacc (s)
+	CALL sm2P256SubInternal(SB)	// s - x3
+
+	LDt (m)
+	CALL sm2P256MulInternal(SB)	// m * (s - x3)
+
+	LDt (y)
+	CALL sm2P256SubInternal(SB)	// y3 = m * (s - x3) - 8 * yˆ4
+	MOVQ rptr, AX
+	// Store y
+	MOVQ acc4, (16*2 + 8*0)(AX)
+	MOVQ acc5, (16*2 + 8*1)(AX)
+	MOVQ acc6, (16*2 + 8*2)(AX)
+	MOVQ acc7, (16*2 + 8*3)(AX)
+	///////////////////////
+	MOVQ $0, rptr
+
+	RET
+/* ---------------------------------------*/
+
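The "Divide by 2" block in the doubling routine uses a classic trick: since p is odd, y/2 mod p is simply y >> 1 when y is even, and (y + p) >> 1 (a 257-bit shift) when y is odd. A branch-free Go sketch of the same computation, with math/bits standing in for the ADC/SHRD instructions; the names here are illustrative, not from the patch:

package sketch

import "math/bits"

// sm2P holds the little-endian 64-bit limbs of the SM2 prime.
var sm2P = [4]uint64{
	0xffffffffffffffff, 0xffffffff00000000,
	0xffffffffffffffff, 0xfffffffeffffffff,
}

// halfModP returns y/2 mod p for 0 <= y < p without branching on y.
func halfModP(y [4]uint64) [4]uint64 {
	// t = y + p, keeping the carry-out as the 257th bit.
	var t [4]uint64
	var c uint64
	t[0], c = bits.Add64(y[0], sm2P[0], 0)
	t[1], c = bits.Add64(y[1], sm2P[1], c)
	t[2], c = bits.Add64(y[2], sm2P[2], c)
	t[3], c = bits.Add64(y[3], sm2P[3], c)
	// Keep y itself when it is even (the CMOVQEQ step in the assembly).
	mask := -(y[0] & 1) // all ones if y is odd, zero if even
	for i := range t {
		t[i] = (t[i] & mask) | (y[i] &^ mask)
	}
	c &= mask // the carry only counts when p was really added (ANDQ t0, mul0)
	// Shift the 257-bit value [c, t[3] .. t[0]] right by one (the SHRQ chain).
	t[0] = t[0]>>1 | t[1]<<63
	t[1] = t[1]>>1 | t[2]<<63
	t[2] = t[2]>>1 | t[3]<<63
	t[3] = t[3]>>1 | c<<63
	return t
}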
diff --git a/sm2/p256_generic.go b/sm2/p256_generic.go
new file mode 100644
index 0000000..b805078
--- /dev/null
+++ b/sm2/p256_generic.go
@@ -0,0 +1,12 @@
+// +build !amd64
+
+package sm2
+
+var (
+	p256 p256Curve
+)
+
+func initP256Arch() {
+	// Use the pure Go implementation.
+	p256 = p256Curve{p256Params}
+}
diff --git a/sm2/sm2.go b/sm2/sm2.go
index 4a7b450..069b460 100644
--- a/sm2/sm2.go
+++ b/sm2/sm2.go
@@ -14,6 +14,7 @@ import (
 	"io"
 	"math/big"
 	"strings"
+	"sync"
 
 	"github.com/emmansun/gmsm/sm3"
 )
@@ -58,8 +59,18 @@ func (priv *PrivateKey) Sign(rand io.Reader, digest []byte, opts crypto.SignerOp
 	return asn1.Marshal(ecdsaSignature{r, s})
 }
 
+var (
+	one      = new(big.Int).SetInt64(1)
+	initonce sync.Once
+)
+
+// P256 returns the sm2p256v1 curve singleton, initializing it on first use.
+func P256() elliptic.Curve {
+	initonce.Do(initP256)
+	return p256
+}
+
 ///////////////// below code ship from golan crypto/ecdsa ////////////////////
-var one = new(big.Int).SetInt64(1)
 
 // randFieldElement returns a random element of the field underlying the given
 // curve using the procedure given in [NSA] A.2.1.
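With initP256Arch dispatching per GOARCH, callers keep using the single P256 entry point and get the amd64-accelerated curve when available, the generic fallback otherwise. A minimal usage sketch, assuming this repository's module path; any elliptic.Curve consumer such as crypto/ecdsa works unchanged:

package main

import (
	"crypto/ecdsa"
	"crypto/rand"
	"fmt"

	"github.com/emmansun/gmsm/sm2"
)

func main() {
	curve := sm2.P256() // one-time initialization behind sync.Once
	priv, err := ecdsa.GenerateKey(curve, rand.Reader)
	if err != nil {
		panic(err)
	}
	fmt.Println(curve.Params().Name, priv.PublicKey.X.BitLen())
}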