mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-27 04:36:19 +08:00
MAGIC - optimize p256Sqr, avoid mul
This commit is contained in:
parent
c0199ac104
commit
b97c484b85
@ -1,7 +1,3 @@
|
|||||||
// Copyright 2015 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// This file contains constant-time, 64-bit assembly implementation of
|
// This file contains constant-time, 64-bit assembly implementation of
|
||||||
// P256. The optimizations performed here are described in detail in:
|
// P256. The optimizations performed here are described in detail in:
|
||||||
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
|
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
|
||||||
@ -24,8 +20,6 @@
|
|||||||
#define t0 R14
|
#define t0 R14
|
||||||
#define t1 R15
|
#define t1 R15
|
||||||
|
|
||||||
DATA p256const0<>+0x00(SB)/8, $0xffffffff00000000
|
|
||||||
DATA p256const1<>+0x00(SB)/8, $0xfffffffeffffffff
|
|
||||||
DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
|
DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
|
||||||
DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
|
DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
|
||||||
DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
|
DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
|
||||||
@ -39,8 +33,6 @@ DATA p256one<>+0x00(SB)/8, $0x0000000000000001
|
|||||||
DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
|
DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
|
||||||
DATA p256one<>+0x10(SB)/8, $0x0000000000000000
|
DATA p256one<>+0x10(SB)/8, $0x0000000000000000
|
||||||
DATA p256one<>+0x18(SB)/8, $0x0000000100000000
|
DATA p256one<>+0x18(SB)/8, $0x0000000100000000
|
||||||
GLOBL p256const0<>(SB), 8, $8
|
|
||||||
GLOBL p256const1<>(SB), 8, $8
|
|
||||||
GLOBL p256p<>(SB), RODATA, $32
|
GLOBL p256p<>(SB), RODATA, $32
|
||||||
GLOBL p256ordK0<>(SB), RODATA, $8
|
GLOBL p256ordK0<>(SB), RODATA, $8
|
||||||
GLOBL p256ord<>(SB), RODATA, $32
|
GLOBL p256ord<>(SB), RODATA, $32
|
||||||
@ -254,94 +246,82 @@ sqrLoop:
|
|||||||
ADCQ DX, t1
|
ADCQ DX, t1
|
||||||
MOVQ t1, x_ptr
|
MOVQ t1, x_ptr
|
||||||
// First reduction step
|
// First reduction step
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ acc0, t1
|
||||||
MULQ acc0
|
|
||||||
ADDQ acc0, acc1
|
MOVQ t1, AX
|
||||||
ADCQ $0, DX
|
MOVQ t1, DX
|
||||||
ADDQ AX, acc1
|
SHLQ $32, AX
|
||||||
ADCQ $0, DX
|
SHRQ $32, DX
|
||||||
MOVQ DX, t1
|
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
|
||||||
MULQ acc0
|
|
||||||
ADDQ t1, acc2
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc2
|
|
||||||
ADCQ $0, DX
|
|
||||||
MOVQ DX, t1
|
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
|
||||||
MULQ acc0
|
|
||||||
ADDQ t1, acc3
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ $0, DX
|
|
||||||
MOVQ DX, acc0
|
|
||||||
// Second reduction step
|
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
|
||||||
MULQ acc1
|
|
||||||
ADDQ acc1, acc2
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc2
|
|
||||||
ADCQ $0, DX
|
|
||||||
MOVQ DX, t1
|
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
|
||||||
MULQ acc1
|
|
||||||
ADDQ t1, acc3
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ $0, DX
|
|
||||||
MOVQ DX, t1
|
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
|
||||||
MULQ acc1
|
|
||||||
ADDQ t1, acc0
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc0
|
|
||||||
ADCQ $0, DX
|
|
||||||
MOVQ DX, acc1
|
|
||||||
// Third reduction step
|
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
|
||||||
MULQ acc2
|
|
||||||
ADDQ acc2, acc3
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ $0, DX
|
|
||||||
MOVQ DX, t1
|
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
|
||||||
MULQ acc2
|
|
||||||
ADDQ t1, acc0
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc0
|
|
||||||
ADCQ $0, DX
|
|
||||||
MOVQ DX, t1
|
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
|
||||||
MULQ acc2
|
|
||||||
ADDQ t1, acc1
|
ADDQ t1, acc1
|
||||||
ADCQ $0, DX
|
ADCQ $0, acc2
|
||||||
ADDQ AX, acc1
|
SUBQ AX, acc1
|
||||||
ADCQ $0, DX
|
SBBQ DX, acc2
|
||||||
MOVQ DX, acc2
|
|
||||||
|
ADDQ t1, acc3
|
||||||
|
ADCQ $0, acc0
|
||||||
|
SUBQ AX, acc3
|
||||||
|
SBBQ DX, acc0
|
||||||
|
SUBQ t1, acc3
|
||||||
|
SBBQ $0, acc0
|
||||||
|
// Second reduction step
|
||||||
|
MOVQ acc1, t1
|
||||||
|
|
||||||
|
MOVQ t1, AX
|
||||||
|
MOVQ t1, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
ADDQ t1, acc2
|
||||||
|
ADCQ $0, acc3
|
||||||
|
SUBQ AX, acc2
|
||||||
|
SBBQ DX, acc3
|
||||||
|
|
||||||
|
ADDQ t1, acc0
|
||||||
|
ADCQ $0, acc1
|
||||||
|
SUBQ AX, acc0
|
||||||
|
SBBQ DX, acc1
|
||||||
|
SUBQ t1, acc0
|
||||||
|
SBBQ $0, acc1
|
||||||
|
// Third reduction step
|
||||||
|
MOVQ acc2, t1
|
||||||
|
|
||||||
|
MOVQ t1, AX
|
||||||
|
MOVQ t1, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
ADDQ t1, acc3
|
||||||
|
ADCQ $0, acc0
|
||||||
|
SUBQ AX, acc3
|
||||||
|
SBBQ DX, acc0
|
||||||
|
|
||||||
|
ADDQ t1, acc1
|
||||||
|
ADCQ $0, acc2
|
||||||
|
SUBQ AX, acc1
|
||||||
|
SBBQ DX, acc2
|
||||||
|
SUBQ t1, acc1
|
||||||
|
SBBQ $0, acc2
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
XORQ t0, t0
|
XORQ t0, t0
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ acc3, t1
|
||||||
MULQ acc3
|
|
||||||
ADDQ acc3, acc0
|
MOVQ t1, AX
|
||||||
ADCQ $0, DX
|
MOVQ t1, DX
|
||||||
ADDQ AX, acc0
|
SHLQ $32, AX
|
||||||
ADCQ $0, DX
|
SHRQ $32, DX
|
||||||
MOVQ DX, t1
|
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
ADDQ t1, acc0
|
||||||
MULQ acc3
|
ADCQ $0, acc1
|
||||||
ADDQ t1, acc1
|
SUBQ AX, acc0
|
||||||
ADCQ $0, DX
|
SBBQ DX, acc1
|
||||||
ADDQ AX, acc1
|
|
||||||
ADCQ $0, DX
|
|
||||||
MOVQ DX, t1
|
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
|
||||||
MULQ acc3
|
|
||||||
ADDQ t1, acc2
|
ADDQ t1, acc2
|
||||||
ADCQ $0, DX
|
ADCQ $0, acc3
|
||||||
ADDQ AX, acc2
|
SUBQ AX, acc2
|
||||||
ADCQ $0, DX
|
SBBQ DX, acc3
|
||||||
MOVQ DX, acc3
|
SUBQ t1, acc2
|
||||||
|
SBBQ $0, acc3
|
||||||
|
|
||||||
// Add bits [511:256] of the sqr result
|
// Add bits [511:256] of the sqr result
|
||||||
ADCQ acc4, acc0
|
ADCQ acc4, acc0
|
||||||
@ -356,9 +336,9 @@ sqrLoop:
|
|||||||
MOVQ acc3, t1
|
MOVQ acc3, t1
|
||||||
// Subtract p256
|
// Subtract p256
|
||||||
SUBQ $-1, acc0
|
SUBQ $-1, acc0
|
||||||
SBBQ p256const0<>(SB) ,acc1
|
SBBQ p256p<>+0x08(SB), acc1
|
||||||
SBBQ $-1, acc2
|
SBBQ $-1, acc2
|
||||||
SBBQ p256const1<>(SB), acc3
|
SBBQ p256p<>+0x018(SB), acc3
|
||||||
SBBQ $0, t0
|
SBBQ $0, t0
|
||||||
|
|
||||||
CMOVQCS acc4, acc0
|
CMOVQCS acc4, acc0
|
||||||
@ -408,22 +388,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
MOVQ DX, acc4
|
MOVQ DX, acc4
|
||||||
XORQ acc5, acc5
|
XORQ acc5, acc5
|
||||||
// First reduction step
|
// First reduction step
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ p256p<>+0x08(SB), AX
|
||||||
MULQ acc0
|
MULQ acc0
|
||||||
ADDQ acc0, acc1
|
ADDQ acc0, acc1
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc1
|
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
ADDQ AX, acc1
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
ADCQ $0, DX
|
||||||
MULQ acc0
|
MOVQ DX, t1
|
||||||
|
MOVQ p256p<>+0x010(SB), AX
|
||||||
|
MULQ acc0
|
||||||
ADDQ t1, acc2
|
ADDQ t1, acc2
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc2
|
ADDQ AX, acc2
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
MOVQ p256p<>+0x018(SB), AX
|
||||||
MULQ acc0
|
MULQ acc0
|
||||||
ADDQ t1, acc3
|
ADDQ t1, acc3
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc3
|
ADDQ AX, acc3
|
||||||
@ -464,22 +444,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
ADCQ DX, acc5
|
ADCQ DX, acc5
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ p256p<>+0x08(SB), AX
|
||||||
MULQ acc1
|
MULQ acc1
|
||||||
ADDQ acc1, acc2
|
ADDQ acc1, acc2
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc2
|
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
ADDQ AX, acc2
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
ADCQ $0, DX
|
||||||
MULQ acc1
|
MOVQ DX, t1
|
||||||
|
MOVQ p256p<>+0x010(SB), AX
|
||||||
|
MULQ acc1
|
||||||
ADDQ t1, acc3
|
ADDQ t1, acc3
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc3
|
ADDQ AX, acc3
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
MOVQ p256p<>+0x018(SB), AX
|
||||||
MULQ acc1
|
MULQ acc1
|
||||||
ADDQ t1, acc4
|
ADDQ t1, acc4
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc4
|
ADDQ AX, acc4
|
||||||
@ -520,22 +500,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
ADCQ DX, acc0
|
ADCQ DX, acc0
|
||||||
ADCQ $0, acc1
|
ADCQ $0, acc1
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ p256p<>+0x08(SB), AX
|
||||||
MULQ acc2
|
MULQ acc2
|
||||||
ADDQ acc2, acc3
|
ADDQ acc2, acc3
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
ADDQ AX, acc3
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
ADCQ $0, DX
|
||||||
MULQ acc2
|
MOVQ DX, t1
|
||||||
|
MOVQ p256p<>+0x010(SB), AX
|
||||||
|
MULQ acc2
|
||||||
ADDQ t1, acc4
|
ADDQ t1, acc4
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc4
|
ADDQ AX, acc4
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
MOVQ p256p<>+0x018(SB), AX
|
||||||
MULQ acc2
|
MULQ acc2
|
||||||
ADDQ t1, acc5
|
ADDQ t1, acc5
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc5
|
ADDQ AX, acc5
|
||||||
@ -575,22 +555,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
ADCQ DX, acc1
|
ADCQ DX, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ p256p<>+0x08(SB), AX
|
||||||
MULQ acc3
|
MULQ acc3
|
||||||
ADDQ acc3, acc4
|
ADDQ acc3, acc4
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc4
|
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
ADDQ AX, acc4
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
ADCQ $0, DX
|
||||||
MULQ acc3
|
MOVQ DX, t1
|
||||||
|
MOVQ p256p<>+0x010(SB), AX
|
||||||
|
MULQ acc3
|
||||||
ADDQ t1, acc5
|
ADDQ t1, acc5
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc5
|
ADDQ AX, acc5
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
MOVQ p256p<>+0x018(SB), AX
|
||||||
MULQ acc3
|
MULQ acc3
|
||||||
ADDQ t1, acc0
|
ADDQ t1, acc0
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc0
|
ADDQ AX, acc0
|
||||||
@ -603,9 +583,9 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
MOVQ acc1, t1
|
MOVQ acc1, t1
|
||||||
// Subtract p256
|
// Subtract p256
|
||||||
SUBQ $-1, acc4
|
SUBQ $-1, acc4
|
||||||
SBBQ p256const0<>(SB) ,acc5
|
SBBQ p256p<>+0x08(SB), acc5
|
||||||
SBBQ $-1, acc0
|
SBBQ $-1, acc0
|
||||||
SBBQ p256const1<>(SB), acc1
|
SBBQ p256p<>+0x018(SB), acc1
|
||||||
SBBQ $0, acc2
|
SBBQ $0, acc2
|
||||||
|
|
||||||
CMOVQCS x_ptr, acc4
|
CMOVQCS x_ptr, acc4
|
||||||
@ -633,22 +613,22 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
// Only reduce, no multiplications are needed
|
// Only reduce, no multiplications are needed
|
||||||
// First stage
|
// First stage
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ p256p<>+0x08(SB), AX
|
||||||
MULQ acc0
|
MULQ acc0
|
||||||
ADDQ acc0, acc1
|
ADDQ acc0, acc1
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc1
|
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
ADDQ AX, acc1
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
ADCQ $0, DX
|
||||||
MULQ acc0
|
MOVQ DX, t1
|
||||||
|
MOVQ p256p<>+0x010(SB), AX
|
||||||
|
MULQ acc0
|
||||||
ADDQ t1, acc2
|
ADDQ t1, acc2
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc2
|
ADDQ AX, acc2
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
MOVQ p256p<>+0x018(SB), AX
|
||||||
MULQ acc0
|
MULQ acc0
|
||||||
ADDQ t1, acc3
|
ADDQ t1, acc3
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc3
|
ADDQ AX, acc3
|
||||||
@ -656,66 +636,66 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
|||||||
XORQ acc5, acc5
|
XORQ acc5, acc5
|
||||||
|
|
||||||
// Second stage
|
// Second stage
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ p256p<>+0x08(SB), AX
|
||||||
MULQ acc1
|
MULQ acc1
|
||||||
ADDQ acc1, acc2
|
ADDQ acc1, acc2
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc2
|
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
ADDQ AX, acc2
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
ADCQ $0, DX
|
||||||
MULQ acc1
|
MOVQ DX, t1
|
||||||
|
MOVQ p256p<>+0x010(SB), AX
|
||||||
|
MULQ acc1
|
||||||
ADDQ t1, acc3
|
ADDQ t1, acc3
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc3
|
ADDQ AX, acc3
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
MOVQ p256p<>+0x018(SB), AX
|
||||||
MULQ acc1
|
MULQ acc1
|
||||||
ADDQ t1, acc4
|
ADDQ t1, acc4
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc4
|
ADDQ AX, acc4
|
||||||
ADCQ DX, acc5
|
ADCQ DX, acc5
|
||||||
XORQ acc0, acc0
|
XORQ acc0, acc0
|
||||||
// Third stage
|
// Third stage
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ p256p<>+0x08(SB), AX
|
||||||
MULQ acc2
|
MULQ acc2
|
||||||
ADDQ acc2, acc3
|
ADDQ acc2, acc3
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
ADDQ AX, acc3
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
ADCQ $0, DX
|
||||||
MULQ acc2
|
MOVQ DX, t1
|
||||||
|
MOVQ p256p<>+0x010(SB), AX
|
||||||
|
MULQ acc2
|
||||||
ADDQ t1, acc4
|
ADDQ t1, acc4
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc4
|
ADDQ AX, acc4
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
MOVQ p256p<>+0x018(SB), AX
|
||||||
MULQ acc2
|
MULQ acc2
|
||||||
ADDQ t1, acc5
|
ADDQ t1, acc5
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc5
|
ADDQ AX, acc5
|
||||||
ADCQ DX, acc0
|
ADCQ DX, acc0
|
||||||
XORQ acc1, acc1
|
XORQ acc1, acc1
|
||||||
// Last stage
|
// Last stage
|
||||||
MOVQ p256p<>+0x08(SB), AX
|
MOVQ p256p<>+0x08(SB), AX
|
||||||
MULQ acc3
|
MULQ acc3
|
||||||
ADDQ acc3, acc4
|
ADDQ acc3, acc4
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc4
|
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
ADDQ AX, acc4
|
||||||
MOVQ p256p<>+0x010(SB), AX
|
ADCQ $0, DX
|
||||||
MULQ acc3
|
MOVQ DX, t1
|
||||||
|
MOVQ p256p<>+0x010(SB), AX
|
||||||
|
MULQ acc3
|
||||||
ADDQ t1, acc5
|
ADDQ t1, acc5
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc5
|
ADDQ AX, acc5
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ p256p<>+0x018(SB), AX
|
MOVQ p256p<>+0x018(SB), AX
|
||||||
MULQ acc3
|
MULQ acc3
|
||||||
ADDQ t1, acc0
|
ADDQ t1, acc0
|
||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
ADDQ AX, acc0
|
ADDQ AX, acc0
|
||||||
@ -727,9 +707,9 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
|||||||
MOVQ acc1, t1
|
MOVQ acc1, t1
|
||||||
|
|
||||||
SUBQ $-1, acc4
|
SUBQ $-1, acc4
|
||||||
SBBQ p256const0<>(SB), acc5
|
SBBQ p256p<>+0x08(SB), acc5
|
||||||
SBBQ $-1, acc0
|
SBBQ $-1, acc0
|
||||||
SBBQ p256const1<>(SB), acc1
|
SBBQ p256p<>+0x018(SB), acc1
|
||||||
|
|
||||||
CMOVQCS x_ptr, acc4
|
CMOVQCS x_ptr, acc4
|
||||||
CMOVQCS acc3, acc5
|
CMOVQCS acc3, acc5
|
||||||
@ -1473,9 +1453,9 @@ TEXT sm2P256SubInternal(SB),NOSPLIT,$0
|
|||||||
MOVQ acc7, acc3
|
MOVQ acc7, acc3
|
||||||
|
|
||||||
ADDQ $-1, acc4
|
ADDQ $-1, acc4
|
||||||
ADCQ p256const0<>(SB), acc5
|
ADCQ p256p<>+0x08(SB), acc5
|
||||||
ADCQ $-1, acc6
|
ADCQ $-1, acc6
|
||||||
ADCQ p256const1<>(SB), acc7
|
ADCQ p256p<>+0x018(SB), acc7
|
||||||
ANDQ $1, mul0
|
ANDQ $1, mul0
|
||||||
|
|
||||||
CMOVQEQ acc0, acc4
|
CMOVQEQ acc0, acc4
|
||||||
@ -1599,93 +1579,81 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$0
|
|||||||
ADCQ $0, mul1
|
ADCQ $0, mul1
|
||||||
MOVQ mul1, acc7
|
MOVQ mul1, acc7
|
||||||
// First reduction step
|
// First reduction step
|
||||||
MOVQ p256p<>+0x08(SB), mul0
|
MOVQ acc0, hlp
|
||||||
MULQ acc0
|
|
||||||
ADDQ acc0, acc1
|
MOVQ hlp, mul0
|
||||||
ADCQ $0, mul1
|
MOVQ hlp, mul1
|
||||||
ADDQ mul0, acc1
|
SHLQ $32, mul0
|
||||||
ADCQ $0, mul1
|
SHRQ $32, mul1
|
||||||
MOVQ mul1, hlp
|
|
||||||
MOVQ p256p<>+0x010(SB), mul0
|
ADDQ hlp, acc1
|
||||||
MULQ acc0
|
ADCQ $0, acc2
|
||||||
ADDQ hlp, acc2
|
SUBQ mul0, acc1
|
||||||
ADCQ $0, mul1
|
SBBQ mul1, acc2
|
||||||
ADDQ mul0, acc2
|
|
||||||
ADCQ $0, mul1
|
|
||||||
MOVQ mul1, hlp
|
|
||||||
MOVQ p256p<>+0x018(SB), mul0
|
|
||||||
MULQ acc0
|
|
||||||
ADDQ hlp, acc3
|
ADDQ hlp, acc3
|
||||||
ADCQ $0, mul1
|
ADCQ $0, acc0
|
||||||
ADDQ mul0, acc3
|
SUBQ mul0, acc3
|
||||||
ADCQ $0, mul1
|
SBBQ mul1, acc0
|
||||||
MOVQ mul1, acc0
|
SUBQ hlp, acc3
|
||||||
|
SBBQ $0, acc0
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ p256p<>+0x08(SB), mul0
|
MOVQ acc1, hlp
|
||||||
MULQ acc1
|
|
||||||
ADDQ acc1, acc2
|
MOVQ hlp, mul0
|
||||||
ADCQ $0, mul1
|
MOVQ hlp, mul1
|
||||||
ADDQ mul0, acc2
|
SHLQ $32, mul0
|
||||||
ADCQ $0, mul1
|
SHRQ $32, mul1
|
||||||
MOVQ mul1, hlp
|
|
||||||
MOVQ p256p<>+0x010(SB), mul0
|
|
||||||
MULQ acc1
|
|
||||||
ADDQ hlp, acc3
|
|
||||||
ADCQ $0, mul1
|
|
||||||
ADDQ mul0, acc3
|
|
||||||
ADCQ $0, mul1
|
|
||||||
MOVQ mul1, hlp
|
|
||||||
MOVQ p256p<>+0x018(SB), mul0
|
|
||||||
MULQ acc1
|
|
||||||
ADDQ hlp, acc0
|
|
||||||
ADCQ $0, mul1
|
|
||||||
ADDQ mul0, acc0
|
|
||||||
ADCQ $0, mul1
|
|
||||||
MOVQ mul1, acc1
|
|
||||||
// Third reduction step
|
|
||||||
MOVQ p256p<>+0x08(SB), mul0
|
|
||||||
MULQ acc2
|
|
||||||
ADDQ acc2, acc3
|
|
||||||
ADCQ $0, mul1
|
|
||||||
ADDQ mul0, acc3
|
|
||||||
ADCQ $0, mul1
|
|
||||||
MOVQ mul1, hlp
|
|
||||||
MOVQ p256p<>+0x010(SB), mul0
|
|
||||||
MULQ acc2
|
|
||||||
ADDQ hlp, acc0
|
|
||||||
ADCQ $0, mul1
|
|
||||||
ADDQ mul0, acc0
|
|
||||||
ADCQ $0, mul1
|
|
||||||
MOVQ mul1, hlp
|
|
||||||
MOVQ p256p<>+0x018(SB), mul0
|
|
||||||
MULQ acc2
|
|
||||||
ADDQ hlp, acc1
|
|
||||||
ADCQ $0, mul1
|
|
||||||
ADDQ mul0, acc1
|
|
||||||
ADCQ $0, mul1
|
|
||||||
MOVQ mul1, acc2
|
|
||||||
// Last reduction step
|
|
||||||
MOVQ p256p<>+0x08(SB), mul0
|
|
||||||
MULQ acc3
|
|
||||||
ADDQ acc3, acc0
|
|
||||||
ADCQ $0, mul1
|
|
||||||
ADDQ mul0, acc0
|
|
||||||
ADCQ $0, mul1
|
|
||||||
MOVQ mul1, hlp
|
|
||||||
MOVQ p256p<>+0x010(SB), mul0
|
|
||||||
MULQ acc3
|
|
||||||
ADDQ hlp, acc1
|
|
||||||
ADCQ $0, mul1
|
|
||||||
ADDQ mul0, acc1
|
|
||||||
ADCQ $0, mul1
|
|
||||||
MOVQ mul1, hlp
|
|
||||||
MOVQ p256p<>+0x018(SB), mul0
|
|
||||||
MULQ acc3
|
|
||||||
ADDQ hlp, acc2
|
ADDQ hlp, acc2
|
||||||
ADCQ $0, mul1
|
ADCQ $0, acc3
|
||||||
ADDQ mul0, acc2
|
SUBQ mul0, acc2
|
||||||
ADCQ $0, mul1
|
SBBQ mul1, acc3
|
||||||
MOVQ mul1, acc3
|
|
||||||
|
ADDQ hlp, acc0
|
||||||
|
ADCQ $0, acc1
|
||||||
|
SUBQ mul0, acc0
|
||||||
|
SBBQ mul1, acc1
|
||||||
|
SUBQ hlp, acc0
|
||||||
|
SBBQ $0, acc1
|
||||||
|
// Third reduction step
|
||||||
|
MOVQ acc2, hlp
|
||||||
|
|
||||||
|
MOVQ hlp, mul0
|
||||||
|
MOVQ hlp, mul1
|
||||||
|
SHLQ $32, mul0
|
||||||
|
SHRQ $32, mul1
|
||||||
|
|
||||||
|
ADDQ hlp, acc3
|
||||||
|
ADCQ $0, acc0
|
||||||
|
SUBQ mul0, acc3
|
||||||
|
SBBQ mul1, acc0
|
||||||
|
|
||||||
|
ADDQ hlp, acc1
|
||||||
|
ADCQ $0, acc2
|
||||||
|
SUBQ mul0, acc1
|
||||||
|
SBBQ mul1, acc2
|
||||||
|
SUBQ hlp, acc1
|
||||||
|
SBBQ $0, acc2
|
||||||
|
// Last reduction step
|
||||||
|
MOVQ acc3, hlp
|
||||||
|
|
||||||
|
MOVQ hlp, mul0
|
||||||
|
MOVQ hlp, mul1
|
||||||
|
SHLQ $32, mul0
|
||||||
|
SHRQ $32, mul1
|
||||||
|
|
||||||
|
ADDQ hlp, acc0
|
||||||
|
ADCQ $0, acc1
|
||||||
|
SUBQ mul0, acc0
|
||||||
|
SBBQ mul1, acc1
|
||||||
|
|
||||||
|
ADDQ hlp, acc2
|
||||||
|
ADCQ $0, acc3
|
||||||
|
SUBQ mul0, acc2
|
||||||
|
SBBQ mul1, acc3
|
||||||
|
SUBQ hlp, acc2
|
||||||
|
SBBQ $0, acc3
|
||||||
MOVQ $0, BP
|
MOVQ $0, BP
|
||||||
// Add bits [511:256] of the result
|
// Add bits [511:256] of the result
|
||||||
ADCQ acc0, acc4
|
ADCQ acc0, acc4
|
||||||
@ -1700,9 +1668,9 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$0
|
|||||||
MOVQ acc7, acc3
|
MOVQ acc7, acc3
|
||||||
// Subtract p256
|
// Subtract p256
|
||||||
SUBQ $-1, acc4
|
SUBQ $-1, acc4
|
||||||
SBBQ p256const0<>(SB) ,acc5
|
SBBQ p256p<>+0x08(SB), acc5
|
||||||
SBBQ $-1, acc6
|
SBBQ $-1, acc6
|
||||||
SBBQ p256const1<>(SB), acc7
|
SBBQ p256p<>+0x018(SB), acc7
|
||||||
SBBQ $0, hlp
|
SBBQ $0, hlp
|
||||||
// If the result of the subtraction is negative, restore the previous result
|
// If the result of the subtraction is negative, restore the previous result
|
||||||
CMOVQCS acc0, acc4
|
CMOVQCS acc0, acc4
|
||||||
@ -1886,9 +1854,9 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$0
|
|||||||
MOVQ t3, acc7
|
MOVQ t3, acc7
|
||||||
// Subtract p256
|
// Subtract p256
|
||||||
SUBQ $-1, acc4
|
SUBQ $-1, acc4
|
||||||
SBBQ p256const0<>(SB) ,acc5
|
SBBQ p256p<>+0x08(SB), acc5
|
||||||
SBBQ $-1, acc6
|
SBBQ $-1, acc6
|
||||||
SBBQ p256const1<>(SB), acc7
|
SBBQ p256p<>+0x018(SB), acc7
|
||||||
SBBQ $0, hlp
|
SBBQ $0, hlp
|
||||||
// If the result of the subtraction is negative, restore the previous result
|
// If the result of the subtraction is negative, restore the previous result
|
||||||
CMOVQCS t0, acc4
|
CMOVQCS t0, acc4
|
||||||
@ -1910,9 +1878,9 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$0
|
|||||||
MOVQ acc6, t2;\
|
MOVQ acc6, t2;\
|
||||||
MOVQ acc7, t3;\
|
MOVQ acc7, t3;\
|
||||||
SUBQ $-1, t0;\
|
SUBQ $-1, t0;\
|
||||||
SBBQ p256const0<>(SB), t1;\
|
SBBQ p256p<>+0x08(SB), t1;\
|
||||||
SBBQ $-1, t2;\
|
SBBQ $-1, t2;\
|
||||||
SBBQ p256const1<>(SB), t3;\
|
SBBQ p256p<>+0x018(SB), t3;\
|
||||||
SBBQ $0, mul0;\
|
SBBQ $0, mul0;\
|
||||||
CMOVQCS acc4, t0;\
|
CMOVQCS acc4, t0;\
|
||||||
CMOVQCS acc5, t1;\
|
CMOVQCS acc5, t1;\
|
||||||
@ -1931,9 +1899,9 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$0
|
|||||||
MOVQ acc6, t2;\
|
MOVQ acc6, t2;\
|
||||||
MOVQ acc7, t3;\
|
MOVQ acc7, t3;\
|
||||||
SUBQ $-1, t0;\
|
SUBQ $-1, t0;\
|
||||||
SBBQ p256const0<>(SB), t1;\
|
SBBQ p256p<>+0x08(SB), t1;\
|
||||||
SBBQ $-1, t2;\
|
SBBQ $-1, t2;\
|
||||||
SBBQ p256const1<>(SB), t3;\
|
SBBQ p256p<>+0x018(SB), t3;\
|
||||||
SBBQ $0, mul0;\
|
SBBQ $0, mul0;\
|
||||||
CMOVQCS acc4, t0;\
|
CMOVQCS acc4, t0;\
|
||||||
CMOVQCS acc5, t1;\
|
CMOVQCS acc5, t1;\
|
||||||
@ -2005,9 +1973,9 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-96
|
|||||||
MOVQ (16*2 + 8*2)(CX), acc6
|
MOVQ (16*2 + 8*2)(CX), acc6
|
||||||
MOVQ (16*2 + 8*3)(CX), acc7
|
MOVQ (16*2 + 8*3)(CX), acc7
|
||||||
MOVQ $-1, acc0
|
MOVQ $-1, acc0
|
||||||
MOVQ p256const0<>(SB), acc1
|
MOVQ p256p<>+0x08(SB), acc1
|
||||||
MOVQ $-1, acc2
|
MOVQ $-1, acc2
|
||||||
MOVQ p256const1<>(SB), acc3
|
MOVQ p256p<>+0x018(SB), acc3
|
||||||
XORQ mul0, mul0
|
XORQ mul0, mul0
|
||||||
// Speculatively subtract
|
// Speculatively subtract
|
||||||
SUBQ acc4, acc0
|
SUBQ acc4, acc0
|
||||||
@ -2021,9 +1989,9 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-96
|
|||||||
MOVQ acc3, t3
|
MOVQ acc3, t3
|
||||||
// Add in case the operand was > p256
|
// Add in case the operand was > p256
|
||||||
ADDQ $-1, acc0
|
ADDQ $-1, acc0
|
||||||
ADCQ p256const0<>(SB), acc1
|
ADCQ p256p<>+0x08(SB), acc1
|
||||||
ADCQ $-1, acc2
|
ADCQ $-1, acc2
|
||||||
ADCQ p256const1<>(SB), acc3
|
ADCQ p256p<>+0x018(SB), acc3
|
||||||
ADCQ $0, mul0
|
ADCQ $0, mul0
|
||||||
CMOVQNE t0, acc0
|
CMOVQNE t0, acc0
|
||||||
CMOVQNE t1, acc1
|
CMOVQNE t1, acc1
|
||||||
@ -2242,9 +2210,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
// XOR [acc4..acc7] with P and compare with zero again.
|
// XOR [acc4..acc7] with P and compare with zero again.
|
||||||
XORQ $-1, acc4
|
XORQ $-1, acc4
|
||||||
XORQ p256const0<>(SB), acc5
|
XORQ p256p<>+0x08(SB), acc5
|
||||||
XORQ $-1, acc6
|
XORQ $-1, acc6
|
||||||
XORQ p256const1<>(SB), acc7
|
XORQ p256p<>+0x018(SB), acc7
|
||||||
ORQ acc5, acc4
|
ORQ acc5, acc4
|
||||||
ORQ acc6, acc4
|
ORQ acc6, acc4
|
||||||
ORQ acc7, acc4
|
ORQ acc7, acc4
|
||||||
@ -2526,9 +2494,9 @@ TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
|
|||||||
MOVQ acc7, t3
|
MOVQ acc7, t3
|
||||||
|
|
||||||
ADDQ $-1, acc4
|
ADDQ $-1, acc4
|
||||||
ADCQ p256const0<>(SB), acc5
|
ADCQ p256p<>+0x08(SB), acc5
|
||||||
ADCQ $-1, acc6
|
ADCQ $-1, acc6
|
||||||
ADCQ p256const1<>(SB), acc7
|
ADCQ p256p<>+0x018(SB), acc7
|
||||||
ADCQ $0, mul0
|
ADCQ $0, mul0
|
||||||
TESTQ $1, t0
|
TESTQ $1, t0
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user