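fix error: keep all four p256p<> limbs in const0-const3 (replacing the $-1 immediates) and use y1/acc5 instead of hlp1 as reduction scratch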

This commit is contained in:
Emman 2021-12-20 16:22:05 +08:00
parent 485d6317a3
commit 799a1d3ce4

View File

@ -134,9 +134,9 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
MOVD cond+24(FP), hlp0 MOVD cond+24(FP), hlp0
MOVD a_ptr, res_ptr MOVD a_ptr, res_ptr
// acc = poly // acc = poly
MOVD $-1, acc0 MOVD p256p<>+0x00(SB), acc0
MOVD p256p<>+0x08(SB), acc1 MOVD p256p<>+0x08(SB), acc1
MOVD $-1, acc2 MOVD p256p<>+0x10(SB), acc2
MOVD p256p<>+0x18(SB), acc3 MOVD p256p<>+0x18(SB), acc3
// Load the original value // Load the original value
LDP 0*16(a_ptr), (t0, t1) LDP 0*16(a_ptr), (t0, t1)
@ -227,13 +227,13 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
MUL const3, acc0, t0 // t0 = L(acc0*p3) MUL const3, acc0, t0 // t0 = L(acc0*p3)
ADCS t0, acc3, acc3 // (carry3,acc3) = acc3 + L(acc0*p3) ADCS t0, acc3, acc3 // (carry3,acc3) = acc3 + L(acc0*p3)
UMULH const3, acc0, hlp1 // hlp1 = H(acc0*p3) UMULH const3, acc0, y1 // y1 = H(acc0*p3)
ADC $0, hlp1 ADC $0, y1
ADDS acc0, acc1, acc1 // (carry4, acc1) = acc0 + acc1 + L(acc0*p1) ADDS acc0, acc1, acc1 // (carry4, acc1) = acc0 + acc1 + L(acc0*p1)
ADCS y0, acc2, acc2 // (carry5, acc2) = carry4 + acc2 + L(acc0*p2) + H(acc0*p1) ADCS y0, acc2, acc2 // (carry5, acc2) = carry4 + acc2 + L(acc0*p2) + H(acc0*p1)
ADCS hlp0, acc3, acc3 // (carry6, acc3) = carry5 + acc3 + L(acc0*p3) + H(acc0*p2) ADCS hlp0, acc3, acc3 // (carry6, acc3) = carry5 + acc3 + L(acc0*p3) + H(acc0*p2)
ADC $0, hlp1, acc0 // acc0 = carry6 + H(acc0*p3) ADC $0, y1, acc0 // acc0 = carry6 + H(acc0*p3)
// Second reduction step // Second reduction step
MUL const1, acc1, t0 MUL const1, acc1, t0
@ -247,13 +247,13 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
MUL const3, acc1, t0 // t0 = L(acc1*p3) MUL const3, acc1, t0 // t0 = L(acc1*p3)
ADCS t0, acc0, acc0 // (carry3,acc0) = acc0 + L(acc1*p3) ADCS t0, acc0, acc0 // (carry3,acc0) = acc0 + L(acc1*p3)
UMULH const3, acc1, hlp1 // hlp1 = H(acc1*p3) UMULH const3, acc1, y1 // y1 = H(acc1*p3)
ADC $0, hlp1 ADC $0, y1
ADDS acc1, acc2, acc2 // (carry4, acc2) = acc1 + acc2 + L(acc1*p1) ADDS acc1, acc2, acc2 // (carry4, acc2) = acc1 + acc2 + L(acc1*p1)
ADCS y0, acc3, acc3 // (carry5, acc3) = carry4 + acc3 + L(acc1*p2) + H(acc1*p1) ADCS y0, acc3, acc3 // (carry5, acc3) = carry4 + acc3 + L(acc1*p2) + H(acc1*p1)
ADCS hlp0, acc0, acc0 // (carry6, acc0) = carry5 + acc0 + L(acc1*p3) + H(acc1*p2) ADCS hlp0, acc0, acc0 // (carry6, acc0) = carry5 + acc0 + L(acc1*p3) + H(acc1*p2)
ADC $0, hlp1, acc1 // acc1 = carry6 + H(acc1*p3) ADC $0, y1, acc1 // acc1 = carry6 + H(acc1*p3)
// Third reduction step // Third reduction step
MUL const1, acc2, t0 MUL const1, acc2, t0
ADDS t0, acc3, acc3 // (carry1, acc3) = acc3 + L(acc2*p1) ADDS t0, acc3, acc3 // (carry1, acc3) = acc3 + L(acc2*p1)
@ -266,13 +266,13 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
MUL const3, acc2, t0 // t0 = L(acc2*p3) MUL const3, acc2, t0 // t0 = L(acc2*p3)
ADCS t0, acc1, acc1 // (carry3,acc1) = acc1 + L(acc2*p3) ADCS t0, acc1, acc1 // (carry3,acc1) = acc1 + L(acc2*p3)
UMULH const3, acc2, hlp1 // hlp1 = H(acc2*p3) UMULH const3, acc2, y1 // y1 = H(acc2*p3)
ADC $0, hlp1 ADC $0, y1
ADDS acc2, acc3, acc3 // (carry4, acc3) = acc2 + acc3 + L(acc2*p1) ADDS acc2, acc3, acc3 // (carry4, acc3) = acc2 + acc3 + L(acc2*p1)
ADCS y0, acc0, acc0 // (carry5, acc0) = carry4 + acc0 + L(acc2*p2) + H(acc2*p1) ADCS y0, acc0, acc0 // (carry5, acc0) = carry4 + acc0 + L(acc2*p2) + H(acc2*p1)
ADCS hlp0, acc1, acc1 // (carry6, acc1) = carry5 + acc1 + L(acc2*p3) + H(acc2*p2) ADCS hlp0, acc1, acc1 // (carry6, acc1) = carry5 + acc1 + L(acc2*p3) + H(acc2*p2)
ADC $0, hlp1, acc2 // acc2 = carry6 + H(acc2*p3) ADC $0, y1, acc2 // acc2 = carry6 + H(acc2*p3)
// Last reduction step // Last reduction step
MUL const1, acc3, t0 MUL const1, acc3, t0
ADDS t0, acc0, acc0 // (carry1, acc0) = acc0 + L(acc3*p1) ADDS t0, acc0, acc0 // (carry1, acc0) = acc0 + L(acc3*p1)
@ -285,13 +285,13 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
MUL const3, acc3, t0 // t0 = L(acc3*p3) MUL const3, acc3, t0 // t0 = L(acc3*p3)
ADCS t0, acc2, acc2 // (carry3,acc2) = acc2 + L(acc3*p3) ADCS t0, acc2, acc2 // (carry3,acc2) = acc2 + L(acc3*p3)
UMULH const3, acc3, hlp1 // hlp1 = H(acc3*p3) UMULH const3, acc3, y1 // y1 = H(acc3*p3)
ADC $0, hlp1 ADC $0, y1
ADDS acc3, acc0, acc0 // (carry4, acc0) = acc3 + acc0 + L(acc3*p1) ADDS acc3, acc0, acc0 // (carry4, acc0) = acc3 + acc0 + L(acc3*p1)
ADCS y0, acc1, acc1 // (carry5, acc1) = carry4 + acc1 + L(acc3*p2) + H(acc3*p1) ADCS y0, acc1, acc1 // (carry5, acc1) = carry4 + acc1 + L(acc3*p2) + H(acc3*p1)
ADCS hlp0, acc2, acc2 // (carry6, acc2) = carry5 + acc2 + L(acc3*p3) + H(acc3*p2) ADCS hlp0, acc2, acc2 // (carry6, acc2) = carry5 + acc2 + L(acc3*p3) + H(acc3*p2)
ADC $0, hlp1, acc3 // acc3 = carry6 + H(acc3*p3) ADC $0, y1, acc3 // acc3 = carry6 + H(acc3*p3)
SUBS const0, acc0, t0 SUBS const0, acc0, t0
SBCS const1, acc1, t1 SBCS const1, acc1, t1
@ -834,10 +834,10 @@ TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
SBCS x3, y3, acc3 SBCS x3, y3, acc3
SBC $0, ZR, t0 SBC $0, ZR, t0
ADDS $-1, acc0, acc4 ADDS const0, acc0, acc4
ADCS const0, acc1, acc5 ADCS const1, acc1, acc5
ADCS $-1, acc2, acc6 ADCS const2, acc2, acc6
ADC const1, acc3, acc7 ADC const3, acc3, acc7
ANDS $1, t0 ANDS $1, t0
CSEL EQ, acc0, acc4, x0 CSEL EQ, acc0, acc4, x0
@ -906,81 +906,81 @@ TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
UMULH x3, x3, t1 UMULH x3, x3, t1
ADCS t1, acc7, acc7 ADCS t1, acc7, acc7
// First reduction step // First reduction step
MUL const0, acc0, t0 MUL const1, acc0, t0
ADDS t0, acc1, acc1 // (carry1, acc1) = acc1 + L(acc0*p1) ADDS t0, acc1, acc1 // (carry1, acc1) = acc1 + L(acc0*p1)
UMULH const0, acc0, y0 // y0 = H(acc0*p1) UMULH const1, acc0, y0 // y0 = H(acc0*p1)
MUL $-1, acc0, t0 MUL const2, acc0, t0
ADCS t0, acc2, acc2 // (carry2, acc2) = acc2 + L(acc0*p2) ADCS t0, acc2, acc2 // (carry2, acc2) = acc2 + L(acc0*p2)
UMULH $-1, acc0, hlp0 // hlp0 = H(acc0*p2) UMULH const2, acc0, hlp0 // hlp0 = H(acc0*p2)
MUL const1, acc0, t0 // t0 = L(acc0*p3) MUL const3, acc0, t0 // t0 = L(acc0*p3)
ADCS t0, acc3, acc3 // (carry3,acc3) = acc3 + L(acc0*p3) ADCS t0, acc3, acc3 // (carry3,acc3) = acc3 + L(acc0*p3)
UMULH const1, acc0, hlp1 // hlp1 = H(acc0*p3) UMULH const3, acc0, y1 // y1 = H(acc0*p3)
ADC $0, hlp1 // hlp1 = carry3 + hlp1 ADC $0, y1
ADDS acc0, acc1, acc1 // (carry4, acc1) = acc0 + acc1 + L(acc0*p1) ADDS acc0, acc1, acc1 // (carry4, acc1) = acc0 + acc1 + L(acc0*p1)
ADCS y0, acc2, acc2 // (carry5, acc2) = carry4 + acc2 + L(acc0*p2) + H(acc0*p1) ADCS y0, acc2, acc2 // (carry5, acc2) = carry4 + acc2 + L(acc0*p2) + H(acc0*p1)
ADCS hlp0, acc3, acc3 // (carry6, acc3) = carry5 + acc3 + L(acc0*p3) + H(acc0*p2) ADCS hlp0, acc3, acc3 // (carry6, acc3) = carry5 + acc3 + L(acc0*p3) + H(acc0*p2)
ADC $0, hlp1, acc0 // acc0 = carry6 + H(acc0*p3) ADC $0, y1, acc0 // acc0 = carry6 + H(acc0*p3)
// Second reduction step // Second reduction step
MUL const0, acc1, t0 MUL const1, acc1, t0
ADDS t0, acc2, acc2 // (carry1, acc2) = acc2 + L(acc1*p1) ADDS t0, acc2, acc2 // (carry1, acc2) = acc2 + L(acc1*p1)
UMULH const0, acc1, y0 // y0 = H(acc1*p1) UMULH const1, acc1, y0 // y0 = H(acc1*p1)
MUL $-1, acc1, t0 MUL const2, acc1, t0
ADCS t0, acc3, acc3 // (carry2, acc3) = acc3 + L(acc1*p2) ADCS t0, acc3, acc3 // (carry2, acc3) = acc3 + L(acc1*p2)
UMULH $-1, acc1, hlp0 // hlp0 = H(acc1*p2) UMULH const2, acc1, hlp0 // hlp0 = H(acc1*p2)
MUL const1, acc1, t0 // t0 = L(acc1*p3) MUL const3, acc1, t0 // t0 = L(acc1*p3)
ADCS t0, acc0, acc0 // (carry3,acc0) = acc0 + L(acc1*p3) ADCS t0, acc0, acc0 // (carry3,acc0) = acc0 + L(acc1*p3)
UMULH const1, acc1, hlp1 // hlp1 = H(acc1*p3) UMULH const3, acc1, y1 // y1 = H(acc1*p3)
ADC $0, hlp1 // hlp1 = carry3 + hlp1 ADC $0, y1
ADDS acc1, acc2, acc2 // (carry4, acc2) = acc1 + acc2 + L(acc1*p1) ADDS acc1, acc2, acc2 // (carry4, acc2) = acc1 + acc2 + L(acc1*p1)
ADCS y0, acc3, acc3 // (carry5, acc3) = carry4 + acc3 + L(acc1*p2) + H(acc1*p1) ADCS y0, acc3, acc3 // (carry5, acc3) = carry4 + acc3 + L(acc1*p2) + H(acc1*p1)
ADCS hlp0, acc0, acc0 // (carry6, acc0) = carry5 + acc0 + L(acc1*p3) + H(acc1*p2) ADCS hlp0, acc0, acc0 // (carry6, acc0) = carry5 + acc0 + L(acc1*p3) + H(acc1*p2)
ADC $0, hlp1, acc1 // acc1 = carry6 + H(acc1*p3) ADC $0, y1, acc1 // acc1 = carry6 + H(acc1*p3)
// Third reduction step // Third reduction step
MUL const0, acc2, t0 MUL const1, acc2, t0
ADDS t0, acc3, acc3 // (carry1, acc3) = acc3 + L(acc2*p1) ADDS t0, acc3, acc3 // (carry1, acc3) = acc3 + L(acc2*p1)
UMULH const0, acc1, y0 // y0 = H(acc2*p1) UMULH const1, acc1, y0 // y0 = H(acc2*p1)
MUL $-1, acc2, t0 MUL const2, acc2, t0
ADCS t0, acc0, acc0 // (carry2, acc0) = acc0 + L(acc2*p2) ADCS t0, acc0, acc0 // (carry2, acc0) = acc0 + L(acc2*p2)
UMULH $-1, acc2, hlp0 // hlp0 = H(acc2*p2) UMULH const2, acc2, hlp0 // hlp0 = H(acc2*p2)
MUL const1, acc2, t0 // t0 = L(acc2*p3) MUL const3, acc2, t0 // t0 = L(acc2*p3)
ADCS t0, acc1, acc1 // (carry3,acc1) = acc1 + L(acc2*p3) ADCS t0, acc1, acc1 // (carry3,acc1) = acc1 + L(acc2*p3)
UMULH const1, acc2, hlp1 // hlp1 = H(acc2*p3) UMULH const3, acc2, y1 // y1 = H(acc2*p3)
ADC $0, hlp1 // hlp1 = carry3 + hlp1 ADC $0, y1
ADDS acc2, acc3, acc3 // (carry4, acc3) = acc2 + acc3 + L(acc2*p1) ADDS acc2, acc3, acc3 // (carry4, acc3) = acc2 + acc3 + L(acc2*p1)
ADCS y0, acc0, acc0 // (carry5, acc0) = carry4 + acc0 + L(acc2*p2) + H(acc2*p1) ADCS y0, acc0, acc0 // (carry5, acc0) = carry4 + acc0 + L(acc2*p2) + H(acc2*p1)
ADCS hlp0, acc1, acc1 // (carry6, acc1) = carry5 + acc1 + L(acc2*p3) + H(acc2*p2) ADCS hlp0, acc1, acc1 // (carry6, acc1) = carry5 + acc1 + L(acc2*p3) + H(acc2*p2)
ADC $0, hlp1, acc2 // acc2 = carry6 + H(acc2*p3) ADC $0, y1, acc2 // acc2 = carry6 + H(acc2*p3)
// Last reduction step // Last reduction step
MUL const0, acc3, t0 MUL const1, acc3, t0
ADDS t0, acc0, acc0 // (carry1, acc0) = acc0 + L(acc3*p1) ADDS t0, acc0, acc0 // (carry1, acc0) = acc0 + L(acc3*p1)
UMULH const0, acc1, y0 // y0 = H(acc2*p1) UMULH const1, acc1, y0 // y0 = H(acc2*p1)
MUL $-1, acc3, t0 MUL const2, acc3, t0
ADCS t0, acc1, acc1 // (carry2, acc1) = acc1 + L(acc3*p2) ADCS t0, acc1, acc1 // (carry2, acc1) = acc1 + L(acc3*p2)
UMULH $-1, acc3, hlp0 // hlp0 = H(acc3*p2) UMULH const2, acc3, hlp0 // hlp0 = H(acc3*p2)
MUL const1, acc3, t0 // t0 = L(acc3*p3) MUL const3, acc3, t0 // t0 = L(acc3*p3)
ADCS t0, acc2, acc2 // (carry3,acc2) = acc2 + L(acc3*p3) ADCS t0, acc2, acc2 // (carry3,acc2) = acc2 + L(acc3*p3)
UMULH const1, acc3, hlp1 // hlp1 = H(acc3*p3) UMULH const3, acc3, y1 // y1 = H(acc3*p3)
ADC $0, acc7 // acc7 = carry3 + acc7 ADC $0, acc7 // acc7 = carry3 + acc7
ADDS acc3, acc0, acc0 // (carry4, acc0) = acc3 + acc0 + L(acc3*p1) ADDS acc3, acc0, acc0 // (carry4, acc0) = acc3 + acc0 + L(acc3*p1)
ADCS y0, acc1, acc1 // (carry5, acc1) = carry4 + acc1 + L(acc3*p2) + H(acc3*p1) ADCS y0, acc1, acc1 // (carry5, acc1) = carry4 + acc1 + L(acc3*p2) + H(acc3*p1)
ADCS hlp0, acc2, acc2 // (carry6, acc2) = carry5 + acc2 + L(acc3*p3) + H(acc3*p2) ADCS hlp0, acc2, acc2 // (carry6, acc2) = carry5 + acc2 + L(acc3*p3) + H(acc3*p2)
ADC $0, hlp1, acc3 // acc3 = carry6 + H(acc3*p3) ADC $0, y1, acc3 // acc3 = carry6 + H(acc3*p3)
// Add bits [511:256] of the sqr result // Add bits [511:256] of the sqr result
ADDS acc4, acc0, acc0 ADDS acc4, acc0, acc0
@ -989,10 +989,10 @@ TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
ADCS acc7, acc3, acc3 ADCS acc7, acc3, acc3
ADC $0, ZR, acc4 ADC $0, ZR, acc4
SUBS $-1, acc0, t0 SUBS const0, acc0, t0
SBCS const0, acc1, t1 SBCS const1, acc1, t1
SBCS $-1, acc2, t2 SBCS const2, acc2, t2
SBCS const1, acc3, t3 SBCS const3, acc3, t3
SBCS $0, acc4, acc4 SBCS $0, acc4, acc4
CSEL CS, t0, acc0, y0 CSEL CS, t0, acc0, y0
@ -1019,24 +1019,24 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
UMULH y0, x3, acc4 UMULH y0, x3, acc4
ADC $0, acc4 ADC $0, acc4
// First reduction step // First reduction step
MUL const0, acc0, t0 MUL const1, acc0, t0
ADDS t0, acc1, acc1 // (carry1, acc1) = acc1 + L(acc0*p1) ADDS t0, acc1, acc1 // (carry1, acc1) = acc1 + L(acc0*p1)
UMULH const0, acc0, y0 // y0 = H(acc0*p1) UMULH const1, acc0, y0 // y0 = H(acc0*p1)
MUL $-1, acc0, t0 MUL const2, acc0, t0
ADCS t0, acc2, acc2 // (carry2, acc2) = acc2 + L(acc0*p2) ADCS t0, acc2, acc2 // (carry2, acc2) = acc2 + L(acc0*p2)
UMULH $-1, acc0, hlp0 // hlp0 = H(acc0*p2) UMULH const2, acc0, hlp0 // hlp0 = H(acc0*p2)
MUL const1, acc0, t0 // t0 = L(acc0*p3) MUL const3, acc0, t0 // t0 = L(acc0*p3)
ADCS t0, acc3, acc3 // (carry3,acc3) = acc3 + L(acc0*p3) ADCS t0, acc3, acc3 // (carry3,acc3) = acc3 + L(acc0*p3)
UMULH const1, acc0, hlp1 // hlp1 = H(acc0*p3) UMULH const3, acc0, acc5 // acc5 = H(acc0*p3)
ADC $0, acc4 // acc4 = carry3 + acc4 ADC $0, acc4 // acc4 = carry3 + acc4
ADDS acc0, acc1, acc1 // (carry4, acc1) = acc0 + acc1 + L(acc0*p1) ADDS acc0, acc1, acc1 // (carry4, acc1) = acc0 + acc1 + L(acc0*p1)
ADCS y0, acc2, acc2 // (carry5, acc2) = carry4 + acc2 + L(acc0*p2) + H(acc0*p1) ADCS y0, acc2, acc2 // (carry5, acc2) = carry4 + acc2 + L(acc0*p2) + H(acc0*p1)
ADCS hlp0, acc3, acc3 // (carry6, acc3) = carry5 + acc3 + L(acc0*p3) + H(acc0*p2) ADCS hlp0, acc3, acc3 // (carry6, acc3) = carry5 + acc3 + L(acc0*p3) + H(acc0*p2)
ADC $0, hlp1, acc0 // acc0 = carry6 + H(acc0*p3) ADC $0, acc5, acc0 // acc0 = carry6 + H(acc0*p3)
// y[1] * x // y[1] * x
MUL y1, x0, t0 MUL y1, x0, t0
@ -1061,24 +1061,24 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
ADCS t3, acc4 ADCS t3, acc4
ADC hlp0, acc5 ADC hlp0, acc5
// Second reduction step // Second reduction step
MUL const0, acc1, t0 MUL const1, acc1, t0
ADDS t0, acc2, acc2 // (carry1, acc2) = acc2 + L(acc1*p1) ADDS t0, acc2, acc2 // (carry1, acc2) = acc2 + L(acc1*p1)
UMULH const0, acc1, y0 // y0 = H(acc1*p1) UMULH const1, acc1, y0 // y0 = H(acc1*p1)
MUL $-1, acc1, t0 MUL const2, acc1, t0
ADCS t0, acc3, acc3 // (carry2, acc3) = acc3 + L(acc1*p2) ADCS t0, acc3, acc3 // (carry2, acc3) = acc3 + L(acc1*p2)
UMULH $-1, acc1, hlp0 // hlp0 = H(acc1*p2) UMULH const2, acc1, hlp0 // hlp0 = H(acc1*p2)
MUL const1, acc1, t0 // t0 = L(acc1*p3) MUL const3, acc1, t0 // t0 = L(acc1*p3)
ADCS t0, acc0, acc0 // (carry3,acc0) = acc0 + L(acc1*p3) ADCS t0, acc0, acc0 // (carry3,acc0) = acc0 + L(acc1*p3)
UMULH const1, acc1, hlp1 // hlp1 = H(acc1*p3) UMULH const3, acc1, y1 // y1 = H(acc1*p3)
ADC $0, acc5 // acc5 = carry3 + acc5 ADC $0, acc5 // acc5 = carry3 + acc5
ADDS acc1, acc2, acc2 // (carry4, acc2) = acc1 + acc2 + L(acc1*p1) ADDS acc1, acc2, acc2 // (carry4, acc2) = acc1 + acc2 + L(acc1*p1)
ADCS y0, acc3, acc3 // (carry5, acc3) = carry4 + acc3 + L(acc1*p2) + H(acc1*p1) ADCS y0, acc3, acc3 // (carry5, acc3) = carry4 + acc3 + L(acc1*p2) + H(acc1*p1)
ADCS hlp0, acc0, acc0 // (carry6, acc0) = carry5 + acc0 + L(acc1*p3) + H(acc1*p2) ADCS hlp0, acc0, acc0 // (carry6, acc0) = carry5 + acc0 + L(acc1*p3) + H(acc1*p2)
ADC $0, hlp1, acc1 // acc1 = carry6 + H(acc1*p3) ADC $0, y1, acc1 // acc1 = carry6 + H(acc1*p3)
// y[2] * x // y[2] * x
MUL y2, x0, t0 MUL y2, x0, t0
@ -1103,24 +1103,24 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
ADCS t3, acc5 ADCS t3, acc5
ADC hlp0, acc6 ADC hlp0, acc6
// Third reduction step // Third reduction step
MUL const0, acc2, t0 MUL const1, acc2, t0
ADDS t0, acc3, acc3 // (carry1, acc3) = acc3 + L(acc2*p1) ADDS t0, acc3, acc3 // (carry1, acc3) = acc3 + L(acc2*p1)
UMULH const0, acc1, y0 // y0 = H(acc2*p1) UMULH const1, acc1, y0 // y0 = H(acc2*p1)
MUL $-1, acc2, t0 MUL const2, acc2, t0
ADCS t0, acc0, acc0 // (carry2, acc0) = acc0 + L(acc2*p2) ADCS t0, acc0, acc0 // (carry2, acc0) = acc0 + L(acc2*p2)
UMULH $-1, acc2, hlp0 // hlp0 = H(acc2*p2) UMULH const2, acc2, hlp0 // hlp0 = H(acc2*p2)
MUL const1, acc2, t0 // t0 = L(acc2*p3) MUL const3, acc2, t0 // t0 = L(acc2*p3)
ADCS t0, acc1, acc1 // (carry3,acc1) = acc1 + L(acc2*p3) ADCS t0, acc1, acc1 // (carry3,acc1) = acc1 + L(acc2*p3)
UMULH const1, acc2, hlp1 // hlp1 = H(acc2*p3) UMULH const3, acc2, y1 // y1 = H(acc2*p3)
ADC $0, acc6 // acc6 = carry3 + acc6 ADC $0, acc6 // acc6 = carry3 + acc6
ADDS acc2, acc3, acc3 // (carry4, acc3) = acc2 + acc3 + L(acc2*p1) ADDS acc2, acc3, acc3 // (carry4, acc3) = acc2 + acc3 + L(acc2*p1)
ADCS y0, acc0, acc0 // (carry5, acc0) = carry4 + acc0 + L(acc2*p2) + H(acc2*p1) ADCS y0, acc0, acc0 // (carry5, acc0) = carry4 + acc0 + L(acc2*p2) + H(acc2*p1)
ADCS hlp0, acc1, acc1 // (carry6, acc1) = carry5 + acc1 + L(acc2*p3) + H(acc2*p2) ADCS hlp0, acc1, acc1 // (carry6, acc1) = carry5 + acc1 + L(acc2*p3) + H(acc2*p2)
ADC $0, hlp1, acc2 // acc2 = carry6 + H(acc2*p3) ADC $0, y1, acc2 // acc2 = carry6 + H(acc2*p3)
// y[3] * x // y[3] * x
MUL y3, x0, t0 MUL y3, x0, t0
@ -1145,24 +1145,24 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
ADCS t3, acc6 ADCS t3, acc6
ADC hlp0, acc7 ADC hlp0, acc7
// Last reduction step // Last reduction step
MUL const0, acc3, t0 MUL const1, acc3, t0
ADDS t0, acc0, acc0 // (carry1, acc0) = acc0 + L(acc3*p1) ADDS t0, acc0, acc0 // (carry1, acc0) = acc0 + L(acc3*p1)
UMULH const0, acc1, y0 // y0 = H(acc2*p1) UMULH const1, acc1, y0 // y0 = H(acc2*p1)
MUL $-1, acc3, t0 MUL const2, acc3, t0
ADCS t0, acc1, acc1 // (carry2, acc1) = acc1 + L(acc3*p2) ADCS t0, acc1, acc1 // (carry2, acc1) = acc1 + L(acc3*p2)
UMULH $-1, acc3, hlp0 // hlp0 = H(acc3*p2) UMULH const2, acc3, hlp0 // hlp0 = H(acc3*p2)
MUL const1, acc3, t0 // t0 = L(acc3*p3) MUL const3, acc3, t0 // t0 = L(acc3*p3)
ADCS t0, acc2, acc2 // (carry3,acc2) = acc2 + L(acc3*p3) ADCS t0, acc2, acc2 // (carry3,acc2) = acc2 + L(acc3*p3)
UMULH const1, acc3, hlp1 // hlp1 = H(acc3*p3) UMULH const3, acc3, y1 // y1 = H(acc3*p3)
ADC $0, acc7 // acc7 = carry3 + acc7 ADC $0, acc7 // acc7 = carry3 + acc7
ADDS acc3, acc0, acc0 // (carry4, acc0) = acc3 + acc0 + L(acc3*p1) ADDS acc3, acc0, acc0 // (carry4, acc0) = acc3 + acc0 + L(acc3*p1)
ADCS y0, acc1, acc1 // (carry5, acc1) = carry4 + acc1 + L(acc3*p2) + H(acc3*p1) ADCS y0, acc1, acc1 // (carry5, acc1) = carry4 + acc1 + L(acc3*p2) + H(acc3*p1)
ADCS hlp0, acc2, acc2 // (carry6, acc2) = carry5 + acc2 + L(acc3*p3) + H(acc3*p2) ADCS hlp0, acc2, acc2 // (carry6, acc2) = carry5 + acc2 + L(acc3*p3) + H(acc3*p2)
ADC $0, hlp1, acc3 // acc3 = carry6 + H(acc3*p3) ADC $0, y1, acc3 // acc3 = carry6 + H(acc3*p3)
// Add bits [511:256] of the mul result // Add bits [511:256] of the mul result
ADDS acc4, acc0, acc0 ADDS acc4, acc0, acc0
@ -1171,10 +1171,10 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
ADCS acc7, acc3, acc3 ADCS acc7, acc3, acc3
ADC $0, ZR, acc4 ADC $0, ZR, acc4
SUBS $-1, acc0, t0 SUBS const0, acc0, t0
SBCS const0, acc1, t1 SBCS const1, acc1, t1
SBCS $-1, acc2, t2 SBCS const2, acc2, t2
SBCS const1, acc3, t3 SBCS const3, acc3, t3
SBCS $0, acc4, acc4 SBCS $0, acc4, acc4
CSEL CS, t0, acc0, y0 CSEL CS, t0, acc0, y0
@ -1189,10 +1189,10 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
ADCS y2, y2, x2; \ ADCS y2, y2, x2; \
ADCS y3, y3, x3; \ ADCS y3, y3, x3; \
ADC $0, ZR, hlp0; \ ADC $0, ZR, hlp0; \
SUBS $-1, x0, t0; \ SUBS const0, x0, t0; \
SBCS const0, x1, t1;\ SBCS const1, x1, t1;\
SBCS $-1, x2, t2; \ SBCS const2, x2, t2; \
SBCS const1, x3, t3;\ SBCS const3, x3, t3;\
SBCS $0, hlp0, hlp0;\ SBCS $0, hlp0, hlp0;\
CSEL CC, x0, t0, x0;\ CSEL CC, x0, t0, x0;\
CSEL CC, x1, t1, x1;\ CSEL CC, x1, t1, x1;\
@ -1240,25 +1240,24 @@ TEXT ·p256PointAddAffineAsm(SB),0,$264-96
CMP $0, hlp1 CMP $0, hlp1
CSEL EQ, ZR, t0, hlp1 CSEL EQ, ZR, t0, hlp1
MOVD p256p<>+0x08(SB), const0 LDP p256p<>+0x00(SB), (const0, const1)
MOVD p256p<>+0x18(SB), const1 LDP p256p<>+0x10(SB), (const2, const3)
EOR t2<<1, hlp1 EOR t2<<1, hlp1
// Negate y2in based on sign // Negate y2in based on sign
LDP 2*16(b_ptr), (y0, y1) LDP 2*16(b_ptr), (y0, y1)
LDP 3*16(b_ptr), (y2, y3) LDP 3*16(b_ptr), (y2, y3)
MOVD p256p<>+0x00(SB), acc4 SUBS y0, const0, acc0
SUBS y0, acc4, acc0 SBCS y1, const1, acc1
SBCS y1, const0, acc1 SBCS y2, const2, acc2
SBCS y2, acc4, acc2 SBCS y3, const3, acc3
SBCS y3, const1, acc3
SBC $0, ZR, t0 SBC $0, ZR, t0
ADDS $-1, acc0, acc4 ADDS const0, acc0, acc4
ADCS const0, acc1, acc5 ADCS const1, acc1, acc5
ADCS $-1, acc2, acc6 ADCS const2, acc2, acc6
ADCS const1, acc3, acc7 ADCS const3, acc3, acc7
ADC $0, t0, t0 ADC $0, t0, t0
CMP $0, t0 CMP $0, t0
@ -1408,10 +1407,10 @@ TEXT ·p256PointAddAffineAsm(SB),0,$264-96
ADCS y2, x2, x2; \ ADCS y2, x2, x2; \
ADCS y3, x3, x3; \ ADCS y3, x3, x3; \
ADC $0, ZR, hlp0; \ ADC $0, ZR, hlp0; \
SUBS $-1, x0, t0; \ SUBS const0, x0, t0; \
SBCS const0, x1, t1;\ SBCS const1, x1, t1;\
SBCS $-1, x2, t2; \ SBCS const2, x2, t2; \
SBCS const1, x3, t3;\ SBCS const3, x3, t3;\
SBCS $0, hlp0, hlp0;\ SBCS $0, hlp0, hlp0;\
CSEL CC, x0, t0, x0;\ CSEL CC, x0, t0, x0;\
CSEL CC, x1, t1, x1;\ CSEL CC, x1, t1, x1;\
@ -1428,8 +1427,8 @@ TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48
MOVD res+0(FP), res_ptr MOVD res+0(FP), res_ptr
MOVD in+24(FP), a_ptr MOVD in+24(FP), a_ptr
MOVD p256p<>+0x08(SB), const0 LDP p256p<>+0x00(SB), (const0, const1)
MOVD p256p<>+0x18(SB), const1 LDP p256p<>+0x10(SB), (const2, const3)
// Begin point double // Begin point double
LDP 4*16(a_ptr), (x0, x1) LDP 4*16(a_ptr), (x0, x1)
@ -1471,10 +1470,10 @@ TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48
CALL sm2P256SqrInternal<>(SB) CALL sm2P256SqrInternal<>(SB)
// Divide by 2 // Divide by 2
ADDS $-1, y0, t0 ADDS const0, y0, t0
ADCS const0, y1, t1 ADCS const1, y1, t1
ADCS $-1, y2, t2 ADCS const2, y2, t2
ADCS const1, y3, t3 ADCS const3, y3, t3
ADC $0, ZR, hlp0 ADC $0, ZR, hlp0
ANDS $1, y0, ZR ANDS $1, y0, ZR
@ -1530,8 +1529,8 @@ TEXT ·p256PointAddAsm(SB),0,$392-80
MOVD in1+24(FP), a_ptr MOVD in1+24(FP), a_ptr
MOVD in2+48(FP), b_ptr MOVD in2+48(FP), b_ptr
MOVD p256p<>+0x08(SB), const0 LDP p256p<>+0x00(SB), (const0, const1)
MOVD p256p<>+0x18(SB), const1 LDP p256p<>+0x10(SB), (const2, const3)
// Begin point add // Begin point add
LDx(z2in) LDx(z2in)
@ -1558,21 +1557,21 @@ TEXT ·p256PointAddAsm(SB),0,$392-80
STx(r) STx(r)
MOVD $1, acc1 MOVD $1, acc1
ORR x0, x1, t0 // Check if zero mod p256 ORR x0, x1, acc2 // Check if zero mod p256
ORR x2, x3, t1 ORR x2, x3, acc3
ORR t1, t0, t0 ORR acc3, acc2, acc2
CMP $0, t0 CMP $0, acc2
CSEL EQ, acc1, ZR, hlp1 CSEL EQ, acc1, ZR, hlp1
EOR $-1, x0, t0 EOR const0, x0, acc2
EOR const0, x1, t1 EOR const1, x1, acc3
EOR $-1, x2, t2 EOR const2, x2, acc4
EOR const1, x3, t3 EOR const3, x3, acc5
ORR t0, t1, t0 ORR acc2, acc3, acc2
ORR t2, t3, t1 ORR acc4, acc5, acc3
ORR t1, t0, t0 ORR acc3, acc2, acc2
CMP $0, t0 CMP $0, acc2
CSEL EQ, acc1, hlp1, hlp1 CSEL EQ, acc1, hlp1, hlp1
LDx(z2sqr) LDx(z2sqr)
@ -1590,21 +1589,21 @@ TEXT ·p256PointAddAsm(SB),0,$392-80
STx(h) STx(h)
MOVD $1, acc1 MOVD $1, acc1
ORR x0, x1, t0 // Check if zero mod p256 ORR x0, x1, acc2 // Check if zero mod p256
ORR x2, x3, t1 ORR x2, x3, acc3
ORR t1, t0, t0 ORR acc3, acc2, acc2
CMP $0, t0 CMP $0, acc2
CSEL EQ, acc1, ZR, hlp0 CSEL EQ, acc1, ZR, hlp0
EOR $-1, x0, t0 EOR const0, x0, acc2
EOR const0, x1, t1 EOR const1, x1, acc3
EOR $-1, x2, t2 EOR const2, x2, acc4
EOR const1, x3, t3 EOR const3, x3, acc5
ORR t0, t1, t0 ORR acc2, acc3, acc2
ORR t2, t3, t1 ORR acc4, acc5, acc3
ORR t1, t0, t0 ORR acc3, acc2, acc2
CMP $0, t0 CMP $0, acc2
CSEL EQ, acc1, hlp0, hlp0 CSEL EQ, acc1, hlp0, hlp0
AND hlp0, hlp1, hlp1 AND hlp0, hlp1, hlp1