//go:build !purego

#include "textflag.h"

// Register roles shared by every routine in this file.
//
//   res_ptr / a_ptr / b_ptr  pointer arguments
//   acc0..acc7               accumulator words, clobbered by every helper
//   t0, t1, hlp0             scratch
//   const0..const3           four 64-bit words loaded by callers from ·p2;
//                            this is the value that is conditionally
//                            added/subtracted by the helpers below
//   hlp1                     word loaded by callers from ·np; multiplier used
//                            at the start of each reduction step
//   x0..x3, y0..y3           the two 256-bit operands (least significant
//                            word first)
//
// NOTE: hlp1 aliases res_ptr (R0). An exported routine that needs hlp1 must
// not load its result pointer into res_ptr until the last multiplication or
// squaring has been issued.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
#define t0 R11
#define t1 R12
#define const0 R13
#define const1 R14
#define const2 R15
#define const3 R16

#define hlp0 R17
#define hlp1 res_ptr

#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

/* ---------------------------------------*/
// (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)
//
// In:    x, y, const0..const3
// Out:   x = y - x, with (const3..const0) added back when the raw
//        subtraction borrowed
// Clobb: acc0..acc7, t0, flags. y is left untouched.
TEXT gfpSubInternal(SB),NOSPLIT,$0
	// Raw 256-bit difference; t0 becomes all ones on borrow, zero otherwise.
	SUBS x0, y0, acc0
	SBCS x1, y1, acc1
	SBCS x2, y2, acc2
	SBCS x3, y3, acc3
	SBC $0, ZR, t0

	// Candidate result with the constant added back.
	ADDS const0, acc0, acc4
	ADCS const1, acc1, acc5
	ADCS const2, acc2, acc6
	ADC const3, acc3, acc7

	// Select without branching: EQ (bit 0 of t0 clear) means no borrow.
	ANDS $1, t0
	CSEL EQ, acc0, acc4, x0
	CSEL EQ, acc1, acc5, x1
	CSEL EQ, acc2, acc6, x2
	CSEL EQ, acc3, acc7, x3

	RET

/* ---------------------------------------*/
// (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
//
// Schoolbook 4x4 word multiplication interleaved with four word-wise
// reduction steps (each step multiplies the lowest live accumulator word by
// hlp1 and adds that multiple of const3..const0), followed by one
// conditional subtraction of the constant.
// NOTE(review): this looks like Montgomery multiplication with hlp1 being
// the usual negated inverse of the modulus mod 2^64 - confirm against the
// Go-side definitions of ·np and ·p2.
//
// In:    x, y, hlp1, const0..const3
// Out:   y
// Clobb: acc0..acc7, t0, t1, hlp0, flags. x is left untouched.
TEXT gfpMulInternal(SB),NOSPLIT,$0
	// y[0] * x
	MUL y0, x0, acc0
	UMULH y0, x0, acc1

	MUL y0, x1, t0
	ADDS t0, acc1
	UMULH y0, x1, acc2

	MUL y0, x2, t0
	ADCS t0, acc2
	UMULH y0, x2, acc3

	MUL y0, x3, t0
	ADCS t0, acc3
	UMULH y0, x3, acc4
	ADC $0, acc4
	// First reduction step
	MUL acc0, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc0, acc0
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const2, hlp0, acc0

	MUL const3, hlp0, t0
	ADCS t0, acc3, acc3

	UMULH const3, hlp0, hlp0
	ADC $0, acc4

	ADDS t1, acc1, acc1
	ADCS y0, acc2, acc2
	ADCS acc0, acc3, acc3
	ADC $0, hlp0, acc0
	// y[1] * x
	MUL y1, x0, t0
	ADDS t0, acc1
	UMULH y1, x0, t1

	MUL y1, x1, t0
	ADCS t0, acc2
	UMULH y1, x1, y0

	MUL y1, x2, t0
	ADCS t0, acc3
	UMULH y1, x2, hlp0

	MUL y1, x3, t0
	ADCS t0, acc4
	UMULH y1, x3, y1
	ADC $0, ZR, acc5

	ADDS t1, acc2
	ADCS y0, acc3
	ADCS hlp0, acc4
	ADC y1, acc5
	// Second reduction step
	MUL acc1, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc1, acc1
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const2, hlp0, acc1

	MUL const3, hlp0, t0
	ADCS t0, acc0, acc0

	UMULH const3, hlp0, hlp0
	ADC $0, acc5

	ADDS t1, acc2, acc2
	ADCS y0, acc3, acc3
	ADCS acc1, acc0, acc0
	ADC $0, hlp0, acc1
	// y[2] * x
	MUL y2, x0, t0
	ADDS t0, acc2
	UMULH y2, x0, t1

	MUL y2, x1, t0
	ADCS t0, acc3
	UMULH y2, x1, y0

	MUL y2, x2, t0
	ADCS t0, acc4
	UMULH y2, x2, y1

	MUL y2, x3, t0
	ADCS t0, acc5
	UMULH y2, x3, hlp0
	ADC $0, ZR, acc6

	ADDS t1, acc3
	ADCS y0, acc4
	ADCS y1, acc5
	ADC hlp0, acc6
	// Third reduction step
	MUL acc2, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc2, acc2
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const2, hlp0, acc2

	MUL const3, hlp0, t0
	ADCS t0, acc1, acc1

	UMULH const3, hlp0, hlp0
	ADC $0, acc6

	ADDS t1, acc3, acc3
	ADCS y0, acc0, acc0
	ADCS acc2, acc1, acc1
	ADC $0, hlp0, acc2
	// y[3] * x
	MUL y3, x0, t0
	ADDS t0, acc3
	UMULH y3, x0, t1

	MUL y3, x1, t0
	ADCS t0, acc4
	UMULH y3, x1, y0

	MUL y3, x2, t0
	ADCS t0, acc5
	UMULH y3, x2, y1

	MUL y3, x3, t0
	ADCS t0, acc6
	UMULH y3, x3, hlp0
	ADC $0, ZR, acc7

	ADDS t1, acc4
	ADCS y0, acc5
	ADCS y1, acc6
	ADC hlp0, acc7
	// Last reduction step
	MUL acc3, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc3, acc3
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const2, hlp0, acc3

	MUL const3, hlp0, t0
	ADCS t0, acc2, acc2

	UMULH const3, hlp0, hlp0
	ADC $0, acc7

	ADDS t1, acc0, acc0
	ADCS y0, acc1, acc1
	ADCS acc3, acc2, acc2
	ADC $0, hlp0, acc3
	// Add bits [511:256] of the mul result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// Final conditional subtraction: keep (acc - const) unless it borrowed.
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, acc6
	SBCS const3, acc3, acc7
	SBCS $0, acc4, acc4

	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, acc6, acc2, y2
	CSEL CS, acc7, acc3, y3

	RET

/* ---------------------------------------*/
// (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
//
// Computes the off-diagonal products once, doubles them, adds the diagonal
// squares, then applies the same four reduction steps and final conditional
// subtraction as gfpMulInternal.
//
// In:    x, hlp1, const0..const3
// Out:   y
// Clobb: acc0..acc7, t0, t1, hlp0, flags. x is left untouched.
TEXT gfpSqrInternal(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL x0, x1, acc1
	UMULH x0, x1, acc2

	MUL x0, x2, t0
	ADDS t0, acc2, acc2
	UMULH x0, x2, acc3

	MUL x0, x3, t0
	ADCS t0, acc3, acc3
	UMULH x0, x3, acc4
	ADC $0, acc4, acc4
	// x[2:] * x[1]
	MUL x1, x2, t0
	ADDS t0, acc3
	UMULH x1, x2, t1
	ADCS t1, acc4
	ADC $0, ZR, acc5

	MUL x1, x3, t0
	ADDS t0, acc4
	UMULH x1, x3, t1
	ADC t1, acc5
	// x[3] * x[2]
	MUL x2, x3, t0
	ADDS t0, acc5
	UMULH x2, x3, acc6
	ADC $0, acc6

	MOVD $0, acc7
	// *2
	ADDS acc1, acc1
	ADCS acc2, acc2
	ADCS acc3, acc3
	ADCS acc4, acc4
	ADCS acc5, acc5
	ADCS acc6, acc6
	ADC $0, acc7
	// Missing products
	MUL x0, x0, acc0
	UMULH x0, x0, t0
	ADDS t0, acc1, acc1

	MUL x1, x1, t0
	ADCS t0, acc2, acc2
	UMULH x1, x1, t1
	ADCS t1, acc3, acc3

	MUL x2, x2, t0
	ADCS t0, acc4, acc4
	UMULH x2, x2, t1
	ADCS t1, acc5, acc5

	MUL x3, x3, t0
	ADCS t0, acc6, acc6
	UMULH x3, x3, t1
	ADCS t1, acc7, acc7
	// First reduction step
	MUL acc0, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc0, acc0
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const2, hlp0, acc0

	MUL const3, hlp0, t0
	ADCS t0, acc3, acc3

	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc1, acc1
	ADCS y0, acc2, acc2
	ADCS acc0, acc3, acc3
	ADC $0, hlp0, acc0
	// Second reduction step
	MUL acc1, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc1, acc1
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const2, hlp0, acc1

	MUL const3, hlp0, t0
	ADCS t0, acc0, acc0

	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc2, acc2
	ADCS y0, acc3, acc3
	ADCS acc1, acc0, acc0
	ADC $0, hlp0, acc1
	// Third reduction step
	MUL acc2, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc2, acc2
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const2, hlp0, acc2

	MUL const3, hlp0, t0
	ADCS t0, acc1, acc1

	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc3, acc3
	ADCS y0, acc0, acc0
	ADCS acc2, acc1, acc1
	ADC $0, hlp0, acc2
	// Last reduction step
	MUL acc3, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc3, acc3
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const2, hlp0, acc3

	MUL const3, hlp0, t0
	ADCS t0, acc2, acc2

	UMULH const3, hlp0, hlp0
	ADC $0, acc7

	ADDS t1, acc0, acc0
	ADCS y0, acc1, acc1
	ADCS acc3, acc2, acc2
	ADC $0, hlp0, acc3
	// Add bits [511:256] of the sqr result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// Final conditional subtraction: keep (acc - const) unless it borrowed.
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, acc6
	SBCS const3, acc3, acc7
	SBCS $0, acc4, acc4

	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, acc6, acc2, y2
	CSEL CS, acc7, acc3, y3
	RET

/* ---------------------------------------*/
// (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
//
// Doubles y into x, then subtracts const3..const0 unless that subtraction
// borrows. Clobbers acc0..acc3, hlp0 and the flags; y is left untouched.
#define gfpMulBy2Inline       \
	ADDS y0, y0, x0;          \
	ADCS y1, y1, x1;          \
	ADCS y2, y2, x2;          \
	ADCS y3, y3, x3;          \
	ADC $0, ZR, hlp0;         \
	SUBS const0, x0, acc0;    \
	SBCS const1, x1, acc1;    \
	SBCS const2, x2, acc2;    \
	SBCS const3, x3, acc3;    \
	SBCS $0, hlp0, hlp0;      \
	CSEL CC, x0, acc0, x0;    \
	CSEL CC, x1, acc1, x1;    \
	CSEL CC, x2, acc2, x2;    \
	CSEL CC, x3, acc3, x3;

// (y3, y2, y1, y0) = 2(y3, y2, y1, y0)
//
// Same as gfpMulBy2Inline but the selected result is written back to y.
// x is used as scratch and holds the unreduced double afterwards.
#define gfpMulBy2Inline2      \
	ADDS y0, y0, x0;          \
	ADCS y1, y1, x1;          \
	ADCS y2, y2, x2;          \
	ADCS y3, y3, x3;          \
	ADC $0, ZR, hlp0;         \
	SUBS const0, x0, acc0;    \
	SBCS const1, x1, acc1;    \
	SBCS const2, x2, acc2;    \
	SBCS const3, x3, acc3;    \
	SBCS $0, hlp0, hlp0;      \
	CSEL CC, x0, acc0, y0;    \
	CSEL CC, x1, acc1, y1;    \
	CSEL CC, x2, acc2, y2;    \
	CSEL CC, x3, acc3, y3;

/* ---------------------------------------*/
// (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
//
// Adds y into x, then subtracts const3..const0 unless that subtraction
// borrows. Clobbers acc0..acc3, hlp0 and the flags; y is left untouched.
#define gfpAddInline          \
	ADDS y0, x0, x0;          \
	ADCS y1, x1, x1;          \
	ADCS y2, x2, x2;          \
	ADCS y3, x3, x3;          \
	ADC $0, ZR, hlp0;         \
	SUBS const0, x0, acc0;    \
	SBCS const1, x1, acc1;    \
	SBCS const2, x2, acc2;    \
	SBCS const3, x3, acc3;    \
	SBCS $0, hlp0, hlp0;      \
	CSEL CC, x0, acc0, x0;    \
	CSEL CC, x1, acc1, x1;    \
	CSEL CC, x2, acc2, x2;    \
	CSEL CC, x3, acc3, x3;

/* ---------------------------------------*/
// Operand addressing. Each 256-bit element occupies 32 bytes; the x/y/z
// names select the element at byte offset 0/32/64 behind the given pointer.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define y2in(off) (off + 32)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)

// Load/store one 256-bit element through an addressing macro, and
// register-to-register copies between the x and y operand sets.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
#define y2x MOVD y0, x0; MOVD y1, x1; MOVD y2, x2; MOVD y3, x3
#define x2y MOVD x0, y0; MOVD x1, y1; MOVD x2, y2; MOVD x3, y3

/* ---------------------------------------*/
// 32-byte scratch slots in the local frame, starting at 8(RSP).
#define tmp0(off) (32*0 + 8 + off)(RSP)
#define tmp1(off) (32*1 + 8 + off)(RSP)
#define tmp2(off) (32*2 + 8 + off)(RSP)

// func gfp2Mul(c, a, b *gfP2)
//
// Writing a0/a1 for the elements of a at byte offsets 0/32 (likewise b, c):
//   tmp0 = a1 * b1
//   tmp1 = a0 * b0
//   c0   = (a0 + a1) * (b0 + b1) - tmp0 - tmp1
//   c1   = tmp0 - 2 * tmp1
// Frame: three 32-byte scratch slots (tmp0..tmp2).
TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (y1in)
	LDy (y2in)
	CALL gfpMulInternal(SB)           // tmp0 := a1 * b1
	STy (tmp0)

	LDx (x1in)
	LDy (x2in)
	CALL gfpMulInternal(SB)           // tmp1 := a0 * b0
	STy (tmp1)

	LDx (x1in)
	LDy (y1in)
	gfpAddInline                      // tmp2 := a0 + a1
	STx (tmp2)

	LDx (x2in)
	LDy (y2in)
	gfpAddInline                      // x := b0 + b1
	LDy (tmp2)
	CALL gfpMulInternal(SB)           // y := (a0 + a1) * (b0 + b1)

	LDx (tmp0)
	CALL gfpSubInternal(SB)           // x := y - tmp0
	x2y
	LDx (tmp1)
	CALL gfpSubInternal(SB)           // x := y - tmp1
	MOVD res+0(FP), res_ptr           // not use hlp1 any more
	STx (x3out)

	LDy (tmp1)
	gfpMulBy2Inline                   // x := 2 * tmp1
	LDy (tmp0)
	CALL gfpSubInternal(SB)           // x := tmp0 - 2 * tmp1
	STx (y3out)
	RET

// func gfp2MulU(c, a, b *gfP2)
//
// Same partial products as gfp2Mul, but stored as:
//   c0 = tmp0 - 2 * tmp1
//   c1 = 0 - 2 * ((a0 + a1) * (b0 + b1) - tmp0 - tmp1)
// Frame: three 32-byte scratch slots (tmp0..tmp2).
TEXT ·gfp2MulU(SB),NOSPLIT,$104-24
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (y1in)
	LDy (y2in)
	CALL gfpMulInternal(SB)           // tmp0 := a1 * b1
	STy (tmp0)

	LDx (x1in)
	LDy (x2in)
	CALL gfpMulInternal(SB)           // tmp1 := a0 * b0
	STy (tmp1)

	LDx (x1in)
	LDy (y1in)
	gfpAddInline                      // tmp2 := a0 + a1
	STx (tmp2)

	LDx (x2in)
	LDy (y2in)
	gfpAddInline                      // x := b0 + b1
	LDy (tmp2)
	CALL gfpMulInternal(SB)           // y := (a0 + a1) * (b0 + b1)

	LDx (tmp0)
	CALL gfpSubInternal(SB)           // x := y - tmp0
	x2y
	LDx (tmp1)
	CALL gfpSubInternal(SB)           // x := y - tmp1
	x2y
	gfpMulBy2Inline                   // x := 2 * y
	MOVD $0, y0
	MOVD $0, y1
	MOVD $0, y2
	MOVD $0, y3
	CALL gfpSubInternal(SB)           // x := 0 - x
	MOVD res+0(FP), res_ptr           // not use hlp1 any more
	STx (y3out)

	LDy (tmp1)
	gfpMulBy2Inline                   // x := 2 * tmp1
	LDy (tmp0)
	CALL gfpSubInternal(SB)           // x := tmp0 - 2 * tmp1
	STx (x3out)
	RET

// func gfp2MulU1(c, a *gfP2)
//
//   c0 = a1            (32-byte copy through V0/V1)
//   c1 = 0 - 2 * a0
// No multiplication is issued, so ·np is not loaded here.
TEXT ·gfp2MulU1(SB),NOSPLIT,$0-16
	MOVD res+0(FP), b_ptr
	MOVD in1+8(FP), a_ptr

	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDy (x1in)
	gfpMulBy2Inline                   // x := 2 * a0
	MOVD $0, y0
	MOVD $0, y1
	MOVD $0, y2
	MOVD $0, y3
	CALL gfpSubInternal(SB)           // x := 0 - 2 * a0

	// a0 has already been consumed; advance a_ptr to a1 and copy it to c0.
	ADD $32, a_ptr, a_ptr
	VLD1 (a_ptr), [V0.B16, V1.B16]
	VST1 [V0.B16, V1.B16], (b_ptr)
	STx (y2in)
	RET

// func gfp2Square(c, a *gfP2)
//
//   tmp0 = (a1 - 2 * a0) * (a0 + a1)
//   m    = a1 * a0
//   c1   = tmp0 + m
//   c0   = 2 * m
// Here b_ptr holds the result pointer c.
TEXT ·gfp2Square(SB),NOSPLIT,$72-16
	MOVD res+0(FP), b_ptr
	MOVD in1+8(FP), a_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (y1in)
	LDy (x1in)
	gfpAddInline                      // tmp0 := a1 + a0
	STx (tmp0)

	gfpMulBy2Inline                   // x := 2 * a0 (y still holds a0)
	LDy (y1in)
	CALL gfpSubInternal(SB)           // x := a1 - 2 * a0
	LDy (tmp0)
	CALL gfpMulInternal(SB)           // tmp0 := (a1 - 2 * a0) * (a0 + a1)
	STy (tmp0)

	LDx (y1in)
	LDy (x1in)
	CALL gfpMulInternal(SB)           // y := a1 * a0
	//STy (tmp1)

	LDx (tmp0)
	gfpAddInline                      // x := tmp0 + y
	STx (y2in)

	//LDy (tmp1)
	gfpMulBy2Inline                   // x := 2 * y
	STx (x2in)
	RET

// func gfp2SquareU(c, a *gfP2)
//
//   tmp0 = (a1 - 2 * a0) * (a0 + a1)
//   m    = a1 * a0
//   c0   = tmp0 + m
//   c1   = 0 - 4 * m
// Here b_ptr holds the result pointer c.
TEXT ·gfp2SquareU(SB),NOSPLIT,$72-16
	MOVD res+0(FP), b_ptr
	MOVD in1+8(FP), a_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (y1in)
	LDy (x1in)
	gfpAddInline                      // tmp0 := a1 + a0
	STx (tmp0)

	gfpMulBy2Inline                   // x := 2 * a0 (y still holds a0)
	LDy (y1in)
	CALL gfpSubInternal(SB)           // x := a1 - 2 * a0
	LDy (tmp0)
	CALL gfpMulInternal(SB)           // tmp0 := (a1 - 2 * a0) * (a0 + a1)
	STy (tmp0)

	LDx (y1in)
	LDy (x1in)
	CALL gfpMulInternal(SB)           // y := a1 * a0
	//STy (tmp1)

	LDx (tmp0)
	gfpAddInline                      // x := tmp0 + y
	STx (x2in)

	//LDy (tmp1)
	gfpMulBy2Inline2                  // y := 2 * y
	gfpMulBy2Inline                   // x := 2 * y (four times the product)
	MOVD $0, y0
	MOVD $0, y1
	MOVD $0, y2
	MOVD $0, y3
	CALL gfpSubInternal(SB)           // x := 0 - x
	STx (y2in)
	RET

/* ---------------------------------------*/
#undef tmp2

#define x3t(off) (32*2 + 8 + off)(RSP)
#define y3t(off) (32*3 + 8 + off)(RSP)
#define z3t(off) (32*4 + 8 + off)(RSP)

// func curvePointDoubleComplete(c, a *curvePoint)
//
// Doubles the point a (X, Y, Z at byte offsets 0/32/64) into c, following
// the step sequence spelled out in the inline comments. b_ptr holds the
// result pointer c. The "3b * t" products are realised as four doublings
// followed by one subtraction, i.e. 16*t - t = 15*t.
// Frame: five 32-byte scratch slots (tmp0, tmp1, x3t, y3t, z3t).
// Each coordinate of c is written only after the last read of the matching
// coordinate of a.
TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$168-16
	MOVD res+0(FP), b_ptr
	MOVD in1+8(FP), a_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (y1in)
	CALL gfpSqrInternal(SB)           // t0 := Y^2
	STy (tmp0)

	gfpMulBy2Inline2                  // Z3 := t0 + t0
	gfpMulBy2Inline2                  // Z3 := Z3 + Z3
	gfpMulBy2Inline                   // Z3 := Z3 + Z3
	STx (z3t)

	LDx (z1in)
	CALL gfpSqrInternal(SB)           // t2 := Z^2
	STy (tmp1)
	gfpMulBy2Inline2                  // y := 2 * t2
	gfpMulBy2Inline2                  // y := 4 * t2
	gfpMulBy2Inline2                  // y := 8 * t2
	gfpMulBy2Inline2                  // y := 16 * t2
	LDx (tmp1)
	CALL gfpSubInternal(SB)           // t2 := 3b * t2 = 3bZ^2
	STx (tmp1)

	LDy (z3t)
	CALL gfpMulInternal(SB)           // X3 := t2 * Z3
	STy (x3t)

	LDx (tmp0)
	LDy (tmp1)
	gfpAddInline                      // Y3 := t0 + t2
	STx (y3t)

	gfpMulBy2Inline                   // x := t2 + t2 (y still holds t2)
	gfpAddInline                      // t2 := t2 + t2 + t2
	STx (tmp1)

	LDy (tmp0)
	CALL gfpSubInternal(SB)           // t0 := t0 - t2
	STx (tmp0)

	LDy (y3t)
	CALL gfpMulInternal(SB)           // Y3 := t0 * Y3

	LDx (x3t)
	gfpAddInline                      // Y3 := X3 + Y3
	STx (y3t)

	LDx (y1in)
	LDy (z1in)
	CALL gfpMulInternal(SB)           // t1 := YZ

	LDx (z3t)
	CALL gfpMulInternal(SB)           // Z3 := t1 * Z3
	STy (z2in)                        // Store Z3

	LDx (x1in)
	LDy (y1in)
	CALL gfpMulInternal(SB)           // t1 := XY

	LDx (tmp0)
	CALL gfpMulInternal(SB)           // X3 := t0 * t1
	gfpMulBy2Inline                   // X3 := X3 + X3
	STx (x2in)                        // Store X3

	// Store Y3
	LDx (y3t)
	STx (y2in)

	RET

/* ---------------------------------------*/
// Scratch layout for curvePointAddComplete: eight 32-byte slots at 8(RSP).
#undef x3t
#undef y3t
#undef z3t

#define tmp2(off) (32*2 + 8 + off)(RSP)
#define tmp3(off) (32*3 + 8 + off)(RSP)
#define tmp4(off) (32*4 + 8 + off)(RSP)
#define x3t(off) (32*5 + 8 + off)(RSP)
#define y3t(off) (32*6 + 8 + off)(RSP)
#define z3t(off) (32*7 + 8 + off)(RSP)

// func curvePointAddComplete(c, a, b *curvePoint)
//
// Adds the points a and b (coordinates at byte offsets 0/32/64) into c,
// following the step sequence spelled out in the inline comments. The
// "3b * t" products are realised as four doublings followed by one
// subtraction, i.e. 16*t - t = 15*t.
//
// b_ptr is re-pointed at c only after the last read of b; a is not read
// after that point either. Results are written through the x2in/y2in/z2in
// macros from then on. The result pointer is deliberately not loaded into
// res_ptr because that register is hlp1 and multiplications are still
// pending.
TEXT ·curvePointAddComplete(SB),0,$264-24
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (x1in)
	LDy (x2in)
	CALL gfpMulInternal(SB)           // t0 := X1X2
	STy (tmp0)

	LDx (y1in)
	LDy (y2in)
	CALL gfpMulInternal(SB)           // t1 := Y1Y2
	STy (tmp1)

	LDx (z1in)
	LDy (z2in)
	CALL gfpMulInternal(SB)           // t2 := Z1Z2
	STy (tmp2)

	LDx (x1in)
	LDy (y1in)
	gfpAddInline                      // t3 := X1 + Y1
	STx (tmp3)

	LDx (x2in)
	LDy (y2in)
	gfpAddInline                      // t4 := X2 + Y2

	LDy (tmp3)
	CALL gfpMulInternal(SB)           // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
	STy (tmp3)

	LDx (tmp0)
	LDy (tmp1)
	gfpAddInline                      // t4 := t0 + t1

	LDy (tmp3)
	CALL gfpSubInternal(SB)           // t3 := t3 - t4 = X1Y2 + X2Y1
	STx (tmp3)

	LDx (y1in)
	LDy (z1in)
	gfpAddInline                      // t4 := Y1 + Z1
	STx (tmp4)

	LDx (y2in)
	LDy (z2in)
	gfpAddInline                      // X3 := Y2 + Z2

	LDy (tmp4)
	CALL gfpMulInternal(SB)           // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
	STy (tmp4)

	LDx (tmp1)
	LDy (tmp2)
	gfpAddInline                      // X3 := t1 + t2

	LDy (tmp4)
	CALL gfpSubInternal(SB)           // t4 := t4 - X3 = Y1Z2 + Y2Z1
	STx (tmp4)

	LDx (x1in)
	LDy (z1in)
	gfpAddInline                      // X3 := X1 + Z1
	STx (x3t)

	LDx (x2in)
	LDy (z2in)
	gfpAddInline                      // Y3 := X2 + Z2

	LDy (x3t)
	CALL gfpMulInternal(SB)           // X3 := X3 * Y3
	STy (x3t)

	LDx (tmp0)
	LDy (tmp2)
	gfpAddInline                      // Y3 := t0 + t2

	LDy (x3t)
	CALL gfpSubInternal(SB)           // Y3 := X3 - Y3 = X1Z2 + X2Z1
	STx (y3t)

	LDy (tmp0)
	gfpMulBy2Inline                   // x := t0 + t0
	gfpAddInline                      // t0 := t0 + t0 + t0 = 3X1X2
	STx (tmp0)

	LDy (tmp2)
	gfpMulBy2Inline2                  // y := 2 * t2
	gfpMulBy2Inline2                  // y := 4 * t2
	gfpMulBy2Inline2                  // y := 8 * t2
	gfpMulBy2Inline2                  // y := 16 * t2
	LDx (tmp2)
	CALL gfpSubInternal(SB)           // t2 := 3b * t2 = 3bZ1Z2
	STx (tmp2)

	LDy (tmp1)
	gfpAddInline                      // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
	STx (z3t)

	LDx (tmp2)
	CALL gfpSubInternal(SB)           // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
	STx (tmp1)

	LDy (y3t)
	gfpMulBy2Inline2                  // y := 2 * Y3
	gfpMulBy2Inline2                  // y := 4 * Y3
	gfpMulBy2Inline2                  // y := 8 * Y3
	gfpMulBy2Inline2                  // y := 16 * Y3
	LDx (y3t)
	CALL gfpSubInternal(SB)           // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
	STx (y3t)

	LDy (tmp4)
	CALL gfpMulInternal(SB)           // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
	STy (x3t)

	// Both inputs are fully consumed; from here on b_ptr addresses c.
	MOVD res+0(FP), b_ptr

	LDx (tmp3)
	LDy (tmp1)
	CALL gfpMulInternal(SB)           // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)

	LDx (x3t)
	CALL gfpSubInternal(SB)           // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
	STx (x2in)

	LDy (y3t)
	LDx (tmp0)
	CALL gfpMulInternal(SB)           // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
	STy (y3t)

	LDx (tmp1)
	LDy (z3t)
	CALL gfpMulInternal(SB)           // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)

	LDx (y3t)
	gfpAddInline                      // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
	STx (y2in)

	LDx (tmp0)
	LDy (tmp3)
	CALL gfpMulInternal(SB)           // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
	STy (tmp0)

	LDx (tmp4)
	LDy (z3t)
	CALL gfpMulInternal(SB)           // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)

	LDx (tmp0)
	gfpAddInline                      // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
	STx (z2in)

	RET