//go:build !(purego || plugin)

#include "textflag.h"

/* ---------------------------------------*/
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP

/* ---------------------------------------*/
// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) - (t3, t2, t1, t0)
TEXT gfpSubInternal(SB),NOSPLIT,$0
	XORQ mul0, mul0
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	SBBQ $0, mul0

	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	ADDQ ·p2+0(SB), acc4
	ADCQ ·p2+8(SB), acc5
	ADCQ ·p2+16(SB), acc6
	ADCQ ·p2+24(SB), acc7
	ANDQ $1, mul0

	// CMOVQEQ: Move if equal (ZF == 1)
	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET

/* ---------------------------------------*/
// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) * (t3, t2, t1, t0)
// t0, t1 will be overwritten by this function call
TEXT gfpMulInternal(SB),NOSPLIT,$8
	CMPB ·supportADX(SB), $0
	JE   noAdxMul

	// [t3, t2, t1, t0] * acc4
	MOVQ acc4, mul1
	MULXQ t0, acc0, acc1
	MULXQ t1, mul0, acc2
	ADDQ mul0, acc1
	MULXQ t2, mul0, acc3
	ADCQ mul0, acc2
	MULXQ t3, mul0, acc4
	ADCQ mul0, acc3
	ADCQ $0, acc4

	// [t3, t2, t1, t0] * acc5
	MOVQ acc5, mul1
	MULXQ t0, mul0, hlp
	ADDQ mul0, acc1
	ADCQ hlp, acc2
	MULXQ t1, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc2
	ADCQ hlp, acc3
	MULXQ t2, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, acc4
	MULXQ t3, mul0, acc5
	ADCQ $0, acc5
	ADDQ mul0, acc4
	ADCQ $0, acc5

	// [t3, t2, t1, t0] * acc6
	MOVQ acc6, mul1
	MULXQ t0, mul0, hlp
	ADDQ mul0, acc2
	ADCQ hlp, acc3
	MULXQ t1, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, acc4
	MULXQ t2, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc4
	ADCQ hlp, acc5
	MULXQ t3, mul0, acc6
	ADCQ $0, acc6
	ADDQ mul0, acc5
	ADCQ $0, acc6

	// [t3, t2, t1, t0] * acc7
	MOVQ acc7, mul1
	MULXQ t0, mul0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, acc4
	MULXQ t1, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc4
	ADCQ hlp, acc5
	MULXQ t2, mul0, hlp
	ADCQ $0, hlp
	ADDQ mul0, acc5
	ADCQ hlp, acc6
	MULXQ t3, mul0, acc7
	ADCQ $0, acc7
	ADDQ mul0, acc6
	ADCQ $0, acc7

	// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
	// First reduction step
	XORQ t1, t1
	MOVQ acc0, mul1
	MULXQ ·np+0x00(SB), mul1, mul0
	MULXQ ·p2+0x00(SB), mul0, t0
	ADOXQ mul0, acc0    // (carry, acc0) = acc0 + mul1 * p2[0]
	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ t0, mul0
	ADOXQ mul0, acc1
	MULXQ ·p2+0x10(SB), mul0, t0
	ADCXQ hlp, mul0
	ADOXQ mul0, acc2
	MULXQ ·p2+0x18(SB), mul0, acc0
	ADCXQ t0, mul0
	ADOXQ mul0, acc3
	ADCXQ t1, acc0
	ADOXQ t1, acc0

	// Second reduction step
	MOVQ acc1, mul1
	MULXQ ·np+0x00(SB), mul1, mul0
	MULXQ ·p2+0x00(SB), mul0, t0
	ADOXQ mul0, acc1
	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ t0, mul0
	ADOXQ mul0, acc2
	MULXQ ·p2+0x10(SB), mul0, t0
	ADCXQ hlp, mul0
	ADOXQ mul0, acc3
	MULXQ ·p2+0x18(SB), mul0, acc1
	ADCXQ t0, mul0
	ADOXQ mul0, acc0
	ADCXQ t1, acc1
	ADOXQ t1, acc1

	// Third reduction step
	MOVQ acc2, mul1
	MULXQ ·np+0x00(SB), mul1, mul0
	MULXQ ·p2+0x00(SB), mul0, t0
	ADOXQ mul0, acc2
	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ t0, mul0
	ADOXQ mul0, acc3
	MULXQ ·p2+0x10(SB), mul0, t0
	ADCXQ hlp, mul0
	ADOXQ mul0, acc0
	MULXQ ·p2+0x18(SB), mul0, acc2
	ADCXQ t0, mul0
	ADOXQ mul0, acc1
	ADCXQ t1, acc2
	ADOXQ t1, acc2

	// Last reduction step
	MOVQ acc3, mul1
	MULXQ ·np+0x00(SB), mul1, mul0
	MULXQ ·p2+0x00(SB), mul0, t0
	ADOXQ mul0, acc3
	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ t0, mul0
	ADOXQ mul0, acc0
	MULXQ ·p2+0x10(SB), mul0, t0
	ADCXQ hlp, mul0
	ADOXQ mul0, acc1
	MULXQ ·p2+0x18(SB), mul0, acc3
	ADCXQ t0, mul0
	ADOXQ mul0, acc2
	ADCXQ t1, acc3
	ADOXQ t1, acc3

	MOVQ $0, hlp
	// Add bits [511:256] of the result
	ADDQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p
	SUBQ ·p2+0(SB), acc4
	SBBQ ·p2+8(SB), acc5
	SBBQ ·p2+16(SB), acc6
	SBBQ ·p2+24(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7
	RET
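	// The noAdxMul path below computes the same 512-bit product and Montgomery
	// reduction using only MULQ/ADDQ/ADCQ, for CPUs that lack the BMI2/ADX
	// extensions (MULXQ/ADCXQ/ADOXQ) checked via ·supportADX above. Both paths
	// must produce identical results.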
noAdxMul:
	// [t3, t2, t1, t0] * acc4
	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1
	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	// [t3, t2, t1, t0] * acc5
	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	// [t3, t2, t1, t0] * acc6
	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	// [t3, t2, t1, t0] * acc7
	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7

	// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
	// First reduction step
	MOVQ acc0, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp
	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, t0
	XORQ acc0, acc0
	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ t0, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, t0
	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ t0, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, t0
	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ t0, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ mul1, acc0

	// Second reduction step
	MOVQ acc1, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp
	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, t0
	XORQ acc1, acc1
	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ t0, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, t0
	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ t0, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0
	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ t0, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ mul1, acc1

	// Third reduction step
	MOVQ acc2, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp
	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, t0
	XORQ acc2, acc2
	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ t0, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0
	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ t0, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, t0
	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ t0, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ mul1, acc2

	// Last reduction step
	MOVQ acc3, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp
	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0
	XORQ acc3, acc3
	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ t0, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, t0
	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ t0, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, t0
	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ t0, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ mul1, acc3

	MOVQ $0, hlp
	// Add bits [511:256] of the result
	ADDQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p
	SUBQ ·p2+0(SB), acc4
	SBBQ ·p2+8(SB), acc5
	SBBQ ·p2+16(SB), acc6
	SBBQ ·p2+24(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7
	RET
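/*
 * Reference sketch (not part of the build): gfpMulInternal performs a
 * word-by-word Montgomery multiplication, returning a*b*R^-1 mod p with
 * R = 2^256, where ·p2 holds the modulus p and ·np holds the usual
 * Montgomery constant -p^-1 mod 2^64. A minimal Go model of the same
 * result, using math/big purely for illustration (the helper name is
 * hypothetical):
 *
 *	// montMul returns a*b*R^-1 mod p, with R = 2^256.
 *	func montMul(a, b, p *big.Int) *big.Int {
 *		r := new(big.Int).Lsh(big.NewInt(1), 256)
 *		rInv := new(big.Int).ModInverse(r, p)
 *		t := new(big.Int).Mul(a, b)
 *		t.Mul(t, rInv)
 *		return t.Mod(t, p)
 *	}
 *
 * The assembly interleaves the four reduction steps with the schoolbook
 * product, then folds in the high 256 bits and finishes with a single
 * conditional subtraction of p rather than a full modular reduction.
 */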
/* ---------------------------------------*/
// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) ^ 2
TEXT gfpSqrInternal(SB),NOSPLIT,$8
	CMPB ·supportADX(SB), $0
	JE   noAdxSqr
	XORQ t3, t3

	// [acc7, acc6, acc5] * acc4
	MOVQ acc4, mul1
	MULXQ acc5, acc1, acc2
	MULXQ acc6, mul0, acc3
	ADOXQ mul0, acc2
	MULXQ acc7, mul0, t0
	ADOXQ mul0, acc3
	ADOXQ t3, t0

	// [acc7, acc6] * acc5
	MOVQ acc5, mul1
	MULXQ acc6, mul0, hlp
	ADOXQ mul0, acc3
	MULXQ acc7, mul0, t1
	ADCXQ hlp, mul0
	ADOXQ mul0, t0
	ADCXQ t3, t1

	// acc7 * acc6
	MOVQ acc6, mul1
	MULXQ acc7, mul0, t2
	ADOXQ mul0, t1
	ADOXQ t3, t2

	// *2
	ADOXQ acc1, acc1
	ADOXQ acc2, acc2
	ADOXQ acc3, acc3
	ADOXQ t0, t0
	ADOXQ t1, t1
	ADOXQ t2, t2
	ADOXQ t3, t3

	// Missing products
	MOVQ acc4, mul1
	MULXQ mul1, acc0, acc4
	ADCXQ acc4, acc1
	MOVQ acc5, mul1
	MULXQ mul1, mul0, acc4
	ADCXQ mul0, acc2
	ADCXQ acc4, acc3
	MOVQ acc6, mul1
	MULXQ mul1, mul0, acc4
	ADCXQ mul0, t0
	ADCXQ acc4, t1
	MOVQ acc7, mul1
	MULXQ mul1, mul0, acc4
	ADCXQ mul0, t2
	ADCXQ acc4, t3

	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
	// First reduction step
	XORQ acc5, acc5
	MOVQ acc0, mul1
	MULXQ ·np+0x00(SB), mul1, mul0
	MULXQ ·p2+0x00(SB), mul0, acc4
	ADOXQ mul0, acc0    // (carry, acc0) = acc0 + mul1 * p2[0]
	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ acc4, mul0
	ADOXQ mul0, acc1
	MULXQ ·p2+0x10(SB), mul0, acc4
	ADCXQ hlp, mul0
	ADOXQ mul0, acc2
	MULXQ ·p2+0x18(SB), mul0, acc0
	ADCXQ acc4, mul0
	ADOXQ mul0, acc3
	ADCXQ acc5, acc0
	ADOXQ acc5, acc0

	// Second reduction step
	MOVQ acc1, mul1
	MULXQ ·np+0x00(SB), mul1, mul0
	MULXQ ·p2+0x00(SB), mul0, acc4
	ADOXQ mul0, acc1
	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ acc4, mul0
	ADOXQ mul0, acc2
	MULXQ ·p2+0x10(SB), mul0, acc4
	ADCXQ hlp, mul0
	ADOXQ mul0, acc3
	MULXQ ·p2+0x18(SB), mul0, acc1
	ADCXQ acc4, mul0
	ADOXQ mul0, acc0
	ADCXQ acc5, acc1
	ADOXQ acc5, acc1

	// Third reduction step
	MOVQ acc2, mul1
	MULXQ ·np+0x00(SB), mul1, mul0
	MULXQ ·p2+0x00(SB), mul0, acc4
	ADOXQ mul0, acc2
	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ acc4, mul0
	ADOXQ mul0, acc3
	MULXQ ·p2+0x10(SB), mul0, acc4
	ADCXQ hlp, mul0
	ADOXQ mul0, acc0
	MULXQ ·p2+0x18(SB), mul0, acc2
	ADCXQ acc4, mul0
	ADOXQ mul0, acc1
	ADCXQ acc5, acc2
	ADOXQ acc5, acc2

	// Last reduction step
	MOVQ acc3, mul1
	MULXQ ·np+0x00(SB), mul1, mul0
	MULXQ ·p2+0x00(SB), mul0, acc4
	ADOXQ mul0, acc3
	MULXQ ·p2+0x08(SB), mul0, hlp
	ADCXQ acc4, mul0
	ADOXQ mul0, acc0
	MULXQ ·p2+0x10(SB), mul0, acc4
	ADCXQ hlp, mul0
	ADOXQ mul0, acc1
	MULXQ ·p2+0x18(SB), mul0, acc3
	ADCXQ acc4, mul0
	ADOXQ mul0, acc2
	ADCXQ acc5, acc3
	ADOXQ acc5, acc3

	MOVQ $0, hlp
	// Add bits [511:256] of the result
	ADDQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p
	SUBQ ·p2+0(SB), acc4
	SBBQ ·p2+8(SB), acc5
	SBBQ ·p2+16(SB), acc6
	SBBQ ·p2+24(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7
	RET

noAdxSqr:
	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2
	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0
	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp
	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1
	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3

	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3

	// Missing products
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ DX, acc4
	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, DX
	MOVQ DX, acc4
	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, DX
	MOVQ DX, acc4
	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ DX, t3

	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
	// First reduction step
	MOVQ acc0, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp
	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc5
	XORQ acc0, acc0
	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ acc5, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc5
	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ acc5, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc5
	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ acc5, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ mul1, acc0

	// Second reduction step
	MOVQ acc1, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp
	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc5
	XORQ acc1, acc1
	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ acc5, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc5
	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ acc5, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc5
	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ acc5, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ mul1, acc1

	// Third reduction step
	MOVQ acc2, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp
	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc5
	XORQ acc2, acc2
	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ acc5, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc5
	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ acc5, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc5
	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ acc5, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ mul1, acc2

	// Last reduction step
	MOVQ acc3, mul0
	MULQ ·np+0x00(SB)
	MOVQ mul0, hlp
	MOVQ ·p2+0x00(SB), mul0
	MULQ hlp
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc5
	XORQ acc3, acc3
	MOVQ ·p2+0x08(SB), mul0
	MULQ hlp
	ADDQ acc5, acc0
	ADCQ $0, mul1
	ADDQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc5
	MOVQ ·p2+0x10(SB), mul0
	MULQ hlp
	ADDQ acc5, acc1
	ADCQ $0, mul1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc5
	MOVQ ·p2+0x18(SB), mul0
	MULQ hlp
	ADDQ acc5, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ mul1, acc3

	MOVQ $0, hlp
	// Add bits [511:256] of the result
	ADDQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p
	SUBQ ·p2+0(SB), acc4
	SBBQ ·p2+8(SB), acc5
	SBBQ ·p2+16(SB), acc6
	SBBQ ·p2+24(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7
	RET
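/*
 * Reference sketch (not part of the build): the tail shared by gfpMulInternal
 * and gfpSqrInternal above, and by the inline macros below, brings a value of
 * at most roughly 2p back into [0, p) with a single conditional subtraction.
 * A rough Go model of that pattern, assuming a hypothetical 4-word little-
 * endian representation and the modulus p in a [4]uint64:
 *
 *	import "math/bits"
 *
 *	// condSubP returns x-p if the 5-word value (carry, x) >= p, else x.
 *	func condSubP(x [4]uint64, carry uint64, p [4]uint64) [4]uint64 {
 *		var d [4]uint64
 *		var b uint64
 *		d[0], b = bits.Sub64(x[0], p[0], 0)
 *		d[1], b = bits.Sub64(x[1], p[1], b)
 *		d[2], b = bits.Sub64(x[2], p[2], b)
 *		d[3], b = bits.Sub64(x[3], p[3], b)
 *		_, b = bits.Sub64(carry, 0, b)
 *		if b == 0 { // no final borrow: the subtraction did not go negative
 *			return d
 *		}
 *		return x
 *	}
 *
 * The assembly keeps this selection branch-free with CMOVQCS so the timing
 * does not depend on the value being reduced.
 */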
/* ---------------------------------------*/
// (t3, t2, t1, t0) = 2(acc7, acc6, acc5, acc4)
#define gfpMulBy2Inline \
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ ·p2+0(SB), t0;\
	SBBQ ·p2+8(SB), t1;\
	SBBQ ·p2+16(SB), t2;\
	SBBQ ·p2+24(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;

// (acc7, acc6, acc5, acc4) = 2(acc7, acc6, acc5, acc4)
#define gfpMulBy2Inline2 \
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ ·p2+0(SB), acc4;\
	SBBQ ·p2+8(SB), acc5;\
	SBBQ ·p2+16(SB), acc6;\
	SBBQ ·p2+24(SB), acc7;\
	SBBQ $0, mul0;\
	CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS t1, acc5;\
	CMOVQCS t2, acc6;\
	CMOVQCS t3, acc7;

/* ---------------------------------------*/
// (t3, t2, t1, t0) = (acc7, acc6, acc5, acc4) + (t3, t2, t1, t0)
#define gfpAddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ ·p2+0(SB), t0;\
	SBBQ ·p2+8(SB), t1;\
	SBBQ ·p2+16(SB), t2;\
	SBBQ ·p2+24(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;

/* ---------------------------------------*/
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7

/* ---------------------------------------*/
#define axin(off) (32*0 + off)(SP)
#define ayin(off) (32*1 + off)(SP)
#define bxin(off) (32*2 + off)(SP)
#define byin(off) (32*3 + off)(SP)
#define tmp0(off) (32*4 + off)(SP)
#define tmp1(off) (32*5 + off)(SP)
#define cxout(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)

TEXT ·gfp2Mul(SB),NOSPLIT,$256-24
	// Move input to stack in order to free registers
	MOVQ res+0(FP), CX
	MOVQ in1+8(FP), AX
	MOVQ in2+16(FP), BX

	MOVOU (16*0)(AX), X0
	MOVOU (16*1)(AX), X1
	MOVOU (16*2)(AX), X2
	MOVOU (16*3)(AX), X3

	MOVOU X0, axin(16*0)
	MOVOU X1, axin(16*1)
	MOVOU X2, ayin(16*0)
	MOVOU X3, ayin(16*1)

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3

	MOVOU X0, bxin(16*0)
	MOVOU X1, bxin(16*1)
	MOVOU X2, byin(16*0)
	MOVOU X3, byin(16*1)
	// Store pointer to result
	MOVQ CX, rptr

	LDacc (ayin)
	LDt (byin)
	CALL gfpMulInternal(SB)
	ST (tmp0)

	LDacc (axin)
	LDt (bxin)
	CALL gfpMulInternal(SB)
	ST (tmp1)

	LDacc (axin)
	LDt (ayin)
	gfpAddInline
	STt (cxout)

	LDacc (bxin)
	LDt (byin)
	gfpAddInline
	LDacc (cxout)
	CALL gfpMulInternal(SB)
	LDt (tmp0)
	CALL gfpSubInternal(SB)
	LDt (tmp1)
	CALL gfpSubInternal(SB)
	// Store x
	MOVQ rptr, AX
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	LDacc (tmp0)
	//LDt (tmp1)
	CALL gfpSubInternal(SB)
	CALL gfpSubInternal(SB)

	MOVQ rptr, AX
	///////////////////////
	MOVQ $0, rptr
	// Store y
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)
	RET
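/*
 * Reference sketch (not part of the build): ·gfp2Mul above multiplies two
 * GF(p^2) elements stored as (x, y) meaning x*u + y; the arithmetic below
 * corresponds to u^2 = -2. Writing a = ax*u + ay and b = bx*u + by, the
 * routine computes
 *
 *	c.x = ax*by + ay*bx   (via (ax+ay)*(bx+by) - ax*bx - ay*by)
 *	c.y = ay*by - 2*ax*bx
 *
 * with every product going through gfpMulInternal (Montgomery form). A plain
 * Go model, assuming hypothetical types gfP/gfP2 with receiver-style
 * Mul/Add/Sub helpers:
 *
 *	func gfp2MulModel(c, a, b *gfP2) {
 *		var v0, v1, t gfP
 *		v0.Mul(&a.y, &b.y)     // ay*by
 *		v1.Mul(&a.x, &b.x)     // ax*bx
 *		t.Add(&a.x, &a.y)
 *		c.x.Add(&b.x, &b.y)
 *		c.x.Mul(&t, &c.x)
 *		c.x.Sub(&c.x, &v0)
 *		c.x.Sub(&c.x, &v1)     // ax*by + ay*bx
 *		c.y.Sub(&v0, &v1)
 *		c.y.Sub(&c.y, &v1)     // ay*by - 2*ax*bx
 *	}
 */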
TEXT ·gfp2MulU(SB),NOSPLIT,$256-24
	// Move input to stack in order to free registers
	MOVQ res+0(FP), CX
	MOVQ in1+8(FP), AX
	MOVQ in2+16(FP), BX

	MOVOU (16*0)(AX), X0
	MOVOU (16*1)(AX), X1
	MOVOU (16*2)(AX), X2
	MOVOU (16*3)(AX), X3

	MOVOU X0, axin(16*0)
	MOVOU X1, axin(16*1)
	MOVOU X2, ayin(16*0)
	MOVOU X3, ayin(16*1)

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3

	MOVOU X0, bxin(16*0)
	MOVOU X1, bxin(16*1)
	MOVOU X2, byin(16*0)
	MOVOU X3, byin(16*1)
	// Store pointer to result
	MOVQ CX, rptr

	LDacc (ayin)
	LDt (byin)
	CALL gfpMulInternal(SB)
	ST (tmp0)

	LDacc (axin)
	LDt (bxin)
	CALL gfpMulInternal(SB)
	ST (tmp1)

	LDacc (axin)
	LDt (ayin)
	gfpAddInline
	STt (cxout)

	LDacc (bxin)
	LDt (byin)
	gfpAddInline
	LDacc (cxout)
	CALL gfpMulInternal(SB)
	LDt (tmp0)
	CALL gfpSubInternal(SB)
	LDt (tmp1)
	CALL gfpSubInternal(SB)

	gfpMulBy2Inline
	XORQ acc4, acc4
	XORQ acc5, acc5
	XORQ acc6, acc6
	XORQ acc7, acc7
	CALL gfpSubInternal(SB)
	// Store y
	MOVQ rptr, AX
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)

	LDacc (tmp0)
	LDt (tmp1)
	CALL gfpSubInternal(SB)
	CALL gfpSubInternal(SB)

	MOVQ rptr, AX
	///////////////////////
	MOVQ $0, rptr
	// Store x
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)
	RET

#undef axin
#undef ayin
#undef bxin
#undef byin
#undef tmp0
#undef tmp1
#undef cxout
#undef rptr

TEXT ·gfp2MulU1(SB),NOSPLIT,$0-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), mul1
	MOVQ in1+8(FP), AX

	//LDacc (axin)
	MOVOU (16*2)(AX), X2
	MOVOU (16*3)(AX), X3
	MOVQ (16*0 + 8*0)(AX), acc4
	MOVQ (16*0 + 8*1)(AX), acc5
	MOVQ (16*0 + 8*2)(AX), acc6
	MOVQ (16*0 + 8*3)(AX), acc7

	gfpMulBy2Inline
	XORQ acc4, acc4
	XORQ acc5, acc5
	XORQ acc6, acc6
	XORQ acc7, acc7
	CALL gfpSubInternal(SB)

	MOVOU X2, (16*0)(mul1)
	MOVOU X3, (16*1)(mul1)
	MOVQ acc4, (16*2 + 8*0)(mul1)
	MOVQ acc5, (16*2 + 8*1)(mul1)
	MOVQ acc6, (16*2 + 8*2)(mul1)
	MOVQ acc7, (16*2 + 8*3)(mul1)
	RET

#define axin(off) (32*0 + off)(SP)
#define ayin(off) (32*1 + off)(SP)
#define cxout(off) (32*2 + off)(SP)
#define cyout(off) (32*3 + off)(SP)
#define rptr (32*4)(SP)

TEXT ·gfp2Square(SB),NOSPLIT,$160-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3

	MOVOU X0, axin(16*0)
	MOVOU X1, axin(16*1)
	MOVOU X2, ayin(16*0)
	MOVOU X3, ayin(16*1)
	// Store pointer to result
	MOVQ AX, rptr

	LDacc (axin)
	LDt (ayin)
	gfpAddInline
	STt (cyout)

	LDacc (axin)
	gfpMulBy2Inline
	LDacc (ayin)
	CALL gfpSubInternal(SB)
	LDt (cyout)
	CALL gfpMulInternal(SB)
	ST (cyout)

	LDacc (axin)
	LDt (ayin)
	CALL gfpMulInternal(SB)
	ST (cxout)
	LDt (cyout)
	gfpAddInline
	// Store y
	MOVQ rptr, AX
	MOVQ t0, (16*2 + 8*0)(AX)
	MOVQ t1, (16*2 + 8*1)(AX)
	MOVQ t2, (16*2 + 8*2)(AX)
	MOVQ t3, (16*2 + 8*3)(AX)

	LDacc (cxout)
	gfpMulBy2Inline
	// Store x
	MOVQ rptr, AX
	///////////////////////
	MOVQ $0, rptr
	MOVQ t0, (16*0 + 8*0)(AX)
	MOVQ t1, (16*0 + 8*1)(AX)
	MOVQ t2, (16*0 + 8*2)(AX)
	MOVQ t3, (16*0 + 8*3)(AX)
	RET
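/*
 * Reference sketch (not part of the build): ·gfp2Square above uses the
 * identity (x*u + y)^2 = 2*x*y*u + (y^2 - 2*x^2) with u^2 = -2, computing the
 * constant term as (y - 2x)*(y + x) + x*y to avoid a second squaring. A Go
 * model, assuming the same hypothetical gfP/gfP2 helpers as above:
 *
 *	func gfp2SquareModel(c, a *gfP2) {
 *		var s, d, xy gfP
 *		s.Add(&a.x, &a.y)      // x + y
 *		d.Add(&a.x, &a.x)
 *		d.Sub(&a.y, &d)        // y - 2x
 *		xy.Mul(&a.x, &a.y)     // x*y
 *		c.y.Mul(&d, &s)
 *		c.y.Add(&c.y, &xy)     // (y-2x)(y+x) + xy = y^2 - 2x^2
 *		c.x.Add(&xy, &xy)      // 2xy
 *	}
 */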
TEXT ·gfp2SquareU(SB),NOSPLIT,$160-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3

	MOVOU X0, axin(16*0)
	MOVOU X1, axin(16*1)
	MOVOU X2, ayin(16*0)
	MOVOU X3, ayin(16*1)
	// Store pointer to result
	MOVQ AX, rptr

	LDacc (axin)
	LDt (ayin)
	gfpAddInline
	STt (cxout)

	LDacc (axin)
	gfpMulBy2Inline
	LDacc (ayin)
	CALL gfpSubInternal(SB)
	LDt (cxout)
	CALL gfpMulInternal(SB)
	ST (cxout)

	LDacc (axin)
	LDt (ayin)
	CALL gfpMulInternal(SB)
	ST (cyout)
	LDt (cxout)
	gfpAddInline
	// Store x
	MOVQ rptr, AX
	MOVQ t0, (16*0 + 8*0)(AX)
	MOVQ t1, (16*0 + 8*1)(AX)
	MOVQ t2, (16*0 + 8*2)(AX)
	MOVQ t3, (16*0 + 8*3)(AX)

	LDacc (cyout)
	gfpMulBy2Inline2
	gfpMulBy2Inline
	XORQ acc4, acc4
	XORQ acc5, acc5
	XORQ acc6, acc6
	XORQ acc7, acc7
	CALL gfpSubInternal(SB)
	// Store y
	MOVQ rptr, AX
	///////////////////////
	MOVQ $0, rptr
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)
	RET

#undef axin
#undef ayin
#undef cxout
#undef cyout
#undef rptr

/* ---------------------------------------*/
#define xin(off) (32*0 + off)(SP)
#define yin(off) (32*1 + off)(SP)
#define zin(off) (32*2 + off)(SP)
#define xout(off) (32*3 + off)(SP)
#define yout(off) (32*4 + off)(SP)
#define zout(off) (32*5 + off)(SP)
#define tmp0(off) (32*6 + off)(SP)
#define tmp2(off) (32*7 + off)(SP)
#define rptr (32*8)(SP)

// func curvePointDoubleComplete(c, a *curvePoint)
TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, xin(16*0)
	MOVOU X1, xin(16*1)
	MOVOU X2, yin(16*0)
	MOVOU X3, yin(16*1)
	MOVOU X4, zin(16*0)
	MOVOU X5, zin(16*1)
	// Store pointer to result
	MOVQ AX, rptr

	LDacc (yin)
	CALL gfpSqrInternal(SB)    // t0 := Y^2
	ST (tmp0)
	gfpMulBy2Inline2           // Z3 := t0 + t0
	gfpMulBy2Inline2           // Z3 := Z3 + Z3
	gfpMulBy2Inline            // Z3 := Z3 + Z3
	STt (zout)

	LDacc (zin)
	CALL gfpSqrInternal(SB)    // t2 := Z^2
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	gfpMulBy2Inline2
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	MOVQ acc3, t3
	CALL gfpSubInternal(SB)    // t2 := 3b * t2
	ST (tmp2)
	LDt (zout)
	CALL gfpMulInternal(SB)    // X3 := Z3 * t2
	ST (xout)

	LDacc (tmp0)
	LDt (tmp2)
	gfpAddInline               // Y3 := t0 + t2
	STt (yout)

	LDacc (yin)
	LDt (zin)
	CALL gfpMulInternal(SB)    // t1 := YZ
	LDt (zout)
	CALL gfpMulInternal(SB)    // Z3 := t1 * Z3
	MOVQ rptr, AX
	// Store Z
	MOVQ acc4, (16*4 + 8*0)(AX)
	MOVQ acc5, (16*4 + 8*1)(AX)
	MOVQ acc6, (16*4 + 8*2)(AX)
	MOVQ acc7, (16*4 + 8*3)(AX)

	LDacc (tmp2)
	gfpMulBy2Inline
	LDacc (tmp2)
	gfpAddInline               // t2 := t2 + t2 + t2
	LDacc (tmp0)
	CALL gfpSubInternal(SB)    // t0 := t0 - t2
	ST (tmp0)
	LDt (yout)
	CALL gfpMulInternal(SB)    // Y3 = t0 * Y3
	LDt (xout)
	gfpAddInline               // Y3 := X3 + Y3
	MOVQ rptr, AX
	// Store y
	MOVQ t0, (16*2 + 8*0)(AX)
	MOVQ t1, (16*2 + 8*1)(AX)
	MOVQ t2, (16*2 + 8*2)(AX)
	MOVQ t3, (16*2 + 8*3)(AX)

	LDacc (xin)
	LDt (yin)
	CALL gfpMulInternal(SB)    // t1 := XY
	LDt (tmp0)
	CALL gfpMulInternal(SB)    // X3 := t0 * t1
	gfpMulBy2Inline            // X3 := X3 + X3
	MOVQ rptr, AX
	MOVQ $0, rptr
	// Store x
	MOVQ t0, (16*0 + 8*0)(AX)
	MOVQ t1, (16*0 + 8*1)(AX)
	MOVQ t2, (16*0 + 8*2)(AX)
	MOVQ t3, (16*0 + 8*3)(AX)
	RET

#undef xin
#undef yin
#undef zin
#undef xout
#undef yout
#undef zout
#undef tmp0
#undef tmp2
#undef rptr
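/*
 * Reference sketch (not part of the build): curvePointDoubleComplete above and
 * curvePointAddComplete below follow complete addition/doubling formulas for
 * short Weierstrass curves y^2 = x^3 + b (a = 0) in projective coordinates
 * (X : Y : Z), in the style of Renes-Costello-Batina. The four chained
 * doublings followed by a subtraction compute 16*t - t = 15*t, i.e. the
 * multiply-by-3b step with 3b = 15 (b = 5). A Go model of the doubling,
 * assuming hypothetical gfP helpers and a hypothetical constant threeB = 3*b
 * in Montgomery form:
 *
 *	func doubleCompleteModel(c, p *curvePoint) {
 *		var t0, t1, t2, x3, y3, z3 gfP
 *		t0.Mul(&p.y, &p.y)       // t0 = Y^2
 *		z3.Add(&t0, &t0)
 *		z3.Add(&z3, &z3)
 *		z3.Add(&z3, &z3)         // Z3 = 8*Y^2
 *		t1.Mul(&p.y, &p.z)       // t1 = Y*Z
 *		t2.Mul(&p.z, &p.z)
 *		t2.Mul(&threeB, &t2)     // t2 = 3b*Z^2
 *		x3.Mul(&t2, &z3)         // X3 = 3b*Z^2 * Z3
 *		y3.Add(&t0, &t2)         // Y3 = Y^2 + 3b*Z^2
 *		z3.Mul(&t1, &z3)         // Z3 = Y*Z * Z3
 *		t1.Add(&t2, &t2)
 *		t2.Add(&t1, &t2)         // t2 = 9b*Z^2
 *		t0.Sub(&t0, &t2)         // t0 = Y^2 - 9b*Z^2
 *		y3.Mul(&t0, &y3)
 *		y3.Add(&x3, &y3)
 *		t1.Mul(&p.x, &p.y)
 *		x3.Mul(&t0, &t1)
 *		x3.Add(&x3, &x3)
 *		c.x, c.y, c.z = x3, y3, z3
 *	}
 */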
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)
#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)
#define tmp0(off) (32*9 + off)(SP)
#define tmp1(off) (32*10 + off)(SP)
#define tmp2(off) (32*11 + off)(SP)
#define tmp3(off) (32*12 + off)(SP)
#define tmp4(off) (32*13 + off)(SP)
#define rptr (32*14)(SP)

#define curvePointAddCompleteInline \
	LDacc (x1in) \
	LDt (x2in) \
	CALL gfpMulInternal(SB) \ // t0 := X1X2
	ST (tmp0) \
	LDacc (y1in) \
	LDt (y2in) \
	CALL gfpMulInternal(SB) \ // t1 := Y1Y2
	ST (tmp1) \
	LDacc (z1in) \
	LDt (z2in) \
	CALL gfpMulInternal(SB) \ // t2 := Z1Z2
	ST (tmp2) \
	\
	LDacc (x1in) \
	LDt (y1in) \
	gfpAddInline \
	STt (tmp3) \ // t3 := X1 + Y1
	LDacc (x2in) \
	LDt (y2in) \
	gfpAddInline \
	LDacc (tmp3) \
	CALL gfpMulInternal(SB) \ // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
	ST (tmp3) \
	LDacc (tmp0) \
	LDt (tmp1) \
	gfpAddInline \
	LDacc (tmp3) \
	CALL gfpSubInternal(SB) \ // t3 := t3 - t4 = X1Y2 + X2Y1
	ST (tmp3) \
	\
	LDacc (y1in) \
	LDt (z1in) \
	gfpAddInline \ // t4 := Y1 + Z1
	STt (tmp4) \
	LDacc (y2in) \
	LDt (z2in) \
	gfpAddInline \
	LDacc (tmp4) \
	CALL gfpMulInternal(SB) \ // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
	ST (tmp4) \
	LDacc (tmp1) \
	LDt (tmp2) \
	gfpAddInline \
	LDacc (tmp4) \
	CALL gfpSubInternal(SB) \ // t4 := t4 - X3 = Y1Z2 + Y2Z1
	ST (tmp4) \
	\
	LDacc (z1in) \
	LDt (x1in) \
	gfpAddInline \ // X3 := X1 + Z1
	STt (xout) \
	LDacc (z2in) \
	LDt (x2in) \
	gfpAddInline \
	LDacc (xout) \
	CALL gfpMulInternal(SB) \ // X3 := X3 * Y3
	ST (xout) \
	LDacc (tmp0) \
	LDt (tmp2) \
	gfpAddInline \
	LDacc (xout) \
	CALL gfpSubInternal(SB) \ // Y3 := X3 - Y3 = X1Z2 + X2Z1
	ST (yout) \
	\
	LDacc (tmp0) \
	gfpMulBy2Inline \
	LDacc (tmp0) \
	gfpAddInline \ // t0 := t0 + t0 + t0 = 3X1X2
	STt (tmp0) \
	\
	LDacc (tmp2) \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	LDt (tmp2) \
	CALL gfpSubInternal(SB) \ // t2 := 3b * t2 = 3bZ1Z2
	ST (tmp2) \
	\
	LDt (tmp1) \
	gfpAddInline \ // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
	STt (zout) \
	\
	LDacc (tmp1) \
	LDt (tmp2) \
	CALL gfpSubInternal(SB) \ // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
	ST (tmp1) \
	\
	LDacc (yout) \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	gfpMulBy2Inline2 \
	LDt (yout) \
	CALL gfpSubInternal(SB) \ // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
	ST (yout) \
	\
	LDt (tmp4) \
	CALL gfpMulInternal(SB) \ // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
	ST (xout) \
	\
	LDacc (tmp1) \
	LDt (tmp3) \
	CALL gfpMulInternal(SB) \ // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
	LDt (xout) \
	CALL gfpSubInternal(SB) \ // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
	MOVQ rptr, AX \
	\// Store x
	MOVQ acc4, (16*0 + 8*0)(AX) \
	MOVQ acc5, (16*0 + 8*1)(AX) \
	MOVQ acc6, (16*0 + 8*2)(AX) \
	MOVQ acc7, (16*0 + 8*3)(AX) \
	\
	LDacc (yout) \
	LDt (tmp0) \
	CALL gfpMulInternal(SB) \ // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
	ST (yout) \
	\
	LDacc (tmp1) \
	LDt (zout) \
	CALL gfpMulInternal(SB) \ // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
	LDt (yout) \
	gfpAddInline \ // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
	MOVQ rptr, AX \
	\// Store y
	MOVQ t0, (16*2 + 8*0)(AX) \
	MOVQ t1, (16*2 + 8*1)(AX) \
	MOVQ t2, (16*2 + 8*2)(AX) \
	MOVQ t3, (16*2 + 8*3)(AX) \
	\
	LDacc (tmp0) \
	LDt (tmp3) \
	CALL gfpMulInternal(SB) \ // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
	ST (tmp0) \
	LDacc (zout) \
	LDt (tmp4) \
	CALL gfpMulInternal(SB) \ // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
	LDt (tmp0) \
	gfpAddInline \ // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
	MOVQ rptr, AX \
	MOVQ $0, rptr \
	\// Store z
	MOVQ t0, (16*4 + 8*0)(AX) \
	MOVQ t1, (16*4 + 8*1)(AX) \
	MOVQ t2, (16*4 + 8*2)(AX) \
	MOVQ t3, (16*4 + 8*3)(AX) \

// func curvePointAddComplete(c, a, b *curvePoint)
TEXT ·curvePointAddComplete(SB),0,$480-24
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  pointadd_avx2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr

	curvePointAddCompleteInline
	RET

pointadd_avx2:
	VMOVDQU (32*0)(BX), Y0
	VMOVDQU (32*1)(BX), Y1
	VMOVDQU (32*2)(BX), Y2

	VMOVDQU Y0, x1in(32*0)
	VMOVDQU Y1, y1in(32*0)
	VMOVDQU Y2, z1in(32*0)

	VMOVDQU (32*0)(CX), Y0
	VMOVDQU (32*1)(CX), Y1
	VMOVDQU (32*2)(CX), Y2

	VMOVDQU Y0, x2in(32*0)
	VMOVDQU Y1, y2in(32*0)
	VMOVDQU Y2, z2in(32*0)
	// Store pointer to result
	MOVQ AX, rptr

	curvePointAddCompleteInline
	VZEROUPPER
	RET

#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef rptr

/* ---------------------------------------*/
/*
// gfpIsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
TEXT gfpIsZero(SB),NOSPLIT,$0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $1, t1

	// Check whether [acc4..acc7] are all zero.
	MOVQ acc4, t0
	ORQ acc5, t0
	ORQ acc6, t0
	ORQ acc7, t0
	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ t1, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	XORQ ·p2+0(SB), acc4
	XORQ ·p2+8(SB), acc5
	XORQ ·p2+16(SB), acc6
	XORQ ·p2+24(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4
	// Set the zero flag if so.
	CMOVQEQ t1, AX
	RET

#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)
#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)
#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)

#define curvePointAddInline \
	\// Begin point add
	LDacc (z2in) \
	CALL gfpSqrInternal(SB) \// z2ˆ2
	ST (z2sqr) \
	LDt (z2in) \
	CALL gfpMulInternal(SB) \// z2ˆ3
	LDt (y1in) \
	CALL gfpMulInternal(SB) \// s1 = z2ˆ3*y1
	ST (s1) \
	\
	LDacc (z1in) \
	CALL gfpSqrInternal(SB) \// z1ˆ2
	ST (z1sqr) \
	LDt (z1in) \
	CALL gfpMulInternal(SB) \// z1ˆ3
	LDt (y2in) \
	CALL gfpMulInternal(SB) \// s2 = z1ˆ3*y2
	ST (s2) \
	\
	LDt (s1) \
	CALL gfpSubInternal(SB) \// r = s2 - s1
	ST (r) \
	CALL gfpIsZero(SB) \
	MOVQ AX, points_eq \
	\
	LDacc (z2sqr) \
	LDt (x1in) \
	CALL gfpMulInternal(SB) \// u1 = x1 * z2ˆ2
	ST (u1) \
	LDacc (z1sqr) \
	LDt (x2in) \
	CALL gfpMulInternal(SB) \// u2 = x2 * z1ˆ2
	ST (u2) \
	\
	LDt (u1) \
	CALL gfpSubInternal(SB) \// h = u2 - u1
	ST (h) \
	CALL gfpIsZero(SB) \
	ANDQ points_eq, AX \
	MOVQ AX, points_eq \
	\
	LDacc (r) \
	CALL gfpSqrInternal(SB) \// rsqr = rˆ2
	ST (rsqr) \
	\
	LDacc (h) \
	CALL gfpSqrInternal(SB) \// hsqr = hˆ2
	ST (hsqr) \
	\
	LDt (h) \
	CALL gfpMulInternal(SB) \// hcub = hˆ3
	ST (hcub) \
	\
	LDt (s1) \
	CALL gfpMulInternal(SB) \
	ST (s2) \
	\
	LDacc (z1in) \
	LDt (z2in) \
	CALL gfpMulInternal(SB) \// z1 * z2
	LDt (h) \
	CALL gfpMulInternal(SB) \// z1 * z2 * h
	ST (zout) \
	\
	LDacc (hsqr) \
	LDt (u1) \
	CALL gfpMulInternal(SB) \// hˆ2 * u1
	ST (u2) \
	\
	gfpMulBy2Inline \// u1 * hˆ2 * 2, inline
	LDacc (rsqr) \
	CALL gfpSubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
	\
	LDt (hcub) \
	CALL gfpSubInternal(SB) \
	ST (xout) \
	\
	MOVQ acc4, t0 \
	MOVQ acc5, t1 \
	MOVQ acc6, t2 \
	MOVQ acc7, t3 \
	LDacc (u2) \
	CALL gfpSubInternal(SB) \
	\
	LDt (r) \
	CALL gfpMulInternal(SB) \
	\
	LDt (s2) \
	CALL gfpSubInternal(SB) \
	ST (yout) \

// func curvePointAdd(c, a, b *curvePoint) int
TEXT ·curvePointAdd(SB),0,$680-32
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr

	curvePointAddInline

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)
	RET
*/