Do you happen to have a function for multiplication as well?
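// MultiplyWordsLoHi(low, high, a, b): compute the 128-bit product a * b = (low, high); it is defined elsewhere (see the link at the end).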
/*
 * AccAdd4WordsBy4_wc: (a0, a1, a2, a3) += (b0, b1, b2, 0)
 *
 * Words are listed least-significant first.  The carry ripples from a0 up
 * to a3; the carry out of a3 is discarded.
 *
 * a0..a3 must be 64-bit lvalues, b0..b2 64-bit values (x86-64 only).
 * a0 and a1 are early-clobber operands because they are written before
 * b1 and b2 have been read.
 * The expansion already ends in ';' so it can be used without one.
 */
#define AccAdd4WordsBy4_wc(a0, a1, a2, a3, b0, b1, b2) \
    __asm__ ("addq %4, %0; adcq %5, %1; adcq %6, %2; adcq $0, %3;" \
             : "+&r"(a0), "+&r"(a1), "+r"(a2), "+r"(a3) \
             : "r"(b0), "r"(b1), "r"(b2) \
             : "cc");
/*
 * MulAcc: (c, a1, a0) += a * b
 *
 * Multiplies the 64-bit values a and b with mulx (BMI2), adds the low word
 * of the product to a0, the high word plus carry to a1, and the final carry
 * to c.
 *
 * Side effect: a and b live in rax/rdx and are OVERWRITTEN with the low and
 * high word of the product, so both must be lvalues the caller no longer
 * needs.  c, a0 and a1 must be 64-bit lvalues.
 * The expansion already ends in ';' so it can be used without one.
 */
#define MulAcc(c, a0, a1, a, b) \
    __asm__ ("mulx %3, %3, %4; addq %3, %1; adcq %4, %2; adcq $0, %0;" \
             : "+r"(c), "+r"(a0), "+r"(a1), "+a"(a), "+d"(b) \
             : \
             : "cc");
/*
 * MulAcc_11: (a1, a0) = a * b + c0
 *
 * mulx (BMI2) writes the low word of a * b to a0 and the high word to a1;
 * c0 is then added to a0 and the carry folded into a1.  The sum cannot
 * overflow 128 bits.
 *
 * a0 and a1 are pure outputs: their previous contents are ignored, so they
 * need not be initialised.  They are early-clobber because mulx writes them
 * before c0 is read.  b is passed in rdx as mulx requires.
 * The expansion already ends in ';' so it can be used without one.
 */
#define MulAcc_11(a0, a1, c0, a, b) \
    __asm__ ("mulx %3, %0, %1; addq %2, %0; adcq $0, %1;" \
             : "=&r"(a0), "=&r"(a1) \
             : "r"(c0), "r"(a), "d"(b) \
             : "cc");
//compute u * v = r mod p
void mul(uint64_t *r, const uint64_t *u, const uint64_t *v) {
uint64_t u0 = u[0];
uint64_t u1 = u[1];
uint64_t u2 = u[2];
uint64_t u3 = u[3];
uint64_t v0 = v[0];
uint64_t v1 = v[1];
uint64_t v2 = v[2];
uint64_t v3 = v[3];
uint64_t r0, r1, r2, r3, r4, r5, r6, r7;
uint64_t z1, z2, z3, z4, z5, z6, z7, z8, z44, z66;
z2 = z3 = z4 = z5 = z6 = z7 = z8 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0;
MultiplyWordsLoHi(r0, z1, u0, v0) //x1 --> r0 ok
MultiplyWordsLoHi(z2, z3, u1, v0)
MultiplyWordsLoHi(z4, z5, u2, v0)
MultiplyWordsLoHi(z6, z7, u3, v0)
MultiplyWordsLoHi(z66, z8, u3, v1)//
AccAdd4WordsBy4_wc(z2, z4, z6, z7, z1, z3, z5)
MulAcc_11(r1, z1, z2, u0, v1) //x1 --> r1 ok
MultiplyWordsLoHi(z2, z3, u1, v1)
MultiplyWordsLoHi(z44, z5, u2, v1)
AccAdd4WordsBy4_wc(z1, z3, z5, z8, z4, z6, z7)
AccAdd4WordsBy4_wc(z2, z44, z66, z8, z1, z3, z5)
MulAcc_11(r2, z1, z2, u0, v2) //x1 --> r2 ok
MultiplyWordsLoHi(z2, z3, u1, v2)
MultiplyWordsLoHi(z4, z5, u2, v2)
MultiplyWordsLoHi(z6, z7, u3, v2)
AccAdd4WordsBy4_wc(z1, z3, z5, z7, z44, z66, z8)
AccAdd4WordsBy4_wc(z2, z4, z6, z7, z1, z3, z5)
MulAcc_11(r3, z1, z2, u0, v3) //x1 --> r3 ok
MultiplyWordsLoHi(r4, z3, u1, v3)
MultiplyWordsLoHi(r5, z5, u2, v3)
MultiplyWordsLoHi(r6, r7, u3, v3)
AccAdd4WordsBy4_wc(z1, z3, z5, r7, z4, z6, z7)
AccAdd4WordsBy4_wc(r4, r5, r6, r7, z1, z3, z5) //r4, r5, r6, r7 ok
//Reduction
uint64_t p = 0x1000003d1;
MultiplyWordsLoHi(z3, z4, r5, p)
MultiplyWordsLoHi(z5, z6, r6, p)
MultiplyWordsLoHi(z7, z8, r7, p)
MulAcc_11(z1, z2, r0, r4, p)
AccAdd4WordsBy4_wc(z2, z4, z6, z8, r1, r2, r3)
uint64_t c = 0;
AccAdd4WordsBy4_wc(z3, z5, z7, z8, z2, z4, z6)
MulAcc(c, z1, z3, p, z8)
r[0] = z1;
r[1] = z3;
if(c == 1){
asm (
"addq $1, %0; adcq $0, %1; \n"
: "=r" (z5), "=r" (z7)
: : "cc");
}
r[2] = z5;
r[3] = z7;
}
MultiplyWordsLoHi and other functions are here:
https://www.cryptopp.com/docs/ref/integer_8cpp_source.html